def __init__(self, i):
    """Set up word-forming helpers and relevance bookkeeping for index task *i*.

    Args:
        i: the indexing task(s) handed to this instance (opaque here;
           presumably produced by the caller — TODO confirm its shape).
    """
    self.wordFormer = FormWords()
    self.indexTasks = i
    self.constants = ConstantVars()
    # Object arrays of 100 independent empty dicts — one slot per candidate
    # document, it seems; NOTE(review): the fixed size 100 is assumed by
    # callers elsewhere, confirm before changing.
    self.relatedDocs = np.array([dict() for _ in range(100)])
    self.notRelatedDocs = np.array([dict() for _ in range(100)])
    self.notRelatedCounts = 0
def __init__(self):
    """Hold Persian text-normalization data: alphabet, verb affixes, suffixes
    and compiled multi-word expression rewrite patterns.
    """
    # Persian letters plus ZWNJ (U+200C) and diacritic/hamza forms.
    self.alphabet = [
        'پ', 'چ', 'ج', 'ح', 'خ', 'ه', 'ع', 'غ', 'ف', 'ق', 'ث', 'ص', 'ض',
        'گ', 'ک', 'م', 'ن', 'ت', 'ا', 'ل', 'ب', 'آ', 'ی', 'س', 'ش', 'و',
        'ئ', 'د', 'ذ', 'ر', 'ز', 'ط', 'ظ', 'أ', 'ژ', '\u200c', 'ُ', 'ّ',
        'ة', 'ۀ', 'ؤ', 'ء', 'إ',
    ]
    self.constants = ConstantVars()
    # Clitics that may follow / precede verb stems.
    self.after_verbs = {'ام', 'ای', 'ایم', 'اید', 'اند'}
    self.before_verbs = {'می', 'نمی'}
    self.suffixes = {
        'ی', 'ای', 'ها', 'های', 'تر', 'تری', 'ترین',
        'گر', 'گری', 'ام', 'ات', 'اش',
    }
    # (spaced form, canonical ZWNJ-joined form) rewrite pairs, compiled once.
    expression_pairs = [
        ('علی ای حال', 'علی\u200cای\u200cحال'),
        ('بنا بر این', 'بنابراین'),
        ('بنابر این', 'بنابراین'),
        ('مع ذالک', 'مع\u200cذالک'),
        ('فی ما بین', 'فی\u200cمابین'),
        ('فی مابین', 'فی\u200cمابین'),
        ('چنان چه', 'چنان\u200cچه'),
        ('در واقع', 'درواقع'),
        ('فی کل حال', 'فی\u200cکل\u200cحال'),
    ]
    self.expression = compile_patterns(expression_pairs)
def __init__(self):
    """Initialize index-construction state: I/O helpers, an empty term
    dictionary and a pre-sized posting-list array.
    """
    self.input = FileInOut()
    self.wordFormer = FormWords()
    self.constants = ConstantVars()
    self.dictionary = dict()
    # Pre-allocated object array of 150000 independent dicts — presumably one
    # postings dict per dictionary term slot; TODO confirm the capacity bound.
    self.posting_list = np.array([dict() for _ in range(150000)])
    # Cursors into the dictionary / documents, plus a general counter.
    self.dicIndex = 0
    self.docIndex = 0
    self.c = 0
def __init__(self):
    """Load the persisted dictionary, doc-ID and posting-list files and reset
    per-query relevance bookkeeping.
    """
    self.input = FileInOut()
    # Persisted index artifacts read back from disk via the I/O layer.
    self.Dic = self.input.readDic()
    self.DocID_file = self.input.readDocID()
    self.posting_file = self.input.readPostingList()
    self.wordFormer = FormWords()
    self.constants = ConstantVars()
    # Relevance-feedback accumulators, empty until a query is processed.
    self.relatedDocs = []
    self.notRelatedDocs = []
    self.relatedDocsPos = []
    self.notRelatedDocsPos = []
    self.notRelatedCounts = 0
def get_query_termList(query):
    """Turn a raw query string into a cleaned list of lemmatized terms.

    Pipeline: normalize -> tokenize -> drop punctuation/stop-words ->
    uniform -> stem -> lemmatize -> drop stray double-quote tokens.

    Args:
        query: the raw user query string.

    Returns:
        list[str]: lemmatized query terms with punctuation, stop-words and
        '"' tokens removed.
    """
    wordFormer = FormWords()
    constants = ConstantVars()
    query = wordFormer.normalize(query)
    query_tokens = wordFormer.tokenize(query)
    # BUG FIX: the original called query_tokens.remove(token) while iterating
    # query_tokens, which skips the element following each removal, so
    # consecutive stop-words/punctuation survived. Filter into a new list.
    # Also hoist the (presumably pure — TODO confirm) constant lookups out of
    # the per-token test.
    punctuations = constants.punctuations()
    stop_words = constants.StopWords()
    query_tokens = [
        token for token in query_tokens
        if token not in punctuations and token not in stop_words
    ]
    query_tokens = wordFormer.uniform(query_tokens)
    stemmed_tokens = wordFormer.stemmWords(query_tokens)
    lemmatized_tokens = wordFormer.lemmatizeWords(stemmed_tokens)
    # Drop stray double-quote tokens left over from phrase-query syntax.
    return [token for token in lemmatized_tokens if token != '"']
# Ad-hoc demo: run the full token pipeline on a sample query, printing the
# intermediate result of every stage. (Removed a large block of commented-out
# lemmatizer code that had been pasted here.)
from DataLayer.constants import ConstantVars
from BusinessLayer.textOperations import FormWords

wordFormer = FormWords()
constants = ConstantVars()

query_tokens = wordFormer.tokenize("شفاف سازی")
print('query tokens')
print(query_tokens)

postaged_tokens = wordFormer.posTagging(query_tokens)
print(postaged_tokens)

# NOTE(review): these calls pass extra length/POS arguments that the query
# path (get_query_termList) does not — confirm which signature is current.
stemmed_tokens = wordFormer.stemmWords(query_tokens, len(query_tokens))
print(stemmed_tokens)

lemmatized_tokens = wordFormer.lemmatizeWords(stemmed_tokens, postaged_tokens,
                                              len(query_tokens))
print(lemmatized_tokens)

# BUG FIX: the original removed items from lemmatized_tokens while iterating
# it, skipping the element after each removal. Filter into a new list instead.
punctuations = constants.punctuations()
stop_words = constants.StopWords()
lemmatized_tokens = [
    token for token in lemmatized_tokens
    if token not in punctuations and token not in stop_words
]
print(lemmatized_tokens)