# Import paths for FormWords and ConstantVars follow their usage elsewhere
# in this repository (see the smoke-test script below).
from BusinessLayer.textOperations import FormWords
from DataLayer.constants import ConstantVars


def get_query_termList(query):
    """Normalize, tokenize, and reduce a raw query string to lemmatized terms."""
    wordFormer = FormWords()
    constants = ConstantVars()
    query = wordFormer.normalize(query)
    query_tokens = wordFormer.tokenize(query)
    # Filter punctuation and stop words in one pass; the original called
    # list.remove() while iterating, which skips the token after each removal.
    query_tokens = [
        token for token in query_tokens
        if token not in constants.punctuations()
        and token not in constants.StopWords()
    ]
    query_tokens = wordFormer.uniform(query_tokens)
    # postaged_tokens = wordFormer.posTagging(query_tokens)
    stemmed_tokens = wordFormer.stemmWords(query_tokens)
    lemmatized_tokens = wordFormer.lemmatizeWords(stemmed_tokens)
    # Drop stray double-quote tokens left over from phrase markers.
    lemmatized_tokens = list(filter(lambda a: a != '"', lemmatized_tokens))
    return lemmatized_tokens
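# Usage sketch (illustrative only: the exact terms returned depend on the
# FormWords pipeline and on the stop-word list in ConstantVars):
#
#     terms = get_query_termList('"شفاف سازی" اخبار')
#     print(terms)  # e.g. ['شفاف\u200cسازی', 'اخبار']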
# Smoke test for the query term pipeline on a sample Persian phrase.
from DataLayer.constants import ConstantVars
from BusinessLayer.textOperations import FormWords

wordFormer = FormWords()
constants = ConstantVars()

query_tokens = wordFormer.tokenize("شفاف سازی")
print('query tokens')
print(query_tokens)

postaged_tokens = wordFormer.posTagging(query_tokens)
print(postaged_tokens)

stemmed_tokens = wordFormer.stemmWords(query_tokens, len(query_tokens))
print(stemmed_tokens)

lemmatized_tokens = wordFormer.lemmatizeWords(stemmed_tokens, postaged_tokens,
                                              len(query_tokens))
print(lemmatized_tokens)

# Filter in one pass; removing from the list while iterating skips tokens.
lemmatized_tokens = [
    token for token in lemmatized_tokens
    if token not in constants.punctuations()
    and token not in constants.StopWords()
]
print(lemmatized_tokens)
import numpy as np

from BusinessLayer.textOperations import FormWords
from DataLayer.constants import ConstantVars


class Query:
    """Boolean/phrase query evaluation against an in-memory Index instance."""

    def __init__(self, i):
        self.wordFormer = FormWords()
        self.indexTasks = i
        self.constants = ConstantVars()
        # One {docID: [positions]} dict per query term (fixed capacity).
        self.relatedDocs = np.array([dict() for _ in range(100)])
        self.notRelatedDocs = np.array([dict() for _ in range(100)])
        self.notRelatedCounts = 0

    def processQuery(self, query):
        query_tokens = self.wordFormer.tokenize(query)
        # Filter in one pass; list.remove() during iteration skips tokens.
        query_tokens = [
            token for token in query_tokens
            if token not in self.constants.punctuations()
            and token not in self.constants.StopWords()
        ]
        print('query tokens')
        print(query_tokens)
        postaged_tokens = self.wordFormer.posTagging(query_tokens)
        stemmed_tokens = self.wordFormer.stemmWords(query_tokens,
                                                    len(query_tokens))
        lemmatized_tokens = self.wordFormer.lemmatizeWords(
            stemmed_tokens, postaged_tokens, len(query_tokens))

        i = j = 0
        k = -1                  # index of the current phrase group
        not_include = False     # next term is negated by '!'
        order = False           # inside a quoted phrase
        orderTokens = [[] for _ in range(5)]
        for token in lemmatized_tokens:
            if token == "\"" and not order:
                # Opening quote: start a new exact-phrase group.
                k += 1
                order = True
                continue
            if token == "\"" and order:
                # Closing quote: end the current phrase group.
                order = False
                continue
            if order:
                orderTokens[k].append(token)
                continue
            if token == "!":
                not_include = True
                self.notRelatedCounts += 1
                continue
            if not_include:
                self.notRelatedDocs[j] = self.indexTasks.getRelatedDocs(token)
                j += 1          # advance the NOT slot (missing in original)
                not_include = False
            else:
                self.relatedDocs[i] = self.indexTasks.getRelatedDocs(token)
                i += 1
        print('related docs')
        print(self.relatedDocs)

        related_result, relatedPos = self.merge(self.relatedDocs, i)
        docs = np.array([dict() for _ in range(10)])
        doc_pos = np.array([dict() for _ in range(10)])
        j = 0
        if related_result != []:
            docs[j] = related_result
            doc_pos[j] = relatedPos
            j += 1
        # k is the index of the last phrase group, so there are k + 1 groups;
        # the original range(0, k - 1) skipped every phrase.
        for i in range(0, k + 1):
            phrase_container, phrase_pos = self.phraseContainerDocs(
                orderTokens[i])
            docs[j] = phrase_container
            doc_pos[j] = phrase_pos
            print('phrase')
            print(phrase_container)
            print(phrase_pos)
            j += 1
        final_result, final_pos = self.finalMerge(docs, doc_pos, j)
        relateds_and_not_unrelateds, related_pos = self.notMerge(
            final_result, final_pos)
        print(relateds_and_not_unrelateds)
        print(related_pos)
        return relateds_and_not_unrelateds, related_pos

    def merge(self, docs, length):
        """Intersect the posting dicts of the first `length` terms,
        concatenating the position lists of common documents."""
        answer = []
        postingAns = []
        if length == 0:
            return [], []
        elif length == 1:
            return list(docs[0].keys()), list(docs[0].values())
        p2 = list(docs[0].keys())
        postings2 = [docs[0][docID] for docID in p2]
        i = 1
        while i < length:
            p1 = list(docs[i].keys())
            postings1 = [docs[i][docID] for docID in p1]
            i += 1
            # Classic sorted-list intersection: advance the smaller head.
            while p1 != [] and p2 != []:
                if p1[0] == p2[0]:
                    answer.append(p1[0])
                    postingAns.append(postings1[0] + postings2[0])
                    p1.pop(0)
                    p2.pop(0)
                    postings1.pop(0)
                    postings2.pop(0)
                elif p1[0] < p2[0]:
                    p1.pop(0)
                    postings1.pop(0)
                else:
                    p2.pop(0)
                    postings2.pop(0)
            p2 = answer
            postings2 = postingAns
        print('docc')
        print(answer)
        print(postingAns)
        return answer, postingAns

    def finalMerge(self, docs, docPos, length):
        """Intersect the term-merge result with each phrase result."""
        answer = []
        docPosAns = []
        if length == 0:
            return [], []
        elif length == 1:
            return list(docs[0]), list(docPos[0])
        p2 = list(docs[0])
        docPos2 = list(docPos[0])
        i = 1
        while i < length:
            p1 = list(docs[i])
            docPos1 = list(docPos[i])
            i += 1
            while p1 != [] and p2 != []:
                if p1[0] == p2[0]:
                    answer.append(p1[0])
                    docPosAns.append(docPos1[0] + docPos2[0])
                    p1.pop(0)
                    p2.pop(0)
                    docPos1.pop(0)
                    docPos2.pop(0)
                elif p1[0] < p2[0]:
                    p1.pop(0)
                    docPos1.pop(0)
                else:
                    p2.pop(0)
                    docPos2.pop(0)
            p2 = answer
            docPos2 = docPosAns
        print('docc and double quote')
        print(answer)
        print(docPosAns)
        return answer, docPosAns

    def notMerge(self, relatedDocs, relatedPos):
        """Drop every document that contains an excluded ('!') term."""
        print('no relate')
        print(self.notRelatedDocs)
        answer = []
        postingAns = []
        if self.notRelatedCounts == 0:
            if len(relatedDocs) != 0:
                return relatedDocs, list(relatedPos)
            return [], []
        p1 = relatedDocs
        posting1 = relatedPos
        i = 0
        while i < self.notRelatedCounts:
            p2 = list(self.notRelatedDocs[i].keys())
            i += 1
            while p1 != [] and p2 != []:
                if p1[0] == p2[0]:
                    # Document contains an excluded term: skip it.
                    p1.pop(0)
                    posting1.pop(0)
                    p2.pop(0)
                elif p1[0] < p2[0]:
                    answer.append(p1[0])
                    postingAns.append(posting1[0])
                    posting1.pop(0)
                    p1.pop(0)
                else:
                    p2.pop(0)
        # Remaining related docs cannot match any excluded doc.
        for p in p1:
            answer.append(p)
        for posting in posting1:
            postingAns.append(posting)
        print('final docc')
        return answer, postingAns

    def phraseContainerDocs(self, phrase):
        """Positional intersection: keep documents where the phrase tokens
        occur at consecutive positions."""
        docs = np.array([dict() for _ in range(10)])  # one dict per token
        i = 0
        for p in phrase:
            docs[i] = self.indexTasks.getRelatedDocs(p)
            i += 1
        answer = []
        answer_posting = [[] for _ in range(50)]
        # Compare only as many posting dicts as there are phrase tokens;
        # the original len(docs) was always the fixed capacity (10).
        length = len(phrase)
        if length == 0:
            return [], []
        elif length == 1:
            return list(docs[0].keys()), list(docs[0].values())
        p2 = list(docs[0].keys())
        posting2 = list(docs[0].values())
        i = 1
        index = -1
        while i < length:
            p1 = list(docs[i].keys())
            posting1 = list(docs[i].values())
            i += 1
            while p1 != [] and p2 != []:
                if p1[0] == p2[0]:
                    # Keep the document only if some position of the previous
                    # token is immediately followed by the current token.
                    for posting in posting2[0]:
                        if (posting + 1) in posting1[0]:
                            if p1[0] not in answer:
                                answer.append(p1[0])
                                index += 1
                            answer_posting[index].append(posting + 1)
                    p1.pop(0)
                    p2.pop(0)
                    posting1.pop(0)
                    posting2.pop(0)
                elif p1[0] < p2[0]:
                    p1.pop(0)
                    posting1.pop(0)
                else:
                    p2.pop(0)
                    posting2.pop(0)
            p2 = answer
            posting2 = answer_posting
        print('double quote')
        print(answer)
        print(answer_posting)
        return answer, answer_posting
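# Standalone sketch of the intersection technique used by Query.merge and
# Query.finalMerge, shown on plain ascending doc-ID lists (illustrative
# helper, not part of the class):
def _intersect_sorted(a, b):
    """Intersect two ascending doc-ID lists by advancing the smaller head."""
    out = []
    i = j = 0
    while i < len(a) and j < len(b):
        if a[i] == b[j]:
            out.append(a[i])
            i += 1
            j += 1
        elif a[i] < b[j]:
            i += 1
        else:
            j += 1
    return out

# _intersect_sorted([1, 3, 5, 9], [3, 4, 5]) -> [3, 5]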
import re
import time

import numpy as np

from BusinessLayer.textOperations import FormWords
from DataLayer.constants import ConstantVars
# FileInOut is the project's file I/O helper; its module path is not shown
# in this file, so the import below is an assumption.
from DataLayer.fileInOut import FileInOut


class Index:
    """Builds a positional inverted index over the ir-news CSV collection."""

    def __init__(self):
        self.input = FileInOut()
        self.wordFormer = FormWords()
        self.constants = ConstantVars()
        self.dictionary = dict()
        # One {docID: [positions]} dict per term (fixed capacity).
        self.posting_list = np.array([dict() for j in range(150000)])
        self.dicIndex = 0
        self.docIndex = 0
        self.c = 0  # running token count

    def Filter(self, tokens, substr):
        # Replace stop words/punctuation with a '**' placeholder instead of
        # deleting them, so token positions stay valid for phrase queries.
        return [
            token if not any(sub == token for sub in substr) else '**'
            for token in tokens
        ]

    def makeDic(self, value, j):
        """Record an occurrence of `value` at position `j` of the current doc."""
        if value not in self.dictionary.keys() and value != '**':
            if '\n' in value:
                pass
            else:
                self.dictionary[value] = 1
                self.input.writeDic([value])
                self.posting_list[self.dicIndex][self.docIndex] = [j]
                self.dicIndex += 1
        elif value in self.dictionary.keys() and value != '**':
            term_index = list(self.dictionary.keys()).index(value)
            if self.docIndex in self.posting_list[term_index].keys():
                self.posting_list[term_index][self.docIndex].append(j)
            else:
                self.posting_list[term_index][self.docIndex] = [j]

    def indexData(self):
        for n in range(15):
            data = self.input.readData('ir-news-' + str(n) + '.csv')
            for d in data["content"]:
                print(self.docIndex)
                self.docIndex += 1
                d = self.cleanContent(d)
                d = self.wordFormer.normalize(d)
                tokens = self.wordFormer.tokenize(d)
                self.c += len(tokens)
                tokens = list(filter(lambda a: a != '\n', tokens))
                tokens = self.wordFormer.uniform(tokens)
                # postaged_tokens = self.wordFormer.posTagging(tokens)
                stemmed_tokens = self.wordFormer.stemmWords(tokens)
                lemmatized_tokens = self.wordFormer.lemmatizeWords(
                    stemmed_tokens)
                lemmatized_tokens = self.Filter(
                    lemmatized_tokens,
                    self.constants.punctuations() + ['"', '!', '', '\n'] +
                    self.constants.StopWords())
                list(map(self.makeDic, lemmatized_tokens,
                         range(len(lemmatized_tokens))))
            print('doc' + str(n) + ': ' + str(self.docIndex))
        # for i in range(len(list(self.dictionary.keys()))):
        #     print(i)
        #     print(list(self.dictionary.keys()).pop(i))
        # Persist the index: one doc-ID row and one serialized position row
        # per dictionary term.
        for i in range(0, len(self.posting_list)):
            self.input.writeDocID(self.posting_list[i])
            self.input.writePostingList([
                self.stringmaker(self.posting_list[i][key])
                for key in self.posting_list[i].keys()
            ])
        print('number of tokens')
        print(self.c)
        print(time.time())

    def getRelatedDocs(self, token):
        if token in self.dictionary:
            # Look the term up by its insertion order, mirroring makeDic;
            # the original np.where() call does not work on a dict.
            return self.posting_list[list(self.dictionary.keys()).index(token)]
        else:
            return {}

    def cleanContent(self, raw):
        """Strip HTML tags and character entities from raw article text."""
        cleaner = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
        cleanText = re.sub(cleaner, ' ', raw)
        return cleanText

    def stringmaker(self, positions):
        """Serialize a position list to a space-separated string."""
        stri = ''
        for i in positions:
            stri = stri + str(i) + ' '
        return stri
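# Shape sketch of the structures built above (illustrative values only):
# after indexing a first document (docIndex is incremented to 1 before the
# document is processed) whose filtered tokens are
# ['الف', '**', 'الف', 'ب'], one would expect
#
#     dictionary      == {'الف': 1, 'ب': 1}
#     posting_list[0] == {1: [0, 2]}   # positions of 'الف' in doc 1
#     posting_list[1] == {1: [3]}      # positions of 'ب' in doc 1
#
# '**' placeholders consume a position but are never indexed.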
import re

from DataLayer.constants import ConstantVars


def compile_patterns(patterns):
    # Helper assumed from the project's text utilities (not shown in this
    # file): pre-compile (pattern, replacement) pairs for reuse.
    return [(re.compile(pattern), repl) for pattern, repl in patterns]


class Tokenizer:
    def __init__(self):
        # Persian letters plus ZWNJ, diacritics, and common Arabic variants.
        self.alphabet = [
            'پ', 'چ', 'ج', 'ح', 'خ', 'ه', 'ع', 'غ', 'ف', 'ق', 'ث', 'ص', 'ض',
            'گ', 'ک', 'م', 'ن', 'ت', 'ا', 'ل', 'ب', 'آ', 'ی', 'س', 'ش', 'و',
            'ئ', 'د', 'ذ', 'ر', 'ز', 'ط', 'ظ', 'أ', 'ژ', '\u200c', 'ُ', 'ّ',
            'ة', 'ۀ', 'ؤ', 'ء', 'إ'
        ]
        self.constants = ConstantVars()
        self.after_verbs = {'ام', 'ای', 'ایم', 'اید', 'اند'}
        self.before_verbs = {'می', 'نمی'}
        self.suffixes = {
            'ی', 'ای', 'ها', 'های', 'تر', 'تری', 'ترین', 'گر', 'گری', 'ام',
            'ات', 'اش'
        }
        # Multi-word expressions rewritten with ZWNJ before tokenization.
        self.expression = []
        self.expression.extend([
            ('علی ای حال', 'علی\u200cای\u200cحال'),
            ('بنا بر این', 'بنابراین'),
            ('بنابر این', 'بنابراین'),
            ('مع ذالک', 'مع\u200cذالک'),
            ('فی ما بین', 'فی\u200cمابین'),
            ('فی مابین', 'فی\u200cمابین'),
            ('چنان چه', 'چنان\u200cچه'),
            ('در واقع', 'درواقع'),
            ('فی کل حال', 'فی\u200cکل\u200cحال'),
        ])
        self.expression = compile_patterns(self.expression)

    def word_tokenize(self, sentence):
        sentence = self.expression_corrector(sentence)
        splits = sentence.split(' ')
        sentence = ' '.join(self.token_spacing(splits))
        splits = sentence.split(' ')
        split = []

        def append_non_alpha(chunk):
            # Collapse repeated characters ('!!!' -> '!'). Pass flags by
            # keyword: the original passed re.UNICODE as re.sub's `count`
            # argument, silently capping the number of substitutions.
            chunk = re.sub(r'(.)\1+', r'\1', chunk, flags=re.UNICODE)
            if all(st in self.constants.punctuations() + ['"', '!']
                   for st in chunk):
                # Pure punctuation: emit one token per character.
                for st in chunk:
                    split.append(st)
            else:
                split.append(chunk)

        for s in splits:
            if s == '':
                continue
            string = ''   # run of alphabet characters
            string2 = ''  # run of everything else
            for w in s:
                if w in self.alphabet:
                    string = string + w
                else:
                    string2 = string2 + w
            # Emit the runs in the order they appear in the token.
            if s[0] in self.alphabet:
                if string != '':
                    split.append(string)
                if string2 != '':
                    append_non_alpha(string2)
            else:
                if string2 != '':
                    append_non_alpha(string2)
                if string != '':
                    split.append(string)
        return split

    def token_spacing(self, tokens):
        """Re-join tokens that belong together with a ZWNJ, e.g. a 'می'
        prefix with its verb or a noun with its suffix."""
        result = []
        for token in tokens:
            joined = False
            token_pair = ''
            if result:
                token_pair = result[-1] + '\u200c' + token
                if token in self.suffixes:
                    joined = True
                elif result[-1] in self.before_verbs:
                    joined = True
                elif token in self.after_verbs:
                    joined = True
            if joined:
                result.pop()
                result.append(token_pair)
            else:
                result.append(token)
        return result

    def expression_corrector(self, text):
        for pattern, repl in self.expression:
            text = pattern.sub(repl, text)
        return text
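# Usage sketch (illustrative; 'می' is joined to the following verb with a
# ZWNJ by token_spacing, and the repeated '!' is collapsed and split off):
#
#     t = Tokenizer()
#     t.word_tokenize('می روم!!')  # -> ['می\u200cروم', '!']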
from BusinessLayer.textOperations import FormWords
from DataLayer.constants import ConstantVars
# FileInOut is the project's file I/O helper; its module path is not shown
# in this file, so the import below is an assumption.
from DataLayer.fileInOut import FileInOut


class QueryProc:
    """Query evaluation against the index previously written to disk."""

    def __init__(self):
        self.input = FileInOut()
        self.Dic = self.input.readDic()
        self.DocID_file = self.input.readDocID()
        self.posting_file = self.input.readPostingList()
        self.wordFormer = FormWords()
        self.constants = ConstantVars()
        self.relatedDocs = []
        self.notRelatedDocs = []
        self.relatedDocsPos = []
        self.notRelatedDocsPos = []
        self.notRelatedCounts = 0

    def initializing(self, query):
        """Similarity-mode evaluation: union of term matches, intersected
        with phrase matches («...») and filtered by '!' exclusions."""
        print(query)
        query = self.wordFormer.normalize(query)
        print(query)
        query_tokens = self.wordFormer.tokenize(query)
        # Filter in one pass; list.remove() during iteration skips tokens.
        query_tokens = [
            token for token in query_tokens
            if token not in self.constants.punctuations()
            and token not in self.constants.StopWords()
        ]
        query_tokens = self.wordFormer.uniform(query_tokens)
        # postaged_tokens = self.wordFormer.posTagging(query_tokens)
        stemmed_tokens = self.wordFormer.stemmWords(query_tokens)
        lemmatized_tokens = self.wordFormer.lemmatizeWords(stemmed_tokens)

        k = 0                   # number of phrase groups seen so far
        not_include = False     # next term is negated by '!'
        order = False           # inside a «...» phrase
        orderTokens = [[] for _ in range(5)]
        for token in lemmatized_tokens:
            print(token)
            if token == "«" and not order:
                k += 1
                order = True
                continue
            if token == "»" and order:
                order = False
                continue
            if order:
                orderTokens[k - 1].append(token)
                continue
            if token == "!":
                not_include = True
                self.notRelatedCounts += 1
                continue
            if not_include:
                self.notRelatedDocs.append(self.getRelatedSavedDocs(token))
                self.notRelatedDocsPos.append(self.getRelatedSavedpos(token))
                not_include = False
            else:
                # Plain term: collect its documents and positions (the
                # original re-tested not_include here and double-appended
                # negated terms).
                self.relatedDocs.append(self.getRelatedSavedDocs(token))
                self.relatedDocsPos.append(self.getRelatedSavedpos(token))

        # Union (not intersection) of all per-term doc lists.
        related_result = []
        relatedPos = []
        for res in range(len(self.relatedDocs)):
            related_result = related_result + self.relatedDocs[res]
            relatedPos = relatedPos + self.relatedDocsPos[res]
        related_result = list(set(related_result))
        relatedPos = relatedPos[:len(related_result)]

        docs = []
        doc_pos = []
        j = 0
        if related_result != []:
            docs.append(related_result)
            doc_pos.append(relatedPos)
            j += 1
        for i in range(0, k):
            phrase_container, phrase_pos = self.phraseContainerDocs(
                orderTokens[i])
            docs.append(phrase_container)
            doc_pos.append(phrase_pos)
            j += 1
        final_result, final_pos = self.finalMerge(docs, doc_pos, j)
        relateds_and_not_unrelateds, related_position = self.notMerge(
            final_result, final_pos)
        return relateds_and_not_unrelateds, related_position

    def merge_common_docs(self, common_list, docList1, docList2, indexList1,
                          indexList2):
        """Merge two doc lists, concatenating positions of shared documents."""
        for doc in common_list:
            i1 = docList1.index(doc)
            i2 = docList2.index(doc)
            docList2.pop(i2)
            indexList1[i1] = indexList1[i1] + indexList2.pop(i2)
        indexList1 = indexList1 + indexList2
        docList1 = docList1 + docList2
        return indexList1, docList1

    def similarity_merge(self, docLists, indexLists):
        if len(docLists) == 0:
            return None, None
        docs = docLists.pop(0)
        indexes = list(filter(lambda n: n != [], indexLists.pop(0)))
        if len(docLists) == 0:
            return docs, indexes
        for doc in docLists:
            i = docLists.index(doc)
            doci = docLists.pop(i)
            dociPos = list(filter(lambda n: n != [], indexLists.pop(i)))
            common = list(set(doci) & set(docs))
            indexes, docs = self.merge_common_docs(common, docs, doci,
                                                   indexes, dociPos)
        return docs, indexes

    def processQueryBySimilarity(self, query):
        print('queryyy')
        print(query)
        docList, indexList = self.initializing(query)
        # related_result, related_pos = self.relatedDocs, self.relatedDocsPos
        # j = 0
        # if related_result != []:
        #     j += 1
        # for i in range(0, k):
        #     phrase_container, phrase_pos = self.phraseContainerDocs(orderTokens[i])
        #     related_result.append(phrase_container)
        #     related_pos.append(phrase_pos)
        #     j += 1
        # relateds_and_not_unrelateds, related_position = self.finalMerge(related_result, related_pos, j)
        # relateds_and_not_unrelateds, related_position = self.similarity_merge(related_result, related_pos)
        # docList, indexList = self.notMerge(relateds_and_not_unrelateds, related_position)
        return docList, indexList

    def processQuery(self, query):
        """Boolean-mode evaluation: intersection of term matches, phrase
        matches ("..."), and '!' exclusions."""
        query = self.wordFormer.normalize(query)
        query_tokens = self.wordFormer.tokenize(query)
        # Filter in one pass; list.remove() during iteration skips tokens.
        query_tokens = [
            token for token in query_tokens
            if token not in self.constants.punctuations()
            and token not in self.constants.StopWords()
        ]
        query_tokens = self.wordFormer.uniform(query_tokens)
        # postaged_tokens = self.wordFormer.posTagging(query_tokens)
        stemmed_tokens = self.wordFormer.stemmWords(query_tokens)
        lemmatized_tokens = self.wordFormer.lemmatizeWords(stemmed_tokens)

        i = 0
        k = 0
        not_include = False
        order = False
        orderTokens = [[] for _ in range(5)]
        for token in lemmatized_tokens:
            if token == "\"" and not order:
                k += 1
                order = True
                continue
            if token == "\"" and order:
                order = False
                continue
            if order:
                orderTokens[k - 1].append(token)
                continue
            if token == "!":
                not_include = True
                self.notRelatedCounts += 1
                continue
            if not_include:
                self.notRelatedDocs.append(self.getRelatedSavedDocs(token))
                self.notRelatedDocsPos.append(self.getRelatedSavedpos(token))
                not_include = False
            else:
                self.relatedDocs.append(self.getRelatedSavedDocs(token))
                self.relatedDocsPos.append(self.getRelatedSavedpos(token))
                i += 1

        related_result, relatedPos = self.merge(self.relatedDocs, i)
        docs = []
        doc_pos = []
        j = 0
        if related_result != []:
            docs.append(related_result)
            doc_pos.append(relatedPos)
            j += 1
        for i in range(0, k):
            phrase_container, phrase_pos = self.phraseContainerDocs(
                orderTokens[i])
            docs.append(phrase_container)
            doc_pos.append(phrase_pos)
            j += 1
        final_result, final_pos = self.finalMerge(docs, doc_pos, j)
        relateds_and_not_unrelateds, related_position = self.notMerge(
            final_result, final_pos)
        return relateds_and_not_unrelateds, related_position

    def merge(self, docs, length):
        """Intersect the doc lists of the first `length` terms, concatenating
        their position lists."""
        answer = []
        postingAns = []
        if length == 0:
            return [], []
        elif length == 1:
            return docs[0], self.relatedDocsPos[0]
        p2 = docs[0]
        postings2 = []
        for j in range(len(p2)):
            postings2.append(self.relatedDocsPos[0][j])
        i = 1
        while i < length:
            p1 = docs[i]
            postings1 = []
            for j in range(len(p1)):
                postings1.append(self.relatedDocsPos[i][j])
            i += 1
            # Classic sorted-list intersection: advance the smaller head.
            while p1 != [] and p2 != []:
                if p1[0] == p2[0]:
                    answer.append(p1[0])
                    postingAns.append(postings1[0] + postings2[0])
                    p1.pop(0)
                    p2.pop(0)
                    postings1.pop(0)
                    postings2.pop(0)
                elif p1[0] < p2[0]:
                    p1.pop(0)
                    postings1.pop(0)
                else:
                    p2.pop(0)
                    postings2.pop(0)
            p2 = answer
            postings2 = postingAns
        print('docc')
        print(answer)
        print(postingAns)
        return answer, postingAns

    def finalMerge(self, docs, docPos, length):
        """Intersect the term-merge result with each phrase result."""
        answer = []
        docPosAns = []
        if length == 0:
            return [], []
        elif length == 1:
            return list(docs[0]), list(docPos[0])
        p2 = list(docs[0])
        docPos2 = list(docPos[0])
        i = 1
        while i < length:
            p1 = list(docs[i])
            docPos1 = list(docPos[i])
            i += 1
            while p1 != [] and p2 != []:
                if p1[0] == p2[0]:
                    answer.append(p1[0])
                    docPosAns.append(docPos1[0] + docPos2[0])
                    p1.pop(0)
                    p2.pop(0)
                    docPos1.pop(0)
                    docPos2.pop(0)
                elif p1[0] < p2[0]:
                    p1.pop(0)
                    docPos1.pop(0)
                else:
                    p2.pop(0)
                    docPos2.pop(0)
            p2 = answer
            docPos2 = docPosAns
        return answer, docPosAns

    def notMerge(self, relatedDocs, relatedPos):
        """Drop every document that contains an excluded ('!') term."""
        print('no relate')
        print(self.notRelatedDocs)
        answer = []
        postingAns = []
        if self.notRelatedCounts == 0:
            if len(relatedDocs) != 0:
                return relatedDocs, list(relatedPos)
            return [], []
        p1 = relatedDocs
        posting1 = relatedPos
        i = 0
        while i < self.notRelatedCounts:
            p2 = self.notRelatedDocs[i]
            i += 1
            while p1 != [] and p2 != []:
                if p1[0] == p2[0]:
                    p1.pop(0)
                    posting1.pop(0)
                    p2.pop(0)
                elif p1[0] < p2[0]:
                    answer.append(p1[0])
                    postingAns.append(posting1[0])
                    posting1.pop(0)
                    p1.pop(0)
                else:
                    p2.pop(0)
        # Remaining related docs cannot match any excluded doc.
        for p in p1:
            answer.append(p)
        for posting in posting1:
            postingAns.append(posting)
        print('final docc')
        return answer, postingAns

    def phraseContainerDocs(self, phrase):
        """Positional intersection: keep documents where the phrase tokens
        occur at consecutive positions."""
        docs = []
        docsPos = []
        for p in phrase:
            docs.append(self.getRelatedSavedDocs(p))
            docsPos.append(self.getRelatedSavedpos(p))
        answer = []
        answer_posting = [[] for _ in range(50)]
        length = len(docs)
        if length == 0:
            return [], []
        elif length == 1:
            return docs[0], docsPos[0]
        p2 = docs[0]
        posting2 = docsPos[0]
        i = 1
        while i < length:
            # Reset the running answer for each additional phrase token.
            index = -1
            answer = []
            answer_posting = [[] for _ in range(50)]
            p1 = docs[i]
            posting1 = docsPos[i]
            i += 1
            while p1 != [] and p2 != []:
                if p1[0] == p2[0]:
                    # Keep the document only if some position of the previous
                    # token is immediately followed by the current token.
                    for posting in posting2[0]:
                        if (posting + 1) in posting1[0]:
                            if p1[0] not in answer:
                                answer.append(p1[0])
                                index += 1
                            answer_posting[index].append(posting + 1)
                    p1.pop(0)
                    p2.pop(0)
                    posting1.pop(0)
                    posting2.pop(0)
                elif p1[0] < p2[0]:
                    p1.pop(0)
                    posting1.pop(0)
                else:
                    p2.pop(0)
                    posting2.pop(0)
            p2 = answer
            posting2 = answer_posting
        print('double quote')
        print(answer)
        print(answer_posting)
        return answer, answer_posting

    def getRelatedSavedDocs(self, token):
        """Doc IDs for `token`, read back from the on-disk index."""
        if token in self.Dic:
            posting = list(map(int, self.DocID_file[self.Dic.index(token)]))
            print(posting)
            return posting
        return []

    def getRelatedSavedpos(self, token):
        """Per-document position lists for `token`, parsed from the
        serialized posting rows."""
        if token in self.Dic:
            row = self.posting_file[self.Dic.index(token)]
            # split() (rather than split(' ')) tolerates the trailing space
            # that Index.stringmaker writes after each position.
            return [list(map(int, row[j].split())) for j in range(len(row))]
        return []
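# Standalone sketch of the consecutive-position check used by
# phraseContainerDocs (illustrative helper, not part of the class):
def _phrase_positions(pos_a, pos_b):
    """Positions of token B that directly follow a position of token A."""
    return [p + 1 for p in pos_a if (p + 1) in pos_b]

# _phrase_positions([4, 10], [5, 7]) -> [5]
#
# End-to-end usage sketch (assumes the files written by Index.indexData
# already exist on disk):
#
#     qp = QueryProc()
#     docs, positions = qp.processQuery('اخبار "شفاف سازی" ! ورزشی')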