def booleanQuery(self):
    '''Answer the current query as a conjunction of its terms.

    A query like "A B C" is treated as "A AND B AND C": the postings list
    of every term in the module-level ``Queryterm`` is fetched through
    ``InvertedIndex.getPostingsList`` and the lists are intersected.

    Only terms whose postings list has between 1 and 299 entries take part.
    NOTE(review): the upper bound of 300 is kept from the original code and
    looks like a cut-off for very frequent terms -- confirm.

    Returns:
        A list of docIDs sorted by their integer value. The list is empty
        when no term survives the filter.
    '''
    max_postings = 300

    # Query terms are unique dictionary keys, so a repeated term simply
    # overwrites its own entry.
    postings_by_term = {}
    for qterm in Queryterm:
        postings_by_term[qterm] = InvertedIndex.getPostingsList(qterm)

    selected = [
        term
        for term, postings in postings_by_term.items()
        if 0 < len(postings) < max_postings
    ]

    if not selected:
        # Without this guard set.intersection() would be called with no
        # arguments and raise TypeError.
        print("Given query has no matched Document", ''.join(Query))
        return []

    if len(selected) == 1:
        # A single usable term: its postings list is the answer as-is.
        booleanResult = postings_by_term[selected[0]]
        print("Result of the search query ", booleanResult)
        return sorted(booleanResult, key=int)

    # Documents that contain every selected term.
    booleanResult = set.intersection(
        *(set(postings_by_term[term]) for term in selected)
    )

    if not booleanResult:
        # No document holds all terms: fall back to collecting whatever
        # InvertedIndex.mergeList yields for each pair of adjacent terms
        # (presumably the pairwise intersection -- confirm).
        for left, right in zip(selected, selected[1:]):
            merged = InvertedIndex.mergeList(
                postings_by_term[left], postings_by_term[right]
            )
            if merged:
                booleanResult.update(merged)

    return sorted(booleanResult, key=int)
def vectorQuery(self, k):
    '''Rank documents against the current query by cosine similarity.

    Query and document vectors are built from TF-IDF weights over the
    terms in the module-level ``Queryterm``:

    * query weight of a term    = (occurrences of the term in the query)
                                  * (1 + IDF[term])
    * document weight of a term = TF[term][doc] * IDF[term]

    Side effects: writes ``TF.json`` and ``IDF.json`` through
    ``indexobj.save`` and appends ``{qid: full_ranking}`` to the
    module-level ``VectorResult``.

    Args:
        k: number of top-ranked documents to return.

    Returns:
        A tuple ``(top, k)`` where ``top`` is a list of at most ``k``
        ``(docID, similarity)`` pairs in descending order of similarity.
    '''
    # Term frequencies and IDF values for the query terms over the collection.
    termfrequency, IDF = postingobj.term_freq(collectionfile, Queryterm)

    # Persist the statistics and read them back. The JSON round trip is
    # kept from the original code: after json.load every dict key is a
    # string. NOTE(review): presumably that is what makes the keys
    # comparable with the postings below -- confirm before removing it.
    indexobj.save(termfrequency, "TF.json")
    indexobj.save(IDF, "IDF.json")
    with open("TF.json") as tf_file:
        TF = json.load(tf_file)
    with open("IDF.json") as idf_file:
        IDF = json.load(idf_file)

    # Postings list per distinct query term, and how often each term
    # occurs in the query.
    postings_by_term = {}
    query_tf = {}
    for term in Queryterm:
        postings_by_term[term] = InvertedIndex.getPostingsList(term)
        query_tf[term] = query_tf.get(term, 0) + 1

    # term -> {docID: weight} for the query side and the document side.
    query_weights = {}
    doc_weights = {}
    for term, postings in postings_by_term.items():
        if term not in IDF:
            continue
        idf = IDF[term]
        query_weight = query_tf[term] * (1 + idf)
        # A term without recorded term frequencies contributes no
        # document weight instead of raising KeyError.
        doc_tf = TF.get(term, {})
        for doc_id in postings:
            query_weights.setdefault(term, {})[doc_id] = query_weight
            if doc_id in doc_tf:
                doc_weights.setdefault(term, {})[doc_id] = doc_tf[doc_id] * idf

    # Rows are docIDs, columns are query terms.
    Querymatrix = pd.DataFrame(query_weights)
    DocTFIDFmatrix = pd.DataFrame(data=doc_weights)

    # Align both matrices by label rather than by position so that a
    # docID or term missing on one side cannot shift or drop other rows,
    # and so both vectors always have the same length.
    DocTFIDFmatrix = DocTFIDFmatrix.reindex(columns=Querymatrix.columns)
    shared_docs = Querymatrix.index.intersection(DocTFIDFmatrix.index)

    DocSim = []
    for doc_id in shared_docs:
        query_vec = np.array(Querymatrix.loc[doc_id], dtype=float)
        query_vec[np.isnan(query_vec)] = 0
        doc_vec = np.array(DocTFIDFmatrix.loc[doc_id], dtype=float)
        doc_vec[np.isnan(doc_vec)] = 0
        cosine = QueryProcessor.cosine_similaritys(query_vec, doc_vec)
        DocSim.append((int(doc_id), cosine))

    # Full ranking, best match first; the top k are taken AFTER sorting.
    VectorID = sorted(DocSim, key=lambda pair: pair[1], reverse=True)
    TopID = VectorID[:k]

    VectorResult.append({qid: VectorID})
    return TopID, k