Python InvertedIndex.getPostingsList Examples

Programming Language: Python

Namespace/Package Name: index

Class/Type: InvertedIndex

Method/Function: getPostingsList

Examples at hotexamples.com: 2

Python InvertedIndex.getPostingsList - 2 examples found. These are the top-rated real-world Python examples of index.InvertedIndex.getPostingsList, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

InvertedIndex(25)

load(15)

idf(4)

support(3)

indexingCranfield(2)

getPostingsList(2)

find(2)

count(2)

get_items_inverted(1)

get_total_number_Doc(1)

build(1)

create_index(1)

loadData(1)

load_csv(1)

mergeList(1)

prepare_disk(1)

save(1)

tfidf(1)

Example #1

Show file

File: query.py Project: vaishnavivisweswaraiah/InformationRetrieval_searchEngine

    def booleanQuery(self):
        """Run a conjunctive boolean query over the module-level ``Queryterm``.

        A query like "A B C" is treated as "A AND B AND C": the postings list
        of every usable term is fetched with ``InvertedIndex.getPostingsList``
        and the lists are intersected. If that strict intersection is empty,
        the result falls back to the union of ``InvertedIndex.mergeList``
        applied to each pair of adjacent usable terms.

        A term is usable when its postings list is non-empty and has fewer
        than 300 entries.

        Returns:
            list: matching docIDs ordered by their integer value. Empty when
            no query term is usable (this case used to raise ``TypeError``).
        """
        # Upper bound on postings-list length for a term to take part in the
        # query (presumably filters out very common terms -- TODO confirm).
        max_postings = 300

        # Every distinct query term mapped to its postings list.
        postings_by_term = {
            term: InvertedIndex.getPostingsList(term) for term in Queryterm
        }
        usable_terms = [
            term for term, postings in postings_by_term.items()
            if 0 < len(postings) < max_postings
        ]

        if not usable_terms:
            # set.intersection() called with no operands raises TypeError, so
            # the "nothing to intersect" case is answered explicitly.
            print("Given query has no matched Document",
                  ''.join(Query))
            return []

        if len(usable_terms) == 1:
            # A single usable term: its postings list is the answer as-is.
            matches = postings_by_term[usable_terms[0]]
            print("Result of the search query ", matches)
            return sorted(matches, key=int)

        # Strict AND: documents present in every usable term's postings.
        result = set.intersection(
            *(set(postings_by_term[term]) for term in usable_terms))

        if not result:
            # No document contains all terms: relax to adjacent term pairs
            # and collect whatever mergeList yields for each pair.
            for left, right in zip(usable_terms, usable_terms[1:]):
                result.update(
                    InvertedIndex.mergeList(postings_by_term[left],
                                            postings_by_term[right]))
        return sorted(result, key=int)

Example #2

Show file

File: query.py Project: vaishnavivisweswaraiah/InformationRetrieval_searchEngine

    def vectorQuery(self, k):
        """Rank documents against the current query by cosine similarity.

        TF-IDF weights are built for the module-level ``Queryterm`` tokens and
        for the documents in their postings lists. Each document row is scored
        against the matching query row with
        ``QueryProcessor.cosine_similaritys``. The complete ranking is
        appended to the module-level ``VectorResult`` under ``qid``.

        Args:
            k: number of best-ranked documents to return. ``None`` falls back
                to 10, the size that was previously hard-coded.

        Returns:
            tuple: ``(top, k)`` where ``top`` is a list of
            ``(docID, similarity)`` pairs in descending similarity order, at
            most ``k`` long, and ``k`` is passed back unchanged.
        """
        # Compute TF and IDF for the query terms and persist them.
        termfrequency, IDF = postingobj.term_freq(collectionfile, Queryterm)
        indexobj.save(termfrequency, "TF.json")
        indexobj.save(IDF, "IDF.json")
        # Work from the saved JSON (as the code always did); the context
        # managers make sure both file handles are closed again.
        with open("TF.json") as tf_file:
            TF = json.load(tf_file)
        with open("IDF.json") as idf_file:
            IDF = json.load(idf_file)

        # Postings list and raw in-query frequency of every query term.
        postings_by_term = {}
        query_tf = {}
        for term in Queryterm:
            postings_by_term[term] = InvertedIndex.getPostingsList(term)
            query_tf[term] = query_tf.get(term, 0) + 1

        query_weights = {}  # term -> {docID: query-side tf * (1 + idf)}
        doc_weights = {}    # term -> {docID: document tf * idf}
        for term, postings in postings_by_term.items():
            if term not in IDF:
                # Terms without an IDF entry contribute to neither vector.
                continue
            idf = IDF[term]
            query_weight = query_tf[term] * (1 + idf)
            for pos in postings:
                query_weights.setdefault(term, {})[pos] = query_weight
                tf_by_doc = TF[term]
                # Only documents that also have a TF entry for this term get
                # a document-side weight.
                if pos in tf_by_doc:
                    doc_weights.setdefault(term, {})[pos] = \
                        tf_by_doc[pos] * idf

        # Rows are docIDs, columns are terms; missing cells become NaN.
        query_matrix = pd.DataFrame(query_weights)
        doc_matrix = pd.DataFrame(data=doc_weights)

        doc_sim = []
        # NOTE(review): rows are paired by position and kept only when the
        # labels agree, so a docID missing from one matrix can shift and drop
        # later rows -- confirm both indexes always line up.
        for q_id, d_id in zip(list(query_matrix.index),
                              list(doc_matrix.index)):
            if q_id == d_id:
                q_vec = np.array(query_matrix.loc[q_id])
                q_vec[np.isnan(q_vec)] = 0
                d_vec = np.array(doc_matrix.loc[d_id])
                d_vec[np.isnan(d_vec)] = 0
                cosine = QueryProcessor.cosine_similaritys(q_vec, d_vec)
                doc_sim.append((int(q_id), cosine))

        ranked = sorted(doc_sim, key=lambda pair: pair[1], reverse=True)
        VectorResult.append({qid: ranked})

        # Slice the *sorted* ranking so the caller really gets the k best
        # documents (previously: the first 10 unsorted entries, k ignored).
        limit = 10 if k is None else int(k)
        return ranked[:limit], k