Esempi in Python per Inverted_Index, esempi in Python per bi_inverted_index.Inverted_Index

Esempio n. 1

0

Mostra file

File: boolean_retrieval.py Progetto: parnurzeal/boolean_retrieval_search

 def create(self,folder_path,index_path):
     """Build the inverted index from every file in *folder_path* and pickle it.

     Each regular file matched by ``*`` in *folder_path* becomes one document,
     recorded in ``self.docList`` as ``(doc_id, file_name)``.  Every
     whitespace-separated word that is not rejected by ``self.check_stopword``
     is inserted into ``self.inverted_index`` together with that document
     entry.  The finished index is written to *index_path* with ``pickle``.

     Args:
         folder_path: directory holding the documents to index.
         index_path: destination file for the pickled index.

     Returns:
         ``False`` when *folder_path* is not a directory (nothing is written),
         ``True`` once the index has been pickled.
     """
     # Drop any index left over from a previous run.
     if self.root is not None:
         self.inverted_index.clean(self.root)
         self.inverted_index = Inverted_Index()
         self.root = None
     # Always restart the document list so doc ids and list positions agree,
     # even when the previous run never created a root node.
     self.docList[:] = []

     if not os.path.isdir(folder_path):
         return False

     # Glob with the folder as a prefix instead of chdir()-ing into it: the
     # process working directory is never changed, so an exception cannot
     # leave the caller stranded in another directory.
     pattern = os.path.join(glob.escape(folder_path), "*")
     docCount = 0
     for path in glob.glob(pattern):
         # Sub-directories and other non-regular entries cannot be opened as
         # documents; skip them rather than crash half-way through.
         if not os.path.isfile(path):
             continue
         doc_entry = (docCount, os.path.basename(path))
         self.docList.append(doc_entry)
         with open(path) as fp:
             for line in fp:
                 for word in line.split():
                     if self.check_stopword(word):
                         continue
                     if self.root is None:
                         # The very first indexed word becomes the tree root.
                         self.root = self.inverted_index.createNode(word, doc_entry)
                         self.inverted_index.setRoot(self.root)
                     else:
                         self.inverted_index.insert(self.root, word, doc_entry)
         docCount += 1

     self.inverted_index.setDocCount(docCount)
     with open(index_path,'wb') as index_file:
         pickle.dump(self.inverted_index,index_file)
     return True

Esempio n. 2

0

Mostra file

File: boolean_retrieval.py Progetto: parnurzeal/boolean_retrieval_search

class Index_System:
    """Builds an ``Inverted_Index`` from the files of a folder and pickles it.

    Attributes:
        root: root node of the index tree, ``None`` until the first word is
            indexed.
        inverted_index: the ``Inverted_Index`` being filled.
        docList: ``(doc_id, file_name)`` tuples, one per indexed file.
        stop_words: words that are never indexed.
    """
    root, inverted_index, docList = None, None, []
    stop_words=["a","an","and","are","as","at","be","by","for","from","has","he","in","is","it","its","of","on","that","the","to","was","were","will","with"]

    def __init__(self):
        """Start with a freshly constructed index and an empty document list."""
        self.inverted_index = Inverted_Index()
        self.docList = []

    # TODO: term normalisation is not implemented yet.
    @staticmethod
    def normalize(word):
        """Placeholder for term normalisation; currently returns ``None``.

        Declared static so that both ``Index_System.normalize(word)`` and
        ``instance.normalize(word)`` can be called.
        """
        return None

    def check_stopword(self,word):
        """Return ``True`` when *word* is one of ``self.stop_words``."""
        return word in self.stop_words

    def create(self,folder_path,index_path):
        """Build the inverted index from every file in *folder_path* and pickle it.

        Each regular file matched by ``*`` in *folder_path* becomes one
        document, recorded in ``self.docList`` as ``(doc_id, file_name)``.
        Every whitespace-separated word that is not a stop word is inserted
        into ``self.inverted_index`` together with that document entry.  The
        finished index is written to *index_path* with ``pickle``.

        Args:
            folder_path: directory holding the documents to index.
            index_path: destination file for the pickled index.

        Returns:
            ``False`` when *folder_path* is not a directory (nothing is
            written), ``True`` once the index has been pickled.
        """
        # Drop any index left over from a previous run.
        if self.root is not None:
            self.inverted_index.clean(self.root)
            self.inverted_index = Inverted_Index()
            self.root = None
        # Always restart the document list so doc ids and list positions
        # agree, even when the previous run never created a root node.
        self.docList[:] = []

        if not os.path.isdir(folder_path):
            return False

        # Glob with the folder as a prefix instead of chdir()-ing into it:
        # the process working directory is never changed, so an exception
        # cannot leave the caller stranded in another directory.
        pattern = os.path.join(glob.escape(folder_path), "*")
        docCount = 0
        for path in glob.glob(pattern):
            # Sub-directories and other non-regular entries cannot be opened
            # as documents; skip them rather than crash half-way through.
            if not os.path.isfile(path):
                continue
            doc_entry = (docCount, os.path.basename(path))
            self.docList.append(doc_entry)
            with open(path) as fp:
                for line in fp:
                    for word in line.split():
                        if self.check_stopword(word):
                            continue
                        if self.root is None:
                            # The first indexed word becomes the tree root.
                            self.root = self.inverted_index.createNode(word, doc_entry)
                            self.inverted_index.setRoot(self.root)
                        else:
                            self.inverted_index.insert(self.root, word, doc_entry)
            docCount += 1

        self.inverted_index.setDocCount(docCount)
        with open(index_path,'wb') as index_file:
            pickle.dump(self.inverted_index,index_file)
        return True

    def show(self):
        """Print the whole index tree, framed by header and footer lines."""
        print("All inverted index data is as below:")
        print("--------------- Inverted Index ----------------")
        self.inverted_index.printTree(self.root)
        print("-----------------------------------------------")
        print()

Esempio n. 3

0

Mostra file

File: boolean_retrieval.py Progetto: parnurzeal/boolean_retrieval_search

 def __init__(self):
     """Constructor: store a freshly constructed ``Inverted_Index`` on the instance."""
     self.inverted_index = Inverted_Index()

Esempio n. 4

0

Mostra file

File: boolean_retrieval.py Progetto: parnurzeal/boolean_retrieval_search

class Search_System:
    """Boolean (``&&`` / ``||``) search over a loaded ``Inverted_Index``.

    Postings lists are merged by comparing ``posting[0]`` and are assumed to
    be sorted on that element.  Every postings list travels with a parallel
    list of scores taken from the term node's ``get_tf_idf_list``.
    """
    inverted_index = None

    def __init__(self):
        """Start with a freshly constructed ``Inverted_Index``."""
        self.inverted_index = Inverted_Index()

    @staticmethod
    def rank_answer(list, score_list):
        """Return the items of *list* ordered by descending score.

        *list* and *score_list* are paired positionally (extra items of the
        longer one are ignored); items with equal scores keep their input
        order.  Declared static so it can be called on the class or on an
        instance.
        """
        ranked = sorted(zip(list, score_list), key=lambda pair: pair[1], reverse=True)
        return [item for item, _score in ranked]

    def _lookup_with_scores(self, word, plist_hashlist, slist_hashlist):
        """Return ``(postings, scores)`` for a query term or intermediate key."""
        if word in plist_hashlist:
            return plist_hashlist[word], slist_hashlist[word]
        index = self.inverted_index
        # What lookup() returns for an unknown term is not visible from here;
        # treat any falsy result as "no postings" so the merges can run.
        postings = index.lookup(index.root, word) or []
        node = index.lookupNode(index.root, word)
        if node is None:
            return postings, []
        return postings, node.get_tf_idf_list(index.docCount, index.highest_tc)

    def search3(self,query):
        """Evaluate a boolean query and return the matching postings, ranked.

        The query is split on whitespace, converted to reverse Polish
        notation by ``shunting_yard.infixToRPN`` and evaluated with a stack.
        ``&&`` intersects and ``||`` unions the two operands; scores of
        documents present in both operands are added together.

        Returns:
            The matching postings ordered by descending score, or ``[]`` for
            an empty query or an unknown single term.
        """
        expression = shunting_yard.infixToRPN(query.split())
        if len(expression) == 0:
            return []

        # Intermediate results, keyed by tuples so that they can never be
        # confused with a query term (terms are strings).
        plist_hashlist = {}
        slist_hashlist = {}

        # Single-term query: rank that term's postings directly.
        if len(expression) == 1:
            postings, scores = self._lookup_with_scores(
                expression[0], plist_hashlist, slist_hashlist)
            return Search_System.rank_answer(postings, scores)

        stack = []
        result_count = 0
        for token in expression:
            if not shunting_yard.isOperator(token):
                stack.append(token)
                continue

            word1 = stack.pop()
            answer_list1, score_list1 = self._lookup_with_scores(
                word1, plist_hashlist, slist_hashlist)
            word2 = stack.pop()
            answer_list2, score_list2 = self._lookup_with_scores(
                word2, plist_hashlist, slist_hashlist)

            if token == "&&":
                answer_last = self.intersect(answer_list1, answer_list2)
                score_list_last = self.intersect_score(
                    answer_list1, answer_list2, score_list1, score_list2)
            elif token == "||":
                answer_last = self.union(answer_list1, answer_list2)
                score_list_last = self.union_score(
                    answer_list1, answer_list2, score_list1, score_list2)
            else:
                # NOTE(review): terminating the process from library code is
                # kept as-is because callers may rely on it.
                print("Wrong Operator: program will exit")
                exit()

            key = ("plist", result_count)
            result_count += 1
            plist_hashlist[key] = answer_last
            slist_hashlist[key] = score_list_last
            stack.append(key)

        # The top of the stack is normally an intermediate result; if the
        # expression ended on a bare term, look that term up instead of
        # failing with KeyError.
        last_word = stack.pop()
        answer, s_answer = self._lookup_with_scores(
            last_word, plist_hashlist, slist_hashlist)
        print(answer)
        print(s_answer)
        return Search_System.rank_answer(answer, s_answer)

    def union(self, sorted_plist1, sorted_plist2):
        """Merge two postings lists (boolean OR), keeping shared postings once."""
        answer = []
        p1, p2 = 0, 0
        while p1 < len(sorted_plist1) and p2 < len(sorted_plist2):
            left, right = sorted_plist1[p1], sorted_plist2[p2]
            if left == right:
                answer.append(left)
                p1 += 1
                p2 += 1
            elif left[0] < right[0]:
                answer.append(left)
                p1 += 1
            else:
                answer.append(right)
                p2 += 1
        # At most one of the two lists still has postings left.
        answer.extend(sorted_plist1[p1:])
        answer.extend(sorted_plist2[p2:])
        return answer

    # [2012/08/22 added] find union of 2 score lists
    def union_score(self, sorted_plist1, sorted_plist2,slist1,slist2):
        """Return the scores matching ``union(sorted_plist1, sorted_plist2)``.

        *slist1* / *slist2* hold one score per posting of the corresponding
        list.  A posting found in both lists gets the sum of its two scores.
        """
        s_answer = []
        p1, p2 = 0, 0
        while p1 < len(sorted_plist1) and p2 < len(sorted_plist2):
            left, right = sorted_plist1[p1], sorted_plist2[p2]
            if left == right:
                s_answer.append(slist1[p1] + slist2[p2])
                p1 += 1
                p2 += 1
            elif left[0] < right[0]:
                s_answer.append(slist1[p1])
                p1 += 1
            else:
                s_answer.append(slist2[p2])
                p2 += 1
        # Index by position (not slice) so a too-short score list still
        # raises instead of silently producing fewer scores than postings.
        while p1 < len(sorted_plist1):
            s_answer.append(slist1[p1])
            p1 += 1
        while p2 < len(sorted_plist2):
            s_answer.append(slist2[p2])
            p2 += 1
        return s_answer

    def intersect(self,sorted_plist1,sorted_plist2):
        """Return the postings present in both lists (boolean AND)."""
        answer = []
        p1, p2 = 0, 0
        while p1 < len(sorted_plist1) and p2 < len(sorted_plist2):
            left, right = sorted_plist1[p1], sorted_plist2[p2]
            if left == right:
                answer.append(left)
                p1 += 1
                p2 += 1
            elif left[0] < right[0]:
                p1 += 1
            else:
                p2 += 1
        return answer

    # [2012/08/22 added] find intersection of 2 score lists
    def intersect_score(self,sorted_plist1,sorted_plist2,slist1,slist2):
        """Return the scores matching ``intersect(sorted_plist1, sorted_plist2)``.

        Each shared posting gets the sum of its score in *slist1* and its
        score in *slist2*.
        """
        s_answer = []
        p1, p2 = 0, 0
        while p1 < len(sorted_plist1) and p2 < len(sorted_plist2):
            left, right = sorted_plist1[p1], sorted_plist2[p2]
            if left == right:
                s_answer.append(slist1[p1] + slist2[p2])
                p1 += 1
                p2 += 1
            elif left[0] < right[0]:
                p1 += 1
            else:
                p2 += 1
        return s_answer

    def load_index(self,path):
        """Replace the current index with the one pickled at *path*.

        Returns:
            ``False`` (after printing a message) when *path* is not a file;
            the index currently in memory is then left untouched.  ``True``
            once the new index has been loaded.
        """
        # Validate first: a bad path must not destroy the index in memory.
        if not os.path.isfile(path):
            print("no such file - "+path)
            return False
        self.inverted_index.clean(self.inverted_index.root)
        with open(path,'rb') as index_file:
            # NOTE(security): unpickling can execute arbitrary code; only
            # load index files that come from a trusted source.
            self.inverted_index = pickle.load(index_file)
        return True