Ejemplo n.º 1
0
 def __init__(self, arg_stop_words, inverted_index, postingIndexFile):
     """Set up the instance state.

     arg_stop_words is passed through self.stop_words_set and the result
     is kept as self.stop_words. inverted_index and postingIndexFile are
     stored as given, and a fresh PorterStemmer is kept as self.p.
     """
     # Convert the stop-word argument first, exactly once.
     converted_stop_words = self.stop_words_set(arg_stop_words)
     self.stop_words = converted_stop_words

     # Plain pass-through attributes.
     self.inverted_index = inverted_index
     self.postingIndexFile = postingIndexFile

     # Stemmer instance held on the object.
     self.p = PorterStemmer()
Ejemplo n.º 2
0
 def parse_collection(self, arg_collection):
     """Parse a tagged page collection and rebuild the inverted index.

     arg_collection is any iterable of text lines (an open file, a list
     of strings, a generator). Lines that start with '<' are split at the
     first '>' and dispatched on the tag name:

     * 'page'  - clears the text accumulated so far.
     * 'id'    - the characters before the next '<' are converted with
       int() and become the current page id.
     * 'title' - the characters before the next '<' become the current
       title, which is also written to self.title_index under the
       current page id.
     * 'text'  - the body is gathered up to a line ending in '</text>'
       (continuation lines are joined with single spaces) and stored,
       as the title, a newline, then the text, under the current page id.

     Every stored document is then lower-cased, split into tokens made of
     the characters a-z and 0-9, filtered against self.stop_words,
     stemmed with a PorterStemmer, and re-joined with single spaces. The
     resulting mapping is passed to self.build_inverted_index and the
     return value is stored in self.inverted_index.

     Returns None.

     Raises ValueError if an 'id' line does not contain an integer, and
     IndexError if an 'id', 'title' or 'text' tag line has no '>'.
     """
     import re

     closing_tag = '</text>'

     def strip_eol(raw):
         # Remove only the line terminator so a final line that has no
         # newline keeps all of its characters.
         return raw.rstrip('\r\n')

     pageID = -1
     title = ''
     text = ''
     id_page_dict = {}

     # One shared iterator: the 'text' branch consumes continuation lines
     # from it and the outer loop then resumes after the closing tag. This
     # also works when arg_collection is a list rather than a file object.
     lines = iter(arg_collection)

     # classify tags
     for line in lines:
         if not line.startswith('<'):
             continue
         parts = line.split('>', 1)
         head = parts[0][1:]

         if head == 'page':
             text = ''
         elif head == 'id':
             pageID = int(parts[1].split('<')[0])
         elif head == 'title':
             title = parts[1].split('<')[0]
             # Keyed by the most recently seen page id (initially -1).
             # NOTE(review): this is only the right key if the 'id' line
             # precedes the 'title' line in the collection - confirm
             # against the input format.
             self.title_index[pageID] = title
         elif head == 'text':
             first = strip_eol(parts[1])
             if first.endswith(closing_tag):
                 text += first[:-len(closing_tag)]
             else:
                 pieces = [first]
                 for continuation in lines:
                     continuation = strip_eol(continuation)
                     if continuation.endswith(closing_tag):
                         pieces.append(continuation[:-len(closing_tag)])
                         break
                     pieces.append(continuation)
                 text += ' '.join(pieces)
             id_page_dict[pageID] = title + '\n' + text

     # lower-case, tokenize, drop stop words, stem
     non_alnum_run = re.compile(r'[^a-z0-9]+')
     stop_words = self.stop_words
     stemmer = PorterStemmer()
     for page_id, raw in id_page_dict.items():
         # Each run of characters outside a-z / 0-9 acts as one separator;
         # split() without arguments also discards empty tokens.
         tokens = non_alnum_run.sub(' ', raw.lower()).split()
         stems = [stemmer.stem(token, 0, len(token) - 1)
                  for token in tokens
                  if token not in stop_words]
         # Only values of existing keys are replaced, so the dict size
         # does not change during iteration.
         id_page_dict[page_id] = ' '.join(stems)

     # build inverted index
     self.inverted_index = self.build_inverted_index(id_page_dict)