Example #1
def stem_file(filename, vocab, stop_word=None):
    """Stem every word in the file, register the stems in vocab (skipping
    stop words), and return a dict mapping each stem to its count."""
    p = PorterStemmer()
    word_count = {}
    infile = open(filename, 'r')
    while 1:
        word = ''
        line = infile.readline()
        if line == '':
            break
        for c in line:
            if c.isalpha():
                word += c.lower()
            else:
                if word:
                    stemmed_word = p.stem(word, 0, len(word) - 1)
                    word = ''
                    if stemmed_word is None or len(stemmed_word) == 0:
                        continue
                    if stop_word is not None and stop_word.get_id_from_token(stemmed_word) != -1:
                        continue

                    vocab.add_token(stemmed_word)
                    count = word_count.get(stemmed_word, 0) + 1
                    word_count[stemmed_word] = count
    infile.close()
    return word_count
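The example above relies on the caller's PorterStemmer and vocab objects. For comparison, a minimal self-contained sketch of the same word-counting idea, assuming NLTK is installed (its PorterStemmer.stem() takes the whole word rather than start/end offsets) and using a plain set of stop words in place of the stop_word object:

from collections import Counter

from nltk.stem import PorterStemmer  # assumption: NLTK is available


def count_stems(filename, stop_words=frozenset()):
    # Sketch only: count stemmed word frequencies in a text file.
    stemmer = PorterStemmer()
    counts = Counter()
    with open(filename, 'r') as infile:
        for line in infile:
            word = ''
            for c in line:
                if c.isalpha():
                    word += c.lower()
                else:
                    if word:
                        stem = stemmer.stem(word)
                        if stem and stem not in stop_words:
                            counts[stem] += 1
                    word = ''
    return counts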
Example #2
def stem_file(filename, vocab, stop_word=None):
    """Stem every word in the file, register the stems in vocab (skipping
    stop words), and return the stems as a list in document order."""
    p = PorterStemmer()
    article = []
    infile = open(filename, 'r')
    while 1:
        word = ''
        line = infile.readline()
        if line == '':
            break
        for c in line:
            if c.isalpha():
                word += c.lower()
            else:
                if word:
                    stemmed_word = p.stem(word, 0, len(word) - 1)
                    word = ''
                    if stemmed_word is None or len(stemmed_word) == 0:
                        continue
                    if stop_word is not None and stop_word.get_id_from_token(
                            stemmed_word) != -1:
                        continue

                    vocab.add_token(stemmed_word)
                    article.append(stemmed_word)
    infile.close()
    # random.shuffle(article)
    return article
Example #3
class queryIndex:
    def __init__(self, arg_stop_words, inverted_index, postingIndexFile):
        self.stop_words = self.stop_words_set(arg_stop_words)
        self.inverted_index = inverted_index
        self.p = PorterStemmer()
        self.postingIndexFile = postingIndexFile

    #build stop_words_set
    def stop_words_set(self, arg_stop_words):
        stop_words_set = set()
        for line in arg_stop_words:
            line = line.rstrip('\n')
            stop_words_set.add(line)
        return stop_words_set

    # classify the query: quoted phrase (PQ), single word (OWQ),
    # boolean with AND/OR (BQ), or free text (FTQ)
    def determine_query(self, query):
        if query[0] == '"' and query[-1] == '"':
            return 'PQ'
        word_list = query.split()
        if len(word_list) == 1:
            return 'OWQ'
        elif 'OR' in word_list or 'AND' in word_list:
            return 'BQ'
        else:
            return 'FTQ'

    def owq(self, query):
        query = query.lower()
        tokened_query = ''
        i = 0
        for i in range(len(query)):
            if 'a' <= query[i] <= 'z' or '0' <= query[i] <= '9':
                tokened_query += query[i]
        stemed_query = self.p.stem(tokened_query, 0, len(tokened_query) - 1)
        if stemed_query in self.stop_words:
            return []
        return [stemed_query]

    def ftq(self, query):
        query = query.lower()
        tokened_query = ''
        flag = False
        i = 0
        for i in range(len(query)):
            if 'a' <= query[i] <= 'z' or '0' <= query[i] <= '9':
                tokened_query += query[i]
                flag = False
            else:
                if not flag:
                    tokened_query += ' '
                    flag = True
        tokened_query = tokened_query.rstrip()
        query_list = tokened_query.split(' ')
        query_list = [token for token in query_list
                      if token not in self.stop_words]
        if not query_list:
            return []
        stemmed_list = []
        for word in query_list:
            stemmed_list.append(self.p.stem(word, 0, len(word) - 1))
        return stemmed_list

    def bq(self, bool_tuples):
        parsed_list = []
        for word in bool_tuples[1]:
            if isinstance(word, tuple):
                parsed_list.append(self.bq(word))
            else:
                query_list = self.ftq(word)
                if query_list:
                    token = ' '.join(query_list)
                    parsed_list.append(token)
        parsed_tuple = (bool_tuples[0].split(), parsed_list)
        return parsed_tuple

    def parse_query(self, query):
        query_type = self.determine_query(query)
        if query_type == 'BQ':
            res = bool_expr_ast(query)
            return query_type, self.bq(res)
        elif query_type == 'OWQ':
            return query_type, self.owq(query)
        elif query_type == 'FTQ':
            return query_type, self.ftq(query)
        else:
            query = query[1:-1]
            return query_type, self.ftq(query)

    def merge_result_and(self, result1, result2):
        result = set(result1).intersection(set(result2))
        return list(result)

    def merge_result_or(self, result1, result2):
        result = set(result1).union(set(result2))
        return list(result)

    def match_bq_query(self, query_content):
        doc_list = []
        for item in query_content[1]:
            if isinstance(item, tuple):
                doc_list = doc_list + [self.match_bq_query(item)]
            else:
                if item not in self.inverted_index:
                    value_list = []
                else:
                    value_list = self.getPostingList(item)
                doc_list = doc_list + [value_list]
        index = 1
        result_list = doc_list[0]
        while index < len(doc_list):
            if query_content[0] == ['AND']:
                result_list = self.merge_result_and(result_list,
                                                    doc_list[index])
            else:
                result_list = self.merge_result_or(result_list,
                                                   doc_list[index])
            index += 1
        return result_list

    def getPostingList(self, term, query_type=None):
        offset_len = self.inverted_index[term]
        offset_len_list = offset_len.split(' ')
        self.postingIndexFile.seek(int(offset_len_list[0]))
        postingIndex = self.postingIndexFile.read(int(offset_len_list[1]))
        postingIndex = postingIndex.rstrip('\n')
        posting_pos = postingIndex.split('|')
        posting = posting_pos[0].rstrip(' ')
        posting = posting.split(' ')
        if query_type is None:
            return posting
        else:
            pos = posting_pos[1].split(';')
            pos_list = []
            for item in pos:
                if item:
                    pos_list += [item.split(' ')]
            return [posting] + [pos_list]

    def matching_documents(self, query):
        parsed_query = self.parse_query(query)
        query_type = parsed_query[0]
        query_content = parsed_query[1]
        if query_type == 'BQ':
            return self.match_bq_query(query_content)
        elif query_type == 'OWQ':
            if query_content[0] not in self.inverted_index:
                return []
            res_list = self.getPostingList(query_content[0])
            return res_list
        elif query_type == 'FTQ':
            result_list = []
            for word in query_content:
                if word not in self.inverted_index:
                    temp_list = []
                else:
                    temp_list = self.getPostingList(word)
                result_list = self.merge_result_or(result_list, temp_list)
            return result_list
        else:
            if query_content[0] not in self.inverted_index:
                page_list_1 = []
                pos_list_1 = []
            else:
                first_word_list = self.getPostingList(query_content[0], 'PQ')
                page_list_1 = first_word_list[0]
                pos_list_1 = first_word_list[1]
            index = 1
            while index in range(len(query_content)):
                page_list = []
                pos_list = []
                if query_content[index] in self.inverted_index:
                    item_list = self.getPostingList(query_content[index], 'PQ')
                    page_list_2 = item_list[0]
                    pos_list_2 = item_list[1]
                    i = 0
                    j = 0
                    while i < len(page_list_1) and j < len(page_list_2):
                        if int(page_list_1[i]) == int(page_list_2[j]):
                            ii = 0
                            jj = 0
                            temp_list = []
                            while ii < len(pos_list_1[i]) and jj < len(
                                    pos_list_2[j]):
                                if int(pos_list_1[i][ii]) + 1 == int(
                                        pos_list_2[j][jj]):
                                    temp_list.append(int(pos_list_2[j][jj]))
                                    ii += 1
                                    jj += 1
                                elif int(pos_list_1[i][ii]) + 1 < int(
                                        pos_list_2[j][jj]):
                                    ii += 1
                                else:
                                    jj += 1
                            if temp_list:
                                pos_list += [temp_list]
                                page_list.append(page_list_2[j])
                            i += 1
                            j += 1
                        elif int(page_list_1[i]) < int(page_list_2[j]):
                            i += 1
                        else:
                            j += 1
                index += 1
                page_list_1 = page_list
                pos_list_1 = pos_list
            return page_list_1
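The phrase-query branch of matching_documents above is a positional intersection: it walks the posting lists of consecutive terms and keeps only documents in which each term occurs at a position exactly one past the previous term. A minimal standalone sketch of that adjacency merge, using a hypothetical {term: {doc_id: positions}} layout instead of the class's offset-encoded posting file:

def phrase_match(postings, terms):
    # postings: {term: {doc_id: [positions]}}  (hypothetical layout)
    # Return the doc ids that contain `terms` as a consecutive phrase.
    if not terms or any(t not in postings for t in terms):
        return []
    current = {doc: set(pos) for doc, pos in postings[terms[0]].items()}
    for term in terms[1:]:
        survivors = {}
        for doc, positions in postings[term].items():
            if doc in current:
                # keep only positions that directly follow a surviving position
                hits = {p for p in positions if p - 1 in current[doc]}
                if hits:
                    survivors[doc] = hits
        current = survivors
    return sorted(current)

# e.g. phrase_match({'new': {1: [3], 2: [7]}, 'york': {1: [4]}}, ['new', 'york'])
# returns [1]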
Example #4
 def __init__(self, arg_stop_words, inverted_index, postingIndexFile):
     self.stop_words = self.stop_words_set(arg_stop_words)
     self.inverted_index = inverted_index
     self.p = PorterStemmer()
     self.postingIndexFile = postingIndexFile
Example #5
 def __init__(self, arg_stop_words, inverted_index, postingIndexFile):
     self.stop_words = self.stop_words_set(arg_stop_words)
     self.inverted_index = inverted_index
     self.p = PorterStemmer()
     self.postingIndexFile = postingIndexFile
Example #6
class queryIndex:
    def __init__(self, arg_stop_words, inverted_index, postingIndexFile):
        self.stop_words = self.stop_words_set(arg_stop_words)
        self.inverted_index = inverted_index
        self.p = PorterStemmer()
        self.postingIndexFile = postingIndexFile
        
    #build stop_words_set
    def stop_words_set(self, arg_stop_words):
        stop_words_set = set()
        for line in arg_stop_words:
            line = line.rstrip('\n')
            stop_words_set.add(line)
        return stop_words_set
    
    def determine_query(self, query):    
        if query[0] == '"' and query[-1] == '"':
            return 'PQ'
        word_list = query.split()
        if len(word_list) == 1:
            return 'OWQ'
        elif 'OR' in word_list or 'AND' in word_list:
            return 'BQ'
        else:
            return 'FTQ'

    def owq(self, query):
        query = query.lower()
        tokened_query = ''
        i = 0
        for i in range(len(query)):
            if 'a' <= query[i] <= 'z' or '0' <= query[i] <= '9':
                tokened_query += query[i]
        stemed_query = self.p.stem(tokened_query, 0, len(tokened_query) - 1)
        if stemed_query in self.stop_words:
            return []
        return [stemed_query]
    
    def ftq(self, query):
        query = query.lower()
        tokened_query = ''
        flag = False
        i = 0
        for i in range(len(query)):
            if 'a' <= query[i] <= 'z' or '0' <= query[i] <= '9':
                tokened_query += query[i]
                flag = False
            else:
                if not flag:
                    tokened_query += ' '
                    flag = True
        tokened_query = tokened_query.rstrip()
        query_list = tokened_query.split(' ')
        query_list = [token for token in query_list if token not in self.stop_words]
        if not query_list:
            return []
        stemmed_list = []
        for word in query_list:
            stemmed_list.append(self.p.stem(word, 0, len(word) - 1))
        return stemmed_list

    def bq(self, bool_tuples):
        parsed_list = []
        for word in bool_tuples[1]:
            if isinstance(word, tuple):
                parsed_list.append(self.bq(word))
            else:
                query_list = self.ftq(word)
                if query_list:
                    token = ' '.join(query_list)
                    parsed_list.append(token)
        parsed_tuple = (bool_tuples[0].split(), parsed_list)
        return parsed_tuple
        
    def parse_query(self, query):
        query_type = self.determine_query(query)
        if query_type == 'BQ':
            res = bool_expr_ast(query)
            return query_type, self.bq(res)
        elif query_type == 'OWQ': 
            return query_type, self.owq(query)
        elif query_type == 'FTQ':
            return query_type, self.ftq(query)         
        else:
            query = query[1:-1]
            return query_type, self.ftq(query)
    
    def merge_result_and(self, result1, result2):
        i = 0
        j = 0
        result = []
        while i < len(result1) and j < len(result2):
            if result1[i][0] < result2[j][0]:
                i += 1
            elif result1[i][0] > result2[j][0]:
                j += 1
            else:
                result = result + [(result1[i][0], result1[i][1] + result2[j][1])]
                i += 1
                j += 1
        return result

    def merge_result_or(self, result1, result2):
        i = 0
        j = 0
        result = []
        while i < len(result1) and j < len(result2):
            if result1[i][0] < result2[j][0]:
                result = result + [result1[i]]
                i += 1
            elif result1[i][0] > result2[j][0]:
                result = result + [result2[j]]
                j += 1
            else:
                result = result + [(result1[i][0], result1[i][1] + result2[j][1])]
                i += 1
                j += 1
        while i < len(result1):
            result = result + [result1[i]]
            i += 1
        while j < len(result2):
            result = result + [result2[j]]
            j += 1
        return result
    
    def match_bq_query(self, query_content):
        doc_list = []
        for item in query_content[1]:
            if isinstance(item, tuple):
                doc_list = doc_list + [self.match_bq_query(item)]
            else:
                if item not in self.inverted_index:
                    value_list = []
                else:
                    value_list = self.getPostingList(item)
                doc_list = doc_list + [value_list]
        index = 1
        result_list = doc_list[0]
        while index < len(doc_list):
            if query_content[0] == ['AND']:
                result_list = self.merge_result_and(result_list, doc_list[index])
            else:
                result_list = self.merge_result_or(result_list, doc_list[index])
            index += 1
        return result_list
        
    def getPostingList(self, term, query_type = None):
        offset_len = self.inverted_index[term]
        offset_len_list = offset_len.split(' ')
        self.postingIndexFile.seek(int(offset_len_list[0]))
        postingIndex = self.postingIndexFile.read(int(offset_len_list[1]))
        postingIndex = postingIndex.rstrip('\n')
        postingIndex = postingIndex.rstrip(' ')
        posting_pos = postingIndex.split('|')
        posting = posting_pos[0].rstrip(' ')
        posting = posting.split(' ')
        posting = list(map(int, posting))
        tfidf_list = list(map(float, posting_pos[2].split(' ')))
        res_tfidf_list = list(zip(posting, tfidf_list))
        if query_type is None:
            return res_tfidf_list
        else:
            pos = posting_pos[1].split(';')
            pos_list = []
            for item in pos:
                if item:
                    pos_list += [item.split(' ')]
            return [posting] + [pos_list] + [tfidf_list]
        
    def matching_documents(self, query):
        parsed_query = self.parse_query(query)
        query_type = parsed_query[0]
        query_content = parsed_query[1]
        if query_type == 'BQ':
            result_tuple = self.match_bq_query(query_content)
            
            if not result_tuple:
                return []
            sorted_list = sorted(result_tuple, key = lambda tup: tup[1], reverse = True)
            res_list, tfidf_list = zip(*sorted_list)
            return res_list, tfidf_list
        elif query_type == 'OWQ':
            if query_content[0] not in self.inverted_index:
                return []
            res_tfidf_list = self.getPostingList(query_content[0])
            sorted_list = sorted(res_tfidf_list, key = lambda tup: tup[1], reverse = True)
            res_list, tfidf_list = zip(*sorted_list)
            return res_list, tfidf_list
        elif query_type == 'FTQ':
            result_tuple = []
            for word in query_content:
                if word not in self.inverted_index:
                    temp_tuple = []
                else:
                    temp_tuple = self.getPostingList(word)
                result_tuple = self.merge_result_or(result_tuple, temp_tuple)
            sorted_list = sorted(result_tuple, key = lambda tup: tup[1], reverse = True)
            res_list, tfidf_list = zip(*sorted_list)
            return res_list, tfidf_list
        else:
            if query_content[0] not in self.inverted_index:
                page_list_1 = []
                pos_list_1 = []
            else:
                first_word_list = self.getPostingList(query_content[0], 'PQ')
                page_list_1 = first_word_list[0]
                pos_list_1 = first_word_list[1]
                score_list_1 = first_word_list[2]
            index = 1
            while index in range(len(query_content)):
                page_list = []
                pos_list = []
                score_list = []
                if query_content[index] in self.inverted_index:
                    item_list = self.getPostingList(query_content[index], 'PQ')
                    page_list_2 = item_list[0]
                    pos_list_2 = item_list[1]
                    score_list_2 = item_list[2]
                    i = 0
                    j = 0
                    while i < len(page_list_1) and j < len(page_list_2):
                        if int(page_list_1[i]) == int(page_list_2[j]):
                            ii = 0
                            jj = 0
                            temp_list = []
                            while ii < len(pos_list_1[i]) and jj < len(pos_list_2[j]):
                                if int(pos_list_1[i][ii]) + 1 == int(pos_list_2[j][jj]):
                                    temp_list.append(int(pos_list_2[j][jj]))
                                    ii += 1
                                    jj += 1
                                elif int(pos_list_1[i][ii]) + 1 < int(pos_list_2[j][jj]):
                                    ii += 1
                                else:
                                    jj += 1
                            if temp_list:
                                pos_list += [temp_list]
                                page_list.append(page_list_2[j])
                                score_list.append(score_list_1[i] + score_list_2[j])
                            i += 1
                            j += 1
                        elif int(page_list_1[i]) < int(page_list_2[j]):
                            i += 1
                        else:
                            j += 1
                index += 1
                page_list_1 = page_list
                pos_list_1 = pos_list
                score_list_1 = score_list
            # page_list_1 / score_list_1 also cover a single-term phrase,
            # where the merge loop above never runs
            if not page_list_1:
                return []
            res_tfidf_list = zip(page_list_1, score_list_1)
            sorted_list = sorted(res_tfidf_list, key = lambda tup: tup[1], reverse = True)
            res_list, tfidf_list = zip(*sorted_list)
            return res_list, tfidf_list
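merge_result_and in this variant walks two doc-id-sorted lists of (doc_id, score) pairs and sums the scores where the ids coincide. The same intersection can be written more compactly with a dictionary; a small sketch with made-up inputs, not tied to the class's posting format:

def and_merge(scored1, scored2):
    # scored1, scored2: lists of (doc_id, score) pairs.
    # Keep ids present in both lists, summing their scores.
    lookup = dict(scored2)
    return sorted((doc, score + lookup[doc])
                  for doc, score in scored1 if doc in lookup)

# e.g. and_merge([(1, 0.5), (3, 0.25)], [(3, 0.5), (7, 0.9)]) returns [(3, 0.75)]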
Example #7
 def test_word_stemming(self):
     stemmer = PorterStemmer()
     self.assertEqual('stem', stemmer.stem_word('stem'))
     self.assertEqual('stem', stemmer.stem_word('stemmed'))
     self.assertEqual('stem', stemmer.stem_word('stemming'))
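The test above targets the project's own PorterStemmer.stem_word(). For reference, a hypothetical equivalent written against NLTK's stemmer, whose method is stem() rather than stem_word(), assuming NLTK is installed:

import unittest

from nltk.stem import PorterStemmer  # assumption: NLTK is available


class TestPorterStemming(unittest.TestCase):
    def test_word_stemming(self):
        stemmer = PorterStemmer()
        self.assertEqual('stem', stemmer.stem('stem'))
        self.assertEqual('stem', stemmer.stem('stemmed'))
        self.assertEqual('stem', stemmer.stem('stemming'))


if __name__ == '__main__':
    unittest.main()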
Example #8
 def parse_collection(self, arg_collection):
     pageID = -1
     title = ''
     text = ''
     id_page_dict = {}
     #classify tags
     for line in arg_collection:
         split = []
         if line[0] == '<':
             split = line.split('>', 1)
         if len(split) != 0:
             head = split[0][1:]
             if head == 'page':
                 text = ''
             elif head == 'id':
                 id_str = split[1].split('<')
                 id_str = id_str[0]
                 pageID = int(id_str)
             elif head == 'text':
                 if split[1][-8:-1] == '</text>':
                     text += split[1][:-8]
                 else:
                     text += split[1][:-1]
                     for line in arg_collection:
                         if line[-8:-1] == '</text>':
                             text += (' ' + line[:-8])
                             break
                         else:
                             text += ' ' + line[:-1]
                 title_text = title + '\n' + text
                 id_page_dict[pageID] = title_text
             elif head == 'title':
                 title_list = split[1].split('<')
                 title = title_list[0]
                 self.title_index[pageID] = title
 #lower cases
     for key, value in id_page_dict.items():
         temp = value.lower()
         value = ''
         flag = False
         for c in temp:
              if 'a' <= c <= 'z' or '0' <= c <= '9':
                 value += c
                 flag = False
             else:
                 if not flag:
                     value += ' '
                     flag = True
         value = value.strip(' ')
         #filter out stop words and porter stemmer
         p = PorterStemmer()
         value_list = value.split(' ')
         value_list = filter(lambda token: token not in self.stop_words,
                             value_list)
         new_value = []
         for s in value_list:
             new_value.append(p.stem(s, 0, len(s) - 1))
         value = ' '.join(new_value)
         id_page_dict[key] = value
 #build inverted index
     self.inverted_index = self.build_inverted_index(id_page_dict)
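The lowercasing loop in parse_collection keeps only a-z and 0-9 characters and collapses everything else into single spaces. The same tokenization can be done in one regex pass; a minimal sketch using only the standard library:

import re


def tokenize(text):
    # Lowercase, then split on any run of characters outside a-z / 0-9,
    # mirroring the manual character loop above.
    return [t for t in re.split(r'[^a-z0-9]+', text.lower()) if t]

# e.g. tokenize('Hello, World 42!') returns ['hello', 'world', '42']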
Example #9
 def parse_collection(self, arg_collection):
     pageID = -1
     title = ''
     text = ''
     id_page_dict = {}
 #classify tags
     for line in arg_collection:
         split = []
         if line[0] == '<':
             split = line.split('>', 1)
         if len(split) != 0:
             head = split[0][1:]
             if head == 'page':
                 text = ''
             elif head == 'id':
                 id_str = split[1].split('<')
                 id_str = id_str[0]
                 pageID = int(id_str)
             elif head == 'text':
                 if split[1][-8:-1] == '</text>':
                     text += split[1][:-8]
                 else:
                     text += split[1][:-1]
                     for line in arg_collection:
                         if line[-8:-1] == '</text>': 
                             text += (' ' + line[:-8])
                             break
                         else:
                             text += ' ' + line[:-1]
                 title_text = title + '\n' + text
                 id_page_dict[pageID] = title_text
             elif head == 'title':
                 title_list = split[1].split('<')
                 title = title_list[0]
                 self.title_index[pageID] = title
 #lower cases    
     for key, value in id_page_dict.items():
         temp = value.lower()
         value = ''
         flag = False
         for c in temp:
              if 'a' <= c <= 'z' or '0' <= c <= '9':
                 value += c
                 flag = False
             else:
                 if not flag:
                     value += ' '
                     flag = True
         value = value.strip(' ')
         #filter out stop words and porter stemmer 
         p = PorterStemmer()
         value_list = value.split(' ')
         value_list = filter(lambda token: token not in self.stop_words, value_list)
         new_value = []
         for s in value_list:
             new_value.append(p.stem(s, 0, len(s) - 1))
         value = ' '.join(new_value)
         id_page_dict[key] = value
 #build inverted index
     self.doc_no = len(id_page_dict)
     self.inverted_index = self.build_inverted_index(id_page_dict)
     self.compute_tf(id_page_dict)