Code Example #1
def get_query_per_item(self, item):
    """Build a query from an item's subject and content, with stopwords removed."""
    terms = anserini.tokenizeString(item['Subject'] + item['Content'], 'lucene')
    print(terms)  # debug output of the raw token list
    return ' '.join([
        term for term in terms
        if term not in anserini.additional_stopwords
        and term not in anserini.stopwords_temp
    ])
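All of these examples go through a project-local anserini helper module rather than calling a library directly; that module is not shown on this page. A minimal sketch of what its tokenization side might look like, assuming it wraps Pyserini's Lucene analyzer (the stopword sets here are placeholders for the project's own lists):

# Hypothetical sketch of the project's anserini helper module,
# assuming it wraps Pyserini's Lucene analyzer.
from pyserini.analysis import Analyzer, get_lucene_analyzer

_analyzer = Analyzer(get_lucene_analyzer())

# Placeholder stopword sets; the real project defines its own.
stopwords_temp = {'the', 'a', 'an', 'of'}
additional_stopwords = set()

def tokenizeString(text, analyzer_name='lucene'):
    """Tokenize, lowercase, and stem text the way the Lucene analyzer does."""
    return _analyzer.analyze(text)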
Code Example #2
def process_query(self, subject, content, silver_query):
    """Extract one training sample per term from a question's subject and content."""
    # Term frequencies over the whole question (subject + content).
    doc_vector = {}
    for term in anserini.tokenizeString(subject + ' ' + content, 'lucene'):
        if term in doc_vector:
            doc_vector[term] += 1
        else:
            doc_vector[term] = 1
    subject_terms = anserini.tokenizeString(subject, 'lucene')
    content_terms = anserini.tokenizeString(content, 'lucene')
    total_length = len(subject_terms) + len(content_terms)
    # Label each block's terms against the silver query.
    self.process_block(subject, subject_terms, 'subject', doc_vector,
                       total_length, silver_query)
    self.process_block(content, content_terms, 'content', doc_vector,
                       total_length, silver_query)
    return subject_terms + content_terms
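The manual counting loop above (the same pattern appears in Examples #7 and #10) is equivalent to collections.Counter from the standard library; a minimal sketch:

from collections import Counter

# Equivalent to the if/else counting loop; Counter is a dict subclass,
# so downstream membership tests and indexing work unchanged.
doc_vector = Counter(anserini.tokenizeString(subject + ' ' + content, 'lucene'))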
Code Example #3
File: tfidf.py Project: azin-z/forgetCorpus
def select_top_words(self, text):
    """Select the top-k words in text, scored by tf-idf."""
    terms = list(set(anserini.tokenizeString(text, 'lucene')))
    scored_terms = []
    scores = []
    for term in terms:
        try:
            tf = anserini.get_term_coll_freq(term)
            idf = 1 / anserini.get_term_doc_freq(term)
        except ZeroDivisionError:
            # Term occurs in no indexed document: drop stopwords outright,
            # otherwise keep the term with a zero score.
            if term in anserini.stopwords_temp:
                continue
            idf = 0
        except Exception as e:
            print(e)
            continue
        # Append to both lists in lockstep so scores[i] always
        # corresponds to scored_terms[i].
        scored_terms.append(term)
        scores.append(tf * idf)

    # Greedily pick the highest-scoring remaining term until k words
    # are chosen, skipping duplicates and the task words themselves.
    picked_words = []
    while len(picked_words) < self.k and scores:
        picked_word_index = scores.index(max(scores))
        picked_word = scored_terms.pop(picked_word_index)
        scores.pop(picked_word_index)
        if picked_word not in picked_words and \
                picked_word not in ('remember', 'forget'):
            picked_words.append(picked_word)
    return ' '.join(picked_words)
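The repeated max/index/pop selection above is quadratic in the number of terms. Under the same parallel-lists assumption, heapq.nlargest gives the top-k pick in one pass; a sketch that ignores the duplicate and 'remember'/'forget' filtering:

import heapq

# Pair each score with its term and keep the k best by score.
top = heapq.nlargest(self.k, zip(scores, scored_terms))
picked_words = [term for score, term in top]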
Code Example #4
def get_entities(text):
    """Map each tokenized entity word to a numeric id for its entity type."""
    doc_text = nlp(text)  # nlp: the spaCy pipeline loaded elsewhere in the project
    map_ent_id = {
        "PERSON": 18,
        "NORP": 1,
        "FAC": 2,
        "ORG": 3,
        "GPE": 4,
        "LOC": 5,
        "PRODUCT": 6,
        "EVENT": 7,
        "WORK_OF_ART": 8,
        "LAW": 9,
        "LANGUAGE": 10,
        "DATE": 11,
        "TIME": 12,
        "PERCENT": 13,
        "MONEY": 14,
        "QUANTITY": 15,
        "ORDINAL": 16,
        "CARDINAL": 17
    }
    entity_words = {}
    for ent in doc_text.ents:
        # Skip purely numeric entity types.
        if ent.label_ in ("CARDINAL", "QUANTITY"):
            continue
        for word in anserini.tokenizeString(ent.text):
            entity_words[word] = map_ent_id[ent.label_]
    return entity_words
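get_entities assumes a spaCy pipeline bound to the module-level name nlp; the model actually used is not shown on this page. A minimal usage sketch (the model name is an assumption):

import spacy

# Hypothetical setup; the project may load a different model.
nlp = spacy.load('en_core_web_sm')

print(get_entities('Paris is the capital of France.'))
# e.g. {'paris': 4, 'france': 4}  (GPE -> 4, after Lucene tokenization)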
Code Example #5
def get_query_per_item(self, item):
    """Return the item's terms that also occur in its known (gold) document."""
    itemtext = item['Subject'] + item['Content']
    terms_in_common = []
    terms1 = list(set(anserini.tokenizeString(itemtext, 'lucene')))
    try:
        terms2 = set(
            anserini.tokenizeString(
                self.gold_doc_content_dict[item['KnownItemId']], 'lucene'))
    except KeyError:
        # No gold document stored for this item.
        return ''
    for term in terms1:
        if (term not in self.additional_stopwords
                and term not in anserini.stopwords_temp
                and term in terms2):
            terms_in_common.append(term)
    return ' '.join(terms_in_common)
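Since only membership matters here, the filtering loop is equivalent to a set intersection minus the two stopword sets; a sketch (note that set operations do not preserve term order):

stop = set(self.additional_stopwords) | set(anserini.stopwords_temp)
terms_in_common = (set(terms1) & terms2) - stop
return ' '.join(terms_in_common)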
Code Example #6
def print_handwritten_stats(self):
    """Print min/max/average token length over the white-listed corpus items."""
    min_len = float('inf')
    max_len = 0
    sum_len = 0
    count = 0
    for item in self.corpus_gen_white_listed():
        length = len(
            anserini.tokenizeString(item['Content'] + item['Subject'],
                                    'lucene'))
        max_len = max(max_len, length)
        min_len = min(min_len, length)
        sum_len += length
        count += 1
    print('max q length: {}'.format(max_len))
    print('min q length: {}'.format(min_len))
    # Average over the items actually seen.
    print('avg q length: {}'.format(sum_len / count))
Code Example #7
def build_queries(self):
    """Build a query for every document in the pickled id -> content dict."""
    id_doc_text = Utils.load_from_pickle('cleuweb-webis-id-doc-content-dict.p')
    azzopardifuncs = AzzopardiFunctions()
    for doc_id in tqdm(id_doc_text.keys()):
        # Term-frequency vector for this document.
        doc_vector = {}
        for term in anserini.tokenizeString(id_doc_text[doc_id], 'lucene'):
            if term in doc_vector:
                doc_vector[term] += 1
            else:
                doc_vector[term] = 1
        print(self.get_doc_url(doc_id))
        try:
            azzopardifuncs.make_query(doc_id, doc_vector, 10)
        except Exception as e:
            print('error', e, 'occurred in processing', doc_id)
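Utils.load_from_pickle is another project helper not shown on this page; presumably it is a thin wrapper over the standard pickle module, roughly:

import pickle

# Hypothetical equivalent of Utils.load_from_pickle.
def load_from_pickle(path):
    with open(path, 'rb') as f:
        return pickle.load(f)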
Code Example #8
def length_stats_print(saved_pickle='queries-handwritten.p'):
    """Print min/max/average token length of the saved silver queries."""
    query_dict = Utils.load_from_pickle(saved_pickle)
    min_len = float('inf')
    max_len = 0
    sum_len = 0

    for key, value in query_dict.items():
        item = get_item(key)
        print(item['Subject'])
        print(item['Content'])
        print('silver query', value)
        length = len(anserini.tokenizeString(value, 'lucene'))
        max_len = max(max_len, length)
        min_len = min(min_len, length)
        sum_len += length
    print('max length: {}'.format(max_len))
    print('min length: {}'.format(min_len))
    print('avg length: {}'.format(sum_len / len(query_dict)))
Code Example #9
def process_block(self, text, terms, block_type, term_doc_count_dict,
                  total_length, silver_query):
    """Emit one labelled feature vector per term in a subject or content block."""
    pos_tags = pos.get_pos_tags(terms)
    entity_words = set()
    if self.use_ner:
        entity_words = ner.get_entities(text)
    size = 10  # number of features produced per word
    # prev_prev_features = [0] * size
    prev_features = [0] * size
    next_features = [0] * size
    # nex_next_features = [0] * size
    # Tokenize the silver query once; the label tests membership in it.
    silver_terms = set(anserini.tokenizeString(silver_query, 'lucene'))
    for i, (term, pos_tag) in enumerate(zip(terms, pos_tags)):
        features = self.process_word(i, term, block_type, pos_tag,
                                     entity_words, term_doc_count_dict,
                                     total_length)
        # if i > 1:
        #     prev_prev_features = self.process_word(i-2, terms[i-2], block_type, pos_tags[i-2], entity_words, term_doc_count_dict, total_length)
        if i > 0:
            prev_features = self.process_word(i - 1, terms[i - 1],
                                              block_type, pos_tags[i - 1],
                                              entity_words,
                                              term_doc_count_dict,
                                              total_length)
        if i < len(terms) - 1:
            next_features = self.process_word(i + 1, terms[i + 1],
                                              block_type, pos_tags[i + 1],
                                              entity_words,
                                              term_doc_count_dict,
                                              total_length)
        # if i < len(terms) - 2:
        #     nex_next_features = self.process_word(i+2, terms[i+2], block_type, pos_tags[i+2], entity_words, term_doc_count_dict, total_length)
        if self.useContext:
            # Concatenate the neighbouring feature vectors as context.
            features = prev_features + features + next_features
        # Positive label if the term appears in the silver query.
        is_in_doc = int(term in silver_terms)
        self.add_sample(is_in_doc, features)
Code Example #10
def count_terms_in_item(self, item):
    """Accumulate the item's term frequencies into self.doc_vector."""
    for term in anserini.tokenizeString(item, 'lucene'):
        if term in self.doc_vector:
            self.doc_vector[term] += 1
        else:
            self.doc_vector[term] = 1