def get_query_per_item(self, item):
    """Build a query from the item's Subject + Content.

    Tokenizes the concatenated text with the Lucene analyzer and drops
    terms found in either stopword list.

    :param item: mapping with at least 'Subject' and 'Content' string fields
    :return: surviving terms joined by single spaces
    """
    # Tokenize once (the original tokenized twice: once for a leftover
    # debug print, once for the join) and avoid shadowing `item`.
    terms = anserini.tokenizeString(item['Subject'] + item['Content'],
                                    'lucene')
    # NOTE(review): 'additional_additional_stopword' looks like a typo for
    # 'additional_stopwords' -- confirm against the anserini module.
    return ' '.join(
        term for term in terms
        if term not in anserini.additional_additional_stopword
        and term not in anserini.stopwords_temp
    )
def process_query(self, subject, content, silver_query):
    """Tokenize subject and content, build a term-frequency map over the
    combined text, run both blocks through process_block, and return the
    combined term list (subject terms followed by content terms)."""
    doc_vector = {}
    for token in anserini.tokenizeString(subject + ' ' + content, 'lucene'):
        doc_vector[token] = doc_vector.get(token, 0) + 1

    subject_terms = anserini.tokenizeString(subject, 'lucene')
    content_terms = anserini.tokenizeString(content, 'lucene')
    total_length = len(subject_terms) + len(content_terms)

    # Process both blocks with identical bookkeeping.
    for block_text, block_terms, block_name in (
            (subject, subject_terms, 'subject'),
            (content, content_terms, 'content')):
        self.process_block(block_text, block_terms, block_name,
                           doc_vector, total_length, silver_query)

    return subject_terms + content_terms
def select_top_words(self, text):
    """Select up to ``self.k`` terms of *text* with the highest tf*idf.

    Stopwords are filtered out, and the words 'remember'/'forget' are
    never picked.

    :param text: raw text to tokenize with the Lucene analyzer
    :return: picked words joined by single spaces (highest score first)
    """
    candidates = list(set(anserini.tokenizeString(text, 'lucene')))
    # Keep terms and their scores in two parallel, always-aligned lists.
    # The original popped stopwords out of the list it was enumerating,
    # which skipped the element that shifted into the popped slot and left
    # `terms` and `scores` misaligned for the selection phase below.
    scored_terms = []
    scores = []
    for term in candidates:
        if term in anserini.stopwords_temp:
            continue
        try:
            tf = anserini.get_term_coll_freq(term)
            idf = 1 / anserini.get_term_doc_freq(term)
        except ZeroDivisionError:
            # Term appears in no document: score it 0 rather than crash.
            idf = 0
        except Exception as e:
            print(e)
            continue
        scored_terms.append(term)
        scores.append(tf * idf)

    picked_words = []
    # Repeatedly extract the current argmax until k words are picked or
    # the candidate pool is exhausted.
    while len(picked_words) < self.k and scores:
        best = scores.index(max(scores))
        word = scored_terms.pop(best)
        scores.pop(best)
        if word not in picked_words and word not in ('remember', 'forget'):
            picked_words.append(word)
    return ' '.join(picked_words)
def get_entities(text):
    """Map each tokenized word of a named entity in *text* to a numeric
    entity-type id.

    CARDINAL and QUANTITY entities are skipped entirely.

    :param text: raw text run through the spaCy ``nlp`` pipeline
    :return: dict mapping token -> entity-type id (1..18)
    """
    doc_text = nlp(text)
    map_ent_id = {
        "PERSON": 18,
        "NORP": 1,
        "FAC": 2,
        "ORG": 3,
        "GPE": 4,
        "LOC": 5,
        "PRODUCT": 6,
        "EVENT": 7,
        "WORK_OF_ART": 8,
        "LAW": 9,
        "LANGUAGE": 10,
        "DATE": 11,
        "TIME": 12,
        "PERCENT": 13,
        "MONEY": 14,
        "QUANTITY": 15,
        "ORDINAL": 16,
        "CARDINAL": 17
    }
    entity_words = {}
    for ent in doc_text.ents:
        # The label test is per-entity, not per-word: hoisted out of the
        # inner loop (the original re-checked it for every token).
        if ent.label_ in ("CARDINAL", "QUANTITY"):
            continue
        # NOTE(review): every other tokenizeString call in this file passes
        # the 'lucene' analyzer argument -- confirm whether it should be
        # passed here as well.
        for word in anserini.tokenizeString(ent.text):
            entity_words[word] = map_ent_id[ent.label_]
    return entity_words
def get_query_per_item(self, item):
    """Return the terms the item's text shares with its gold document.

    Tokenizes Subject + Content, intersects with the gold document's
    terms, and drops stopwords.

    :param item: mapping with 'Subject', 'Content' and 'KnownItemId'
    :return: common terms joined by spaces, or '' when no gold document
        exists for the item
    """
    item_text = item['Subject'] + item['Content']
    item_terms = list(set(anserini.tokenizeString(item_text, 'lucene')))
    # Narrowed from a bare `except:` -- only a missing gold document (or
    # missing 'KnownItemId') should yield the empty query; any other
    # failure should surface instead of being silently swallowed.
    try:
        gold_text = self.gold_doc_content_dict[item['KnownItemId']]
    except KeyError:
        return ''
    gold_terms = set(anserini.tokenizeString(gold_text, 'lucene'))
    terms_in_common = [
        term for term in item_terms
        if term not in self.additional_stopwords
        and term not in anserini.stopwords_temp
        and term in gold_terms
    ]
    return ' '.join(terms_in_common)
def print_handwritten_stats(self):
    """Print min/max/average tokenized length (Content + Subject, Lucene
    analyzer) over the white-listed corpus."""
    # float('inf') instead of the magic 1000: the old initializer would
    # under-report the minimum if every item were longer than 1000 tokens.
    min_len = float('inf')
    max_len = 0
    sum_len = 0
    count = 0
    for item in self.corpus_gen_white_listed():
        length = len(
            anserini.tokenizeString(item['Content'] + item['Subject'],
                                    'lucene'))
        max_len = max(max_len, length)
        min_len = min(min_len, length)
        sum_len += length
        count += 1
    print('max q length: {}'.format(max_len))
    print('min q length: {}'.format(min_len))
    # Divide by the actual number of items instead of the hard-coded 476
    # (which silently produced a wrong average whenever the white list
    # changed size); guard against an empty corpus.
    print('avg q length: {}'.format(sum_len / count if count else 0))
def build_queries(self):
    """Load the id -> document-text map from its pickle and generate a
    query for every document via AzzopardiFunctions.make_query, printing
    each document's URL as it is processed. Failures for individual
    documents are logged and skipped."""
    id_doc_text = Utils.load_from_pickle('cleuweb-webis-id-doc-content-dict.p')
    azzopardifuncs = AzzopardiFunctions()
    for doc_id in tqdm(id_doc_text.keys()):
        # Term-frequency vector for this document.
        doc_vector = {}
        for term in anserini.tokenizeString(id_doc_text[doc_id], 'lucene'):
            doc_vector[term] = doc_vector.get(term, 0) + 1
        print(self.get_doc_url(doc_id))
        try:
            azzopardifuncs.make_query(doc_id, doc_vector, 10)
        except Exception as e:
            print('error ', e, 'occured in processing', doc_id)
def length_stats_print(saved_pickle='queries-handwritten.p'):
    """Print every item's Subject/Content and its silver query, then the
    min/max/average tokenized query length.

    :param saved_pickle: pickle file holding a {key: silver_query} dict
    """
    query_dict = Utils.load_from_pickle(saved_pickle)
    # float('inf') instead of the magic 1000, which would cap the reported
    # minimum if every query were longer than 1000 tokens.
    min_len = float('inf')
    max_len = 0
    sum_len = 0
    for key, value in query_dict.items():
        item = get_item(key)
        print(item['Subject'])
        print(item['Content'])
        print("silver query", value)
        length = len(anserini.tokenizeString(value, 'lucene'))
        max_len = max(max_len, length)
        min_len = min(min_len, length)
        sum_len += length
    print('max length: {}'.format(max_len))
    print('min length: {}'.format(min_len))
    # len(query_dict) (not len(query_dict.values())); guard the empty dict,
    # which previously raised ZeroDivisionError.
    print('avg length: {}'.format(
        sum_len / len(query_dict) if query_dict else 0))
def process_block(self, text, terms, block_type, term_doc_count_dict,
                  total_length, silver_query):
    """Emit one training sample per term of a subject/content block.

    For every term, builds its feature vector (optionally concatenated
    with the previous and next terms' features when ``self.useContext``
    is set) and labels it by whether the term appears in the tokenized
    silver query.

    :param text: raw block text (used for NER when ``self.use_ner``)
    :param terms: Lucene tokens of *text*
    :param block_type: 'subject' or 'content'
    :param term_doc_count_dict: term -> frequency over the whole document
    :param total_length: subject length + content length, in tokens
    :param silver_query: reference query providing the positive labels
    """
    pos_tags = pos.get_pos_tags(terms)
    entity_words = set()
    if self.use_ner:
        entity_words = ner.get_entities(text)
    # Hoisted out of the loop: the silver query does not change per term,
    # but the original re-tokenized it on every iteration. A set also
    # makes the membership test O(1) instead of a list scan.
    silver_terms = set(anserini.tokenizeString(silver_query, 'lucene'))
    size = 10  # feature-vector width used for the zero-padding defaults
    # prev_prev_features = [0] * size
    prev_features = [0] * size
    next_features = [0] * size
    # nex_next_features = [0] * size
    for i, (term, pos_tag) in enumerate(zip(terms, pos_tags)):
        features = self.process_word(i, term, block_type, pos_tag,
                                     entity_words, term_doc_count_dict,
                                     total_length)
        # if i > 1:
        #     prev_prev_features = self.process_word(i-2, terms[i-2], block_type, pos_tags[i-2], entity_words, term_doc_count_dict, total_length)
        if i > 0:
            prev_features = self.process_word(i - 1, terms[i - 1],
                                              block_type, pos_tags[i - 1],
                                              entity_words,
                                              term_doc_count_dict,
                                              total_length)
        if i < len(terms) - 1:
            next_features = self.process_word(i + 1, terms[i + 1],
                                              block_type, pos_tags[i + 1],
                                              entity_words,
                                              term_doc_count_dict,
                                              total_length)
        # if i < len(terms) - 2:
        #     nex_next_features = self.process_word(i+2, terms[i+2], block_type, pos_tags[i+2], entity_words, term_doc_count_dict, total_length)
        if self.useContext:
            # NOTE(review): neighbour features are recomputed each
            # iteration; they could be reused from the previous step if
            # process_word is side-effect free -- confirm before changing.
            features = prev_features + features + next_features
        is_in_doc = int(term in silver_terms)
        self.add_sample(is_in_doc, features)
def count_terms_in_item(self, item):
    """Accumulate the Lucene-token frequencies of *item* into
    ``self.doc_vector`` (existing counts are incremented, new terms
    start at 1)."""
    for term in anserini.tokenizeString(item, 'lucene'):
        self.doc_vector[term] = self.doc_vector.get(term, 0) + 1