from ifind.common.language_model import LanguageModel
from compute_snippet_len_gain import make_query, get_words_from_snippet, compute_length, compute_info_gain
import sys
from ifind.search.engines.whooshtrec import Whooshtrec
from ifind.search import Query

# Module-level BM25 search engine over a local AQUAINT test index.
# NOTE(review): constructed at import time (side effect); the index path is
# machine-specific ('/Users/david/...') — presumably needs parameterizing
# before this runs anywhere else. TODO confirm.
bm25_search_engine = Whooshtrec(
    whoosh_index_dir='/Users/david/Workspace/indexes/aquaint_test500_whoosh/',
    stopwords_file='',
    model=1,
    newschema=True)
bm25_search_engine.snippet_size = 40

def main():
    """Parse a whitespace-delimited experiment log given as argv[1].

    Each log line is split on whitespace and fixed field positions are read:
    field 3 = AMT worker id, 5 = interface, 6 = order, 7 = topic.

    NOTE(review): the extracted fields are never used within the visible
    body — the remainder of this function appears to have been truncated,
    and a second ``def main()`` later in this file shadows this one.
    """
    # First CLI argument is the path to the log file.
    log_file = sys.argv[1]
    # Background language model built from a vocabulary file in the CWD.
    lm = LanguageModel(file='vocab.in')
    # Interface... 1 2 3 4
    # Per-interface snippet configuration, indexed by interface number.
    snippet_sizes = [2, 0, 1, 4]
    snippet_surround = [40, 40, 40, 40]
    with open(log_file) as f:
        for s in f:
            fields = s.strip().split()
            # Fixed-position fields; assumes every log line has >= 8 columns
            # — TODO confirm against the log format.
            amtid = fields[3]
            interface = fields[5]
            order = fields[6]
            topic = fields[7]
def main(): bm25 = Whooshtrec( whoosh_index_dir='fullindex/', stopwords_file='', model=1, newschema=True) query = Query('Sea Pirates') query.skip = 1 query.top = 5 bm25.snippet_size = 3 response = bm25.search(query) i = 1 for result in response.results: print i,len(result.summary) #print result.summary #print "--------------" soup = BeautifulSoup(result.summary,'html.parser') text = soup.getText() #print text print "--------------" n = extract_nouns(text) print set(n) print "--------------" sentences = nltk.sent_tokenize(text) tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences] #print tokenized_sentences cat_sentences = [] for ts in tokenized_sentences: for w in ts: cat_sentences.append(w) #print cat_sentences tagged = nltk.pos_tag(cat_sentences) nouns = [word for word,pos in tagged if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS')] downcased = [x.lower() for x in nouns] joined = " ".join(downcased).encode('utf-8') into_string = str(nouns) print (into_string) #print tokenized_sentences print "--------------" tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences] chunked_sentences = nltk.chunk.ne_chunk_sents(tagged_sentences, binary=True) entity_names = [] for tree in chunked_sentences: # Print results per sentence # print extract_entity_names(tree) entity_names.extend(extract_entity_names(tree)) print set(entity_names) i+=1