Code Example #1
from ifind.common.language_model import LanguageModel
from compute_snippet_len_gain import make_query, get_words_from_snippet, compute_length, compute_info_gain
import sys
from ifind.search.engines.whooshtrec import Whooshtrec
from ifind.search import Query

bm25_search_engine = Whooshtrec(
    whoosh_index_dir='/Users/david/Workspace/indexes/aquaint_test500_whoosh/',
    stopwords_file='',
    model=1,
    newschema=True)

bm25_search_engine.snippet_size = 40


def main():
    log_file = sys.argv[1]
    lm = LanguageModel(file='vocab.in')

    # Snippet settings for each experimental interface (1-4).
    # Interface...      1   2   3   4
    snippet_sizes    = [2,  0,  1,  4]
    snippet_surround = [40, 40, 40, 40]

    with open(log_file) as f:

        for s in f:
            # Each log line carries (among other fields) the AMT participant id,
            # the interface condition, the presentation order and the topic.
            fields = s.strip().split()
            amtid = fields[3]
            interface = fields[5]
            order = fields[6]
            topic = fields[7]
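Code Example #1 is cut off before the parsed log fields are used. For orientation, here is a small, self-contained sketch of how the engine configured above can be exercised with the per-interface snippet sizes declared in main(). The index path, the query string, and the idea of re-running one query under each interface setting are illustrative assumptions; only the Whooshtrec, Query, snippet_size, search and result.summary usage mirrors what the two examples already show.

from ifind.search.engines.whooshtrec import Whooshtrec
from ifind.search import Query

# Engine configured as in the example above (index path is illustrative).
engine = Whooshtrec(whoosh_index_dir='aquaint_test500_whoosh/',
                    stopwords_file='',
                    model=1,
                    newschema=True)

# Per-interface snippet sizes, as declared in main() above.
snippet_sizes = [2, 0, 1, 4]

query = Query('hubble telescope achievements')  # hypothetical topic title
query.top = 10

for interface, size in enumerate(snippet_sizes, start=1):
    # Re-run the same query under each interface's snippet size and
    # compare the lengths of the generated summaries.
    engine.snippet_size = size
    response = engine.search(query)
    lengths = [len(result.summary) for result in response.results]
    print interface, size, lengths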
Code Example #2
File: test_search.py  Project: leifos/treconomics
# Dependencies for this excerpt; extract_nouns and extract_entity_names are
# helper functions defined elsewhere in test_search.py.
import nltk
from bs4 import BeautifulSoup
from ifind.search.engines.whooshtrec import Whooshtrec
from ifind.search import Query


def main():

    bm25 = Whooshtrec(
        whoosh_index_dir='fullindex/',
        stopwords_file='',
        model=1,
        newschema=True)


    query = Query('Sea Pirates')
    query.skip = 1
    query.top = 5

    bm25.snippet_size = 3


    # Run the query and report the generated snippet length for each result.
    response = bm25.search(query)
    i = 1
    for result in response.results:
        print i, len(result.summary)
        #print result.summary
        #print "--------------"
        # Strip the snippet's HTML markup to get plain text.
        soup = BeautifulSoup(result.summary, 'html.parser')
        text = soup.getText()
        #print text


        print "--------------"

        n = extract_nouns(text)
        print set(n)
        print "--------------"

        # Tokenise the snippet into sentences, then flatten into one token list.
        sentences = nltk.sent_tokenize(text)
        tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]

        #print tokenized_sentences
        cat_sentences = []
        for ts in tokenized_sentences:
            for w in ts:
                cat_sentences.append(w)

        #print cat_sentences

        # POS-tag the flattened token list and keep only the noun tokens.
        tagged = nltk.pos_tag(cat_sentences)
        nouns = [word for word, pos in tagged if pos in ('NN', 'NNP', 'NNS', 'NNPS')]
        downcased = [x.lower() for x in nouns]
        joined = " ".join(downcased).encode('utf-8')  # built but not used later in this excerpt
        into_string = str(nouns)
        print into_string

        #print tokenized_sentences

        print "--------------"
        # Named-entity chunk each tagged sentence (binary=True labels every
        # entity subtree 'NE') and collect the entity strings.
        tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
        chunked_sentences = nltk.chunk.ne_chunk_sents(tagged_sentences, binary=True)
        entity_names = []
        for tree in chunked_sentences:
            # Print results per sentence
            # print extract_entity_names(tree)

            entity_names.extend(extract_entity_names(tree))

        print set(entity_names)

        i+=1
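The helpers extract_nouns and extract_entity_names used above are defined elsewhere in test_search.py and are not part of this excerpt. As a reference, the sketch below shows typical implementations under that assumption: extract_nouns mirrors the inline POS-tag filtering already shown, and extract_entity_names is the usual recursive walk over a binary ne_chunk tree whose entity subtrees are labelled 'NE'. The project's actual helpers may differ.

import nltk

def extract_nouns(text):
    # Assumed to mirror the inline logic above: POS-tag the text, keep noun tokens.
    tokens = nltk.word_tokenize(text)
    tagged = nltk.pos_tag(tokens)
    return [word for word, pos in tagged if pos in ('NN', 'NNP', 'NNS', 'NNPS')]

def extract_entity_names(tree):
    # Collect entity strings from a tree built with ne_chunk_sents(..., binary=True),
    # where every entity subtree is labelled 'NE' and its leaves are (word, pos) pairs.
    entity_names = []
    if hasattr(tree, 'label') and tree.label() == 'NE':
        entity_names.append(' '.join(child[0] for child in tree))
    elif hasattr(tree, 'label'):
        for child in tree:
            entity_names.extend(extract_entity_names(child))
    return entity_names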