Example #1
def print_questions(docs_json):
    qs = load_questions('test.tsv')
    # Print the first 200 questions together with their first passage,
    # both raw and after processing/tokenization.
    for i in range(200):
        question = qs.questions[i]
        passage = docs_json[str(question.document_id)][str(
            question.passages[0])]
        print(" ")
        print(i)
        print(question.question)
        print(passage)
        print(process_and_tokenize_string(question.question))
        print(process_and_tokenize_string(passage))
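
Every example here calls process_and_tokenize_string, whose implementation is not shown. A minimal sketch of what such a helper typically does (lowercasing, punctuation stripping, tokenization, stop-word removal, stemming), assuming an NLTK-based pipeline:

# Hypothetical sketch: the real process_and_tokenize_string is not shown in
# these examples; this assumes NLTK with the 'punkt' and 'stopwords' data
# packages installed.
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

_stemmer = PorterStemmer()
_stop_words = set(stopwords.words('english'))


def process_and_tokenize_string(text):
    # Lowercase, strip punctuation, tokenize, drop stop words, stem.
    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    return [_stemmer.stem(t) for t in tokens if t not in _stop_words]
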
Example #2
    def execute_query(self, query):
        start = time.time()
        self.logger.info(" Executing Query: '" + str(query) + "'")

        # Tokenize the query once and reuse the tokens below.
        query_tokens = process_and_tokenize_string(query)
        self.logger.debug(" Query tokens: " + str(query_tokens))
        #ngrams = generate_ngrams(query_tokens,3)

        # Weight the query tokens by tf-idf and keep only those at or above
        # the average weight, so the more informative terms drive retrieval.
        query_tokens_tfidf = self.tf_idf.get_tokens_value(query_tokens)
        avg = mean(query_tokens_tfidf.values())
        query_tokens_tfidf = {
            k: v
            for k, v in query_tokens_tfidf.items() if v >= avg
        }

        # Candidate documents: every document whose posting list contains at
        # least one of the surviving query tokens.
        relevant_doc_ids = self.posting_list.get_relevant_docs_ids(
            query_tokens_tfidf.keys())
        relevant_docs = [self.docs[i] for i in relevant_doc_ids]
        top_docs = [TopDoc(doc) for doc in relevant_docs]

        self.logger.debug("filtered: " + str(len(top_docs)) +
                          " docs (pool: " + str(len(self.docs)) +
                          ") with tokens " + str(query_tokens_tfidf))

        tf_idf_scores = self.tf_idf.query(query_tokens, relevant_docs)

        for i in range(len(top_docs)):
            progbar(i, len(top_docs), 20)

            top_docs[i].update_score(ScoreType.tf_idf, tf_idf_scores[i])

            # Proximity score: a weighted blend of three window sizes,
            # favouring query tokens that occur close together.
            top_docs[i].update_score(
                ScoreType.proximity,
                self.posting_list.get_proximity_score(
                    query_tokens, top_docs[i].doc, 6) * 0.5 +
                self.posting_list.get_proximity_score(
                    query_tokens, top_docs[i].doc, 10) * 0.4 +
                self.posting_list.get_proximity_score(
                    query_tokens, top_docs[i].doc, 40) * 0.1)

            top_docs[i].calculate_score()
        print(' ')  # finish the progress bar line
        top_docs.sort(key=lambda x: x.score, reverse=True)

        end = time.time()
        self.logger.info("execute_query complete. elapsed time: " +
                         str(end - start) + " secs")
        return top_docs
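
Examples #2 and #4 accumulate several partial scores on TopDoc/TopPassage objects via update_score and then combine them with calculate_score. Those classes are not shown; a minimal sketch of the pattern, with assumed weights, could look like this:

# Hypothetical sketch: the real TopDoc and ScoreType are not shown in these
# examples; the weights below are assumptions, not the project's values.
from enum import Enum


class ScoreType(Enum):
    tf_idf = 1
    proximity = 2
    ngram = 3
    expanded_ngram = 4


class TopDoc:
    # Assumed relative weights for combining partial scores.
    WEIGHTS = {ScoreType.tf_idf: 0.7, ScoreType.proximity: 0.3}

    def __init__(self, doc):
        self.doc = doc
        self.scores = {}
        self.score = 0.0

    def update_score(self, score_type, value):
        # Record one partial score per ScoreType.
        self.scores[score_type] = value

    def calculate_score(self):
        # Weighted sum of whatever partial scores have been recorded.
        self.score = sum(self.WEIGHTS.get(t, 1.0) * v
                         for t, v in self.scores.items())
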
Example #3
def single_query(docs_json, document_indexer, passage_indexer):
    # Stage 1: retrieve the best documents for the query. Each document is
    # the concatenation of all of its passages.
    docs = {x: ' '.join(docs_json[x].values()) for x in docs_json}
    document_indexer.index(docs, reindex_documents)
    top_docs = document_indexer.execute_query(query_string)
    print("Documents: ")
    print(top_docs[0:5])

    # Stage 2: index only the passages of the top 3 documents and retrieve
    # the best passages among them.
    sliced_docs = {
        top_doc.doc.get_id(): docs_json[str(top_doc.doc.get_id())]
        for top_doc in top_docs[0:3]
    }
    passage_indexer.index(sliced_docs, reindex_passages)
    top_passages = passage_indexer.execute_query(query_string)
    print("Passages: ")
    print(top_passages)

    # Show the best passage of the best document, raw and tokenized.
    best_passage = docs_json[str(top_docs[0].doc.get_id())][str(
        top_passages[0].passage.get_id())]
    print(best_passage)
    print(process_and_tokenize_string(best_passage))
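
From the way it is indexed in Examples #1 and #3, docs_json appears to map document ids to dicts of passage ids to passage text. A made-up instance, purely to illustrate the assumed shape:

# Made-up data; only the shape (document id -> {passage id -> passage text})
# is inferred from the examples above.
docs_json = {
    "12": {
        "0": "First passage of document 12 ...",
        "1": "Second passage of document 12 ...",
    },
    "47": {
        "0": "Only passage of document 47 ...",
    },
}
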
Example #4
    def execute_query(self, query):
        start = time.time()
        query_tokens = process_and_tokenize_string(query)
        unprocessed_query_tokens = split_strings(query)
        self.logger.info(" Executing Query: '" + str(query) +
                         "'  ---- tokens:" + str(query_tokens))
        top_docs = [TopPassage(doc) for doc in self.docs]

        # Classify the question by its wh-word (who/what/where/...), if any.
        question_class = -1
        for i, wh in enumerate(wh_questions):
            if wh in unprocessed_query_tokens:
                question_class = i
                break

        #pos_list = nltk.pos_tag(unprocessed_query_tokens)

        # Expand the query: collect processed synonyms for every token that
        # is not a stop word.
        tokens_synonyms = []
        for token in remove_stop_words(unprocessed_query_tokens):
            tokens_synonyms += get_processed_synonyms(token)

        #print(tokens_synonyms)

        # Score every passage against the original and the expanded query.
        ngrams_vector = self.ngrams.query(query_tokens, self.docs)
        expanded_ngram_vector = self.ngrams.query(tokens_synonyms, self.docs)

        for i in range(len(top_docs)):
            progbar(i, len(top_docs))
            top_docs[i].update_score(ScoreType.ngram, ngrams_vector[i])
            top_docs[i].update_score(ScoreType.expanded_ngram,
                                     expanded_ngram_vector[i])

            top_docs[i].calculate_score()
        print(' ')  # finish the progress bar line
        top_docs.sort(key=lambda x: x.score, reverse=True)

        end = time.time()
        self.logger.info("execute_query complete. elapsed time: " +
                         str(end - start) + " secs")
        return top_docs
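
Example #4 expands the query with get_processed_synonyms, which is not shown. A plausible sketch, assuming WordNet via NLTK and reusing process_and_tokenize_string:

# Hypothetical sketch: the real get_processed_synonyms is not shown in these
# examples; this assumes NLTK's WordNet corpus is available.
from nltk.corpus import wordnet


def get_processed_synonyms(token):
    # Collect WordNet lemma names for the token, push them through the same
    # processing pipeline as the rest of the query, and de-duplicate.
    synonyms = set()
    for synset in wordnet.synsets(token):
        for lemma in synset.lemmas():
            synonyms.update(
                process_and_tokenize_string(lemma.name().replace('_', ' ')))
    return list(synonyms)
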
Example #5
    def __init__(self, _id=None, text=None):
        # Keep the raw text alongside its processed token list and its id.
        self.tokens = process_and_tokenize_string(text)
        self.text = text
        self._id = _id
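
The enclosing class name is not shown in this snippet; a hypothetical usage, with the class name assumed to be Passage, would be:

# Hypothetical usage; 'Passage' is an assumed name for the enclosing class.
passage = Passage(_id=3, text="The quick brown fox jumps over the lazy dog.")
print(passage.tokens)  # the processed tokens of the raw text
print(passage.text)    # the original, unprocessed text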