Beispiel #1
0
def get_all_multiword_postings(query_tokens):
    """Find documents that contain *every* token in ``query_tokens``.

    Scans each crawled page, builds a Posting (token, file, frequency,
    occurrence positions) per query token, and keeps only documents where
    all tokens occur (AND semantics).

    Returns a list of ``Result`` objects sorted by total token frequency,
    descending.
    """
    results = []

    for filename in tqdm.tqdm(get_page_filenames()):
        file = get_file(filename, "utf8")
        text = pp.get_text(file)
        words = pp.preprocess(text)

        # Group occurrence positions by token in ONE pass over the word
        # list, instead of rescanning the whole list once per query token
        # (the original was O(len(query_tokens) * len(words))).
        positions_by_token = {}
        for word in words:
            positions_by_token.setdefault(word[0], []).append(word[1])

        postings = []
        frequency_sum = 0
        for token in query_tokens:
            positions = positions_by_token.get(token, [])
            if positions:
                postings.append(Posting(token, filename, len(positions), positions))
                frequency_sum += len(positions)

        # Keep the document only if every query token produced a posting.
        if len(postings) == len(query_tokens):
            # NOTE(review): filename[9:] presumably strips a fixed path
            # prefix such as "../pages/" — confirm against get_page_filenames().
            document_name = filename[9:].replace("\\", "/")
            indexes = [sorted(p.indexes) for p in postings]
            results.append(Result(document_name, frequency_sum, indexes))

    return sorted(results, key=lambda r: r.frequency_sum, reverse=True)
def create_snippets(result, width):
    """Build text snippets around each match position in *result*.

    For each position group in ``result.indexes`` the snippet grows
    character-by-character forwards and backwards from the first index
    until either a sentence boundary ('. ') is reached or roughly *width*
    words have been collected.  Returns a list of ``Snippet`` objects whose
    ``front``/``back`` flags record whether the snippet was truncated
    mid-sentence on that side.

    NOTE(review): relies on a module-level ``relative_path`` not visible
    here — confirm it is defined before this is called.
    """
    file = pp.get_file(f"{relative_path}{result.document_name}", 'utf8')
    text = pp.get_text(file)

    snippets = list()
    indexes = list()  # character positions already consumed by some snippet

    # NOTE(review): word_count is shared between the forward and backward
    # walks AND across snippet iterations, so *width* acts as an approximate
    # combined word budget rather than an exact per-side count — confirm
    # this is intentional.
    word_count = 0

    for c in range(len(result.indexes)):
        index = int(result.indexes[c][0])
        # Skip positions already covered by a previously built snippet.
        if index in indexes:
            continue
        indexes.append(index)

        snippet = text[index]
        front = True  # set False when a sentence start is found going backwards
        back = True   # set False when a sentence end ('. ') is found going forwards
        try:
            # Forward walk: extend the snippet until '. ' or the word budget.
            for i in range(index + 1, len(text)):
                if i <= len(text) - 2:
                    stop = text[i:i + 2]
                    if stop == '. ':
                        snippet += '.'
                        back = False
                        break
                char = text[i]
                if char == ' ':
                    word_count += 1
                    if word_count > width:
                        word_count = 0
                        break
                indexes.append(i)
                snippet += char
        except IndexError:
            pass
        try:
            # Backward walk: prepend characters until a preceding '. '.
            for i in range(index - 1, -1, -1):
                if i >= 2:
                    stop = text[i - 1:i + 1]
                    if stop == '. ':
                        front = False
                        break
                char = text[i]
                if char == ' ':
                    word_count += 1
                    if word_count > width:
                        word_count = 0
                        break
                indexes.append(i)
                snippet = char + snippet
        except IndexError:
            pass
        snippets.append(Snippet(snippet.replace('\n', ''), front, back))
    return snippets
Beispiel #3
0
def build():
    """Rebuild the inverted index from every crawled page.

    Clears the Posting and IndexWord tables, then re-indexes each page's
    preprocessed words with their positions.
    """
    init_database()
    con = connection()

    # Start from a clean slate before repopulating the index tables.
    cursor = con.cursor()
    cursor.execute("DELETE FROM Posting")
    cursor.execute("DELETE FROM IndexWord")
    con.commit()

    for page in tqdm.tqdm(get_page_filenames()):
        contents = get_file(page, "utf8")
        tokens = preprocess(get_text(contents))

        # Normalize the path separators and strip the crawl directory prefix.
        document = page.replace("\\", "/").replace("../pages/", "")
        for entry in tokens:
            add_index(con, entry[0], document, entry[1])

    con.close()
def console(use_processing=True, n_question=10, n_answers=10, n_top=3):
    """Interactive loop: read a question, fetch candidate answers, rank them.

    Searches for up to *n_question* matching questions, pulls up to
    *n_answers* answer bodies, optionally strips them via get_text, runs
    the prediction model on each, and prints the top *n_top* results.
    Loops forever until interrupted.
    """
    while True:
        print("Question: ")
        query = input()

        matched = api.search(query)[:n_question]

        question_ids = [q.get("question_id") for q in matched]
        bodies = [a['body'] for a in api.answers(question_ids)[:n_answers]]

        if use_processing:
            bodies = [get_text(b, delete_code=False) for b in bodies]

        predictions = [process(b, query, top_n=1) for b in bodies]

        print_result(predictions, n_top)