def get_all_multiword_postings(query_tokens):
    # Scan every crawled page and keep the documents that contain all query tokens.
    results = []
    filenames = get_page_filenames()
    for filename in tqdm.tqdm(filenames):
        file = get_file(filename, "utf8")
        text = pp.get_text(file)
        words = pp.preprocess(text)

        # Build one posting per query token for this document.
        postings = []
        frequency_sum = 0
        for token in query_tokens:
            posting = Posting(token, filename, 0, [])
            for word in words:
                if word[0] == token:
                    posting.frequency += 1
                    posting.indexes.append(word[1])
            if posting.frequency > 0:
                postings.append(posting)
                frequency_sum += posting.frequency

        # Keep the document only if every query token occurs at least once.
        if len(query_tokens) == len(postings):
            # Strip the "../pages/" prefix and normalise path separators.
            document_name = filename[9:].replace("\\", "/")
            indexes = []
            for p in postings:
                indexes.append(sorted(p.indexes))
            results.append(Result(document_name, frequency_sum, indexes))

    # Rank documents by the summed frequency of the query tokens.
    return sorted(results, key=lambda r: r.frequency_sum, reverse=True)
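The Posting and Result containers used above are not shown in this section; a minimal sketch of what they might look like, with field names inferred from the attribute accesses above and everything else assumed:

# Sketch of the containers used by get_all_multiword_postings; field names
# follow the attribute accesses above, the rest is an assumption.
class Posting:
    def __init__(self, word, document_name, frequency, indexes):
        self.word = word
        self.document_name = document_name
        self.frequency = frequency    # occurrences of `word` in the document
        self.indexes = indexes        # character offsets of each occurrence

class Result:
    def __init__(self, document_name, frequency_sum, indexes):
        self.document_name = document_name
        self.frequency_sum = frequency_sum  # summed frequency over all query tokens
        self.indexes = indexes              # one sorted list of offsets per token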
def create_snippets(result, width):
    # Re-read the document and cut a snippet of up to `width` words in each
    # direction around the first occurrence of every query token.
    file = pp.get_file(f"{relative_path}{result.document_name}", 'utf8')
    text = pp.get_text(file)
    snippets = []
    indexes = []    # character positions already included in an earlier snippet
    for c in range(len(result.indexes)):
        index = int(result.indexes[c][0])   # first occurrence of the c-th token
        if index in indexes:
            continue    # this occurrence already lies inside an earlier snippet
        indexes.append(index)
        snippet = text[index]
        front = True    # True while the snippet is cut off mid-sentence at the front
        back = True     # True while the snippet is cut off mid-sentence at the back

        # Extend forward until a sentence end ('. ') or `width` words are reached.
        word_count = 0
        for i in range(index + 1, len(text)):
            if i <= len(text) - 2 and text[i:i + 2] == '. ':
                snippet += '.'
                back = False
                break
            char = text[i]
            if char == ' ':
                word_count += 1
                if word_count > width:
                    break
            indexes.append(i)
            snippet += char

        # Extend backward in the same way.
        word_count = 0
        for i in range(index - 1, -1, -1):
            if i >= 2 and text[i - 1:i + 1] == '. ':
                front = False
                break
            char = text[i]
            if char == ' ':
                word_count += 1
                if word_count > width:
                    break
            indexes.append(i)
            snippet = char + snippet

        snippets.append(Snippet(snippet.replace('\n', ''), front, back))
    return snippets
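A hedged sketch of how the two functions above could be wired together into a query loop. pp.preprocess is assumed to return (token, offset) pairs, as it does for page text in get_all_multiword_postings; the Snippet field names (text, front, back) and the output format are assumptions:

# Hypothetical driver for the two functions above; names marked in the
# comments are assumptions, only get_all_multiword_postings and
# create_snippets come from the code in this section.
def run_query(query, snippet_width=3):
    query_tokens = [token for token, _ in pp.preprocess(query)]
    for result in get_all_multiword_postings(query_tokens):
        preview = ""
        for s in create_snippets(result, snippet_width):
            piece = s.text              # Snippet field names are assumed
            if s.front:                 # cut off mid-sentence at the front
                piece = "... " + piece
            if s.back:                  # cut off mid-sentence at the back
                piece = piece + " ..."
            preview += piece + " "
        print(f"{result.frequency_sum:>5}  {result.document_name}  {preview.strip()}")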
def build():
    # Rebuild the inverted index from scratch: clear both tables, then
    # re-index every crawled page.
    init_database()
    con = connection()
    cur = con.cursor()
    cur.execute("DELETE FROM Posting")
    cur.execute("DELETE FROM IndexWord")
    con.commit()

    filenames = get_page_filenames()
    for filename in tqdm.tqdm(filenames):
        file = get_file(filename, "utf8")
        text = get_text(file)
        words = preprocess(text)
        for word in words:
            # word is a (token, character offset) pair; the document name is
            # stored without the "../pages/" prefix and with forward slashes.
            add_index(con, word[0],
                      filename.replace("\\", "/").replace("../pages/", ""),
                      word[1])

    con.commit()  # persist any inserts add_index left uncommitted
    con.close()
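add_index is not shown here; a minimal sketch under the assumption of a SQLite database with an IndexWord(word) table and a Posting(word, documentName, frequency, indexes) table, where offsets are stored as comma-separated text. The column names and the serialisation are assumptions:

# Hypothetical helper, assuming SQLite and a schema of
#   IndexWord(word)  and  Posting(word, documentName, frequency, indexes);
# committing is left to the caller (build() commits before closing).
def add_index(con, word, document_name, index):
    cur = con.cursor()
    cur.execute("INSERT OR IGNORE INTO IndexWord (word) VALUES (?)", (word,))
    row = cur.execute(
        "SELECT frequency, indexes FROM Posting WHERE word = ? AND documentName = ?",
        (word, document_name)).fetchone()
    if row is None:
        cur.execute(
            "INSERT INTO Posting (word, documentName, frequency, indexes) VALUES (?, ?, 1, ?)",
            (word, document_name, str(index)))
    else:
        cur.execute(
            "UPDATE Posting SET frequency = ?, indexes = ? WHERE word = ? AND documentName = ?",
            (row[0] + 1, f"{row[1]},{index}", word, document_name))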
def console(use_processing=True, n_question=10, n_answers=10, n_top=3):
    # Interactive loop: read a question, fetch candidate questions and their
    # answers from the API, run each answer through process() against the
    # question and print the top results.
    while True:
        print("Question: ")
        question_text = input()

        questions = api.search(question_text)[:n_question]
        answers = api.answers(
            [question.get("question_id") for question in questions])
        answers = answers[:n_answers]
        answers = [answer['body'] for answer in answers]

        if use_processing:
            # Extract plain text from the answer bodies, keeping code blocks.
            answers = [
                get_text(answer, delete_code=False) for answer in answers
            ]

        predictions = [
            process(answer, question_text, top_n=1) for answer in answers
        ]
        print_result(predictions, n_top)
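A minimal way to start the loop when the module is run directly; the argument values simply repeat the defaults of the signature above:

# Hypothetical entry point; the parameters mirror the defaults of console().
if __name__ == "__main__":
    console(use_processing=True, n_question=10, n_answers=10, n_top=3)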