def search(query_words, document_names):
    search_results = []
    for document_name in document_names:
        page_path = config.INPUT_PATH + "/" + document_name
        with open(page_path, "r", encoding="utf-8") as f:
            # Extract text from page
            page_html = f.read()
            page_text = preprocessing.extract_text(page_html)

            # Process words
            page_words, page_indexes, page_strings = preprocessing.preprocess_text(
                page_text)

            # Find query matches
            frequency = 0
            indexes = []
            for page_word, page_index in zip(page_words, page_indexes):
                if page_word in query_words:
                    indexes.append(page_index)
                    frequency += 1

            if frequency > 0:
                # Get snippets
                snippets_str = searching.extract_snippets(
                    indexes, page_strings)

                # Add to search results
                search_results.append((frequency, document_name, snippets_str))

    return search_results
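# A minimal usage sketch, not part of the project code: it assumes the raw
# query is run through the same preprocessing.preprocess_text used for pages,
# and that document_names are file names under config.INPUT_PATH. The query
# string and the sort-by-frequency ranking are illustrative assumptions.
query_words, _, _ = preprocessing.preprocess_text("example query")  # hypothetical query
results = search(query_words, os.listdir(config.INPUT_PATH))
# Tuples compare on frequency first, so this ranks best matches to the top
for frequency, document_name, snippets_str in sorted(results, reverse=True):
    print(f"{frequency:4d}  {document_name}: {snippets_str}")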
cur = conn.cursor()

# Loop over all sites
for site in config.INPUT_SITES:
    site_path = config.INPUT_PATH + "/" + site

    # Loop over all pages
    padding = (max([len(x) for x in config.INPUT_SITES]) - len(site)) * " "  # Add spaces to align progress bars
    for page in tqdm(os.listdir(site_path), desc=f"Indexing {site}{padding}",
                     unit="pages", disable=not PRINT_PROGRESS):
        # Only process html files with the same name as site
        if not (page.startswith(site) and page.endswith(".html")):
            continue

        page_path = site_path + "/" + page
        with open(page_path, "r", encoding="utf-8") as f:
            # Extract text from page
            html = f.read()
            text = preprocessing.extract_text(html)

            # Process words
            words, indexes, _ = preprocessing.preprocess_text(text)
            for word, index in zip(words, indexes):
                index_word(cur, word, index, site + "/" + page)

# Save DB changes and close connection
conn.commit()
conn.close()
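# index_word is called above but not shown. A plausible minimal sketch,
# assuming a simple SQLite inverted-index table; the table name and schema
# are hypothetical, not taken from the project (in the real script the
# helper would be defined before the indexing loop):
def index_word(cur, word, index, document):
    # One row per occurrence: the word, its position in the page,
    # and the "site/page" name it came from
    cur.execute(
        "INSERT INTO inverted_index (word, position, document) VALUES (?, ?, ?)",
        (word, index, document),
    )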
np.random.shuffle(data)
test_data = dataset.read_xml(settings.TEST_PATH)

we_size = 256
we_epochs = 50

print('Loading word embeddings...')
embeddings = eswc18we.load(we_size, we_epochs)
print('Word Embeddings size:', we_size)
print('Word Embeddings epochs:', we_epochs)
print()

VALIDATION_SPLIT = 0.1

texts = [extract_text(record) for record in data]
polarities = [record['polarity'] for record in data]
labels = [polarity_to_int(polarity) for polarity in polarities]
texts_test = [extract_text(record) for record in test_data]

print('Tokenizing...')
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts + texts_test)
sequences = tokenizer.texts_to_sequences(texts)
sequences_test = tokenizer.texts_to_sequences(texts_test)
lengths = [len(seq) for seq in sequences]
max_length = min(max(lengths), 256)
word_index = tokenizer.word_index
# Drop records whose text tokenized to an empty sequence, filtering the
# labels in step so data and labels stay aligned
kept = [i for i, seq in enumerate(sequences) if seq]
labels = [labels[i] for i in kept]
data = pad_sequences([sequences[i] for i in kept], maxlen=max_length)
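# A short sketch of how VALIDATION_SPLIT (defined above) is typically applied
# next in a Keras pipeline: convert labels to one-hot vectors and slice off a
# validation tail (the records were already shuffled above). The use of
# to_categorical and the variable names below are assumptions about the
# surrounding script, shown only for illustration.
from keras.utils import to_categorical

labels = to_categorical(np.asarray(labels))
num_validation = int(VALIDATION_SPLIT * data.shape[0])
x_train, y_train = data[:-num_validation], labels[:-num_validation]
x_val, y_val = data[-num_validation:], labels[-num_validation:]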
# Start search
for site in config.INPUT_SITES:
    site_path = config.INPUT_PATH + "/" + site

    # Loop over all pages
    padding = (max([len(x) for x in config.INPUT_SITES]) - len(site)) * " "  # Add spaces to align progress bars
    for page in tqdm(os.listdir(site_path), desc=f"Searching {site}{padding}",
                     unit="pages", disable=not PRINT_PROGRESS):
        # Only process html files with the same name as site
        if not (page.startswith(site) and page.endswith(".html")):
            continue

        page_path = site_path + "/" + page
        with open(page_path, "r", encoding="utf-8") as f:
            # Extract text from page
            page_html = f.read()
            page_text = preprocessing.extract_text(page_html)

            # Process words
            page_words, page_indexes, page_strings = preprocessing.preprocess_text(page_text)

            # Find query matches
            frequency = 0
            indexes = []
            for page_word, page_index in zip(page_words, page_indexes):
                if page_word in query_words:
                    indexes.append(page_index)
                    frequency += 1

            if frequency > 0:
                # Get snippets
                snippets_str = searching.extract_snippets(indexes, page_strings)