Code example #1
File: run-basic-search.py Project: rokcej/wier-2021
# Project-local helper modules from rokcej/wier-2021
import config
import preprocessing
import searching


def search(query_words, document_names):
    search_results = []
    for document_name in document_names:
        page_path = config.INPUT_PATH + "/" + document_name
        with open(page_path, "r", encoding="utf-8") as f:
            # Extract text from page
            page_html = f.read()
            page_text = preprocessing.extract_text(page_html)

            # Process words
            page_words, page_indexes, page_strings = preprocessing.preprocess_text(
                page_text)

            # Find query matches
            frequency = 0
            indexes = []
            for page_word, page_index in zip(page_words, page_indexes):
                if page_word in query_words:
                    indexes.append(page_index)
                    frequency += 1

            if frequency > 0:
                # Get snippets
                snippets_str = searching.extract_snippets(
                    indexes, page_strings)

                # Add to search results
                search_results.append((frequency, document_name, snippets_str))

    return search_results
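A minimal usage sketch for search() (hypothetical, not part of the project): the query string is assumed to go through the same preprocessing.preprocess_text step as the pages, and the document names are placeholders. Results are ranked by match frequency.

# Hypothetical usage: preprocess the query like the pages, then rank results
query_words, _, _ = preprocessing.preprocess_text("sample query text")
document_names = ["example-site/example-page.html"]  # placeholder names

results = search(set(query_words), document_names)

# Tuples sort by their first element, so this ranks by frequency (descending)
for frequency, document_name, snippets_str in sorted(results, reverse=True):
    print(f"{frequency:4d}  {document_name}")
    print(f"      {snippets_str}")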
Code example #2
File: run-index.py Project: rokcej/wier-2021
    # Snippet starts mid-function: conn is an open DB-API connection
    # (e.g. sqlite3); requires os, tqdm and the project modules
    # config and preprocessing, plus the index_word helper
    cur = conn.cursor()

    # Loop over all sites
    for site in config.INPUT_SITES:
        site_path = config.INPUT_PATH + "/" + site
        # Loop over all pages
        padding = (max([len(x) for x in config.INPUT_SITES]) -
                   len(site)) * " "  # Add spaces to align progress bars
        for page in tqdm(os.listdir(site_path),
                         desc=f"Indexing {site}{padding}",
                         unit="pages",
                         disable=not PRINT_PROGRESS):
            # Only process html files with the same name as site
            if not (page.startswith(site) and page.endswith(".html")):
                continue

            page_path = site_path + "/" + page
            with open(page_path, "r", encoding="utf-8") as f:
                # Extract text from page
                html = f.read()
                text = preprocessing.extract_text(html)

                # Process words
                words, indexes, _ = preprocessing.preprocess_text(text)
                for word, index in zip(words, indexes):
                    index_word(cur, word, index, site + "/" + page)

    # Save DB changes and close connection
    conn.commit()
    conn.close()
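The loop delegates to a project helper index_word() that is not shown in this excerpt. A hypothetical sketch of what it might look like, assuming a SQLite inverted index with an IndexWord table and a Posting table keyed by (word, documentName); the table and column names are assumptions, not the project's actual schema:

# Hypothetical sketch of index_word; schema names are assumed, not taken
# from the project
def index_word(cur, word, index, document_name):
    # Register the word itself (ignore duplicates)
    cur.execute("INSERT OR IGNORE INTO IndexWord (word) VALUES (?)", (word,))
    # Append this occurrence to the posting for (word, document)
    cur.execute(
        "SELECT frequency, indexes FROM Posting WHERE word = ? AND documentName = ?",
        (word, document_name))
    row = cur.fetchone()
    if row is None:
        cur.execute(
            "INSERT INTO Posting (word, documentName, frequency, indexes) VALUES (?, ?, 1, ?)",
            (word, document_name, str(index)))
    else:
        frequency, indexes = row
        cur.execute(
            "UPDATE Posting SET frequency = ?, indexes = ? WHERE word = ? AND documentName = ?",
            (frequency + 1, indexes + "," + str(index), word, document_name))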
Code example #3
# Assumed imports for this snippet (numpy plus the classic Keras text
# utilities); dataset, settings, eswc18we, extract_text and polarity_to_int
# are project-local. `data` holds the training records loaded earlier
# in the script (the snippet starts mid-file).
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

np.random.shuffle(data)
test_data = dataset.read_xml(settings.TEST_PATH)

we_size = 256
we_epochs = 50

print('Loading word embeddings...')
embeddings = eswc18we.load(we_size, we_epochs)

print('Word Embeddings size:', we_size)
print('Word Embeddings epochs:', we_epochs)
print()

VALIDATION_SPLIT = 0.1

texts = [extract_text(record) for record in data]
polarities = [record['polarity'] for record in data]
labels = [polarity_to_int(polarity) for polarity in polarities]
texts_test = [extract_text(record) for record in test_data]

print('Tokenizing...')
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts + texts_test)
sequences = tokenizer.texts_to_sequences(texts)
sequences_test = tokenizer.texts_to_sequences(texts_test)
lengths = [len(seq) for seq in sequences]
max_length = min(max(lengths), 256)

word_index = tokenizer.word_index

# Drop empty token sequences, filtering labels in step so they stay aligned
sequences, labels = zip(*[(seq, label)
                          for seq, label in zip(sequences, labels) if seq])
data = pad_sequences(sequences, maxlen=max_length)
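VALIDATION_SPLIT is defined above but unused within this excerpt; a sketch of how the padded data might be split into training and validation sets (an assumption about what the rest of the script does):

# Hold out the last VALIDATION_SPLIT fraction as a validation set (assumed);
# data was already shuffled above, so a tail slice is a random sample
labels = np.asarray(labels)
num_validation = max(1, int(VALIDATION_SPLIT * data.shape[0]))

x_train, y_train = data[:-num_validation], labels[:-num_validation]
x_val, y_val = data[-num_validation:], labels[-num_validation:]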
Code example #4
	# Start search
	for site in config.INPUT_SITES:
		site_path = config.INPUT_PATH + "/" + site
		# Loop over all pages
		padding = (max([len(x) for x in config.INPUT_SITES]) - len(site)) * " " # Add spaces to align progress bars
		for page in tqdm(os.listdir(site_path), desc=f"Searching {site}{padding}", unit="pages", disable=not PRINT_PROGRESS):
			# Only process html files with the same name as site
			if not (page.startswith(site) and page.endswith(".html")):
				continue

			page_path = site_path + "/" + page
			with open(page_path, "r", encoding="utf-8") as f:
				# Extract text from page
				page_html = f.read()
				page_text = preprocessing.extract_text(page_html)

				# Process words
				page_words, page_indexes, page_strings = preprocessing.preprocess_text(page_text)

				# Find query matches
				frequency = 0
				indexes = []
				for page_word, page_index in zip(page_words, page_indexes):
					if page_word in query_words:
						indexes.append(page_index)
						frequency += 1
				
				if frequency > 0:
					# Get snippets
					snippets_str = searching.extract_snippets(indexes, page_strings)

					# Add to search results, as in code example #1
					# (search_results is initialized before this loop)
					search_results.append((frequency, site + "/" + page, snippets_str))