def create_corpus_and_labels():
    negative_corpus = _load_corpus(filename=full_path(DATA_SET["negative"]))
    positive_corpus = _load_corpus(filename=full_path(DATA_SET["positive"]))
    negatives_number = len(negative_corpus)
    positives_number = len(positive_corpus)
    corpus = []
    labels = []
    # Interleave negative and positive documents so the classes alternate.
    for i in range(min(negatives_number, positives_number)):
        corpus.append(negative_corpus[i])
        labels.append([0])
        corpus.append(positive_corpus[i])
        labels.append([1])
    # Append the leftover documents of the larger class at the end.
    remainder = negative_corpus if negatives_number > positives_number else positive_corpus
    label = 0 if negatives_number > positives_number else 1
    remainder = remainder[min(negatives_number, positives_number):]
    # Keep the label shape consistent with the [0]/[1] lists appended above.
    labels += [[label] for _ in range(abs(positives_number - negatives_number))]
    corpus += remainder
    if PRINT_STATS:
        print("average document length %d, negatives: %d, positives %d, total %d"
              % get_corpus_stats(corpus, negatives_number, positives_number))
        print(corpus[randrange(len(corpus))])
    return corpus, labels
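# Illustrative usage sketch, not part of the original module: because
# create_corpus_and_labels() interleaves the two classes, a plain head/tail
# split stays roughly balanced. The helper name and the split ratio are
# assumptions made for the example.
def _example_train_test_split(test_fraction=0.2):
    corpus, labels = create_corpus_and_labels()
    split_index = int(len(corpus) * (1 - test_fraction))
    train_docs, test_docs = corpus[:split_index], corpus[split_index:]
    train_labels, test_labels = labels[:split_index], labels[split_index:]
    return train_docs, train_labels, test_docs, test_labels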
def imdb_preprocess():
    base_directory = full_path("data/imdb/")
    pos_subdirectories = ["test/pos", "train/pos"]
    neg_subdirectories = ["test/neg", "train/neg"]
    neg_file_path = full_path("data/imdb.neg")
    pos_file_path = full_path("data/imdb.pos")
    # Merge the per-review files into one corpus file per class.
    create_corpus_file(base_directory, neg_file_path, neg_subdirectories)
    create_corpus_file(base_directory, pos_file_path, pos_subdirectories)
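# Hedged sketch, not the repository's actual create_corpus_file implementation:
# it assumes every file under the listed IMDB subdirectories holds a single
# review and that the output corpus should contain one review per line.
import os


def create_corpus_file_sketch(base_directory, corpus_file_path, subdirectories):
    with open(corpus_file_path, 'w', encoding='utf-8',
              errors='ignore') as corpus_file:
        for subdirectory in subdirectories:
            subdirectory_path = os.path.join(base_directory, subdirectory)
            for review_file_name in os.listdir(subdirectory_path):
                review_path = os.path.join(subdirectory_path, review_file_name)
                with open(review_path, encoding='utf-8',
                          errors='ignore') as review_file:
                    review_text = review_file.read().replace('\n', ' ').strip()
                corpus_file.write(review_text + '\n')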
def remove_existing_tenders_from_fetch_list(bulletin_nums_to_fetch: dict):
    jsons_dir_path = full_path(DATA_SET_TENDERS['bzp_data_jsons_dir'])
    for json_dir_name in os.listdir(jsons_dir_path):
        json_subdir_path = jsons_dir_path + "/" + json_dir_name
        for json_tender_file_name in os.listdir(json_subdir_path):
            tender_bulletin_num = json_tender_file_name.replace(".json", "")
            # Drop bulletin numbers that already have a downloaded JSON file.
            for key in bulletin_nums_to_fetch.keys():
                if tender_bulletin_num in bulletin_nums_to_fetch[key]:
                    bulletin_nums_to_fetch[key].remove(tender_bulletin_num)
    return
def check_tender_lstm_network():
    w2v_model = Word2Vec.load(full_path('data/word2vecs/tenders_model'))
    model = keras.models.load_model(
        full_path(
            "lstm-net-backups/tenders_timestep150_drout0.4_rdrout0.4_batch64_short.h5"
        ))
    with open(full_path(DATA_SET_TENDERS['positive'])) as positive_corpus:
        positives_count = 0
        overall_count = 0
        for line in positive_corpus:
            tokens_pos = list(utils.tokenize(line, deacc=True, lower=True))
            document_pos = list(
                filter(lambda x: x not in STOP_LIST, tokens_pos))
            x_pos = np.array([document_to_batch(document_pos, w2v_model, 150)])
            overall_count += 1
            # Run the network once per document and reuse the prediction.
            prediction = evaluate(model, x_pos)
            if prediction == 'positive':
                positives_count += 1
            print(prediction)
        print('positives detected: ' + str(positives_count) + '/' +
              str(overall_count))
def show_document_length_histogram(bin_count=30):
    if DATA_SET == DATA_SET_TREC:
        corpus = _load_corpus(full_path("data/trec/trec.corp"))
    else:
        corpus, labels = create_corpus_and_labels()
    sentence_lengths = []
    for document in corpus:
        sentence_lengths.append(len(document))
    plt.title(DATA_SET['label'])
    plt.xlabel("word count")
    plt.ylabel("document count")
    plt.hist(sentence_lengths, bins=bin_count)
    plt.show()
def review_your_review():
    print("Wait for the google w2v model to load...")
    w2v_model = load_google_w2v_model()
    net_model = keras.models.load_model(
        full_path(
            "lstm-net-backups/imdb_timestep150_drout0.4_rdrout0.4_batch64.h5"))
    print("done")
    while True:
        line = input(
            "Type in your review or \"quit\" to finish then press ENTER: ")
        if line == 'quit':
            break
        tokens_line = list(utils.tokenize(line, deacc=True, lower=True))
        document_review = list(
            filter(lambda x: x not in STOP_LIST, tokens_line))
        line_numeric = np.array(
            [document_to_batch(document_review, w2v_model, 150)])
        print('I think this review is: ' + evaluate(net_model, line_numeric))
    print('Good bye')
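# Hedged sketch, not part of the original module: a non-interactive variant of
# the pipeline used in review_your_review() above. It reuses the same
# tokenize -> STOP_LIST filter -> document_to_batch -> evaluate chain and the
# same 150 time steps; the function name and the idea of passing preloaded
# models in are assumptions made for illustration.
def classify_review_text(review_text, net_model, w2v_model, time_steps=150):
    tokens = list(utils.tokenize(review_text, deacc=True, lower=True))
    document = [token for token in tokens if token not in STOP_LIST]
    batch = np.array([document_to_batch(document, w2v_model, time_steps)])
    return evaluate(net_model, batch)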
def get_summaries_file_name(corpus_label: str):
    return full_path("log/{}_summary.log".format(corpus_label))


def get_csv_log_file_name(corpus_label: str):
    return full_path("log/{}_timestep{}_drout{}_rdrout{}_batch{}.csv".format(
        corpus_label, str(DATA_SET['time_steps']), str(DATA_SET['dropout']),
        str(DATA_SET["recurrent_dropout"]), str(DATA_SET["batch_size"])))


def get_vector_labels_file_name(corpus_label: str):
    return full_path("data/vector_words/" + corpus_label + "_labels.npy")


def get_vector_words_directory_for_dataset(corpus_label: str, dataset):
    return full_path("data/vector_words/" + corpus_label +
                     "_words_max_timestep" + str(dataset['max_time_steps']))


def get_w2v_file_name(corpus_label: str):
    return full_path("data/word2vecs/" + corpus_label + "_model")


def get_dictionary_file_name(corpus_label: str):
    return full_path("data/dicts/" + corpus_label + "_dict")


def get_tfidf_file_name(corpus_label: str):
    return full_path("data/tfidfs/" + corpus_label + "_tfidf_model")


def get_network_model_snapshot(corpus_label: str):
    return full_path(
        "lstm-net/{}_timestep{}_drout{}_rdrout{}_batch{}.h5".format(
            corpus_label, str(DATA_SET['time_steps']),
            str(DATA_SET['dropout']), str(DATA_SET["recurrent_dropout"]),
            str(DATA_SET['batch_size'])))
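# Illustrative sketch, not code from this repository: it shows how the path
# helpers above could be combined when resuming training for one corpus. The
# 'tenders' label, the numpy/keras imports and the CSVLogger callback are
# assumptions made for the example.
import keras
import numpy as np


def _example_load_training_artifacts(corpus_label='tenders'):
    labels = np.load(get_vector_labels_file_name(corpus_label))
    model = keras.models.load_model(get_network_model_snapshot(corpus_label))
    csv_logger = keras.callbacks.CSVLogger(get_csv_log_file_name(corpus_label),
                                           append=True)
    return labels, model, csv_logger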
import numpy as np

from src.configuration import get_vector_labels_file_name, DATA_SET_TREC, \
    get_batch_file_name_for_dataset, get_vector_words_directory_for_dataset
from src.preprocessing.document_as_w2v_groups import document_to_batch
from src.preprocessing.w2v_loader import load_google_w2v_model
from src.utils.get_file import full_path, create_file_and_folders_if_not_exist

LABELS = {'DESC': [], 'HUM': [], 'ENTY': [], 'NUM': [], 'LOC': []}
CORPUS_FILE_NAME = full_path("data/trec/trec.corp")
# see http://cogcomp.org/Data/QA/QC/
TREC_TEXT_FILE_PATH = full_path("data/trec/trec.txt")
WORD_VECTORS_DIRECTORY = get_vector_words_directory_for_dataset(
    'trec', DATA_SET_TREC)
LABELS_FILE_PATH = get_vector_labels_file_name('trec')


def trec_preprocess():
    create_corpus_file()


def create_corpus_file():
    model = load_google_w2v_model()
    with open(TREC_TEXT_FILE_PATH, 'r', encoding='utf-8',
              errors='ignore') as file:
        iter_file = iter(file)
        for line in iter_file:
            # Each TREC question line starts with "<coarse label>:<fine label>".
            label = line[:str(line).find(":")]
            if label in LABELS:
def get_vector_words_directory(corpus_label: str):
    return full_path("data/vector_words/" + corpus_label +
                     "_words_max_timestep" + str(DATA_SET['max_time_steps']))
def add_line_to_corpus(json_file_path, corpus_file_name):
    with open(json_file_path, encoding='utf-8') as json_file:
        json_object = json.load(json_file)
        # Normalise the tender subject description ('okreslenie_przedmiotu'):
        # collapse whitespace, replace punctuation and underscores with spaces,
        # and strip Polish diacritics.
        data = re.sub(r'\s+', ' ', json_object['okreslenie_przedmiotu'].strip())
        data = re.sub(r'[^\w\s]|_', ' ', data)
        data = unidecode(data)
        words = data.split()
        # Keep only words longer than two characters.
        words = [word for word in words if len(word) > 2]
        with open(corpus_file_name, mode='a+', errors='ignore') as corpus_file:
            corpus_file.write(' '.join(words) + '\n')
            # print(words)


json_dir_observed = full_path(DATA_SET_TENDERS['bzp_data_jsons_dir'] +
                              '/observed_json/')
json_dir_viewed = full_path(DATA_SET_TENDERS['bzp_data_jsons_dir'] +
                            '/viewed_json/')
json_dir_reported = full_path(DATA_SET_TENDERS['bzp_data_jsons_dir'] +
                              '/reported_json/')
positive_corpus_path = full_path(DATA_SET_TENDERS['positive'])
negative_corpus_path = full_path(DATA_SET_TENDERS['negative'])

# Observed and viewed tenders become positives, reported tenders negatives.
if DATA_SET_TENDERS == DATA_SET_TENDERS_SHORT:
    add_lines_to_corpus_short(json_dir_observed, positive_corpus_path)
    add_lines_to_corpus_short(json_dir_viewed, positive_corpus_path)
    add_lines_to_corpus_short(json_dir_reported, negative_corpus_path)
elif DATA_SET_TENDERS == DATA_SET_TENDERS_LONG:
    add_lines_to_corpus(json_dir_observed, positive_corpus_path)
    add_lines_to_corpus(json_dir_viewed, positive_corpus_path)
import json
import os

from src.configuration import DATA_SET_TENDERS
from src.preprocessing.iwium.iwium_bzb_api_client import fetch_data_daily
from src.utils.get_file import full_path

TRACKER_REPORTED_JSON = full_path(DATA_SET_TENDERS['tracker_dir'] +
                                  "/reported-offers.json")
TRACKER_OBSERVED_JSON = full_path(DATA_SET_TENDERS['tracker_dir'] +
                                  "/observed-offers.json")
TRACKER_VIEWED_JSON = full_path(DATA_SET_TENDERS['tracker_dir'] +
                                "/viewed-offers.json")

OBSERVED_FILE_PATH_IDS = full_path(DATA_SET_TENDERS['tracker_dir'] +
                                   "/ids/observed_ids.txt")
REPORTED_FILE_PATH_IDS = full_path(DATA_SET_TENDERS['tracker_dir'] +
                                   "/ids/reported_ids.txt")
VIEWED_FILE_PATH_IDS = full_path(DATA_SET_TENDERS['tracker_dir'] +
                                 "/ids/viewed_ids.txt")

OBSERVED_BULLETIN_NUMBERS_PATH = full_path(DATA_SET_TENDERS['bzp_data_dir'] +
                                           "/bulletin_nums/observed_nums.txt")
REPORTED_BULLETIN_NUMBERS_PATH = full_path(DATA_SET_TENDERS['bzp_data_dir'] +
                                           "/bulletin_nums/reported_nums.txt")
VIEWED_BULLETIN_NUMBERS_PATH = full_path(DATA_SET_TENDERS['bzp_data_dir'] +
                                         "/bulletin_nums/viewed_nums.txt")


def parse_tracker_ids(tracker_data_file_path, out_valid_bulletin_numbers_file,
                      out_ids_file):
    tender_ids = []
    with open(tracker_data_file_path, 'r', encoding='utf-8',
              errors='ignore') as file:
        tenders_stats = json.load(file)
        for stat in tenders_stats:
            tender_ids.append(stat['what'])
    # Deduplicate and sort the collected tender ids.
    tender_ids = set(tender_ids)
    tender_ids = sorted(tender_ids)
    with open(out_valid_bulletin_numbers_file, 'w') as bulletin_nums_file: