def train_epoch(args, model, device, train_loader, criterion, optimizer, epoch):
    """
    Training function for PyTorch models
    :param args: arguments to be used (for example, from __main__)
    :param model: model to be trained
    :param device: device to train on (cuda(gpu) or cpu)
    :param train_loader: loader of data
    :param criterion: loss function to optimize
    :param optimizer: optimizer to use
    :param epoch: current epoch
    :return: None
    """
    model.train()
    start = time.time()
    for batch_idx, (source, destination, min_distance) in enumerate(train_loader, 1):
        # Reshape the distances tensor and move it to the training device
        min_distance = min_distance.float().to(device).unsqueeze(1)
        optimizer.zero_grad()
        output = model(source, destination)
        loss = criterion(output, min_distance)
        loss.backward()
        optimizer.step()
        print_progress_bar(
            min(batch_idx * train_loader.batch_size, len(train_loader.dataset)),
            len(train_loader.dataset),
            time.time() - start,
            prefix=f"Epoch {epoch},",
            suffix=f"({batch_idx}/{math.ceil(len(train_loader.dataset) / train_loader.batch_size)}"
                   f" batches) Loss (prev. batch): {loss.item():.4f}",
            length=50,
            interval=min(train_loader.batch_size * 4, 1024))
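# A minimal usage sketch for train_epoch, assuming a standard PyTorch training script.
# DistanceModel, train_loader and the argparse fields (args.lr, args.epochs) are illustrative
# assumptions, not names taken from this repo.
#
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#     model = DistanceModel().to(device)
#     criterion = nn.MSELoss()
#     optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
#     for epoch in range(1, args.epochs + 1):
#         train_epoch(args, model, device, train_loader, criterion, optimizer, epoch)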
def create_distances_dataframe(dataset_file_path):
    # Load the dataset file
    dataset = pd.read_csv(dataset_file_path, sep=CSV_SEPARATOR).values
    rows = []
    with torch.no_grad():
        start = time.time()
        for idx, (source, destination, actual_distance) in enumerate(dataset, 1):
            # `model` and `embedder` are expected to be available in the enclosing scope
            rows.append({
                SRC_NODE: source,
                DST_NODE: destination,
                BFS_DIST: actual_distance,
                NN_DIST: model(embedder.embed(source).unsqueeze(0),
                               embedder.embed(destination).unsqueeze(0)).round().int().item()
            })
            print_progress_bar(idx, len(dataset), time.time() - start,
                               prefix='Progress: ', length=50)
    # Build the frame once from the collected rows
    df = pd.DataFrame(rows, columns=[SRC_NODE, DST_NODE, BFS_DIST, NN_DIST])
    return df.infer_objects()
def choose_categories(pages, percentage, resolution):
    # Take only pages which are not redirects and that actually have categories
    pages_with_categories = [set(page[ENTRY_CATEGORIES]) for page in pages
                             if (ENTRY_REDIRECT_TO not in page and len(page[ENTRY_CATEGORIES]))]
    num_pages = len(pages_with_categories)
    categories_counter = Counter([category for categories in pages_with_categories
                                  for category in categories])
    covered_pages_counter = 0
    # taken_categories tracks how many categories are required for full coverage,
    # not only for the required percentage
    taken_categories = set()
    # chosen_categories is the set of only the categories required to cover `percentage` of the pages
    chosen_categories = None
    exact_coverage = None
    coverage_percentages = []
    # uncovered_pages tracks pages which haven't been covered by a category yet.
    # Initially, that's all pages
    uncovered_pages = pages_with_categories
    start_time = time.time()
    while covered_pages_counter < num_pages and len(categories_counter):
        top_categories = [category[0] for category in categories_counter.most_common(resolution)]
        for category in top_categories:
            del categories_counter[category]
        taken_categories.update(top_categories)
        added_covered_pages, uncovered_pages = categories_pages_coverage(taken_categories,
                                                                         uncovered_pages)
        covered_pages_counter += added_covered_pages
        if covered_pages_counter / num_pages >= percentage and chosen_categories is None:
            chosen_categories = taken_categories.copy()
            exact_coverage = covered_pages_counter / num_pages
        if len(taken_categories) % REPORT_RESOLUTION == 0:
            coverage_percentages.append(covered_pages_counter / num_pages)
        print_progress_bar(covered_pages_counter, num_pages, time.time() - start_time,
                           length=50, prefix="Categories Choosing")
    if len(taken_categories) % REPORT_RESOLUTION != 0:
        coverage_percentages.append(covered_pages_counter / num_pages)
    print(f"Chose {len(chosen_categories)} categories covering {exact_coverage * 100:.2f}% of the DB")
    return chosen_categories, coverage_percentages, len(taken_categories)
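# A minimal sketch of the categories_pages_coverage helper assumed by choose_categories above:
# given the set of taken categories and the still-uncovered pages (each page represented as the
# set of its categories), it is expected to return how many of those pages are now covered and
# the pages that remain uncovered. This is an illustrative assumption, not the repo's actual
# implementation.
def categories_pages_coverage(taken_categories, uncovered_pages):
    # A page is covered once it shares at least one category with the taken set
    still_uncovered = [page_categories for page_categories in uncovered_pages
                       if not (page_categories & taken_categories)]
    newly_covered = len(uncovered_pages) - len(still_uncovered)
    return newly_covered, still_uncovered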
    },
    ENTRY_REDIRECT_TO: {
        "$exists": False
    }
}
pages = mongo_handler_pages.get_pages(pages_without_embeddings_query).batch_size(args.batch)
len_pages = pages.count()
start = time.time()
embedded_vectors = []
for idx, page in enumerate(pages, 1):
    page_value = {
        '_id': page[ENTRY_ID],
        ENTRY_TITLE: page[ENTRY_TITLE],
        'last_modified': str(datetime.datetime.now()),
        ENTRY_EMBEDDING: embedder._encode_vector(embedder._embed(page))
    }
    embedded_vectors.append(page_value)
    # Insert in batches of args.batch (and flush the remainder on the last page)
    if idx == len_pages or idx % args.batch == 0:
        mongo_handler_embeddings.insert_data(embedded_vectors)
        embedded_vectors = []
    print_progress_bar(idx, len_pages, time.time() - start,
                       prefix=f'Embedding {embedding}', length=50)
            BFS_PATH: print_path(bfs_path).replace("->", "\n->"),
            NN_DIST: nn_dist,
            NN_TIME: nn_time,
            NN_DEVELOPED: nn_developed,
            NN_H_DEVELOPED: astar_nn._heuristic.count,
            NN_PATH: print_path(nn_path).replace("->", "\n->")
        },
        ignore_index=True)

    # Print out the statistics as a tabulate table
    statistics_df_tabulate = tabulate.tabulate(statistics_df, headers='keys',
                                               tablefmt='fancy_grid', floatfmt='.5f')
    with open(statistics_file_path, 'w', encoding='utf8') as f:
        f.write(statistics_df_tabulate)
    print_progress_bar(idx, dataset_len, time.time() - start, prefix='A*', length=50)

statistics_df.drop(columns=[BFS_PATH, NN_PATH]).to_csv(
    statistics_file_path_csv, sep=CSV_SEPARATOR)

# Create the distance-time statistics
width = 0.3
bfs_distances_time, bfs_average_times, bfs_std_time = calculate_averages(bfs_distance_times)
nn_distances_time, nn_average_times, nn_std_time = calculate_averages(nn_distance_times)
plt.figure()
plt.title("A* running times")
plt.xlabel("Distance")
for i in range(num_records):
    dest = None
    source = None
    desired_distance = rnd_generator.randint(1, args.max_distance)
    distance, runtime, developed = 0, 0, 0
    while dest is None:
        # Re-pick the source until a destination is found, to make sure the source
        # node actually has neighbors in the first place
        source = rnd_generator.choice(graph_keys)
        dest, distance, developed, runtime = find_at_distance(
            graph, graph.get_node(source), desired_distance)
    distances[dataset_type].append(distance)
    dataset.append((source, dest.title, distance))
    runtimes_per_distance[distance].append(runtime)
    developed_per_distance[distance].append(developed)
    print_progress_bar(i + 1, num_records, time.time() - dataset_start,
                       prefix=dataset_type.capitalize(), length=50)

print(f'-INFO- {dataset_type.capitalize()}: {num_records} datapoints created.')

# Create dataframe from dataset
df = pd.DataFrame.from_records(dataset, columns=['source', 'destination', 'min_distance'])
# Define path to save dataset to
dataset_path = os.path.abspath(os.path.join(args.out, dataset_type + '.csv'))
# Save dataset (through dataframe)
df.to_csv(dataset_path, header=True, index=False, sep='\t')
runtimes[dataset_type] = time.time() - dataset_start
args = parser.parse_args()

pages_handler = MongoHandler(WIKI_LANG, PAGES)
all_pages = pages_handler.get_all_documents()

cache = Cache()
pages_text = cache["word_frequency_pages_text"]
if pages_text is None:
    pages_text = []
    all_pages_len = all_pages.count()
    start_time = time.time()
    for i, page in enumerate(all_pages, 1):
        if ENTRY_TEXT in page:
            pages_text.append(FastText.tokenize_text(page[ENTRY_TEXT]))
        print_progress_bar(i, all_pages_len, time.time() - start_time, length=50)
    print()
    cache["word_frequency_pages_text"] = pages_text

pages_text_flattened = [word for page in pages_text for word in page]
pages_text_no_repeats = [set(page_text) for page_text in pages_text]
pages_text_no_repeats_flattened = [
    word for page in pages_text_no_repeats for word in page
]

counter = Counter(pages_text_flattened)
counter_no_repeats = Counter(pages_text_no_repeats_flattened)
total_words = sum(counter.values())
total_words_no_repeats = len(counter_no_repeats)

bin_edges_no_repeat = [
parser.add_argument("-a", "--amount_per_distance", default=10, help="amount of couples per distance") args = parser.parse_args() test_dataset = pd.read_csv(args.test, sep=CSV_SEPARATOR).values dataset_len = len(test_dataset) # Dictionary where the distances are the keys and the value is a list of couples where the keyed # distance is the distance between them max_distance = args.max_distance distances_couples = {idx: [] for idx in range(1, max_distance + 1)} start = time.time() for idx, (source, destination, distance) in enumerate(test_dataset, 1): distances_couples[distance].append((source, destination, distance)) print_progress_bar( idx, dataset_len, time.time() - start, prefix=f'Collecting distances\' couples', length=50) rnd_generator = random.Random() randomed_couples_per_distance = [] couples_amount_per_distance = args.amount_per_distance for idx in range(1, max_distance + 1): couples_amount_per_distance = args.amount_per_distance if len(distances_couples[idx]) < couples_amount_per_distance: couples_amount_per_distance = len(distances_couples[idx]) randomed_couples_per_distance.extend( rnd_generator.sample(distances_couples[idx], couples_amount_per_distance)) randomed_couples_per_distance_df = pd.DataFrame.from_records(randomed_couples_per_distance, columns=['source', 'destination', 'min_distance']) randomed_couples_per_distance_df.to_csv(