Example #1
def train_epoch(args, model, device, train_loader, criterion, optimizer,
                epoch):
    """
    Training function for pytorch models
    :param args: arguments to be used (for example, from __main__)
    :param model: model to be trained
    :param device: device to train on (cuda(gpu) or cpu)
    :param train_loader: loader of data
    :param optimizer: optimizer to use
    :param epoch: current epoch
    :return: None
    """
    model.train()
    start = time.time()
    for batch_idx, (source, destination,
                    min_distance) in enumerate(train_loader, 1):
        # Convert the distance tensor to float, move it to the training device
        # and add a feature dimension so it matches the model's output shape.
        min_distance = min_distance.float().to(device).unsqueeze(1)
        optimizer.zero_grad()
        output = model(source, destination)
        loss = criterion(output, min_distance)
        loss.backward()
        optimizer.step()
        print_progress_bar(
            min(batch_idx * train_loader.batch_size,
                len(train_loader.dataset)),
            len(train_loader.dataset),
            time.time() - start,
            prefix=f"Epoch {epoch},",
            suffix=f"({batch_idx}/"
                   f"{math.ceil(len(train_loader.dataset) / train_loader.batch_size)}"
                   f" batches) Loss (prev. batch): {loss.item():.4f}",
            length=50,
            interval=min(train_loader.batch_size * 4, 1024))
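
Every example on this page calls a print_progress_bar helper that is not part of the snippets themselves. A minimal sketch of such a helper, assuming the positional (current, total, elapsed) arguments and the prefix/suffix/length/interval keywords seen in the calls, could look like this; the project's real implementation may differ.

import sys

def print_progress_bar(current, total, elapsed, prefix='', suffix='',
                       length=50, interval=1):
    # Hypothetical stand-in for the project's helper: redraws a single-line
    # progress bar, throttled so it only refreshes every `interval` items.
    if interval > 1 and current % interval != 0 and current < total:
        return
    filled = int(length * current / max(total, 1))
    bar = '#' * filled + '-' * (length - filled)
    sys.stdout.write(f'\r{prefix} |{bar}| {current}/{total} '
                     f'[{elapsed:.1f}s] {suffix}')
    sys.stdout.flush()
    if current >= total:
        sys.stdout.write('\n')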
Example #2
def create_distances_dataframe(dataset_file_path):
    # Loads the dataset file
    dataset = pd.read_csv(dataset_file_path, sep=CSV_SEPARATOR).values

    # Collect the rows in a list and build the dataframe once at the end
    # (DataFrame.append was removed in pandas 2.0 and is slow inside a loop).
    rows = []
    with torch.no_grad():
        start = time.time()
        for idx, (source, destination,
                  actual_distance) in enumerate(dataset, 1):
            rows.append({
                SRC_NODE: source,
                DST_NODE: destination,
                BFS_DIST: actual_distance,
                NN_DIST: model(
                    embedder.embed(source).unsqueeze(0),
                    embedder.embed(destination).unsqueeze(0)).round().int().item()
            })
            print_progress_bar(idx,
                               len(dataset),
                               time.time() - start,
                               prefix='Progress: ',
                               length=50)

    return pd.DataFrame(rows, columns=[SRC_NODE, DST_NODE, BFS_DIST, NN_DIST]).infer_objects()
Example #3
def choose_categories(pages, percentage, resolution):
    # Take only pages which are not redirects, and that actually have categories
    pages_with_categories = [set(page[ENTRY_CATEGORIES]) for page in pages
                             if (ENTRY_REDIRECT_TO not in page and len(page[ENTRY_CATEGORIES]))]
    num_pages = len(pages_with_categories)
    categories_counter = Counter([category for categories in pages_with_categories for category in categories])

    covered_pages_counter = 0
    # taken_categories tracks every category selected so far, to measure how many categories full coverage requires (not just the requested percentage)
    taken_categories = set()
    # chosen_categories holds only the categories needed to cover the requested percentage of the pages
    chosen_categories = None
    exact_coverage = None
    coverage_percentages = []
    # uncovered_pages is used to track pages which haven't been covered by a category yet. Initially, that's all pages
    uncovered_pages = pages_with_categories
    start_time = time.time()
    while covered_pages_counter < num_pages and len(categories_counter):
        top_categories = [category[0] for category in categories_counter.most_common(resolution)]
        for category in top_categories:
            del categories_counter[category]
        taken_categories.update(top_categories)
        added_covered_pages, uncovered_pages = categories_pages_coverage(taken_categories, uncovered_pages)
        covered_pages_counter += added_covered_pages
        if covered_pages_counter / num_pages >= percentage and chosen_categories is None:
            chosen_categories = taken_categories.copy()
            exact_coverage = covered_pages_counter / num_pages
        if len(taken_categories) % REPORT_RESOLUTION == 0:
            coverage_percentages.append(covered_pages_counter / num_pages)
        print_progress_bar(covered_pages_counter, num_pages, time.time() - start_time,
                           length=50, prefix="Categories Choosing")
    if len(taken_categories) % REPORT_RESOLUTION != 0:
        coverage_percentages.append(covered_pages_counter / num_pages)

    print(
        f"Chose {len(chosen_categories)} categories covering {exact_coverage * 100:.2f}% of the DB")
    return chosen_categories, coverage_percentages, len(taken_categories)
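
choose_categories depends on a categories_pages_coverage helper that is not shown on this page. A plausible minimal sketch, assuming it receives the chosen category set plus the not-yet-covered pages (each page being a set of categories) and returns how many of them just became covered together with the remaining uncovered pages:

def categories_pages_coverage(taken_categories, uncovered_pages):
    # Hypothetical helper: a page counts as covered once it shares at least one
    # category with the taken set; return (newly covered count, still uncovered).
    still_uncovered = [page_categories for page_categories in uncovered_pages
                       if not (page_categories & taken_categories)]
    return len(uncovered_pages) - len(still_uncovered), still_uncovered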
Example #4
            },
            ENTRY_REDIRECT_TO: {
                "$exists": False
            }
        }
        pages = mongo_handler_pages.get_pages(
            pages_without_embeddings_query).batch_size(args.batch)
        len_pages = pages.count()

        start = time.time()
        embedded_vectors = []
        for idx, page in enumerate(pages, 1):

            page_value = {
                '_id': page[ENTRY_ID],
                ENTRY_TITLE: page[ENTRY_TITLE],
                'last_modified': str(datetime.datetime.now()),
                ENTRY_EMBEDDING: embedder._encode_vector(embedder._embed(page))
            }

            embedded_vectors.append(page_value)
            if idx == len_pages or idx % args.batch == 0:
                mongo_handler_embeddings.insert_data(embedded_vectors)
                embedded_vectors = []

            print_progress_bar(idx,
                               len_pages,
                               time.time() - start,
                               prefix=f'Embedding {embedding}',
                               length=50)
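
The loop above buffers the embedded documents and writes them to MongoDB once every args.batch pages, plus a final flush for the last partial batch. The same buffered-write pattern in isolation, with a hypothetical insert_data callback standing in for the Mongo handler:

def insert_in_batches(items, batch_size, insert_data):
    # Flush the buffer every batch_size items and once more at the end
    # so the last partial batch is not lost.
    buffer = []
    for item in items:
        buffer.append(item)
        if len(buffer) == batch_size:
            insert_data(buffer)
            buffer = []
    if buffer:
        insert_data(buffer)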
Example #5
                    BFS_PATH: print_path(bfs_path).replace("->", "\n->"),
                    NN_DIST: nn_dist,
                    NN_TIME: nn_time,
                    NN_DEVELOPED: nn_developed,
                    NN_H_DEVELOPED: astar_nn._heuristic.count,
                    NN_PATH: print_path(nn_path).replace("->", "\n->")
                },
                ignore_index=True)
            # Print out the statistics as tabulate
            statistics_df_tabulate = \
                tabulate.tabulate(statistics_df, headers='keys', tablefmt='fancy_grid', floatfmt='.5f')
            with open(statistics_file_path, 'w', encoding='utf8') as f:
                f.write(statistics_df_tabulate)
            print_progress_bar(idx,
                               dataset_len,
                               time.time() - start,
                               prefix='A*',
                               length=50)
    statistics_df.drop(columns=[BFS_PATH, NN_PATH]).to_csv(
        statistics_file_path_csv, sep=CSV_SEPARATOR)
    # Creates the distance-time statistics
    width = 0.3

    bfs_distances_time, bfs_average_times, bfs_std_time = calculate_averages(
        bfs_distance_times)
    nn_distances_time, nn_average_times, nn_std_time = calculate_averages(
        nn_distance_times)

    plt.figure()
    plt.title("A* running times")
    plt.xlabel("Distance")
        for i in range(num_records):
            dest = None
            source = None
            desired_distance = rnd_generator.randint(1, args.max_distance)
            distance, runtime, developed = 0, 0, 0
            while dest is None:  # This is to make sure that the source node actually has neighbors in the first place
                source = rnd_generator.choice(graph_keys)
                dest, distance, developed, runtime = find_at_distance(
                    graph, graph.get_node(source), desired_distance)
            distances[dataset_type].append(distance)
            dataset.append((source, dest.title, distance))
            runtimes_per_distance[distance].append(runtime)
            developed_per_distance[distance].append(developed)
            print_progress_bar(i + 1,
                               num_records,
                               time.time() - dataset_start,
                               prefix=dataset_type.capitalize(),
                               length=50)
        print(
            f'-INFO- {dataset_type.capitalize()}: {num_records} datapoints created.'
        )

        # Create dataframe from dataset
        df = pd.DataFrame.from_records(
            dataset, columns=['source', 'destination', 'min_distance'])
        # Define path to save dataset to
        dataset_path = os.path.abspath(
            os.path.join(args.out, dataset_type + '.csv'))
        # Save dataset (through dataframe)
        df.to_csv(dataset_path, header=True, index=False, sep='\t')
        runtimes[dataset_type] = time.time() - dataset_start
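
The dataset-creation loop above relies on a find_at_distance helper that is not shown here. A rough BFS-based sketch, assuming each graph node exposes a title and an iterable of neighbour nodes (both assumptions, since the real graph API is not visible on this page), could be:

import time
from collections import deque

def find_at_distance(graph, source_node, desired_distance):
    # Hypothetical helper: BFS outwards from source_node and return
    # (node, distance, developed_count, runtime); node is None when the
    # source has no reachable neighbours at all.
    start = time.time()
    visited = {source_node.title}
    frontier = deque([(source_node, 0)])
    developed = 0
    best_node, best_distance = None, 0
    while frontier:
        node, distance = frontier.popleft()
        developed += 1
        if distance == desired_distance:
            return node, distance, developed, time.time() - start
        if distance > best_distance:
            best_node, best_distance = node, distance
        for neighbour in node.neighbours:
            if neighbour.title not in visited:
                visited.add(neighbour.title)
                frontier.append((neighbour, distance + 1))
    # No node at exactly the desired distance: return the farthest one found.
    return best_node, best_distance, developed, time.time() - start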
    args = parser.parse_args()

    pages_handler = MongoHandler(WIKI_LANG, PAGES)
    all_pages = pages_handler.get_all_documents()
    cache = Cache()
    pages_text = cache["word_frequency_pages_text"]
    if pages_text is None:
        pages_text = []
        all_pages_len = all_pages.count()
        start_time = time.time()

        for i, page in enumerate(all_pages):
            if ENTRY_TEXT in page:
                pages_text.append(FastText.tokenize_text(page[ENTRY_TEXT]))
            print_progress_bar(i,
                               all_pages_len,
                               time.time() - start_time,
                               length=50)
        print()
        cache["word_frequency_pages_text"] = pages_text

    pages_text_flattened = [word for page in pages_text for word in page]
    pages_text_no_repeats = [set(page_text) for page_text in pages_text]
    pages_text_no_repeats_flattened = [
        word for page in pages_text_no_repeats for word in page
    ]
    counter = Counter(pages_text_flattened)
    counter_no_repeats = Counter(pages_text_no_repeats_flattened)
    total_words = sum(counter.values())
    total_words_no_repeats = len(counter_no_repeats)

    bin_edges_no_repeat = [
    parser.add_argument("-a", "--amount_per_distance", default=10, help="amount of couples per distance")

    args = parser.parse_args()

    test_dataset = pd.read_csv(args.test, sep=CSV_SEPARATOR).values
    dataset_len = len(test_dataset)

    # Dictionary mapping each distance (key) to the list of (source, destination, distance)
    # couples whose minimal distance equals that key
    max_distance = args.max_distance
    distances_couples = {idx: [] for idx in range(1, max_distance + 1)}

    start = time.time()
    for idx, (source, destination, distance) in enumerate(test_dataset, 1):
        distances_couples[distance].append((source, destination, distance))
        print_progress_bar(
            idx, dataset_len, time.time() - start, prefix="Collecting distances' couples", length=50)

    rnd_generator = random.Random()
    randomed_couples_per_distance = []
    for idx in range(1, max_distance + 1):
        couples_amount_per_distance = args.amount_per_distance
        if len(distances_couples[idx]) < couples_amount_per_distance:
            couples_amount_per_distance = len(distances_couples[idx])
        randomed_couples_per_distance.extend(
            rnd_generator.sample(distances_couples[idx], couples_amount_per_distance))

    randomed_couples_per_distance_df = pd.DataFrame.from_records(randomed_couples_per_distance,
                                                                 columns=['source', 'destination', 'min_distance'])

    randomed_couples_per_distance_df.to_csv(