Example #1
import random
from os import path

import util  # project-local helpers (assumed import style)


def prepare_k_cross_random_datesets(edges, settings, vx_count, edge_count,
                                    base_path):
    """Shuffle the edges and split them into k equal-sized cross-validation subsets."""
    ds_name = settings["name"]
    k = settings["k_subset_count"]
    k_frac = 1 / k
    k_size = 0

    random.shuffle(edges)
    for i in range(1, k + 1):
        full_path = path.join(base_path,
                              '{:03}_{}.cross.csv'.format(i, ds_name))
        start = int(((i - 1) * k_frac) * edge_count)
        end = int((i * k_frac) * edge_count)
        k_size = end - start
        util.write_edges_to_file(edges[start:end], full_path)

    metadata = {
        "name": ds_name,
        "vertices": vx_count,
        "edges": edge_count,
        "set_count": k,
        "format_type": "basic-edge-list",
        "split_method": settings["split_method"],
        "training_sets_size": k_size * (k - 1),
        "test_sets_size": k_size,
        "created": util.now_as_string()
    }
    meta_path = path.join(base_path, '{}_meta.json'.format(ds_name))
    util.write_to_json(metadata, meta_path)

    print('Data files for "{}" dataset successfully created '.format(ds_name) +
          '({} vertices, {} edges).'.format(vx_count, edge_count))
    print('For details, see: {}'.format(meta_path))
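A minimal invocation sketch for the function above, assuming edges is a list of (source, target) pairs, settings carries the keys the function reads, and base_path points at an existing directory; all concrete values below are illustrative assumptions, not taken from the original listing.

# Hypothetical call; edge list, settings values and output directory are
# illustrative assumptions only.
edges = [(0, 1), (1, 2), (2, 3), (3, 0), (0, 2), (1, 3)]
settings = {
    "name": "toy-graph",
    "k_subset_count": 3,
    "split_method": "random-k-cross",
}
prepare_k_cross_random_datesets(edges, settings, vx_count=4,
                                edge_count=len(edges), base_path="datasets")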
Example #2
def prepare_chrono_perc_dataset(ts_edges, settings, vx_count, edge_count,
                                base_path):
    """Split time-ordered edges into train/test sets by percentage."""
    ds_name = settings["name"]
    test_frac = settings["test_perc"] / 100
    test_edges_count = int(edge_count * test_frac)
    train_edges_count = edge_count - test_edges_count
    test_path = path.join(base_path, '{:03}_{}.test.csv'.format(1, ds_name))
    train_path = path.join(base_path, '{:03}_{}.train.csv'.format(1, ds_name))

    edges = util.triples_to_rear_pairs(ts_edges)

    util.write_edges_to_file(edges[:train_edges_count], train_path)
    util.write_edges_to_file(edges[train_edges_count:], test_path)

    metadata = {
        "name": ds_name,
        "vertices": vx_count,
        "edges": edge_count,
        "set_count": 1,
        "format_type": "basic-edge-list",
        "split_method": settings["split_method"],
        "training_sets_size": train_edges_count,
        "test_sets_size": test_edges_count,
        "created": util.now_as_string()
    }
    meta_path = path.join(base_path, '{}_meta.json'.format(ds_name))
    util.write_to_json(metadata, meta_path)

    print('Data files for "{}" dataset successfully created '.format(ds_name) +
          '({} vertices, {} edges).'.format(vx_count, edge_count))
    print('For details, see: {}'.format(meta_path))
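The percentage-based variant reads different settings keys; a hedged sketch of the expected inputs, where the shape of ts_edges (timestamp first, endpoints last, sorted by time) is an assumption inferred from util.triples_to_rear_pairs and all values are illustrative.

# Hypothetical inputs: ts_edges are assumed to be (timestamp, source, target)
# triples already sorted chronologically, so the tail of the list is newest.
ts_edges = [(1546300800, 0, 1), (1548979200, 1, 2), (1551398400, 2, 3)]
settings = {"name": "toy-graph", "test_perc": 30, "split_method": "chrono-percent"}
prepare_chrono_perc_dataset(ts_edges, settings, vx_count=4,
                            edge_count=len(ts_edges), base_path="datasets")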
Example #3
def prepare_chrono_from_dataset(ts_edges, settings, vx_count, edge_count,
                                base_path):
    """Split time-ordered edges into train/test sets at a given cut-off timestamp."""
    ds_name = settings["name"]
    test_path = path.join(base_path, '{:03}_{}.test.csv'.format(1, ds_name))
    train_path = path.join(base_path, '{:03}_{}.train.csv'.format(1, ds_name))

    split_ts = util.str_to_utc_ts(settings["test_from"])
    split_index = util.find_utc_edges_split_index(ts_edges, split_ts)
    edges = util.triples_to_rear_pairs(ts_edges)

    util.write_edges_to_file(edges[:split_index], train_path)
    util.write_edges_to_file(edges[split_index:], test_path)

    metadata = {
        "name": ds_name,
        "vertices": vx_count,
        "edges": edge_count,
        "set_count": 1,
        "format_type": "basic-edge-list",
        "split_method": settings["split_method"],
        "training_sets_size": split_index,
        "test_sets_size": edge_count - split_index,
        "created": util.now_as_string()
    }
    meta_path = path.join(base_path, '{}_meta.json'.format(ds_name))
    util.write_to_json(metadata, meta_path)

    print('Data files for "{}" dataset successfully created '.format(ds_name) +
          '({} vertices, {} edges).'.format(vx_count, edge_count))
    print('For details, see: {}'.format(meta_path))
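The date-based variant splits at a fixed point in time instead of a percentage; another hedged sketch, reusing the ts_edges list from the previous sketch. The "test_from" format is whatever util.str_to_utc_ts accepts, assumed here to be an ISO-style date string.

# Hypothetical inputs; the date string format is an assumption.
settings = {"name": "toy-graph", "test_from": "2019-01-01",
            "split_method": "chrono-from"}
prepare_chrono_from_dataset(ts_edges, settings, vx_count=4,
                            edge_count=len(ts_edges), base_path="datasets")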
def update_yearly_data(category, year, filepath):
    """Fetch one year of arXiv article metadata for a category and cache it as JSON,
    skipping the download if a cache file already exists."""
    if not is_cached(filepath):
        print('Raw data for {}:{}. Cache not available.'.format(
            category, year))

        total = fetch_article_count(category, year)
        articles = []
        # Download the metadata in fixed-size chunks, clamping the final
        # chunk so it does not run past the total article count.
        for offset in range(0, total, ARXIV_DATA_CHUNK_SIZE):
            print_progress_info(category, year, total, offset)
            chunk_size = min(ARXIV_DATA_CHUNK_SIZE, total - offset)
            data = fetch_data_chunk(category, year, offset, chunk_size)
            parse_data_chunk(category, articles, data)
            wait()  # throttle requests between API calls
        util.write_to_json(articles, filepath)
        print('Raw data for {}:{}. Cache updated ({} of {} articles).'.format(
            category, year, len(articles), total))
    else:
        print('Raw data for {}:{}. Cache present.'.format(category, year))
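A possible driver loop for the caching helper above; the category, year range and cache path pattern are illustrative assumptions, not taken from the original listing.

# Hypothetical driver; category, year range and cache file naming are assumptions.
for year in range(2015, 2021):
    cache_file = 'cache/arxiv_{}_{}.json'.format('cs.LG', year)
    update_yearly_data('cs.LG', year, cache_file)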