Code example #1
File: preprocess.py Project: cthoyt/marius
# Shared imports for the preprocess.py examples on this page; download_file,
# extract_file, and general_parser are helpers defined elsewhere in the module.
import re

import numpy as np
import pandas as pd
import torch
from pathlib import Path


def parse_ogbl(files, has_rel, output_dir, num_partitions=1):
    # Relation-typed splits are torch-saved dicts of head/relation/tail
    # index arrays; stack them into N x 3 (head, relation, tail) arrays.
    if has_rel:
        train_idx = torch.load(str(files[0]))
        valid_idx = torch.load(str(files[1]))
        test_idx = torch.load(str(files[2]))
        train_list = np.array([
            train_idx.get("head"),
            train_idx.get("relation"),
            train_idx.get("tail")
        ]).T
        valid_list = np.array([
            valid_idx.get("head"),
            valid_idx.get("relation"),
            valid_idx.get("tail")
        ]).T
        test_list = np.array([
            test_idx.get("head"),
            test_idx.get("relation"),
            test_idx.get("tail")
        ]).T
    else:
        # Untyped splits store an N x 2 (source, destination) edge array.
        train_list = torch.load(files[0]).get("edge")
        valid_list = torch.load(files[1]).get("edge")
        test_list = torch.load(files[2]).get("edge")

    # Write each split as a tab-separated text file in output_dir.
    np.savetxt(str(Path(output_dir) / Path("train.txt")),
               train_list,
               fmt="%s",
               delimiter="\t",
               newline="\n")
    np.savetxt(str(Path(output_dir) / Path("valid.txt")),
               valid_list,
               fmt="%s",
               delimiter="\t",
               newline="\n")
    np.savetxt(str(Path(output_dir) / Path("test.txt")),
               test_list,
               fmt="%s",
               delimiter="\t",
               newline="\n")
    print("Conversion completed.")

    # "srd" marks the columns as source/relation/destination; "sd" as
    # source/destination only. The two branches differ only in this flag.
    fmt = ["srd"] if has_rel else ["sd"]
    stats, num_nodes, num_edges = general_parser(
        [
            str(Path(output_dir) / Path("train.txt")),
            str(Path(output_dir) / Path("valid.txt")),
            str(Path(output_dir) / Path("test.txt"))
        ], fmt, [output_dir],
        num_partitions=num_partitions)
    return stats, num_nodes, num_edges
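
Usage sketch: the call below is illustrative only. The split-file names, their locations, and the output directory are assumptions about how an OGB link-prediction download (e.g. ogbl-biokg) is laid out, not paths taken from this project.

files = [
    Path("ogbl_biokg/split/random/train.pt"),  # assumed path
    Path("ogbl_biokg/split/random/valid.pt"),  # assumed path
    Path("ogbl_biokg/split/random/test.pt"),   # assumed path
]
# ogbl-biokg triples carry relation types, so has_rel=True takes the "srd" path.
stats, num_nodes, num_edges = parse_ogbl(files, True, "ogbl_biokg_out/")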
Code example #2
File: preprocess.py Project: cthoyt/marius
def parse_ogbn(files, output_dir, num_partitions=1):
    # All files but the last are gzipped lists of node ids, one per split
    # (train/valid/test); the last file is the full gzipped edge list.
    splits = []
    for file in files[0:-1]:
        nodes = pd.read_csv(file, compression='gzip', header=None)
        splits.append(nodes)

    edges = pd.read_csv(files[-1], compression='gzip', header=None)

    # Keep an edge in a split when its source node (column 0) belongs to
    # that split's node list.
    train_edges = edges.loc[np.in1d(edges[0], splits[0])]
    valid_edges = edges.loc[np.in1d(edges[0], splits[1])]
    test_edges = edges.loc[np.in1d(edges[0], splits[2])]

    train_edges.to_csv(str(Path(output_dir) / Path("train.txt")),
                       sep="\t",
                       header=False,
                       index=False)
    valid_edges.to_csv(str(Path(output_dir) / Path("valid.txt")),
                       sep="\t",
                       header=False,
                       index=False)
    test_edges.to_csv(str(Path(output_dir) / Path("test.txt")),
                      sep="\t",
                      header=False,
                      index=False)

    stats, num_nodes, num_edges = general_parser([
        str(Path(output_dir) / Path("train.txt")),
        str(Path(output_dir) / Path("valid.txt")),
        str(Path(output_dir) / Path("test.txt"))
    ], ["sd"], [output_dir],
                                                 num_partitions=num_partitions)
    return stats, num_nodes, num_edges
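
The split logic above keeps an edge when its source node appears in that split's node list. A minimal toy run of the np.in1d filter, with made-up data:

toy_edges = pd.DataFrame([[0, 1], [2, 3], [4, 0]])
train_nodes = pd.DataFrame([0, 4])  # node ids assigned to the train split
print(toy_edges.loc[np.in1d(toy_edges[0], train_nodes)])
# Keeps rows (0, 1) and (4, 0); edge (2, 3) is dropped because node 2 is
# not in the train split. (np.isin is the modern spelling of np.in1d.)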
Code example #3
File: preprocess.py Project: cthoyt/marius
def kinships(output_dir, num_partitions=1, split=(.05, .05)):
    KINSHIPS_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/kinship/kinship.data"
    download_path = download_file(KINSHIPS_URL, output_dir)
    edges = []
    # Each fact looks like "aunt(Margaret, Arthur)": a lowercase relation,
    # then two capitalized names separated by ", " (matched by ".{2}").
    pattern = re.compile(
        r"^(?P<rel>[a-z]+)\((?P<n1>[A-Za-z]+).{2}(?P<n2>[A-Za-z]+)\)\n$")

    with open(download_path, "r") as f:
        for line in f:
            m = pattern.match(line)
            if m is None:  # skip blank or otherwise malformed lines
                continue
            edges.append([m.group("n1"), m.group("rel"), m.group("n2")])

    if (Path(output_dir) / Path("sample_edges.txt")).exists():
        (Path(output_dir) / Path("sample_edges.txt")).unlink()
    np.random.shuffle(edges)
    np.savetxt((Path(output_dir) / Path("sample_edges.txt")),
               edges,
               fmt="%s",
               delimiter="\t",
               newline="\n")

    # Single-file dataset: general_parser carves out the validation and test
    # fractions given by dataset_split. Pass num_partitions through so the
    # parameter is not silently ignored.
    return general_parser([str(Path(output_dir) / Path("sample_edges.txt"))],
                          ["srd"], [output_dir],
                          num_partitions=num_partitions,
                          dataset_split=split)
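
Each non-blank line of kinship.data is a Prolog-style fact, which the regex splits into a relation and two names. A quick standalone check (the sample line is only illustrative of the file's format):

sample = re.compile(
    r"^(?P<rel>[a-z]+)\((?P<n1>[A-Za-z]+).{2}(?P<n2>[A-Za-z]+)\)\n$")
m = sample.match("aunt(Margaret, Arthur)\n")
print(m.group("rel"), m.group("n1"), m.group("n2"))
# -> aunt Margaret Arthur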
Code example #4
File: preprocess.py Project: cthoyt/marius
def live_journal(output_dir, num_partitions=1, split=(.05, .05)):
    LIVE_JOURNAL_URL = "https://snap.stanford.edu/data/soc-LiveJournal1.txt.gz"
    download_path = download_file(LIVE_JOURNAL_URL, output_dir)
    extract_file(download_path)
    return general_parser(
        [str(Path(output_dir) / Path("soc-LiveJournal1.txt"))], ["sd"],
        [output_dir],
        num_partitions=num_partitions,
        dataset_split=split)
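
This and the remaining wrappers all lean on the same two helpers, defined elsewhere in preprocess.py and not shown on this page. A minimal sketch of what they might look like, consistent with how they are called here (the real implementations also need to handle .zip and .tar.gz archives):

import gzip
import shutil
import urllib.request

def download_file(url, output_dir):
    # Sketch: fetch url into output_dir and return the local path.
    path = Path(output_dir) / url.split("/")[-1].split("?")[0]
    urllib.request.urlretrieve(url, str(path))
    return path

def extract_file(path):
    # Sketch: decompress a .gz file next to itself, dropping the suffix.
    with gzip.open(path, "rb") as src, open(str(path)[:-3], "wb") as dst:
        shutil.copyfileobj(src, dst)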
Code example #5
File: preprocess.py Project: cthoyt/marius
def drkg(output_dir, num_partitions=1, split=(.05, .05)):
    DRKG_URL = "https://dgl-data.s3-us-west-2.amazonaws.com/dataset/DRKG/drkg.tar.gz"
    download_path = download_file(DRKG_URL, output_dir)
    extract_file(download_path)

    return general_parser([str(Path(output_dir) / Path("drkg.tsv"))], ["srd"],
                          [output_dir],
                          num_partitions=num_partitions,
                          dataset_split=split)
Code example #6
File: preprocess.py Project: cthoyt/marius
def twitter(output_dir, num_partitions=1, split=(.05, .05)):
    TWITTER_URL = "https://snap.stanford.edu/data/twitter-2010.txt.gz"
    download_path = download_file(TWITTER_URL, output_dir)
    extract_file(download_path)

    return general_parser([str(Path(output_dir) / Path("twitter-2010.txt"))],
                          ["srd"], [output_dir],
                          num_partitions=num_partitions,
                          dataset_split=split,
                          num_line_skip=1)
Code example #7
File: preprocess.py Project: cthoyt/marius
def hetionet(output_dir, num_partitions=1, split=(.05, .05)):
    HETIONET_URL = "https://github.com/hetio/hetionet/raw/master/hetnet/tsv/hetionet-v1.0-edges.sif.gz"
    download_path = download_file(HETIONET_URL, output_dir)
    extract_file(download_path)

    return general_parser(
        [str(Path(output_dir) / Path("hetionet-v1.0-edges.sif"))], ["srd"],
        [output_dir],
        num_partitions=num_partitions,
        dataset_split=split)
Code example #8
File: preprocess.py Project: cthoyt/marius
def fb15k_237(output_dir, num_partitions=1):
    FB15K_237 = "https://data.deepai.org/FB15K-237.2.zip"
    download_path = download_file(FB15K_237, output_dir)
    extract_file(download_path)
    # Flatten the extracted Release/ directory into output_dir. Wrap
    # output_dir in Path() so a plain string argument also works.
    for file in (Path(output_dir) / Path("Release")).iterdir():
        file.rename(Path(output_dir) / Path(file.name))
    (Path(output_dir) / Path("Release")).rmdir()

    return general_parser([
        str(Path(output_dir) / Path("train.txt")),
        str(Path(output_dir) / Path("valid.txt")),
        str(Path(output_dir) / Path("test.txt"))
    ], ["srd"], [output_dir],
                          num_partitions=num_partitions)
Code example #9
File: preprocess.py Project: cthoyt/marius
def wn18(output_dir, num_partitions=1):
    WN18_URL = "https://everest.hds.utc.fr/lib/exe/fetch.php?media=en:wordnet-mlj12.tar.gz"
    download_path = download_file(WN18_URL, output_dir)
    extract_file(download_path)
    # Flatten the extracted wordnet-mlj12/ directory into output_dir.
    for file in (Path(output_dir) / Path("wordnet-mlj12")).iterdir():
        file.rename(Path(output_dir) / Path(file.name))
    (Path(output_dir) / Path("wordnet-mlj12")).rmdir()

    return general_parser([
        str(Path(output_dir) / Path("wordnet-mlj12-train.txt")),
        str(Path(output_dir) / Path("wordnet-mlj12-valid.txt")),
        str(Path(output_dir) / Path("wordnet-mlj12-test.txt"))
    ], ["srd"], [output_dir],
                          num_partitions=num_partitions)
Code example #10
File: preprocess.py Project: cthoyt/marius
def freebase86m(output_dir, num_partitions=1):
    FREEBASE86M_URL = "https://data.dgl.ai/dataset/Freebase.zip"
    download_path = download_file(FREEBASE86M_URL, output_dir)
    extract_file(download_path)
    # Flatten the extracted Freebase/ directory into output_dir.
    for file in (Path(output_dir) / Path("Freebase")).iterdir():
        file.rename(Path(output_dir) / Path(file.name))
    (Path(output_dir) / Path("Freebase")).rmdir()

    return general_parser([
        str(Path(output_dir) / Path("train.txt")),
        str(Path(output_dir) / Path("valid.txt")),
        str(Path(output_dir) / Path("test.txt"))
    ], ["sdr"], [output_dir],
                          num_partitions=num_partitions)
Code example #11
File: preprocess.py Project: cthoyt/marius
def fb15k(output_dir, num_partitions=1):
    FB15K_URL = "https://dl.fbaipublicfiles.com/starspace/fb15k.tgz"
    download_path = download_file(FB15K_URL, output_dir)
    extract_file(download_path)
    # Flatten the extracted FB15k/ directory into output_dir.
    for file in (Path(output_dir) / Path("FB15k")).iterdir():
        file.rename(Path(output_dir) / Path(file.name))
    (Path(output_dir) / Path("FB15k")).rmdir()

    return general_parser([
        str(Path(output_dir) / Path("freebase_mtr100_mte100-train.txt")),
        str(Path(output_dir) / Path("freebase_mtr100_mte100-valid.txt")),
        str(Path(output_dir) / Path("freebase_mtr100_mte100-test.txt"))
    ], ["srd"], [output_dir],
                          num_partitions=num_partitions)
Code example #12
File: preprocess.py Project: cthoyt/marius
def openbiolink_hq(output_dir, num_partitions=1):
    OPENBIOLINK_HQ_URL = "https://zenodo.org/record/3834052/files/HQ_DIR.zip?download=1"
    download_path = download_file(OPENBIOLINK_HQ_URL, output_dir)
    extract_file(download_path)

    return general_parser([
        str(
            Path(output_dir) /
            Path("HQ_DIR/train_test_data/train_sample.csv")),
        str(Path(output_dir) / Path("HQ_DIR/train_test_data/val_sample.csv")),
        str(Path(output_dir) / Path("HQ_DIR/train_test_data/test_sample.csv"))
    ], ["srd"], [output_dir],
                          num_partitions=num_partitions,
                          num_line_skip=0)
Code example #13
File: preprocess.py Project: cthoyt/marius
def codex_l(output_dir, num_partitions=1):
    CODEX_L_TRAIN_URL = "https://raw.githubusercontent.com/tsafavi/codex/master/data/triples/codex-l/train.txt"
    CODEX_L_VALID_URL = "https://raw.githubusercontent.com/tsafavi/codex/master/data/triples/codex-l/valid.txt"
    CODEX_L_TEST_URL = "https://raw.githubusercontent.com/tsafavi/codex/master/data/triples/codex-l/test.txt"
    # The three split files are fetched directly; no extraction is needed,
    # so the returned paths are not kept.
    download_file(CODEX_L_TRAIN_URL, output_dir)
    download_file(CODEX_L_VALID_URL, output_dir)
    download_file(CODEX_L_TEST_URL, output_dir)

    return general_parser([
        str(Path(output_dir) / Path("train.txt")),
        str(Path(output_dir) / Path("valid.txt")),
        str(Path(output_dir) / Path("test.txt"))
    ], ["srd"], [output_dir],
                          num_partitions=num_partitions)
Code example #14
File: preprocess.py Project: cthoyt/marius
def wn18rr(output_dir, num_partitions=1):
    WN18RR_URL = "https://data.dgl.ai/dataset/wn18rr.zip"
    download_path = download_file(WN18RR_URL, output_dir)
    extract_file(download_path)
    # Flatten the extracted wn18rr/ directory into output_dir.
    for file in (Path(output_dir) / Path("wn18rr")).iterdir():
        file.rename(Path(output_dir) / Path(file.name))
    (Path(output_dir) / Path("wn18rr")).rmdir()

    return general_parser([
        str(Path(output_dir) / Path("train.txt")),
        str(Path(output_dir) / Path("valid.txt")),
        str(Path(output_dir) / Path("test.txt"))
    ], ["srd"], [output_dir],
                          num_partitions=num_partitions)
Code example #15
File: preprocess.py Project: cthoyt/marius
def openbiolink_lq(output_dir, num_partitions=1):
    OPENBIOLINK_LQ_URL = "https://samwald.info/res/OpenBioLink_2020_final/ALL_DIR.zip"
    download_path = download_file(OPENBIOLINK_LQ_URL, output_dir)
    extract_file(download_path)

    return general_parser([
        str(
            Path(output_dir) /
            Path("ALL_DIR/train_test_data/train_sample.csv")),
        str(Path(output_dir) / Path("ALL_DIR/train_test_data/val_sample.csv")),
        str(
            Path(output_dir) / Path("ALL_DIR/train_test_data/test_sample.csv"))
    ], ["srd"], [output_dir],
                          num_partitions=num_partitions,
                          num_line_skip=0)
Code example #16
# Test excerpted from the project's test suite; the imports and the enclosing
# class name here are illustrative scaffolding, not shown on the original page.
import unittest
from pathlib import Path

from preprocess import general_parser  # assumed import path


class TestPreprocessor(unittest.TestCase):
    def test_basic(self):
        """
        Check that the preprocessor executes on the test data without error.
        """
        output_dir = "test/test_data/"

        train_file = "train_edges.txt"
        valid_file = "valid_edges.txt"
        test_file = "test_edges.txt"

        stats, num_nodes, num_rels = general_parser(
            [str(Path(output_dir) / train_file),
             str(Path(output_dir) / valid_file),
             str(Path(output_dir) / test_file)],
            ["srd"], [output_dir], num_partitions=1)
        # stats holds the edge counts of the three splits, in order.
        assert stats[0] == 1000
        assert stats[1] == 100
        assert stats[2] == 100
        assert num_nodes == 100
        assert num_rels == 10
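
The assertions pin down the fixture's shape: 1000/100/100 train/valid/test edges over 100 nodes and 10 relation types. The repository ships that fixture; the sketch below only generates files of the same shape for experimentation (a random draw is not guaranteed to use every node or relation id, so the exact num_nodes/num_rels assertions may not hold on generated data):

import numpy as np
from pathlib import Path

rng = np.random.default_rng(0)
out = Path("test/test_data")
out.mkdir(parents=True, exist_ok=True)
for name, n in [("train_edges.txt", 1000),
                ("valid_edges.txt", 100),
                ("test_edges.txt", 100)]:
    # Columns: source node, relation type, destination node.
    triples = np.stack([rng.integers(0, 100, n),
                        rng.integers(0, 10, n),
                        rng.integers(0, 100, n)], axis=1)
    np.savetxt(out / name, triples, fmt="%d", delimiter="\t")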