Example #1
 def test_chazutsu(self):
     path = os.path.join(os.path.dirname(__file__), "../")
     storage = Storage(path)
     r = chazutsu.datasets.DUC2004().download(storage.data_path("raw"))
     df = storage.chazutsu(r.root).data()
     print(df.head(5))
     shutil.rmtree(r.root)
Example #2
    def test_word_vector_resource(self):
        path = os.path.join(os.path.dirname(__file__), "./data")
        storage = Storage(path)

        vocab = Vocabulary()
        vocab.set(["you", "loaded", "word", "vector", "now"])

        vector_size = 50
        word2vec = [
            "you " + " ".join(["0"] * vector_size),
            "word " + " ".join(["1"] * vector_size),
            "now " + " ".join(["2"] * vector_size),
        ]
        word2vec_file = Path(storage.path("external/word2vec_dummyr.txt"))
        with word2vec_file.open(mode="w", encoding="utf-8") as f:
            f.write("\n".join(word2vec))

        wv = WordVector(word2vec_file)
        key_vector = wv.load()
        for k in key_vector:
            self.assertTrue(k in vocab.get())
            self.assertEqual(len(key_vector[k]), vector_size)

        embed = vocab.make_embedding(word2vec_file)
        self.assertEqual(embed.shape, (len(vocab.get()), vector_size))
Example #3
    def test_dataframe(self):
        path = os.path.join(os.path.dirname(__file__), "./data")
        storage = Storage(path)
        df = storage.read("raw/corpus.csv",
                          delimiter="\t",
                          names=["summary", "text"])

        preprocessor = Preprocessor(
            tokenizer=ct.Tokenizer("ja"),
            text_transformers=[ct.text.UnicodeNormalizer()],
            vocabulary=ct.Vocabulary(vocab_size=50))

        preprocessor.fit(df[["summary", "text"]])
        joblib.dump(preprocessor, "test_preprocessor.pkl")

        preprocessor = joblib.load("test_preprocessor.pkl")
        transformed = preprocessor.transform(df)
        inversed = preprocessor.inverse_transform(transformed)

        for c in df.columns:
            for o, i in zip(df[c], inversed[c]):
                self.assertEqual(o, "".join(i))

        print(inversed)
        os.remove("test_preprocessor.pkl")
Example #4
    def test_feed(self):
        path = os.path.join(os.path.dirname(__file__), "./")
        storage = Storage(path)
        df = storage.read("raw/corpus_multi.csv", delimiter="\t",
                          names=["label", "review", "comment"])

        dp = DatasetPreprocessor()
        dp.process("review")\
            .by(ct.text.UnicodeNormalizer())\
            .by(ct.Tokenizer("en"))\
            .by(ct.token.StopwordFilter("en"))\
            .by(ct.Vocabulary(min_df=0, max_df=1.0))\
            .by(ct.formatter.Padding(length=5))\
            .fit(df.loc[:, ["review", "comment"]])
        dp.process("label")\
            .by(ct.formatter.CategoricalLabel(),
                reference=dp.process("review"))

        adjusted = dp(df).preprocess().format().processed
        self.assertEqual(len(adjusted["label"][0]),
                         dp.process("review").preprocessor.vocabulary.count)

        # Iterate
        for batch in dp(df).preprocess().iterate(batch_size=1, epoch=1):
            self.assertEqual(len(batch), 3)
            self.assertEqual(len(batch["review"][0]), 5)

            inversed = dp.inverse(batch)
            self.assertEqual(inversed["label"][0], np.argmax(batch["label"]))
            self.assertLessEqual(len(inversed["review"][0]), 5)
Example #5
 def test_download(self, mock_download):
     path = os.path.join(os.path.dirname(__file__), "./data")
     storage = Storage(path)
     storage.chakin(lang="Japanese")
     vec_path = storage.chakin(name="fastText(ja)")
     self.assertTrue(os.path.exists(vec_path))
     os.remove(vec_path)
Example #6
 def test_download(self):
     url = "https://www.google.com/images/branding/googlelogo/1x/googlelogo_color_272x92dp.png"
     root = os.path.join(os.path.dirname(__file__), "./data")
     storage = Storage(root)
     path = storage.download(url, "raw/image.png")
     self.assertTrue(os.path.exists(path))
     correct_path = os.path.join(root, "raw/image.png")
     self.assertEqual(resolve(path), resolve(correct_path))
     os.remove(path)
Example #7
 def test_download_zip(self, mock_download):
     path = os.path.join(os.path.dirname(__file__), "./data")
     storage = Storage(path)
     vec_path = storage.chakin(name="fastText(ja)")
     print(vec_path)
     self.assertTrue(os.path.exists(vec_path))
     self.assertTrue(
         Path(vec_path).joinpath("word2vec_dummy2.txt").exists())
     shutil.rmtree(vec_path)
Example #8
    def test_read_file(self):
        path = os.path.join(os.path.dirname(__file__), "../")
        storage = Storage(path)
        csv = DataFile(storage.data_path("raw/sample_dataset.csv"))

        content = csv.to_array()
        fetched = list(csv.fetch(progress=True))
        for c, f in zip(content, fetched):
            self.assertEqual(c, f)
Example #9
 def test_read_pdf_as_frame(self):
     path = os.path.join(os.path.dirname(__file__), "../_data")
     storage = Storage(path)
     url = "https://global.toyota/pages/global_toyota/ir/library/annual/2019_001_annual_jp.pdf"
     file_path = storage.download(url, "./test_pdf.pdf")
     reader = PDFReader()
     df = reader.read_to_frame(file_path)
     self.assertGreater(len(df), 1)
     os.remove(file_path)
Example #10
 def test_preprocess(self):
     path = os.path.join(os.path.dirname(__file__), "../_data")
     storage = Storage(path)
     url = "https://global.toyota/pages/global_toyota/ir/library/annual/2019_001_annual_en.pdf"
     file_path = storage.download(url, "./test_pdf_preprocess.pdf")
     reader = PDFReader()
     df = reader.read_to_frame(file_path)
     df = reader.preprocess_frame(df)
     df.to_csv("sample.csv", index=False)
     self.assertGreater(len(df), 1)
     os.remove(file_path)
Example #11
 def __init__(self, root, kind="cora"):
     self.storage = Storage(root)
     self.kind = kind
     self.download_url = "https://s3-ap-northeast-1.amazonaws.com/dev.tech-sketch.jp/chakki/public/graph/"  # noqa
     if kind == "cora":
         self.download_url += "cora.zip"
     elif kind == "citeseer":
         self.download_url += "citeseer.zip"
     elif kind == "pubmed":
         self.download_url += "pubmed.zip"
     else:
         raise Exception("Graph dataset {} is not supported.".format(kind))
Example #12
    def test_read_pdf_text(self):
        path = os.path.join(os.path.dirname(__file__), "../_data")
        storage = Storage(path)
        url = "https://global.toyota/pages/global_toyota/ir/library/annual/2019_001_annual_jp.pdf"
        file_path = storage.download(url, "./test_pdf.pdf")
        reader = PDFReader()
        text = reader.read(file_path, html=True)
        self.assertEqual(text[:6], "<html>")

        text = reader.read(file_path)
        self.assertTrue("目次" in text[:6])
        os.remove(file_path)
Example #13
    def test_series(self):
        path = os.path.join(os.path.dirname(__file__), "./")
        storage = Storage(path)
        df = storage.read("raw/corpus_multi.csv", delimiter="\t",
                          names=["label", "review", "comment"])

        preprocessor = Preprocessor(
                            tokenizer=ct.Tokenizer("en"),
                            text_transformers=[ct.text.UnicodeNormalizer()],
                            token_transformers=[ct.token.StopwordFilter("en")],
                            vocabulary=ct.Vocabulary(min_df=0, max_df=1.0))

        preprocessor.fit(df["review"])
        transformed = preprocessor.transform(df["comment"])
        self.assertEqual(len(transformed), 3)
Example #14
    def test_setup_data_dir(self):
        root = os.path.join(os.path.dirname(__file__), "./tmp_root")
        os.mkdir(root)

        storage = Storage.setup_data_dir(root)
        self.assertTrue(os.path.exists(storage.data_path("raw")))
        self.assertTrue(os.path.exists(storage.data_path("processed")))
        self.assertTrue(os.path.exists(storage.data_path("interim")))
        self.assertTrue(os.path.exists(storage.data_path("external")))
        shutil.rmtree(root)
Example #15
    def test_setup_data_dir(self):
        root = os.path.join(os.path.dirname(__file__), "./tmp_root")
        os.mkdir(root)

        storage = Storage.setup_data_dir(root)
        self.assertTrue(os.path.exists(storage.raw()))
        self.assertTrue(os.path.exists(storage.processed()))
        self.assertTrue(os.path.exists(storage.interim()))
        self.assertTrue(os.path.exists(storage.external()))
        shutil.rmtree(root)
Example #16
    def __init__(self,
                 root="",
                 lang=None,
                 min_df=5,
                 max_df=sys.maxsize,
                 unknown="<unk>",
                 preprocessor_name="preprocessor",
                 log_dir=""):
        default_root = os.path.join(os.path.dirname(__file__), "../../")
        _root = root if root else default_root

        self.storage = Storage(_root)
        self.preprocessor_name = preprocessor_name
        self._base_log_dir = log_dir
        self._built = False
        self.preprocessor = Preprocessor(
            text_transformers=[
                ct.text.UnicodeNormalizer(),
                ct.text.LowerNormalizer()
            ],
            tokenizer=ct.Tokenizer(lang=lang),
            vocabulary=ct.Vocabulary(
                min_df=min_df,
                max_df=max_df,
                unknown=unknown))
Example #17
    def test_convert(self):
        path = os.path.join(os.path.dirname(__file__), "../")
        storage = Storage(path)
        csv = DataFile(storage.data_path("raw/sample_dataset.csv"))

        path_changed = csv.convert(data_dir_to="interim")
        correct = os.path.join(path, "./data/interim/sample_dataset.csv")
        self.assertEqual(resolve(path_changed.path), resolve(correct))

        attr_added = csv.convert(add_attribute="preprocessed")
        correct = storage.data_path("raw/sample_dataset__preprocessed.csv")
        self.assertEqual(resolve(attr_added.path), resolve(correct))

        attr_converted = attr_added.convert(
            attribute_to={"preprocessed": "converted"})
        correct = storage.data_path("raw/sample_dataset__converted.csv")
        self.assertEqual(resolve(attr_converted.path), resolve(correct))

        ext_changed = csv.convert(ext_to=".txt")
        correct = storage.data_path("raw/sample_dataset.txt")
        self.assertEqual(resolve(ext_changed.path), resolve(correct))
Example #18
class GraphDataset():
    def __init__(self, root, kind="cora"):
        self.storage = Storage(root)
        self.kind = kind
        self.download_url = "https://s3-ap-northeast-1.amazonaws.com/dev.tech-sketch.jp/chakki/public/graph/"  # noqa
        if kind == "cora":
            self.download_url += "cora.zip"
        elif kind == "citeseer":
            self.download_url += "citeseer.zip"
        elif kind == "pubmed":
            self.download_url += "pubmed.zip"
        else:
            raise Exception("Graph dataset {} is not supported.".format(kind))

    @property
    def data_root(self):
        return self.storage.data_path("raw/{}".format(self.kind))

    @property
    def download_file_path(self):
        return self.storage.data_path("raw/{}.zip".format(self.kind))

    def download(self, return_mask=True):
        # Check downloaded file
        if os.path.isdir(self.data_root):
            print("{} dataset is already downloaded.".format(self.kind))
            return self.load(return_mask)

        # Download dataset
        resp = requests.get(self.download_url, stream=True)
        with open(self.download_file_path, "wb") as f:
            chunk_size = 1024
            for data in resp.iter_content(chunk_size=chunk_size):
                f.write(data)

        # Expand file
        with zipfile.ZipFile(self.download_file_path) as z:
            z.extractall(path=self.data_root)
        os.remove(self.download_file_path)

        return self.load(return_mask)

    def load(self, return_mask):
        """
        Loads input data (reference from: https://github.com/tkipf/gcn/blob/master/gcn/utils.py)
        ind.dataset_str.x => the feature vectors of the training instances as scipy.sparse.csr.csr_matrix object;
        ind.dataset_str.tx => the feature vectors of the test instances as scipy.sparse.csr.csr_matrix object;
        ind.dataset_str.allx => the feature vectors of both labeled and unlabeled training instances
            (a superset of ind.dataset_str.x) as scipy.sparse.csr.csr_matrix object;
        ind.dataset_str.y => the one-hot labels of the labeled training instances as numpy.ndarray object;
        ind.dataset_str.ty => the one-hot labels of the test instances as numpy.ndarray object;
        ind.dataset_str.ally => the labels for instances in ind.dataset_str.allx as numpy.ndarray object;
        ind.dataset_str.graph => a dict in the format {index: [index_of_neighbor_nodes]} as collections.defaultdict
            object;
        ind.dataset_str.test.index => the indices of test instances in graph, for the inductive setting as list object.
        All objects above must be saved using python pickle module.
        :param return_mask: if True, return boolean train/val/test masks;
            otherwise return train/val/test index arrays.
        :return: adjacency matrix, features and the train/val/test labels.
        """

        names = ["x", "y", "tx", "ty", "allx", "ally", "graph", "test.index"]
        objects = []
        for n in names:
            file_path = os.path.join(self.data_root,
                                     "ind.{}.{}".format(self.kind, n))

            if n != "test.index":
                with open(file_path, "rb") as f:
                    objects.append(pkl.load(f, encoding="latin1"))
            else:
                with open(file_path, encoding="latin1") as f:
                    lines = f.readlines()
                    indices = [int(ln.strip()) for ln in lines]
                objects.append(indices)

        x, y, tx, ty, allx, ally, graph, test_idx = tuple(objects)
        test_idx_range = np.sort(test_idx)

        if self.kind == "citeseer":
            # Fix citeseer dataset (there are some isolated nodes in the graph)
            # Find isolated nodes, add them as zero-vecs into the right position
            test_idx_range_full = range(min(test_idx), max(test_idx) + 1)
            tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
            tx_extended[test_idx_range - min(test_idx_range), :] = tx
            tx = tx_extended
            ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
            ty_extended[test_idx_range - min(test_idx_range), :] = ty
            ty = ty_extended

        features = sp.vstack((allx, tx)).tolil()
        features[test_idx, :] = features[test_idx_range, :]
        adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

        labels = np.vstack((ally, ty))
        labels[test_idx, :] = labels[test_idx_range, :]

        idx_test = test_idx_range
        idx_train = np.array(range(len(y)))
        idx_val = np.array(range(len(y), len(y) + 500))

        if return_mask:
            train_mask = self.sample_mask(idx_train, labels.shape[0])
            val_mask = self.sample_mask(idx_val, labels.shape[0])
            test_mask = self.sample_mask(idx_test, labels.shape[0])

            y_train = np.zeros(labels.shape)
            y_val = np.zeros(labels.shape)
            y_test = np.zeros(labels.shape)
            y_train[train_mask, :] = labels[train_mask, :]
            y_val[val_mask, :] = labels[val_mask, :]
            y_test[test_mask, :] = labels[test_mask, :]

            return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask
        else:

            y_train = labels[idx_train, :]
            y_val = labels[idx_val, :]
            y_test = labels[idx_test, :]
            return adj, features, y_train, y_val, y_test, idx_train, idx_val, idx_test

    def sample_mask(self, idx, length):
        """Create mask."""
        mask = np.zeros(length)
        mask[idx] = 1
        return np.array(mask, dtype=bool)  # np.bool was removed from NumPy; use the builtin bool
Example #19
def run_experiment(original=True, attention=True):
    # Read data
    root = os.path.join(os.path.dirname(__file__), "../../")
    storage = Storage(root)
    gd = GraphDataset(root, kind="cora")
    data = gd.download(return_mask=original)
    A, X, Y_train, Y_val, Y_test, idx_train, idx_val, idx_test = data

    # Parameters
    N = X.shape[0]  # Number of nodes in the graph
    F = X.shape[1]  # Original feature dimension
    n_classes = Y_train.shape[1]  # Number of classes
    F_ = 8  # Output size of first GraphAttention layer
    n_attn_heads = 8  # Number of attention heads in first GAT layer
    dropout_rate = 0.6  # Dropout rate (between and inside GAT layers)
    l2_reg = 5e-4 / 2  # Factor for l2 regularization
    learning_rate = 5e-3  # Learning rate for Adam
    epochs = 120  # Number of training epochs
    es_patience = 100  # Patience for early stopping
    l2 = K.regularizers.l2
    node_size = 32

    # Preprocessing operations
    X = preprocess_features(X)
    A = A + np.eye(A.shape[0])  # Add self-loops

    # Model definition (as per Section 3.3 of the paper)
    if original:
        from gcn.layers.graph_attention_layer_original import GraphAttentionLayer
        X_in = K.layers.Input(shape=(F, ))
        A_in = K.layers.Input(shape=(N, ))
    else:
        from gcn.layers.graph_attention_layer import GraphAttentionLayer
        X_in = K.layers.Input(shape=(N, F))
        A_in = K.layers.Input(shape=(N, N))

    I_in = K.layers.Input(shape=(node_size, ), dtype="int32")

    dropout1 = K.layers.Dropout(dropout_rate)(X_in)

    graph_attention_1 = GraphAttentionLayer(
        feature_units=F_,
        attn_heads=n_attn_heads,
        attn_heads_reduction="concat",
        dropout_rate=dropout_rate,
        activation="elu",
        kernel_regularizer=l2(l2_reg),
        attention=attention,
        attn_kernel_regularizer=l2(l2_reg))([dropout1, A_in])

    dropout2 = K.layers.Dropout(dropout_rate)(graph_attention_1)
    graph_attention_2 = GraphAttentionLayer(
        n_classes,
        attn_heads=1,
        attn_heads_reduction="average",
        dropout_rate=dropout_rate,
        activation="softmax",
        kernel_regularizer=l2(l2_reg),
        attention=attention,
        attn_kernel_regularizer=l2(l2_reg))([dropout2, A_in])

    # Build model
    optimizer = K.optimizers.Adam(lr=learning_rate)

    if original:
        model = K.models.Model(inputs=[X_in, A_in], outputs=graph_attention_2)
        model.compile(optimizer=optimizer,
                      loss="categorical_crossentropy",
                      weighted_metrics=["acc"])
    else:
        output = K.layers.Lambda(lambda x: tf.reshape(tf.batch_gather(
            x, I_in), (-1, node_size, n_classes)))(graph_attention_2)
        model = K.models.Model(inputs=[X_in, A_in, I_in], outputs=output)
        model.compile(optimizer=optimizer,
                      loss="categorical_crossentropy",
                      metrics=["acc"])

    model.summary()

    # Callbacks
    experiment_dir = "log/gan_experiment"
    monitor = "val_acc"
    if original:
        experiment_dir += "_o"
        monitor = "val_weighted_acc"
    if not attention:
        experiment_dir += "_na"

    experiment_dir = storage.data_path(experiment_dir)
    model_path = os.path.join(experiment_dir, "best_model.h5")
    es_callback = K.callbacks.EarlyStopping(monitor=monitor,
                                            patience=es_patience)
    tb_callback = K.callbacks.TensorBoard(log_dir=experiment_dir)
    mc_callback = K.callbacks.ModelCheckpoint(model_path,
                                              monitor=monitor,
                                              save_best_only=True,
                                              save_weights_only=True)

    def batch_generator(indices, label):
        if len(indices) != len(label):
            raise Exception("Does not match length")
        batch_size = len(indices)
        batch_size = batch_size // node_size
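        # `batch_size` here is really the number of steps per epoch; each yielded
        # batch pairs the full graph (X, A) with `node_size` randomly sampled
        # node indices and their labels.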

        def generator():
            while True:
                for i in range(batch_size):
                    _X = np.array([X])
                    _A = np.array([A])
                    samples = np.random.randint(len(indices), size=node_size)
                    _i = np.array([indices[samples]])
                    _label = np.array([label[samples]])
                    yield [_X, _A, _i], _label

        return generator(), batch_size

    if original:
        validation_data = ([X, A], Y_val, idx_val)
        model.fit(
            [X, A],
            Y_train,
            sample_weight=idx_train,
            epochs=epochs,
            batch_size=N,
            validation_data=validation_data,
            shuffle=False,  # Shuffling data means shuffling the whole graph
            callbacks=[es_callback, tb_callback, mc_callback])

        # Load best model
        model.load_weights(model_path)

        # Evaluate model
        eval_results = model.evaluate([X, A],
                                      Y_test,
                                      sample_weight=idx_test,
                                      batch_size=N,
                                      verbose=0)
    else:
        val_generator, val_steps = batch_generator(idx_val, Y_val)
        train_generator, train_steps = batch_generator(idx_train, Y_train)

        model.fit_generator(train_generator,
                            train_steps,
                            validation_data=val_generator,
                            validation_steps=val_steps,
                            epochs=epochs,
                            callbacks=[es_callback, tb_callback, mc_callback])

        # Load best model
        model.load_weights(model_path)

        # Evaluate model
        test_generator, test_steps = batch_generator(idx_test, Y_test)
        eval_results = model.evaluate_generator(test_generator,
                                                test_steps,
                                                verbose=0)

    print("Done.\n"
          "Test loss: {}\n"
          "Test accuracy: {}".format(*eval_results))
Example #20
class BaseTrainer():
    def __init__(self,
                 root="",
                 lang=None,
                 min_df=5,
                 max_df=sys.maxsize,
                 unknown="<unk>",
                 preprocessor_name="preprocessor",
                 log_dir=""):
        default_root = os.path.join(os.path.dirname(__file__), "../../")
        _root = root if root else default_root

        self.storage = Storage(_root)
        self.preprocessor_name = preprocessor_name
        self._base_log_dir = log_dir
        self._built = False
        self.preprocessor = Preprocessor(
            text_transformers=[
                ct.text.UnicodeNormalizer(),
                ct.text.LowerNormalizer()
            ],
            tokenizer=ct.Tokenizer(lang=lang),
            vocabulary=ct.Vocabulary(
                min_df=min_df,
                max_df=max_df,
                unknown=unknown))

    def load_preprocessor(self):
        if os.path.exists(self.preprocessor_path):
            self._built = True
            self.preprocessor = joblib.load(self.preprocessor_path)

    @property
    def preprocessor_path(self):
        if self._base_log_dir:
            path = self._log_dir + "/{}.pkl".format(self.preprocessor_name)
            return self.storage.data_path(path)
        else:
            path = "interim/{}.pkl".format(self.preprocessor_name)
            return self.storage.data_path(path)

    @property
    def _log_dir(self):
        folder = "/" + self._base_log_dir if self._base_log_dir else ""
        log_dir = "log{}".format(folder)
        if not os.path.exists(self.storage.data_path(log_dir)):
            os.mkdir(self.storage.data_path(log_dir))

        return log_dir

    @property
    def log_dir(self):
        return self.storage.data_path(self._log_dir)

    @property
    def model_path(self):
        return self.storage.data_path(self._log_dir + "/model.h5")

    @property
    def tensorboard_dir(self):
        return self.storage.data_path(self._log_dir)

    def download(self):
        raise Exception("You have to specify what kinds of data you use.")

    def build(self, data_kind="train", field="", save=True):
        if not self._built:
            self.load_preprocessor()
        if self._built:
            print("Load existing preprocessor {}.".format(
                os.path.basename(self.preprocessor_path)))
            return 0

        r = self.download()
        if data_kind == "test":
            data = r.test_data()
        elif data_kind == "valid":
            data = r.valid_data()
        else:
            data = r.train_data()

        print("Building Dictionary from {} data...".format(data_kind))
        if not field:
            self.preprocessor.fit(data)
        else:
            self.preprocessor.fit(data[field])

        if save:
            joblib.dump(self.preprocessor, self.preprocessor_path)
        self._built = True
        print("Done!")
Example #21
class MultiNLIDataset():
    def __init__(self, root, min_word_count=3, max_word_count=25, prefix=""):
        self.storage = Storage(root)
        self.nlp = spacy.load("en", parser=False, entity=False)
        self.min_word_count = min_word_count
        self.max_word_count = max_word_count
        self.prefix = prefix

    def train_data(self):
        return pd.read_csv(self.processed_file("train"))

    def test_data(self):
        return pd.read_csv(self.processed_file("test"))

    @classmethod
    def labels(cls):
        return [
            "fiction", "government", "slate", "telephone", "travel",
            "nineeleven", "facetoface", "letters", "oup", "verbatim"
        ]

    def download(self):
        download_dir = self.storage.data_path("raw")
        matched = chazutsu.datasets.MultiNLI.matched().download(download_dir)
        mismatched = chazutsu.datasets.MultiNLI.mismatched().download(
            download_dir)

        for kind in ["train", "test"]:
            data = self._merge_data(matched, mismatched, kind)
            data.to_csv(self.interim_file(kind))
            preprocessed = self.preprocess(data)
            preprocessed = pd.concat(
                [preprocessed["text"], preprocessed["label"]], axis=1)
            preprocessed.to_csv(self.processed_file(kind), index=False)
        return self

    def interim_file(self, kind):
        if self.prefix:
            p = "interim/{}_multi_nli_{}.csv".format(self.prefix, kind)
        else:
            p = "interim/multi_nli_{}.csv".format(kind)

        return self.storage.data_path(p)

    def processed_file(self, kind):
        if self.prefix:
            p = "processed/{}_multi_nli_{}.csv".format(self.prefix, kind)
        else:
            p = "processed/multi_nli_{}.csv".format(kind)

        return self.storage.data_path(p)

    def preprocess(self, df):
        # Drop duplicates
        except_d = df.drop_duplicates(["text"])

        # Count words
        word_count = except_d["text"].apply(lambda x: len(self.nlp(x)))
        except_d = except_d.assign(word_count=pd.Series(word_count).values)

        limited = except_d[(self.min_word_count <= except_d["word_count"])
                           & (except_d["word_count"] <= self.max_word_count)]

        # Equalize data count
        min_count = limited["label"].value_counts().min()
        selected = limited.groupby("label").apply(
            lambda x: x.sample(n=min_count))
        selected = selected.drop(columns=["label", "index"]).reset_index()

        # Convert label to index
        selected["label"] = selected["label"].apply(
            lambda x: self.labels().index(x))

        return selected

    def _merge_data(self, matched, mismatched, kind="train"):
        dataset = []
        for d in [matched, mismatched]:
            if kind == "train":
                _d = d.dev_data()
            else:
                _d = d.test_data()

            _d = pd.concat([_d["genre"], _d["sentence1"]], axis=1)
            dataset.append(_d)
        merged = pd.concat(dataset).reset_index()
        merged.rename(columns={"sentence1": "text", "genre": "label"},
                      inplace=True)
        return merged
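A minimal usage sketch for MultiNLIDataset (the root path is illustrative; this assumes the chazutsu MultiNLI download succeeds and spaCy's English model is installed):

dataset = MultiNLIDataset(root="data").download()
train_df = dataset.train_data()  # columns: "text", "label"
print(train_df.head())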
Example #22
 def test_path(self):
     root = os.path.join(os.path.dirname(__file__), "../../")
     storage = Storage(root)
     correct_path = os.path.join(root, "data/raw")
     self.assertEqual(resolve(storage.data_path("raw")),
                      resolve(correct_path))
Example #23
 def __init__(self, root, min_word_count=3, max_word_count=25, prefix=""):
     self.storage = Storage(root)
     self.nlp = spacy.load("en", parser=False, entity=False)
     self.min_word_count = min_word_count
     self.max_word_count = max_word_count
     self.prefix = prefix