# json_normalize comes from pandas (exposed at the top level since pandas 1.0;
# older releases have it as pandas.io.json.json_normalize); load_json is a
# project-level helper that reads a JSON file from disk.
from pandas import json_normalize

def df_from_file(path):
    data = load_json(path)
    # keep selected experiment_setup fields as columns of the flattened frame
    meta = [["experiment_setup", "subcategory"],
            ["experiment_setup", "method"], ["experiment_setup", "embeddings"]]
    df = json_normalize(data, meta=meta)
    # df["reciprocal_rank"] = 1 / (df["rank"] + 1)
    return df
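
When several result files follow this schema, the per-file frames are easy to stack for comparison. A minimal sketch using plain pandas, assuming df_from_file above; the glob pattern and directory layout are illustrative:

import glob
import pandas as pd

# hypothetical layout: one results JSON per experiment subfolder
frames = [df_from_file(path) for path in glob.glob("experiments/*/results.json")]
combined = pd.concat(frames, ignore_index=True)
print(combined.head())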
Example #2
def load_dataset_infos():
    # dir_datasets, resources and gen_metadata_snippets are module-level
    # names defined elsewhere in the surrounding codebase.
    for f_meta in gen_metadata_snippets(Path(dir_datasets)):
        # print("visiting", f_meta.parent)
        metadata = load_json(f_meta)
        if "name" in metadata:
            metadata["local_path"] = f_meta.parent
            resources[metadata["name"]] = metadata
Example #3
    def _load_dataset(self, dataset):
        """Loading the vocabulary file from the location specified in the
        ldt config file. If vecto-style metadata is found, it will also be
        bundled with the experiment metadata automatically.

        Args:
            dataset (str): either a full path to the dataset or a subfolder
            of "experiments/vocab_samples" folder in the general ldt
            resources location.

        Returns:
            None
        """
        dataset_metadata_path = \
            os.path.join(config["path_to_resources"], "experiments",
                         "vocab_samples", dataset, "metadata.json")
        if os.path.isfile(dataset_metadata_path):
            self.metadata["dataset"] = load_json(dataset_metadata_path)
        else:
            self.metadata["dataset"] = dataset
        # str.strip() removes a set of characters, not a suffix; use the
        # directory containing the metadata file instead
        dataset_path = os.path.dirname(dataset_metadata_path)
        # assume there is a single ".vocab" file in the dataset folder

        file = [x for x in os.listdir(dataset_path) if x.endswith(".vocab")][0]
        dataset = load_resource(os.path.join(dataset_path, file),
                                format="vocab")
        self.dataset = list(dataset)
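
The path handling above implies a specific on-disk layout for vocabulary samples. A sketch of the assumed structure, reconstructed from the os.path.join calls (the sample file name is illustrative):

# <path_to_resources>/experiments/vocab_samples/<dataset>/
#     metadata.json      # optional vecto-style metadata, bundled if present
#     sample.vocab       # a single .vocab file is assumed (the first match is used)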
Example #4
    def __init__(self, experiment_name=config["experiments"]["experiment_name"],
                 extra_metadata=None,
                 dataset=config["experiments"]["vocab_sample"],
                 embeddings=config["experiments"]["embeddings"],
                 output_dir=os.path.join(config["path_to_resources"],
                                         "experiments"),
                 overwrite=config["experiments"]["overwrite"],
                 experiment_subfolder=None):
        """ Initializing an Experiment.

        Args:
            experiment_name (str): the human-readable name of an experiment
                (e.g. "Profiling CBOW with window size 2-10")
            extra_metadata (dict): any extra fields to be added to the
                experiment metadata (overwriting any previously existing fields)
            embeddings (list of str or None): a list of paths to input
                data (each containing a metadata.json file). If set to None,
                the config parameters will be ignored (for experiments where
                embedding metadata has already been processed and can be just
                copied over from the previous step.)
            output_dir (str): the *existing* path for saving the *subfolder*
                named with the specified experiment_name, where the output data
                and metadata.json file will be saved.
            dataset (str): the location of the dataset to be used in the
                experiment.
            overwrite (bool): if True, any previous data for the same
                experiment will be overwritten, and the experiment will be
                re-started. If metadata from a previous experiment is not
                found, this setting is disregarded.
            experiment_subfolder (str): if provided, the experiment results
                will be saved to this subfolder of the "experiments" folder
        """

        if not isinstance(experiment_name, str):
            raise ValueError("Please specify experiment_name argument: a short "
                             "description of the experiment you're conducting.")

        self.output_dir = check_output(output_dir, experiment_subfolder,
                                       experiment_name)
        self.message = None
        if embeddings:
            self.embeddings = check_input(input_data=embeddings)

        self._overwrite = overwrite
        if self._overwrite:
            self._init_metadata(embeddings)

        else:
            metadata_path = os.path.join(self.output_dir, "metadata.json")
            if os.path.isfile(metadata_path):
                self.metadata = load_json(metadata_path)
            else:
                self._init_metadata(embeddings)
                self._overwrite = True

        self._load_dataset(dataset=dataset)
        if isinstance(extra_metadata, dict):
            self.metadata.update(extra_metadata)
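
For reference, a hypothetical instantiation of this constructor; the experiment name reuses the example from the docstring, while the embeddings path, dataset name and extra metadata are illustrative placeholders:

exp = Experiment(
    experiment_name="Profiling CBOW with window size 2-10",
    embeddings=["/path/to/embeddings/cbow_w2"],   # illustrative; each folder may carry metadata.json
    dataset="wiki_sample",                        # illustrative subfolder of experiments/vocab_samples
    extra_metadata={"note": "test run"},
    overwrite=True)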
Example #5
def df_from_file(path):
    data = load_json(path)
    meta = [["experiment_setup", "subcategory"],
            ["experiment_setup", "method"], ["experiment_setup", "embeddings"]]
    dframe = json_normalize(data, meta=meta)
    if "details" in dframe:
        dframe.drop("details", axis="columns", inplace=True)
    dframe["result"] = dframe[
        "result." + dframe["experiment_setup.default_measurement"].unique()[0]]
    # df["reciprocal_rank"] = 1 / (df["rank"] + 1)
    return dframe
Example #6
def df_from_file(path):
    data = load_json(path)
    meta = [["experiment_setup", "task"], ["experiment_setup", "subcategory"],
            ["experiment_setup", "method"], ["experiment_setup", "embeddings"]]
    dframe = json_normalize(data, meta=meta)
    if "details" in dframe:
        dframe.drop("details", axis="columns", inplace=True)
    default_measurement = "accuracy"
    try:
        default_measurement = dframe[
            "experiment_setup.default_measurement"].unique()[0]
    except (KeyError, IndexError):
        logger.warning(f"default_measurement not specified in {path}")
    dframe["result"] = dframe["result." + default_measurement]
    # df["reciprocal_rank"] = 1 / (df["rank"] + 1)
    return dframe
Example #7
    def _init_metadata(self, embeddings):
        """Metadata Initialization helper"""
        self.metadata = {}

        self.metadata["timestamp"] = {}
        self.metadata["version"] = "ldt v. " + __version__
        self.metadata["class"] = "experiment"
        if hasattr(self, "embeddings"):
            self.metadata["embeddings"] = []
            shared_subpath = check_shared_subpath(embeddings, "")
            for embedding in embeddings:

                meta_path = os.path.join(embedding, "metadata.json")
                if os.path.isfile(meta_path):
                    embedding_metadata = load_json(meta_path)
                    embedding_metadata["path"] = embedding
                else:
                    embedding_metadata = create_metadata_stub(embedding, shared_subpath)

                    save_json(embedding_metadata, meta_path)
                self.metadata["embeddings"].append(embedding_metadata)
Example #8
    def get_result(self,
                   embs,
                   path_dataset,
                   path_output='/tmp/text_classification/'):
        self.out = path_output
        self.unit = embs.matrix.shape[1]

        if not os.path.isdir(path_output):
            os.makedirs(path_output)

        # Load a dataset
        self.path_dataset = path_dataset
        if self.path_dataset == 'dbpedia':
            train, test, vocab = text_datasets.get_dbpedia(
                char_based=self.char_based,
                vocab=embs.vocabulary.dic_words_ids,
                shrink=self.shrink)
        elif self.path_dataset.startswith('imdb.'):
            train, test, vocab = text_datasets.get_imdb(
                fine_grained=self.path_dataset.endswith('.fine'),
                char_based=self.char_based,
                vocab=embs.vocabulary.dic_words_ids,
                shrink=self.shrink)
        elif self.path_dataset in [
                'TREC', 'stsa.binary', 'stsa.fine', 'custrev', 'mpqa',
                'rt-polarity', 'subj'
        ]:
            train, test, vocab = text_datasets.get_other_text_dataset(
                self.path_dataset,
                char_based=self.char_based,
                vocab=embs.vocabulary.dic_words_ids,
                shrink=self.shrink)
        else:  # finally, if the dataset is not downloadable, load it from a local path
            train, test, vocab = text_datasets.get_dataset_from_path(
                path_dataset,
                vocab=embs.vocabulary.dic_words_ids,
                char_based=self.char_based,
                shrink=self.shrink)

        print('# train data: {}'.format(len(train)))
        print('# test  data: {}'.format(len(test)))
        print('# vocab: {}'.format(len(vocab)))
        n_class = len(set([int(d[1]) for d in train]))
        print('# class: {}'.format(n_class))

        train_iter = chainer.iterators.SerialIterator(train, self.batchsize)
        test_iter = chainer.iterators.SerialIterator(test,
                                                     self.batchsize,
                                                     repeat=False,
                                                     shuffle=False)

        # Setup a model
        if self.model == 'rnn':
            Encoder = nets.RNNEncoder
        elif self.model == 'cnn':
            Encoder = nets.CNNEncoder
        elif self.model == 'bow':
            Encoder = nets.BOWMLPEncoder
        else:
            raise ValueError('unknown model type: {}'.format(self.model))
        encoder = Encoder(n_layers=self.layer,
                          n_vocab=len(vocab),
                          n_units=self.unit,
                          dropout=self.dropout,
                          wv=embs.matrix)
        model = nets.TextClassifier(encoder, n_class)
        if self.gpu >= 0:
            # Make a specified GPU current
            chainer.backends.cuda.get_device_from_id(self.gpu).use()
            model.to_gpu()  # Copy the model to the GPU

        # Setup an optimizer
        optimizer = chainer.optimizers.Adam()
        optimizer.setup(model)
        optimizer.add_hook(chainer.optimizer.WeightDecay(1e-4))

        # Set up a trainer
        updater = training.StandardUpdater(train_iter,
                                           optimizer,
                                           converter=convert_seq,
                                           device=self.gpu)
        trainer = training.Trainer(updater, (self.epoch, 'epoch'),
                                   out=self.out)

        # Evaluate the model with the test dataset for each epoch
        trainer.extend(
            extensions.Evaluator(test_iter,
                                 model,
                                 converter=convert_seq,
                                 device=self.gpu))

        # Take a best snapshot
        record_trigger = training.triggers.MaxValueTrigger(
            'validation/main/accuracy', (1, 'epoch'))
        trainer.extend(extensions.snapshot_object(model, 'best_model.npz'),
                       trigger=record_trigger)

        # Write a log of evaluation statistics for each epoch
        trainer.extend(extensions.LogReport())
        trainer.extend(
            extensions.PrintReport([
                'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
                'validation/main/accuracy', 'elapsed_time'
            ]))

        # Print a progress bar to stdout
        trainer.extend(extensions.ProgressBar())

        # Save vocabulary and model's setting
        if not os.path.isdir(self.out):
            os.mkdir(self.out)
        vocab_path = os.path.join(self.out, 'vocab.json')
        with open(vocab_path, 'w') as f:
            json.dump(vocab, f)
        model_path = os.path.join(self.out, 'best_model.npz')
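        # Note: experiment_setup below is bound to the instance __dict__ itself
        # (not a copy), so the key assignments that follow also become
        # attributes of self.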
        experiment_setup = self.__dict__
        experiment_setup['vocab_path'] = vocab_path
        experiment_setup['model_path'] = model_path
        experiment_setup['n_class'] = n_class
        experiment_setup['datetime'] = self.current_datetime
        with open(os.path.join(self.out, 'args.json'), 'w') as f:
            json.dump(self.__dict__, f)

        # Run the training
        trainer.run()

        result = {}
        result['experiment_setup'] = experiment_setup
        result['log'] = load_json(os.path.join(self.out, 'log'))
        result['result'] = result['log'][-1]['validation/main/accuracy']
        return result
Example #9
    def get_result(self,
                   embeddings,
                   path_dataset,
                   path_output='/tmp/text_classification/'):
        self.out = path_output
        self.unit = embeddings.matrix.shape[1]

        if not os.path.isdir(path_output):
            os.makedirs(path_output)

        # TODO: move this to protonn ds management
        self.path_dataset = path_dataset
        # if self.path_dataset == 'dbpedia':
        #     train, test, vocab = text_datasets.get_dbpedia(
        #         char_based=self.char_based,
        #         vocab=embeddings.vocabulary.dic_words_ids,
        #         shrink=self.shrink)
        # elif self.path_dataset.startswith('imdb.'):
        #     train, test, vocab = text_datasets.get_imdb(
        #         fine_grained=self.path_dataset.endswith('.fine'),
        #         char_based=self.char_based,
        #         vocab=embeddings.vocabulary.dic_words_ids,
        #         shrink=self.shrink)
        # elif self.path_dataset in ['TREC', 'stsa.binary', 'stsa.fine',
        #                            'custrev', 'mpqa', 'rt-polarity', 'subj']:
        #     train, test, vocab = text_datasets.get_other_text_dataset(
        #         self.path_dataset,
        #         char_based=self.char_based,
        #         vocab=embeddings.vocabulary.dic_words_ids,
        #         shrink=self.shrink)
        # else:  # finallly, if file is not downloadable, load from local path
        print(path_dataset)
        path_adapter = os.path.join(path_dataset, "adapter.py")
        if os.path.isfile(path_adapter):
            spec = importlib.util.spec_from_file_location(
                "ds_adapter", path_adapter)
            module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(module)
            adapter = module.Adapter()
            train, test, _ = adapter.read()
            vocab = embeddings.vocabulary.dic_words_ids
            train = nlp_utils.transform_to_array(train, vocab)
            test = nlp_utils.transform_to_array(test, vocab)

            # exit(0)
        else:
            train, test, vocab = text_datasets.get_dataset_from_path(
                path_dataset,
                vocab=embeddings.vocabulary.dic_words_ids,
                char_based=self.char_based,
                shrink=self.shrink)

        print('# cnt train samples: {}'.format(len(train)))
        print('# cnt test  samples: {}'.format(len(test)))
        print('# size vocab: {}'.format(len(vocab)))
        n_class = len(set([int(d[1]) for d in train]))
        print('# cnt classes: {}'.format(n_class))
        # print(train[0])
        # exit(0)

        train_iter = chainer.iterators.SerialIterator(train, self.batchsize)
        test_iter = chainer.iterators.SerialIterator(test,
                                                     self.batchsize,
                                                     repeat=False,
                                                     shuffle=False)

        # Setup a model
        if self.model == 'rnn':
            Encoder = nets.RNNEncoder
        elif self.model == 'cnn':
            Encoder = nets.CNNEncoder
        elif self.model == 'bow':
            Encoder = nets.BOWMLPEncoder
        else:
            raise ValueError('unknown model type: {}'.format(self.model))
        encoder = Encoder(n_layers=self.layer,
                          n_vocab=len(vocab),
                          n_units=self.unit,
                          dropout=self.dropout,
                          wv=embeddings.matrix)
        model = nets.TextClassifier(encoder, n_class)
        if self.gpu >= 0:
            # Make a specified GPU current
            chainer.backends.cuda.get_device_from_id(self.gpu).use()
            model.to_gpu()  # Copy the model to the GPU

        # Setup an optimizer
        optimizer = chainer.optimizers.Adam()
        optimizer.setup(model)
        optimizer.add_hook(chainer.optimizer.WeightDecay(1e-4))

        # Set up a trainer
        updater = training.StandardUpdater(train_iter,
                                           optimizer,
                                           converter=nlp_utils.convert_seq,
                                           device=self.gpu)
        trainer = training.Trainer(updater, (self.epoch, 'epoch'),
                                   out=self.out)

        # Evaluate the model with the test dataset for each epoch
        trainer.extend(
            extensions.Evaluator(test_iter,
                                 model,
                                 converter=nlp_utils.convert_seq,
                                 device=self.gpu))

        # Take a best snapshot
        record_trigger = training.triggers.MaxValueTrigger(
            'validation/main/accuracy', (1, 'epoch'))
        trainer.extend(extensions.snapshot_object(model, 'best_model.npz'),
                       trigger=record_trigger)

        # Write a log of evaluation statistics for each epoch
        trainer.extend(extensions.LogReport())
        trainer.extend(
            extensions.PrintReport([
                'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
                'validation/main/accuracy', 'elapsed_time'
            ]))

        # Print a progress bar to stdout
        trainer.extend(extensions.ProgressBar())

        # Save vocabulary and model's setting
        if not os.path.isdir(self.out):
            os.mkdir(self.out)
        vocab_path = os.path.join(self.out, 'vocab.json')
        with open(vocab_path, 'w') as f:
            json.dump(vocab, f)
        model_path = os.path.join(self.out, 'best_model.npz')
        experiment_setup = self.__dict__
        # TODO: move all this to the parent class
        experiment_setup['task'] = "text classification"
        experiment_setup['vocab_path'] = vocab_path
        experiment_setup['model_path'] = model_path
        experiment_setup['n_class'] = n_class
        experiment_setup['datetime'] = self.current_datetime
        with open(os.path.join(self.out, 'args.json'), 'w') as f:
            json.dump(self.__dict__, f)

        # Run the training
        trainer.run()

        result = {}
        result['experiment_setup'] = experiment_setup
        result['experiment_setup']['default_measurement'] = 'accuracy'
        result['experiment_setup']['dataset'] = os.path.basename(
            os.path.normpath(path_dataset))
        result['experiment_setup']['method'] = self.model
        result['experiment_setup']['embeddings'] = embeddings.metadata
        result['log'] = load_json(os.path.join(self.out, 'log'))

        # TODO: old version was returning last test value, make a footnote
        # result['result'] = {"accuracy": result['log'][-1]['validation/main/accuracy']}
        accuracy = max(entry["validation/main/accuracy"]
                       for entry in result['log'])
        result['result'] = {"accuracy": accuracy}
        return [result]
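
Unlike Example #8, which reports the last logged validation accuracy, this version takes the best epoch (see the commented-out line above) and nests the score under the measurement name, matching what the df_from_file helpers in Examples #5 and #6 expect. A hypothetical way to read the returned structure (the receiving object and variable names are placeholders):

results = task.get_result(embeddings, path_dataset)   # `task` stands for whichever class owns this method
best = results[0]["result"]["accuracy"]
dataset_name = results[0]["experiment_setup"]["dataset"]
print(dataset_name, best)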
Example #10
    def __init__(self,
                 experiment_name=config["experiments"]["experiment_name"],
                 extra_metadata=None,
                 overwrite=config["experiments"]["overwrite"],
                 ld_scores="main",
                 output_dir=os.path.join(config["path_to_resources"],
                                         "experiments")):
        """ Annotating pre-computed top *n* neighbors for a given vocab sample

        Args:
            experiment_name (str): the human-readable name for the
                current experiment, which will be used to make a subfolder
                storing the generated data. If None, the folder will be simply
                timestamped.
            extra_metadata (dict): any extra fields to be added to the
                experiment metadata (overwriting any previously existing fields)
            output_dir (str): the *existing* path for saving the *subfolder*
                named with the specified experiment_name, where the output data
                and metadata.json file will be saved.
            overwrite (bool): if True, any previous data for the same
                experiment will be overwritten, and the experiment will be
                re-started.
            ld_scores (str or list of str): "all" for all supported scores,
                or a list of ld_scores. Supported values are:

                    - "SharedPOS",
                    - "SharedMorphForm",
                    - "SharedDerivation",
                    - "NonCooccurring",
                    - "CloseNeighbors",
                    - "FarNeighbors",
                    - "LowFreqNeighbors",
                    - 'HighFreqNeighbors',
                    - "GDeps",
                    - "TargetFrequency",
                    - "NeighborFrequency",
                    - "Associations",
                    - "ShortestPathMedian",
                    - "CloseInOntology",
                    - "Synonyms",
                    - "Antonyms",
                    - "Meronyms",
                    - "Hyponyms",
                    - "Hypernyms",
                    - "OtherRelations",
                    - "Numbers",
                    - "ProperNouns",
                    - "Misspellings",
                    - "URLs",
                    - "Filenames",
                    - "ForeignWords",
                    - "Hashtags",
                    - "Noise".

        Returns:

            (None): the table with ld scores for all available variables
                is written to disk together with the experiment metadata.

        """

        super(LDScoring, self).__init__(
            experiment_name=experiment_name, extra_metadata=extra_metadata,
            overwrite=overwrite, embeddings=None, output_dir=output_dir,
            dataset=None, experiment_subfolder="analysis")

        self.metadata["task"] = "ld_scores_analysis"
        self.metadata["uuid"] = str(uuid.uuid4())
        self._load_dataset(dataset=None)
        neighbors_metadata_path = self.output_dir.replace(
            "analysis", "neighbors_annotated")
        neighbors_metadata_path = os.path.join(neighbors_metadata_path,
                                               "metadata.json")
        if not os.path.isfile(neighbors_metadata_path):
            raise IOError("The metadata for the annotated neighborhood files "
                          "was not found at " + neighbors_metadata_path)
        else:
            neighbors_metadata = load_json(neighbors_metadata_path)
            self.metadata["embeddings"] = neighbors_metadata["embeddings"]
            self.metadata["annotation"] = neighbors_metadata
            del self.metadata["annotation"]["embeddings"]
            self.embeddings = []
            for embedding in self.metadata["embeddings"]:
                self.embeddings.append(embedding["path"])

        self.supported_vars = [
            "SharedPOS", "SharedMorphForm", "SharedDerivation",
            "NonCooccurring", "GDeps", "TargetFrequency", "NeighborFrequency",
            "Associations", "ShortestPath", "Synonyms", "Antonyms", "Meronyms",
            "Hyponyms", "Hypernyms", "OtherRelations", "Numbers",
            "ProperNouns", "Misspellings", "URLs", "Filenames", "ForeignWords",
            "Hashtags", "Noise"
        ]

        self.continuous_vars = [
            'ShortestPath', 'TargetFrequency', 'NeighborFrequency',
            'CloseNeighbors', 'FarNeighbors'
        ]

        self.binary_vars = [x for x in self.supported_vars
                            if x not in self.continuous_vars]

        output_vars = [
            "Model", "SharedPOS", "SharedMorphForm", "SharedDerivation",
            "NonCooccurring", "CloseNeighbors", "FarNeighbors",
            "LowFreqNeighbors", 'HighFreqNeighbors', "GDeps", "Associations",
            "ShortestPathMedian", "CloseInOntology", "Synonyms", "Antonyms",
            "Meronyms", "Hyponyms", "Hypernyms", "OtherRelations", "Numbers",
            "ProperNouns", "Misspellings", "URLs", "Filenames", "ForeignWords",
            "Hashtags", "Noise"
        ]

        # corpus_specific = ["NonCooccurring", "LowFreqNeighbors",
        #                    "HighFreqNeighbors"]
        #
        # if not config["corpus"]:
        #     output_vars = [x for x in output_vars if not x in corpus_specific]

        output_scores_error = "The ld_scores argument is invalid. It should " \
                              "be 'all' for all supported relations, " \
                              "or a list with one or more of the following " \
                              "values:\n" + ", ".join(output_vars)

        if ld_scores == "all":
            self.output_vars = output_vars
        elif ld_scores == "main":
            exclude = [
                "ShortestPathMedian", "URLs", "Filenames", "Hashtags", "Noise"
            ]
            if not config["corpus"]:
                exclude += [
                    "NonCooccurring", "LowFreqNeighbors", 'HighFreqNeighbors',
                    "GDeps"
                ]

            self.output_vars = [x for x in output_vars if x not in exclude]

        else:
            if isinstance(ld_scores, list):
                unsupported = [x for x in ld_scores if x not in output_vars]
                if unsupported:
                    raise ValueError(output_scores_error)
                else:
                    self.output_vars = [
                        x for x in output_vars if x in ld_scores
                    ]
                    self.output_vars = ["Model"] + self.output_vars
            else:
                raise ValueError(output_scores_error)
        self.metadata["ld_scores"] = self.output_vars
        self.message = None  #"\n Annotation done! Analyzing the data now."
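
For reference, a hypothetical way to set up this scoring step, relying only on what the constructor above does and assuming the corresponding neighbors_annotated experiment folder already contains a metadata.json (otherwise the constructor raises IOError); the experiment name is illustrative:

scoring = LDScoring(experiment_name="Profiling CBOW with window size 2-10",
                    ld_scores="main",
                    overwrite=True)
print(scoring.metadata["ld_scores"])   # the selected score columns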
Example #11
    def __init__(self,
                 experiment_name=config["experiments"]["experiment_name"],
                 extra_metadata=None,
                 overwrite=config["experiments"]["overwrite"],
                 ld_scores="main",
                 output_dir=os.path.join(config["path_to_resources"],
                                         "experiments"),
                 ldt_analyzer=None,
                 multiprocessing=config["experiments"]["multiprocessing"],
                 debugging=False):
        """ Annotating pre-computed top *n* neighbors for a given vocab sample

        Args:
            experiment_name (str): the human-readable name for the
                current experiment, which will be used to make a subfolder
                storing the generated data. If None, the folder will be simply
                timestamped.
            extra_metadata (dict): any extra fields to be added to the
                experiment metadata (overwriting any previously existing fields)
            output_dir (str): the *existing* path for saving the *subfolder*
                named with the specified experiment_name, where the output data
                and metadata.json file will be saved.
            overwrite (bool): if True, any previous data for the same
                experiment will be overwritten, and the experiment will be
                re-started.
            ldt_analyzer: :class:`~ldt.relations.pair.RelationsInPair`
                instance, with lexicographic, morphological and normalization
                resources set up as desired (see tutorial and
                class documentation). If None, default settings for English
                will be used.
            ld_scores (str or list of str): "all" for all supported scores,
                or a list of ld_scores. Supported values are:

                    - "SharedPOS",
                    - "SharedMorphForm",
                    - "SharedDerivation",
                    - "NonCooccurring",
                    - "GDeps",
                    - "TargetFrequency",
                    - "NeighborFrequency",
                    - "Associations",
                    - "ShortestPath",
                    - "Synonyms",
                    - "Antonyms",
                    - "Meronyms",
                    - "Hyponyms",
                    - "Hypernyms",
                    - "OtherRelations",
                    - "Numbers",
                    - "ProperNouns",
                    - "Noise",
                    - "URLs",
                    - "Filenames",
                    - "ForeignWords",
                    - "Hashtags"
                    - 'TargetFrequency',
                    - 'NeighborFrequency'.

        See more details for these scores `here
        <http://ldtoolkit.space/ldscores/>`_.

        Returns:
            (None): the annotated neighbors file will be written to disk
                together with the experiment metadata.

        """

        super(AnnotateVectorNeighborhoods, self).__init__(
            experiment_name=experiment_name, extra_metadata=extra_metadata,
            overwrite=overwrite, embeddings=None, output_dir=output_dir,
            dataset=None, experiment_subfolder="neighbors_annotated")

        self.metadata["task"] = "annotate_neighbors"
        self.metadata["uuid"] = str(uuid.uuid4())
        self.metadata["ldt_config"] = config
        self.metadata["output_dir"] = self.output_dir
        self.metadata["debugging"] = debugging
        self.metadata["multiprocessing"] = multiprocessing

        self._load_dataset(dataset=None)
        neighbors_metadata_path = self.output_dir.replace(
            "neighbors_annotated", "neighbors")

        neighbors_metadata_path = os.path.join(neighbors_metadata_path,
                                               "metadata.json")
        if not os.path.isfile(neighbors_metadata_path):
            raise IOError("The metadata for the neighborhood generation task "
                          "was not found at " + neighbors_metadata_path)
        else:
            self.metadata["neighbors_metadata_path"] = neighbors_metadata_path
            neighbors_metadata = load_json(neighbors_metadata_path)
            self.metadata["embeddings"] = neighbors_metadata["embeddings"]
            self.embeddings = []
            for embedding in self.metadata["embeddings"]:
                self.embeddings.append(embedding["path"])

        self.message = "\n\nStarting LD annotation. This will take a while " \
                       "for " \
                       "the first files, but the remainder should go faster, " \
                       "because many neighbor pairs will be the same."

        # self.metadata["failed_pairs"] = []
        self.metadata["missed_pairs"] = []
        self.metadata["total_pairs"] = 0

        self.supported_vars = [
            "SharedPOS", "SharedMorphForm", "SharedDerivation",
            "NonCooccurring", "GDeps", "TargetFrequency", "NeighborFrequency",
            "Associations", "ShortestPath", "Synonyms", "Antonyms", "Meronyms",
            "Hyponyms", "Hypernyms", "OtherRelations", "Numbers",
            "ProperNouns", "Misspellings", "URLs", "Filenames", "ForeignWords",
            "Hashtags", "Noise"
        ]

        self.continuous_vars = [
            'ShortestPath', 'TargetFrequency', 'NeighborFrequency'
        ]

        corpus_specific = [
            "NonCooccurring", "TargetFrequency", "NeighborFrequency"
        ]
        if not config["corpus"]:
            for i in [self.supported_vars, self.continuous_vars]:
                i = [x for x in i if not i in corpus_specific]

        self.binary_vars = [x for x in self.supported_vars
                            if x not in self.continuous_vars]

        ld_scores_error = "The ld_scores argument is invalid. It should be " \
                          "'all' for all supported relations, or a list with " \
                          "one or more of the following values:\n" + \
                          ", ".join(self.supported_vars)

        if ld_scores == "all":
            self._ld_scores = self.supported_vars

        elif ld_scores == "main":
            exclude = [
                "ShortestPath", "URLs", "Filenames", "Hashtags", "Noise"
            ]
            if not config["corpus"]:
                exclude += [
                    "NonCooccurring", "GDeps", "TargetFrequency",
                    "NeighborFrequency"
                ]
            self._ld_scores = [
                x for x in self.supported_vars if x not in exclude
            ]
        else:
            if isinstance(ld_scores, list):
                unsupported = [
                    x for x in ld_scores if x not in self.supported_vars
                ]
                if unsupported:
                    raise ValueError(ld_scores_error)
                else:
                    self._ld_scores = [
                        x for x in self.supported_vars if x in ld_scores
                    ]
            else:
                raise ValueError(ld_scores_error)

        self.metadata["ld_scores"] = self._ld_scores
        self.metadata["continuous_vars"] = self.continuous_vars
        self.metadata["binary_vars"] = self.binary_vars

        self.ldt_analyzer = ldt_analyzer
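
Likewise, a hypothetical instantiation of the annotation step, assuming the preceding neighbor-generation experiment has already written its metadata.json (otherwise the constructor raises IOError); the experiment name and multiprocessing value are illustrative:

annotation = AnnotateVectorNeighborhoods(
    experiment_name="Profiling CBOW with window size 2-10",
    ld_scores="main",
    multiprocessing=2,
    overwrite=True)
print(annotation.metadata["ld_scores"])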