    def __init__(self, name, config):
        """
        Initializes the task object. Calls the base constructor, downloads the dataset if it is not present and loads the appropriate files depending on the mode (train or test).

        :param name: Name of the component.

        :param config: Dictionary of parameters (read from the configuration ``.yaml`` file).
        """
        # Call constructors of parent classes.
        LanguageIdentification.__init__(self, name, WiLYLanguageIdentification,
                                        config)

        # Get absolute path.
        self.data_folder = os.path.expanduser(self.config['data_folder'])

        # Generate the dataset (can be turned off).
        filenames = ["x_train.txt", "y_train.txt", "x_test.txt", "y_test.txt"]
        if not io.check_files_existence(self.data_folder, filenames):
            # Download and unpack.
            url = "https://zenodo.org/record/841984/files/wili-2018.zip?download=1"
            zipfile_name = "wili-2018.zip"
            io.download_extract_zip_file(self.logger, self.data_folder, url,
                                         zipfile_name)

        # Select set.
        if self.config['use_train_data']:
            inputs_file = "x_train.txt"
            targets_file = "y_train.txt"
        else:
            inputs_file = "x_test.txt"
            targets_file = "y_test.txt"

        # Load files.
        self.inputs = io.load_string_list_from_txt_file(
            self.data_folder, inputs_file)
        self.targets = io.load_string_list_from_txt_file(
            self.data_folder, targets_file)

        # Assert that they are equal in size!
        assert len(self.inputs) == len(
            self.targets
        ), "Number of inputs loaded from {} not equal to number of targets loaded from {}!".format(
            inputs_file, targets_file)
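
For reference, a minimal standalone sketch (not part of the task code) of how the two loaded lists line up into (sentence, language label) pairs; the sample strings below are illustrative and are not taken from the WiLI-2018 files:

# Minimal sketch: x_*.txt holds one sentence per line, y_*.txt one language
# label per line, and the two files align line by line. Strings are made up.
inputs = ["Dies ist ein Beispielsatz.", "This is an example sentence."]
targets = ["deu", "eng"]

assert len(inputs) == len(targets), "Inputs and targets must align line by line."

for sentence, language in zip(inputs, targets):
    print("{} -> {}".format(language, sentence))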
Example #2
    def __init__(self, name, config):
        """
        Initializes the task object. Calls the base constructor.

        :param name: Name of the component.

        :param config: Dictionary of parameters (read from the configuration ``.yaml`` file).
        """
        # Call constructors of parent classes.
        Task.__init__(self, name, WiLYNGramLanguageModeling, config) 

        # Set key mappings.
        self.key_inputs = self.stream_keys["inputs"]
        self.key_targets = self.stream_keys["targets"]

        # Get absolute path.
        self.data_folder = os.path.expanduser(self.config['data_folder'])

        # Get size of context.
        self.context = self.config['context']

        # Select set.
        if self.config['use_train_data']:
            inputs_file = "x_train.txt"
            ngrams_file = "ngrams_train.txt"
        else:
            inputs_file = "x_test.txt"
            ngrams_file = "ngrams_test.txt"

        # Check if we can load ngrams.
        if not io.check_file_existence(self.data_folder, ngrams_file):
            # Sadly not, we have to generate them.
            if not io.check_file_existence(self.data_folder, inputs_file):
                # Even worse - we have to download WiLI first.
                url = "https://zenodo.org/record/841984/files/wili-2018.zip?download=1"
                zipfile_name = "wili-2018.zip"
                io.download_extract_zip_file(self.logger, self.data_folder, url, zipfile_name)


            # Load file.
            inputs = io.load_string_list_from_txt_file(self.data_folder, inputs_file)

            self.logger.info("Please wait, generating n-grams...")
            self.ngrams_sent = []
            # Now we have to split the sentences into n-grams.
            for sentence in inputs:
                # Split sentence into words.
                words = sentence.split()
                
                # Build a list of ngrams.
                for i in range(len(words) - self.context):
                    ngram = [words[j] for j in range(i, i+1+self.context)]
                    self.ngrams_sent.append(' '.join(ngram))

            # Assert that there is at least one n-gram!
            assert len(self.ngrams_sent) > 0, "Number of n-grams generated on the basis of '{}' must be greater than 0!".format(inputs_file)
            # Done.
            self.logger.info("Generated {} n-grams, example:\n{}".format(len(self.ngrams_sent), self.ngrams_sent[0]))

            self.logger.info("Saving {} n-grams to file '{}'".format(len(self.ngrams_sent), ngrams_file))
            # N-grams generated, save them to file.
            io.save_string_list_to_txt_file(self.data_folder, ngrams_file, self.ngrams_sent)
        else:
            self.logger.info("Please wait, loading n-grams from file '{}'".format(ngrams_file))
            # Load file.
            self.ngrams_sent = io.load_string_list_from_txt_file(self.data_folder, ngrams_file)

            # Assert that there is at least one n-gram!
            assert len(self.ngrams_sent) > 0, "Number of n-grams loaded from {} must be greater than 0!".format(ngrams_file)
            # Done.
            self.logger.info("Loaded {} n-grams, example:\n{}".format(len(self.ngrams_sent), self.ngrams_sent[0]))
        
        # Split words in n-grams.
        self.ngrams = [ngram.split() for ngram in self.ngrams_sent]
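
Below is a standalone sketch of the n-gram generation loop from the constructor above, run on a made-up sentence with ``context`` set to 2; each n-gram consists of ``context`` words of history followed by the next word:

# Standalone sketch of the n-gram generation above, with context = 2.
# The sentence is made up; the loop mirrors the one in the constructor.
context = 2
sentence = "the quick brown fox jumps"
words = sentence.split()

ngrams_sent = []
for i in range(len(words) - context):
    ngram = [words[j] for j in range(i, i + 1 + context)]
    ngrams_sent.append(' '.join(ngram))

print(ngrams_sent)
# ['the quick brown', 'quick brown fox', 'brown fox jumps']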
Example #3
def load_pretrained_embeddings(logger, folder, embeddings_name, word_to_ix,
                               embeddings_size):
    """
    Creates an embedding matrix for the words from the provided (word:index) mapping (dictionary).

    Loads the pretrained vectors (GloVe, or the MIMIC fastText embeddings) for the words found in the dictionary.

    Words out of the dictionary are initialized with random vectors.

    Available embeddings:
        - glove.6B.50d.txt
        - glove.6B.100d.txt
        - glove.6B.200d.txt
        - glove.6B.300d.txt
        - glove.42B.300d.txt
        - glove.840B.300d.txt
        - glove.twitter.27B.txt
        - mimic.fastText.no_clean.300d.pickled

    :param logger: Logger object.

    :param folder: Relative path to the folder.
    :type folder: str

    :param embeddings_name: Name of the file containing the pretrained embeddings (one of the files listed above).
    :type embeddings_name: str

    :param word_to_ix: (word:index) mappings
    :type word_to_ix: dict

    :param embeddings_size: Embeddings size. Warning: must match the length of the vectors in the selected file.

    :return: Torch tensor with loaded (or random) vectors.
    """
    # https://medium.com/@martinpella/how-to-use-pre-trained-word-embeddings-in-pytorch-71ca59249f76
    # http://ronny.rest/blog/post_2017_08_04_glove/

    # Available embeddings and the corresponding (url, zip file) pairs.
    # https://nlp.stanford.edu/projects/glove/
    pretrained_embeddings_urls = {
        "glove.6B.50d.txt": ("http://nlp.stanford.edu/data/glove.6B.zip", "glove.6B.zip"),
        "glove.6B.100d.txt": ("http://nlp.stanford.edu/data/glove.6B.zip", "glove.6B.zip"),
        "glove.6B.200d.txt": ("http://nlp.stanford.edu/data/glove.6B.zip", "glove.6B.zip"),
        "glove.6B.300d.txt": ("http://nlp.stanford.edu/data/glove.6B.zip", "glove.6B.zip"),
        "glove.42B.300d.txt": ("http://nlp.stanford.edu/data/glove.42B.300d.zip", "glove.42B.300d.zip"),
        "glove.840B.300d.txt": ("http://nlp.stanford.edu/data/glove.840B.300d.zip", "glove.840B.300d.zip"),
        "glove.twitter.27B.txt": ("http://nlp.stanford.edu/data/glove.twitter.27B.zip", "glove.twitter.27B.zip"),
        "mimic.fastText.no_clean.300d.pickled": (
            "https://mednli.blob.core.windows.net/shared/word_embeddings/mimic.fastText.no_clean.300d.pickled",
            "mimic.fastText.no_clean.300d.pickled"),
    }

    if embeddings_name not in pretrained_embeddings_urls:
        logger.error(
            "Cannot load the indicated pretrained embeddings ('{}' must be one of {})"
            .format(embeddings_name, list(pretrained_embeddings_urls.keys())))
        exit(1)

    logger.info("Initializing embeddings in folder {}".format(folder))

    num_loaded_embs = 0
    # Set random embeddings for words "out of vocabulary".
    # embeddings = np.zeros((len(word_to_ix), embeddings_size))
    embeddings = np.random.normal(scale=0.6,
                                  size=(len(word_to_ix), embeddings_size))

    # Open the embeddings file.
    if embeddings_name == "mimic.fastText.no_clean.300d.pickled":
        # Check if pickle exists.
        file_name = pretrained_embeddings_urls[embeddings_name][1]
        if not io.check_file_existence(folder, file_name):
            # Try to download the pickle.
            url = pretrained_embeddings_urls[embeddings_name][0]
            logger.info("Downloading file '{}' from {}".format(file_name, url))
            io.download(folder, file_name, url)
        else:
            logger.info("File '{}' found in {}".format(embeddings_name,
                                                       folder))

        # Load word embeddings map.
        word_embedding_map = io.load_pickle(logger,
                                            os.path.join(folder, file_name))
        # Iterate over map and cherry pick the vectors that fit our vocabulary.
        for w, index in word_to_ix.items():
            if w in word_embedding_map:
                vector = word_embedding_map[w]
                assert (
                    len(vector) == embeddings_size
                ), "Embeddings size must be equal to the size of pretrained embeddings!"
                embeddings[index] = vector
                num_loaded_embs += 1

    else:

        # Check presence of the file.
        if not io.check_file_existence(folder, embeddings_name):
            # Download and extract the zip file with embeddings.
            io.download_extract_zip_file(
                logger, folder, pretrained_embeddings_urls[embeddings_name][0],
                pretrained_embeddings_urls[embeddings_name][1])
        else:
            logger.info(
                "File '{}' containing pretrained embeddings found in '{}' folder"
                .format(embeddings_name, folder))

        # Get number of lines/vectors (needed for the progress bar).
        with open(os.path.join(folder, embeddings_name)) as f:
            num_lines = sum(1 for _ in f)
        t = tqdm.tqdm(total=num_lines)

        with open(os.path.join(folder, embeddings_name)) as f:
            # Parse file and cherry pick the vectors that fit our vocabulary.
            for line in f:
                values = line.split()
                if len(values) > embeddings_size + 1:
                    # Case: the "word" actually consists of two (or more) tokens.
                    num_words = len(values) - embeddings_size
                    words = values[0:num_words]
                    word = ' '.join(words)
                    # Get remaining vector.
                    vector = np.array(values[num_words:], dtype='float32')
                else:
                    # Get word.
                    word = values[0]
                    # Get remaining vector.
                    vector = np.array(values[1:], dtype='float32')
                # Get index.
                index = word_to_ix.get(word)
                if index:
                    assert (
                        len(vector) == embeddings_size
                    ), "Embeddings size must be equal to the size of pretrained embeddings!"
                    # Ok, set vector.
                    embeddings[index] = vector
                    # Increment counter.
                    num_loaded_embs += 1
                t.update()
            t.close()

    logger.info(
        "Loaded {} pretrained embeddings for vocabulary of size {} from {}".
        format(num_loaded_embs, len(word_to_ix), embeddings_name))

    # Return matrix with embeddings.
    return torch.from_numpy(embeddings).float()
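
A standalone sketch of the line-parsing branch used above, showing how an entry whose first field contains a space (the case handled by the multi-word branch) is separated from its vector; the toy lines use a 3-dimensional vector and made-up values, not real GloVe entries:

import numpy as np

# Toy 3-dimensional "embeddings" illustrating the parsing branch above.
embeddings_size = 3
lines = [
    "cat 0.1 0.2 0.3",        # ordinary single-token entry
    "new york 0.4 0.5 0.6",   # token containing a space
]

for line in lines:
    values = line.split()
    if len(values) > embeddings_size + 1:
        # More fields than vector components + 1: the word itself contains spaces.
        num_words = len(values) - embeddings_size
        word = ' '.join(values[0:num_words])
        vector = np.array(values[num_words:], dtype='float32')
    else:
        word = values[0]
        vector = np.array(values[1:], dtype='float32')
    print(word, vector)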
Example #4
    def __init__(self, name, config):
        """
        The init method downloads the required files, loads the file associated with the given subset (train/valid/test),
        concatenates all sentences and tokenizes them using NLTK's WhitespaceTokenizer.

        It also stores the intermediate results, so, for example, if a file with the tokenized set is found, it is simply loaded.

        :param name: Name of the component.

        :param config: Dictionary of parameters (read from the configuration ``.yaml`` file).
        """
        # Call constructor of the parent class.
        Task.__init__(self, name, WikiTextLanguageModeling, config) 

        # Set streams key mappings.
        self.key_sources = self.stream_keys["sources"]
        self.key_targets = self.stream_keys["targets"]

        # Get absolute path to data folder.
        self.data_folder = os.path.expanduser(self.config['data_folder'])

        # Get dataset.
        if (self.config['dataset'] is None) or (self.config['dataset'] not in ["wikitext-2", "wikitext-103"]):
            raise ConfigurationError("Task supports two 'dataset' options: 'wikitext-2', 'wikitext-103' ")
        dataset = self.config['dataset']

        # Get (sub)set: train/valid/test.
        if (self.config['subset'] is None) or (self.config['subset'] not in ['train', 'valid', 'test']):
            raise ConfigurationError("Task supports three 'subset' options: 'train', 'valid', 'test' ")
        subset = self.config['subset']

        # Check if file with tokenized words exists.
        filename_tokenized_words = "wiki." + subset + ".tokenized_words"

        if not io.check_files_existence(self.data_folder, filename_tokenized_words):
            # If not, we must generate (and save it) using source files.

            # Names of files used by this task.
            filenames = ["wiki.train.tokens", "wiki.valid.tokens", "wiki.test.tokens"]

            # Initialize dataset if files do not exist.
            if not io.check_files_existence(self.data_folder, filenames):
                # Set url and source filename depending on dataset.
                if dataset == "wikitext-2":
                    url = "https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip"
                    zipfile_name = "wikitext-2-v1.zip"
                else: 
                    url = "https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip"
                    zipfile_name = "wikitext-103-v1.zip"

                # Download and extract wikitext zip.
                io.download_extract_zip_file(self.logger, self.data_folder, url, zipfile_name)

                # Move extracted files to the right folder.
                io.move_files_between_dirs(self.logger, os.path.join(self.data_folder, dataset), self.data_folder, filenames)
            else:
                self.logger.info("Files {} found in folder '{}'".format(filenames, self.data_folder))


            # Load the whole sentences.
            sentences = io.load_string_list_from_txt_file(self.data_folder, "wiki."+subset+".tokens")
            self.logger.info("Loaded {} sentences from the 'wiki.{}.tokens' subset".format(len(sentences), subset))

            # Generate text full of tokens.
            self.logger.info("Please wait, using NLTK to tokenize the loaded sentences...")
            # Create a single text by replacing newlines with <eos> tokens.
            text = " <eos> ".join(sentences)
            # Tokenize.
            tokenizer = WhitespaceTokenizer()
            self.tokens = tokenizer.tokenize(text)
            # Save to file.
            io.save_string_list_to_txt_file(self.data_folder, filename_tokenized_words, self.tokens)
            self.logger.info("Created text consisting of {} tokens and saved it to '{}'".format(len(self.tokens), filename_tokenized_words))
        else:
            # Ok, file with tokens exists, load it.
            self.tokens = io.load_string_list_from_txt_file(self.data_folder, filename_tokenized_words)
            self.logger.info("Load text consisting of {} tokens from '{}'".format(len(self.tokens), filename_tokenized_words))

        # Get the required sample length.
        self.sentence_length = self.config['sentence_length']
        # Calculate the size of dataset.
        self.dataset_length = len(self.tokens) - self.sentence_length - 1 # as target is "shifted" by 1.

        # Display exemplary sample.
        self.logger.info("Exemplary sample:\n  source: {}\n  target: {}".format(self.tokens[0:self.sentence_length], self.tokens[1:self.sentence_length+1]))
Example #5
    def __init__(self, name, config):
        """
        The init method downloads the required files, loads the file associated with the given subset (train/valid/test),
        concatenates all sentences and tokenizes them using NLTK's WhitespaceTokenizer.

        :param name: Name of the component.

        :param config: Dictionary of parameters (read from the configuration ``.yaml`` file).
        """
        # Call constructor of the parent class.
        Task.__init__(self, name, TranslationPairs, config) 

        # Set streams key mappings.
        self.key_sources = self.stream_keys["sources"]
        self.key_targets = self.stream_keys["targets"]

        # Get absolute path to data folder.
        self.data_folder = os.path.expanduser(self.config['data_folder'])

        # Get dataset.
        if (self.config['dataset'] is None) or (self.config['dataset'] not in ["eng-fra", "eng-pol"]):
            raise ConfigurationError("Task supports only 'dataset' options: 'eng-fra', 'eng-pol'")
        dataset = self.config['dataset']

        # Get (sub)set: train/valid/test.
        if (self.config['subset'] is None) or (self.config['subset'] not in ['train', 'valid', 'test']):
            raise ConfigurationError("Task supports one 'subset' options: 'train', 'valid', 'test' ")
        subset = self.config['subset']

        # Extract source and target language names.
        self.lang_source, self.lang_target = self.config['dataset'].split('-')


        # Names of files used by this task.
        filenames = [
            self.lang_source + ".train.txt",
            self.lang_target + ".train.txt", 
            self.lang_source + ".valid.txt", 
            self.lang_target + ".valid.txt", 
            self.lang_source + ".test.txt", 
            self.lang_target + ".test.txt"
            ]

        # Initialize dataset if files do not exist.
        if not io.check_files_existence(os.path.join(self.data_folder, dataset), filenames):
            # Set url and source filename depending on dataset.
            url = "https://www.manythings.org/anki/" + self.lang_target + "-" + self.lang_source + ".zip"
            zipfile_name = "translate_" + self.lang_target + "_" + self.lang_source + ".zip"

            with tempfile.TemporaryDirectory() as tmpdirname:
                # Download and extract the translation zip.
                io.download_extract_zip_file(self.logger, tmpdirname, url, zipfile_name)

                # Create train, valid, test files from the downloaded file
                lines = io.load_string_list_from_txt_file(tmpdirname, self.lang_target + ".txt")

                # Shuffle the lines
                random.seed(42)
                random.shuffle(lines)

                # Split the tab-separated lines into (normalized) source and target sentences.
                lines_source = [self.normalizeString(l.split('\t')[0]) for l in lines]
                lines_target = [self.normalizeString(l.split('\t')[1]) for l in lines]

                # Cut dataset into train (90%), valid (5%), test (5%) files
                test_index = len(lines) // 20
                valid_index = test_index + (len(lines) // 20)

                # Target folder for the generated split files.
                dataset_folder = os.path.join(self.data_folder, dataset)
                os.makedirs(dataset_folder, exist_ok=True)

                with open(os.path.join(dataset_folder, self.lang_source + ".test.txt"), mode='w+') as f:
                    f.write('\n'.join(lines_source[0:test_index]))
                with open(os.path.join(dataset_folder, self.lang_target + ".test.txt"), mode='w+') as f:
                    f.write('\n'.join(lines_target[0:test_index]))

                with open(os.path.join(dataset_folder, self.lang_source + ".valid.txt"), mode='w+') as f:
                    f.write('\n'.join(lines_source[test_index:valid_index]))
                with open(os.path.join(dataset_folder, self.lang_target + ".valid.txt"), mode='w+') as f:
                    f.write('\n'.join(lines_target[test_index:valid_index]))

                with open(os.path.join(dataset_folder, self.lang_source + ".train.txt"), mode='w+') as f:
                    f.write('\n'.join(lines_source[valid_index:]))
                with open(os.path.join(dataset_folder, self.lang_target + ".train.txt"), mode='w+') as f:
                    f.write('\n'.join(lines_target[valid_index:]))

        else:
            self.logger.info("Files {} found in folder '{}'".format(filenames, self.data_folder))


        # Load the lines
        lines_source = io.load_string_list_from_txt_file(os.path.join(self.data_folder, dataset), self.lang_source + "."+subset+".txt")
        lines_target = io.load_string_list_from_txt_file(os.path.join(self.data_folder, dataset), self.lang_target + "."+subset+".txt")

        # Get the required sample length.
        self.sentence_length = self.config['sentence_length']

        # Separate into src - tgt sentence pairs + tokenize
        tokenizer = WhitespaceTokenizer()
        self.sentences_source = []
        self.sentences_target = []
        for s_src, s_tgt in zip(lines_source, lines_target):
            src = tokenizer.tokenize(s_src)
            tgt = tokenizer.tokenize(s_tgt)
            # Keep only the pairs that are shorter or equal to the requested length
            # If self.sentence_length < 0, then give all the pairs regardless of length
            if (len(src) <= self.sentence_length and len(tgt) <= self.sentence_length) \
                or self.sentence_length < 0:
                self.sentences_source += [src]
                self.sentences_target += [tgt]

        self.logger.info("Load text consisting of {} sentences".format(len(self.sentences_source)))

        # Calculate the size of dataset.
        self.dataset_length = len(self.sentences_source)

        # Display exemplary sample.
        self.logger.info("Exemplary sample:\n  source: {}\n  target: {}".format(self.sentences_source[0], self.sentences_target[0]))