Code example #1
    def test_loaded_polyglot_embeddings(self):

        # raw Polyglot data: data[0] holds the terms, data[1] the corresponding vectors
        data = pickle_call('data/embeddings/polyglot-en.pkl')

        # set up a DataLoader backed by the Polyglot embeddings
        dl = DataLoader(embeddings_initial='Polyglot',
                        embedding_loading='in_dict')
        dl.load('data/pickles/')
        dl.get_all_and_dump('data/pickles/')

        all_true = None

        # compare every raw Polyglot vector with the vector stored in the embedding matrix
        for i in range(len(data[0])):

            term = data[0][i]
            embedding = data[1][i]

            if term in dl.embedding.vocab_dict:
                position = dl.embedding.vocab_dict[term]
                stored_embedding = dl.embedding.embeddings.weight[
                    position].data.numpy()

                # all_true stays None (and the assertion fails) if no term was matched at all
                if all_true is None:
                    all_true = np.array_equal(embedding, stored_embedding)
                else:
                    all_true = all_true and np.array_equal(
                        embedding, stored_embedding)

        self.assertTrue(all_true)
Code example #2
def split_train_dev_test(path, nr_train, nr_dev, nr_test):
    """
    Split the data set into train, dev and test portions based on the requested sizes
    :param path: path of the original data set
    :param nr_train: number of data points to train on
    :param nr_dev: number of data points to validate on
    :param nr_test: number of data points to test on
    :return: nothing; the three splits are pickled next to the original file
    """

    # retrieve the data set from file
    data = pickle_call(path)

    # shuffle it
    random.shuffle(data)

    # split it based on the defined numbers
    train = data[:nr_train]
    dev = data[nr_train:nr_train + nr_dev]
    test = data[nr_train + nr_dev:nr_train + nr_dev + nr_test]

    # dump it to file
    pickle_dump(path + "_train", train)
    pickle_dump(path + "_dev", dev)
    pickle_dump(path + "_test", test)
Code example #3
    def load_top_k(self, K, preload=False):
        """
        Loading strategy option: only load the top K embeddings, assuming the tokens are ordered by frequency
        :param K: number of top embeddings to be retrieved
        :param preload: passed through to add_term
        :return: embeddings matrix as numpy array
        """

        # This embedding data set has no pretrained <PAD>, <UNK>, <S> and </S> tokens,
        # so we initialize them ourselves and only load K - 4 pretrained embeddings.

        # we load the token dictionary from file
        token_dict = pickle_call(self.path)

        # to load only the most frequent words, we sort the tokens by descending count
        token_list = [[token, counts] for token, counts in token_dict.items()]
        token_list = sorted(token_list, key=lambda x: -x[1])

        # because we also need to add the special embeddings, we reduce K by 4
        K = K - 4

        # if K is larger than what is available, cap it at the number of tokens
        K = min(K, len(token_list))

        embeddings = []

        # add the special tokens and give each of them a one-hot-encoded vector
        for position, token in enumerate(["<S>", "</S>", "<PAD>", "<UNK>"]):
            self.add_term(token, preload=preload)
            embedding = np.zeros(K + 4, dtype=np.float64)
            embedding[position] = 1.0
            embeddings.append(embedding)

        # then loop through the sorted tokens and also define their one-hot-encoded vectors
        for k in range(K):
            token = token_list[k][0]
            self.add_term(token, preload=preload)
            embedding = np.zeros(K + 4, dtype=np.float64)
            embedding[k + 4] = 1.0
            embeddings.append(embedding)

        # define the embedding size
        self.embedding_size = K + 4

        # return the embeddings
        return np.array(embeddings)
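
As a hedged sanity check (the one_hot_embedding object name is hypothetical): every row returned by load_top_k is a one-hot vector over the K + 4 vocabulary positions, so the resulting matrix is square.

# hypothetical check: with K = 1000 and enough tokens on file, a 1000 x 1000 matrix is returned
matrix = one_hot_embedding.load_top_k(1000)
assert matrix.shape == (1000, 1000)
assert matrix[0][0] == 1.0  # row 0 is the one-hot vector of "<S>"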
Code example #4
    def load(self, file_name):
        """
        Load all embedding data from file if a dump exists
        :param file_name: path and filename where the dump is stored
        :return: True if the dump was found and loaded, False otherwise
        """
        dump_dict = pickle_call(file_name)
        if dump_dict is not None:
            self.vocab_size = dump_dict['vocab_size']
            self.vocab_list = dump_dict['vocab_list']
            self.vocab_dict = dump_dict['vocab_dict']
            self.embeddings = dump_dict['embeddings']
            return True
        return False
Code example #5
    def load_data(self,
                  data_type='train',
                  directory=None,
                  initialize_term=False):
        """
        This function loads all the data defined by the data set name
        :param data_type: 'train', 'dev', 'test'
        :param directory: DEPRECATED
        :param initialize_term: passed to encode_sentence as its initialize flag
        :return:
        """
        if self.name in ('Language_Text_100', 'Language_Text_1000', 'Language_Text_10000'):
            # the numeric suffix in the name selects the corresponding pickled data set
            size = self.name.split('_')[-1]
            path = '../data/data_set/data_set_' + size + '_' + data_type
            if not os.path.isfile(path):
                path = 'data/data_set/data_set_' + size + '_' + data_type
                if not os.path.isfile(path):
                    raise Exception("tokens don't exist")
        else:
            raise Exception("unknown data set name: " + self.name)

        # retrieve the data set from disk
        self.data_sets[data_type] = pickle_call(path)

        # loop through every sentence and encode it using the embedding object
        for elem in self.data_sets[data_type]:
            line = elem['sentence']
            elem['sentence_positions'] = self.embeddings.encode_sentence(
                line, initialize=initialize_term, count_up=True)

        return self.data_sets[data_type]
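
A hedged usage sketch (the data_set object name is hypothetical; the 'sentence' and 'sentence_positions' fields come from the loop above):

# hypothetical usage: encode the training split and inspect one example
train = data_set.load_data(data_type='train', initialize_term=True)
print(train[0]['sentence'])             # raw sentence text
print(train[0]['sentence_positions'])   # embedding positions produced by encode_sentence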
Code example #6
    def __init__(self):
        """
        This class loads the Polyglot embedding data
        """

        # Initialize the super class
        super(PolyglotEmbeddings, self).__init__()

        # Name of the embedding dataset
        self.name = 'Polyglot'

        path = '../data/embeddings/polyglot-en.pkl'
        if not os.path.isfile(path):
            path = 'data/embeddings/polyglot-en.pkl'
            if not os.path.isfile(path):
                raise Exception(
                    "please download the Polyglot embeddings from http://bit.ly/19bSoAS and store them in data/embeddings/"
                )

        self.path = path
        self.poly_data = pickle_call(self.path)
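
The test in code example #1 reads this pickle as two parallel sequences, words and vectors. Assuming that layout, the loaded data could be inspected as follows (a sketch, not repository code):

# poly_data[0] holds the words, poly_data[1] the matching vectors (see code example #1)
embeddings = PolyglotEmbeddings()
words, vectors = embeddings.poly_data[0], embeddings.poly_data[1]
print(len(words), len(vectors[0]))  # vocabulary size and embedding dimensionality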
Code example #7
    def load(self, file_name):
        """
        Load all previously dumped data sets from file if the dump exists
        :param file_name: path and filename of the dump
        :return: True if the dump was found and loaded, False otherwise
        """
        dump_dict = pickle_call(file_name)
        if dump_dict is not None:
            self.data_sets = dump_dict
            return True
        return False