Example 1
    def __init__(self, graph, path):
        Embedding.__init__(self, graph)
        self.graph = graph
        self.walks = None
        self.embedding = None
        self.path = path
        self.set_paths(path)
Example 2
class Embeddings(object):
    EMBEDDING_MODELS: List[Embedding] = [
        Embedding(name=u'use_dan',
                  dimensions=512,
                  corpus_size='na',
                  vocabulary_size='230k',
                  download_url='https://storage.googleapis.com/tfhub-modules/'
                  'google/universal-sentence-encoder/2.tar.gz',
                  format='tar.gz',
                  architecture='DAN',
                  trained_data='wikipedia and other sources',
                  language='en'),
        Embedding(name=u'use_transformer_large',
                  dimensions=512,
                  corpus_size='na',
                  vocabulary_size='230k',
                  download_url='https://storage.googleapis.com/tfhub-modules/'
                  'google/universal-sentence-encoder-large/3.tar.gz',
                  format='tar.gz',
                  architecture='Transformer',
                  trained_data='wikipedia and other sources',
                  language='en'),
        Embedding(name=u'use_transformer_lite',
                  dimensions=512,
                  corpus_size='na',
                  vocabulary_size='na',
                  download_url='https://storage.googleapis.com/tfhub-modules/'
                  'google/universal-sentence-encoder-lite/2.tar.gz',
                  format='tar.gz',
                  architecture='Transformer',
                  trained_data='wikipedia and other sources',
                  language='en')
    ]
    EMBEDDING_MODELS: Dict[str, Embedding] = {
        embedding.name: embedding
        for embedding in EMBEDDING_MODELS
    }

    def __init__(self):
        self.sess = tf.Session()
        self.sess.run(
            [tf.global_variables_initializer(),
             tf.tables_initializer()])
        self.use_module = None
        self.model = None

    def load_model(self, model: str, model_path: str):
        self.use_module = hub.Module(model_path)
        self.sess.run(tf.initializers.global_variables())
        self.model = model

    def encode(self,
               texts: list,
               pooling: Optional[str] = None) -> Optional[np.array]:
        return self.sess.run(self.use_module(texts))
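A minimal usage sketch for the class above (not part of the original source), assuming the TF1-era imports this excerpt omits (tensorflow as tf, tensorflow_hub as hub) and a locally extracted USE module; the model path is a placeholder.

# Hedged usage sketch: the model name comes from EMBEDDING_MODELS above,
# the module path is hypothetical.
encoder = Embeddings()
encoder.load_model(model='use_dan', model_path='/tmp/modules/use_dan')
vectors = encoder.encode(['a quick test sentence', 'another sentence'])
print(vectors.shape)  # expected (2, 512) for these 512-dimensional USE models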
Example 3
    def __init__(
        self, num_of_actions, epsilon=0.0001, num_of_neighbours=10, cluster_distance=0.008,
        pseudo_counts=0.001, maximum_similarity=8, episodic_memory_capacity=30000):
        self.epsilon = epsilon
        self.num_of_neighbours = num_of_neighbours
        self.cluster_distance = cluster_distance
        self.pseudo_counts = pseudo_counts
        self.maximum_similarity = maximum_similarity

        self.episodic_memory = deque([], maxlen=episodic_memory_capacity)
        self.moving_average = MovingAverage()
        self.network = Embedding(num_of_actions)
        self.optimizer = tf.keras.optimizers.Adam()
Example 4
def train(params, files):
    binary = str2bool(params['binary'])
    data_set, peptide_n_mer = read_data_set(files,
                                            test_size=0.05,
                                            binary=binary)
    print('Train data shape is {}'.format(data_set['X_train'].shape))
    print('Test data shape is {}'.format(data_set['X_test'].shape))
    # variable batch size depending on number of data points
    batch_size = int(np.ceil(len(data_set['X_train']) / 100.0))
    epochs = int(params['epochs'])
    nb_filter = int(params['filter_size'])
    filter_length = int(params['filter_length'])
    dropout = float(params['dropout'])
    lr = float(params['lr'])

    # manual drop last
    for name in data_set.keys():
        if data_set[name].shape[0] % batch_size != 0:
            data_set[name] = data_set[name][:-(data_set[name].shape[0] %
                                               batch_size)]

    # load in learned distributed representation HLA-Vec
    hla_vec_obj = Word2Vec.load(files['vector_embedding'])
    hla_vec_embed = hla_vec_obj.wv
    embed_shape = hla_vec_embed.syn0.shape
    embedding_weights = np.random.rand(embed_shape[0] + 1, embed_shape[1])
    for key in AA_IDX.keys():
        embedding_weights[AA_IDX[key], :] = hla_vec_embed[key]
    embedded_dim = embed_shape[1]

    embedding = Embedding(embedded_dim, embedding_weights)
    train_embedding = embedding(torch.from_numpy(data_set['X_train'])).numpy()
    train_embedding = train_embedding.reshape((train_embedding.shape[0], -1))
    test_embedding = embedding(torch.from_numpy(data_set['X_test'])).numpy()
    test_embedding = test_embedding.reshape((test_embedding.shape[0], -1))
    if str2bool(params['binary']):
        data_set['Y_train'] = np.argmax(data_set['Y_train'], -1)[:, np.newaxis]
        data_set['Y_test'] = np.argmax(data_set['Y_test'], -1)[:, np.newaxis]
    else:
        data_set['Y_train'] = data_set['Y_train'][:, np.newaxis]
        data_set['Y_test'] = data_set['Y_test'][:, np.newaxis]
    # weight_space(train_embedding, test_embedding, data_set)
    infinite_fcn(train_embedding, test_embedding, data_set, binary=binary)
    #infinite_resnet(train_embedding, test_embedding, data_set)
    print("The result of gaussian process of regression is:")
    gaussian_process(train_embedding,
                     test_embedding,
                     data_set,
                     is_classifier=False,
                     binary=binary)
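A self-contained sketch (not from the original source) of the "manual drop last" step in train() above, with a made-up row count to show the arithmetic.

import numpy as np

# With 1234 rows the batch size is ceil(1234 / 100) = 13, and the trailing
# 1234 % 13 = 12 rows are trimmed so every batch is full.
X_train = np.zeros((1234, 8))
batch_size = int(np.ceil(len(X_train) / 100.0))   # 13
remainder = X_train.shape[0] % batch_size         # 12
if remainder != 0:
    X_train = X_train[:-remainder]
print(batch_size, X_train.shape[0])               # 13 1222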
Example 5
from typing import List, Dict

from models import Embedding

EMBEDDING_MODELS: List[Embedding] = [
    Embedding(
        name=u'use',
        dimensions=512,
        corpus_size='na',
        vocabulary_size='230k',
        download_url='https://tfhub.dev/google/universal-sentence-encoder/2',
        format='.tar.gz',
        architecture='DAN',
        trained_data='wikipedia and other sources',
        language='en'),
    Embedding(name=u'use_large',
              dimensions=512,
              corpus_size='na',
              vocabulary_size='230k',
              download_url=
              'https://tfhub.dev/google/universal-sentence-encoder-large/3',
              format='.tar.gz',
              architecture='Transformer',
              trained_data='wikipedia and other sources',
              language='en'),
    Embedding(name=u'use_lite',
              dimensions=512,
              corpus_size='na',
              vocabulary_size='na',
              download_url=
              'https://tfhub.dev/google/universal-sentence-encoder-lite/2',
Example 6
    def __init__(self, graph, save_path):
        Embedding.__init__(self, graph)
        self.set_paths(save_path)
        self.walker = RandomWalker(graph)
Example 7
class Embeddings(object):

    EMBEDDING_MODELS: List[Embedding] = [
        Embedding(
            name=u'google_news_300',
            dimensions=300,
            corpus_size='100B',
            vocabulary_size='3M',
            download_url=
            'https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz',
            format='gz',
            architecture='skip-gram',
            trained_data='Google News',
            language='en')
    ]

    EMBEDDING_MODELS: Dict[str, Embedding] = {
        embedding.name: embedding
        for embedding in EMBEDDING_MODELS
    }

    def __init__(self):
        self.word_vectors: Dict[Any, Any] = {}
        self.model = None

    @classmethod
    def _tokens(cls, text: str) -> List[str]:
        return [x.lower().strip() for x in text.split()]

    def load_model(self, model: str, model_path: str):

        try:
            encoding = 'utf-8'
            unicode_errors = 'strict'

            model_file = [
                f for f in os.listdir(model_path)
                if os.path.isfile(os.path.join(model_path, f))
            ]
            f = open(os.path.join(model_path, model_file[0]), 'rb')

            header = to_unicode(f.readline(), encoding=encoding)
            vocab_size, vector_size = (int(x) for x in header.split()
                                       )  # throws for invalid file format

            binary_len = dtype(real).itemsize * vector_size
            for _ in tqdm(range(vocab_size)):
                word = []
                while True:
                    ch = f.read(1)
                    if ch == b' ':
                        break
                    if ch == b'':
                        raise EOFError(
                            "unexpected end of input; is count incorrect or file otherwise damaged?"
                        )
                    if ch != b'\n':  # ignore newlines in front of words (some binary files have)
                        word.append(ch)
                word = to_unicode(b''.join(word),
                                  encoding=encoding,
                                  errors=unicode_errors)

                weights = fromstring(f.read(binary_len),
                                     dtype=real).astype(real)

                self.word_vectors[word] = weights
            self.model = model
            print("Model loaded Successfully !")
            return self
        except Exception as e:
            print('Error loading Model, ', str(e))

    def encode(self, texts: list, pooling: str = 'mean', **kwargs) -> np.array:
        text = texts[0]
        result = np.zeros(Embeddings.EMBEDDING_MODELS[self.model].dimensions,
                          dtype="float32")
        tokens = Embeddings._tokens(text)

        vectors = np.array([
            self.word_vectors[token] for token in tokens
            if token in self.word_vectors.keys()
        ])

        if pooling == 'mean':
            result = np.mean(vectors, axis=0)

        elif pooling == 'max':
            result = np.max(vectors, axis=0)

        elif pooling == 'sum':
            result = np.sum(vectors, axis=0)

        elif pooling == 'tf-idf-sum':
            if not kwargs.get('tfidf_dict'):
                print('Must provide tfidf dict')
                return result

            tfidf_dict = kwargs.get('tfidf_dict')
            weighted_vectors = np.array([
                tfidf_dict.get(token) * self.word_vectors.get(token)
                for token in tokens
                if token in self.word_vectors.keys() and token in tfidf_dict
            ])
            result = np.mean(weighted_vectors, axis=0)
        else:
            print(
                f'Given pooling method "{pooling}" not implemented in "{self.model}"'
            )
        return result
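A tiny self-contained illustration (not from the original source) of the pooling branches used by encode() above, with made-up 3-dimensional vectors standing in for real word2vec weights.

import numpy as np

vectors = np.array([[1.0, 2.0, 3.0],
                    [3.0, 0.0, 1.0]])

print(np.mean(vectors, axis=0))  # [2. 1. 2.]  -> pooling='mean'
print(np.max(vectors, axis=0))   # [3. 2. 3.]  -> pooling='max'
print(np.sum(vectors, axis=0))   # [4. 2. 4.]  -> pooling='sum'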
Example 8
class Embeddings(object):

    EMBEDDING_MODELS: List[Embedding] = [
        Embedding(name=u'wiki_news_300',
                  dimensions=300,
                  corpus_size='16B',
                  vocabulary_size='1M',
                  download_url=
                  'https://dl.fbaipublicfiles.com/fasttext/vectors-english/'
                  'wiki-news-300d-1M.vec.zip',
                  format='zip',
                  architecture='CBOW',
                  trained_data='Wikipedia 2017',
                  language='en'),
        Embedding(name=u'wiki_news_300_sub',
                  dimensions=300,
                  corpus_size='16B',
                  vocabulary_size='1M',
                  download_url=
                  'https://dl.fbaipublicfiles.com/fasttext/vectors-english/'
                  'wiki-news-300d-1M-subword.vec.zip',
                  format='zip',
                  architecture='CBOW',
                  trained_data='Wikipedia 2017',
                  language='en'),
        Embedding(name=u'common_crawl_300',
                  dimensions=300,
                  corpus_size='600B',
                  vocabulary_size='2M',
                  download_url=
                  'https://dl.fbaipublicfiles.com/fasttext/vectors-english/'
                  'crawl-300d-2M.vec.zip',
                  format='zip',
                  architecture='CBOW',
                  trained_data='Common Crawl (600B tokens)',
                  language='en'),
        Embedding(name=u'common_crawl_300_sub',
                  dimensions=300,
                  corpus_size='600B',
                  vocabulary_size='2M',
                  download_url=
                  'https://dl.fbaipublicfiles.com/fasttext/vectors-english/'
                  'crawl-300d-2M-subword.zip',
                  format='zip',
                  architecture='CBOW',
                  trained_data='Common Crawl (600B tokens)',
                  language='en'),
    ]

    EMBEDDING_MODELS: Dict[str, Embedding] = {
        embedding.name: embedding
        for embedding in EMBEDDING_MODELS
    }

    def __init__(self):
        self.word_vectors: Dict[Any, Any] = {}
        self.model = None

    @classmethod
    def _tokens(cls, text):
        return [x.lower().strip() for x in text.split()]

    def load_model(self, model: str, model_path: str):
        try:
            model_file = [
                f for f in os.listdir(model_path)
                if os.path.isfile(os.path.join(model_path, f))
            ]
            f = open(os.path.join(model_path, model_file[0]), 'r')
            next(f)
            for line in tqdm(f):
                split_line = line.split()
                word = split_line[0]
                self.word_vectors[word] = np.array(
                    [float(val) for val in split_line[1:]])
            print("Model loaded Successfully !")
            self.model = model
            return self
        except Exception as e:
            print('Error loading Model, ', str(e))
        return self

    def encode(self, texts: list, pooling: str = 'mean', **kwargs) -> np.array:
        text = texts[0]
        result = np.zeros(Embeddings.EMBEDDING_MODELS[self.model].dimensions,
                          dtype="float32")
        tokens = Embeddings._tokens(text)
        vectors = np.array([
            self.word_vectors[token] for token in tokens
            if token in self.word_vectors.keys()
        ])

        if pooling == 'mean':
            result = np.mean(vectors, axis=0)

        elif pooling == 'max':
            result = np.max(vectors, axis=0)

        elif pooling == 'sum':
            result = np.sum(vectors, axis=0)

        elif pooling == 'tf-idf-sum':
            if not kwargs.get('tfidf_dict'):
                print('Must provide tfidf dict')
                return result

            tfidf_dict = kwargs.get('tfidf_dict')
            weighted_vectors = np.array([
                tfidf_dict.get(token) * self.word_vectors.get(token)
                for token in tokens
                if token in self.word_vectors.keys() and token in tfidf_dict
            ])
            result = np.mean(weighted_vectors, axis=0)
        else:
            print(f'Given pooling method "{pooling}" not implemented')
        return result
Example 9
    def __init__(self,
                 vocab_size,
                 nb_negative,
                 embed_dims=128,
                 context_dims=128,
                 negprob_table=None,
                 optimizer='adam'):
        super(NCELangModelV2, self).__init__(weighted_inputs=False)
        self.vocab_size = vocab_size
        self.embed_dim = embed_dims
        self.optimizer = optimizers.get(optimizer)
        self.nb_negative = nb_negative
        self.loss = categorical_crossentropy
        self.loss_fnc = objective_fnc(self.loss)

        if negprob_table is None:
            negprob_table_ = np.ones(shape=(vocab_size, ),
                                     dtype=theano.config.floatX) / vocab_size
            negprob_table = theano.shared(negprob_table_)
            self.neg_prob_table = negprob_table_
        else:
            self.neg_prob_table = negprob_table.astype(theano.config.floatX)
            negprob_table = theano.shared(
                negprob_table.astype(theano.config.floatX))

        self.sampler = TableSampler(self.neg_prob_table)

        self.add_input(name='idxes', ndim=3, dtype='int32')
        self.add_node(Split(split_at=1, split_axis=0),
                      name=('pos_sents', ''),
                      inputs='idxes')

        seq = containers.Sequential()
        seq.add(self.nodes['pos_sents'])
        seq.add(Embedding(vocab_size, embed_dims))
        seq.add(LangLSTMLayer(embed_dims, output_dim=context_dims))
        # seq.add(Dropout(0.5))

        self.add_node(seq, name='seq')
        self.add_node(PartialSoftmax(input_dim=context_dims,
                                     output_dim=vocab_size),
                      name='part_prob',
                      inputs=('idxes', 'seq'))
        self.add_node(Dense(input_dim=context_dims,
                            output_dim=1,
                            activation='exponential'),
                      name='normalizer',
                      inputs='seq')
        self.add_node(LookupProb(negprob_table),
                      name='lookup_prob',
                      inputs='idxes')

        test_node = Dense(input_dim=context_dims,
                          output_dim=vocab_size,
                          activation='exponential')
        test_node.params = []
        test_node.W = self.nodes['part_prob'].W
        test_node.b = self.nodes['part_prob'].b
        self.add_node(test_node, name='true_unrm_prob', inputs='seq')
        # self.add_node(ActivationLayer(name='normalization'), name='true_prob', inputs='true_unrm_prob')

        self.add_output('pos_prob', node='part_prob')
        self.add_output('neg_prob', node='lookup_prob')
        # self.add_output('pred_prob', node='true_prob')
        self.add_output('normalizer', node='normalizer')
        self.add_output('unrm_prob', node='true_unrm_prob')
Example 10
class Embeddings(object):
    EMBEDDING_MODELS: List[Embedding] = [
                        Embedding(name=u'elmo_bi_lm',
                                  dimensions=512,
                                  corpus_size='1B',
                                  vocabulary_size='5.5B',
                                  download_url='https://storage.googleapis.com/tfhub-modules/google/elmo/2.tar.gz',
                                  format='tar.gz',
                                  architecture='Embedding layer,cnn_layer_with_maxpool,2 lstm layers',
                                  trained_data='One Billion Word Benchmark',
                                  language='en')
    ]

    EMBEDDING_MODELS: Dict[str, Embedding] = {embedding.name: embedding for embedding in EMBEDDING_MODELS}

    def __init__(self):
        self.elmo_module = None
        self.model = None

    @classmethod
    def tokenize(cls, text: str):
        return [word.strip() for word in text.lower().strip().split()]

    @classmethod
    def padded_tokens(cls, tokens: List[str], max_seq_length: int):
        padded_token = ""
        len_tokens = len(tokens)
        if len_tokens >= max_seq_length:
            return tokens[:max_seq_length]
        else:
            padded_len = max_seq_length - len_tokens
            return tokens + [padded_token] * padded_len

    def load_model(self, model: str, model_path: str):
        self.elmo_module = hub.Module(model_path)
        self.model = model

    def encode(self, texts: list, pooling: str = 'mean', **kwargs) -> Optional[np.array]:
        text_tokens = [Embeddings.tokenize(text) for text in texts]
        max_seq_length = kwargs.get('max_seq_length')
        if max_seq_length:
            text_tokens = [Embeddings.padded_tokens(tokens, max_seq_length) for tokens in text_tokens]
            seq_length = [max_seq_length] * len(texts)
        else:
            seq_length = [len(tokens) for tokens in text_tokens]

        embeddings = self.elmo_module(inputs={"tokens": text_tokens, "sequence_len": seq_length},
                                      signature="tokens", as_dict=True)["elmo"]

        if not pooling:
            return embeddings

        if pooling == 'mean':
            return tf.reduce_mean(embeddings, 0)

        elif pooling == 'max':
            return tf.reduce_max(embeddings, 0)

        elif pooling == 'min':
            return tf.reduce_min(embeddings, 0)

        elif pooling == 'mean_max':
            return tf.concat(values=[tf.reduce_mean(embeddings, 0), tf.reduce_max(embeddings, 0)], axis=0)

        else:
            print(f"Pooling method \"{pooling}\" not implemented")
        return None
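The padded_tokens helper above truncates long token lists and right-pads short ones with empty strings; a small check of that behaviour (assumes the class and its tensorflow_hub dependencies are importable), not part of the original source.

print(Embeddings.padded_tokens(['the', 'quick', 'fox'], max_seq_length=5))
# ['the', 'quick', 'fox', '', '']
print(Embeddings.padded_tokens(['the', 'quick', 'fox'], max_seq_length=2))
# ['the', 'quick']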
Example 11
from typing import List, Dict

from models import Embedding

EMBEDDING_MODELS: List[Embedding] = [
    Embedding(
        name=u'infersent_glove',
        dimensions=300,
        corpus_size='570k human-generated English sentence pairs',
        vocabulary_size='na',
        download_url='https://dl.fbaipublicfiles.com/infersent/infersent1.pkl',
        format='tar.gz',
        architecture='cbow',
        trained_data='SNLI dataset',
        language='en'),
    Embedding(
        name=u'infersent_fasttext',
        dimensions=300,
        corpus_size='570k human-generated English sentence pairs',
        vocabulary_size='na',
        download_url='https://dl.fbaipublicfiles.com/infersent/infersent2.pkl',
        format='tar.gz',
        architecture='cbow',
        trained_data='SNLI dataset',
        language='en')
]

EMBEDDING_MODELS: Dict[str, Embedding] = {
    embedding.name: embedding
    for embedding in EMBEDDING_MODELS
}
Example 12
    #
    vocab.append_sents(valid_sents, fixed_vocab_set=fixed_vocab_set)
    vocab.append_sents(test_sents, fixed_vocab_set=fixed_vocab_set)
    #
    print('vocab size {} before shrink'.format(vocab.vocab_len))
    vocab.shrink_vocab(2)
    print('vocab size {} after shrink'.format(vocab.vocab_len))

    print('read vec')
    word_list = [vocab.idx2word[i] for i in range(len(vocab.idx2word))]
    vec = read_vec(pubmed_w2v_path, word_list)
    assert vec.shape[0] == vocab.vocab_len

    print('build emb layer')
    emb = Embedding(vocab.vocab_len,
                    vec.shape[1],
                    padding_idx=0,
                    trainable=False)
    emb.initialize_embedding(vec)
    emb.cuda()
    torch.save(emb.state_dict(), emb_path)

    print('dump data')
    train_sents = convert_sents_to_idx(train_sents, vocab)
    test_sents = convert_sents_to_idx(test_sents, vocab)
    valid_sents = convert_sents_to_idx(valid_sents, vocab)
    dump_preprocessed_data(opt.train_path, train_sents, train_labels)
    dump_preprocessed_data(opt.test_path, test_sents, test_labels)
    dump_preprocessed_data(opt.valid_path, valid_sents, valid_labels)
    dump_vocab(opt.vocab_path, vocab)
Example 13
class Embeddings(object):

    EMBEDDING_MODELS: List[Embedding] = [
        Embedding(name=u'twitter_100',
                  dimensions=100,
                  corpus_size='27B',
                  vocabulary_size='1.2M',
                  download_url='https://www.dropbox.com/s/q2wof83a0yq7q74/glove.twitter.27B.100d.txt.zip?dl=1',
                  format='zip',
                  architecture='glove',
                  trained_data='Twitter 2B Tweets',
                  language='en'),
        Embedding(name=u'twitter_200',
                  dimensions=200,
                  corpus_size='27B',
                  vocabulary_size='1.2M',
                  download_url='https://www.dropbox.com/s/hfw00m77ibz24y5/glove.twitter.27B.200d.txt.zip?dl=1',
                  format='zip',
                  architecture='glove',
                  trained_data='Twitter 2B Tweets',
                  language='en'),
        Embedding(name=u'twitter_25',
                  dimensions=25,
                  corpus_size='27B',
                  vocabulary_size='1.2M',
                  download_url='https://www.dropbox.com/s/jx97sz8skdp276k/glove.twitter.27B.25d.txt.zip?dl=1',
                  format='zip',
                  architecture='glove',
                  trained_data='Twitter 2B Tweets',
                  language='en'),

        Embedding(name=u'twitter_50',
                  dimensions=50,
                  corpus_size='27B',
                  vocabulary_size='1.2M',
                  download_url='https://www.dropbox.com/s/9mutj8syz3q20e3/glove.twitter.27B.50d.txt.zip?dl=1',
                  format='zip',
                  architecture='glove',
                  trained_data='Twitter 2B Tweets',
                  language='en'),
        Embedding(name=u'wiki_100',
                  dimensions=100,
                  corpus_size='6B',
                  vocabulary_size='0.4M',
                  download_url='https://www.dropbox.com/s/g0inzrsy1ds3u63/glove.6B.100d.txt.zip?dl=1',
                  format='zip',
                  architecture='glove',
                  trained_data='Wikipedia+Gigaword',
                  language='en'),
        Embedding(name=u'wiki_200',
                  dimensions=200,
                  corpus_size='6B',
                  vocabulary_size='0.4M',
                  download_url='https://www.dropbox.com/s/pmj2ycd882qkae5/glove.6B.200d.txt.zip?dl=1',
                  format='zip',
                  architecture='glove',
                  trained_data='Wikipedia+Gigaword',
                  language='en'),

        Embedding(name=u'wiki_300',
                  dimensions=300,
                  corpus_size='6B',
                  vocabulary_size='0.4M',
                  download_url='https://www.dropbox.com/s/9jbbk99p0d0n1bw/glove.6B.300d.txt.zip?dl=1',
                  format='zip',
                  architecture='glove',
                  trained_data='Wikipedia+Gigaword',
                  language='en'),

        Embedding(name=u'wiki_50',
                  dimensions=50,
                  corpus_size='6B',
                  vocabulary_size='0.4M',
                  download_url='https://www.dropbox.com/s/o3axsz1j47043si/glove.6B.50d.txt.zip?dl=1',
                  format='zip',
                  architecture='glove',
                  trained_data='Wikipedia+Gigaword',
                  language='en'),

        Embedding(name=u'crawl_42B_300',
                  dimensions=300,
                  corpus_size='42B',
                  vocabulary_size='1.9M',
                  download_url='http://nlp.stanford.edu/data/glove.42B.300d.zip',
                  format='zip',
                  architecture='glove',
                  trained_data='Common Crawl (42B tokens)',
                  language='en'),

        Embedding(name=u'crawl_840B_300',
                  dimensions=300,
                  corpus_size='840B',
                  vocabulary_size='2.2M',
                  download_url='http://nlp.stanford.edu/data/glove.840B.300d.zip',
                  format='zip',
                  architecture='glove',
                  trained_data='Common Crawl (840B tokens)',
                  language='en')

    ]

    EMBEDDING_MODELS: Dict[str, Embedding] = {embedding.name: embedding for embedding in EMBEDDING_MODELS}

    def __init__(self):
        self.word_vectors: Dict[Any, Any] = {}
        self.model = None

    @classmethod
    def _tokens(cls, text: str) -> List[str]:
        return [x.lower().strip() for x in text.split()]

    def load_model(self, model: str, model_path: str):
        try:
            model_file = [f for f in os.listdir(model_path) if os.path.isfile(os.path.join(model_path, f))]
            f = open(os.path.join(model_path, model_file[0]), 'r')
            for line in tqdm(f):
                split_line = line.split()
                word = split_line[0]
                self.word_vectors[word] = np.array([float(val) for val in split_line[1:]])
            print("Model loaded Successfully !")
            self.model = model
            return self
        except Exception as e:
            print('Error loading Model, ', str(e))
        return self

    def encode(self, text: str, pooling: str = 'mean', **kwargs) -> np.array:
        result = np.zeros(Embeddings.EMBEDDING_MODELS[self.model].dimensions, dtype="float32")
        tokens = Embeddings._tokens(text)

        vectors = np.array([self.word_vectors[token] for token in tokens if token in self.word_vectors.keys()])

        if pooling == 'mean':
            result = np.mean(vectors, axis=0)

        elif pooling == 'max':
            result = np.max(vectors, axis=0)

        elif pooling == 'sum':
            result = np.sum(vectors, axis=0)

        elif pooling == 'tf-idf-sum':
            if not kwargs.get('tfidf_dict'):
                print('Must provide tfidf dict')
                return result

            tfidf_dict = kwargs.get('tfidf_dict')
            weighted_vectors = np.array([tfidf_dict.get(token) * self.word_vectors.get(token)
                                         for token in tokens if token in self.word_vectors.keys()
                                         and token in tfidf_dict])
            result = np.mean(weighted_vectors, axis=0)
        else:
            print(f'Given pooling method "{pooling}" not implemented in "{self.model}"')
        return result
Example 14
class Embeddings(object):

    EMBEDDING_MODELS: List[Embedding] = [
                        Embedding(name=u'bert_base_uncased',
                                  dimensions=768,
                                  corpus_size='3300M',
                                  vocabulary_size='30522(sub-word)',
                                  download_url='https://storage.googleapis.com/tfhub-modules/'
                                               'google/bert_uncased_L-12_H-768_A-12/1.tar.gz',
                                  format='tar.gz',
                                  architecture='Transformer, Layers=12, Hidden = 768, heads = 12',
                                  trained_data='BooksCorpus(800M) English Wikipedia (2500M) words',
                                  language='en'),

                        Embedding(name=u'bert_base_cased',
                                  dimensions=768,
                                  corpus_size='3300M',
                                  vocabulary_size='30522(sub-word)',
                                  download_url='https://storage.googleapis.com/tfhub-modules/google/'
                                               'bert_cased_L-12_H-768_A-12/1.tar.gz',
                                  format='tar.gz',
                                  architecture='Transformer Layers=12, Hidden = 768, heads = 12',
                                  trained_data='BooksCorpus(800M) English Wikipedia (2500M) words',
                                  language='en'),

                        Embedding(name=u'bert_multi_cased',
                                  dimensions=768,
                                  corpus_size='3300M',
                                  vocabulary_size='30522 (sub-word)',
                                  download_url='https://storage.googleapis.com/tfhub-modules/google/'
                                               'bert_multi_cased_L-12_H-768_A-12/1.tar.gz',
                                  format='tar.gz',
                                  architecture='Transformer Layers=12, Hidden = 768, heads = 12',
                                  trained_data='BooksCorpus(800M) English Wikipedia (2500M) words',
                                  language='en'),

                        Embedding(name=u'bert_large_uncased',
                                  dimensions=1024,
                                  corpus_size='3300M',
                                  vocabulary_size='30522 (sub-word)',
                                  download_url='https://storage.googleapis.com/tfhub-modules/google/'
                                               'bert_uncased_L-24_H-1024_A-16/1.tar.gz',
                                  format='tar.gz',
                                  architecture='Transformer Layers=24, Hidden = 1024, heads = 16',
                                  trained_data='BooksCorpus(800M) English Wikipedia (2500M) words',
                                  language='en'),

                        ]

    EMBEDDING_MODELS: Dict[str, Embedding] = {embedding.name: embedding for embedding in EMBEDDING_MODELS}

    tokenizer: FullTokenizer = None

    def __init__(self):
        self.sess = tf.Session()
        self.bert_module = None
        self.model = None

    def create_tokenizer_from_hub_module(self, model_path: str):
        """Get the vocab file and casing info from the Hub module."""
        tokenization_info = self.bert_module(signature="tokenization_info", as_dict=True)
        vocab_file, do_lower_case = self.sess.run(
            [
                tokenization_info["vocab_file"],
                tokenization_info["do_lower_case"],
            ]
        )

        Embeddings.tokenizer = FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

    @staticmethod
    def _model_single_input(text: str, max_seq_length: int) -> Tuple[List[int], List[int], List[int]]:
        tokens_a = Embeddings.tokenizer.tokenize(text)
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[0: (max_seq_length - 2)]

        tokens = []
        segment_ids = []
        tokens.append("[CLS]")
        segment_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            segment_ids.append(0)
        tokens.append("[SEP]")
        segment_ids.append(0)

        input_ids = Embeddings.tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        return input_ids, input_mask, segment_ids

    def load_model(self, model: str, model_path: str):
        self.bert_module = hub.Module(model_path)
        self.sess.run(tf.initializers.global_variables())
        self.create_tokenizer_from_hub_module(model_path)
        self.model = model
        print("Model loaded Successfully !")

    def encode(self, texts: list, pooling: Optional[str] = None, **kwargs) -> Optional[np.array]:
        max_seq_length = kwargs.get('max_seq_length', 128)
        input_ids, input_masks, segment_ids = [], [], []
        for text in tqdm(texts, desc="Converting texts to features"):
            input_id, input_mask, segment_id = self._model_single_input(text, max_seq_length)
            input_ids.append(input_id)
            input_masks.append(input_mask)
            segment_ids.append(segment_id)

        bert_inputs = dict(
            input_ids=np.array(input_ids),
            input_mask=np.array(input_masks),
            segment_ids=np.array(segment_ids))

        bert_outputs = self.bert_module(bert_inputs, signature="tokens", as_dict=True)
        sequence_output = bert_outputs["sequence_output"]

        token_embeddings = self.sess.run(sequence_output)

        if not pooling:
            return token_embeddings
        else:
            if pooling not in ["mean", "max", "mean_max", "min"]:
                print(f"Pooling method \"{pooling}\" not implemented")
                return None
            pooling_func = POOL_FUNC_MAP[pooling]
            pooled = pooling_func(token_embeddings, axis=1)
            return pooled
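POOL_FUNC_MAP is referenced by encode() above but is not shown in this excerpt; one plausible definition (an assumption, not the original source) maps each pooling name to a NumPy reduction over the token axis of the (batch, seq_len, hidden) output.

import numpy as np

# Assumed layout of POOL_FUNC_MAP: each callable reduces axis=1 (the token
# axis) of the token embeddings returned by the BERT module.
POOL_FUNC_MAP = {
    "mean": np.mean,
    "max": np.max,
    "min": np.min,
    "mean_max": lambda x, axis: np.concatenate(
        [np.mean(x, axis=axis), np.max(x, axis=axis)], axis=-1),
}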
Example 15
class Embeddings(object):
    EMBEDDING_MODELS: List[Embedding] = [
        Embedding(
            name=u'elmo_bi_lm',
            dimensions=512,
            corpus_size='1B',
            vocabulary_size='5.5B',
            download_url=
            'https://storage.googleapis.com/tfhub-modules/google/elmo/2.tar.gz',
            format='tar.gz',
            architecture='Embedding layer,cnn_layer_with_maxpool,2 lstm layers',
            trained_data='One Billion Word Benchmark',
            language='en')
    ]

    EMBEDDING_MODELS: Dict[str, Embedding] = {
        embedding.name: embedding
        for embedding in EMBEDDING_MODELS
    }

    def __init__(self):
        self.elmo_module = None
        self.model = None
        self.sess = tf.Session()

    @classmethod
    def tokenize(cls, text: str):
        return [word.strip() for word in text.lower().strip().split()]

    @classmethod
    def padded_tokens(cls, tokens: List[str], max_seq_length: int):
        padded_token = ""
        len_tokens = len(tokens)
        if len_tokens >= max_seq_length:
            return tokens[:max_seq_length]
        else:
            padded_len = max_seq_length - len_tokens
            return tokens + [padded_token] * padded_len

    def load_model(self, model: str, model_path: str):
        self.elmo_module = hub.Module(model_path)
        self.sess.run(tf.initializers.global_variables())
        self.model = model

    def encode(self,
               texts: list,
               pooling: Optional[str] = None,
               **kwargs) -> Optional[np.array]:
        text_tokens = [Embeddings.tokenize(text) for text in texts]
        max_seq_length = kwargs.get('max_seq_length')
        if max_seq_length:
            text_tokens = [
                Embeddings.padded_tokens(tokens, max_seq_length)
                for tokens in text_tokens
            ]
            seq_length = [max_seq_length] * len(texts)
        else:
            seq_length = [len(tokens) for tokens in text_tokens]

        sequence_output = self.elmo_module(inputs={
            "tokens": text_tokens,
            "sequence_len": seq_length
        },
                                           signature="tokens",
                                           as_dict=True)["elmo"]

        token_embeddings = self.sess.run(sequence_output)

        if not pooling:
            return token_embeddings
        else:
            if pooling not in ["mean", "max", "mean_max", "min"]:
                print(f"Pooling method \"{pooling}\" not implemented")
                return None
            pooling_func = POOL_FUNC_MAP[pooling]
            pooled = pooling_func(token_embeddings, axis=1)
            return pooled
Example 16
class Embeddings(object):

    EMBEDDING_MODELS: List[Embedding] = [
        Embedding(
            name=u'xlnet_large_cased',
            dimensions=1024,
            corpus_size='32.89B',
            vocabulary_size='32000',
            download_url='https://storage.googleapis.com/xlnet/released_models/'
            'cased_L-24_H-1024_A-16.zip',
            format='zip',
            architecture='Transformer, 24-layer, 1024-hidden, 16-heads',
            trained_data=
            'BooksCorpus(800M) English Wikipedia (2500M) words, Giga5 (16gb), '
            'ClueWeb 2012-B(19gb),  Common Crawl(78gb)',
            language='en'),
        Embedding(
            name=u'xlnet_base_cased',
            dimensions=768,
            corpus_size='3.86B',
            vocabulary_size='32000',
            download_url='https://storage.googleapis.com/xlnet/released_models/'
            'cased_L-12_H-768_A-12.zip',
            format='zip',
            architecture='Transformer 12-layer, 768-hidden, 12-heads.',
            trained_data='BooksCorpus(800M) English Wikipedia (2500M) words',
            language='en')
    ]

    EMBEDDING_MODELS: Dict[str, Embedding] = {
        embedding.name: embedding
        for embedding in EMBEDDING_MODELS
    }

    tokenizer: spm.SentencePieceProcessor = None
    mode_config_path: str = 'xlnet_config.json'
    sentence_piece_model_path: str = 'spiece.model'

    def __init__(self):
        self.xlnet_config = None
        self.run_config = None
        self.model = None
        self.sess = tf.Session()

    @staticmethod
    def load_tokenizer(model_path: str):
        """Get the vocab file and casing info from the Hub module."""
        sp_model = spm.SentencePieceProcessor()
        sp_model.Load(
            os.path.join(model_path, Embeddings.sentence_piece_model_path))
        Embeddings.tokenizer = sp_model

    @classmethod
    def tokenize_fn(cls, text):
        text = preprocess_text(text, lower=False)
        return encode_ids(cls.tokenizer, text)

    @staticmethod
    def _model_single_input(
            text: str,
            max_seq_length: int) -> Tuple[List[int], List[int], List[int]]:
        tokens_a = Embeddings.tokenize_fn(text)

        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[0:(max_seq_length - 2)]

        tokens = []
        segment_ids = []
        for token in tokens_a:
            tokens.append(token)
            segment_ids.append(SEG_ID_A)
        tokens.append(SEP_ID)
        segment_ids.append(SEG_ID_A)

        tokens.append(CLS_ID)
        segment_ids.append(SEG_ID_CLS)

        input_ids = tokens

        # The mask has 0 for real tokens and 1 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [0] * len(input_ids)

        # Zero-pad up to the sequence length.
        if len(input_ids) < max_seq_length:
            delta_len = max_seq_length - len(input_ids)
            input_ids = [0] * delta_len + input_ids
            input_mask = [1] * delta_len + input_mask
            segment_ids = [SEG_ID_PAD] * delta_len + segment_ids

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        return input_ids, input_mask, segment_ids

    def load_model(self, model: str, model_path: str):
        model_path = os.path.join(model_path, next(os.walk(model_path))[1][0])
        self.xlnet_config = xlnet.XLNetConfig(
            json_path=os.path.join(model_path, Embeddings.mode_config_path))
        self.run_config = xlnet.create_run_config(is_training=True,
                                                  is_finetune=True,
                                                  FLAGS=Flags)
        self.load_tokenizer(model_path)
        self.model = model
        print("Model loaded Successfully !")

    def encode(self,
               texts: list,
               pooling: Optional[str] = None,
               **kwargs) -> Optional[np.array]:
        max_seq_length = kwargs.get('max_seq_length', 128)
        input_ids, input_masks, segment_ids = [], [], []
        for text in tqdm(texts, desc="Converting texts to features"):
            input_id, input_mask, segment_id = self._model_single_input(
                text, max_seq_length)
            input_ids.append(input_id)
            input_masks.append(input_mask)
            segment_ids.append(segment_id)

        # Construct an XLNet model
        xlnet_model = xlnet.XLNetModel(xlnet_config=self.xlnet_config,
                                       run_config=self.run_config,
                                       input_ids=np.array(input_ids,
                                                          dtype=np.int32),
                                       seg_ids=np.array(segment_ids,
                                                        dtype=np.int32),
                                       input_mask=np.array(input_masks,
                                                           dtype=np.float32))

        self.sess.run(tf.initializers.global_variables())

        # Get a sequence output
        sequence_output = xlnet_model.get_sequence_output()
        token_embeddings = self.sess.run(sequence_output)

        if not pooling:
            return token_embeddings
        else:
            if pooling not in ["mean", "max", "mean_max", "min"]:
                print(f"Pooling method \"{pooling}\" not implemented")
                return None
            pooling_func = POOL_FUNC_MAP[pooling]
            pooled = pooling_func(token_embeddings, axis=1)
            return pooled
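Unlike the BERT snippet earlier, the XLNet preprocessing above pads on the left and inverts the mask (0 for real tokens, 1 for padding); a toy illustration (not from the original source) with hypothetical token ids.

max_seq_length = 6
input_ids = [17, 23, 42]            # hypothetical token ids
input_mask = [0] * len(input_ids)   # 0 marks real tokens

delta_len = max_seq_length - len(input_ids)
input_ids = [0] * delta_len + input_ids
input_mask = [1] * delta_len + input_mask
print(input_ids)   # [0, 0, 0, 17, 23, 42]
print(input_mask)  # [1, 1, 1, 0, 0, 0]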
Example 17
def main():
    opt = Options()
    print('Use {}'.format(opt.pooling_type_str_dict[opt.pooling_type]))
    train_sents, train_labels = pickle.load(open(opt.train_path, 'rb'))
    valid_sents, valid_labels = pickle.load(open(opt.valid_path, 'rb'))
    test_sents, test_labels = pickle.load(open(opt.test_path, 'rb'))
    #
    np.set_printoptions(precision=3)
    np.set_printoptions(threshold=np.inf)
    #

    emb = Embedding(opt.vocab_size, 200, padding_idx=0, trainable=False)
    cnn = ML_CNN.CNN_Module(n_classes=opt.classifier_output_size)

    if opt.use_cuda:
        emb.cuda()
        cnn.cuda()
    param = []
    param.extend(emb.parameters())
    param.extend(cnn.parameters())
    # optimizer = torch.optim.Adam(param, lr=opt.lr, weight_decay=0.01)
    # optimizer = torch.optim.Adam(param, lr=opt.lr, weight_decay=0.00001)
    optimizer = torch.optim.Adam(param, lr=opt.lr)
    criterion = torch.nn.CrossEntropyLoss()

    if opt.restore:
        if os.path.exists(opt.feature_net_path):
            print("Load pretrained embedding")
            emb.load_state_dict(torch.load(opt.feature_net_path))
        else:
            print("No pretrained embedding")
        if os.path.exists(opt.classifier_net_path):
            print("Load pretrained cnn classifier")
            cnn.load_state_dict(torch.load(opt.classifier_net_path))
        else:
            print("No pretrained cnn classifier")

    best_acc = -1
    for epoch in range(opt.max_epochs):
        print("Starting epoch %d" % epoch)
        kf = get_minibatches_idx(len(train_sents), opt.batch_size, shuffle=True)
        epoch_losses = []
        cnn.train()
        emb.train()
        for iteridx, train_index in kf:
            if len(train_index) <= 1:
                continue
            sents = [train_sents[t] for t in train_index]
            labels = [train_labels[t] for t in train_index]
            # X_batch, X_lengths, X_labels = prepare_data_for_rnn(sents, labels)
            X_batch, X_labels = prepare_data_for_cnn(sents, labels)
            X_batch = Variable(X_batch)
            X_labels = Variable(X_labels)
            if opt.use_cuda:
                X_batch = X_batch.cuda()
                X_labels = X_labels.cuda()
            optimizer.zero_grad()
            features = emb(X_batch)
            output = cnn(features)
            loss = criterion(output, X_labels)
            local_loss = loss.data[0]
            epoch_losses.append(local_loss)
            loss.backward()
            optimizer.step()
            if iteridx % opt.print_freq == 0:
                count = output.size(0)
                topK_correct = test_result(output.cpu().data, X_labels.cpu().data, topK=topK)
                topK_acc = [float(tmp) / count for tmp in topK_correct]
                topK_str = " , ".join(["acc@{}: {}".format(k, tmp_acc) for k, tmp_acc in zip(topK, topK_acc)])
                print("Epoch {} Iteration {}  loss: {} , {}".format(epoch + 1, iteridx + 1, local_loss, topK_str))

        ave_loss = sum(epoch_losses) / len(epoch_losses)
        kf = get_minibatches_idx(len(valid_sents), opt.batch_size, shuffle=True)
        count = 0
        all_topK_correct = np.zeros(len(topK), dtype=int)
        for _, valid_index in kf:
            emb.eval()
            cnn.eval()
            sents = [valid_sents[t] for t in valid_index]
            labels = [valid_labels[t] for t in valid_index]
            X_batch, X_labels = prepare_data_for_cnn(sents, labels)
            X_batch = Variable(X_batch)
            X_labels = Variable(X_labels)
            if opt.use_cuda:
                X_batch = X_batch.cuda()
                X_labels = X_labels.cuda()
            features = emb(X_batch)
            output = cnn(features)
            topK_correct = test_result(output.cpu().data, X_labels.cpu().data, topK=topK)
            topK_correct = np.array(topK_correct)
            all_topK_correct += topK_correct
            bsize = output.size(0)
            count += bsize

        all_topK_acc = all_topK_correct / float(count)
        all_topK_acc = all_topK_acc.tolist()
        all_topK_str = " , ".join(["val_acc@{}: {}".format(k, tmp_acc) for k, tmp_acc in zip(topK, all_topK_acc)])
        print("Epoch {} Avg_loss: {}, {}".format(epoch+1, ave_loss, all_topK_str))
        acc = all_topK_acc[important_K]
        if acc > best_acc:
            print('Dump current model due to current acc {} > past best acc {}'.format(acc, best_acc))
            torch.save(cnn.state_dict(), opt.classifier_net_path)
            best_acc = acc

        fscore_records = [{k:FScore() for k in topK} for i in range(opt.classifier_output_size)]
        kf = get_minibatches_idx(len(test_sents), opt.batch_size, shuffle=True)
        emb.eval()
        cnn.eval()
        for _, test_index in kf:
            sents = [test_sents[t] for t in test_index]
            labels = [test_labels[t] for t in test_index]
            X_batch, X_labels = prepare_data_for_cnn(sents, labels)
            X_batch = Variable(X_batch)
            X_labels = Variable(X_labels)
            if opt.use_cuda:
                X_batch = X_batch.cuda()
                X_labels = X_labels.cuda()
            features = emb(X_batch)
            output = cnn(features)
            update_F1(output.cpu().data, X_labels.cpu().data, opt.classifier_output_size, topK, fscore_records)
        with open('F_score_dir/{}.pkl'.format(epoch+1), 'wb') as f:
            print('dumping fscore in epoch {}'.format(epoch+1))
            pickle.dump(fscore_records, f)


    print('Loading best model')
    cnn.load_state_dict(torch.load(opt.classifier_net_path))
    print('Testing Data')
    kf = get_minibatches_idx(len(test_sents), opt.batch_size, shuffle=True)
    count = 0
    all_topK_correct = np.zeros(len(topK), dtype=int)
    fscore_records = [{k:FScore() for k in topK} for i in range(opt.classifier_output_size)]
    for _, test_index in kf:
        emb.eval()
        cnn.eval()
        sents = [test_sents[t] for t in test_index]
        labels = [test_labels[t] for t in test_index]
        X_batch, X_labels = prepare_data_for_cnn(sents, labels)
        X_batch = Variable(X_batch)
        X_labels = Variable(X_labels)
        if opt.use_cuda:
            X_batch = X_batch.cuda()
            X_labels = X_labels.cuda()
        features = emb(X_batch)
        output = cnn(features)
        update_F1(output.cpu().data, X_labels.cpu().data, opt.classifier_output_size, topK, fscore_records)
        topK_correct = test_result(output.cpu().data, X_labels.cpu().data, topK=topK)
        topK_correct = np.array(topK_correct)
        all_topK_correct += topK_correct
        bsize = output.size(0)
        count += bsize
    all_topK_acc = all_topK_correct / float(count)
    all_topK_acc = all_topK_acc.tolist()
    all_topK_str = " , ".join(["test_acc@{}: {}".format(k, tmp_acc) for k, tmp_acc in zip(topK, all_topK_acc)])
    print("Training end {}".format(all_topK_str))

    with open('F_score_dir/best.pkl', 'wb') as f:
        print('dumping fscore in')
        pickle.dump(fscore_records, f)
Example 18
    def __init__(self, graph, path):
        Embedding.__init__(self, graph)
        self.nodes = np.asarray(list(graph.nodes()))
        self.context_embedding = None
        self.center_embedding = None
        self.set_paths(path)
Example 19
from typing import List, Dict

from models import Embedding

EMBEDDING_MODELS: List[Embedding] = [
    Embedding(name=u'umlfit',
              dimensions=300,
              corpus_size='570k human-generated English sentence pairs',
              vocabulary_size='230k',
              download_url='http://files.fast.ai/models/wt103/',
              format='.h5',
              architecture='cbow',
              trained_data='Stephen Merity’s Wikitext 103 dataset',
              language='en')
]

EMBEDDING_MODELS: Dict[str, Embedding] = {
    embedding.name: embedding
    for embedding in EMBEDDING_MODELS
}
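The list-then-dict pattern above (repeated throughout these examples) turns the model list into a name-keyed registry; a short lookup sketch (not from the original source), assuming Embedding exposes its constructor fields as attributes.

config = EMBEDDING_MODELS['umlfit']
print(config.dimensions)    # 300
print(config.download_url)  # http://files.fast.ai/models/wt103/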
Example 20
    def embed_network(self, seed, save_path, algorithm_name, precomputed, training, walks):
        '''
        Embed the network with the given algorithm; the network is replaced
        by its embedding to save memory.
        '''
        save_path = os.path.join(save_path, 'seed_{}'.format(seed))

        if DEBUG:
            save_path += '_debug'
        try:
            os.makedirs(save_path)
        except OSError:
            pass

        if precomputed:
            model = Embedding(self.residual_network)
            model.set_paths(save_path)
            model.load_embedding()
            self.residual_network = model.word_vectors
            self.embedded = True

        elif algorithm_name == 'node2vec':
            model = Node2vec(self.residual_network, save_path)

        elif algorithm_name == 'deep_walk':
            model = DeepWalk(self.residual_network, save_path)

        elif algorithm_name == 'efge':
            model = Efge(self.residual_network, save_path)

        else:
            raise NotImplementedError('embedding is not implemented')

        model.get_walks(**walks)
        model.train(**training)
        model.save_embedding()
        # we replace the network by its embedding to save memory
        self.residual_network = model.word_vectors
        self.embedded = True