Example #1
 def __init__(self, max_seq_length, batch_size):
     self.batch_size = batch_size
     if batch_size == 1:
         self.embedder = BertEmbedding(512)
         print("seq length set to Bert maximum 512 when batch size is 1")
     else:
         self.embedder = BertEmbedding(max_seq_length)
Example #2
    def __init__(self):
        super().__init__('bert')

        self.output_format = {
            'n_cols': 768
        }

        self.model = BertEmbedding(model='bert_12_768_12', dataset_name='book_corpus_wiki_en_uncased')
Example #3
    def __init__(self,
                 model='bert_24_1024_16',
                 corpus='book_corpus_wiki_en_cased'):
        self.__model = model
        self.__corpus = corpus

        assert self.__model in ['bert_12_768_12',
                                'bert_24_1024_16'], "Model is not recognized."
        assert self.__corpus in [
            'book_corpus_wiki_en_uncased', 'book_corpus_wiki_en_cased',
            'wiki_multilingual', 'wiki_multilingual_cased'
        ], "Corpus is unknown."

        self.__bert = BertEmbedding(model=self.__model,
                                    dataset_name=self.__corpus)
Example #4
 def __init__(self, model, ignore_stopwords=True):
     self.model = model
     if self.model == "elmo":
         self.MODEL = ElmoEmbedder()
     elif self.model == "bert":
         self.MODEL = BertEmbedding()
     elif self.model == 'roberta-large':
         self.num_layers = 17
         self.tokenizer = AutoTokenizer.from_pretrained(self.model)
         self.MODEL = AutoModel.from_pretrained(self.model)
         self.MODEL.encoder.layer = torch.nn.ModuleList(
             [layer for layer in self.MODEL.encoder.layer[:self.num_layers]])
         self.MODEL.eval()
     self.nlp = spacy.load("en_core_web_md")
     self.ignore_stopwords = ignore_stopwords
Example #5
def createCustomBertEmbeddings(text: List[str],
                               firstWordOnly=True,
                               saveName: Union[str, Path] = None):
    from bert_embedding import BertEmbedding
    import numpy as np   # used below for concatenating and stacking vectors
    import pandas as pd  # used below to build the output DataFrame
    bert_embedding = BertEmbedding()
    result = bert_embedding(text)
    indices = [r[0] for r in result]
    data = [r[1] for r in result]
    if firstWordOnly:
        outIdxs = []
        outData = []
        for ii, idx in enumerate(indices):
            for jj, word in enumerate(idx):
                if word.isalnum():
                    outIdxs.append(word)
                    outData.append(data[ii][jj])
                    break
        indices = outIdxs
        data = outData
    else:
        indices = np.concatenate(indices)
    data = np.row_stack(data)
    out = pd.DataFrame(index=indices, data=data)
    if saveName:
        out.to_csv(saveName)
    return out
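A minimal usage sketch for the helper above; the input sentences and output file name are made up for illustration:

sentences = ["bert produces contextual embeddings",
             "each token maps to a 768 dimensional vector"]
df = createCustomBertEmbeddings(sentences, firstWordOnly=True,
                                saveName="first_word_vectors.csv")
print(df.shape)  # one 768-dimensional row per sentence when firstWordOnly=True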
Example #6
class Bert(Embedder):

    def __init__(self):
        super().__init__('bert')

        self.output_format = {
            'n_cols': 768
        }

        self.model = BertEmbedding(model='bert_12_768_12', dataset_name='book_corpus_wiki_en_uncased')

    # Core functions

    def embed_text(self, abstracts):
        """
        :param abstracts: pandas Series of abstracts
        :param output_format: dict specifying output format of the embedding method
        :return: embedding and associated format
        """
        bert_embedding = self.model.bert(abstracts.tolist(), oov_way='sum')

        embedding = []

        for _, vectors in bert_embedding:
            embedding.append(sum(vectors))

        embedding = pd.DataFrame(embedding)

        return embedding, self.output_format
Example #7
class bert_instance():
    bert_embedding = BertEmbedding()

    def matrix(self, processed_texts, enquiry):
        all_vectors = []
        enquiry_vector = []
        enquiry_vector.append(self.return_vectors(enquiry))
        enquiry_vector = np.array(enquiry_vector)
        for text in processed_texts:
            all_vectors.append(self.return_vectors(text))
        all_vectors = np.array(all_vectors)
        print(all_vectors)
        print(all_vectors.shape)
        matrix = cosine_similarity(all_vectors, enquiry_vector)
        return (matrix)

    def return_vectors(self, text):
        vectorfile = self.bert_embedding([text])
        print(len(vectorfile))
        #for i in range(len(vectorfile)):
        vectorlist = vectorfile[0][1]
        #print(vectorlist)
        # start from zeros (np.empty would seed the running sum with garbage values)
        sum_vector = np.zeros(shape=vectorlist[0].shape)
        sum_amt = 0
        for vector in vectorlist:
            sum_amt += (vector[0])
            sum_vector += vector
        sum_vector /= len(vectorlist)
        sum_vector = np.nan_to_num(sum_vector)
        return (sum_vector)
Example #8
def getEmbeddings(df,
                  n_restaurants=None,
                  average=True,
                  cuisines=True,
                  full=False):

    print('Cleaning Zomato data')
    cuisines, urls, names = zomatoPreprocess(df, cuisines=cuisines, full=full)

    if n_restaurants is None: n_restaurants = len(cuisines)

    print('Retrieving BERT sentence representations for {} restaurants...\n'.
          format(n_restaurants))
    __bert_embedding = BertEmbedding(model='bert_12_768_12')
    __berts = __bert_embedding(cuisines[:n_restaurants])
    bagofembeddings = __bagofBERTs(names, average, urls, __berts)

    print('Complete.')

    filtrd = [(n, u, c, e) for n, u, c, e in bagofembeddings
              if len(e.shape) > 0]

    cuisines = [c for n, u, c, e in filtrd]
    embeds = [e for n, u, c, e in filtrd]
    names = [n for n, u, c, e in filtrd]
    urls = [u for n, u, c, e in filtrd]

    return bagofembeddings
Example #9
 def __init__(self):
     self.bert_embedding = BertEmbedding()
     # self.vectors_bank_dic = load_obj("word2vec")
     basepath = os.path.abspath(".")
     # build the path with os.path.join to avoid backslash-escape issues on Windows
     data = bz2.BZ2File(os.path.join(basepath, "data", "bert",
                                     "word2vectorUpdate.pbz2"), 'rb')
     self.vectors_bank_dic = cPickle.load(data)
Example #10
def run():
    # In[5]:

    baseline = Baseline(DIR, DATASET, MAX_SEQUENCE_LENGTH_T,
                        MAX_SEQUENCE_LENGTH_D)

    # In[6]:

    baseline.load_ids(DIR)
    print(len(baseline.bug_ids))

    # In[8]:

    # #### Read the corpus from bugs
    load_bugs(baseline)
    # In[9]:

    sent_title = [
        baseline.bug_set[bug_id]['title'][:MAX_SEQUENCE_LENGTH_T]
        for bug_id in baseline.bug_ids
    ]
    sent_desc = [
        baseline.bug_set[bug_id]['description'][:MAX_SEQUENCE_LENGTH_D]
        for bug_id in baseline.bug_ids
    ]

    # In[10]:

    print(len(sent_title), len(sent_desc))

    # ### BERT embedding

    # In[11]:

    ctx = mx.gpu(0)
    bert_embedding = BertEmbedding(ctx,
                                   batch_size=32,
                                   max_seq_length=MAX_SEQUENCE_LENGTH_D)

    # ### Save dataset vocabulary embedding
    # In[23]:

    # res = paralelize_processing([baseline.bug_ids, sent_title, sent_desc],
    #                                     vectorizing_bugs, (baseline.DIR, bert_embedding, baseline.bug_set, ))

    vectorizing_bugs([baseline.bug_ids, sent_title, sent_desc], baseline.DIR,
                     bert_embedding, baseline.bug_set)
    print("Vectorized all dataset with BERT.")
    # In[ ]:

    bug_selected = np.random.choice(baseline.bug_ids, 1)[0]

    bug = baseline.bug_set[bug_selected]

    print("Testing if a random bug has bert embeddings")

    assert len(bug['title_bert_embed']) == 768
    assert len(bug['desc_bert_embed']) == 768

    print("Embedding with BERT trained!")
Example #11
def get_bert_embedding_of_several_words_as_pd_df(logger = None
                                                , phrase_in = None
                                                , root_colnames = 'dim_'
                                                , dim_vector_rep = 768):
    try:
        lst_phrase = [phrase_in]
        colnames = ['{0}{1}'.format(root_colnames, i) for i in range(1, dim_vector_rep + 1)]
        
        if logger is not None:
            logger.info(' - computing BERT representation for input token: \'{0}\''.format(phrase_in))
        
        bert_embedding = BertEmbedding()
        bert_rep = bert_embedding(lst_phrase)
        
        lst_words = bert_rep[0][0]
        lst_bert_rep = bert_rep[0][1]
        
        w_context_vect = pd.DataFrame(data = {'id_context': [i for i in range(1, len(lst_words) + 1)]
                                                , 'w_context': lst_words
                                                , 'w_context_bert': lst_bert_rep
                                                })
        
        df_vect = pd.concat([pd.DataFrame(data = w_context_vect['w_context_bert'][i].reshape(1, len(w_context_vect['w_context_bert'][i]))
                                            , columns = colnames) for i in range(len(w_context_vect.index))])
        df_vect.index = w_context_vect.index

        w_context_vect = pd.concat([w_context_vect[['id_context', 'w_context']], df_vect], axis = 1)
        
    except Exception:
        if logger is not None:
            logger.exception("ERROR getting BERT-embedding of token \'{0}\'".format(phrase_in))
        raise

    return w_context_vect
Example #12
 def load_bert(self):
     from bert_embedding import BertEmbedding
     import mxnet as mx
     bert = BertEmbedding(model='bert_12_768_12',
                          dataset_name='wiki_cn_cased',
                          ctx=mx.gpu(0))
     return bert
Example #13
class Embedder:
    def __init__(self, max_seq_length, batch_size=32):
        self.embedder = BertEmbedding(max_seq_length)
        self.batch_size = batch_size

    def fit(self, X, y):
        return self

    def transform(self, X):
        #result = []
        #for i, row in enumerate(X):
        #    embedding = self.embedder.project(row)
        #    result.append(embedding)

        # batching
        result = []
        i = 0

        while i < len(X):
            print("start processing {} / {}".format(i, len(X)))
            batch = X[i:(i + self.batch_size)]
            embedding = self.embedder.project_batch(batch)

            result += embedding

            i += self.batch_size

        return np.array(result)
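A short, hypothetical usage of the wrapper above; it assumes numpy and BertEmbedding are imported at module level as in the original project, that project_batch returns one embedding per input row as the loop implies, and that the input texts are invented:

texts = ["first example sentence", "second example sentence",
         "third example sentence"]
embedder = Embedder(max_seq_length=25, batch_size=2)
features = embedder.fit(texts, None).transform(texts)
print(features.shape)  # one embedding per input text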
Example #14
def predictor(model_path,
              arg1,
              arg2=None,
              originalArg1=None,
              originalArg2=None,
              ADU=False,
              verbose=1,
              fullText=None):
    """Generates model readable data from propositions to predict"""
    if verbose > 0:
        print('Start loading resources...')
    model = load_model(model_path)
    bert_embedding = BertEmbedding(model='bert_12_768_12',
                                   dataset_name='book_corpus_wiki_en_cased',
                                   max_seq_length=35)
    if verbose > 0:
        print('Generate prediction...')
    fulls = [fullText.replace('\n', ' ')] * len(arg1)
    data = data_builder.generate_data_with_features(arg1, arg2, originalArg1,
                                                    originalArg2, fulls,
                                                    bert_embedding)
    if not ADU:
        features = model_builder.select_FFNN_features(
            data, shared_feature_list=None, original_bert=True)
        prediction = model.predict(features)
        if verbose > 0:
            print(prediction)
        return prediction
    else:
        features = model_builder.select_FFNN_features(
            data, shared_feature_list=None, original_bert=True, has_2=False)
        prediction = model.predict(features)
        if verbose > 0:
            print(prediction)
        return prediction
Example #15
    def _get_bert_embed(tokens):

        embedding = BertEmbedding().embedding(sentences=tokens)

        word_array = []
        for i in range(len(embedding)):
            word_array.append(embedding[i][1][0])
        return word_array
Example #16
    def fit(self, X, y=None):
        if self.bert_args['bert_model'] is not None and self.bert_args[
                'bert_dataset_name'] is not None:
            if self.bert_args['ctx'] is not None:
                self.ft_model = BertEmbedding(
                    model=self.bert_args['bert_model'],
                    dataset_name=self.bert_args['bert_dataset_name'],
                    ctx=self.bert_args['ctx'])
            else:
                self.ft_model = BertEmbedding(
                    model=self.bert_args['bert_model'],
                    dataset_name=self.bert_args['bert_dataset_name'])

        else:
            self.ft_model = BertEmbedding()

        return self
Example #17
def gen_features(X, wday, yday, i, j, n, tf=None, u=None, n_features=1400):
    from sklearn.feature_extraction import FeatureHasher
    from sklearn.decomposition import TruncatedSVD
    from scipy.spatial.distance import cosine

    from collections import Counter
    import numpy

    from bert_embedding import BertEmbedding

    import mxnet as mx

    ctx = mx.gpu(0)
    bert_embedding = BertEmbedding(ctx=ctx)

    result = bert_embedding(X)

    if tf is None:
        tf = Counter()
        for r in result:
            tf.update(r[0])

    N = sum(tf.values())

    h = FeatureHasher(n_features=n_features, input_type="string")

    def s_from_w(s):
        words = s[0]
        embedding = numpy.array(s[1])
        embedding = numpy.concatenate(
            (embedding, h.transform(words).toarray()), axis=1)
        weight = numpy.array([1 / (1 + tf[x] / N) / len(words) for x in words])
        return weight.dot(embedding)

    SX = numpy.array([s_from_w(x) for x in result])

    if u is None:
        svd = TruncatedSVD(n_components=1, n_iter=8, random_state=42)
        svd.fit(SX)
        u = svd.components_

    v2 = SX - SX.dot(u.transpose()) * u

    wday = numpy.array(wday, ndmin=2)
    yday = numpy.array(yday, ndmin=2)

    max_sim = wday * 0

    for K, _ in enumerate(max_sim):
        if i[K] > 0:
            max_sim[K] = max((1 - cosine(SX[K, :], SX[K2, :]))**2
                             for K2 in range(K) if j[K] == j[K2])

    i = numpy.array(i, ndmin=2)
    i_scaled = i / numpy.array(n, ndmin=2)

    return numpy.hstack(
        (wday.T, yday.T, i.T, i_scaled.T, max_sim.T, v2)), tf, u
Example #18
def create_embedding(name):
    if name == "BERT":
        return BertEmbedding(max_seq_length=100, model='bert_12_768_12', # COMMENT: will we ever change max_seq_length?
                             dataset_name='book_corpus_wiki_en_cased')
    # elif name == "FastText":
    #     return FastText()
    else:
        print(f"ERROR: Unknown embedding type '{name}'! Supported embeddings: 'BERT' and 'FastText'.")
        exit(-1)
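As a sketch, the factory above can be exercised like this; the sample sentence is made up, and the returned BertEmbedding instance is callable on a list of sentences:

embedding = create_embedding("BERT")
tokens, vectors = embedding(["dropout regularizes the network"])[0]
print(len(tokens), vectors[0].shape)  # token count and a (768,) vector per token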
Example #19
        def get_bert_embedding(self):
            '''Creates word embeddings taken from BERT language representation
            returns: list, list
              Returns the BERT embeddings of the tokens of two sentences'''

            bert_embedding = BertEmbedding().embedding(
                sentences=self.tokenized_sent)

            return bert_embedding
Example #20
def wordlist_to_bert(wordlist_file):
    x_list = np.load(wordlist_file, allow_pickle=True)
    bert = BertEmbedding(max_seq_length=PADDING_LEN)
    bert_list = []
    for sentence in x_list:
        words, vectors = zip(*bert(sentence))
        flat_vectors = [item for sublist in vectors for item in sublist]
        bert_list.append(np.squeeze(np.asarray(flat_vectors)))
        #print(bert_list[-1].shape)
    return bert_list
Example #21
def create_sent_embd(df):
    from bert_embedding import BertEmbedding
    bert_embedding = BertEmbedding()
    l = list(df['statement_filt'])
    result = bert_embedding(l)
    from tqdm import tqdm_notebook as tqdm
    vect = []
    for i in tqdm(range(len(result))):
        vect.append(result[i][1])
    return vect
Example #22
def saveBERT(embedding_file, data_file='../data/FrameTerms_refined.ibo'):
    bert = BertEmbedding(max_seq_length=100)
    f = open(data_file)
    data = f.read().strip('\n\n')
    data = data.split('\n\n')
    f.close()
    sentences=[]
    ids={}
    for sentence in data:
        words=[]
        terms= sentence.split('\n')
        if len(terms)<2:
            continue
        for wordVector in terms:
            word = wordVector.split(' ')[1]
            words.append(word)
        text=str.join(' ', words)
        if len(text)>0:
            sentences.append(text)
            ids[len(ids)]= text
    print("Data loaded...")
    #bert_encoding = bert(sentences)
    print("Data processed...")
    bert_embeddings={}
    #f = open(embedding_file, 'w')
    f = open(embedding_file)
    prev = f.readlines()
    f.close()
    for i in range(0, len(ids), 50):
        bert_encoding = bert(sentences[i:i+50])
        print(str(i)+'/'+str(len(ids)))
        for j in range(50):
            #id= ids[i+j]
            #bert_embed= bert(sentences[i])[0][1]
            if j<len(bert_encoding):
                bert_embed= bert_encoding[j][1]
                x_tensor = torch.tensor(bert_embed, dtype=torch.float)
                vector = x_tensor.tolist()
                #f.write(id+'\t'+vector+'\n')
                bert_embeddings[i+j]= vector
    f= open(embedding_file, 'w')
    for line in prev:
        f.write(line)
    for i in range(len(ids)):
        f.write(str(bert_embeddings[i])+'\n')
    f.close()
    f=open('../data/embeddings/bert/indexer.json')
    indexer=json.load(f)
    f = open('../data/embeddings/bert/indexer2.json', 'w')
    for i in range(len(indexer)):
        f.write(indexer[i]+'\n')
    for s in sentences:
        f.write(s+'\n')
    f.close()
Example #23
 def pa_to_sen(self):
     """ input paragraph in string
         output list of m by 2
         where m is number of sentences
         n= [0] element is txt, n=[1] is 768 by w where w is number of words                             
     """
     bert_embedding = BertEmbedding()
     self.sentences = self.description.split('. ')
     self.sent_bert = bert_embedding(self.sentences)
     self.num_sen = len(self.sent_bert)
     return self.sent_bert, self.num_sen, self.sentences
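The same sentence-level result can be reproduced outside the class in a few lines; this is a sketch with a made-up paragraph, using the default bert_embedding model:

from bert_embedding import BertEmbedding

description = "BERT encodes every token. Each vector has 768 dimensions."
bert_embedding = BertEmbedding()
sent_bert = bert_embedding(description.split('. '))
print(len(sent_bert))            # number of sentences
print(sent_bert[0][0])           # tokens of the first sentence
print(sent_bert[0][1][0].shape)  # (768,) vector for its first token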
Example #24
    def build(self):
        # build word embedding

        #in_id = Input(shape=(self._bretMaxLen,), name="input_ids")
        #in_mask = Input(shape=(self._bretMaxLen,), name="input_masks")
        #in_segment = Input(shape=(self._bretMaxLen,), name="segment_ids")
        #inputs = [in_id, in_mask, in_segment]

        #word_embeddings = BertLayer(n_fine_tune_layers=3,bert_path=self._bert_path)(inputs)
        word_ids = Input(batch_shape=(None, None),
                         dtype='int32',
                         name='word_input')
        word_embeddings = BertEmbedding(
            model='bert_24_1024_16',
            dataset_name='book_corpus_wiki_en_cased')(word_ids)

        # build character based word embedding
        # if self._use_char:
        #     print("char Embedding layer On")
        #     char_ids = Input(batch_shape=(None, None, None), dtype='int32', name='char_input')
        #     inputs.append(char_ids)
        #     char_embeddings = Embedding(input_dim=self._char_vocab_size,
        #                                 output_dim=self._char_embedding_dim,
        #                                 mask_zero=True,
        #                                 name='char_embedding')(char_ids)
        #     char_embeddings = TimeDistributed(Bidirectional(LSTM(self._char_lstm_size)))(char_embeddings)
        #     word_embeddings = Concatenate()([word_embeddings, char_embeddings])
        #
        #     word_embeddings = Dropout(self._dropout)(word_embeddings)

        z = Bidirectional(
            LSTM(units=self._word_lstm_size,
                 return_sequences=True,
                 dropout=self._layerdropout,
                 recurrent_dropout=self._layerdropout))(word_embeddings)
        if (self._layer2Flag):
            z = Bidirectional(
                LSTM(units=self._word_lstm_size,
                     return_sequences=True,
                     dropout=self._layerdropout,
                     recurrent_dropout=self._layerdropout))(z)
        z = Dense(self._fc_dim, activation='tanh')(z)

        if self._use_crf:
            crf = CRF(self._num_labels, sparse_target=False)
            loss = crf.loss_function
            pred = crf(z)
        else:
            loss = 'categorical_crossentropy'
            pred = Dense(self._num_labels, activation='softmax')(z)

        model = Model(inputs=word_ids, outputs=pred)

        return model, loss
Example #25
def test_bert_embedding():
    bert_abstract = """We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers.
     Unlike recent language representation models, BERT is designed to pre-train deep bidirectional representations by jointly conditioning on both left and right context in all layers.
     As a result, the pre-trained BERT representations can be fine-tuned with just one additional output layer to create state-of-the-art models for a wide range of tasks, such as question answering and language inference, without substantial task-specific architecture modifications. 
    BERT is conceptually simple and empirically powerful. 
    It obtains new state-of-the-art results on eleven natural language processing tasks, including pushing the GLUE benchmark to 80.4% (7.6% absolute improvement), MultiNLI accuracy to 86.7 (5.6% absolute improvement) and the SQuAD v1.1 question answering Test F1 to 93.2 (1.5% absolute improvement), outperforming human performance by 2.0%."""
    sentences = bert_abstract.split('\n')
    bert_embedding = BertEmbedding()
    embedding = bert_embedding(sentences)
    assert len(embedding) == 5
    assert len(embedding[0]) == 2
    assert len(embedding[0][0]) == 18
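For reference, the (tokens, vectors) structure asserted above can be unpacked directly; a minimal sketch on a single made-up sentence:

from bert_embedding import BertEmbedding

bert_embedding = BertEmbedding()
tokens, vectors = bert_embedding(["BERT is conceptually simple."])[0]
print(tokens)            # word-piece tokens of the sentence
print(vectors[0].shape)  # (768,) with the default bert_12_768_12 model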
Example #26
    def bert(self, prep_obj):
        bert_embedding = BertEmbedding(
            model='bert_12_768_12', dataset_name='book_corpus_wiki_en_cased')
        result = np.array(bert_embedding(prep_obj.detokenized_corpus))

        for sentence in result:
            # reset the accumulator per sentence so averages do not leak across sentences
            vec = np.zeros(768)
            for word_vec in sentence[1]:
                vec = np.add(vec, np.array(word_vec))
            vec = np.true_divide(
                vec, 1 if len(sentence[1]) == 0 else len(sentence[1]))
            self.vector_corpus.append(vec)
Example #27
def generateData(corpus_file, classes, split_percentage, load_embedding_from_file=False, save_embedding_dict=False, verbose=True, embedding_dict_filename="embedding_dicts/embedding_dict.pkl", shuffle=True, ignore_words=None):
	if ignore_words is None:
		ignore_words = []
	sentences = []
	# Read in corpus
	if verbose: print("Loading corpus...")
	with open(corpus_file, 'r') as f:
		for line in f:
			if any(w.lower() in line.lower() for w in classes):
				sentences.append(line.strip().replace('-', ' '))
	if shuffle: random.shuffle(sentences)
	#sentences = sentences[0:50]
	if verbose: print("Computing Bert Embeddings...")
	# split into train and test sets
	num_train = int(len(sentences)*split_percentage)
	train_set = sentences[0:num_train] 
	test_set = sentences[num_train:]
	# Create dictionary of training/test sets
	if verbose: print(len(train_set), len(test_set))
	words = [w.lower() for line in train_set + test_set for w in line.split()]
	words = list(set(words))
	bert = BertEmbedding()
	if load_embedding_from_file:
		with open(embedding_dict_filename, 'rb') as f:
			embedding_dict = pickle.load(f)
	else:
		embedding_dict = dict([(x[0], y[0]) for x, y in bert(words)])
	if save_embedding_dict:
		with open(embedding_dict_filename, 'wb') as f:
			pickle.dump(embedding_dict, f, pickle.HIGHEST_PROTOCOL)

	if verbose: print("Preparing dataset...")
	def create_dataset(dataset, max_len):
		data = []
		labels = []
		for sentence in dataset:
			for i in range(len(classes)):
				word = classes[i]
				if word.lower() in sentence.lower():
					s = sentence.lower().split()
					x = [np.array(embedding_dict[w]) for w in s if w not in word.lower() and w not in ignore_words and w in embedding_dict]
					x = np.array(x)
					max_len = max_len if x.shape[0] < max_len else x.shape[0]
					x = torch.tensor(x, dtype=torch.float)
					x = x.unsqueeze(0)
					data.append(x)
					y = [i]
					y = torch.tensor(y, dtype=torch.long)
					labels.append(y)
		return data, labels, max_len		
	train_set, train_labels, max_len = create_dataset(train_set, 0)
	test_set, test_labels, max_len = create_dataset(test_set, max_len)
	return train_set, train_labels, test_set, test_labels, classes 
Example #28
def get_bert_embs(texts, label):
    ctx = mx.gpu(0)
    bert_embedding = BertEmbedding(model='bert_12_768_12', dataset_name='wiki_multilingual_cased',
                                   max_seq_length=2048, ctx=ctx)
    embs_list = []
    length = len(texts) if len(texts) <= 10000 else 10000
    for i in tqdm(range(0, length)):
        if type(texts[i]) is not float:
            result = bert_embedding([texts[i]])
            embs_list.append(result[0][1][1:-1])
    with open(f'../data/reviews-eng/embs_full/embs_{label}.bin', 'wb') as file:
        pickle.dump(embs_list, file)
Example #29
def tree_to_bert(tree_labels, verbose=False):
    bert = BertEmbedding(max_seq_length=PADDING_LEN)
    x_list = []
    y_list = []
    for t in tree_labels:
        y, x = t.to_labeled_lines()[0]
        y_list.append(y)
        str_list, arr_list = zip(*bert([x]))
        if verbose:
            print(str_list)
        x_list.append(np.squeeze(np.asarray(arr_list)))
        # print(x_list[-1].shape)
    return x_list, y_list
Example #30
class Embedder:
    def __init__(self, max_seq_length, batch_size):
        self.batch_size = batch_size
        if batch_size == 1:
            self.embedder = BertEmbedding(512)
            print("seq length set to Bert maximum 512 when batch size is 1")
        else:
            self.embedder = BertEmbedding(max_seq_length)

    def fit(self, X, y):
        return self

    def transform(self, X):
        if self.batch_size == 1:
            return self.__transform_in_single(X)
        else:
            return self.__transform_in_batch(X)

    def __transform_in_single(self, X):
        result = []
        for i, row in enumerate(X):
            embedding = self.embedder.project(row)
            result.append(embedding)
        return np.array(result)

    def __transform_in_batch(self, X):
        result = []
        i = 0

        while i < len(X):
            print("start processing {} / {}".format(i, len(X)))
            batch = X[i:(i + self.batch_size)]
            embedding = self.embedder.project_batch(batch)

            result += embedding

            i += self.batch_size

        return np.array(result)