import nltk
import keras
from gensim.sklearn_api import W2VTransformer


def encode_dataset(dataset, max_len, enc_dim):
    # Create a model to represent each word by an enc_dim-dimensional vector;
    # only the first max_len words of each document are embedded.
    dataset['text'] = dataset['text'].map(
        lambda text: nltk.tokenize.word_tokenize(text))
    dataset['length'] = dataset['text'].map(lambda text: len(text))
    model = W2VTransformer(size=enc_dim, min_count=1, seed=1)
    wordvecs = model.fit(dataset['text'].values)
    embeddings = []
    targets = []
    for row in range(len(dataset['text'])):
        # Truncate to max_len tokens; shorter documents are unaffected by the slice.
        embeddings.append(wordvecs.transform(dataset['text'][row][:max_len]))
        targets.append(dataset['spam'][row])
    # Pad as floats so the embedding values are not truncated to integers.
    x_lstm_sentence_seq = keras.preprocessing.sequence.pad_sequences(
        embeddings, dtype='float32')
    return x_lstm_sentence_seq, targets
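# A minimal usage sketch for encode_dataset, assuming a pandas DataFrame with
# 'text' and 'spam' columns (the column names and data here are illustrative;
# nltk.word_tokenize also requires the NLTK 'punkt' tokenizer data):
import pandas as pd

df = pd.DataFrame({
    'text': ['free prize waiting for you', 'meeting moved to noon'],
    'spam': [1, 0],
})
x_seq, y = encode_dataset(df, max_len=10, enc_dim=8)
# x_seq has shape (n_documents, longest_document, enc_dim) after padding.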
def fit(self, x, y=None):
    """Fit a word2vec model on x, a list of sentences."""
    self.tokens = [s.split() for s in x]
    self.wordvecs = W2VTransformer(size=self.size, min_count=self.min_count,
                                   window=self.window, sg=self.sg).fit(self.tokens)
    return self
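# The fit() above belongs to a custom scikit-learn-style wrapper. A minimal
# sketch of the enclosing class, with a hypothetical name (SentenceW2V) and
# constructor defaults that are assumptions, not taken from the original:
from gensim.sklearn_api import W2VTransformer


class SentenceW2V:
    def __init__(self, size=100, min_count=1, window=5, sg=0):
        # Hyperparameters read by fit().
        self.size = size
        self.min_count = min_count
        self.window = window
        self.sg = sg

    def transform(self, x):
        # Embed each whitespace-tokenized sentence word by word.
        return [self.wordvecs.transform(s.split()) for s in x]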
def w2v_scikit(data):
    from gensim.sklearn_api import W2VTransformer
    # Create a model that represents each word by a len(data)-dimensional vector.
    model = W2VTransformer(size=len(data), min_count=1, seed=1)
    # print(model.gensim_model.wv.vocab)
    # What are the vector representations of the words 'taken' and 'arms'?
    wordvecs = model.fit(data).transform(['taken', 'arms'])
    # transform returns one row per queried word.
    assert wordvecs.shape == (2, len(data))
    return wordvecs
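# A usage sketch: the queried words 'taken' and 'arms' must occur in the
# training sentences, so this toy corpus is chosen accordingly (illustrative
# data, not from the original source):
sentences = [['arms', 'taken', 'up'], ['taken', 'for', 'granted']]
vecs = w2v_scikit(sentences)  # shape (2, len(sentences))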
import logging

import numpy as np
from gensim.utils import simple_preprocess
from gensim.sklearn_api import W2VTransformer

logger = logging.getLogger(__name__)


def __init__(self, corpus, idx, dim=50, window=3,
             training_algorithm='skip', n_epochs=5):
    self.name = 'word2vec'
    self.idx = idx
    self.dim = dim
    self.window = window
    self.n_epochs = n_epochs
    logger.info("Inferring word2vec from data")
    self.corpus = [simple_preprocess(doc, deacc=True) for doc in corpus]
    # In gensim, sg=1 selects skip-gram and sg=0 selects CBOW.
    self.vectorizer = W2VTransformer(
        size=dim, window=window,
        sg=1 if training_algorithm == 'skip' else 0,
        iter=n_epochs)
    self.vectorizer = self.vectorizer.fit(self.corpus)
    self.vectors = []
    for doc in self.corpus:
        doc_vector = []
        for word in doc:
            try:
                doc_vector.append(self.vectorizer.transform(word))
            except KeyError:
                # Skip words pruned from the vocabulary.
                continue
        if len(doc_vector) > 0:
            # Average the word vectors to get a document vector.
            self.vectors.append(np.mean(doc_vector, axis=0))
        else:
            self.vectors.append(np.ones(shape=(1, dim)))
    self.vectors = np.concatenate(self.vectors, axis=0)
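# A usage sketch, assuming the __init__ above belongs to a class named
# Word2VecFeaturizer (the class name and the idx argument's meaning are
# assumptions for illustration):
docs = ['Human machine interface', 'A survey of user opinion']
featurizer = Word2VecFeaturizer(corpus=docs, idx=range(len(docs)),
                                dim=50, training_algorithm='skip')
doc_matrix = featurizer.vectors  # shape: (n_documents, dim)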
# for full details.

#%%
sizes_list = []
to_concat = []
split = 0
for train, val in ShuffleSplit(n_splits=5).split(d.enc):
    print('Performing cross-validation split: {}'.format(split))
    split += 1  # advance the fold counter
    # Prepare the data for the fold.
    d.cross_val_split(train, val)
    (profiles_train, targets_train, seq_train, active_meds_train, depa_train,
     targets_val, seq_val, active_meds_val, depa_val) = d.make_lists()
    # Train word2vec embeddings.
    w2v = Pipeline([
        ('w2v', W2VTransformer(alpha=W2V_ALPHA, iter=W2V_ITER,
                               size=W2V_EMBEDDING_DIM, hs=W2V_HS, sg=W2V_SG,
                               min_count=W2V_MIN_COUNT, workers=W2V_WORKERS)),
    ])
    print('Fitting word2vec embeddings...')
    w2v.fit(profiles_train)
    # Precompute L2-normalized vectors, discarding the raw ones to save memory.
    w2v.named_steps['w2v'].gensim_model.init_sims(replace=True)
    # Fit the profile state encoder (PSE) pipeline.
    print('Fitting PSE...')
    pse_data = [[ap, de] for ap, de in zip(active_meds_train, depa_train)]
    n_pse_columns = len(pse_data[0])
    pse_transformers = []
    for i in range(n_pse_columns):
        pse_transformers.append(('pse{}'.format(i),
                                 CountVectorizer(binary=True, lowercase=False,
                                                 preprocessor=pse_pp,
                                                 analyzer=pse_a), i))
    pse_pipeline_transformers = [
        ('columntrans', ColumnTransformer(transformers=pse_transformers))
    ]
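#%%
# The W2V_* hyperparameters referenced above are defined elsewhere in the
# original script; the values below are illustrative placeholders only, not
# the authors' settings:
W2V_ALPHA = 0.025          # initial learning rate
W2V_ITER = 5               # training epochs
W2V_EMBEDDING_DIM = 128    # embedding dimensionality
W2V_HS = 0                 # 0 = negative sampling, 1 = hierarchical softmax
W2V_SG = 1                 # 1 = skip-gram, 0 = CBOW
W2V_MIN_COUNT = 5          # ignore tokens rarer than this
W2V_WORKERS = 4            # training threads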
with open(profiles_path, mode='rb') as file:
    data = pickle.load(file)
data = list(data.values())
print('Data successfully loaded.')

#%%[markdown]
# ## Transformers
#
# Prepare the word2vec and clustering transformers

#%%[markdown]
# ### Word2vec transformer

#%%
w2v_pipe = Pipeline([
    ('w2v', W2VTransformer()),
])

#%%[markdown]
# ### Clustering transformer

#%%
clust_pipe = Pipeline([
    ('ac', AgglomerativeClustering()),
])

#%%[markdown]
# ## Helper functions
#
# These are scoring functions that will be used to score
# the word2vec embeddings and the clustering of the embeddings.
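#%%
# The original scoring functions are not shown in this excerpt; as a
# stand-in, a silhouette-based clustering score could look like this
# (an assumption, not the authors' implementation):
from sklearn.metrics import silhouette_score


def score_clustering(embeddings, labels):
    # Higher silhouette indicates tighter, better-separated clusters.
    return silhouette_score(embeddings, labels)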
def do_word2vec(data):
    data = tokenize(data)
    # A one-dimensional embedding: each word maps to a single scalar.
    model = W2VTransformer(size=1, min_count=1, seed=42)
    return fit_transform(model, data)
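# do_word2vec relies on two helpers that are not shown; these minimal
# versions are hypothetical stand-ins for illustration only:
def tokenize(texts):
    # Split each raw string into a list of lowercase tokens.
    return [t.lower().split() for t in texts]


def fit_transform(model, tokenized):
    # Fit the transformer, then embed every token of every document.
    fitted = model.fit(tokenized)
    return [fitted.transform(doc) for doc in tokenized]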
from gensim.test.utils import common_texts
from gensim.sklearn_api import W2VTransformer


def toy_model_keyed_vectors():
    """Instantiate a trainable word2vec vectorizer and return its KeyedVectors."""
    model = W2VTransformer(size=10, min_count=1, seed=42)
    model.fit(common_texts)
    return model.gensim_model.wv
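# The returned object is a gensim KeyedVectors; a quick usage sketch:
wv = toy_model_keyed_vectors()
vec = wv['graph']                     # 10-dimensional vector for 'graph'
neighbors = wv.most_similar('graph')  # nearest words by cosine similarity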
def wrd2vc():
    # Note: gensim expects trim_rule to be a callable with signature
    # (word, count, min_count) returning a RULE_* constant; lemmatize is
    # assumed to satisfy that contract here.
    return W2VTransformer(size=300, window=3, min_count=3, sg=1,
                          trim_rule=lemmatize)
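# For reference, a trim_rule matching gensim's expected signature; this is an
# illustrative example, not the lemmatize function used above:
from gensim.utils import RULE_DEFAULT, RULE_DISCARD


def drop_short_words(word, count, min_count):
    # Discard one- and two-character tokens; defer to default handling otherwise.
    return RULE_DISCARD if len(word) < 3 else RULE_DEFAULT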
from gensim.test.utils import common_texts
from gensim.sklearn_api import W2VTransformer
import code

# Create a model to represent each word by a 10-dimensional vector.
model = W2VTransformer(size=10, min_count=1, seed=1)

# What are the vector representations of the words 'graph' and 'system'?
wordvecs = model.fit(common_texts).transform(['graph', 'system'])
assert wordvecs.shape == (2, 10)

# Drop into an interactive shell to inspect the results.
code.interact(local=locals())