def test_skipgrams():
    # test with no window size and binary labels
    couples, labels = skipgrams(np.arange(3), vocabulary_size=3)
    for couple in couples:
        assert couple[0] in [0, 1, 2] and couple[1] in [0, 1, 2]

    # test window size and categorical labels
    couples, labels = skipgrams(np.arange(5), vocabulary_size=5,
                                window_size=1, categorical=True)
    for couple in couples:
        assert couple[0] - couple[1] <= 3
    for l in labels:
        assert len(l) == 2
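# A minimal standalone sketch (not part of the test above) illustrating the
# return format of skipgrams: `couples` is a list of [target, context] index
# pairs and `labels` marks each pair as a true context (1) or a negative
# sample (0).
from keras.preprocessing.sequence import skipgrams

demo_couples, demo_labels = skipgrams([1, 2, 3, 4], vocabulary_size=5,
                                      window_size=1)
for (target, context), label in zip(demo_couples, demo_labels):
    print(target, context, label)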
def loadData():
    '''
    I love green eggs and ham.
    (context, word): ([I, green], love) ([love, eggs], green) ([green, and], eggs)
    --------------> (love, I) 1 (love, green) 1
    :return:
    '''
    text = 'I love green eggs and ham .'
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts([text])
    word2id = tokenizer.word_index
    id2word = {v: k for k, v in word2id.items()}
    wids = [word2id[w] for w in text_to_word_sequence(text, split=' ')]
    pairs, labels = skipgrams(wids, len(word2id))
    print(len(pairs), len(labels))
    for i in range(10):
        print("({:s} ({:d}), {:s} ({:d})) -> {:d}".format(
            id2word[pairs[i][0]], pairs[i][0],
            id2word[pairs[i][1]], pairs[i][1],
            labels[i]))
def train_corpus(self, negative_samples=20, window_size=4):
    """
    Train the model on the given corpus

    Parameters:
        negative_samples (int): the number of `false contexts' for each word
        window_size (int): the size of each context
    """
    logging.info('Initialising sampling table')
    sampling_table = sequence.make_sampling_table(self.vocab_size)
    ans = []
    for i, seq in enumerate(
            self.tokenizer.texts_to_sequences_generator(self.corpus)):
        logging.info(i)
        couples, labels = sequence.skipgrams(
            seq, self.vocab_size, window_size=window_size,
            negative_samples=negative_samples, sampling_table=sampling_table)
        if couples:
            word_target, word_context = zip(*couples)
            word_target = np.array(word_target, dtype="int32")
            word_context = np.array(word_context, dtype="int32")
            loss = self.model.train_on_batch([word_target, word_context],
                                             labels)
            ans.append(loss)
    return ans
def _fit_embeddings(self, text):
    sampling_table = sequence.make_sampling_table(max_words)
    for e in range(self.n_epochs):
        print('-' * 40)
        print('Epoch', e)
        print('-' * 40)
        progbar = generic_utils.Progbar(self.tokenizer.document_count)
        samples_seen = 0
        losses = []
        for i, seq in enumerate(
                self.tokenizer.texts_to_sequences_generator(text)):
            # MAKE SURE TOKENIZER AND FITTING ARE WORKING
            # if i < 5:
            #     print(map(lambda x: reverse_word_index[x], seq))
            # get skipgram couples for one text in the dataset
            couples, labels = sequence.skipgrams(
                seq, max_words, window_size=self.window_size,
                negative_samples=1., sampling_table=sampling_table)
            if couples:
                # one gradient update per sentence
                # (one sentence = a few 1000s of word couples)
                X = np.array(couples, dtype="int32")
                loss = self.embedding_model.train_on_batch(X, labels)
                losses.append(loss)
                if len(losses) % 100 == 0:
                    progbar.update(i, values=[("loss", np.mean(losses))])
                    losses = []
                samples_seen += len(labels)
        print('Samples seen:', samples_seen)
    print("Training completed!")
    return self
def train_on_model(model, g, vocab_size, max_len=10, epochs=100,
                   print_every=10, window_size=4, negative_sampling=1.0,
                   sampling_table=None):
    losses, valid_sequences = 0.0, 0
    for epoch in range(epochs):  # was Python 2 `xrange`
        sequences = pad_sequences(
            [g.random_walk(k, max_len) for k in range(vocab_size)])
        X_couples = []
        y_labels = []
        for seq in sequences:
            couples, labels = skipgrams(
                seq, vocab_size, window_size=window_size,
                negative_samples=negative_sampling,
                sampling_table=sampling_table)
            X_couples.extend(couples)
            y_labels.extend(labels)
            if len(couples) == 0:
                continue
            valid_sequences += 1
        loss = train_batch(model, X_couples, y_labels)
        losses += loss
        if epoch % print_every == 0:
            logging.info("Mean loss in Epoch [%s] with %s valid sequences = %s"
                         % (epoch, valid_sequences, losses / valid_sequences))
            losses, valid_sequences = 0.0, 0
for e in range(nb_epoch):
    print('-' * 40)
    print('Epoch', e)
    print('-' * 40)
    progbar = generic_utils.Progbar(tokenizer.document_count)
    samples_seen = 0
    losses = []
    for i, seq in enumerate(
            tokenizer.texts_to_sequences_generator(text_generator())):
        # get skipgram couples for one text in the dataset
        couples, labels = sequence.skipgrams(
            seq, max_features, window_size=4, negative_samples=1.,
            sampling_table=sampling_table)
        if couples:
            # one gradient update per sentence
            # (one sentence = a few 1000s of word couples)
            X = np.array(couples, dtype="int32")
            loss = model.train_on_batch(X, labels)
            losses.append(loss)
            if len(losses) % 100 == 0:
                progbar.update(i, values=[("loss", np.mean(losses))])
                losses = []
            samples_seen += len(labels)
    print('Samples seen:', samples_seen)
print("Training completed!")
if save:
def run_preprocessing(texts, data_dir, run_name, min_freq_threshold=10, max_len=100, bad=None, vectors='en_core_web_lg', n_threads=2, token_type='lemma', only_keep_alpha=False, write_every=10000, merge=False): """ This function abstracts the rest of the preprocessing needed to run Lda2Vec in conjunction with the NlpPipeline. :param texts: (list[str]) list of text :param data_dir: (str) directory where data is held :param run_name: (str) Named of directory created to hold preprocessed data :param min_freq_threshold: (int, optional) If words occur less frequently than this threshold, then purge them from the docs :param max_len: (int, optional) Length to pad/cut off sequences :param bad: (list|set, optional) words to filter out of dataset :param vectors: (str) Name of vectors to load from spacy, e.g. ["en", "en_core_web_sm"] :param n_threads: (int, optional) Number of threads used in spacy pipeline :param token_type: (str, optional) Type of tokens to keep, one of ["lemma", "lower", "orth"] :param only_keep_alpha: (bool, optional) Only keep alpha characters :param write_every: (int, optional) Number of documents' data to store before writing cache to skipgrams file :param merge: (bool, optional) Merge noun phrases :return: """ if bad is None: bad = [] def clean(line): return ' '.join(w for w in line.split() if not any(tk in w for tk in bad)) # Location for preprocessed data to be stored out_path = data_dir / run_name if not os.path.exists(out_path): # Make directory to save data in os.makedirs(out_path) # Remove tokens with these substrings bad = set(bad) # Preprocess data # Convert to unicode (spacy only works with unicode) texts = [str(clean(d)) for d in texts] texts = [t for t in texts if t] # remove empty lines after cleaning # Process the text, no file because we are passing in data directly p = NlpPipeline(None, max_len, texts=texts, n_threads=n_threads, only_keep_alpha=only_keep_alpha, token_type=token_type, vectors=vectors, merge=merge) # Computes the embed matrix along with other variables p.compute_embed_matrix() print('Convert data to word2vec indices') p.convert_data_to_word2vec_indices() print('Trim zeros') p.trim_zeros_from_idx_data() # Extract the length of each document (needed for pyLDAvis) doc_lengths = [len(x) for x in p.idx_data] # Find the cutoff index cutoff = 0 for i, freq in enumerate(p.freqs): if freq < min_freq_threshold: cutoff = i break # Then cutoff the embed matrix embed_matrix = p.embed_matrix[:cutoff] # Also replace all tokens below cutoff in `idx_data` for i in range(len(p.idx_data)): p.idx_data[i][p.idx_data[i] > cutoff - 1] = 0 # Next cut off the frequencies freqs = p.freqs[:cutoff] print('Convert to skipgrams') data = [] n_examples = p.idx_data.shape[0] # Sometimes docs can be less than the required amount for # the skipgram function. So we must manually make a counter # instead of relying on the enumerated index (i). doc_id_counter = 0 # Additionally, we will keep track of these lower level docs # and will purge them later. 
purged_docs = [] for i, t in enumerate(p.idx_data): pairs, _ = skipgrams(t, vocabulary_size=p.vocab_size, window_size=5, shuffle=True, negative_samples=0) # Pairs will be 0 if document is less than 2 indexes if len(pairs) > 2: for pair in pairs: temp_data = pair # Appends doc ID temp_data.append(doc_id_counter) # Appends document index temp_data.append(i) data.append(temp_data) doc_id_counter += 1 else: purged_docs.append(i) if i // write_every: temp_df = pd.DataFrame(data) temp_df.to_csv(out_path / 'skipgrams.txt', sep='\t', index=False, header=None, mode='a') del temp_df data = [] if i % 500 == 0: print('step', i, 'of', n_examples) temp_df = pd.DataFrame(data) temp_df.to_csv(out_path / 'skipgrams.txt', sep='\t', index=False, header=None, mode='a') del temp_df # Save embed matrix np.save(out_path / 'embed_matrix', embed_matrix) # Save the doc lengths to be used later # Also purge those that didnt make it into skipgram function np.save(out_path / 'doc_lengths', np.delete(doc_lengths, np.array(purged_docs))) # Save frequencies to file np.save(out_path / 'freqs', freqs) # Save vocabulary dictionary to file with open(out_path / 'idx_to_word.pkl', 'wb') as f: pickle.dump(p.idx_to_word, f) with open(out_path / 'word_to_idx.pkl', 'wb') as f: pickle.dump(p.word_to_idx, f)
def sequence_skipgrams():
    couples, labels = sequence.skipgrams([0, 1, 2, 3], vocabulary_size=4,
                                         window_size=2)
    print(couples)
    print(labels)
def main(): scoreList = [0.0,0.0] with open('data/info.json') as j: info = ujson.load(j) for problem in os.listdir('data'): greek=False if problem.startswith('problem'): truthPath = 'data/truth/'+problem+'/clustering.json' with open(truthPath) as t: truth = ujson.load(t) print(problem) probTokList = [] docList = [] docDict = {} X=[] Y=[] path = 'data/' + problem for entry in info: if entry["folder"] == problem: lang=entry["language"] if entry["language"] == "gr": greek=True CV = CountVectorizer(input='filename', strip_accents='unicode', analyzer='word', ngram_range=(1,4)) docs = [path+'/'+x for x in os.listdir(path)] cMatrix = CV.fit_transform(docs) for doc in os.listdir(path): docTokList = [] with open(path + '/' + doc) as d: article = d.readlines() for sent in article: sentTokList = [] for word in sent.split(): for token in word: procToken = preprop(token,greek) sentTokList.append(procToken) #Every item of the list is a normalized character docTokList.append(' '.join(sentTokList))#Every item of the list is a sentence probTokList.append(' '.join(docTokList))#Every item of the list is a document docList.append(doc) tokenizer = text.Tokenizer(nb_words=None,filters=text.base_filter(),lower=True,split=" ") tokenizer.fit_on_texts(probTokList) seqList = tokenizer.texts_to_sequences(probTokList) uniqueTokens = max([max(x) for x in seqList]) print(uniqueTokens,lang) sampling_table = sequence.make_sampling_table(uniqueTokens+1) for i,seq in enumerate(seqList): x, y = sequence.skipgrams(seq, uniqueTokens, window_size=4, negative_samples=1.0, categorical=False, sampling_table=sampling_table) x = zip(x, y) X.append(x) #Y.extend(y) docDict[docList[i]] = seq strX=[str(x) for x in X] xTokenizer = text.Tokenizer(nb_words=None,filters=text.base_filter(),lower=True,split=" ") xTokenizer.fit_on_texts(strX) #docMatrix = tokenizer.sequences_to_matrix(seqList,mode="tfidf") docMatrix = xTokenizer.sequences_to_matrix(strX,mode="tfidf") #scores = embedNN(X,Y) pairs = combinations(docDict.keys(),2) cList = [] nnDict = {} for cluster in truth: cPairs = [] if len(cluster) > 1: for item in cluster: cPairs.append(str(item["document"])) cList.extend(list(permutations(cPairs,2))) for pair in pairs: match = False if pair in cList: match = True nnDict[pair] = match for i, doc in enumerate(docMatrix): docDict[docList[i]] = doc truthCounter = Counter(nnDict.values()) baseline = 1-float(truthCounter[True])/float(len(nnDict)) print("Baseline for {} is {}".format(problem, baseline)) clusterCount = Counter() kmclusters = False # Change to False for meanshift if kmclusters: pbar = ProgressBar() for nclusters in pbar(reversed(range(len(docMatrix)-1))): #print("{} Clusters".format(nclusters+1)) clusters = KMclusterer(nclusters+1,cMatrix) for c in range(nclusters+1): #print(c,"has:",[i for i,x in enumerate(clusters) if x == c]) for clusterpair in list(combinations([i for i,x in enumerate(clusters) if x == c],2)): combo = (docList[clusterpair[0]],docList[clusterpair[1]]) clusterCount[combo] +=1 else: clusters = KMclusterer(int(len(docMatrix)*0.67),docMatrix) #clusters = MSclusterer(cMatrix)#cMatrixdocMatrix for clusterpair in list(combinations([i for i,x in enumerate(clusters)],2)): combo = (docList[clusterpair[0]],docList[clusterpair[1]]) clusterCount[combo] +=1 x = 0.0 scoreList[0] += truthCounter[True] deleteList = [] #print("Most common cluster is in {}%".format((float(clusterCount.most_common(20)[19][1])/len(docMatrix))*100)) for combo in nnDict.keys(): if combo not in clusterCount.keys(): deleteList.append(combo) y = 0.0 for 
item in deleteList: if item in cList: y+=1 del nnDict[item] scores = sharedNN(docDict, nnDict) print("Deleted pairs are {}% of total correct pairs, {}% of deleted pairs was wrongly deleted".format(round(y/len(cList)*100.0,2), round(y/len(deleteList)*100.0,2))) for combo in clusterCount.most_common(20): if combo[0] in cList: x += 1 scoreList[1] += 1 print("prec: {}".format(x/20)) #print("Document score is {} clusters correct out of {} (accuracy {})".format(x, truthCounter[True], x/truthCounter[True])) #print("prec: {} \nrec: {}".format(x/20, x/len(nnDict.values()))) #print("Total precision is {}, {} clusters correct".format(scoreList[1]/scoreList[0], scoreList[1])) if not os.path.exists('answers/'+problem): os.mkdir('answers/'+problem) clusDict = defaultdict(list) rankDict = defaultdict(list) for i, cluster in enumerate(list(clusters)): clusDict[cluster] .append({"document": docList[i]}) rankDict[cluster] .append(docList[i]) with open('answers/'+problem+'/clustering.json', "w") as jsonFile: ujson.dump(list(clusDict.values()), jsonFile, indent=4) rankList = [] for value in rankDict.values(): if len(value) > 1 : pairs = combinations(value,2) for pair in pairs: rankList.append({"document1": pair[0], "document2": pair[1], "score": scores[pair][0]}) with open('answers/'+problem+'/ranking.json', "w") as jsonFile: ujson.dump(rankList, jsonFile, indent=4)
def main(dname, encode_dir, raw_dir, odir='./resources/skipgrams/', mode='local'): # load corpus data raw_corpus = pd.read_csv(raw_dir + dname + '.tsv', sep='\t') # load user data user_idx = json.load(open(raw_dir + 'user_idx.json')) user_info = dict() user_control = set() # control if renew user_info sample method with open(encode_dir + 'users.json') as dfile: for line in dfile: line = json.loads(line) user_info[line['uid']] = line user_info[line['uid']]['count'] = 0 # load tokenizer tok = pickle.load(open(encode_dir + dname + '.tkn', 'rb')) params = { 'window': 5, 'vocab_size': tok.num_words, 'user_size': len(user_info) + 1, # +1 for unknown 'emb_dim': 300, 'word_emb_path': './resources/word_emb.npy', 'user_emb_path': './resources/user_emb.npy', 'word_emb_train': True, 'user_emb_train': True, 'user_task_weight': 1, 'word_task_weight': 1, 'epochs': 5, 'optimizer': 'adam', 'lr': 1e-5, } word_sampling_table = make_sampling_table(size=params['vocab_size']) ww_model, uw_model = build_model(params) print() print(params) for epoch in range(params['epochs']): loss = 0 # shuffle the data raw_corpus = raw_corpus.sample(frac=1).reset_index(drop=True) for step, entry in raw_corpus.iterrows(): '''word info, ww: word-word''' encode_doc = tok.texts_to_sequences([entry.text]) ww_pairs, ww_labels = skipgrams( sequence=encode_doc[0], vocabulary_size=params['vocab_size'], window_size=params['window']) word_pairs = [np.array(x) for x in zip(*ww_pairs)] ww_labels = np.array(ww_labels, dtype=np.int32) '''user info, uw: user-word''' cur_user = user_info[entry.uid] if mode == 'local': uw_pairs, uw_labels = utils.user_word_sampler( uid=cur_user['uid_encode'], sequence=encode_doc[0], vocab_size=params['vocab_size'], filter_words=set(cur_user['words']), negative_samples=1) uw_pairs = [np.array(x) for x in zip(*uw_pairs)] uw_labels = np.array(uw_labels, dtype=np.int32) elif mode == 'decay': decay_num = utils.sample_decay(cur_user['count']) if decay_num > np.random.random(): uw_pairs, uw_labels = utils.user_word_sampler( uid=cur_user['uid_encode'], sequence=set(cur_user['words']), vocab_size=params['vocab_size'], negative_samples=1) uw_pairs = [np.array(x) for x in zip(*uw_pairs)] uw_labels = np.array(uw_labels, dtype=np.int32) user_info[entry.uid]['count'] += 1 user_control.add(entry.uid) else: uw_pairs = None uw_labels = None if len(user_control) >= len(user_info) - 10: # restart the control for sampling for uid in user_info: user_info[uid]['count'] = 0 user_control.clear() elif mode == 'global': uw_pairs, uw_labels = utils.user_word_sampler( uid=cur_user['uid_encode'], sequence=set(cur_user['words']), vocab_size=params['vocab_size'], negative_samples=1) uw_pairs = [np.array(x) for x in zip(*uw_pairs)] uw_labels = np.array(uw_labels, dtype=np.int32) else: raise ValueError('Mode {} does not exist!'.format(mode)) '''Train''' if word_pairs: loss += ww_model.train_on_batch(word_pairs, ww_labels) if uw_pairs: loss += uw_model.train_on_batch(uw_pairs, uw_labels) loss_avg = loss / step if step % 100 == 0: print('Epoch: {}, Step: {}'.format(epoch, step)) print('\tLoss: {}.'.format(loss_avg)) print('-------------------------------------------------') # save the model ww_model.save(odir + 'ww_model_{}.h5'.format(epoch)) uw_model.save(odir + 'uw_model_{}.h5'.format(epoch)) # save the word embedding np.save(odir + 'word_{}.npy'.format(epoch), ww_model.get_layer(name='word_emb').get_weights()[0]) # save the user embedding np.save(odir + 'user_{}.npy'.format(epoch), uw_model.get_layer(name='user_emb').get_weights()[0]) # save 
the model ww_model.save(odir + 'ww_model.h5') uw_model.save(odir + 'uw_model.h5') # save the word embedding np.save(odir + 'word.npy', ww_model.get_layer(name='word_emb').get_weights()[0]) # save the user embedding np.save(odir + 'user.npy', uw_model.get_layer(name='user_emb').get_weights()[0])
sampling_table = sequence.make_sampling_table(vocab_size)
for e in range(nb_epoch):
    print('-' * 40)
    print('Epoch', e)
    print('-' * 40)
    progbar = generic_utils.Progbar(tokenizer.document_count)
    samples_seen = 0
    losses = []
    for i, seq in enumerate(
            tokenizer.texts_to_sequences_generator(text_generator())):
        # get skipgram couples for one text in the dataset
        couples, labels = sequence.skipgrams(
            seq, vocab_size, window_size=4, negative_samples=1.,
            sampling_table=sampling_table)
        if couples:
            X1, X2 = zip(*couples)
            X1 = np.array(X1, dtype="int32")
            X2 = np.array(X2, dtype="int32")
            loss = model.train_on_batch([X1, X2], labels)
            losses.append(loss)
            if len(losses) % 100 == 0:
                progbar.update(i, values=[("loss", np.mean(losses))])
                losses = []
            samples_seen += len(labels)
    print('Samples seen:', samples_seen)
print("Training completed!")
if save:
def sg(sentence):
    return seq.skipgrams(sentence, vocab_size,
                         window_size=np.random.randint(window - 1) + 1,
                         negative_samples=nb_negative_samples)
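# Hedged usage sketch for sg() above: `vocab_size`, `window` and
# `nb_negative_samples` are assumed to be defined in the enclosing scope, and
# `sentences` is a hypothetical list of integer-encoded sequences.
all_couples, all_labels = [], []
for sentence in sentences:
    couples, labels = sg(sentence)
    all_couples.extend(couples)
    all_labels.extend(labels)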
    return analogies, pd.Series(idx)


analogies, categories = get_analogies('en')
analogies_id = analogies.apply(lambda x: x.map(token_to_id))
test_set = analogies_id.dropna().astype(int)
a, b, c, actual = test_set.values.T
actual = actual.reshape(-1, 1)
n_analogies = len(actual)

sampling_table = sequence.make_sampling_table(vocab_size)
couples, labels = skipgrams(sequence=data,
                            vocabulary_size=vocab_size,
                            window_size=WINDOW_SIZE,
                            sampling_table=sampling_table,
                            negative_samples=1.0,
                            shuffle=True)
target_word, context_word = np.array(couples, dtype=np.int32).T
labels = np.array(labels, dtype=np.int8)
del couples

with pd.HDFStore(PATH / 'data.h5') as store:
    store.put('id_to_token', pd.Series(id_to_token))
    store.put('analogies', test_set)


def model_graph():
    #### Scalar Input Variables
    input_target = Input((1, ), name='target_input')
def main(): vocabulary_size = 2000 X_train = [] y_train = [] train, test = load_card_data() corpus = [t[2] for t in train] tokenizer = MTGTokenizer(nb_words=vocabulary_size, filters=None, lower=True, split=" ") tokenizer.fit_on_texts(corpus) train_tokens = tokenizer.texts_to_sequences(corpus) for token in train_tokens: couples, labels = skipgrams(token, vocabulary_size) X_train += couples y_train += labels """ couples is a list of 2-elements lists of int: [word_index, other_word_index]. labels is a list of 0 and 1, where 1 indicates that other_word_index was found in the same window as word_index, and 0 indicates that other_word_index was random. """ X_train = np.asarray(X_train) y_train = np.asarray(y_train) embedding_size = 256 model = Sequential() model.add(Embedding(vocabulary_size+1, 256)) model.add(SimpleRNN(128, return_sequences=False)) model.add(Dense(1, activation="sigmoid")) model.compile(optimizer="Adam", loss="binary_crossentropy") # model.load_weights("mtgW2V.mdl") # embedding_weights = model.layers[0].get_weights()[0] model.fit(X_train, y_train, nb_epoch=5, batch_size=1024) model.save_weights("mtgW2V.mdl") return from scipy.spatial.distance import cosine embedding_dict = tokenizer.word_index top5 = [] for word1 in embedding_dict: print word1 top5 = [] bottom5 = [] scores = [] for word2 in embedding_dict: score = 1 - cosine(embedding_weights[embedding_dict[word1]], embedding_weights[embedding_dict[word2]]) if score > min(top5): top5.append(word2) if score < max(bottom5): bottom5.append(word2) while True: try: word1 = raw_input("What is the first word you want to compare?") word2 = raw_input("What is the second word you want to compare?") print embedding_dict[word1], embedding_dict[word2] #1 - gets us similarity instead of distance print 1-cosine(embedding_weights[embedding_dict[word1]], embedding_weights[embedding_dict[word2]]) except KeyError: pass except IndexError: pass
sampling_table = make_sampling_table(vocab_size)

for i in range(n_epochs):
    loss = 0
    for seq in tokenizer.texts_to_sequences_generator(text_generator()):
        # generate skip-gram training examples
        # - `couples` consists of the pivots (i.e. target words) and surrounding contexts
        # - `labels` represent if the context is true or not
        # - `window_size` determines how far to look between words
        # - `negative_samples` specifies the ratio of negative couples
        #   (i.e. couples where the context is false)
        #   to generate with respect to the positive couples;
        #   i.e. `negative_samples=4` means "generate 4 times as many negative samples"
        couples, labels = skipgrams(seq, vocab_size, window_size=5,
                                    negative_samples=4,
                                    sampling_table=sampling_table)
        if couples:
            pivot, context = zip(*couples)
            pivot = np.array(pivot, dtype='int32')
            context = np.array(context, dtype='int32')
            labels = np.array(labels, dtype='int32')
            loss += model.train_on_batch([pivot, context], labels)
    print('epoch %d, %0.02f' % (i, loss))

###
embeddings = model.get_weights()[0]
###
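# Rough standalone illustration (not part of the loop above) of how
# `negative_samples` shifts the label balance: with negative_samples=4,
# roughly four negative pairs are generated per positive pair.
from keras.preprocessing.sequence import skipgrams

_, demo_labels = skipgrams(list(range(10)), vocabulary_size=10,
                           window_size=5, negative_samples=4)
print(sum(demo_labels), len(demo_labels) - sum(demo_labels))  # positives vs negatives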
# -*- coding: utf-8 -*-
from keras.preprocessing.text import *
from keras.preprocessing.sequence import skipgrams

text = "I love green eggs and ham ."
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

word2id = tokenizer.word_index
id2word = {v: k for k, v in word2id.items()}
wids = [word2id[w] for w in text_to_word_sequence(text)]

pairs, labels = skipgrams(wids, len(word2id))
print(len(pairs), len(labels))
for i in range(10):
    print("({:s} ({:d}), {:s} ({:d})) -> {:d}".format(
        id2word[pairs[i][0]], pairs[i][0],
        id2word[pairs[i][1]], pairs[i][1],
        labels[i]))
def process(args): print "Loading graph..." if args.format == "adjlist": G = graph.load_adjacencylist(args.input, undirected=args.undirected) elif args.format == "edgelist": G = graph.load_edgelist(args.input, undirected=args.undirected) elif args.format == "mat": G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected) else: raise Exception("Unknown file format: '%s'. Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format) print("Number of nodes: {}".format(len(G.nodes()))) num_walks = len(G.nodes()) * args.number_walks print("Number of walks: {}".format(num_walks)) data_size = num_walks * args.walk_length print("Data size (walks*length): {}".format(data_size)) if data_size < args.max_memory_data_size: #print("Walking...") #walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks, # path_length=args.walk_length, alpha=0, rand=random.Random(args.seed)) print("Training...") max_features = len(G.nodes()) # vocabulary size dim_proj = args.representation_size # embedding space dimension nb_epoch = 1 # number of training epochs # Neural network ( in Keras ) model = Sequential() model.add(WordContextProduct(max_features, proj_dim=dim_proj, init="uniform")) model.compile(loss='mse', optimizer='rmsprop') sampling_table = sequence.make_sampling_table(max_features) print("Fitting tokenizer on walks...") tokenizer = text.Tokenizer(nb_words=max_features) print "Epochs: %d" % nb_epoch #tokenizer.fit_on_texts( build_deepwalk_corpus_minibatch_iter(G, args.number_walks, args.walk_length)) for e in range(nb_epoch): print('-'*40) print('Epoch', e) print('-'*40) #progbar = generic_utils.Progbar(tokenizer.document_count) samples_seen = 0 losses = [] # for i, seq in enumerate(tokenizer.texts_to_sequences_generator( build_deepwalk_corpus_minibatch_iter(G, args.number_walks, args.walk_length) )): for i, seq in enumerate( build_deepwalk_corpus_minibatch_iter(G, args.number_walks, args.walk_length) ): # get skipgram couples for one text in the dataset couples, labels = sequence.skipgrams(seq, max_features, window_size=5, negative_samples=1., sampling_table=sampling_table) if couples: # one gradient update per sentence (one sentence = a few 1000s of word couples) X = np.array(couples, dtype="int32") print "Started fitting..." loss = model.fit(X, labels) print "Dumping..." # Dump weights to a temp file weights = model.layers[0].get_weights()[0] norm_weights = np_utils.normalize(weights) # TODO: save weights with indices np.savetxt( args.output, norm_weights ) losses.append(loss) if len(losses) % 100 == 0: # progbar.update(i, values=[("loss", np.mean(losses))]) losses = [] samples_seen += len(labels) print('Samples seen:', samples_seen) print("Training completed!") else: print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk.".format(data_size, args.max_memory_data_size)) print("Walking...") #TODO: IMPLEMENT THAT print "Not implemented yet..." sys.exit(1) print "Optimization done. Saving..." # recover the embedding weights trained with skipgram: weights = model.layers[0].get_weights()[0] # we no longer need this del model norm_weights = np_utils.normalize(weights) # TODO: save weights with indices np.savetxt( args.output, norm_weights ) print "Saved!"
data_str = "".join(data)
# Split the raw text into sentences
sents = nltk.sent_tokenize(data_str)
# Tokenizer() turns the text into a sequence of tokens
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sents)
# Build the word <-> id lookup tables
word2index = tokenizer.word_index
word2index[0] = "UNK"
index2word = {value: key for key, value in word2index.items()}

# Build the training and test sets
wids = [word2index[w] for w in text_to_word_sequence(data_str)]
# Sample the text with skipgrams; window_size is half a window, i.e.
# [center_word - window_size, center_word + window_size + 1]
pairs, labels = skipgrams(wids, vocabulary_size=len(word2index),
                          window_size=1, negative_samples=1)
pairs, labels = np.array(pairs), np.array(labels)
x_train, x_test, y_train, y_test = train_test_split(pairs, labels,
                                                    test_size=0.3)

# Build the model
voca_size = len(word2index)
embed_size = 300
input_word = Input(shape=(2,), name="input_1")
embedding = Embedding(input_dim=voca_size, output_dim=embed_size,
                      input_length=2,
                      embeddings_initializer="glorot_uniform",
                      name="embedding_1")(input_word)
lambda_dot = Lambda(lambda x: K.prod(x, axis=1),
                    output_shape=(embed_size,))(embedding)
dense1 = Dense(units=1, activation="sigmoid", name="dense_1")(lambda_dot)
model = Model(inputs=input_word, outputs=dense1)
model.summary()
model.compile(optimizer="adam", loss="mse")
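# Hedged training sketch for the model above: x_train holds (target, context)
# index pairs of shape (N, 2) produced by skipgrams; batch_size and the epoch
# count are illustrative assumptions, not values from the original snippet.
model.fit(x_train, y_train, batch_size=256, epochs=5,
          validation_data=(x_test, y_test))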
skip_gram = build_sg()
skip_gram.compile(loss='binary_crossentropy', optimizer='adam')
skip_gram.summary()

# Training
nb_epochs = 5
t_start = time()
t0 = t_start
for epoch in range(nb_epochs):
    print("Epoch %d:" % (epoch + 1))
    for i, review in enumerate(revs):
        # skipgram function helps build data/labels with positive/negative samples
        data, labels = skipgrams(sequence=review, vocabulary_size=vocab_size,
                                 window_size=2, negative_samples=5.)
        x = [np.array(x) for x in zip(*data)]
        y = np.array(labels, dtype=np.int32)
        # skip_gram.fit(x, y, batch_size=256, nb_epoch=1)
        loss = skip_gram.train_on_batch(x, y)
        if i % 1000 == 0:
            print("training %d, with loss %.6f, elapsed time %.3fs." % (
                i, loss, time() - t0))
            t0 = time()
        # print("\n-------training %d, elapsed time %.3fs.\n" % (i, time() - t0))
    print("Loss %d, and elapsed time %.3fs for the epoch." % (
        loss, time() - t_start))

# See the results
embeddings = skip_gram.get_weights()[0]
for each_word in each_words:
    if each_word not in word2idx:
        word2idx[each_word] = 1
        idx2word.append(each_word)
    else:
        word2idx[each_word] += 1

sents_as_ids = list()
count_num = 0
for each_words in normalized_corpus:
    sents_as_ids.append([])
    for each_word in each_words:
        index = idx2word.index(each_word)
        sents_as_ids[count_num].append(index)
    count_num += 1

print('\nSample word2idx: ', list(word2idx.items())[:10])
print(len(word2idx))
print(len(idx2word))
print('\nSample word2idx: ', list(word2idx.items())[:10])
print('\nSample normalized corpus:', normalized_corpus[:3])
print('\nAbove sentence as a list of ids:', sents_as_ids[:3])
print(len(sents_as_ids))

# training
from keras.preprocessing.sequence import skipgrams

skip_grams = [skipgrams(sent, vocabulary_size=vocab_size, window_size=5)
              for sent in sents_as_ids]
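# Hedged follow-up sketch: flattening the per-sentence skip_grams list built
# above into numpy arrays suitable for batch training. The variable names
# below are illustrative and not from the original snippet.
import numpy as np

pair_first, pair_second, pair_labels = [], [], []
for sent_pairs, sent_labels in skip_grams:
    for (w1, w2), lab in zip(sent_pairs, sent_labels):
        pair_first.append(w1)
        pair_second.append(w2)
        pair_labels.append(lab)
pair_first = np.array(pair_first, dtype="int32")
pair_second = np.array(pair_second, dtype="int32")
pair_labels = np.array(pair_labels, dtype="int32")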
def run_preprocessing(texts, data_dir, run_name, min_freq_threshold=10, max_length=100, bad=[], vectors="en_core_web_lg", num_threads=2, token_type="lemma", only_keep_alpha=False, write_every=10000, merge=False): """This function abstracts the rest of the preprocessing needed to run Lda2Vec in conjunction with the NlpPipeline Parameters ---------- texts : TYPE Python list of text data_dir : TYPE directory where your data is held run_name : TYPE Name of sub-directory to be created that will hold preprocessed data min_freq_threshold : int, optional If words occur less frequently than this threshold, then purge them from the docs max_length : int, optional Length to pad/cut off sequences bad : list, optional List or Set of words to filter out of dataset vectors : str, optional Name of vectors to load from spacy (Ex. "en", "en_core_web_sm") num_threads : int, optional Number of threads used in spacy pipeline token_type : str, optional Type of tokens to keep (Options: "lemma", "lower", "orth") only_keep_alpha : bool, optional Only keep alpha characters write_every : int, optional Number of documents' data to store before writing cache to skipgrams file merge : bool, optional Merge noun phrases or not """ def clean(line): return ' '.join(w for w in line.split() if not any(t in w for t in bad)) # Location for preprocessed data to be stored file_out_path = data_dir + "/" + run_name if not os.path.exists(file_out_path): # Make directory to save data in os.makedirs(file_out_path) # Remove tokens with these substrings bad = set(bad) # Preprocess data # Convert to unicode (spaCy only works with unicode) texts = [str(clean(d)) for d in texts] # Process the text, no file because we are passing in data directly SP = NlpPipeline(None, max_length, texts=texts, num_threads=num_threads, only_keep_alpha=only_keep_alpha, token_type=token_type, vectors=vectors, merge=merge) # Computes the embed matrix along with other variables SP._compute_embed_matrix() print("converting data to w2v indexes") # Convert data to word2vec indexes SP.convert_data_to_word2vec_indexes() print("trimming 0's") # Trim zeros from idx data SP.trim_zeros_from_idx_data() # This extracts the length of each document (needed for pyldaviz) doc_lengths = [len(x) for x in SP.idx_data] # Find the cutoff idx for i, freq in enumerate(SP.freqs): if freq < min_freq_threshold: cutoff = i break # Then, cut off the embed matrix embed_matrix = SP.embed_matrix[:cutoff] # Also, replace all tokens below cutoff in idx_data for i in range(len(SP.idx_data)): SP.idx_data[i][SP.idx_data[i] > cutoff - 1] = 0 # Next, cut off the frequencies freqs = SP.freqs[:cutoff] print("converting to skipgrams") data = [] num_examples = SP.idx_data.shape[0] # Sometimes docs can be less than the required amount for # the skipgram function. 
So, we must manually make a counter # instead of relying on the enumerated index (i) doc_id_counter = 0 # Additionally, we will keep track of these lower level docs # and will purge them later purged_docs = [] for i, t in enumerate(SP.idx_data): pairs, _ = skipgrams(t, vocabulary_size=SP.vocab_size, window_size=5, shuffle=True, negative_samples=0) # Pairs will be 0 if document is less than 2 indexes if len(pairs) > 2: for pair in pairs: temp_data = pair # Appends doc ID temp_data.append(doc_id_counter) # Appends document index temp_data.append(i) data.append(temp_data) doc_id_counter += 1 else: purged_docs.append(i) if i // write_every: temp_df = pd.DataFrame(data) temp_df.to_csv(file_out_path + "/skipgrams.txt", sep="\t", index=False, header=None, mode="a") del temp_df data = [] if i % 500 == 0: print("step", i, "of", num_examples) temp_df = pd.DataFrame(data) temp_df.to_csv(file_out_path + "/skipgrams.txt", sep="\t", index=False, header=None, mode="a") del temp_df # Save embed matrix np.save(file_out_path + "/embed_matrix", embed_matrix) # Save the doc lengths to be used later, also, purge those that didnt make it into skipgram function np.save(file_out_path + "/doc_lengths", np.delete(doc_lengths, np.array(purged_docs))) # Save frequencies to file np.save(file_out_path + "/freqs", freqs) # Save vocabulary dictionaries to file idx_to_word_out = open(file_out_path + "/" + "idx_to_word.pickle", "wb") pickle.dump(SP.idx_to_word, idx_to_word_out) idx_to_word_out.close() word_to_idx_out = open(file_out_path + "/" + "word_to_idx.pickle", "wb") pickle.dump(SP.word_to_idx, word_to_idx_out) word_to_idx_out.close()
vector_dim = 300
# Number of epochs
epochs = 2

# For validation to monitor performance
# pick 16 words
valid_size = 16
# Pick from the top 100 words
valid_window = 100
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

# skip-gram function
sampling_table = sequence.make_sampling_table(vocab_size)
# target and context words are returned in 'couples' and labels indicates if
# the pair is a positive or negative sample
couples, labels = skipgrams(data, vocab_size, window_size=window_size,
                            sampling_table=sampling_table)
word_target, word_context = zip(*couples)
word_target = np.array(word_target, dtype="int32")
word_context = np.array(word_context, dtype="int32")
print(couples[:10], labels[:10])

# The inputs will be the target word and the context word
input_target = Input((1,))
input_context = Input((1,))

# The inputs will feed into an embedding layer
# Number of rows = vocabulary size
# Number of columns = dimension of the vector
embedding = Embedding(vocab_size, vector_dim, input_length=1, name='embedding')
# The target and context words
target = embedding(input_target)
from __future__ import print_function

from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import skipgrams

text = "I love green eggs and ham ."

tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

word2id = tokenizer.word_index
id2word = {v: k for k, v in word2id.items()}
wids = [word2id[w] for w in text_to_word_sequence(text)]

pairs, labels = skipgrams(wids, len(word2id), window_size=1)
print(len(pairs), len(labels))
for i in range(10):
    print("({:s} ({:d}), {:s} ({:d})) -> {:d}".format(
        id2word[pairs[i][0]], pairs[i][0],
        id2word[pairs[i][1]], pairs[i][1],
        labels[i]))
# create dataset: word pairs and doc ids with positive and negative samples
window_size = 2
targets = []
contexts = []
labels = []
couples = []
doc_ids = []
for i in range(0, n_documents):
    if i % 1000 == 0 and i > 0:
        print(i)
    seq = sequences[i]
    sampling_table = sequence.make_sampling_table(vocab_size)
    couple, label = skipgrams(seq, vocab_size, window_size=window_size,
                              sampling_table=sampling_table)
    if not couple:
        continue  # was a bare `next`, which is a no-op; skip empty documents
    try:
        target, context = zip(*couple)
        targets = targets + list(target)
        contexts = contexts + list(context)
        doc_ids = doc_ids + [i] * len(context)
        labels = labels + label
        couples = couples + couple
    except Exception:
        print("Error on " + str(seq))

data_target = np.array(targets, dtype='int32')
data_context = np.array(contexts, dtype='int32')
def main(): list_of_words, words, word_count = readDataBible() ''' list_of_words contains words in the order in which they appear. As it is word_count is the number of unique words in the vocabulary words is a dictionary. Key is word and value is the number of occurances of that word ''' embedding_size = 64 batch_size = 64 window_size = 5 vocabulary_size = word_count dec_word_list = sorted(words.items(), key=operator.itemgetter(1), reverse=True) dic = {} for word in dec_word_list: dic[word[0]] = word[1] reverse_dic = {} for word in dic: reverse_dic[dic[word]] = word ''' print(vocabulary_size) print(dec_word_list) print(dic) print(reverse_dic) ''' for i in range(len(list_of_words)): list_of_words[i] = dic[list_of_words[i]] sampling_table = sequence.make_sampling_table(vocabulary_size) couples, labels = skipgrams(list_of_words, vocabulary_size, window_size=window_size, sampling_table=sampling_table) word_target, word_context = zip(*couples) word_target = np.array(word_target, dtype="int32") word_context = np.array(word_context, dtype="int32") print(couples[:10], labels[:10]) input_target = Input((1, )) input_context = Input((1, )) embedding = Embedding(vocabulary_size, embedding_size, input_length=1, name='embedding') target = embedding(input_target) target = Reshape((embedding_size, 1))(target) context = embedding(input_context) context = Reshape((embedding_size, 1))(context) # now perform the dot product operation to get a similarity measure dot_product = merge([target, context], mode='dot', dot_axes=1) dot_product = Reshape((1, ))(dot_product) # add the sigmoid output layer output = Dense(1, activation='sigmoid')(dot_product) # create the primary training model model = Model(input=[input_target, input_context], output=output) model.compile(loss='binary_crossentropy', optimizer='rmsprop') # setup a cosine similarity operation which will be output in a secondary model similarity = merge([target, context], mode='cos', dot_axes=0) # create a secondary validation model to run our similarity checks during training validation_model = Model(input=[input_target, input_context], output=similarity)
    if i > 50000:
        break
    vocabulary += list(item)
    texts.append(item)

vocabulary = set(vocabulary)
vocab_size = len(vocabulary)
print('vocabulary size: ', vocab_size)

word2id, id2word = {}, {}
for idx, word in enumerate(vocabulary):
    word2id[word] = idx
    id2word[idx] = word

text_seq = [word2id[word] for text in texts for word in text]
X_train, y_train = skipgrams(text_seq, len(word2id), window_size=4)
X_train, y_train = np.array(X_train), np.array(y_train)
print('show 5 training examples...')
for i in range(5):
    print(X_train[i], y_train[i])


def build_skipgram_model():
    input1 = Input(shape=(1, ))
    embeddings1 = Embedding(vocab_size, embedding_dims)(input1)
    output1 = Flatten()(embeddings1)

    input2 = Input(shape=(1, ))
    embeddings2 = Embedding(vocab_size, embedding_dims)(input2)
    output2 = Flatten()(embeddings2)
def my_test_skipgrams():
    couples, labels = skipgrams([0, 1, 2, 3], vocabulary_size=4)
    print("couples: ", couples)
    print("labels: ", labels)
# Convert text to numerical sequences
# Note that the Tokenizer starts numbering words with 1. So we have
# vocabulary_size+1 words. The 0-th word is considered to be the
# 'Out-of-vocabulary' token.
tokenizer = Tokenizer(num_words=vocabulary_size + 1,
                      filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~ ',
                      lower=True, split=' ')
tokenizer.fit_on_texts(animal_corpus)
sequences = tokenizer.texts_to_sequences(animal_corpus)

# Generate (target, context) pairs with negative sampling
pairs = []
labels = []
for this_sequence in sequences:
    # Again note the vocabulary_size+1 expression
    c, l = skipgrams(this_sequence, vocabulary_size + 1,
                     window_size=WINDOW_SIZE, negative_samples=1,
                     shuffle=True)
    for i in range(len(c)):
        pairs.append(c[i])
        labels.append(l[i])

pairs = np.array(pairs)
labels = np.array(labels)
print("There are {} (context,target) pairs in the dataset".format(len(pairs)))

from keras.layers import Embedding, Input, Dense, Reshape
from keras.layers.merge import Dot
from keras.models import Model
from keras.optimizers import RMSprop
    return corpus_encoded, dict_3, dict_2


corpus_encoded, reverse_dict, corpus_dict = format_corpus(corpus, corpus_length)

window_size = 3
vector_dim = 600
epochs = 2000000

print(len(corpus_encoded), len(corpus))
sampling_table = sequence.make_sampling_table(corpus_length)
couples, labels = sequence.skipgrams(corpus_encoded, corpus_length,
                                     window_size=window_size,
                                     sampling_table=sampling_table)
word_target, word_context = zip(*couples)
word_target = np.array(word_target, dtype="int32")
word_context = np.array(word_context, dtype="int32")
# print(couples[:10], labels[:10])

from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Reshape
from tensorflow.keras.layers import dot
from tensorflow.keras.layers import multiply
from tensorflow.keras.layers import subtract
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Model
def main(): """ Main function documentation template :return: None :rtype: None """ logging.basicConfig(level=logging.DEBUG) # Read in text text_path = '../data/alice_in_wonderland.txt' text = open(text_path, 'r').read() logging.info('Read {} characters from {}'.format(len(text), text_path)) # Change text to sequence of indices vectorizer = transformations.EmbeddingVectorizer() indices = vectorizer.fit_transform([[text]]) vocab_size = numpy.max(indices) + 1 # Change sequence of indices to skipgram training pair and T/F label (E.g. [[project, gutenberg], True] # TODO There must be a better way of getting a 1d array X, y = skipgrams(indices.tolist()[0], vocabulary_size=vocab_size, window_size=4, categorical=True) X = numpy.array(X) y = numpy.array(y) logging.info('X shape: {}, y shape: {}'.format(X.shape, y.shape)) # Create architecture # TODO Should be two separate inputs, rather than a timeseries w/ 2 time steps input_layer = Input(shape=(2, ), name='text_input') x = input_layer x = Embedding(input_dim=vocab_size, output_dim=50, input_length=2, name='text_embedding')(x) x = Flatten()(x) x = Dense(2, activation='softmax', name='output')(x) model = Model(input_layer, x) model.compile(optimizer='Adam', loss='categorical_crossentropy') # Train architecture callbacks = [ TensorBoard( os.path.expanduser('~/.logs/' + str(datetime.datetime.now()))) ] model.fit(X, y, epochs=5, validation_split=.1, callbacks=callbacks, batch_size=2**13) embedding = model.get_layer('text_embedding') weights = embedding.get_weights()[0] print(weights) print(weights.shape) print(type(weights)) # Store weights pickle.dump(weights, open('custom_embedding.pkl', 'wb')) pickle.dump(vectorizer.token_index_lookup, open('custom_vocab_index.pkl', 'wb'))
docWords, freq, dictionary, inverseDict = collectDataset()
windowSize, vectorDim, epochs = 5, 300, 70000
valSize, valWindow = 20, 120  # 5, 10
valExamples = np.random.choice(valWindow, valSize, replace=False)

wordTargetArray = np.zeros((1, ))
wordContextArray = np.zeros((1, ))
labelsArray = np.zeros((1, ))

sampleTable = make_sampling_table(vocabSize)
pairs, labels = skipgrams(docWords, vocabSize, window_size=windowSize,
                          sampling_table=sampleTable)
wordTarget, wordContext = zip(*pairs)
wordTarget = np.array(wordTarget, dtype="int32")
wordContext = np.array(wordContext, dtype="int32")

inputTarget, inputContext = Input((1, )), Input((1, ))
embedding = Embedding(vocabSize, vectorDim, input_length=1, name='embedding')
target = embedding(inputTarget)
target = Reshape((vectorDim, 1))(target)
context = embedding(inputContext)
context = Reshape((vectorDim, 1))(context)

similarity = merge([target, context], mode='cos', dot_axes=0)
dotProduct = merge([target, context], mode='dot', dot_axes=1)
o = Reshape((1, ), input_shape=(1, 1))(o)
o = Activation('sigmoid')(o)

SkipGram = Model(inputs=[w_inputs, c_inputs], outputs=o)
SkipGram.summary()
SkipGram.compile(loss='binary_crossentropy', optimizer='adam')

from math import ceil

batch_size = 256
for _ in range(5):
    loss = 0.
    for i, doc in enumerate(tokenizer.texts_to_sequences(corpus)):
        data, labels = skipgrams(sequence=doc, vocabulary_size=V,
                                 window_size=5, negative_samples=5.)
        x = [np.array(x).reshape(-1, 1) for x in zip(*data)]
        y = np.array(labels, dtype=np.int32)
        if x:
            # batch over the number of pairs (the original used len(x), which
            # is always 2 here) and avoid shadowing the outer loop variable
            for j in range(int(len(x[0]) / batch_size)):
                x_batch = [x[0][j * batch_size:(j + 1) * batch_size],
                           x[1][j * batch_size:(j + 1) * batch_size]]
                loss += SkipGram.train_on_batch(
                    x_batch, y[j * batch_size:(j + 1) * batch_size])
    print(loss)

f = open('vectors_negative_sampling.txt', 'w')
def dumb_word_embedding_shit_idegaf(vocab): window_size = 2 vector_dim = embedding_dim epochs = 50000 global valid_size global valid_examples valid_size = 6 # Random set of words to evaluate similarity on. valid_window = 10 # Only pick dev samples in the head of the distribution. valid_examples = np.random.choice(valid_window, valid_size, replace=False) sampling_table = sequence.make_sampling_table(len(vocab)) print(sampling_table) couples, labels = sequence.skipgrams(data, len(vocab), window_size=window_size, sampling_table=sampling_table) word_target, word_context = zip(*couples) word_target = np.array(word_target, dtype="int32") word_context = np.array(word_context, dtype="int32") print(couples[:10], labels[:10]) # create some input variables input_target = Input((1, )) input_context = Input((1, )) embedding = Embedding(len(vocab), vector_dim, input_length=1, name='embedding') target = embedding(input_target) target = Reshape((vector_dim, 1))(target) context = embedding(input_context) context = Reshape((vector_dim, 1))(context) # setup a cosine similarity operation which will be output in a secondary model similarity = keras.layers.dot([target, context], 1) # now perform the dot product operation to get a similarity measure dot_product = keras.layers.dot([target, context], 1, normalize=True) dot_product = Reshape((1, ))(dot_product) # add the sigmoid output layer output = Dense(1, activation='sigmoid')(dot_product) # create the primary training model model = Model(input=[input_target, input_context], output=output) model.compile(loss='binary_crossentropy', optimizer='rmsprop') # create a secondary validation model to run our similarity checks during training global validation_model validation_model = Model(input=[input_target, input_context], output=similarity) arr_1 = np.zeros((1, )) arr_2 = np.zeros((1, )) arr_3 = np.zeros((1, )) for cnt in range(epochs): idx = np.random.randint(0, len(labels) - 1) arr_1[0, ] = word_target[idx] arr_2[0, ] = word_context[idx] arr_3[0, ] = labels[idx] loss = model.train_on_batch([arr_1, arr_2], arr_3) if cnt % 100 == 0: print("Iteration {}, loss={}".format(cnt, loss)) if cnt % 10000 == 0: sim_cb.run_sim() sim_cb.run_sim() model.save("dumb_embeddings") return model
    return data, count, dictionary, reverse_dictionary


vocab_size = 10000
data, count, dictionary, reverse_dictionary = collect_data(
    vocabulary_size=vocab_size)
print(data[:7])

window_size = 3
vector_dim = 300
epochs = 200000

valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

sampling_table = sequence.make_sampling_table(vocab_size)
couples, labels = skipgrams(data, vocab_size, window_size=window_size,
                            sampling_table=sampling_table)
word_target, word_context = zip(*couples)
word_target = np.array(word_target, dtype="int32")
word_context = np.array(word_context, dtype="int32")
print(couples[:10], labels[:10])

# create some input variables
input_target = Input((1,))
input_context = Input((1,))

embedding = Embedding(vocab_size, vector_dim, input_length=1, name='embedding')
target = embedding(input_target)
target = Reshape((vector_dim, 1))(target)
context = embedding(input_context)
context = Reshape((vector_dim, 1))(context)
stream_gen = stream_generator(args.input, meta=meta, field=args.field,
                              session=True)
context_streamer = SessionContextStreamer(stream_gen)

feature_map = read_feature_map(args.sampling_table)
sampling_table = read_sampling_table(args.sampling_table)

# The original left these two keyword values blank; the variables built just
# above are the likely intended arguments.
skipgram_streamer = SkipGramStreamer(context_streamer,
                                     neg_pairs=args.neg_pairs,
                                     window_size=args.window_size,
                                     feature_map=feature_map,
                                     sampling_table=sampling_table)
batch_streamer = Batch2BatchStreamer(skipgram_streamer,
                                     batch_size=args.batch_size)

sampling_table = preprocessing.read_sampling_table()
num_features = len(sampling_table)

for i, seq in enumerate(streamer):
    # get skipgram couples for one text in the dataset
    # (the original omitted the required vocabulary size; num_features is the
    # likely value)
    couples, labels = sequence.skipgrams(seq, num_features)
    if couples:
        X1, X2 = zip(*couples)
        X1 = np.array(X1, dtype="int32")
        X2 = np.array(X2, dtype="int32")
        loss = model.train_on_batch([X1, X2], labels)
# -*- coding: utf-8 -*-
from keras.preprocessing.text import *
from keras.preprocessing.sequence import skipgrams

text = "I love green eggs and ham ."
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
word2id = tokenizer.word_index
id2word = {v: k for k, v in word2id.items()}
wids = [word2id[w] for w in text_to_word_sequence(text)]
pairs, labels = skipgrams(wids, len(word2id))
print(len(pairs), len(labels))
for i in range(10):
    print("({:s} ({:d}), {:s} ({:d})) -> {:d}".format(id2word[pairs[i][0]],
                                                      pairs[i][0],
                                                      id2word[pairs[i][1]],
                                                      pairs[i][1],
                                                      labels[i]))
wordDict, wordWeights = getPretrainedEmbeddings(wordDict, globalWordTokens,
                                                globalWordVectors)
## Add some noise for context vectors since wordVecs != contextVecs
contextWeights = wordWeights + np.random.uniform()
del globalWordVectors, globalWordTokens

## Convert wordList to list of integers
vocab_size = len(sampleMatrix)
wordListInts = []
for ind, item in enumerate(wordList):
    wordListInts.append(wordDict[item])
del wordList

print('Generating Skipgram pairs for training...will take a while...')
## TODO : Make samplingMatrix realistic for freq-sampling and remove next line
sampleMatrix = None
couples, labels = skipgrams(wordListInts, vocab_size,
                            window_size=int(options.window),
                            negative_samples=int(options.nsamples),
                            shuffle=True, categorical=False,
                            sampling_table=sampleMatrix)

## This part taken from @zachmayer
model_word = Sequential()
model_word.add(Embedding(vocab_size, veclen, input_length=1,
                         weights=(wordWeights,)))
model_word.add(Reshape((1, veclen)))

model_context = Sequential()
model_context.add(Embedding(vocab_size, veclen, input_length=1,
                            weights=(contextWeights,)))
model_context.add(Reshape((1, veclen,)))

model = Sequential()
model.add(Merge([model_word, model_context], mode='dot', dot_axes=2))
model.add(Flatten())
model.add(Dense(1))
                                  n_components=2, init='pca', n_iter=3500,
                                  random_state=32)
embeddings_en_2d = np.array(
    tsne_model_en_2d.fit_transform(
        embedding_clusters.reshape(n * m, k))).reshape(n, m, 2)
tsne_plot_similar_words('Similar words from vocabulary', keys,
                        embeddings_en_2d, word_clusters, 0.7)


class Visualiser(Callback):
    def on_epoch_end(self, epoch, logs=None):
        prep_for_plot()


couples, labels = skipgrams(data, vocabulary_size=data_size, window_size=3,
                            sampling_table=make_sampling_table(data_size))
word_target, word_context = zip(*couples)
word_target, word_context = (np.array(word_target, dtype="int32"),
                             np.array(word_context, dtype="int32"))

model.fit([word_target, word_context], labels, epochs=5,
          callbacks=[Visualiser()])
def train(self, model_config, wids, word2id): vocab_size = len(word2id.values()) sampling_table = sequence.make_sampling_table(vocab_size) wids_flat = [word for sentence in wids for word in sentence] couples, labels = skipgrams(wids_flat, vocab_size, window_size=model_config['window_size'], sampling_table=sampling_table) word_target, word_context = zip(*couples) word_target = np.array(word_target, dtype="int32") word_context = np.array(word_context, dtype="int32") input_target = Input((1,)) input_context = Input((1,)) vector_dim = model_config['number_of_dimensions_in_hidden_layer'] embedding = Embedding(vocab_size, vector_dim, input_length=1, name='embedding') target = embedding(input_target) target = Reshape((vector_dim, 1))(target) context = embedding(input_context) context = Reshape((vector_dim, 1))(context) # setup a cosine similarity operation which will be output in a secondary model similarity = merge([target, context], mode='cos', dot_axes=0) # now perform the dot product operation to get a similarity measure dot_product = merge([target, context], mode='dot', dot_axes=1) dot_product = Reshape((1,))(dot_product) # add the sigmoid output layer output = Dense(1, activation='sigmoid')(dot_product) # create the primary training model model = Model(input=[input_target, input_context], output=output) model.compile(loss='binary_crossentropy', optimizer='rmsprop') validation_model = Model(input=[input_target, input_context], output=similarity) class SimilarityCallback: def run_sim(self): valid_size = 16 # Random set of words to evaluate similarity on. valid_window = 100 # Only pick dev samples in the head of the distribution. valid_examples = np.random.choice(valid_window, valid_size, replace=False) reverse_dictionary = dict(zip(word2id.values(), word2id.keys())) for i in range(valid_size): valid_word = reverse_dictionary[valid_examples[i]] top_k = 8 # number of nearest neighbors sim = self._get_sim(valid_examples[i]) nearest = (-sim).argsort()[1:top_k + 1] log_str = 'Nearest to %s:' % valid_word for k in range(top_k): close_word = reverse_dictionary[nearest[k]] log_str = '%s %s,' % (log_str, close_word) print(log_str) @staticmethod def _get_sim(valid_word_idx): sim = np.zeros((vocab_size,)) in_arr1 = np.zeros((1,)) in_arr2 = np.zeros((1,)) in_arr1[0,] = valid_word_idx for i in range(vocab_size): in_arr2[0,] = i out = validation_model.predict_on_batch([in_arr1, in_arr2]) sim[i] = out return sim sim_cb = SimilarityCallback() arr_1 = np.zeros((1,)) arr_2 = np.zeros((1,)) arr_3 = np.zeros((1,)) for cnt in range(model_config['epochs']): idx = np.random.randint(0, len(labels) - 1) arr_1[0,] = word_target[idx] arr_2[0,] = word_context[idx] arr_3[0,] = labels[idx] loss = model.train_on_batch([arr_1, arr_2], arr_3) if cnt % 100 == 0: print("Iteration {}, loss={}".format(cnt, loss)) # if cnt % 10000 == 0: # sim_cb.run_sim() return model, model.get_weights()[0]
word2id = tokenizer.word_index
id2word = {v: k for k, v in word2id.items()}
vocab_size = len(word2id) + 1
embed_size = 100

wids = [[word2id[w] for w in text.text_to_word_sequence(doc)]
        for doc in norm_bible]
print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(word2id.items())[:10])

from keras.preprocessing.sequence import skipgrams

# generate skip-grams
skip_grams = [skipgrams(wid, vocabulary_size=vocab_size, window_size=10)
              for wid in wids]

from keras.layers import Merge
from keras.layers.core import Dense, Reshape
from keras.layers.embeddings import Embedding
from keras.models import Sequential

# build skip-gram architecture
word_model = Sequential()
word_model.add(Embedding(vocab_size, embed_size,
                         embeddings_initializer="glorot_uniform",
                         input_length=1))
word_model.add(Reshape((embed_size, )))
def process(args): print "Loading graph..." if args.format == "adjlist": G = graph.load_adjacencylist(args.input, undirected=args.undirected) elif args.format == "edgelist": G = graph.load_edgelist(args.input, undirected=args.undirected) elif args.format == "mat": G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected) else: raise Exception( "Unknown file format: '%s'. Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format) print("Number of nodes: {}".format(len(G.nodes()))) num_walks = len(G.nodes()) * args.number_walks print("Number of walks: {}".format(num_walks)) data_size = num_walks * args.walk_length print("Data size (walks*length): {}".format(data_size)) if data_size < args.max_memory_data_size: #print("Walking...") #walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks, # path_length=args.walk_length, alpha=0, rand=random.Random(args.seed)) print("Training...") max_features = len(G.nodes()) # vocabulary size dim_proj = args.representation_size # embedding space dimension nb_epoch = 1 # number of training epochs # Neural network ( in Keras ) model = Sequential() model.add( WordContextProduct(max_features, proj_dim=dim_proj, init="uniform")) model.compile(loss='mse', optimizer='rmsprop') sampling_table = sequence.make_sampling_table(max_features) print("Fitting tokenizer on walks...") tokenizer = text.Tokenizer(nb_words=max_features) print "Epochs: %d" % nb_epoch #tokenizer.fit_on_texts( build_deepwalk_corpus_minibatch_iter(G, args.number_walks, args.walk_length)) for e in range(nb_epoch): print('-' * 40) print('Epoch', e) print('-' * 40) #progbar = generic_utils.Progbar(tokenizer.document_count) samples_seen = 0 losses = [] # for i, seq in enumerate(tokenizer.texts_to_sequences_generator( build_deepwalk_corpus_minibatch_iter(G, args.number_walks, args.walk_length) )): for i, seq in enumerate( build_deepwalk_corpus_minibatch_iter( G, args.number_walks, args.walk_length)): # get skipgram couples for one text in the dataset couples, labels = sequence.skipgrams( seq, max_features, window_size=5, negative_samples=1., sampling_table=sampling_table) if couples: # one gradient update per sentence (one sentence = a few 1000s of word couples) X = np.array(couples, dtype="int32") print "Started fitting..." loss = model.fit(X, labels) print "Dumping..." # Dump weights to a temp file weights = model.layers[0].get_weights()[0] norm_weights = np_utils.normalize(weights) # TODO: save weights with indices np.savetxt(args.output, norm_weights) losses.append(loss) if len(losses) % 100 == 0: # progbar.update(i, values=[("loss", np.mean(losses))]) losses = [] samples_seen += len(labels) print('Samples seen:', samples_seen) print("Training completed!") else: print( "Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk." .format(data_size, args.max_memory_data_size)) print("Walking...") #TODO: IMPLEMENT THAT print "Not implemented yet..." sys.exit(1) print "Optimization done. Saving..." # recover the embedding weights trained with skipgram: weights = model.layers[0].get_weights()[0] # we no longer need this del model norm_weights = np_utils.normalize(weights) # TODO: save weights with indices np.savetxt(args.output, norm_weights) print "Saved!"
tokenizer.fit_on_texts(text_generator())

# ----- training -----
model = Sequential()
model.add(WordContextProduct(max_features, proj_dim=dim_proj, init="uniform"))
model.compile(loss='mse', optimizer='rmsprop')
sampling_table = sequence.make_sampling_table(max_features)

for e in range(nb_epoch):
    print('Epoch:', e)
    progbar = generic_utils.Progbar(tokenizer.document_count)
    samples_seen, losses = 0, []
    for i, seq in enumerate(
            tokenizer.texts_to_sequences_generator(text_generator())):
        couples, labels = sequence.skipgrams(seq, max_features, window_size=4,
                                             negative_samples=1.,
                                             sampling_table=sampling_table)
        if couples:
            X = np.array(couples, dtype="int32")
            loss = model.train_on_batch(X, labels)
            losses.append(loss)
            if len(losses) % 100 == 0:
                progbar.update(i, values=[("loss", np.mean(losses))])
                losses = []
            samples_seen += len(labels)

weights = model.layers[0].get_weights()[0]
weights[:skip_top] = np.zeros((skip_top, dim_proj))
norm_weights = np_utils.normalize(weights)
del model
word_index = tokenizer.word_index
for i, line in enumerate(smart_open(fname, "r")):
    arr = line.strip().split(" ")
    # if len(arr) < max_doc_len: continue
    seq = []
    for word in arr:
        if word not in top_vocab_list:
            seq.append(1)
        else:
            seq.append(top_vocab_list.index(word) + 2)
    temp = skipgrams(seq, vocab_size, window_size=window_size,
                     negative_samples=negative_samples)
    pairs += temp[0]
    labels += temp[1]
    # print(temp)
    if i % 100000 == 0:
        for w_c, label in zip(pairs, labels):
            if len(w_c) > 0:
                print(*w_c, time, label, sep="\t", file=fw)
        print("Processed {} sentence".format(i), file=sys.stderr)
        pairs, labels = [], []
    if i > max_sents:
        break
fw.close()