Example #1
import numpy as np
from keras.preprocessing.sequence import skipgrams


def test_skipgrams():
    # test with the default window size and binary labels
    couples, labels = skipgrams(np.arange(3), vocabulary_size=3)
    for couple in couples:
        assert couple[0] in [0, 1, 2] and couple[1] in [0, 1, 2]

    # test window size and categorical labels
    couples, labels = skipgrams(np.arange(5), vocabulary_size=5, window_size=1, categorical=True)
    for couple in couples:
        assert couple[0] - couple[1] <= 3
    for l in labels:
        assert len(l) == 2
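
A minimal sketch of the two label formats exercised by this test, binary vs. categorical (assuming only numpy and keras are installed):

import numpy as np
from keras.preprocessing.sequence import skipgrams

# Binary labels (the default): each label is a single 0/1 integer.
couples, labels = skipgrams(np.arange(5), vocabulary_size=5, window_size=2)
print(len(couples), len(labels))   # equal lengths; labels are plain ints

# Categorical labels: each label is a one-hot pair, [0, 1] for a positive
# couple and [1, 0] for a negative one.
couples, labels = skipgrams(np.arange(5), vocabulary_size=5, window_size=2,
                            categorical=True)
print(labels[0])
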
Example #2
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import skipgrams


def loadData():
    '''
    I love green eggs and ham.
    (context,word):
    ([I,green],love)
    ([love,eggs],green)
    ([green,and],eggs)
    -------------->
    (love,I) 1
    (love,green) 1
    :return:
    '''
    text = 'I love green eggs and ham .'
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts([text])
    word2id = tokenizer.word_index
    id2word = {v: k for k, v in word2id.items()}
    wids = [word2id[w] for w in text_to_word_sequence(text, split=' ')]
    pairs, labels = skipgrams(wids, len(word2id))
    print(len(pairs), len(labels))
    for i in range(10):
        print("({:s} ({:d}), {:s} ({:d})) -> {:d}".format(
            id2word[pairs[i][0]], pairs[i][0],
            id2word[pairs[i][1]], pairs[i][1],
            labels[i]))
Example #3
    def train_corpus(self, negative_samples=20, window_size=4):
        """ Train the model on the given corpus

        Parameters:
        negative_samples (int): the number of 'false contexts' for each word
        window_size (int): the size of each context
        """
        logging.info('Initialising sampling table')
        sampling_table = sequence.make_sampling_table(self.vocab_size)
        ans = []
        for i, seq in enumerate(
                self.tokenizer.texts_to_sequences_generator(self.corpus)):
            logging.info(i)
            couples, labels = sequence.skipgrams(
                seq, self.vocab_size, window_size=window_size,
                negative_samples=negative_samples,
                sampling_table=sampling_table)
            if couples:
                word_target, word_context = zip(*couples)
                word_target = np.array(word_target, dtype="int32")
                word_context = np.array(word_context, dtype="int32")
                loss = self.model.train_on_batch([word_target, word_context],
                                                 labels)
                ans.append(loss)
        return ans
    def _fit_embeddings(self, text):
        sampling_table = sequence.make_sampling_table(max_words)

        for e in range(self.n_epochs):
            print('-' * 40)
            print('Epoch', e)
            print('-' * 40)

            progbar = generic_utils.Progbar(self.tokenizer.document_count)
            samples_seen = 0
            losses = []

            for i, seq in enumerate(self.tokenizer.texts_to_sequences_generator(text)):

                #MAKE SURE TOKENIZER AND FITTING ARE WORKING
                #if i < 5:
                #    print(map(lambda x: reverse_word_index[x], seq))

                # get skipgram couples for one text in the dataset
                couples, labels = sequence.skipgrams(seq, max_words,
                                                     window_size=self.window_size,
                                                     negative_samples=1.,
                                                     sampling_table=sampling_table)
                if couples:
                    # one gradient update per sentence (one sentence = a few 1000s of word couples)
                    X = np.array(couples, dtype="int32")
                    loss = self.embedding_model.train_on_batch(X, labels)
                    losses.append(loss)
                    if len(losses) % 100 == 0:
                        progbar.update(i, values=[("loss", np.mean(losses))])
                        losses = []
                    samples_seen += len(labels)
            print('Samples seen:', samples_seen)
        print("Training completed!")
        return self
def train_on_model(model, g, vocab_size, max_len = 10, epochs = 100, print_every=10, window_size=4, negative_sampling=1.0, sampling_table=None):
  losses, valid_sequences = 0.0, 0
  for epoch in xrange(epochs):
    sequences = pad_sequences([g.random_walk(k,max_len) for k in range(vocab_size)])
    X_couples = []
    y_labels = []
    for seq in sequences:
      couples, labels = skipgrams(seq, vocab_size, window_size=window_size, negative_samples=negative_sampling, sampling_table=sampling_table)
      if len(couples) == 0:
        continue
      X_couples.extend(couples)
      y_labels.extend(labels)
      valid_sequences += 1
    loss = train_batch(model, X_couples, y_labels)
    losses += loss
    if epoch % print_every == 0:
      logging.info("Mean loss in Epoch [%s] with %s valid sequences = %s" % (epoch, valid_sequences, losses / valid_sequences))
      losses, valid_sequences = 0.0, 0
    for e in range(nb_epoch):
        print('-' * 40)
        print('Epoch', e)
        print('-' * 40)

        progbar = generic_utils.Progbar(tokenizer.document_count)
        samples_seen = 0
        losses = []

        for i, seq in enumerate(
                tokenizer.texts_to_sequences_generator(text_generator())):
            # get skipgram couples for one text in the dataset
            couples, labels = sequence.skipgrams(seq,
                                                 max_features,
                                                 window_size=4,
                                                 negative_samples=1.,
                                                 sampling_table=sampling_table)
            if couples:
                # one gradient update per sentence (one sentence = a few 1000s of word couples)
                X = np.array(couples, dtype="int32")
                loss = model.train_on_batch(X, labels)
                losses.append(loss)
                if len(losses) % 100 == 0:
                    progbar.update(i, values=[("loss", np.mean(losses))])
                    losses = []
                samples_seen += len(labels)
        print('Samples seen:', samples_seen)
    print("Training completed!")

    if save:
Example #7
def run_preprocessing(texts,
                      data_dir,
                      run_name,
                      min_freq_threshold=10,
                      max_len=100,
                      bad=None,
                      vectors='en_core_web_lg',
                      n_threads=2,
                      token_type='lemma',
                      only_keep_alpha=False,
                      write_every=10000,
                      merge=False):
    """
    This function abstracts the rest of the preprocessing needed
    to run Lda2Vec in conjunction with the NlpPipeline.

    :param texts: (list[str]) list of text
    :param data_dir: (str) directory where data is held
    :param run_name: (str) Named of directory created to hold preprocessed data
    :param min_freq_threshold: (int, optional) If words occur less frequently than this threshold,
        then purge them from the docs
    :param max_len: (int, optional) Length to pad/cut off sequences
    :param bad: (list|set, optional) words to filter out of dataset
    :param vectors: (str) Name of vectors to load from spacy, e.g. ["en", "en_core_web_sm"]
    :param n_threads: (int, optional) Number of threads used in spacy pipeline
    :param token_type: (str, optional) Type of tokens to keep, one of ["lemma", "lower", "orth"]
    :param only_keep_alpha: (bool, optional) Only keep alpha characters
    :param write_every: (int, optional) Number of documents' data to store before writing cache to skipgrams file
    :param merge: (bool, optional) Merge noun phrases
    :return:
    """
    if bad is None:
        bad = []

    def clean(line):
        return ' '.join(w for w in line.split()
                        if not any(tk in w for tk in bad))

    # Location for preprocessed data to be stored
    out_path = data_dir / run_name
    if not os.path.exists(out_path):
        # Make directory to save data in
        os.makedirs(out_path)

        # Remove tokens with these substrings
        bad = set(bad)

        # Preprocess data
        # Convert to unicode (spacy only works with unicode)
        texts = [str(clean(d)) for d in texts]
        texts = [t for t in texts if t]  # remove empty lines after cleaning

        # Process the text, no file because we are passing in data directly
        p = NlpPipeline(None,
                        max_len,
                        texts=texts,
                        n_threads=n_threads,
                        only_keep_alpha=only_keep_alpha,
                        token_type=token_type,
                        vectors=vectors,
                        merge=merge)

        # Computes the embed matrix along with other variables
        p.compute_embed_matrix()

        print('Convert data to word2vec indices')
        p.convert_data_to_word2vec_indices()

        print('Trim zeros')
        p.trim_zeros_from_idx_data()

        # Extract the length of each document (needed for pyLDAvis)
        doc_lengths = [len(x) for x in p.idx_data]

        # Find the cutoff index
        cutoff = 0
        for i, freq in enumerate(p.freqs):
            if freq < min_freq_threshold:
                cutoff = i
                break

        # Then cutoff the embed matrix
        embed_matrix = p.embed_matrix[:cutoff]

        # Also replace all tokens below cutoff in `idx_data`
        for i in range(len(p.idx_data)):
            p.idx_data[i][p.idx_data[i] > cutoff - 1] = 0

        # Next cut off the frequencies
        freqs = p.freqs[:cutoff]

        print('Convert to skipgrams')
        data = []
        n_examples = p.idx_data.shape[0]

        # Sometimes docs can be less than the required amount for
        # the skipgram function. So we must manually make a counter
        # instead of relying on the enumerated index (i).
        doc_id_counter = 0

        # Additionally, we will keep track of these lower level docs
        # and will purge them later.
        purged_docs = []
        for i, t in enumerate(p.idx_data):
            pairs, _ = skipgrams(t,
                                 vocabulary_size=p.vocab_size,
                                 window_size=5,
                                 shuffle=True,
                                 negative_samples=0)

            # Pairs will be 0 if document is less than 2 indexes
            if len(pairs) > 2:
                for pair in pairs:
                    temp_data = pair

                    # Appends doc ID
                    temp_data.append(doc_id_counter)

                    # Appends document index
                    temp_data.append(i)

                    data.append(temp_data)

                doc_id_counter += 1
            else:
                purged_docs.append(i)

            if i > 0 and i % write_every == 0:
                temp_df = pd.DataFrame(data)
                temp_df.to_csv(out_path / 'skipgrams.txt',
                               sep='\t',
                               index=False,
                               header=None,
                               mode='a')
                del temp_df
                data = []

            if i % 500 == 0:
                print('step', i, 'of', n_examples)

        temp_df = pd.DataFrame(data)
        temp_df.to_csv(out_path / 'skipgrams.txt',
                       sep='\t',
                       index=False,
                       header=None,
                       mode='a')
        del temp_df

        # Save embed matrix
        np.save(out_path / 'embed_matrix', embed_matrix)

        # Save the doc lengths to be used later
        # Also purge those that didn't make it into the skipgram function
        np.save(out_path / 'doc_lengths',
                np.delete(doc_lengths, np.array(purged_docs)))

        # Save frequencies to file
        np.save(out_path / 'freqs', freqs)

        # Save vocabulary dictionary to file
        with open(out_path / 'idx_to_word.pkl', 'wb') as f:
            pickle.dump(p.idx_to_word, f)

        with open(out_path / 'word_to_idx.pkl', 'wb') as f:
            pickle.dump(p.word_to_idx, f)
Example #8
from keras.preprocessing import sequence


def sequence_skipgrams():
    couples, labels = sequence.skipgrams([0, 1, 2, 3],
                                         vocabulary_size=4,
                                         window_size=2)
    print(couples)
    print(labels)
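
The couples printed above are usually unpacked into two parallel index arrays before being fed to a two-input model; a minimal sketch of that step, mirroring the training loops further down:

import numpy as np
from keras.preprocessing import sequence

couples, labels = sequence.skipgrams([0, 1, 2, 3],
                                     vocabulary_size=4,
                                     window_size=2)
if couples:
    # zip(*couples) splits the [target, context] pairs into two tuples.
    word_target, word_context = zip(*couples)
    word_target = np.array(word_target, dtype="int32")
    word_context = np.array(word_context, dtype="int32")
    labels = np.array(labels, dtype="int32")
    print(word_target.shape, word_context.shape, labels.shape)
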
Example #9
def main():
	scoreList = [0.0,0.0]
	with open('data/info.json') as j:
		info = ujson.load(j)
	for problem in os.listdir('data'):
		greek=False
		if problem.startswith('problem'):
			truthPath = 'data/truth/'+problem+'/clustering.json'
			with open(truthPath) as t:
				truth = ujson.load(t)
			print(problem)
			probTokList = []
			docList = []
			docDict = {}
			X=[]
			Y=[]

			path = 'data/' + problem
			for entry in info:
				if entry["folder"] == problem:
					lang=entry["language"]
					if entry["language"] == "gr":
						greek=True

			CV = CountVectorizer(input='filename', strip_accents='unicode', analyzer='word', ngram_range=(1,4))
			docs = [path+'/'+x for x in os.listdir(path)]
			cMatrix = CV.fit_transform(docs)
			for doc in os.listdir(path):
				docTokList = []
				with open(path + '/' + doc) as d:
						article = d.readlines()
						for sent in article:
							sentTokList = []
							for word in sent.split():
								for token in word:
									procToken = preprop(token,greek)
									sentTokList.append(procToken) #Every item of the list is a normalized character
							docTokList.append(' '.join(sentTokList))#Every item of the list is a sentence
				probTokList.append(' '.join(docTokList))#Every item of the list is a document
				docList.append(doc)
			tokenizer = text.Tokenizer(nb_words=None,filters=text.base_filter(),lower=True,split=" ")
			tokenizer.fit_on_texts(probTokList)
			seqList = tokenizer.texts_to_sequences(probTokList)
			
			uniqueTokens = max([max(x) for x in seqList])

			print(uniqueTokens,lang)
			sampling_table = sequence.make_sampling_table(uniqueTokens+1)
			for i,seq in enumerate(seqList):
				x, y = sequence.skipgrams(seq, uniqueTokens, window_size=4, negative_samples=1.0, categorical=False, sampling_table=sampling_table)
				x = zip(x, y)
				X.append(x)
				#Y.extend(y)
				docDict[docList[i]] = seq
			strX=[str(x) for x in X]
			xTokenizer = text.Tokenizer(nb_words=None,filters=text.base_filter(),lower=True,split=" ")
			xTokenizer.fit_on_texts(strX)
			#docMatrix = tokenizer.sequences_to_matrix(seqList,mode="tfidf")
			docMatrix = xTokenizer.sequences_to_matrix(strX,mode="tfidf")
			#scores = embedNN(X,Y)
			pairs = combinations(docDict.keys(),2)
			cList = []
			nnDict = {}
			for cluster in truth:
				cPairs = []
				if len(cluster) > 1:
					for item in cluster:
						cPairs.append(str(item["document"]))
					cList.extend(list(permutations(cPairs,2)))
			for pair in pairs:
				match = False
				if pair in cList:
					match = True
				nnDict[pair] = match
			for i, doc in enumerate(docMatrix):
				docDict[docList[i]] = doc

			
			truthCounter = Counter(nnDict.values())
			baseline = 1-float(truthCounter[True])/float(len(nnDict))
			print("Baseline for {} is {}".format(problem, baseline))
			clusterCount = Counter()
			kmclusters = False # Change to False for meanshift
			if kmclusters:
				pbar = ProgressBar()
				for nclusters in pbar(reversed(range(len(docMatrix)-1))):
					#print("{} Clusters".format(nclusters+1))
					clusters = KMclusterer(nclusters+1,cMatrix)
					for c in range(nclusters+1):
						#print(c,"has:",[i for i,x in enumerate(clusters) if x == c])
						for clusterpair in list(combinations([i for i,x in enumerate(clusters) if x == c],2)):
							combo = (docList[clusterpair[0]],docList[clusterpair[1]])
							clusterCount[combo] +=1
			else:
				clusters = KMclusterer(int(len(docMatrix)*0.67),docMatrix)
				#clusters = MSclusterer(cMatrix)#cMatrixdocMatrix
				for clusterpair in list(combinations([i for i,x in enumerate(clusters)],2)):
					combo = (docList[clusterpair[0]],docList[clusterpair[1]])
					clusterCount[combo] +=1

			x = 0.0 
			scoreList[0] += truthCounter[True]
			deleteList = []
			#print("Most common cluster is in {}%".format((float(clusterCount.most_common(20)[19][1])/len(docMatrix))*100))
			for combo in nnDict.keys():
				if combo not in clusterCount.keys():
					deleteList.append(combo)
			y = 0.0
			for item in deleteList:
				if item in cList:
					y+=1
				del nnDict[item]
			scores = sharedNN(docDict, nnDict)
			print("Deleted pairs are {}% of total correct pairs, {}% of deleted pairs was wrongly deleted".format(round(y/len(cList)*100.0,2), round(y/len(deleteList)*100.0,2)))

			for combo in clusterCount.most_common(20):
				if combo[0] in cList:
					x += 1
					scoreList[1] += 1
			print("prec: {}".format(x/20))
			#print("Document score is {} clusters correct out of {} (accuracy {})".format(x, truthCounter[True], x/truthCounter[True]))
			#print("prec: {} \nrec: {}".format(x/20, x/len(nnDict.values())))

	#print("Total precision  is {}, {} clusters correct".format(scoreList[1]/scoreList[0], scoreList[1]))


			if not os.path.exists('answers/'+problem):
				os.mkdir('answers/'+problem)
			clusDict = defaultdict(list)
			rankDict = defaultdict(list)
			for i, cluster in enumerate(list(clusters)):
				clusDict[cluster].append({"document": docList[i]})
				rankDict[cluster].append(docList[i])
			with open('answers/'+problem+'/clustering.json', "w") as jsonFile:
				ujson.dump(list(clusDict.values()), jsonFile, indent=4)
			rankList = []
			for value in rankDict.values():
				if len(value) > 1:
					pairs = combinations(value, 2)
					for pair in pairs:
						rankList.append({"document1": pair[0], "document2": pair[1], "score": scores[pair][0]})
			with open('answers/'+problem+'/ranking.json', "w") as jsonFile:
				ujson.dump(rankList, jsonFile, indent=4)
def main(dname,
         encode_dir,
         raw_dir,
         odir='./resources/skipgrams/',
         mode='local'):
    # load corpus data
    raw_corpus = pd.read_csv(raw_dir + dname + '.tsv', sep='\t')

    # load user data
    user_idx = json.load(open(raw_dir + 'user_idx.json'))
    user_info = dict()
    user_control = set()  # control if renew user_info sample method
    with open(encode_dir + 'users.json') as dfile:
        for line in dfile:
            line = json.loads(line)
            user_info[line['uid']] = line
            user_info[line['uid']]['count'] = 0

    # load tokenizer
    tok = pickle.load(open(encode_dir + dname + '.tkn', 'rb'))
    params = {
        'window': 5,
        'vocab_size': tok.num_words,
        'user_size': len(user_info) + 1,  # +1 for unknown
        'emb_dim': 300,
        'word_emb_path': './resources/word_emb.npy',
        'user_emb_path': './resources/user_emb.npy',
        'word_emb_train': True,
        'user_emb_train': True,
        'user_task_weight': 1,
        'word_task_weight': 1,
        'epochs': 5,
        'optimizer': 'adam',
        'lr': 1e-5,
    }
    word_sampling_table = make_sampling_table(size=params['vocab_size'])

    ww_model, uw_model = build_model(params)
    print()
    print(params)

    for epoch in range(params['epochs']):
        loss = 0
        # shuffle the data
        raw_corpus = raw_corpus.sample(frac=1).reset_index(drop=True)
        for step, entry in raw_corpus.iterrows():
            '''word info, ww: word-word'''
            encode_doc = tok.texts_to_sequences([entry.text])
            ww_pairs, ww_labels = skipgrams(
                sequence=encode_doc[0],
                vocabulary_size=params['vocab_size'],
                window_size=params['window'])

            word_pairs = [np.array(x) for x in zip(*ww_pairs)]
            ww_labels = np.array(ww_labels, dtype=np.int32)
            '''user info, uw: user-word'''
            cur_user = user_info[entry.uid]

            if mode == 'local':
                uw_pairs, uw_labels = utils.user_word_sampler(
                    uid=cur_user['uid_encode'],
                    sequence=encode_doc[0],
                    vocab_size=params['vocab_size'],
                    filter_words=set(cur_user['words']),
                    negative_samples=1)
                uw_pairs = [np.array(x) for x in zip(*uw_pairs)]
                uw_labels = np.array(uw_labels, dtype=np.int32)
            elif mode == 'decay':
                decay_num = utils.sample_decay(cur_user['count'])
                if decay_num > np.random.random():
                    uw_pairs, uw_labels = utils.user_word_sampler(
                        uid=cur_user['uid_encode'],
                        sequence=set(cur_user['words']),
                        vocab_size=params['vocab_size'],
                        negative_samples=1)
                    uw_pairs = [np.array(x) for x in zip(*uw_pairs)]
                    uw_labels = np.array(uw_labels, dtype=np.int32)

                    user_info[entry.uid]['count'] += 1
                    user_control.add(entry.uid)
                else:
                    uw_pairs = None
                    uw_labels = None

                if len(user_control) >= len(user_info) - 10:
                    # restart the control for sampling
                    for uid in user_info:
                        user_info[uid]['count'] = 0
                    user_control.clear()
            elif mode == 'global':
                uw_pairs, uw_labels = utils.user_word_sampler(
                    uid=cur_user['uid_encode'],
                    sequence=set(cur_user['words']),
                    vocab_size=params['vocab_size'],
                    negative_samples=1)
                uw_pairs = [np.array(x) for x in zip(*uw_pairs)]
                uw_labels = np.array(uw_labels, dtype=np.int32)
            else:
                raise ValueError('Mode {} does not exist!'.format(mode))
            '''Train'''
            if word_pairs:
                loss += ww_model.train_on_batch(word_pairs, ww_labels)
            if uw_pairs:
                loss += uw_model.train_on_batch(uw_pairs, uw_labels)

            if step > 0 and step % 100 == 0:
                loss_avg = loss / step
                print('Epoch: {}, Step: {}'.format(epoch, step))
                print('\tLoss: {}.'.format(loss_avg))
                print('-------------------------------------------------')

        # save the model
        ww_model.save(odir + 'ww_model_{}.h5'.format(epoch))
        uw_model.save(odir + 'uw_model_{}.h5'.format(epoch))
        # save the word embedding
        np.save(odir + 'word_{}.npy'.format(epoch),
                ww_model.get_layer(name='word_emb').get_weights()[0])
        # save the user embedding
        np.save(odir + 'user_{}.npy'.format(epoch),
                uw_model.get_layer(name='user_emb').get_weights()[0])

    # save the model
    ww_model.save(odir + 'ww_model.h5')
    uw_model.save(odir + 'uw_model.h5')
    # save the word embedding
    np.save(odir + 'word.npy',
            ww_model.get_layer(name='word_emb').get_weights()[0])
    # save the user embedding
    np.save(odir + 'user.npy',
            uw_model.get_layer(name='user_emb').get_weights()[0])
	sampling_table = sequence.make_sampling_table(vocab_size)


	for e in range(nb_epoch):
		print('-'*40)
		print('Epoch',e)
		print('-'*40)

		progbar = generic_utils.Progbar(tokenizer.document_count)
		samples_seen = 0
		losses = []


		for i, seq in enumerate(tokenizer.texts_to_sequences_generator(text_generator())):
			# get skipgram couples for one text in the dataset
			couples, labels = sequence.skipgrams(seq, vocab_size, window_size=4, negative_samples=1., sampling_table=sampling_table)
			if couples:
				X1,X2 = zip(*couples)
				X1 = np.array(X1,dtype="int32")
				X2 = np.array(X2,dtype="int32")
				loss = model.train_on_batch([X1,X2], labels)
				losses.append(loss)
				if len(losses) % 100 == 0:
					progbar.update(i, values=[("loss", np.mean(losses))])
					losses = []
				samples_seen += len(labels)
		print('Samples seen:', samples_seen)
	print("Training completed!")


	if save:
def sg(sentence):
    return seq.skipgrams(sentence,
                         vocab_size,
                         window_size=np.random.randint(window - 1) + 1,
                         negative_samples=nb_negative_samples)
    return analogies, pd.Series(idx)


analogies, categories = get_analogies('en')
analogies_id = analogies.apply(lambda x: x.map(token_to_id))

test_set = analogies_id.dropna().astype(int)
a, b, c, actual = test_set.values.T
actual = actual.reshape(-1, 1)
n_analogies = len(actual)

sampling_table = sequence.make_sampling_table(vocab_size)

couples, labels = skipgrams(sequence=data,
                            vocabulary_size=vocab_size,
                            window_size=WINDOW_SIZE,
                            sampling_table=sampling_table,
                            negative_samples=1.0,
                            shuffle=True)

target_word, context_word = np.array(couples, dtype=np.int32).T
labels = np.array(labels, dtype=np.int8)
del couples

with pd.HDFStore(PATH / 'data.h5') as store:
    store.put('id_to_token', pd.Series(id_to_token))
    store.put('analogies', test_set)


def model_graph():
    #### Scalar Input Variables
    input_target = Input((1, ), name='target_input')
Example #14
def main():
    vocabulary_size = 2000
    X_train = []
    y_train = []

    train, test = load_card_data()
    corpus = [t[2] for t in train]

    tokenizer = MTGTokenizer(nb_words=vocabulary_size, filters=None, lower=True, split=" ")
    tokenizer.fit_on_texts(corpus)
    train_tokens = tokenizer.texts_to_sequences(corpus)


    for token in train_tokens:
        couples, labels = skipgrams(token, vocabulary_size) 
            
        X_train += couples
        y_train += labels
    """
        couples is a list of 2-elements lists of int: [word_index, other_word_index].
        labels is a list of 0 and 1, where 1 indicates that other_word_index was found in the same window as word_index, and 0 indicates that other_word_index was random.
    """
    
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)  
    embedding_size = 256
    model = Sequential()
    model.add(Embedding(vocabulary_size+1, embedding_size))
    model.add(SimpleRNN(128, return_sequences=False))
    model.add(Dense(1, activation="sigmoid"))
    model.compile(optimizer="Adam", loss="binary_crossentropy")
#    model.load_weights("mtgW2V.mdl")
#    embedding_weights = model.layers[0].get_weights()[0]
    model.fit(X_train, y_train, nb_epoch=5, batch_size=1024)
    model.save_weights("mtgW2V.mdl")
    return
    
    from scipy.spatial.distance import cosine
    embedding_dict = tokenizer.word_index
    top5 = []
    for word1 in embedding_dict:
        print word1
        top5 = [] 
        bottom5 = []
        scores = []
        for word2 in embedding_dict:
            score = 1 - cosine(embedding_weights[embedding_dict[word1]], embedding_weights[embedding_dict[word2]])
            if score > min(top5):
                top5.append(word2)
            if score < max(bottom5):
                bottom5.append(word2)
        
    
    
    while True:
        try:
                word1 = raw_input("What is the first word you want to compare?")
                word2 = raw_input("What is the second word you want to compare?")
                print embedding_dict[word1], embedding_dict[word2]
                #1 - gets us similarity instead of distance
                print 1-cosine(embedding_weights[embedding_dict[word1]], embedding_weights[embedding_dict[word2]])
        except KeyError:
            pass
        except IndexError:
            pass
Example #15
sampling_table = make_sampling_table(vocab_size)

for i in range(n_epochs):
    loss = 0
    for seq in tokenizer.texts_to_sequences_generator(text_generator()):
        # generate skip-gram training examples
        # - `couples` consists of the pivots (i.e. target words) and surrounding contexts
        # - `labels` represent if the context is true or not
        # - `window_size` determines how far to look between words
        # - `negative_samples` specifies the ratio of negative couples
        #    (i.e. couples where the context is false)
        #    to generate with respect to the positive couples;
        #    i.e. `negative_samples=4` means "generate 4 times as many negative samples"
        couples, labels = skipgrams(seq,
                                    vocab_size,
                                    window_size=5,
                                    negative_samples=4,
                                    sampling_table=sampling_table)
        if couples:
            pivot, context = zip(*couples)
            pivot = np.array(pivot, dtype='int32')
            context = np.array(context, dtype='int32')
            labels = np.array(labels, dtype='int32')
            loss += model.train_on_batch([pivot, context], labels)
    print('epoch %d, %0.02f' % (i, loss))

###

embeddings = model.get_weights()[0]

###
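
The loop above calls model.train_on_batch([pivot, context], labels) and later reads model.get_weights()[0], but never shows how model was built. Below is a minimal sketch of a compatible model (a shared Embedding feeding a dot product and a sigmoid, as in the dot-product examples further down); vocab_size is assumed to be defined and embedding_dim is an arbitrary choice, not taken from the original snippet:

from keras.layers import Input, Embedding, Reshape, Dot, Dense
from keras.models import Model

embedding_dim = 128  # assumed value, not from the original

pivot_input = Input((1,))
context_input = Input((1,))

# One shared embedding; its weight matrix is what model.get_weights()[0] returns.
shared_embedding = Embedding(vocab_size, embedding_dim, input_length=1)

pivot_vec = Reshape((embedding_dim,))(shared_embedding(pivot_input))
context_vec = Reshape((embedding_dim,))(shared_embedding(context_input))

# Dot product of the two vectors, squashed to the probability of a true pair.
output = Dense(1, activation='sigmoid')(Dot(axes=1)([pivot_vec, context_vec]))

model = Model([pivot_input, context_input], output)
model.compile(loss='binary_crossentropy', optimizer='adam')
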
Example #16
# -*- coding: utf-8 -*-
from keras.preprocessing.text import *
from keras.preprocessing.sequence import skipgrams

text = "I love green eggs and ham ."

tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

word2id = tokenizer.word_index
id2word = {v:k for k, v in word2id.items()}

wids = [word2id[w] for w in text_to_word_sequence(text)]
pairs, labels = skipgrams(wids, len(word2id))
print(len(pairs), len(labels))
for i in range(10):
    print("({:s} ({:d}), {:s} ({:d})) -> {:d}".format(
        id2word[pairs[i][0]], pairs[i][0], 
        id2word[pairs[i][1]], pairs[i][1],
        labels[i]))

Example #17
def process(args):

  print "Loading graph..."
  if args.format == "adjlist":
    G = graph.load_adjacencylist(args.input, undirected=args.undirected)
  elif args.format == "edgelist":
    G = graph.load_edgelist(args.input, undirected=args.undirected)
  elif args.format == "mat":
    G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
  else:
    raise Exception("Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)

  print("Number of nodes: {}".format(len(G.nodes())))

  num_walks = len(G.nodes()) * args.number_walks

  print("Number of walks: {}".format(num_walks))

  data_size = num_walks * args.walk_length

  print("Data size (walks*length): {}".format(data_size))

  if data_size < args.max_memory_data_size:
    #print("Walking...")
    #walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
    #                                    path_length=args.walk_length, alpha=0, rand=random.Random(args.seed))
    print("Training...")
    max_features = len(G.nodes())  # vocabulary size
    dim_proj = args.representation_size  # embedding space dimension
    nb_epoch = 1   # number of training epochs

    # Neural network ( in Keras )
    model = Sequential()
    model.add(WordContextProduct(max_features, proj_dim=dim_proj, init="uniform"))
    model.compile(loss='mse', optimizer='rmsprop')
    sampling_table = sequence.make_sampling_table(max_features)

    print("Fitting tokenizer on walks...")
    tokenizer = text.Tokenizer(nb_words=max_features)

    print "Epochs: %d" % nb_epoch
    #tokenizer.fit_on_texts( build_deepwalk_corpus_minibatch_iter(G, args.number_walks, args.walk_length))

    for e in range(nb_epoch):
        print('-'*40)
        print('Epoch', e)
        print('-'*40)

        #progbar = generic_utils.Progbar(tokenizer.document_count)
        samples_seen = 0
        losses = []

#        for i, seq in enumerate(tokenizer.texts_to_sequences_generator( build_deepwalk_corpus_minibatch_iter(G, args.number_walks, args.walk_length) )):

        for i, seq in enumerate( build_deepwalk_corpus_minibatch_iter(G, args.number_walks, args.walk_length) ):
            # get skipgram couples for one text in the dataset
            couples, labels = sequence.skipgrams(seq, max_features, window_size=5, negative_samples=1., sampling_table=sampling_table)
            if couples:
                # one gradient update per sentence (one sentence = a few 1000s of word couples)
                X = np.array(couples, dtype="int32")
                print "Started fitting..."
                loss = model.fit(X, labels)

                print "Dumping..."

                # Dump weights to a temp file
                weights = model.layers[0].get_weights()[0]

                norm_weights = np_utils.normalize(weights)

                # TODO: save weights with indices
                np.savetxt( args.output, norm_weights )

                losses.append(loss)
                if len(losses) % 100 == 0:
    #                progbar.update(i, values=[("loss", np.mean(losses))])
                    losses = []
                samples_seen += len(labels)
        print('Samples seen:', samples_seen)
    print("Training completed!")

  else:
    print("Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk.".format(data_size, args.max_memory_data_size))
    print("Walking...")

    #TODO: IMPLEMENT THAT
    print "Not implemented yet..."
    sys.exit(1)

  print "Optimization done. Saving..."
  # recover the embedding weights trained with skipgram:
  weights = model.layers[0].get_weights()[0]

  # we no longer need this
  del model

  norm_weights = np_utils.normalize(weights)

  # TODO: save weights with indices
  np.savetxt( args.output, norm_weights )
  print "Saved!"
Example #18
data_str = "".join(data)

# Split the raw text into sentences
sents = nltk.sent_tokenize(data_str)
# Tokenizer() turns the text into sequences of tokens
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sents)
# Build the word-to-ID mapping (id 0 is reserved for the unknown token "UNK")
word2index = tokenizer.word_index
word2index["UNK"] = 0
index2word = {value : key for key, value in word2index.items()}
# Build the training and test sets
wids = [word2index[w] for w in text_to_word_sequence(data_str)]
# Sample the text with the skipgrams function; window_size is half a window, i.e. [center_word - window_size, center_word + window_size + 1]
pairs, labels = skipgrams(wids, vocabulary_size = len(word2index), window_size = 1, negative_samples = 1)
pairs, labels = np.array(pairs), np.array(labels)
x_train, x_test, y_train, y_test = train_test_split(pairs, labels, test_size = 0.3)
# Build the model
voca_size = len(word2index)
embed_size = 300

input_word = Input(shape = (2,), name = "input_1")
embedding = Embedding(input_dim = voca_size, output_dim = embed_size, 
                      input_length = 2, embeddings_initializer = "glorot_uniform", name = "embedding_1")(input_word)
lambda_dot = Lambda(lambda x : K.prod(x, axis = 1), output_shape = (embed_size,))(embedding)
dense1 = Dense(units = 1, activation = "sigmoid", name = "dense_1")(lambda_dot)

model = Model(inputs = input_word, outputs = dense1)
model.summary()
model.compile(optimizer = "adam", loss = "mse")
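
The snippet stops right after compile; continuing with the x_train/x_test split built above, a fit call might look like the following sketch (batch size and epoch count are arbitrary assumptions, not from the original):

# Each row of x_train is a (target, context) id pair, matching Input(shape=(2,)).
model.fit(x_train, y_train,
          batch_size=256,
          epochs=10,
          validation_data=(x_test, y_test))
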
Example #19

skip_gram = build_sg()
skip_gram.compile(loss='binary_crossentropy', optimizer='adam')
skip_gram.summary()

# Training
nb_epochs = 5
t_start = time()
t0 = t_start
for epoch in range(nb_epochs):
    print "Epoch %d:" % (epoch + 1)
    for i, review in enumerate(revs):
        # the skipgrams function builds data/labels with positive/negative samples
        data, labels = skipgrams(sequence=review,
                                 vocabulary_size=vocab_size,
                                 window_size=2,
                                 negative_samples=5.)
        x = [np.array(x) for x in zip(*data)]
        y = np.array(labels, dtype=np.int32)
        #skip_gram.fit(x, y, batch_size=256, nb_epoch=1)
        loss = skip_gram.train_on_batch(x, y)
        if i % 1000 == 0:
            print "training %d, with loss %.6f, elapsed time %.3fs." % (
                i, loss, time() - t0)
            t0 = time()
            #print "\n-------training %d, elapsed time %.3fs.\n" %(i, time()-t0)
    print "Loss %d, and elapsed time %.3fs for the epoch." % (loss,
                                                              time() - t_start)

# See the results
embeddings = skip_gram.get_weights()[0]
Example #20
    for each_word in each_words:
        if each_word not in word2idx:
            word2idx[each_word] = 1
            idx2word.append(each_word)
        else:
            word2idx[each_word] += 1


sents_as_ids = list()
count_num = 0
for each_words in normalized_corpus:
    sents_as_ids.append([])
    for each_word in each_words:
        index = idx2word.index(each_word)
        sents_as_ids[count_num].append(index)
    count_num += 1

print('\nSample word2idx: ', list(word2idx.items())[:10])

print(len(word2idx))
print(len(idx2word))
print('\nSample word2idx: ', list(word2idx.items())[:10])
print('\nSample normalized corpus:', normalized_corpus[:3])
print('\nAbove sentence as a list of ids:' , sents_as_ids[:3])
print(len(sents_as_ids))


# training
from keras.preprocessing.sequence import skipgrams
skip_grams = [skipgrams(sent, vocabulary_size=vocab_size, window_size=5) for sent in sents_as_ids]
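
Each element of skip_grams above is a (pairs, labels) tuple for one sentence; a small sketch of flattening them into training arrays, following the pattern used by several of the other examples:

import numpy as np

all_pairs, all_labels = [], []
for pairs, labels in skip_grams:
    all_pairs.extend(pairs)
    all_labels.extend(labels)

# X has shape (n_samples, 2): a target id and a context id per row; y holds the 0/1 labels.
X = np.array(all_pairs, dtype="int32")
y = np.array(all_labels, dtype="int32")
print(X.shape, y.shape)
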
Example #21
def run_preprocessing(texts,
                      data_dir,
                      run_name,
                      min_freq_threshold=10,
                      max_length=100,
                      bad=[],
                      vectors="en_core_web_lg",
                      num_threads=2,
                      token_type="lemma",
                      only_keep_alpha=False,
                      write_every=10000,
                      merge=False):
    """This function abstracts the rest of the preprocessing needed
    to run Lda2Vec in conjunction with the NlpPipeline

    Parameters
    ----------
    texts : TYPE
        Python list of text
    data_dir : TYPE
        directory where your data is held
    run_name : TYPE
        Name of sub-directory to be created that will hold preprocessed data
    min_freq_threshold : int, optional
        If words occur less frequently than this threshold, then purge them from the docs
    max_length : int, optional
        Length to pad/cut off sequences
    bad : list, optional
        List or Set of words to filter out of dataset
    vectors : str, optional
        Name of vectors to load from spacy (Ex. "en", "en_core_web_sm")
    num_threads : int, optional
        Number of threads used in spacy pipeline
    token_type : str, optional
        Type of tokens to keep (Options: "lemma", "lower", "orth")
    only_keep_alpha : bool, optional
        Only keep alpha characters
    write_every : int, optional
        Number of documents' data to store before writing cache to skipgrams file
    merge : bool, optional
        Merge noun phrases or not
    """
    def clean(line):
        return ' '.join(w for w in line.split()
                        if not any(t in w for t in bad))

    # Location for preprocessed data to be stored
    file_out_path = data_dir + "/" + run_name

    if not os.path.exists(file_out_path):

        # Make directory to save data in
        os.makedirs(file_out_path)

        # Remove tokens with these substrings
        bad = set(bad)

        # Preprocess data

        # Convert to unicode (spaCy only works with unicode)
        texts = [str(clean(d)) for d in texts]

        # Process the text, no file because we are passing in data directly
        SP = NlpPipeline(None,
                         max_length,
                         texts=texts,
                         num_threads=num_threads,
                         only_keep_alpha=only_keep_alpha,
                         token_type=token_type,
                         vectors=vectors,
                         merge=merge)

        # Computes the embed matrix along with other variables
        SP._compute_embed_matrix()

        print("converting data to w2v indexes")
        # Convert data to word2vec indexes
        SP.convert_data_to_word2vec_indexes()

        print("trimming 0's")
        # Trim zeros from idx data
        SP.trim_zeros_from_idx_data()

        # This extracts the length of each document (needed for pyldaviz)
        doc_lengths = [len(x) for x in SP.idx_data]

        # Find the cutoff idx
        for i, freq in enumerate(SP.freqs):
            if freq < min_freq_threshold:
                cutoff = i
                break
        # Then, cut off the embed matrix
        embed_matrix = SP.embed_matrix[:cutoff]
        # Also, replace all tokens below cutoff in idx_data
        for i in range(len(SP.idx_data)):
            SP.idx_data[i][SP.idx_data[i] > cutoff - 1] = 0
        # Next, cut off the frequencies
        freqs = SP.freqs[:cutoff]

        print("converting to skipgrams")

        data = []
        num_examples = SP.idx_data.shape[0]
        # Sometimes docs can be less than the required amount for
        # the skipgram function. So, we must manually make a counter
        # instead of relying on the enumerated index (i)
        doc_id_counter = 0
        # Additionally, we will keep track of these lower level docs
        # and will purge them later
        purged_docs = []
        for i, t in enumerate(SP.idx_data):
            pairs, _ = skipgrams(t,
                                 vocabulary_size=SP.vocab_size,
                                 window_size=5,
                                 shuffle=True,
                                 negative_samples=0)
            # Pairs will be 0 if document is less than 2 indexes
            if len(pairs) > 2:
                for pair in pairs:
                    temp_data = pair
                    # Appends doc ID
                    temp_data.append(doc_id_counter)
                    # Appends document index
                    temp_data.append(i)
                    data.append(temp_data)
                doc_id_counter += 1
            else:
                purged_docs.append(i)
            if i > 0 and i % write_every == 0:
                temp_df = pd.DataFrame(data)
                temp_df.to_csv(file_out_path + "/skipgrams.txt",
                               sep="\t",
                               index=False,
                               header=None,
                               mode="a")
                del temp_df
                data = []
            if i % 500 == 0:
                print("step", i, "of", num_examples)
        temp_df = pd.DataFrame(data)
        temp_df.to_csv(file_out_path + "/skipgrams.txt",
                       sep="\t",
                       index=False,
                       header=None,
                       mode="a")
        del temp_df

        # Save embed matrix
        np.save(file_out_path + "/embed_matrix", embed_matrix)
        # Save the doc lengths to be used later; also purge those that didn't make it into the skipgram function
        np.save(file_out_path + "/doc_lengths",
                np.delete(doc_lengths, np.array(purged_docs)))
        # Save frequencies to file
        np.save(file_out_path + "/freqs", freqs)
        # Save vocabulary dictionaries to file
        idx_to_word_out = open(file_out_path + "/" + "idx_to_word.pickle",
                               "wb")
        pickle.dump(SP.idx_to_word, idx_to_word_out)
        idx_to_word_out.close()
        word_to_idx_out = open(file_out_path + "/" + "word_to_idx.pickle",
                               "wb")
        pickle.dump(SP.word_to_idx, word_to_idx_out)
        word_to_idx_out.close()
Example #22
vector_dim = 300
# Number of epochs
epochs = 2

# For validation to monitor performance
# pick 16 words 
valid_size = 16
# Pick from the top 100 words 
valid_window = 100  
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

# skip-gram function 
sampling_table = sequence.make_sampling_table(vocab_size)
# target and context words are returned in 'couples' and labels indicates if
# the pair is a positive or negative sample
couples, labels = skipgrams(data, vocab_size, window_size = window_size, 
                            sampling_table=sampling_table)
word_target, word_context = zip(*couples)
word_target = np.array(word_target, dtype="int32")
word_context = np.array(word_context, dtype="int32")

print(couples[:10], labels[:10])

# The inputs will be the target word and the context word 
input_target = Input((1,))
input_context = Input((1,))
# The inputs will feed into an embedding layer
# Number of rows = vocabulary size 
# Number of columns = dimension of the vector 
embedding = Embedding(vocab_size, vector_dim, input_length=1, name='embedding')
# The target and context words 
target = embedding(input_target)
from __future__ import print_function

from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import skipgrams

text = "I love green eggs and ham ."

tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

word2id = tokenizer.word_index
id2word = {v: k for k, v in word2id.items()}

wids = [word2id[w] for w in text_to_word_sequence(text)]
pairs, labels = skipgrams(wids, len(word2id), window_size=1)
print(len(pairs), len(labels))
for i in range(10):
    print("({:s} ({:d}), {:s} ({:d})) -> {:d}".format(
        id2word[pairs[i][0]], pairs[i][0],
        id2word[pairs[i][1]], pairs[i][1],
        labels[i]))
# create dataset: word pairs and doc ids with positive and negative samples
window_size = 2
targets = []
contexts = []
labels = []
couples = []
doc_ids = []

for i in range(0,n_documents):
    if i % 1000 == 0 and i > 0:
        print (i)
    seq = sequences[i]
    sampling_table = sequence.make_sampling_table(vocab_size)
    couple, label = skipgrams(seq, 
                              vocab_size, 
                              window_size=window_size, 
                              sampling_table=sampling_table)
    if not couple:
        continue
    try:
        target, context = zip(*couple)
        targets = targets + list(target)
        contexts = contexts + list(context)
        doc_ids = doc_ids + [i]*len(context)
        labels = labels + label
        couples = couples + couple
    except:
        print ("Error on " + str(seq))
    
data_target  = np.array(targets, dtype='int32')
data_context = np.array(contexts, dtype='int32')
Example #25
def main():
    list_of_words, words, word_count = readDataBible()
    '''
        list_of_words contains the words in the order in which they appear in the text.
        word_count is the number of unique words in the vocabulary.
        words is a dictionary: the key is a word and the value is the number of occurrences of that word.
    '''

    embedding_size = 64
    batch_size = 64
    window_size = 5

    # Reserve index 0 (Keras treats it as a non-word), so word ids run from 1 to word_count
    vocabulary_size = word_count + 1
    dec_word_list = sorted(words.items(),
                           key=operator.itemgetter(1),
                           reverse=True)
    # Map each word to an integer id, ordered by descending frequency
    dic = {}
    for i, word in enumerate(dec_word_list):
        dic[word[0]] = i + 1

    reverse_dic = {}
    for word in dic:
        reverse_dic[dic[word]] = word
    '''
    print(vocabulary_size)
    print(dec_word_list)
    print(dic)
    print(reverse_dic)
    '''
    for i in range(len(list_of_words)):
        list_of_words[i] = dic[list_of_words[i]]

    sampling_table = sequence.make_sampling_table(vocabulary_size)
    couples, labels = skipgrams(list_of_words,
                                vocabulary_size,
                                window_size=window_size,
                                sampling_table=sampling_table)
    word_target, word_context = zip(*couples)
    word_target = np.array(word_target, dtype="int32")
    word_context = np.array(word_context, dtype="int32")

    print(couples[:10], labels[:10])

    input_target = Input((1, ))
    input_context = Input((1, ))

    embedding = Embedding(vocabulary_size,
                          embedding_size,
                          input_length=1,
                          name='embedding')
    target = embedding(input_target)
    target = Reshape((embedding_size, 1))(target)
    context = embedding(input_context)
    context = Reshape((embedding_size, 1))(context)

    # now perform the dot product operation to get a similarity measure
    dot_product = merge([target, context], mode='dot', dot_axes=1)
    dot_product = Reshape((1, ))(dot_product)
    # add the sigmoid output layer
    output = Dense(1, activation='sigmoid')(dot_product)
    # create the primary training model
    model = Model(input=[input_target, input_context], output=output)
    model.compile(loss='binary_crossentropy', optimizer='rmsprop')

    # setup a cosine similarity operation which will be output in a secondary model
    similarity = merge([target, context], mode='cos', dot_axes=0)
    # create a secondary validation model to run our similarity checks during training
    validation_model = Model(input=[input_target, input_context],
                             output=similarity)
Example #26
    if i > 50000: break
    vocabulary += list(item)
    texts.append(item)

vocabulary = set(vocabulary)
vocab_size = len(vocabulary)
print('vocabulary size: ', vocab_size)

word2id, id2word = {}, {}
for idx, word in enumerate(vocabulary):
    word2id[word] = idx
    id2word[idx] = word

text_seq = [word2id[word] for text in texts for word in text]

X_train, y_train = skipgrams(text_seq, len(word2id), window_size=4)
X_train, y_train = np.array(X_train), np.array(y_train)

print('show 5 training examples...')
for i in range(5):
    print(X_train[i], y_train[i])


def build_skipgram_model():
    input1 = Input(shape=(1, ))
    embeddings1 = Embedding(vocab_size, embedding_dims)(input1)
    output1 = Flatten()(embeddings1)

    input2 = Input(shape=(1, ))
    embeddings2 = Embedding(vocab_size, embedding_dims)(input2)
    output2 = Flatten()(embeddings2)
Example #27
from keras.preprocessing.sequence import skipgrams


def my_test_skipgrams():
    couples, labels = skipgrams([0, 1, 2, 3], vocabulary_size=4)
    print("couples: ", couples)
    print("labels: ", labels)
Example #28
# Convert text to numerical sequences

# Note that the Tokenizer starts numbering words with 1.  So we have vocabulary_size+1 words.  The 0-th word
# is considered to be the 'Out-of-vocabulary' token.
tokenizer = Tokenizer(num_words=vocabulary_size+1, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~ ', lower=True, split=' ',)
tokenizer.fit_on_texts(animal_corpus)
sequences = tokenizer.texts_to_sequences(animal_corpus)

# Generate (target, context) pairs with negative sampling

pairs = []
labels = []

for this_sequence in sequences:
    # Again note the vocabulary_size+1 expression
    c, l = skipgrams(this_sequence, vocabulary_size+1, window_size=WINDOW_SIZE, negative_samples=1, shuffle=True)
    for i in range(len(c)):
        pairs.append(c[i])
        labels.append(l[i])
pairs = np.array(pairs)
labels = np.array(labels)
    
print("There are {} (context,target) pairs in the dataset".format(len(pairs)))


from keras.layers import Embedding, Input, Dense, Reshape
from keras.layers.merge import Dot
from keras.models import Model
from keras.optimizers import RMSprop
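
The example ends with the layer imports; a minimal sketch of the kind of dot-product model those imports are typically combined into, continuing from the pairs and labels arrays built above (EMBEDDING_DIM, the epoch count and the batch size are assumptions, not from the original):

EMBEDDING_DIM = 100  # assumed embedding size

input_target = Input((1,))
input_context = Input((1,))

# Shared embedding with vocabulary_size+1 rows, matching the Tokenizer
# convention described above (index 0 is the out-of-vocabulary token).
embedding = Embedding(vocabulary_size + 1, EMBEDDING_DIM, input_length=1)

target = Reshape((EMBEDDING_DIM,))(embedding(input_target))
context = Reshape((EMBEDDING_DIM,))(embedding(input_context))

# Dot product followed by a sigmoid: probability that the pair is a true couple.
dot_product = Dot(axes=1)([target, context])
output = Dense(1, activation='sigmoid')(dot_product)

model = Model([input_target, input_context], output)
model.compile(loss='binary_crossentropy', optimizer=RMSprop())

# pairs[:, 0] / pairs[:, 1] and labels were built above.
model.fit([pairs[:, 0], pairs[:, 1]], labels, epochs=5, batch_size=256)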

Example #29
    return corpus_encoded, dict_3, dict_2


corpus_encoded, reverse_dict, corpus_dict = format_corpus(
    corpus, corpus_length)

window_size = 3
vector_dim = 600
epochs = 2000000

print(len(corpus_encoded), len(corpus))

sampling_table = sequence.make_sampling_table(corpus_length)
couples, labels = sequence.skipgrams(corpus_encoded,
                                     corpus_length,
                                     window_size=window_size,
                                     sampling_table=sampling_table)
word_target, word_context = zip(*couples)
word_target = np.array(word_target, dtype="int32")
word_context = np.array(word_context, dtype="int32")

# print(couples[:10], labels[:10])

from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Reshape
from tensorflow.keras.layers import dot
from tensorflow.keras.layers import multiply
from tensorflow.keras.layers import subtract
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Model
Example #30
def main():
    """
    Main function documentation template
    :return: None
    :rtype: None
    """
    logging.basicConfig(level=logging.DEBUG)

    # Read in text
    text_path = '../data/alice_in_wonderland.txt'
    text = open(text_path, 'r').read()
    logging.info('Read {} characters from {}'.format(len(text), text_path))

    # Change text to sequence of indices
    vectorizer = transformations.EmbeddingVectorizer()
    indices = vectorizer.fit_transform([[text]])

    vocab_size = numpy.max(indices) + 1

    # Change sequence of indices to skipgram training pairs and T/F labels (e.g. [['project', 'gutenberg'], True])
    # TODO There must be a better way of getting a 1d array
    X, y = skipgrams(indices.tolist()[0],
                     vocabulary_size=vocab_size,
                     window_size=4,
                     categorical=True)
    X = numpy.array(X)
    y = numpy.array(y)
    logging.info('X shape: {}, y shape: {}'.format(X.shape, y.shape))

    # Create architecture
    # TODO Should be two separate inputs, rather than a timeseries w/ 2 time steps
    input_layer = Input(shape=(2, ), name='text_input')
    x = input_layer
    x = Embedding(input_dim=vocab_size,
                  output_dim=50,
                  input_length=2,
                  name='text_embedding')(x)
    x = Flatten()(x)
    x = Dense(2, activation='softmax', name='output')(x)

    model = Model(input_layer, x)

    model.compile(optimizer='Adam', loss='categorical_crossentropy')

    # Train architecture
    callbacks = [
        TensorBoard(
            os.path.expanduser('~/.logs/' + str(datetime.datetime.now())))
    ]
    model.fit(X,
              y,
              epochs=5,
              validation_split=.1,
              callbacks=callbacks,
              batch_size=2**13)

    embedding = model.get_layer('text_embedding')
    weights = embedding.get_weights()[0]
    print(weights)
    print(weights.shape)
    print(type(weights))

    # Store weights
    pickle.dump(weights, open('custom_embedding.pkl', 'wb'))
    pickle.dump(vectorizer.token_index_lookup,
                open('custom_vocab_index.pkl', 'wb'))
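
A short follow-up sketch of reading the two pickles written above back in and looking up one token's vector; it assumes token_index_lookup maps token to row index, and 'alice' is just an example key that may not be in the vocabulary:

import pickle

weights = pickle.load(open('custom_embedding.pkl', 'rb'))
token_index_lookup = pickle.load(open('custom_vocab_index.pkl', 'rb'))

# Look up one token's embedding row (assuming the mapping is token -> index).
idx = token_index_lookup['alice']
print(weights[idx])
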
Example #31

docWords, freq, dictionary, inverseDict = collectDataset()

windowSize, vectorDim, epochs = 5, 300, 70000

valSize, valWindow = 20, 120  #5, 10
valExamples = np.random.choice(valWindow, valSize, replace=False)

wordTargetArray = np.zeros((1, ))
wordContextArray = np.zeros((1, ))
labelsArray = np.zeros((1, ))

sampleTable = make_sampling_table(vocabSize)
pairs, labels = skipgrams(docWords,
                          vocabSize,
                          window_size=windowSize,
                          sampling_table=sampleTable)
wordTarget, wordContext = zip(*pairs)
wordTarget = np.array(wordTarget, dtype="int32")
wordContext = np.array(wordContext, dtype="int32")

inputTarget, inputContext = Input((1, )), Input((1, ))
embedding = Embedding(vocabSize, vectorDim, input_length=1, name='embedding')
target = embedding(inputTarget)
target = Reshape((vectorDim, 1))(target)
context = embedding(inputContext)
context = Reshape((vectorDim, 1))(context)

similarity = merge([target, context], mode='cos', dot_axes=0)

dotProduct = merge([target, context], mode='dot', dot_axes=1)
Example #32
o = Reshape((1, ), input_shape=(1, 1))(o)
o = Activation('sigmoid')(o)

SkipGram = Model(inputs=[w_inputs, c_inputs], outputs=o)
SkipGram.summary()
SkipGram.compile(loss='binary_crossentropy', optimizer='adam')

from math import ceil

batch_size = 256

for _ in range(5):
    loss = 0.
    for i, doc in enumerate(tokenizer.texts_to_sequences(corpus)):
        data, labels = skipgrams(sequence=doc,
                                 vocabulary_size=V,
                                 window_size=5,
                                 negative_samples=5.)
        x = [np.array(x).reshape(-1, 1) for x in zip(*data)]
        y = np.array(labels, dtype=np.int32)
        if x:
            for j in range(int(len(x) / batch_size)):
                x_batch = [
                    x[0][j * batch_size:(j + 1) * batch_size],
                    x[1][j * batch_size:(j + 1) * batch_size]
                ]
                loss += SkipGram.train_on_batch(
                    x_batch, y[j * batch_size:(j + 1) * batch_size])

    print(loss)

f = open('vectors_negative_sampling.txt', 'w')
Example #33
def dumb_word_embedding_shit_idegaf(vocab):
    window_size = 2
    vector_dim = embedding_dim
    epochs = 50000

    global valid_size
    global valid_examples
    valid_size = 6  # Random set of words to evaluate similarity on.
    valid_window = 10  # Only pick dev samples in the head of the distribution.
    valid_examples = np.random.choice(valid_window, valid_size, replace=False)

    sampling_table = sequence.make_sampling_table(len(vocab))
    print(sampling_table)
    couples, labels = sequence.skipgrams(data,
                                         len(vocab),
                                         window_size=window_size,
                                         sampling_table=sampling_table)
    word_target, word_context = zip(*couples)
    word_target = np.array(word_target, dtype="int32")
    word_context = np.array(word_context, dtype="int32")

    print(couples[:10], labels[:10])

    # create some input variables
    input_target = Input((1, ))
    input_context = Input((1, ))

    embedding = Embedding(len(vocab),
                          vector_dim,
                          input_length=1,
                          name='embedding')
    target = embedding(input_target)
    target = Reshape((vector_dim, 1))(target)
    context = embedding(input_context)
    context = Reshape((vector_dim, 1))(context)

    # set up a cosine similarity operation, used by the secondary validation model
    similarity = keras.layers.dot([target, context], axes=1, normalize=True)

    # plain dot product between target and context, used as the training signal
    dot_product = keras.layers.dot([target, context], axes=1)
    dot_product = Reshape((1, ))(dot_product)
    # add the sigmoid output layer
    output = Dense(1, activation='sigmoid')(dot_product)

    # create the primary training model
    model = Model(inputs=[input_target, input_context], outputs=output)
    model.compile(loss='binary_crossentropy', optimizer='rmsprop')

    # create a secondary validation model to run our similarity checks during training
    global validation_model
    validation_model = Model(inputs=[input_target, input_context],
                             outputs=similarity)

    arr_1 = np.zeros((1, ))
    arr_2 = np.zeros((1, ))
    arr_3 = np.zeros((1, ))
    for cnt in range(epochs):
        idx = np.random.randint(0, len(labels) - 1)
        arr_1[0, ] = word_target[idx]
        arr_2[0, ] = word_context[idx]
        arr_3[0, ] = labels[idx]
        loss = model.train_on_batch([arr_1, arr_2], arr_3)
        if cnt % 100 == 0:
            print("Iteration {}, loss={}".format(cnt, loss))
        if cnt % 10000 == 0:
            sim_cb.run_sim()

    sim_cb.run_sim()

    model.save("dumb_embeddings")

    return model
    return data, count, dictionary, reverse_dictionary

vocab_size = 10000
data, count, dictionary, reverse_dictionary = collect_data(vocabulary_size=vocab_size)
print(data[:7])

window_size = 3
vector_dim = 300
epochs = 200000

valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

sampling_table = sequence.make_sampling_table(vocab_size)
couples, labels = skipgrams(data, vocab_size, window_size=window_size, sampling_table=sampling_table)
word_target, word_context = zip(*couples)
word_target = np.array(word_target, dtype="int32")
word_context = np.array(word_context, dtype="int32")

print(couples[:10], labels[:10])

# create some input variables
input_target = Input((1,))
input_context = Input((1,))

embedding = Embedding(vocab_size, vector_dim, input_length=1, name='embedding')
target = embedding(input_target)
target = Reshape((vector_dim, 1))(target)
context = embedding(input_context)
context = Reshape((vector_dim, 1))(context)
Exemple #35
0
    stream_gen = stream_generator(args.input, meta=meta, field=args.field, session=True)
    context_streamer = SessionContextStreamer(stream_gen)

    feature_map = read_feature_map(args.sampling_table)
    sampling_table = read_sampling_table(args.sampling_table)

    skipgram_streamer = SkipGramStreamer(context_streamer, neg_pairs=args.neg_pairs, window_size=args.window_size,
                                         feature_map=feature_map, sampling_table=sampling_table)
    batch_streamer = Batch2BatchStreamer(skipgram_streamer, batch_size=args.batch_size)

    num_features = len(sampling_table)

    for i, seq in enumerate(context_streamer):
        # get skipgram couples for one text in the dataset
        couples, labels = sequence.skipgrams(seq, num_features)
        if couples:
            X1, X2 = zip(*couples)
            X1 = np.array(X1, dtype="int32")
            X2 = np.array(X2, dtype="int32")
            loss = model.train_on_batch([X1, X2], labels)

# -*- coding: utf-8 -*-
from keras.preprocessing.text import *
from keras.preprocessing.sequence import skipgrams

text = "I love green eggs and ham ."

tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

word2id = tokenizer.word_index
id2word = {v: k for k, v in word2id.items()}

wids = [word2id[w] for w in text_to_word_sequence(text)]
pairs, labels = skipgrams(wids, len(word2id))
print(len(pairs), len(labels))
for i in range(10):
    print("({:s} ({:d}), {:s} ({:d})) -> {:d}".format(id2word[pairs[i][0]],
                                                      pairs[i][0],
                                                      id2word[pairs[i][1]],
                                                      pairs[i][1], labels[i]))
def sg(sentence):
    return seq.skipgrams(sentence, vocab_size,
                         window_size=np.random.randint(window - 1) + 1,
                         negative_samples=nb_negative_samples)
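# sg() draws a random window size in [1, window - 1] per sentence, a word2vec-style
# trick that weights nearby context words more heavily. A small usage sketch,
# assuming `sentences` is a list of word-id lists:
all_couples, all_labels = [], []
for sentence in sentences:
    c, l = sg(sentence)
    all_couples += c
    all_labels += l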
wordDict, wordWeights = getPretrainedEmbeddings(wordDict,globalWordTokens,globalWordVectors)
## Add some noise for context vectors since wordVecs != contextVecs
contextWeights = wordWeights + np.random.uniform()
del globalWordVectors, globalWordTokens

## Convert wordList to list of integers
vocab_size = len(sampleMatrix)
wordListInts = []
for item in wordList:
    wordListInts.append(wordDict[item])
del wordList
print('Generating Skipgram pairs for training...will take a while...')

## TODO : Make samplingMatrix realistic for freq-sampling and remove next line
sampleMatrix = None
couples, labels = skipgrams(wordListInts, vocab_size,
                            window_size=int(options.window),
                            negative_samples=int(options.nsamples),
                            shuffle=True, categorical=False,
                            sampling_table=sampleMatrix)


## This part taken from @zachmayer
model_word = Sequential()
model_word.add(Embedding(vocab_size, veclen, input_length=1, weights = (wordWeights,)))
model_word.add(Reshape((1,veclen)))

model_context = Sequential()
model_context.add(Embedding(vocab_size,veclen, input_length=1, weights = (contextWeights,)))
model_context.add(Reshape((1,veclen,)))

model = Sequential()
model.add(Merge([model_word,model_context], mode='dot',dot_axes=2))
model.add(Flatten())
model.add(Dense(1))
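# The two-tower Merge model above is cut off before compilation. Under the same
# Keras 1 era API it could be finished roughly like this, reusing the `couples`
# and `labels` produced by skipgrams() earlier (a sketch, not the original code):
from keras.layers import Activation

model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam')

X = [np.array(side, dtype='int32').reshape(-1, 1) for side in zip(*couples)]
y = np.array(labels, dtype='int32')
model.fit(X, y, batch_size=256, nb_epoch=5)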
    # assuming sklearn.manifold's TSNE (import not shown in this excerpt); any
    # leading keyword arguments cut off by the excerpt are omitted here
    tsne_model_en_2d = TSNE(n_components=2,
                            init='pca',
                            n_iter=3500,
                            random_state=32)
    embeddings_en_2d = np.array(
        tsne_model_en_2d.fit_transform(embedding_clusters.reshape(n * m,
                                                                  k))).reshape(
                                                                      n, m, 2)
    tsne_plot_similar_words('Similar words from vocabulary', keys,
                            embeddings_en_2d, word_clusters, 0.7)


class Visualiser(Callback):
    def on_epoch_end(self, epoch, logs=None):
        prep_for_plot()


couples, labels = skipgrams(data,
                            vocabulary_size=data_size,
                            window_size=3,
                            sampling_table=make_sampling_table(data_size))
word_target, word_context = zip(*couples)
word_target = np.array(word_target, dtype="int32")
word_context = np.array(word_context, dtype="int32")

model.fit([word_target, word_context],
          labels,
          epochs=5,
          callbacks=[Visualiser()])
Exemple #41
0
    def train(self, model_config, wids, word2id):
        vocab_size = len(word2id.values())
        sampling_table = sequence.make_sampling_table(vocab_size)
        wids_flat = [word for sentence in wids for word in sentence]
        couples, labels = skipgrams(wids_flat, vocab_size, window_size=model_config['window_size'], sampling_table=sampling_table)
        word_target, word_context = zip(*couples)
        word_target = np.array(word_target, dtype="int32")
        word_context = np.array(word_context, dtype="int32")

        input_target = Input((1,))
        input_context = Input((1,))
        vector_dim = model_config['number_of_dimensions_in_hidden_layer']

        embedding = Embedding(vocab_size, vector_dim, input_length=1, name='embedding')
        target = embedding(input_target)
        target = Reshape((vector_dim, 1))(target)
        context = embedding(input_context)
        context = Reshape((vector_dim, 1))(context)

        # setup a cosine similarity operation which will be output in a secondary model
        similarity = merge([target, context], mode='cos', dot_axes=0)

        # now perform the dot product operation to get a similarity measure
        dot_product = merge([target, context], mode='dot', dot_axes=1)
        dot_product = Reshape((1,))(dot_product)
        # add the sigmoid output layer
        output = Dense(1, activation='sigmoid')(dot_product)
        # create the primary training model
        model = Model(input=[input_target, input_context], output=output)
        model.compile(loss='binary_crossentropy', optimizer='rmsprop')

        validation_model = Model(input=[input_target, input_context], output=similarity)

        class SimilarityCallback:
            def run_sim(self):
                valid_size = 16  # Random set of words to evaluate similarity on.
                valid_window = 100  # Only pick dev samples in the head of the distribution.
                valid_examples = np.random.choice(valid_window, valid_size, replace=False)
                reverse_dictionary = dict(zip(word2id.values(), word2id.keys()))
                for i in range(valid_size):
                    valid_word = reverse_dictionary[valid_examples[i]]
                    top_k = 8  # number of nearest neighbors
                    sim = self._get_sim(valid_examples[i])
                    nearest = (-sim).argsort()[1:top_k + 1]
                    log_str = 'Nearest to %s:' % valid_word
                    for k in range(top_k):
                        close_word = reverse_dictionary[nearest[k]]
                        log_str = '%s %s,' % (log_str, close_word)
                    print(log_str)

            @staticmethod
            def _get_sim(valid_word_idx):
                sim = np.zeros((vocab_size,))
                in_arr1 = np.zeros((1,))
                in_arr2 = np.zeros((1,))
                in_arr1[0,] = valid_word_idx
                for i in range(vocab_size):
                    in_arr2[0,] = i
                    out = validation_model.predict_on_batch([in_arr1, in_arr2])
                    sim[i] = out
                return sim

        sim_cb = SimilarityCallback()

        arr_1 = np.zeros((1,))
        arr_2 = np.zeros((1,))
        arr_3 = np.zeros((1,))
        for cnt in range(model_config['epochs']):
            idx = np.random.randint(0, len(labels) - 1)
            arr_1[0,] = word_target[idx]
            arr_2[0,] = word_context[idx]
            arr_3[0,] = labels[idx]
            loss = model.train_on_batch([arr_1, arr_2], arr_3)
            if cnt % 100 == 0:
                print("Iteration {}, loss={}".format(cnt, loss))
            # if cnt % 10000 == 0:
            #     sim_cb.run_sim()

        return model, model.get_weights()[0]
Exemple #42
0
word2id = tokenizer.word_index
id2word = {v: k for k, v in word2id.items()}

vocab_size = len(word2id) + 1
embed_size = 100

wids = [[word2id[w] for w in text.text_to_word_sequence(doc)]
        for doc in norm_bible]
print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(word2id.items())[:10])

from keras.preprocessing.sequence import skipgrams

# generate skip-grams
skip_grams = [
    skipgrams(wid, vocabulary_size=vocab_size, window_size=10) for wid in wids
]

from keras.layers import Merge
from keras.layers.core import Dense, Reshape
from keras.layers.embeddings import Embedding
from keras.models import Sequential

# build skip-gram architecture
word_model = Sequential()
word_model.add(
    Embedding(vocab_size,
              embed_size,
              embeddings_initializer="glorot_uniform",
              input_length=1))
word_model.add(Reshape((embed_size, )))
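# Exemple #42 ends after the word tower. In the same style as the imports above,
# the usual continuation is a matching context tower merged by a dot product;
# a sketch:
context_model = Sequential()
context_model.add(
    Embedding(vocab_size,
              embed_size,
              embeddings_initializer="glorot_uniform",
              input_length=1))
context_model.add(Reshape((embed_size, )))

model = Sequential()
model.add(Merge([word_model, context_model], mode="dot"))
model.add(Dense(1, activation="sigmoid"))
model.compile(loss="binary_crossentropy", optimizer="rmsprop")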
Exemple #43
0
def process(args):

    print "Loading graph..."
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    elif args.format == "mat":
        G = graph.load_matfile(args.input,
                               variable_name=args.matfile_variable_name,
                               undirected=args.undirected)
    else:
        raise Exception(
            "Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'"
            % args.format)

    print("Number of nodes: {}".format(len(G.nodes())))

    num_walks = len(G.nodes()) * args.number_walks

    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length

    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        #print("Walking...")
        #walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
        #                                    path_length=args.walk_length, alpha=0, rand=random.Random(args.seed))
        print("Training...")
        max_features = len(G.nodes())  # vocabulary size
        dim_proj = args.representation_size  # embedding space dimension
        nb_epoch = 1  # number of training epochs

        # Neural network ( in Keras )
        model = Sequential()
        model.add(
            WordContextProduct(max_features, proj_dim=dim_proj,
                               init="uniform"))
        model.compile(loss='mse', optimizer='rmsprop')
        sampling_table = sequence.make_sampling_table(max_features)

        print("Fitting tokenizer on walks...")
        tokenizer = text.Tokenizer(nb_words=max_features)

        print "Epochs: %d" % nb_epoch
        #tokenizer.fit_on_texts( build_deepwalk_corpus_minibatch_iter(G, args.number_walks, args.walk_length))

        for e in range(nb_epoch):
            print('-' * 40)
            print('Epoch', e)
            print('-' * 40)

            #progbar = generic_utils.Progbar(tokenizer.document_count)
            samples_seen = 0
            losses = []

            #        for i, seq in enumerate(tokenizer.texts_to_sequences_generator( build_deepwalk_corpus_minibatch_iter(G, args.number_walks, args.walk_length) )):

            for i, seq in enumerate(
                    build_deepwalk_corpus_minibatch_iter(
                        G, args.number_walks, args.walk_length)):
                # get skipgram couples for one text in the dataset
                couples, labels = sequence.skipgrams(
                    seq,
                    max_features,
                    window_size=5,
                    negative_samples=1.,
                    sampling_table=sampling_table)
                if couples:
                    # one gradient update per sentence (one sentence = a few 1000s of word couples)
                    X = np.array(couples, dtype="int32")
                    print "Started fitting..."
                    loss = model.fit(X, labels)

                    print "Dumping..."

                    # Dump weights to a temp file
                    weights = model.layers[0].get_weights()[0]

                    norm_weights = np_utils.normalize(weights)

                    # TODO: save weights with indices
                    np.savetxt(args.output, norm_weights)

                    losses.append(loss)
                    if len(losses) % 100 == 0:
                        #                progbar.update(i, values=[("loss", np.mean(losses))])
                        losses = []
                    samples_seen += len(labels)
            print('Samples seen:', samples_seen)
        print("Training completed!")

    else:
        print(
            "Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk."
            .format(data_size, args.max_memory_data_size))
        print("Walking...")

        #TODO: IMPLEMENT THAT
        print "Not implemented yet..."
        sys.exit(1)

    print "Optimization done. Saving..."
    # recover the embedding weights trained with skipgram:
    weights = model.layers[0].get_weights()[0]

    # we no longer need this
    del model

    norm_weights = np_utils.normalize(weights)

    # TODO: save weights with indices
    np.savetxt(args.output, norm_weights)
    print "Saved!"
tokenizer.fit_on_texts(text_generator())

# ----- Training -----

model = Sequential()
model.add(WordContextProduct(max_features, proj_dim=dim_proj, init="uniform"))
model.compile(loss='mse', optimizer='rmsprop')

sampling_table = sequence.make_sampling_table(max_features)

for e in range(nb_epoch):
    print('Epoch:', e)
    progbar = generic_utils.Progbar(tokenizer.document_count)
    samples_seen, losses = 0, []
    for i, seq in enumerate(tokenizer.texts_to_sequences_generator(text_generator())):
        couples, labels = sequence.skipgrams(seq, max_features, window_size=4, negative_samples=1., sampling_table=sampling_table)
        if couples:
            X = np.array(couples, dtype="int32")
            loss = model.train_on_batch(X, labels)
            losses.append(loss)
            if len(losses) % 100 == 0:
                progbar.update(i, values=[("loss", np.mean(losses))])
                losses = []
            samples_seen += len(labels)

weights = model.layers[0].get_weights()[0]
weights[:skip_top] = np.zeros((skip_top, dim_proj))
norm_weights = np_utils.normalize(weights)
del model

word_index = tokenizer.word_index
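# With `word_index` and the normalised weights in hand, a small follow-up sketch
# for persisting one vector per word (the output filename here is hypothetical):
import json

word_vectors = {w: norm_weights[i].tolist()
                for w, i in word_index.items() if i < max_features}
with open('word_vectors.json', 'w') as out:
    json.dump(word_vectors, out)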
Exemple #45
0
    for i, line in enumerate(smart_open(fname, "r")):
        arr = line.strip().split(" ")

        #        if len(arr) < max_doc_len: continue

        seq = []

        for word in arr:
            if word not in top_vocab_list:
                seq.append(1)
            else:
                seq.append(top_vocab_list.index(word) + 2)

        temp = skipgrams(seq,
                         vocab_size,
                         window_size=window_size,
                         negative_samples=negative_samples)
        pairs += temp[0]
        labels += temp[1]
        #        print(temp)

        if i % 100000 == 0:
            for w_c, label in zip(pairs, labels):
                if len(w_c) > 0:
                    print(*w_c, time, label, sep="\t", file=fw)
            print("Processed {} sentence".format(i), file=sys.stderr)
            pairs, labels = [], []

            if i > max_sents: break

fw.close()