Example #1
    def __init__(self, sentences=None, size=300, alpha=0.025, window=8, min_count=5,
                 sample=0, seed=1, workers=1, min_alpha=0.0001, dm=1, hs=1, negative=0,
                 dm_mean=0, train_words=True, train_lbls=True, **kwargs):
        """
        Initialize the model from an iterable of `sentences`. Each sentence is a
        LabeledSentence object that will be used for training.

        The `sentences` iterable can be simply a list of LabeledSentence elements, but for larger corpora,
        consider an iterable that streams the sentences directly from disk/network.

        If you don't supply `sentences`, the model is left uninitialized -- use if
        you plan to initialize it in some other way.

        `dm` defines the training algorithm. By default (`dm=1`), distributed memory is used.
        Otherwise, `dbow` is employed.

        `size` is the dimensionality of the feature vectors.

        `window` is the maximum distance between the current and predicted word within a sentence.

        `alpha` is the initial learning rate (will linearly drop to zero as training progresses).

        `seed` = seed for the random number generator.

        `min_count` = ignore all words with total frequency lower than this.

        `sample` = threshold for configuring which higher-frequency words are randomly downsampled;
                default is 0 (off), a useful value is 1e-5.

        `workers` = use this many worker threads to train the model (=faster training with multicore machines).

        `hs` = if 1 (default), hierarchical softmax will be used for model training (else set to 0).

        `negative` = if > 0, negative sampling will be used; the value of `negative`
        specifies how many "noise words" should be drawn (usually between 5 and 20).

        `dm_mean` = if 0 (default), use the sum of the context word vectors. If 1, use the mean.
        Only applies when `dm=1` (distributed memory) is used.

        """
        Word2Vec.__init__(self, size=size, alpha=alpha, window=window, min_count=min_count,
                          sample=sample, seed=seed, workers=workers, min_alpha=min_alpha,
                          sg=(1+dm) % 2, hs=hs, negative=negative, cbow_mean=dm_mean, **kwargs)
        self.train_words = train_words
        self.train_lbls = train_lbls
        if sentences is not None:
            self.build_vocab(sentences)
            self.train(sentences)
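A minimal usage sketch for the constructor above, assuming it is the __init__ of the old gensim-style Doc2Vec class that pairs with LabeledSentence; the two-document corpus and label names are purely illustrative:

# Hypothetical tiny corpus; each document carries a label so its vector can be looked up later.
sentences = [LabeledSentence(words=['human', 'interface', 'computer'], labels=['SENT_0']),
             LabeledSentence(words=['graph', 'minors', 'survey'], labels=['SENT_1'])]
model = Doc2Vec(sentences, size=100, window=8, min_count=1, workers=2)
vector = model['SENT_0']  # in this old API, label vectors are stored alongside word vectors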
Example #2
File: wsp.py Project: harrhunt/WSP
 def wsp_similarity(cls, start: WSP, end: WSP):
     total_vector = np.zeros(shape=(300, ), dtype=np.float32)
     for word in start.onyms:
         if Word2Vec.contains(word):
             total_vector = np.add(Word2Vec.vector(word), total_vector)
     Word2Vec.add_vector(start.name, total_vector)
     return Word2Vec.similarity(start.name, end.word)
Example #3
    def __init__(self,
                 sentences,
                 model_file=None,
                 size=200,
                 alpha=0.025,
                 window=5,
                 min_count=5,
                 sample=0,
                 seed=1,
                 workers=16,
                 min_alpha=0.0001,
                 model="cb",
                 hs=1,
                 negative=0,
                 cbow_mean=0,
                 iteration=1,
                 word_learn=1,
                 init_adjust=True,
                 update_mode=0,
                 normalize_each_epoch=False):
        self.sg = 1 if model == "sg" or model == "dbow" else 0
        self.table = None  # for negative sampling --> this needs a lot of RAM! consider setting back to None before saving
        self.alpha = float(alpha)
        self.window = int(window)
        self.seed = seed
        self.sample = sample
        self.workers = workers
        self.min_alpha = min_alpha
        self.hs = hs
        self.negative = negative
        self.cbow_mean = int(cbow_mean)
        self.iteration = iteration
        self.word_learn = int(word_learn)
        self.layer1_size = size
        self.min_count = min_count
        self.sent_no_hash = {}  #mapping sent_id to index of self.sents
        self.sent_id_list = []  #mapping sent_no to sent_id
        self.sane_vec_len = 100000  #for sanity check
        self.sane_max_sim10 = 0.9  #for sanity check
        self.init_adjust = init_adjust  #for adjustment of initialization
        self.update_mode = update_mode  #0:SGD, 1: AdaGrad, 2:AdaDelta, (3:ADAM not implemented)
        self.normalize_each_epoch = normalize_each_epoch

        if sentences:
            if model_file:
                self.w2v = Word2Vec.load(model_file)
                self.vocab = self.w2v.vocab
                self.layer1_size = self.w2v.layer1_size
                self.build_vec(sentences, has_vocab=True)
            else:
                self.word_learn = 1
                self.w2v = Word2Vec(None, self.layer1_size, self.alpha,
                                    self.window, self.min_count, self.sample,
                                    self.seed, self.workers, self.min_alpha,
                                    self.sg, self.hs, self.negative,
                                    self.cbow_mean)
                self.build_vec(sentences, has_vocab=False)
            self.train_iteration(sentences, iteration=iteration)
Example #4
    def __init__(self, sentences=None, size=300, alpha=0.025, window=8, min_count=5,
                 sample=0, seed=1, workers=1, min_alpha=0.0001, dm=1, hs=1, negative=0,
                 dm_mean=0, train_words=True, train_lbls=True, **kwargs):
        """
        Initialize the model from an iterable of `sentences`. Each sentence is a
        LabeledSentence object that will be used for training.

        The `sentences` iterable can be simply a list of LabeledSentence elements, but for larger corpora,
        consider an iterable that streams the sentences directly from disk/network.

        If you don't supply `sentences`, the model is left uninitialized -- use if
        you plan to initialize it in some other way.

        `dm` defines the training algorithm. By default (`dm=1`), distributed memory is used.
        Otherwise, `dbow` is employed.

        `size` is the dimensionality of the feature vectors.

        `window` is the maximum distance between the current and predicted word within a sentence.

        `alpha` is the initial learning rate (will linearly drop to zero as training progresses).

        `seed` = seed for the random number generator.

        `min_count` = ignore all words with total frequency lower than this.

        `sample` = threshold for configuring which higher-frequency words are randomly downsampled;
                default is 0 (off), a useful value is 1e-5.

        `workers` = use this many worker threads to train the model (=faster training with multicore machines).

        `hs` = if 1 (default), hierarchical softmax will be used for model training (else set to 0).

        `negative` = if > 0, negative sampling will be used; the value of `negative`
        specifies how many "noise words" should be drawn (usually between 5 and 20).

        `dm_mean` = if 0 (default), use the sum of the context word vectors. If 1, use the mean.
        Only applies when `dm=1` (distributed memory) is used.

        """
        Word2Vec.__init__(self, size=size, alpha=alpha, window=window, min_count=min_count,
                          sample=sample, seed=seed, workers=workers, min_alpha=min_alpha,
                          sg=(1+dm) % 2, hs=hs, negative=negative, cbow_mean=dm_mean, **kwargs)
        self.train_words = train_words
        self.train_lbls = train_lbls
        if sentences is not None:
            self.build_vocab(sentences)
            self.train(sentences)
Example #5
def trainWord2Vec(doc_list=None,
                  buildvoc=1,
                  passes=10,
                  sg=1,
                  size=100,
                  dm_mean=0,
                  window=5,
                  hs=0,
                  negative=5,
                  min_count=1,
                  workers=1):
    model = Word2Vec(size=size,
                     sg=sg,
                     window=window,
                     hs=hs,
                     negative=negative,
                     min_count=min_count,
                     workers=workers,
                     compute_loss=True)

    if buildvoc == 1:
        print('Building Vocabulary')
        model.build_vocab(doc_list)  # build vocabulary with words + nodeID

    for epoch in range(passes):
        print('Iteration %d ....' % epoch)
        # shuffle(doc_list)  # shuffling gets best results

        model.train(doc_list, total_examples=len(doc_list), epochs=1)
        print(model.running_training_loss)

    print(model.sg, model.window, model.hs, model.min_count)
    print('batch words', model.batch_words)
    return model
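A hedged call sketch for trainWord2Vec; doc_list is assumed to be an in-memory list of token lists (anything gensim's build_vocab/train accept), and the toy corpus below is made up:

corpus = [['node_1', 'links', 'node_2'], ['node_2', 'cites', 'node_3']]  # illustrative documents
model = trainWord2Vec(doc_list=corpus, passes=3, size=100, window=5, workers=1)
print(model.wv.most_similar('node_1', topn=3))  # assumes a gensim version that exposes .wv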
Example #6
	def init_embedder(dataset):
		'''
		initialize the embedder by load it from file if available
		or build the model by the dataset and save it
		'''
	
		fname = DIR_MODEL + '%s_embedder.pkl'%(prefix)

		if os.path.exists(fname):
			print >> sys.stderr, 'embedding model %s found and loaded'%(fname)
			return Word2Vec.load(fname)
		else:
			class x_iterator:
				def __init__(self, dataset):	
					self.dataset = dataset
				
				def __iter__(self):
					for set_x, set_y in self.dataset:
						for x in set_x:
							yield x

			embedder = Word2Vec()
			embedder.build(x_iterator(dataset), dim_proj)
			embedder.dump(fname)
	
		return embedder
Example #7
 def __init__(self):
     self.db = MySQLdb.connect(host="127.0.0.1",
                               user="******",
                               passwd="wmmkscsie",
                               db="recommender_system",
                               charset="utf8")
     self.cursor = self.db.cursor()
     # sql = "SELECT a.relationship_type, a.scenario_type, b.id, b.scenario_e2v_bert FROM movies as a, movies_vector as b Where a.id=b.id and a.id >= 1 and a.id <= 1171 and b.scenario_e2v_bert !=''"
     sql = "SELECT a.relationship_type, a.scenario_type, b.id, b.scenario_e2v_w2v_sg FROM movies as a, movies_vector as b Where a.id=b.id and a.id >= 1 and a.id <= 1171 and b.scenario_e2v_w2v_sg !=''"
     print(sql)
     self.cursor.execute(sql)
     self.movies_information = self.cursor.fetchall()
     # Relationship Model
     #######################
     # self.model = CNN_E2V_BERT()
     # For Produce Vector
     # self.bert_embedding = BertEmbedding(model = 'bert_12_768_12', dataset_name='wiki_cn', max_seq_length = 50)
     # self.relationship_e2v_bert = []
     # self.scenario_e2v_bert = []
     #######################
     self.model = CNN_E2V_W2V_SG()
     # create a Word2Vec object
     self.t = Word2Vec()
     self.t.train_file_setting("segmentation.txt", "e2v_w2v_sg")
     self.t.load_model()
     self.dimension = self.t.size
     self.relationship_e2v_w2v_sg = []
     self.scenario_e2v_w2v_sg = []
Example #8
def main():
    tagged_words = brown.tagged_words()
    words_corpus = brown.words()

    word2vec = Word2Vec()
    word2vec.train(words_corpus)

    word_vecs = [word2vec.word2vec(word) for word in words_corpus]

    n_clusters = 10 # random number for now
    kmeans = KMeans(n_clusters)
    kmeans.compute(word_vecs)

    # word-cluster HMM
    p_word = {}
    p_cluster = {}

    p_cluster_given_word = None # softmax
    p_word_given_cluster = None # joint probability formula

    p_transition_cluster = None # count
    p_initial_cluster = None # count

    # cluster-tag HMM
    p_cluster_given_tag = None # softmax
    p_transition_tag = None # count from tagged data
    p_initial_tag = None # count from tagged data

    hmm_word_cluster = HMM(p_initial_cluster, p_transition_cluster, p_word_given_cluster)
    hmm_cluster_tag = HMM(p_initial_tag, p_transition_tag, p_cluster_given_tag)

    words = []
    clusters = hmm_word_cluster.viterbi(words)
    tags = hmm_cluster_tag.viterbi(clusters)
Example #9
    def test_skipgram(self):
        skipgram = Word2Vec(learning_rate=self.learning_rate)
        W1_m, W2_m, loss_m = skipgram.skipgram(np.asmatrix(self.context_words), np.asmatrix(self.center_word), self.W1, self.W2, 0.)

        with tf.name_scope("skipgram"):
            x = tf.placeholder(shape=[self.V, 1], dtype=tf.float32, name="x")
            W1_tf = tf.Variable(self.W1, dtype=tf.float32)
            W2_tf = tf.Variable(self.W2, dtype=tf.float32)
            h = tf.matmul(tf.transpose(W1_tf), x)
            u = tf.stack([tf.matmul(tf.transpose(W2_tf), h) for i in range(len(self.context_words))])
            loss_tf = -tf.reduce_sum([u[i][int(np.where(c == 1)[0])]
                                      for i, c in zip(range(len(self.context_words)), self.context_words)], axis=0)\
                      + tf.reduce_sum(tf.log(tf.reduce_sum(tf.exp(u), axis=1)), axis=0)

            grad_W1, grad_W2 = tf.gradients(loss_tf, [W1_tf, W2_tf])

        init = tf.global_variables_initializer()
        with tf.Session() as sess:
            sess.run(init)
            W1_tf, W2_tf, loss_tf, dW1_tf, dW2_tf = sess.run([W1_tf, W2_tf, loss_tf, grad_W1, grad_W2],
                                                             feed_dict={x: self.center_word.reshape(self.V, 1)})

        W1_tf -= self.learning_rate * dW1_tf
        W2_tf -= self.learning_rate * dW2_tf

        for i in range(self.V):
            for j in range(self.N):
                self.assertAlmostEqual(W1_m[i, j], W1_tf[i, j], places=5)

        for i in range(self.N):
            for j in range(self.V):
                self.assertAlmostEqual(W2_m[i, j], W2_tf[i, j], places=5)

        self.assertAlmostEqual(loss_m, float(loss_tf), places=5)
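For reference, the quantity this test reproduces in TensorFlow is the full-softmax skip-gram loss for one center word with C context words, where h = W1^T x and u = W2^T h; in LaTeX:

L = -\sum_{c=1}^{C} u_{w_c} + C \log \sum_{j=1}^{V} e^{u_j}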
Example #10
    def test_cbow(self):
        cbow = Word2Vec(learning_rate=self.learning_rate)
        W1_m, W2_m, loss_m = cbow.cbow(np.asmatrix(self.context_words), np.asmatrix(self.center_word), self.W1, self.W2, 0.)

        with tf.name_scope("cbow"):
            x = tf.placeholder(shape=[self.V, len(self.context_words)], dtype=tf.float32, name="x")
            W1_tf = tf.Variable(self.W1, dtype=tf.float32)
            W2_tf = tf.Variable(self.W2, dtype=tf.float32)
            hh = [tf.matmul(tf.transpose(W1_tf), tf.reshape(x[:, i], [self.V, 1]))
                  for i in range(len(self.context_words))]
            h = tf.reduce_mean(tf.stack(hh), axis=0)
            u = tf.matmul(tf.transpose(W2_tf), h)
            loss_tf = -u[int(np.where(self.center_word == 1)[0])] + tf.log(tf.reduce_sum(tf.exp(u), axis=0))
            grad_W1, grad_W2 = tf.gradients(loss_tf, [W1_tf, W2_tf])

        init = tf.global_variables_initializer()
        with tf.Session() as sess:
            sess.run(init)
            W1_tf, W2_tf, loss_tf, dW1_tf, dW2_tf = sess.run([W1_tf, W2_tf, loss_tf, grad_W1, grad_W2],
                                                             feed_dict={x: self.context_words.T})

        W1_tf -= self.learning_rate * dW1_tf
        W2_tf -= self.learning_rate * dW2_tf

        for i in range(self.V):
            for j in range(self.N):
                self.assertAlmostEqual(W1_m[i, j], W1_tf[i, j], places=5)

        for i in range(self.N):
            for j in range(self.V):
                self.assertAlmostEqual(W2_m[i, j], W2_tf[i, j], places=5)

        self.assertAlmostEqual(loss_m, float(loss_tf), places=5)
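Likewise, the CBOW variant checked here averages the projected context vectors, h = \frac{1}{C} \sum_{c=1}^{C} W_1^\top x_c, scores u = W_2^\top h, and the loss for the center word w_O is:

L = -u_{w_O} + \log \sum_{j=1}^{V} e^{u_j}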
Example #11
def process(args):
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    else:
        raise Exception(
            "unknown file format: '%s'. valid formats: 'adjlist', 'edgelist'" %
            args.format)

    print("number of nodes: {}".format(len(G.nodes())))  # .format 格式化字符串(取代{})

    num_walks = len(G.nodes()) * args.number_walks  # 每个节点有多个walks
    print("number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length
    print("data size (walk*length): {}".format(data_size))

    print("walking...")
    walk_file = walks.write_walks_to_disk(G,
                                          args.output,
                                          num_paths=args.number_walks,
                                          path_length=args.walk_length,
                                          alpha=0,
                                          rand=random.Random(args.seed))
    model = Word2Vec(walk_file,
                     args.output,
                     emb_dimension=args.representation_size,
                     window_size=args.window_size,
                     min_count=0)
    print("Training...")

    model.skip_gram_train()
Example #12
def main(input,
         output,
         iter=5,
         size=128,
         worker=4,
         batch_nodes=10000,
         negative=5,
         sample=1e-4,
         output_format="gensim"):

    # load karate graph in csr matrix
    RWG = RandomWalksGeneratorCSR(path=input)
    # init model
    skipgram = Word2Vec(sg=1,
                        iter=iter,
                        min_count=0,
                        size=size,
                        workers=worker,
                        batch_words=batch_nodes,
                        sample=sample,
                        negative=negative)
    # build vocab
    skipgram.build_vocab(RWG)
    # learn embbeding
    skipgram.train(RWG)
    if output_format == "gensim":
        skipgram.save(output)
    elif output_format == "txt":
        skipgram.save_word2vec_format(output)
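A hedged invocation sketch for this entry point; the paths are placeholders, and output_format="txt" would instead write a plain-text word2vec file via save_word2vec_format:

# Illustrative paths only; RandomWalksGeneratorCSR is expected to read the CSR random-walk file at `input`.
main(input='karate_walks.csr', output='karate.emb', iter=5, size=128, worker=4, output_format='gensim')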
Example #13
File: wsp.py Project: harrhunt/WSP
 def average_distance(cls, start: WSP, end: WSP):
     total_distance = 0
     for word in start.onyms:
         if Word2Vec.contains(word):
             total_distance += cls.distance(word, end.word)
     average_distance = total_distance / len(start.onyms)
     return average_distance
Example #14
    def load_embeddings(self):

        # check if embeddings saved in cache
        if os.path.exists(R.EMB.format(self.dim)):
            # read from cache; return
            return pickle.load(open(R.EMB.format(self.dim), 'rb'))

        # read model from word2vec
        model = Word2Vec(self.dim).get_model()
        embeddings = []
        for w in self._vocab:
            # if word in model
            if w in model:
                emb = model[w]
            # else check if lower-case of w in model
            elif w.lower() in model:
                emb = model[w.lower()]
            # return zero vector
            else:
                emb = np.zeros(self.dim)
            # keep track of embedding
            embeddings.append(emb)
        # np.array
        embeddings = np.stack(embeddings)
        # attach to self
        self.emb = embeddings
        # write to cache
        pickle.dump(self.emb, open(R.EMB.format(self.dim), 'wb'))
        # make sure vocab size == num of embeddings
        assert self.vocab_size() == self.emb.shape[0]

        return self.emb
Example #15
def test_word2vec():
    data = [
        'Merge multiple sorted inputs into a single sorted output',
        'The API below differs from textbook heap algorithms in two aspects'
    ]
    wv = Word2Vec(vec_len=50)
    wv.train(data, model='cbow')
    print(wv['into'])
Example #16
def export_vocab(tweets, vocab_size, export=True):
    words = []
    for tweet in tweets:
        words.extend(tweet)
    vocab = Word2Vec.vocab_to_num(words, vocab_size)
    if export:
        np.save('./data/vocab.npy', vocab)
    return vocab
Example #17
 def key_words(self, string, top_number=10):
     # use the pretrained '5.9m' model
     model_name = '5.9m'
     model_download = W2VModelDownload(bq_project)
     model_download.download_w2v_model('patent_landscapes', model_name)
     word2vec5_9m = Word2Vec('5.9m')
     w2v_runtime = word2vec5_9m.restore_runtime()
     return w2v_runtime.find_similar(string, top_number)
Example #18
def initialise_model(data):
	input_file = 'test.txt'
	f = open(input_file,'w')
	input_txt = get_all_text(data)
	f.write(input_txt)
	f.close()
	model = Word2Vec(LineSentence(input_file), size=100, window=5, sg=0, min_count=1, workers=8)
	model.save(input_file + '.model')
	model.save_word2vec_format(input_file + '.vec')
Example #19
 def load_cat2vec_format(cls, cat_model=None, sent_model=None, word_model=None):
     """
     Load sentence vectors
     """
     model = Category2Vec(None)
     count = 0
     if cat_model:
         logger.info("loading %s object(cat) from %s" % (cls.__name__, cat_model))
         for line in open(cat_model,"r"):
             line = line.rstrip()
             if count == 0:
                 info = line.split()
                 model.cat_len = int(info[0])
                 model.layer1_size = int(info[1])
                 model.sg = int(info[2])
                 model.hs = int(info[3])
                 model.negative = int(info[4])
                 model.cbow_mean = int(info[5])
                 model.cats = empty((model.cat_len, model.layer1_size), dtype=REAL)
                 model.cat_no_hash = {}
                 model.cat_id_list = []
             else:
                 idx = count - 1
                 row = line.split("\t")
                 cat_id = utils.to_unicode(row[0])
                 model.cat_no_hash[cat_id] = idx
                 model.cat_id_list.append(cat_id)
                 vals = row[1].split()
                 for j in xrange(model.layer1_size):
                     model.cats[idx][j] = float(vals[j])
             count += 1
     count = 0
     if sent_model:
         logger.info("loading %s object(sentence) from %s" % (cls.__name__, sent_model))
         for line in open(sent_model,"r"):
             line = line.rstrip()
             if count == 0:
                 info = line.split()
                 model.sents_len = int(info[0])
                 model.sents = empty((model.sents_len, model.layer1_size), dtype=REAL)
                 model.sent_no_hash = {}
                 model.sent_id_list = []
             else:
                 idx = count - 1
                 row = line.split("\t")
                 sent_id = utils.to_unicode(row[0])
                 model.sent_no_hash[sent_id] = idx
                 model.sent_id_list.append(sent_id)
                 vals = row[1].split()
                 for j in xrange(model.layer1_size):
                     model.sents[idx][j] = float(vals[j])
             count += 1
     if word_model:
         logger.info("loading word2vec from %s" % word_model)
         model.w2v = Word2Vec.load(word_model)
         model.vocab = model.w2v.vocab
     return model
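For clarity, a minimal sketch of the file layout this loader expects, inferred from the parsing code above (a whitespace-separated header line, then one id<TAB>values row per vector); the file name and numbers are invented:

# Header: cat_len layer1_size sg hs negative cbow_mean, then one row per category vector.
with open('categories.cat2vec', 'w') as f:
    f.write('2 4 0 1 0 0\n')
    f.write('cat_a\t0.10 0.20 0.30 0.40\n')
    f.write('cat_b\t-0.50 0.00 0.25 1.00\n')

A file written this way could then be passed as the cat_model argument; the sent_model file uses the same id<TAB>values row format.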
Example #20
def train_model(window_size, embedding_dim, batch_size_word2vec):
    file_to_save_trained_data = '../../results/word2vec/ver6/ws-' + str(
        window_size) + '-embed-' + str(embedding_dim) + 'batch_size-' + str(
            batch_size_word2vec) + '.pkl'
    word2vec = Word2Vec(window_size=window_size,
                        epoch_word2vec=epoch_word2vec,
                        embedding_dim=embedding_dim,
                        batch_size_word2vec=batch_size_word2vec,
                        file_to_save_trained_data=file_to_save_trained_data)
    vectors, word2int, int2word = word2vec.train()
Example #21
def main(args: argparse.Namespace) -> None:
    """Main entrypoint for the script."""
    # Make sure we have at least one file.
    if len(args.filenames) == 0:
        logger.error('At least one text file is required!')
        exit(1)

    set_seed(args.seed)
    tokenizer = Tokenizer(max_tokens=args.max_tokens,
                          min_word_frequency=args.min_word_frequency,
                          sample_threshold=args.sample_threshold)

    start_time = time.time()
    logger.info('Building vocabulary from corpus...')

    tokenizer.build(filenames=args.filenames)

    logger.info('Finished building vocabulary (took {:.2f} seconds)'.format(
        time.time() - start_time))

    dataset = make_dataset(args.filenames,
                           tokenizer,
                           window_size=args.window_size,
                           batch_size=args.batch_size,
                           epochs=args.epochs)
    model = Word2Vec(tokenizer,
                     hidden_size=args.hidden_size,
                     batch_size=args.batch_size,
                     n_negative_samples=args.n_negative_samples,
                     lambda_power=args.lambda_power,
                     bias=args.bias)

    # Create output directory
    args.output_dir.mkdir(parents=True, exist_ok=True)

    run_name = args.run_name or args.filenames[0].stem
    logdir = args.output_dir / get_next_run_id(args.output_dir, run_name)

    logger.info('Starting training (for {} epochs).'.format(args.epochs))
    model.train(dataset, logdir, args.initial_lr, args.target_lr,
                args.log_freq, args.save_freq)

    # Save embeddings and vocab
    #
    # The weights of the projection layer are components of the
    # embedding vectors. The i-th row of the weight matrix is the
    # embedding vector for the word whose encoded index is i.
    proj = model.weights[0].numpy()
    np.save(logdir / 'proj_weights', proj)
    # Save the tokenizer state
    tokenizer.save(logdir / 'tokenizer.json')
    # Save a list of the vocabulary words
    with open(logdir / 'vocab.txt', 'w') as file:
        for word in tokenizer.words:
            file.write(f'{word}\n')
Example #22
def export_vocab(comments, categories, vocab_size, export=True):
    words = []
    for key in comments:
        words.extend(list(itertools.chain.from_iterable(comments[key])))
    for key in categories:
        words.extend(list(itertools.chain.from_iterable(categories[key])))
    vocab = Word2Vec.vocab_to_num(words, vocab_size)
    if export:
        with open("../resources/vocab.json", "w") as f:
            json.dump(vocab, f, indent=2)
    return vocab
Example #23
def init_from_config(args):
    global w2v, sparql_backend, entity_linker, facts_ranker, facts_extractor
    global wiki_url
    config_options = globals.config

    w2v = Word2Vec.init_from_config(config_options)
    sparql_backend = globals.get_sparql_backend(config_options)
    wiki_url = WikiUrl(config_options)
    entity_linker = EntityLinker.init_from_config(config_options, wiki_url)
    facts_ranker = Ranker.init_from_config(config_options)
    facts_extractor = FactExtractor.init_from_config(config_options)
Example #24
    def __init__(self, dataset_file=None, cv_folds=10):
        """
        :param embeddings_file: path to the embeddings file.
        :param dataset_file: path to a labeled dataset file.
        :param cv_folds: int, number of folds for cross validation
        """

        self.dataset_file = dataset_file
        self.cv_folds = cv_folds

        # read dataset
        dataset = pd.read_csv(self.dataset_file)
        text = dataset['tweet']
        self.Y = dataset['label']

        # Option 1-- word2vec using embedding -- #
        w2v = Word2Vec()
        self.X = w2v.getVectors(text)

        # -- Option 2 count vectorization -- #
        #self.X = self.features_extraction(text)

        # -- Option 3 TFIDF -- #
        #self.X = self.tfidfFeatureExtraction(text)

        info('Done loading and vectorizing data.')
        info("--- Sentiment CLASSIFIERS ---")
        info("fitting ... ")

        self.accuracies = {}

        # classifiers to use
        classifiers = [
            #RandomForestClassifier(n_estimators=100),
            #SGDClassifier(),
            LinearSVC(),
            #LinearDiscriminantAnalysis(),
            #LogisticRegression(),
            #GaussianNB(),
            #DecisionTreeClassifier()
        ]

        # RUN classifiers
        for c in classifiers:
            self.classify(c)

        info('results ...')
        for k, v in self.accuracies.items():
            string = '\tAcc. {:.2f}% F1. {:.2f}% P. {:.2f} R. {:.2f} : {}'
            print(
                string.format(v[0] * 100, v[1] * 100, v[2] * 100, v[3] * 100,
                              k))

        info("DONE!")
Example #25
def word2vec(rdd):

    sentences = parse_sentences(rdd)
    sentences_without_id = sentences.map(lambda (_id, sent): sent)
    model = Word2Vec(size=100, hs=0, negative=8)

    dd2v = DistDoc2VecFast(model, learn_hidden=True, num_partitions=15, num_iterations=20)
    dd2v.build_vocab_from_rdd(sentences_without_id)

    print "*** done training words ****"
    print "*** len(model.vocab): %d ****" % len(model.vocab)
    return dd2v, sentences
Example #26
def ExtractSent2Vec(filename):
    model = Word2Vec(LineSentence(filename),
                     size=512,
                     window=5,
                     sg=0,
                     min_count=5,
                     workers=8)
    model.save(filename + '.model')
    model.save_word2vec_format(filename + '-01.vec')

    model = Sent2Vec(LineSentence(filename), model_file=filename + '.model')
    model.save_sent2vec_format(filename + '-02.vec')
Example #27
def main():
    contexts = np.fromfile("./data/npcontexts.dat", dtype=int)
    neighbors = np.fromfile("./data/npneighbors.dat", dtype=int)
    skipgram = Word2Vec(contexts,
                        neighbors,
                        35000,
                        10,
                        0.001,
                        64,
                        "sg.ckpt",
                        batch_size=500)
    skipgram.train(2)
Example #28
def create_dataset(tweets, window, datafile="mapped_tweets.npy", export=True):
    if tweets is None:
        try:
            tweets = np.load(datafile).item()
        except FileNotFoundError:
            print("cannot find " + datafile)
            exit(1)
    contexts, neighbors = Word2Vec.create_dataset(tweets, window)
    if export:
        print("saving train set to file")
        contexts = np.array(contexts)
        neighbors = np.array(neighbors)
        contexts.tofile('./data/npcontexts.dat')
        neighbors.tofile('./data/npneighbors.dat')
Example #29
 def __init__(self, data):
     self.data = data
     self.corpus = None
     self.liu = LiuLexicon()
     self.subj = SubjLexicon()
     self.buildTweetCorpus()
     self.word_vec_model = Word2Vec(self.corpus)
     self.glove_vec_model = Glove(100, self.corpus)
     self.clusters = Cluster(100)
     self.initEncoders()
     self.topicVecs = self.word_vec_model.getVectorsForTopics(
         self.topicenc.classes_)
     self.collectTopUnigrams()
     self.collectTopBigrams()
Example #30
def main(text):

    params = getattr(parameters, text)

    w2v = Word2Vec(params['file'],
                   window_size=params['window_size'],
                   learning_rate=params['learning_rate'],
                   vocab_size=params['vocab_size'],
                   embedding_size=params['embedding_size'],
                   n_negative=params['n_negative'])

    w2v.fit(n_iter=params['n_iter'], num_proc=params['num_proc'])

    print(w2v.process_time)
    print(w2v.process_time[-1] - w2v.process_time[0])
Example #31
def main():
    # with open("/Users/johnkarasev/PycharmProjects/TweetGrouper/word2vec/contexts.json") as fp:
    #     contexts = json.load(fp)
    # with open("/Users/johnkarasev/PycharmProjects/TweetGrouper/word2vec/neighbors.json") as fp:
    #     neighbors = json.load(fp)
    print("Reading dat files")
    npn = np.fromfile("npneighbors.dat", dtype=int)
    print(str(npn.shape[0]))
    npc = np.fromfile("npcontexts.dat", dtype=int)
    print(str(npc.shape[0]))
    print("finished read")
    # train skipgram model
    skipgram = Word2Vec(npn,
                        npc,
                        35000,
                        10,
                        0.001,
                        64,
                        "sg.ckpt",
                        batch_size=500)
    skipgram.train(5)
    # train cbow model
    cbow = Word2Vec(npc, npn, 35000, 10, 0.001, 64, "sg.ckpt", batch_size=500)
    cbow.train(5)
Example #32
def getTextualFeature(text_reading_path):
    # Train and save the Word2Vec model for the text file.
    # Note that you can change the dimension of the resulting feature vector by modifying the value of 'size'.
    model = Word2Vec(LineSentence(text_reading_path),
                     size=500,
                     window=5,
                     sg=0,
                     min_count=5,
                     workers=8)
    model.save(text_reading_path + '.model')

    # Train and save the Sentence2Vec model for the sentence file.
    model = Sent2Vec(LineSentence(text_reading_path),
                     model_file=text_reading_path + '.model')
    model.save_sent2vec_format(text_reading_path + '.vec')

    program = os.path.basename(sys.argv[0])
Example #33
def create_trainset(window, export=True):
    with open("mapped_comments.json") as f:
        comments = json.load(f)
    sentences = []
    for key, index in zip(comments, range(len(comments))):
        progress(index, len(comments), "combining sentences")
        sentences.extend(comments[key])

    sentences = list(filter(lambda x: x, sentences))
    print("finished")
    sentences = np.array(sentences)
    contexts, neighbors = Word2Vec.create_dataset(sentences, window)
    if export:
        npc = np.array(contexts)
        npn = np.array(neighbors)
        npc.tofile('npcontexts.dat')
        npn.tofile('npneighbors.dat')
Example #34
    def __init__(self, sentences, model_file=None, size=200, alpha=0.025, window=5, min_count = 5,
                 sample=0, seed=1, workers=16, min_alpha=0.0001, model="cb", hs=1, negative=0, cbow_mean=0,
                 iteration=1, word_learn=1, init_adjust=True, update_mode = 0, normalize_each_epoch = False):
        self.sg = 1 if model == "sg" or model == "dbow" else 0
        self.table = None # for negative sampling --> this needs a lot of RAM! consider setting back to None before saving
        self.alpha = float(alpha)
        self.window = int(window)
        self.seed = seed
        self.sample = sample
        self.workers = workers
        self.min_alpha = min_alpha
        self.hs = hs
        self.negative = negative
        self.cbow_mean = int(cbow_mean)
        self.iteration = iteration
        self.word_learn = int(word_learn)
        self.cat_learn = 1
        self.layer1_size = size
        self.min_count = min_count
        self.sent_no_hash = {} # mapping sent_id to index of self.sents
        self.sent_id_list = [] # mapping sent_no to sent_id
        self.cat_no_hash = {} # mapping cat_id to index of self.cats
        self.cat_id_list = [] # mapping cat_no to cat_id
        self.sane_vec_len = 100000 # for sanity check
        self.sane_max_sim10 = 0.9 # for sanity check
        self.init_adjust = init_adjust # for adjustment of initialization
        self.update_mode = update_mode # 0:SGD, 1: AdaGrad, 2:AdaDelta, 3:ADAM
        self.normalize_each_epoch = normalize_each_epoch # normalize vectors after each epoch

        if sentences:
            if model_file:
                self.w2v = Word2Vec.load(model_file)
                self.vocab = self.w2v.vocab
                self.layer1_size = self.w2v.layer1_size
                self.build_vec(sentences, has_vocab = True)
            else:
                self.word_learn = 1
                self.w2v = Word2Vec(None, self.layer1_size, self.alpha, self.window, self.min_count, self.sample, self.seed, self.workers, self.min_alpha, self.sg, self.hs, self.negative, self.cbow_mean)
                self.build_vec(sentences, has_vocab = False)
            self.train_iteration(sentences, iteration=iteration)
Example #35
def word2vec_feat(reviews):
    w2v_model_file = "../../models/laptop.word2vec.model"
    w2v_model = Word2Vec.load(w2v_model_file)
    bags = []
    for review in reviews:
        bag = []
        for sent in review.sentences:
            instance = None
            count = 0.
            for w in sent:
                if w not in w2v_model:
                    continue
                if count == 0:
                    instance = w2v_model[w]
                    count += 1.
                else:
                    instance += w2v_model[w]
                    count += 1.
            instance /= count
            bag.append(instance.tolist())
        bags.append(bag)

    save_sparse_feature(corpus_name="laptop", view_name="word2vec", features=bags)
    save_view_info(view_name="word2vec", dim=100, data_format="sparse", view_type="continuous")
Example #36
 def load(cls, fname, mmap=None):
     model = super(Sentence2Vec, cls).load(fname, mmap)
     if os.path.isfile(fname+"_w2v"):
         model.w2v = Word2Vec.load(fname+"_w2v", mmap)
         model.vocab = model.w2v.vocab
     return model
Example #37

w2v = Word2Vec(vocabulary_size=vocabulary_size,
               architecture='cbow',
               # loss_type='nce_loss',
               n_steps=2001)

# print w2v.get_params()
w2v.fit(words)
print(w2v.final_embeddings.shape)
print(len(w2v.sort('the')))

print('words closest to %s:' % 'the')
print(w2v.sort('the')[:10])

# print([reverse_dictionary[i] for i in range(3)])
# print(w2v.transform([0,1,2,3]).shape)

save_path = w2v.save('models/test_model')
print(w2v.final_embeddings[0,0])

print save_path

# restore a saved model
w2c_restored = Word2Vec.restore(save_path)
print(w2c_restored.final_embeddings[0,0])
print(w2c_restored.dictionary['the'])
print(w2c_restored.reverse_dictionary.items()[:5])


Example #38
def main():
    optparser = OptionParser()
    optparser.add_option("-p", "--pro", dest="product")
    (options, args) = optparser.parse_args()

    (train_file, test_file) = CORPUS[options.product]
    train_reviews = load_dataset(DATA_PATH + train_file)
    test_reviews = load_dataset(DATA_PATH + test_file)

    n_cates, cate_index = get_categories(train_reviews + test_reviews)
    vocab_size = 1000
    vocab_index = get_vocab(train_reviews, vocab_size)

    train_bags = [extract_unigram(vocab_index, vocab_size, review)\
            for review in train_reviews]
    train_X = [bag2vec(bag) for bag in train_bags]
    train_labels = [extract_labels(cate_index, review)\
            for review in train_reviews]

    test_bags = [extract_unigram(vocab_index, vocab_size, review)\
            for review in test_reviews]
    test_X = [bag2vec(bag) for bag in test_bags]
    test_labels = [extract_labels(cate_index, review)\
            for review in test_reviews]


    # add word2vec feature
    w2v_model_file = "../../models/laptop.word2vec.model"
    w2v_model = Word2Vec.load(w2v_model_file)
    train_X2 = word2vec_feat(train_reviews, w2v_model)
    train_X = merge_features(train_X, train_X2)
    test_X2 = word2vec_feat(test_reviews, w2v_model)
    test_X = merge_features(test_X, test_X2)


    labelwise_acc = []
    labelwise_output = []
    for cate in range(n_cates):
        # train a binary svm model
        train_Y = get_Y(train_labels, cate)
        prob = svm_problem(train_Y, train_X)
        #param = svm_parameter("-s 0 -t 0 -b 1")
        param = svm_parameter("-s 0 -t 2 -b 1")
        m = svm_train(prob, param)

        # test
        test_Y = get_Y(test_labels, cate)
        p_label, p_acc, p_val = svm_predict(test_Y, test_X, m, '-b 1')

        labelwise_acc.append(p_acc)
        labelwise_output.append(p_label)

    # evaluation 
    p, r, f = microF1(labelwise_output, test_labels)

    # output
    out_dir = "results/rbf/"
    out_dir = "results/"
    out_file = out_dir + options.product + ".txt"
    cates = list(cate_index.items())
    cates = sorted(cates, key=lambda x:x[1])
    labelwise_acc = [(cates[i][0], labelwise_acc[i][0]) for i in range(n_cates)]
    labelwise_acc = sorted(labelwise_acc, key=lambda x:x[1])
    with open(out_file, 'w') as out:
        out.write("Precision:\t{}\nRecall:\t{}\nF1:\t{}\n".format(p, r, f))
        print("{}\n{}\n{}".format(p, r, f))
        for cate_i in range(n_cates):
            out.write("{}:\t{}\n".format(labelwise_acc[cate_i][0], labelwise_acc[cate_i][1]))
Example #39
            return dists
        best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True)
        # ignore (don't return) words from the input
        result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words]
        return result[:topn]


if __name__ == "__main__":
    # set level to DEBUG to see comprehensive results: OOV words and wrong predictions
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    test_words = []
    import analogy as analogy
    # load model
    # sys.argv 1: model 2: analogy question 3: res 4: word 5: vector

    # create analogy result
    model = Word2Vec.load_word2vec_format(sys.argv[1], binary=True, encoding='iso-8859-1')
    # returns a list with 'incorrect', 'section', 'correct' entries
    accuracy = model.accuracy(sys.argv[2], restrict_vocab=30000, most_similar=analogy.most_similar, use_lowercase=False)
    print accuracy[0]
    # write analogy result to file
    writeAnRes(accuracy, 'incorrect')

    # read analogy result file
    fullList = []
    with io.open("analogy_res.txt", 'r', encoding='utf-8') as infile:
        for line in infile.readlines():
            test_words = line.split(":", 1)[1].split()
            fullList.append(test_words)

    # writeVec2file(fullList)

    # read type and vector to form word2vec