Example #1
def get_distr():
    # Count, for every hashtag, how many times it appears on each day of the stream.
    ttp = DocumentTimePair("/local/kraljevic/twiter/processed/1M_train.dat")

    tag_counts = {}
    start = None
    day = 0

    tags_f = open("/local/kraljevic/twiter/tags/1M_train.dat")

    for tweet in ttp:
        if start is None:
            # Remember the timestamp of the first tweet as the start of day 0
            start = int(tweet[0])
        # 3600 * 24 seconds in a day
        if int((int(tweet[0]) - start) / (3600 * 24)) > day:
            day += 1

        tags = tags_f.readline().strip()

        for tag in tags.split(" "):
            if tag != "__NONE__":
                if tag not in tag_counts:
                    # Per-day counters (array of 220 days, as allocated in the original code)
                    tag_counts[tag] = np.zeros(220)
                tag_counts[tag][day] += 1

    tags_f.close()
    return tag_counts
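A minimal usage sketch (not part of the original code): assuming get_distr() and numpy are available as above, the returned mapping of hashtag to per-day counts can be ranked by total frequency.

import numpy as np

# Hypothetical driver for get_distr() above: print the ten most frequent hashtags.
tag_counts = get_distr()
totals = {tag: int(counts.sum()) for tag, counts in tag_counts.items()}
for tag, total in sorted(totals.items(), key=lambda kv: kv[1], reverse=True)[:10]:
    print("{}\t{}".format(tag, total))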
Example #2
    def generate_eval_batch(self, data_path):
        opts = self._options
        data = DocumentTimePair(data_path, opts.time_transform, opts._start_time)
        batch = []
        time = []

        for example in data:
            # Keep only words that are present in the vocabulary
            tmp = [self._word2id[x] for x in example[1] if x in self._word2id]
            if len(tmp) == 0:
                continue
            time.append(example[0])
            batch.append(tmp)

        # Documents have different lengths, so this becomes a ragged (object) array
        batch = np.array(batch)
        time = np.array(time)
        return batch, time
Example #3
    def load_data(self):
        opts = self._options

        # Load word2vec model
        word2vec = Word2Vec.load(opts.word2vec_path)

        # Sort the vocabulary from word2vec, descending - most frequent word first
        sorted_vocab_pairs = sorted(word2vec.vocab.items(), key=operator.itemgetter(1), reverse=True)

        # Count word frequencies in the training data
        vocab_from_data = {}
        dtp = DocumentTimePair(opts.train_data)
        for pair in dtp:
            snt = pair[1]
            for word in snt:
                if word in vocab_from_data:
                    vocab_from_data[word] += 1
                else:
                    vocab_from_data[word] = 1

        # Drop words whose frequency is not above min_count
        tmp = {}
        for key in vocab_from_data.keys():
            if vocab_from_data[key] > opts.min_count:
                tmp[key] = vocab_from_data[key]
        vocab_from_data = tmp


        # Fill id2word array
        for pair in sorted_vocab_pairs:
            if pair[0] in vocab_from_data:
                self._id2word.append(pair[0])

        # Fill word2id dictionary
        for ind, value in enumerate(self._id2word):
            self._word2id[value] = ind

        # Fill _cent_word and _cent_cntx with the pretrained vectors:
        # syn0 holds the input (word) embeddings, syn1neg the output (context)
        # embeddings learned with negative sampling.
        for word in self._id2word:
            #Normalization
            #self._cent_word.append(gensim.matutils.unitvec(word2vec.syn0[word2vec.vocab[word].index]) / 100.0)
            #self._cent_cntx.append(gensim.matutils.unitvec(word2vec.syn1neg[word2vec.vocab[word].index]) / 100.0)

            self._cent_word.append(word2vec.syn0[word2vec.vocab[word].index])
            self._cent_cntx.append(word2vec.syn1neg[word2vec.vocab[word].index])


        opts.vocab_size = len(self._id2word)
Example #4
def main(opts=None):
    if opts is None:
        opts = Options()
    with tf.Graph().as_default(), tf.Session() as session:
        with tf.device("/cpu:0"):
            twe = TempWordEmb(opts, session)
            generator = Process(target=batch_generator, args=(twe, opts,))
            generator.start()
            for i in range(opts.nepochs):
                twe._epoch = i
                print("Started epoch: {}".format(i))
                if i % 30 == 0:
                    #Save the tf model
                    twe.saver.save(twe._session, opts.save_path + "model", global_step=i)

                    #Test clustering
                    rho = twe.rho.eval()
                    clst_time = twe.clst_time.eval()
                    tau = twe.tau.eval()
                    tags_clst_files = opts.tags_clst_files
                    nclst = opts.nclst
                    beta = opts.vm_beta
                    iteration = i
                    statistics_file = opts.save_path + "statistics.txt"
                    data = DocumentTimePair(opts.train_data, opts.time_transform, opts._start_time)

                    if tags_clst_files is not None:
                        test_clustering(rho, data, twe._word2id, clst_time, tau, tags_clst_files, nclst, beta,
                                iteration, statistics_file)

                    test_time_pred(rho, data, twe._word2id, clst_time, tau)

                    #Print top words for clusters
                    #rho_exp = 1 / (1 + np.exp(-rho))
                    """
                    for irow, row in enumerate(np.transpose(rho_exp)):
                        row = np.argsort(-row)
                        print("-------------- CLST: {} ---------------".format(irow))

                        for ind in row[0:40]:
                            print(twe._id2word[ind])
                    """
                twe.train()
                twe.test()
            print("--------DONE-------")
            generator.join()
Example #5
    def generate_fullbatch(self, data, dataset_name='test'):
        opts = self._options
        data = DocumentTimePair(data, opts.time_transform, opts._start_time)
        data_params = {}

        # Check whether the data parameters for this dataset are already cached on opts
        if hasattr(opts, dataset_name + '_data_params'):
            data_params = getattr(opts, dataset_name + '_data_params')
        else:
            data_params = self.get_data_params(data)
            setattr(opts, dataset_name + '_data_params', data_params)

        batch = []
        labels = []
        time = []

        # Generate a batch using all samples and all words from a sample
        for example in data:
            t = example[0]
            tmp = example[1]
            # Subsample: keep a word only if it is in the vocabulary and survives
            # frequency-based subsampling; otherwise mark its position with None
            #TODO: None or nothing
            doc = [w if w in self._word2id and (data_params['probs'][self._word2id[w]] >= 1 or 
                data_params['probs'][self._word2id[w]] > np.random.rand()) else None for w in tmp]

            if len(doc) > 0:
                for wid in np.random.choice(len(doc), len(doc), replace=False):
                    # Choose second word from the window around the first word
                    if doc[wid] is None:
                        continue
                    for wid2 in np.arange(max(0,   np.random.randint(wid - opts.window_size, wid)),
                            min(len(doc), np.random.randint(wid + 1, wid + opts.window_size + 2))):
                        if doc[wid2] == doc[wid] or doc[wid2] is None:
                            continue
                        batch.append(self._word2id[doc[wid]])
                        labels.append(self._word2id[doc[wid2]])
                        time.append(t)
        batch = np.reshape(np.array(batch), [-1, 1])
        labels = np.reshape(np.array(labels), [-1, 1])
        time = np.reshape(np.array(time), [-1, 1])

        return batch, labels, time 
Example #6
    def calc_kmeans(self, tweets_file, niter=100, random_range=(0, 10)):
        mb_kmeans = MiniBatchKMeans(self.nclst, batch_size=1)
        t_stream = DocumentTimePair(tweets_file)        
        for iteration in range(niter):
            # Build a random subsample of the tweet stream for this iteration
            data = []
            random_skip = np.random.randint(random_range[0], random_range[1])
            cnt = 0
            for tweet in t_stream:
                tweet = tweet[1]
                if cnt < random_skip:
                    cnt = cnt + 1
                    continue
                random_skip = np.random.randint(random_range[0], random_range[1])
                cnt = 0
                data.append(self._process_tweet(tweet))
            print("Kmeans iteration: {} out of {} with dataset of: {}".format(iteration, niter, len(data)))
            data = np.array(data)
            mb_kmeans.partial_fit(data)

        self.kmeans = mb_kmeans
Example #7
    def predict(self, tweets_file, out_file, batch_size=100000):
        # Assign each tweet to its nearest k-means cluster and write one
        # cluster id per line to out_file, processing the stream in batches.
        t_stream = DocumentTimePair(tweets_file)
        cnt = 0
        out = open(out_file, 'w')
        data = []
        for tweet in t_stream:
            tweet = tweet[1]
            data.append(self._process_tweet(tweet))
            cnt += 1
            if cnt == batch_size:
                data = np.array(data)
                pred = self.kmeans.predict(data)
                for one in pred:
                    out.write("%d\n" % one)
                cnt = 0
                data = []
        # Flush the last, partially filled batch
        if cnt != 0:
            data = np.array(data)
            pred = self.kmeans.predict(data)
            for one in pred:
                out.write("%d\n" % one)

        out.close() 
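Examples #6 and #7 appear to be methods of the same k-means helper; a minimal, hypothetical usage sketch (the class name TweetClusterer, its constructor, and the file paths are assumptions, not taken from the original repo):

# Hypothetical wrapper exposing calc_kmeans() and predict() from above.
clusterer = TweetClusterer(nclst=50)

# Fit MiniBatchKMeans on a random subsample of the training tweets,
# then write one cluster id per test tweet to a text file.
clusterer.calc_kmeans("train_tweets.dat", niter=10)
clusterer.predict("test_tweets.dat", "cluster_ids.txt")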
Example #8
    def generate_batch(self, data_path, dataset_name='train'):
        opts = self._options
        data_params = {}
        # Create a data iterator over the whole dataset, no skips
        data = DocumentTimePair(data_path, opts.time_transform, opts._start_time)

        # Check whether the data parameters for this dataset are already cached on opts
        if hasattr(opts, dataset_name + '_data_params'):
            data_params = getattr(opts, dataset_name + '_data_params')
        else:
            data_params = self.get_data_params(data)
            setattr(opts, dataset_name + '_data_params', data_params)

        # The probability of selecting one example for the batch; this amounts
        # to drawing a random subset of roughly epoch_size pairs from the
        # whole dataset.
        select_prob = opts.epoch_size / data_params['len'] / opts.max_pairs_from_sample

        batch = np.zeros((opts.epoch_size, 1), dtype=int)
        labels = np.zeros((opts.epoch_size, 1), dtype=int)
        time = np.zeros((opts.epoch_size, 1), dtype=float)
        n_added = 0

        # Create new data iterator with select_probability
        data = DocumentTimePair(data_path, opts.time_transform, opts._start_time, select_prob)
        while True:
            for example in data:
                t = example[0]
                tmp = example[1]
                # Subsample: keep a word only if it is in the vocabulary and survives
                # frequency-based subsampling; otherwise mark its position with None
                #TODO: None or nothing
                doc = [w if w in self._word2id and (data_params['probs'][self._word2id[w]] >= 1 or 
                    data_params['probs'][self._word2id[w]] > np.random.rand()) else None for w in tmp]
                added_pairs = 0
                # Randomly choose first word
                if len(doc) > 0:
                    for wid in np.random.choice(len(doc), len(doc), replace=False):
                        # Choose second word from the window around the first word
                        if doc[wid] is None:
                            continue

                        range_wid2 = np.arange(max(0,   np.random.randint(wid - opts.window_size, wid)),
                                min(len(doc), np.random.randint(wid + 1, wid + opts.window_size + 2)))
                        # Select random words from the window of 'wid', limit number to max_same_target
                        for wid2 in np.random.choice(range_wid2, min(len(range_wid2), opts.max_same_target)):
                            # Skip a pair containing the same words, or if the second word is not 
                            #in the vocab
                            if doc[wid2] == doc[wid] or doc[wid2] is None:
                                continue

                            batch[n_added] = self._word2id[doc[wid]]
                            labels[n_added] = self._word2id[doc[wid2]]
                            time[n_added] = t

                            n_added += 1
                            added_pairs += 1

                            if added_pairs >= opts.max_pairs_from_sample or n_added >= opts.epoch_size:
                                break
                        if added_pairs >= opts.max_pairs_from_sample or n_added >= opts.epoch_size:
                            break
                if n_added >= opts.epoch_size:
                    break
            if n_added >= opts.epoch_size:
                break
        return batch, labels, time
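The inner loops above pair each kept word with neighbours drawn from a randomised window around it; a simplified, standalone illustration of that windowing idea (plain Python with a fixed window, not the class method itself):

def toy_skipgram_pairs(doc, window_size=2):
    # Pair every word with the words inside a +/- window_size window around it,
    # skipping pairs of identical words (as the method above does).
    pairs = []
    for wid, word in enumerate(doc):
        lo = max(0, wid - window_size)
        hi = min(len(doc), wid + window_size + 1)
        for wid2 in range(lo, hi):
            if wid2 != wid and doc[wid2] != word:
                pairs.append((word, doc[wid2]))
    return pairs

print(toy_skipgram_pairs(["the", "cat", "sat", "on", "the", "mat"]))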
Example #9
parser.add_argument('--sample', help='Subsampling', type=float, default=5e-5)
parser.add_argument('--nworkers',
                    help='Number of threads to use',
                    type=int,
                    default=4)
parser.add_argument('--nneg',
                    help='Number of negative samples to use',
                    type=int,
                    default=5)

args = parser.parse_args()
if args is None:
    parser.print_help()
    sys.exit(0)

print("Started word2vec training")
dtp = DocumentTimePair(args.data)
word2vec = models.Word2Vec(sentences=dtp,
                           workers=args.nworkers,
                           negative=args.nneg,
                           hs=0,  # 0 = negative sampling instead of hierarchical softmax
                           min_count=args.min_frequency,
                           window=args.window_size,
                           size=args.emb_dim,
                           sample=args.sample)
print("Finished training, saving the model")

#Save the model
word2vec.save(args.save_path)
print("Model saved.")