def label_counter(self):
    # Create a counter of the occurrences of each sign (label).
    # values = list(set(self.y))
    # data_counter = Counter(values)
    # # We count each label occurrence and store it in our label counter
    # for label in self.y:
    #     data_counter[label] += 1
    return utils.get_data_count(self.X, self.y)
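For reference, a minimal sketch of the per-label count that utils.get_data_count is expected to produce here, using collections.Counter as a stand-in (this mirrors the commented-out approach above; the exact return type of utils.get_data_count is an assumption):

from collections import Counter

def label_counter_sketch(y):
    # Stand-in for utils.get_data_count: map each label to its number of occurrences.
    return Counter(y)

# Toy labels (hypothetical data):
print(label_counter_sketch([0, 0, 1, 2, 2, 2]))  # Counter({2: 3, 0: 2, 1: 1})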
Example #2
def get_required_augmentation_for_labels(X_data, y_data, threshold=800):
    # Compute the number of required augmentations for each label below the threshold
    train_data_counter = utils.get_data_count(X_data, y_data)
    # Threshold for label representation
    required_augmentation = {}
    for label_no in list(set(y_data)):
        count = train_data_counter[label_no]
        if count < threshold:  # label is under-represented
            required_augmentation[label_no] = threshold - count  # augment up to the threshold
    return required_augmentation
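A worked toy example of the same threshold logic, with collections.Counter standing in for utils.get_data_count and a deliberately small threshold (all numbers hypothetical):

from collections import Counter

y_toy = [0, 0, 0, 1, 1, 2]
threshold = 5
counts = Counter(y_toy)   # {0: 3, 1: 2, 2: 1}
required = {label: threshold - c for label, c in counts.items() if c < threshold}
print(required)           # {0: 2, 1: 3, 2: 4}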
Example #3
    def train(self, db, db_defs, dictionary, reverse_dictionary, param):
        '''
        This routine takes in the database connection to the training data
        (target_words and context pairings) and trains the word embedding,
        using the model architecture fed into the solver object.

        The model variables will be initialized as specified, unless a
        previous model has been saved to the model's destination directory.

        All logs and model data will be saved to and loaded from
        './model.name/id/' as defined in the parameters.

        All hyperparameters can be adjusted in parameters.yml.
        '''

        # define params locally for readability
        model = self.m
        learning_rate = param['learning_rate']
        epoch = param['epoch']
        batch_size = param['batch_size']
        id = param['id']
        top_k = param['print_top_k']
        shard_size = param['shard_size']

        # set up all the directories to send the model and logs...
        _, model_dest, log_dir = generate_directories(model.name, id)

        # get training op.. set learning rate...
        self.optimize(learning_rate)

        # set the ops to fetch during training.
        fetches = [self.train_op]

        # determine the counter values.
        num_train = get_data_count(db, db_defs)

        iterations_per_shard = max(shard_size // batch_size, 1)
        if (shard_size % batch_size) != 0:
            iterations_per_shard += 1

        shards_per_epoch = max(num_train // shard_size, 1)
        iterations_per_epoch = iterations_per_shard*shards_per_epoch   \
          + (num_train % shard_size) // batch_size

        num_iterations = epoch * iterations_per_epoch
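        # Worked example with hypothetical sizes: num_train = 10000, shard_size = 3000,
        # batch_size = 128 gives iterations_per_shard = 24 (23 full batches + 1 partial),
        # shards_per_epoch = 3, iterations_per_epoch = 24 * 3 + (10000 % 3000) // 128 = 79,
        # so 5 epochs would run num_iterations = 5 * 79 = 395 iterations.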

        print('***********************************************')
        print('Beginning training of %s model with id: %s' % (model.name, id))
        print('Training Points: %d' % (num_train))
        print('Batch size = %d' % (batch_size))
        print('Shard Size: %d (iters/shard): %d' %
              (shard_size, iterations_per_shard))
        print('Epoch: %d  shards/epoch: %d iters/epoch: %d' %
              (epoch, shards_per_epoch, iterations_per_epoch))
        print('Total Iterations: %d' % (num_iterations))
        print('model will be saved to: %s' % model_dest)
        print('logs will be stored in: %s' % log_dir)

        with tf.Session() as sess:

            # create a saver object to checkpoint the model
            saver = tf.train.Saver()

            # restore a previously saved model if one exists, otherwise initialize variables.
            if os.path.exists(model_dest):
                saver.restore(sess, model_dest)
            else:
                tf.global_variables_initializer().run()

            # create session writers
            writer = tf.summary.FileWriter(log_dir, sess.graph)  # for 1.0
            #test_writer = tf.summary.FileWriter(os.path.join(test_dir , model.model_name)) # for 1.0
            merged = tf.summary.merge_all()

            print('Begin Training')
            print('***********************************************')
            for e in range(epoch):
                # create a mask to shuffle the data
                # uniquely each epoch.

                # draw data from db in controllable shards.
                No = 0
                n = shard_size
                shard_n = 0

                while (n == shard_size):

                    target_words, context, n = grab_data_shard(
                        db, db_defs, No, shard_size)

                    # make sure we do not put in an empty data set
                    if n == 0:
                        print('n = 0')
                        break

                    No += shard_size
                    shard_n += 1

                    mask = np.arange(n)
                    np.random.shuffle(mask)
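                    # Example of how the mask drives batching (hypothetical numbers): with
                    # n = 5, a shuffled mask of [3, 0, 4, 1, 2] and batch_size = 2, the
                    # batches below draw row indices [3, 0], then [4, 1], then the partial
                    # last batch [2].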

                    for i in range(iterations_per_shard):

                        # print out tracking information to make sure everything is running correctly...
                        if ((i + iterations_per_shard * (shard_n - 1)) %
                                param['read_out'] == 0):
                            print(
                                '*********************************************'
                            )
                            print('%d of %d for shard %d' %
                                  (i, iterations_per_shard, shard_n))
                            print('%d of %d for epoch %d' %
                                  (i + iterations_per_shard *
                                   (shard_n - 1), iterations_per_epoch, e))

                        # Grab the batch data... (handle modified batches...)
                        if batch_size * (i + 1) > len(target_words):
                            print('last batch of shard...')
                            target_batch = target_words[mask[batch_size * i:]]
                            context_batch = context[mask[batch_size * i:]]
                        else:
                            target_batch = target_words[mask[batch_size *
                                                             i:batch_size *
                                                             (i + 1)]]
                            context_batch = context[mask[batch_size *
                                                         i:batch_size *
                                                         (i + 1)]]

                        feed_dict = {
                            model.target_words: target_batch,
                            model.context: context_batch,
                            model.is_training: True
                        }

                        # do training on batch, return the summary and any results...
                        [summary, _] = sess.run([merged, fetches], feed_dict)

                        # write summary to writer
                        writer.add_summary(summary,
                                           i + e * iterations_per_epoch)

                # epoch done, check word similarities.
                # Note: without access to the word library we cannot build our own
                # reverse lookup here, but this would be easy to add.
                if (e % param['similarity_readout'] == 0):
                    [sim] = sess.run([model.similarity])
                    for i in range(model.test_size):
                        if reverse_dictionary is None:
                            valid_word = model.valid_examples[i]
                        else:
                            valid_word = reverse_dictionary[
                                model.valid_examples[i]]
                        #valid_word = model.valid_examples[i]
                        #x = -sim[i, :].argsort()
                        nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                        log_str = 'Nearest to %s:' % valid_word
                        for k in range(top_k):
                            #close_word = reverse_dictionary[nearest[k]]
                            close_word = nearest[k]
                            if reverse_dictionary is None:
                                log_str = '%s %d,' % (log_str, close_word)
                            else:
                                log_str = '%s %s,' % (
                                    log_str, reverse_dictionary[close_word])
                        print(log_str)

                # checkpoint the model while training...
                if (e % param['check_point'] == 0):
                    saver.save(sess, model_dest, global_step=e + 1)
                print('%d of %d epochs complete.' % (1 + e, epoch))

            ## TRAINING FINISHED ##
            # saves variables learned during training
            saver.save(sess, model_dest)

            # make sure the log writer closes and sess is done.
            writer.close()
            sess.close()

        print('***********************************************')
        print('Done training')
        print('model saved to: %s' % model_dest)
        print('logs stored in: %s' % log_dir)
        print('***********************************************')
        return
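For reference, a self-contained sketch of the nearest-neighbor readout performed in the similarity check above, applied to a single toy similarity row (the vocabulary and similarity values here are hypothetical):

import numpy as np

# Toy similarity row for one validation word against a 6-word vocabulary (hypothetical values).
sim_row = np.array([1.00, 0.10, 0.80, 0.30, 0.95, 0.05])
reverse_dictionary = {0: 'king', 1: 'car', 2: 'queen', 3: 'road', 4: 'prince', 5: 'tree'}
top_k = 3

# Index 0 is the validation word itself (similarity 1.0), so the [1:top_k + 1] slice skips it.
nearest = (-sim_row).argsort()[1:top_k + 1]
log_str = 'Nearest to %s:' % reverse_dictionary[0]
for k in range(top_k):
    log_str = '%s %s,' % (log_str, reverse_dictionary[nearest[k]])
print(log_str)  # Nearest to king: prince, queen, road,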
Example #4
# -*- coding: utf-8 -*-
import hot_news
import utils
import threading
from datetime import datetime
import time

print("program start at", datetime.now())

index = 0
while True:

    update_datetime = datetime.now().replace(hour=23, minute=59, second=59)

    if utils.get_data_count() > 100:
        index += 1
        print "=" * 16 + str(index) + "=" * 16 + str(datetime.now())
        if utils.time_to_update(update_datetime):
            hot_news.calculate()
            print "update_lastDays..."
            utils.update_lastDays()
            print "sleep 61 minuteszzzzzzzzzzzzzzzzzzzz" + str(datetime.now())
            time.sleep(61 * 60)
        else:
            hot_news.calculate()

    else:
        if utils.time_to_update(update_datetime):
            print "update_lastDalys>>>>>>>>>>>>" + str(datetime.now())
            utils.update_lastDays()
            print "sleep 61 minuteszzzzzzzzzzzzzzzzzzzz" + str(datetime.now())
            time.sleep(61 * 60)
start = datetime.now()

threads = []
t1 = threading.Thread(target=hot_news.separate_calculate_hots_multi_thread_weibo, args=())
threads.append(t1)
t2 = threading.Thread(target=hot_news.separate_calculate_hots_multi_thread_weixin, args=())
threads.append(t2)
t3 = threading.Thread(target=hot_news.separate_calculate_hots_multi_thread_website, args=())
threads.append(t3)

if __name__ == '__main__':

    '''Deleting obsolete hot topics does not need to run on every pass; it can be
    scheduled at a fixed time each day to save one traversal.'''
    update_datetime = datetime.now().replace(hour=23, minute=40, second=0)
    print(update_datetime)
    if utils.time_to_update(update_datetime):
        print('beginning to delete old hot IDs')
        obsolete_hot = utils.get_obsolete_hot()
        if obsolete_hot != []:
            utils.delete_bact_hot(obsolete_hot)

    if utils.get_data_count():
        for t in threads:
            t.setDaemon(True)
            t.start()

        # wait for all worker threads, not just the last one started
        for t in threads:
            t.join()

    print(datetime.now() - start)
    print('process over')
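The implementation of utils.time_to_update is not shown in these examples. Given how it is used above (triggering the daily maintenance near the target time and then sleeping 61 minutes so it does not re-fire), one plausible reading is a simple time-window check; the sketch below is purely an assumption, not the actual utils code:

from datetime import datetime, timedelta

def time_to_update(update_datetime, window_minutes=60):
    # Assumed behaviour: return True when the current time falls inside the
    # window_minutes-wide window leading up to update_datetime.
    now = datetime.now()
    return update_datetime - timedelta(minutes=window_minutes) <= now <= update_datetime

# Example: a target of today at 23:59:59 triggers only during the last hour of the day.
print(time_to_update(datetime.now().replace(hour=23, minute=59, second=59)))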