Example #1
def comment(repo_dir, report):
    # good: commits landed on all branches; bad: no commits; ugly: partial uplift
    good, bad, ugly = classify_gbu(report)
    failed_bugs = []

    def remove_from_report(bug_id):
        # Drop the bug from the report and persist the change immediately
        del report[bug_id]
        util.write_json(uplift.uplift_report_file, report)

    for bug_ids, comment_fn in ((good, good_bug_comment),
                                (bad, bad_bug_comment),
                                (ugly, ugly_bug_comment)):
        for bug_id in bug_ids:
            print "Commenting on bug %s" % bug_id
            try:
                comment_fn(repo_dir, bug_id, report[bug_id])
                remove_from_report(bug_id)
            except FailedToComment:
                failed_bugs.append(bug_id)

    if len(failed_bugs) > 0:
        filename = os.path.abspath('failed_comments_%s.json' % util.time_str())
        print "The following bugs had commenting failures:"
        print util.e_join(failed_bugs)
        print "Creating a file to use with the 'uplift comments' command to retry just these."
        print "Fix the issue, then run: uplift comments %s" % filename
        util.write_json(filename, report)
Example #2
def populate_output_dir(out_dir):
    """
    Populates output dir with info files.
    """
    # copy the model generator file to the dir
    shutil.copy(model.__file__, os.path.join(out_dir, "model.py"))
    # copy the config file to the dir
    shutil.copy(cfg.__file__, os.path.join(out_dir, "config.py"))
    # info file: creation date/time and the current git commit
    with open(os.path.join(out_dir, "info.txt"), "w") as f:
        print("date created (y-m-d):", util.date_str(), file=f)
        print("time created:", util.time_str(), file=f)
        print("git commit hash:", util.git_hash(), file=f)
Example #3
def populate_out_dir(out_dir, train_set, val_set):
    '''
    Populates output dir with info files.
    '''
    # info file
    with open(os.path.join(out_dir, 'etc', 'train-log', 'info.txt'), 'w') as f:
        print('date created (y-m-d):', util.date_str(), file=f)
        print('time created:', util.time_str(), file=f)
        print('git commit hash:', util.git_hash(), file=f)

    # save the train/val file paths
    with open(os.path.join(out_dir, 'input', 'train.csv'), 'w') as f:
        for path in train_set:
            print(path, file=f)

    with open(os.path.join(out_dir, 'input', 'val.csv'), 'w') as f:
        for path in val_set:
            print(path, file=f)
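Likewise for populate_out_dir: the etc/train-log and input subdirectories must exist before the files are opened for writing, so a caller (paths hypothetical) might do:

out_dir = 'runs/experiment-01'  # hypothetical path
for sub in (os.path.join('etc', 'train-log'), 'input'):
    os.makedirs(os.path.join(out_dir, sub), exist_ok=True)
populate_out_dir(out_dir, train_set=['imgs/a.png'], val_set=['imgs/b.png'])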
Example #4
def main(_):
    ''' 1. load data (X: list of int, y: int). 2. create session. 3. feed data. 4. training. (5. validation), (6. prediction) '''

    word2index, label2index, trainX, trainY, validX, validY, testX, testY = load_data()
    max_f1, max_p, max_r, max_acc, test_f1, test_acc, test_p, test_r = [0] * 8
    voc_size, num_class, max_time_str = len(word2index), len(label2index), ''

    print("cnn_model.voc_size: {}, num_class: {}".format(voc_size, num_class))

    num_examples, FLAGS.sentence_len = trainX.shape
    print("num_examples of training:", num_examples, ";sentence_len:",
          FLAGS.sentence_len)
    ''' print some messages for debugging '''
    print("trainX[0:10]:", trainX[0:10])
    print("trainY[0:10]:", trainY[0:10])
    print("trainY[0]:", trainY[0])
    ''' 2.create session '''
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        ''' Instantiate Model '''
        textCNN = TextCNN(filter_sizes,
                          FLAGS.num_filters,
                          num_class,
                          FLAGS.learning_rate,
                          FLAGS.batch_size,
                          FLAGS.decay_steps,
                          FLAGS.decay_rate,
                          FLAGS.sentence_len,
                          voc_size,
                          FLAGS.embed_size,
                          multi_label_flag=FLAGS.multi_label_flag)
        ''' Initialize Save '''
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint.")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print('Initializing Variables')
            sess.run(tf.global_variables_initializer())
            if FLAGS.use_embedding:
                index2word = {v: k for k, v in word2index.items()}
                load_embedding(sess, index2word, textCNN.Embedding,
                               FLAGS.embedding_name)
        current_epoch = sess.run(textCNN.epoch_step)
        ''' 3.feed data & training '''
        number_of_training_data = len(trainX)
        batch_size = FLAGS.batch_size
        iteration = 0
        for epoch in range(current_epoch, FLAGS.num_epochs):
            loss, counter = 0.0, 0
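            # NOTE: zipping batch starts with ends drops the final partial
            # batch when the data size is not a multiple of batch_size
            # (see the quick check after this example).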
            for start, end in zip(
                    range(0, number_of_training_data, batch_size),
                    range(batch_size, number_of_training_data, batch_size)):
                iteration = iteration + 1
                if not epoch and not counter:
                    print("trainX[start:end]:", trainX[start:end])
                feed_dict = {
                    textCNN.input_x: trainX[start:end],
                    textCNN.dropout_keep_prob: 0.8,
                    textCNN.is_training_flag: FLAGS.is_training_flag
                }
                if not FLAGS.multi_label_flag:
                    feed_dict[textCNN.input_y] = trainY[start:end]
                else:
                    feed_dict[textCNN.input_y_multilabel] = trainY[start:end]
                curr_loss, lr, _ = sess.run([
                    textCNN.loss_val, textCNN.learning_rate, textCNN.train_op
                ], feed_dict)
                loss, counter = loss + curr_loss, counter + 1
                if not counter % 50:
                    print(
                        "%s Epoch %d\tBatch %d\tTrain Loss:%.3f\tLearning rate:%.5f"
                        % (time_str(), epoch, counter, loss / float(counter),
                           lr))
            ''' validate model '''
            if not epoch % FLAGS.validate_every:
                eval_loss, f1, r, p, acc = do_eval(sess, textCNN, validX,
                                                   validY, num_class)
                print(
                    "Epoch %d Validation Loss:%.3f\tR:%.3f\tP:%.3f\tF1 Score:%.3f\tacc:%.3f"
                    %
                    (epoch, eval_loss, r * 100, p * 100, f1 * 100, acc * 100))
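                # Track the best stats (and re-run the test evaluation)
                # whenever validation recall improves.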
                if r > max_r:
                    max_time_str = time_str()
                    max_f1, max_acc, max_p, max_r = [f1, acc, p, r]
                    eval_loss, test_f1, test_r, test_p, test_acc = do_eval(
                        sess, textCNN, testX, testY, num_class)
                    print("Test Loss:%.3f|%.3f|%.3f|%.3f|%.3f" %
                          (eval_loss, test_r * 100, test_p * 100,
                           test_f1 * 100, test_acc * 100))
                ''' save model to checkpoint '''
                save_path = FLAGS.ckpt_dir + "model.ckpt"
                saver.save(sess, save_path, global_step=epoch)
            sess.run(textCNN.epoch_increment)
            ''' test model '''
            if not epoch % 100:
                eval_loss, f1, r, p, acc = do_eval(sess, textCNN, testX, testY,
                                                   num_class)
                print(
                    "%s Epoch %d Test Loss:%.3f\tR:%.3f\tP:%.3f\tF1 Score:%.3f\tacc:%.3f"
                    % (time_str(), epoch, eval_loss, r * 100, p * 100,
                       f1 * 100, acc * 100))
        ''' print best validation and corresponding test results '''
        print("%s Best Validation R|P|F1|Acc: %.3f|%.3f|%.3f|%.3f" %
              (max_time_str, max_r * 100, max_p * 100, max_f1 * 100,
               max_acc * 100))
        print("%s Test R|P|F1|Acc: %.3f|%.3f|%.3f|%.3f" %
              (max_time_str, test_r * 100, test_p * 100, test_f1 * 100,
               test_acc * 100))
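A quick check of the batching idiom used above: zip(range(0, n, bs), range(bs, n, bs)) pairs batch starts with ends, but silently drops the final partial batch when n is not a multiple of bs.

n, bs = 10, 3
print(list(zip(range(0, n, bs), range(bs, n, bs))))
# [(0, 3), (3, 6), (6, 9)] -- example 9 is never used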
Example #5
    def trainIters(self,
                   pairs,
                   first_iter,
                   last_iter,
                   logger,
                   evaluator,
                   log_every=100):
        start_total_time = time.time()
        start_epoch_time = time.time()  # Reset every log_every
        start_train_time = time.time()  # Reset every log_every

        total_loss = 0  # Reset every log_every
        avg_loss_history = []
        avg_bleu_history = []
        avg_rouge_history = []
        avg_f1_history = []
        num_unique_names_history = []

        for iter in range(first_iter, last_iter + 1):
            training_pair = random.choice(pairs)
            input_tensor = training_pair[0]
            target_tensor = training_pair[1]

            loss = self.train(input_tensor, target_tensor)
            total_loss += loss

            if iter % log_every == 0:
                train_time_elapsed = time.time() - start_train_time

                torch.save(self.state_dict(), 'results/trained_model.pt')

                with open(
                        os.path.join(constants.LOGS_DIR,
                                     'iters_completed.txt'), 'w') as f:
                    f.write(str(iter))

                start_eval_time = time.time()
                names = evaluator.evaluate(self)
                eval_time_elapsed = time.time() - start_eval_time

                avg_loss_history.append(total_loss / log_every)
                avg_bleu_history.append(names['BLEU'].mean())
                avg_rouge_history.append(names['ROUGE'].mean())
                avg_f1_history.append(names['F1'].mean())
                num_unique_names_history.append(len(
                    names['Our Name'].unique()))

                epoch_time_elapsed = time.time() - start_epoch_time
                total_time_elapsed = time.time() - start_total_time

                log_dict = OrderedDict([
                    ("Iteration",
                     '{}/{} ({:.1f}%)'.format(iter, last_iter,
                                              iter / last_iter * 100)),
                    ("Average loss", avg_loss_history[-1]),
                    ("Average BLEU", avg_bleu_history[-1]),
                    ("Average ROUGE", avg_rouge_history[-1]),
                    ("Average F1", avg_f1_history[-1]),
                    ("Unique names", num_unique_names_history[-1]),
                    ("Epoch time", time_str(epoch_time_elapsed)),
                    ("Training time", time_str(train_time_elapsed)),
                    ("Evaluation time", time_str(eval_time_elapsed)),
                    ("Total training time", time_str(total_time_elapsed))
                ])

                logger.write_training_log(
                    log_dict, os.path.join(constants.LOGS_DIR,
                                           'train-log.txt'))

                logger.plot_and_save_histories(avg_loss_history,
                                               avg_bleu_history,
                                               avg_rouge_history,
                                               avg_f1_history,
                                               num_unique_names_history)

                logger.save_dataframe(
                    names,
                    os.path.join(constants.RESULTS_DIR, 'valid_names.csv'))

                histories = pd.DataFrame(
                    OrderedDict([('Loss', avg_loss_history),
                                 ('BLEU', avg_bleu_history),
                                 ('ROUGE', avg_rouge_history),
                                 ('F1', avg_f1_history),
                                 ('num_names', num_unique_names_history)]))

                logger.save_dataframe(
                    histories,
                    os.path.join(constants.RESULTS_DIR, 'histories.csv'))

                # Resetting counters
                total_loss = 0
                start_epoch_time = time.time()
                start_train_time = time.time()
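Note that time_str here (and in Example #6 below) is called with an elapsed number of seconds and formats a duration, unlike the zero-argument util.time_str() used to build timestamped filenames in Examples #1-#3 and #7. A hypothetical sketch of such a duration formatter (the real implementation is not part of this snippet):

def time_str(seconds):
    # Hypothetical helper: format elapsed seconds as H:MM:SS.
    m, s = divmod(int(seconds), 60)
    h, m = divmod(m, 60)
    return '%d:%02d:%02d' % (h, m, s)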
Example #6
    def trainIters(self, env, evaluator):
        start_total_time = time.time() - env.total_training_time
        start_epoch_time = time.time()  # Reset every LOG_EVERY iterations
        start_train_time = time.time()  # Reset every LOG_EVERY iterations
        total_loss = 0  # Reset every LOG_EVERY iterations

        for iter in range(env.iters_completed + 1, constants.NUM_ITER + 1):
            row = env.train_methods.iloc[np.random.randint(
                len(env.train_methods))]
            input_tensor = row['source']
            target_tensor = row['name']

            loss = self.train(input_tensor, target_tensor)
            total_loss += loss

            if iter % constants.LOG_EVERY == 0:
                log('Completed {} iterations'.format(iter))

                train_time_elapsed = time.time() - start_train_time

                log('Evaluating on validation set')
                start_eval_time = time.time()

                names = evaluator.evaluate(self)
                # save_dataframe(names, constants.VALIDATION_NAMES_FILE)

                eval_time_elapsed = time.time() - start_eval_time

                env.history = env.history.append(
                    {
                        'Loss': total_loss / constants.LOG_EVERY,
                        'BLEU': names['BLEU'].mean(),
                        'ROUGE': names['ROUGE'].mean(),
                        'F1': names['F1'].mean(),
                        'num_names': len(names['GeneratedName'].unique())
                    },
                    ignore_index=True)

                epoch_time_elapsed = time.time() - start_epoch_time
                total_time_elapsed = time.time() - start_total_time

                env.total_training_time = total_time_elapsed

                history_last_row = env.history.iloc[-1]

                log_dict = OrderedDict([
                    ("Iteration", '{}/{} ({:.1f}%)'.format(
                        iter, constants.NUM_ITER,
                        iter / constants.NUM_ITER * 100)),
                    ("Average loss", history_last_row['Loss']),
                    ("Average BLEU", history_last_row['BLEU']),
                    ("Average ROUGE", history_last_row['ROUGE']),
                    ("Average F1", history_last_row['F1']),
                    ("Unique names", int(history_last_row['num_names'])),
                    ("Epoch time", time_str(epoch_time_elapsed)),
                    ("Training time", time_str(train_time_elapsed)),
                    ("Evaluation time", time_str(eval_time_elapsed)),
                    ("Total training time", time_str(total_time_elapsed))
                ])

                write_training_log(log_dict, constants.TRAIN_LOG_FILE)
                plot_and_save_histories(env.history)

                env.iters_completed = iter
                env.save_train()

                # Resetting counters
                total_loss = 0
                start_epoch_time = time.time()
                start_train_time = time.time()
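A portability note on the env.history update above: DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so on current pandas the same update can be written with pd.concat:

env.history = pd.concat(
    [env.history,
     pd.DataFrame([{
         'Loss': total_loss / constants.LOG_EVERY,
         'BLEU': names['BLEU'].mean(),
         'ROUGE': names['ROUGE'].mean(),
         'F1': names['F1'].mean(),
         'num_names': len(names['GeneratedName'].unique())
     }])],
    ignore_index=True)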
Example #7
import os
import sys

import prettytable as pt

import git
import bzapi
import branch_logic
import util
import find_commits
import reporting
import configuration as c


# We should be smarter about these cache files: either manage them as a set,
# or use a single, append-only file that contains *all* the information.
requirements_file = os.path.abspath("requirements.json")
uplift_report_file = os.path.abspath("uplift_report.json")
uplift_dated_file = os.path.abspath("uplift_outcome_%s.json" % util.time_str())
push_info_file = os.path.abspath("uplift_info_%s.json" % util.time_str())
skip_bugs_file = os.path.abspath("skip_bugs.json")


def find_bugs(queries):
    bug_data = []
    all_queries = []
    for q in queries:
        all_queries.extend(bzapi.parse_bugzilla_query(q))
    print "Running Bugzilla searches"
    for q in all_queries:
        sys.stdout.write('.')
        sys.stdout.flush()
        search_data = bzapi.search(q)
        for bug in search_data: