Example #1
def configure_logging(self, logging, initialization_time):
    print('toplevel: logging: ' + str(logging))
    if logging == 1:
        print('toplevel: Logging planning to console.')
    else:
        print('toplevel: Logging planning to files.')
        __logger_process = utils.start_logger(subprocess.PIPE, initialization_time, 'Planning', logging)
        sys.stderr = __logger_process.stdin
        sys.stdout = __logger_process.stdin
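The utils.start_logger called above is project-specific and not shown on this
page. A minimal sketch consistent with this call site (the signature, the
log-file naming, and the tee child process are all assumptions) might look
like:

import subprocess

def start_logger(stdin_mode, initialization_time, name, logging_level):
    # Spawn a child process that copies its stdin to a log file; the caller
    # then points sys.stdout and sys.stderr at the returned process's stdin.
    # logging_level is accepted but unused in this sketch.
    logfile = '%s_%s.log' % (name, initialization_time)
    return subprocess.Popen(['tee', logfile],        # POSIX-only sink
                            stdin=stdin_mode,        # subprocess.PIPE above
                            universal_newlines=True)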
Example #2
def runGetSeqENA(args):
    start_time = time.time()

    listENA_IDs = utils.getListIDs(os.path.abspath(args.listENAids.name))
    outdir = os.path.abspath(args.outdir)
    utils.check_create_directory(outdir)
    asperaKey = args.asperaKey
    if asperaKey is not None:
        asperaKey = os.path.abspath(asperaKey.name)

    # Start logger
    logfile = utils.start_logger(outdir)

    # Get general information
    utils.general_information(logfile, version)

    # Check programs
    requiredPrograms(args)

    runs_successfully = 0
    with open(os.path.join(outdir, 'getSeqENA.report.txt'), 'wt') as writer:
        header_sequencing = ['run_accession', 'instrument_platform', 'instrument_model',
                             'library_layout', 'library_source', 'extra_run_accession',
                             'nominal_length', 'read_count', 'base_count', 'date_download']
        writer.write('#sample' + '\t' + '\t'.join(header_sequencing) + '\n')
        for ena_id in listENA_IDs:
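            # when --maximumSamples is not given, keep the limit one ahead of
            # the success count so the download loop never stops early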
            if args.maximumSamples is None:
                maximumSamples = runs_successfully + 1
            else:
                maximumSamples = args.maximumSamples

            if runs_successfully < maximumSamples:
                print('\nDownload ENA_ID ' + ena_id)

                ena_id_folder = os.path.join(outdir, ena_id)
                utils.check_create_directory(ena_id_folder)

                sequencingInformation = {'run_accession': None, 'instrument_platform': None,
                                         'instrument_model': None, 'library_layout': None,
                                         'library_source': None, 'extra_run_accession': None,
                                         'nominal_length': None, 'read_count': None,
                                         'base_count': None, 'date_download': None}
                time_taken, run_successfully, fastq_files, sequencingInformation = \
                    download.run_download(ena_id, args.downloadLibrariesType, asperaKey,
                                          ena_id_folder, args.downloadCramBam, args.threads,
                                          args.downloadInstrumentPlatform, args.SRA,
                                          args.SRAopt)

                if run_successfully:
                    runs_successfully += 1
                else:
                    utils.removeDirectory(ena_id_folder)
                    print(ena_id + ' was not downloaded')

                writer.write(ena_id + '\t' + '\t'.join([str(sequencingInformation[i]) for i in header_sequencing]) + '\n')
            else:
                break

    utils.runTime(start_time)  # return value unused

    if runs_successfully == 0:
        sys.exit('No ENA_IDs were successfully downloaded!')
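Note that utils differs from project to project on this page: here
start_logger takes an output directory and returns a log-file handle, while
the runCampyGenomes example further down unpacks the same call into a
(logfile, time_str) pair. The examples share a function name, not an API.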
Example #3
    parser.add_argument("--preprocessed_dataset_path",
                        type=str,
                        default=preprocessed_dataset_path)
    parser.add_argument("--embeddings_filename",
                        type=str,
                        default=embeddings_filename)
    parser.add_argument("--weights_filename",
                        type=str,
                        default=weights_filename)
    parser.add_argument("--logging_filename",
                        type=str,
                        default=logging_filename)
    parser.add_argument("--num_epochs", type=int, default=100)
    parser.add_argument("--batch_size", type=int, default=32)
    args = parser.parse_args()
    start_logger(args.logging_filename)
    atexit.register(stop_logger)

    index_filename = os.path.join(args.preprocessed_dataset_path, "index.pkl")
    print("Loading filename: {}".format(index_filename))
    with open(index_filename, mode="rb") as in_file:
        index = pickle.load(in_file)
        token2id = index["token2id"]
        id2token = index["id2token"]
        m_out2id = index["m_out2id"]
        id2m_out = index["id2m_out"]
        r_out2id = index["r_out2id"]
        id2r_out = index["id2r_out"]

    train_filename = os.path.join(args.preprocessed_dataset_path, "train.pkl")
    print("Loading filename: {}".format(train_filename))
Example #4
from utils import start_logger, stop_logger

if __name__ == "__main__":
    random_seed = 12345
    os.environ["PYTHONHASHSEED"] = str(random_seed)
    random.seed(random_seed)
    np.random.seed(random_seed)
    tf.set_random_seed(random_seed)
    parser = ArgumentParser()
    parser.add_argument("--test_filename", type=str, required=True)
    parser.add_argument("--model_filename", type=str, required=True)
    parser.add_argument("--img_names_filename", type=str, required=True)
    parser.add_argument("--img_features_filename", type=str, required=True)
    parser.add_argument("--result_filename", type=str, required=True)
    args = parser.parse_args()
    start_logger(args.result_filename + ".log")
    atexit.register(stop_logger)

    print("-- Loading params")
    with open(args.model_filename + ".params", mode="r") as in_file:
        params = json.load(in_file)

    print("-- Loading index")
    with open(args.model_filename + ".index", mode="rb") as in_file:
        index = pickle.load(in_file)
        token2id = index["token2id"]
        id2token = index["id2token"]
        label2id = index["label2id"]
        id2label = index["id2label"]
        num_tokens = len(token2id)
        num_labels = len(label2id)
Example #5
    parser.add_argument("--embeddings_size", type=int, default=300)
    parser.add_argument("--train_embeddings", type=bool, default=True)
    parser.add_argument("--img_features_size", type=int, default=2048)
    parser.add_argument("--rnn_hidden_size", type=int, default=512)
    parser.add_argument("--dropout_ratio", type=float, default=0.5)
    parser.add_argument("--multimodal_fusion_hidden_size",
                        type=int,
                        default=512)
    parser.add_argument("--classification_hidden_size", type=int, default=512)
    parser.add_argument("--batch_size", type=int, default=256)
    parser.add_argument("--num_epochs", type=int, default=100)
    parser.add_argument("--learning_rate", type=float, default=0.001)
    parser.add_argument("--l2_reg", type=float, default=0.000005)
    parser.add_argument("--patience", type=int, default=3)
    args = parser.parse_args()
    start_logger(args.model_save_filename + ".train_log")
    atexit.register(stop_logger)

    print("-- Building vocabulary")
    embeddings, token2id, id2token = load_glove(args.vectors_filename,
                                                args.max_vocab,
                                                args.embeddings_size)
    label2id = {"neutral": 0, "entailment": 1, "contradiction": 2}
    id2label = {v: k for k, v in label2id.items()}
    num_tokens = len(token2id)
    num_labels = len(label2id)
    print("Number of tokens: {}".format(num_tokens))
    print("Number of labels: {}".format(num_labels))

    with open(args.model_save_filename + ".params", mode="w") as out_file:
        json.dump(vars(args), out_file)
Example #6
                    required=True)

args = parser.parse_args()

path_signalmedia_json = 'signalmedia-1m.jsonl'
path_newsreader_nafs = 'naf'

for start in range(args.start_line, args.end_line, args.batch_size):
    end = start + args.batch_size

    exp_basename = '%s_%s' % (start, end)
    log_path = 'logs/%s.log' % exp_basename
    output_path = 'signalmedia_big_rdf/%s.ttl' % exp_basename

    g = Graph()
    logger = utils.start_logger(log_path)

    the_generator = utils.process_first_x_files(
        path_signalmedia_json,
        path_newsreader_nafs=path_newsreader_nafs,
        start=start,
        end=end)

    for counter, info_about_news_item in enumerate(the_generator, start):
        g = utils.json2rdf(info_about_news_item, g)
        #        break
        if counter % 100 == 0:
            logger.info('processed %s files' % counter)

    g.serialize(destination=output_path, format='turtle')
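Unlike the variants above that redirect stdout, the start_logger used here
returns an object with an info() method, i.e. something like a standard
logging.Logger. A minimal sketch under that assumption:

import logging

def start_logger(log_path):
    # Configure and return a logger that appends INFO records to log_path.
    logger = logging.getLogger(log_path)
    logger.setLevel(logging.INFO)
    handler = logging.FileHandler(log_path)
    handler.setFormatter(
        logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
    logger.addHandler(handler)
    return logger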
Example #7
def main(_):

    BATCH_SIZE_INFERENCE = 1

    random_seed = 12345
    os.environ["PYTHONHASHSEED"] = str(random_seed)
    random.seed(random_seed)
    np.random.seed(random_seed)
    tf.set_random_seed(random_seed)

    start_logger(FLAGS.result_filename + ".log")
    atexit.register(stop_logger)

    print("-- Loading params")
    with open(FLAGS.model_filename + ".params", mode="r") as in_file:
        params = json.load(in_file)

    print("-- Loading index")
    with open(FLAGS.model_filename + ".index", mode="rb") as in_file:
        index = pickle.load(in_file)
        token2id = index["token2id"]
        id2token = index["id2token"]
        label2id = index["label2id"]
        id2label = index["id2label"]
        num_tokens = len(token2id)
        num_labels = len(label2id)

    print("Number of tokens: {}".format(num_tokens))
    print("Number of labels: {}".format(num_labels))

    model_config.set_vocab_size(num_tokens)
    print("Vocab size set!")

    print("-- Loading test set")
    test_labels, test_padded_explanations, test_padded_premises, test_padded_hypotheses, test_img_names, test_original_explanations, test_original_premises, test_original_hypotheses, test_max_length, test_pairIDs = \
        load_e_vsnli_dataset(
            FLAGS.test_filename,
            token2id,
            label2id,
            buffer_size=FLAGS.buffer_size,
        )

    if FLAGS.imbalance:
        #class_freqs = np.load(FLAGS.model_filename + '_class_freqs.npy')

        test_num_examples = test_labels.shape[0]
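        # inverse-frequency weighting: a class at the uniform frequency of
        # 1 / num_labels gets weight 1, and rarer classes count for more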
        class_freqs = np.bincount(test_labels) / test_num_examples
        class_weights = 1 / (class_freqs * num_labels)
        print("Class frequencies: ", class_freqs)
        print("Weights: ", class_weights)

    test_original_premises = np.array(test_original_premises)
    test_original_hypotheses = np.array(test_original_hypotheses)
    test_original_explanations = np.array(test_original_explanations)

    print("-- Loading images")
    image_reader = ImageReader(FLAGS.img_names_filename,
                               FLAGS.img_features_filename)

    ilabel2itoken = {}
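    # map each label id to the token id of its label word, falling back to
    # the unknown token when the label is not in the vocabulary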
    for i in id2label:
        label = id2label[i]
        if label in token2id:
            j = token2id[label]
        else:
            j = token2id["#unk#"]
        ilabel2itoken[i] = j
    print("label_id --> token_id: constructed!")

    model_config.set_alpha(params['alpha'])

    # Build the TensorFlow graph and train it
    g = tf.Graph()
    with g.as_default():
        # Build the model.

        model = build_model(model_config,
                            embeddings=None,
                            mode=mode,
                            inference_batch=BATCH_SIZE_INFERENCE)

        generator = LabelExplanationGenerator(
            model,
            vocab=token2id,
            ilabel2itoken=ilabel2itoken,
            max_explanation_length=model_config.padded_length - 1)

        # run training
        init = tf.global_variables_initializer()
        with tf.Session() as session:

            session.run(init)

            model['saver'].restore(session, FLAGS.model_filename + ".ckpt")

            print("Model restored! Last step run: ",
                  session.run(model['global_step']))

            print("-- Evaluating model")
            test_num_examples = test_labels.shape[0]
            test_batches_indexes = np.arange(test_num_examples)
            test_num_correct = 0
            y_true = []
            y_pred = []

            with open(FLAGS.result_filename + ".predictions",
                      mode="w") as out_file:
                writer = csv.writer(out_file, delimiter="\t")
                for indexes in batch(test_batches_indexes, FLAGS.batch_size):

                    test_batch_pairIDs = test_pairIDs[indexes]

                    test_batch_premises = test_padded_premises[indexes]
                    test_batch_hypotheses = test_padded_hypotheses[indexes]
                    test_batch_labels = test_labels[indexes]
                    test_batch_explanations = test_padded_explanations[indexes]
                    batch_img_names = [test_img_names[i] for i in indexes]
                    batch_img_features = image_reader.get_features(
                        batch_img_names)

                    test_batch_original_premises = test_original_premises[
                        indexes]
                    test_batch_original_hypotheses = test_original_hypotheses[
                        indexes]
                    test_batch_original_explanations = test_original_explanations[
                        indexes]

                    #                     pred_explanations, pred_labels = _step_test(session, test_batch_hypotheses, test_batch_labels, test_batch_explanations, test_batch_img_features, BATCH_SIZE_INFERENCE, model, 1.0) # the output is size (32, 16)
                    #pred_explanations = [unpack.reshape(-1, 1) for unpack in pred_explanations]
                    #pred_explanations = np.concatenate(pred_explanations, 1)

                    pred_labels, pred_explanations = run_inference(
                        session, test_batch_hypotheses, batch_img_features,
                        generator, 1.0)

                    # don't decode the first token which corresponds to the prepended label
                    # nor the last because it is <end>
                    pred_explanations_decoded = [
                        decode(pred_explanations[i][1:-1], id2token)
                        for i in range(len(indexes))
                    ]

                    #batch_bleu = corpus_bleu(test_batch_original_explanations, pred_explanations_decoded)
                    #print("Current BLEU score: ", batch_bleu)

                    if FLAGS.imbalance:
                        test_num_correct += np.dot(
                            (pred_labels == test_batch_labels),
                            class_weights[pred_labels])
                    else:
                        test_num_correct += (
                            pred_labels == test_batch_labels).sum()

                    # add explanations in result file
                    for i in range(len(indexes)):

                        writer.writerow([
                            id2label[test_batch_labels[i]],
                            id2label[pred_labels[i]],
                            " ".join([
                                id2token[id] for id in test_batch_premises[i]
                                if id != token2id["#pad#"]
                            ]),
                            " ".join([
                                id2token[id] for id in test_batch_hypotheses[i]
                                if id != token2id["#pad#"]
                            ]),
                            batch_img_names[i],
                            test_batch_original_premises[i],
                            test_batch_original_hypotheses[i],
                            #test_batch_original_explanations[i],
                            " ".join([
                                id2token[id]
                                for id in test_batch_explanations[i]
                                if id != token2id["#pad#"]
                            ]),
                            pred_explanations_decoded[i],
                            [],
                            test_batch_pairIDs[i]
                            #list(np.where(pred_atts[i]>0.1)[0])
                        ])
                        y_true.append(id2label[test_batch_labels[i]])
                        y_pred.append(id2label[pred_labels[i]])
            test_accuracy = float(test_num_correct) / test_num_examples
            print("Mean test accuracy: {}".format(test_accuracy))
            y_true = pd.Series(y_true, name="Actual")
            y_pred = pd.Series(y_pred, name="Predicted")
            confusion_matrix = pd.crosstab(y_true, y_pred, margins=True)
            confusion_matrix.to_csv(FLAGS.result_filename +
                                    ".confusion_matrix")

            # TODO: evaluation for explanations

            data = pd.read_csv(FLAGS.result_filename + ".predictions",
                               sep="\t",
                               header=None,
                               names=[
                                   "gold_label", "predicted_label",
                                   "premise_toks", "hypothesis_toks", "jpg",
                                   "premise", "hypothesis",
                                   "original_explanation",
                                   "generated_explanation", "top_rois",
                                   "pairID"
                               ])

            print("Overall accuracy: {}".format(
                accuracy_score(data["gold_label"], data["predicted_label"])))

            data_entailment = data.loc[data["gold_label"] == "entailment"]
            print("Accuracy for 'entailment': {}".format(
                accuracy_score(data_entailment["gold_label"],
                               data_entailment["predicted_label"])))

            data_contradiction = data.loc[data["gold_label"] ==
                                          "contradiction"]
            print("Accuracy for 'contradiction': {}".format(
                accuracy_score(data_contradiction["gold_label"],
                               data_contradiction["predicted_label"])))

            data_neutral = data.loc[data["gold_label"] == "neutral"]
            print("Accuracy for 'neutral': {}".format(
                accuracy_score(data_neutral["gold_label"],
                               data_neutral["predicted_label"])))

Example #8
def get_first_collection_status(collection_id: str):
    if ENDPOINTS["first_collection"] is None:
        raise Exception("You must run get_collections_status() before running this test")

    return get_endpoint_status(ENDPOINTS["first_collection"], "first_collection")


def get_first_collection_features_status():
    if ENDPOINTS["first_collection_features"] is None:
        raise Exception("You must run get_first_collection_status() before running this test")

    return get_endpoint_status(ENDPOINTS["first_collection_features"], "first_collection_features")


def get_first_collection_first_feature_status():
    if ENDPOINTS["first_collection_first_feature"] is None:
        raise Exception("You must run get_first_collection_features_status() before running this test")

    return get_endpoint_status(ENDPOINTS["first_collection_first_feature"], "first_collection_first_feature")


if __name__ == "__main__":
    start_logger()
    logging.info("running test_endpoints main()")
    get_landing_page_status("https://google.com/ggggg")
Example #9
def main(_):

    random_seed = 12345
    os.environ["PYTHONHASHSEED"] = str(random_seed)
    random.seed(random_seed)
    np.random.seed(random_seed)
    tf.set_random_seed(random_seed)

    start_logger(FLAGS.model_save_filename + ".train_log")
    atexit.register(stop_logger)

    print("-- Building vocabulary")
    #embeddings, token2id, id2token = load_glove(args.vectors_filename, args.max_vocab, args.embeddings_size)

    label2id = {"neutral": 0, "entailment": 1, "contradiction": 2}
    id2label = {v: k for k, v in label2id.items()}

    #num_tokens = len(token2id)

    num_labels = len(label2id)

    #print("Number of tokens: {}".format(num_tokens))
    print("Number of labels: {}".format(num_labels))

    # Load e_vsnli
    # Explanations are encoded/padded, we ignore original explanations
    print("-- Loading training set")

    train_labels, train_explanations, train_premises, train_hypotheses, train_img_names, _, _, _, train_max_length, embeddings, token2id, id2token, _ = \
        load_e_vsnli_dataset_and_glove(
            FLAGS.train_filename,
            label2id,
            FLAGS.vectors_filename,
            FLAGS.max_vocab,
            model_config.embedding_size,
            buffer_size=FLAGS.buffer_size,
            min_threshold=FLAGS.min_threshold,
        )

    num_tokens = len(token2id)
    print("Number of tokens after filtering: ", num_tokens)

    print("-- Loading development set")
    dev_labels, dev_explanations, dev_premises, dev_hypotheses, dev_img_names, dev_original_explanations, _, _, dev_max_length, _ = \
        load_e_vsnli_dataset(
            FLAGS.dev_filename,
            token2id,
            label2id,
            buffer_size=FLAGS.buffer_size,
            padding_length=train_max_length,
        )

    if FLAGS.imbalance:
        dev_num_examples = dev_labels.shape[0]
        class_freqs = np.bincount(dev_labels) / dev_num_examples
        class_weights = 1 / (class_freqs * num_labels)
        print("Class frequencies: ", class_freqs)
        print("Weights: ", class_weights)
        np.save(FLAGS.model_save_filename + '_class_freqs.npy', class_freqs)
    print("-- Loading images")
    image_reader = ImageReader(FLAGS.img_names_filename,
                               FLAGS.img_features_filename)

    print("-- Saving parameters")
    with open(FLAGS.model_save_filename + ".params", mode="w") as out_file:
        json.dump(vars(FLAGS), out_file)
        print("Params saved to: {}".format(FLAGS.model_save_filename +
                                           ".params"))

        with open(FLAGS.model_save_filename + ".index", mode="wb") as out_file:
            pickle.dump(
                {
                    "token2id": token2id,
                    "id2token": id2token,
                    "label2id": label2id,
                    "id2label": id2label
                }, out_file)
            print("Index saved to: {}".format(FLAGS.model_save_filename +
                                              ".index"))

    model_config.set_vocab_size(num_tokens)
    print("Vocab size, set to %d" % model_config.vocab_size)
    model_config.set_alpha(FLAGS.alpha)
    print("alpha = %f, set!" % model_config.alpha)

    ilabel2itoken = {}
    for i in id2label:
        label = id2label[i]
        if label in token2id:
            j = token2id[label]
        else:
            j = token2id["#unk#"]
        ilabel2itoken[i] = j

    print("label_id --> token_id: constructed!")

    num_examples = train_labels.shape[0]
    num_batches = num_examples // FLAGS.batch_size

    dev_num_examples = dev_labels.shape[0]
    dev_batches_indexes = np.arange(dev_num_examples)
    num_batches_dev = dev_num_examples // FLAGS.dev_batch_size

    tf.reset_default_graph()

    # Build the TensorFlow graph and train it
    g = tf.Graph()
    with g.as_default():

        model = build_model(model_config,
                            embeddings,
                            ilabel2itoken=ilabel2itoken,
                            mode=mode)

        # Set up the learning rate.
        learning_rate_decay_fn = None
        learning_rate = tf.constant(training_config.initial_learning_rate)
        if training_config.learning_rate_decay_factor > 0:
            num_batches_per_epoch = (num_examples / FLAGS.batch_size)
            decay_steps = int(num_batches_per_epoch *
                              training_config.num_epochs_per_decay)

            def _learning_rate_decay_fn(learning_rate, global_step):
                return tf.train.exponential_decay(
                    learning_rate,
                    global_step,
                    decay_steps=decay_steps,
                    decay_rate=training_config.learning_rate_decay_factor,
                    staircase=True)

            learning_rate_decay_fn = _learning_rate_decay_fn

        # Set up the training ops.
        train_op = tf.contrib.layers.optimize_loss(
            loss=model['total_loss'],
            global_step=model['global_step'],
            learning_rate=learning_rate,
            optimizer=training_config.optimizer,
            clip_gradients=training_config.clip_gradients,
            learning_rate_decay_fn=learning_rate_decay_fn)

        dev_best_accuracy = -1
        stopping_step = 0
        best_epoch = None
        should_stop = False

        # initialize all variables
        init = tf.global_variables_initializer()

        with tf.Session() as session:
            session.run(init)
            #session.run(tf.initializers.tables_initializer(name='init_all_tables'))

            t = 0  # counting iterations

            time_now = datetime.now()

            for epoch in range(training_config.total_num_epochs):
                if should_stop:
                    break

                print("\n==> Online epoch # {0}".format(epoch + 1))
                progress = Progbar(num_batches)
                batches_indexes = np.arange(num_examples)
                np.random.shuffle(batches_indexes)
                batch_index = 1
                loss_history = []
                epoch_loss = 0

                for indexes in batch(batches_indexes, FLAGS.batch_size):

                    t += 1
                    batch_hypotheses = train_hypotheses[indexes]
                    batch_labels = train_labels[indexes]

                    # explanations have been encoded / padded when loaded
                    batch_explanations = train_explanations[indexes]
                    batch_explanation_lengths = [
                        len(expl) for expl in batch_explanations
                    ]

                    batch_img_names = [train_img_names[i] for i in indexes]
                    batch_img_features = image_reader.get_features(
                        batch_img_names)

                    total_loss_value = _step(
                        session, batch_hypotheses, batch_labels,
                        batch_explanations, batch_img_features, train_op,
                        model, model_config.lstm_dropout_keep_prob
                    )  # run each training step

                    progress.update(batch_index, [("Loss", total_loss_value)])
                    loss_history.append(total_loss_value)
                    epoch_loss += total_loss_value
                    batch_index += 1

                    if FLAGS.print_every > 0 and t % FLAGS.print_every == 0:
                        print(
                            '(Iteration %d) loss: %f, and time elapsed: %.2f minutes'
                            % (t + 1, float(loss_history[-1]),
                               (datetime.now() - time_now).seconds / 60.0))

                print("Current mean training loss: {}\n".format(epoch_loss /
                                                                num_batches))

                print("-- Validating model")

                progress = Progbar(num_batches_dev)

                dev_num_correct = 0
                dev_batch_index = 0

                for indexes in batch(dev_batches_indexes,
                                     FLAGS.dev_batch_size):

                    t += 1

                    dev_batch_num_correct = 0

                    dev_batch_index += 1
                    dev_batch_hypotheses = dev_hypotheses[indexes]
                    dev_batch_labels = dev_labels[indexes]

                    # explanations have been encoded / padded when loaded
                    dev_batch_explanations = dev_explanations[indexes]
                    dev_batch_img_names = [dev_img_names[i] for i in indexes]
                    dev_batch_img_features = image_reader.get_features(
                        dev_batch_img_names)

                    pred_explanations, pred_labels = _run_validation(
                        session, dev_batch_hypotheses, dev_batch_labels,
                        dev_batch_explanations, dev_batch_img_features,
                        len(indexes), ilabel2itoken, model, 1.0)

                    if FLAGS.imbalance:
                        dev_batch_num_correct += np.dot(
                            pred_labels == dev_batch_labels,
                            class_weights[dev_batch_labels])
                    else:
                        dev_batch_num_correct += (
                            pred_labels == dev_batch_labels).sum()
                    dev_num_correct += dev_batch_num_correct

                    progress.update(
                        dev_batch_index,
                        [("Proportion of correct labels",
                          float(dev_batch_num_correct) / len(indexes))])
                    if FLAGS.sample_every > 0 and (
                            t + 1) % FLAGS.sample_every == 0:
                        pred_explanations = [
                            unpack.reshape(-1, 1)
                            for unpack in pred_explanations
                        ]
                        pred_explanations = np.concatenate(
                            pred_explanations, 1)
                        pred_explanations_decoded = [
                            decode(pred_explanations[i], id2token)
                            for i in range(len(indexes))
                        ]
                        print("\nExample generated explanation: ",
                              pred_explanations_decoded[0])  #TODO: decode it
                        #print("Original explanation: ", dev_original_explanations[indexes][0])

                dev_accuracy = float(dev_num_correct) / dev_num_examples
                print("Current mean validation accuracy: {}".format(
                    dev_accuracy))

                #if True:
                if dev_accuracy > dev_best_accuracy:
                    stopping_step = 0
                    best_epoch = epoch + 1
                    dev_best_accuracy = dev_accuracy
                    model['saver'].save(session,
                                        FLAGS.model_save_filename + ".ckpt")
                    print(
                        "Best mean validation accuracy: {} (reached at epoch {})"
                        .format(dev_best_accuracy, best_epoch))
                    print("Best model saved to: {}".format(
                        FLAGS.model_save_filename))
                else:
                    stopping_step += 1
                    print("Current stopping step: {}".format(stopping_step))
                if stopping_step >= FLAGS.patience:
                    print("Early stopping at epoch {}!".format(epoch + 1))
                    print(
                        "Best mean validation accuracy: {} (reached at epoch {})"
                        .format(dev_best_accuracy, best_epoch))
                    should_stop = True
                if epoch + 1 >= training_config.total_num_epochs:
                    print("Stopping at epoch {}!".format(epoch + 1))
                    print(
                        "Best mean validation accuracy: {} (reached at epoch {})"
                        .format(dev_best_accuracy, best_epoch))
Example #10
from glob import glob
import os
from multiprocessing.dummy import Pool as ThreadPool

import utils

logger = utils.start_logger('log.txt')

main_input_folder = '/mnt/scistor1/group/marten/babelfied-wikipediaXML/'
main_output_folder = 'output'

for input_folder in glob(main_input_folder + '*'):
    if os.path.isdir(input_folder):
        dir_name = os.path.basename(input_folder)

        if dir_name != '39':
            continue

        synset_output_path = main_output_folder + '/synset/' + dir_name + '.txt'
        hdn_output_path = main_output_folder + '/hdn/' + dir_name + '.txt'

        iterable = glob(input_folder + '/*.xml.gz')

        logger.info('starting with folder %s' % input_folder)

        pool = ThreadPool(20)
        all_generators = pool.map(utils.get_instances, iterable)

        num_docs = len(all_generators)
        synset_count = 0
        hdn_count = 0
Example #11
def runCampyGenomes(args):
    start_time = time.time()

    listRunIDs = utils.getListIDs(os.path.abspath(args.listRunIDs.name))
    outdir = os.path.abspath(args.outdir)
    utils.check_create_directory(outdir)
    asperaKey = args.asperaKey.name
    threads_to_use = [j for j in general_threads_to_use if j <= args.threads]

    # Start logger
    logfile, time_str = utils.start_logger(outdir)

    # Get general information
    utils.general_information(logfile, version, outdir, time_str)

    # Check programs
    requiredPrograms()

    # Randomize the list with Run IDs
    random.shuffle(listRunIDs)

    number_process = determineNumberProcess(threads_to_use)

    samples_each_threads = determineBatchSamples(listRunIDs, threads_to_use)

    run_successfully = 0
    with open(
            os.path.join(outdir, 'samples_with_problems.' + time_str + '.tab'),
            'wt') as writer_success:
        with open(os.path.join(outdir, 'running_times.' + time_str + '.tab'),
                  'wt') as writer_times:

            for threads in samples_each_threads:
                print('\nRunning for ' + str(threads) + ' threads\n')
                threads_dir = os.path.join(outdir,
                                           str(threads) + '_threads', '')
                utils.check_create_directory(threads_dir)

                pool = multiprocessing.Pool(processes=number_process[threads])
                for sample in samples_each_threads[threads]:
                    pool.apply_async(downloadAndINNUca,
                                     args=(
                                         threads_dir,
                                         sample,
                                         asperaKey,
                                         threads,
                                     ))
                pool.close()
                pool.join()

                removeFiles(threads_dir, '.log')
                removeFiles(threads_dir, 'getSeqENA.samples_with_problems.txt')
                removeFiles(threads_dir, '.cpu.txt')

                samples_directories = [
                    d for d in os.listdir(threads_dir) if not d.startswith('.')
                    and os.path.isdir(os.path.join(threads_dir, d, ''))
                ]
                for sample_dir in samples_directories:
                    sample_dir_path = os.path.join(threads_dir, sample_dir, '')

                    files = [
                        f for f in os.listdir(sample_dir_path)
                        if not f.startswith('.')
                        and os.path.isfile(os.path.join(sample_dir_path, f))
                    ]
                    for file_found in files:
                        file_path = os.path.join(sample_dir_path, file_found)
                        if file_found == sample_dir + '_run_successfully.pkl':
                            sample_run_successfully = utils.extractVariableFromPickle(
                                file_path)
                            if not sample_run_successfully:
                                writer_success.write(sample_dir + '\t' +
                                                     threads_dir + '\n')
                            else:
                                run_successfully += 1
                            os.remove(file_path)
                        elif file_found == sample_dir + '_downloadAndINNUca_time.pkl':
                            time_taken = utils.extractVariableFromPickle(
                                file_path)
                            writer_times.write(sample_dir + '\t' +
                                               threads_dir + '\t' +
                                               str(time_taken) + '\n')
                            os.remove(file_path)

    utils.runTime(start_time)  # return value unused

    if run_successfully == 0:
        sys.exit('No RunIDs were successfully run!')
    else:
        print(str(run_successfully) + ' samples out of ' + str(len(listRunIDs)) + ' run successfully')
Example #12
def main(_):

    BATCH_SIZE_INFERENCE = 1

    random_seed = 12345
    os.environ["PYTHONHASHSEED"] = str(random_seed)
    random.seed(random_seed)
    np.random.seed(random_seed)
    tf.set_random_seed(random_seed)

    start_logger(FLAGS.result_filename + ".log")
    atexit.register(stop_logger)

    print("-- Loading params")
    with open(FLAGS.model_filename + ".params", mode="r") as in_file:
        params = json.load(in_file)

    print("-- Loading index")
    with open(FLAGS.model_filename + ".index", mode="rb") as in_file:
        index = pickle.load(in_file)
        token2id = index["token2id"]
        id2token = index["id2token"]
        label2id = index["label2id"]
        id2label = index["id2label"]
        num_tokens = len(token2id)
        num_labels = len(label2id)

    print("Number of tokens: {}".format(num_tokens))
    print("Number of labels: {}".format(num_labels))

    model_config.set_vocab_size(num_tokens)
    print("Vocab size set!")

    print("-- Loading test set")
    test_labels, test_padded_explanations, test_padded_premises, test_padded_hypotheses, test_img_names, test_original_explanations, test_original_premises, test_original_hypotheses, test_max_length, test_pairIDs = \
        load_e_vsnli_dataset(
            FLAGS.test_filename,
            token2id,
            label2id,
            buffer_size=FLAGS.buffer_size,
        )

    if FLAGS.imbalance:
        #class_freqs = np.load(FLAGS.model_filename + '_class_freqs.npy')

        test_num_examples = test_labels.shape[0]
        class_freqs = np.bincount(test_labels) / test_num_examples
        class_weights = 1 / (class_freqs * num_labels)
        print("Class frequencies: ", class_freqs)
        print("Weights: ", class_weights)

    test_original_premises = np.array(test_original_premises)
    test_original_hypotheses = np.array(test_original_hypotheses)
    test_original_explanations = np.array(test_original_explanations)

    print("-- Loading images")
    image_reader = ImageReader(FLAGS.img_names_filename,
                               FLAGS.img_features_filename)

    model_config.set_alpha(params['alpha'])

    # Build the TensorFlow graph and train it
    g = tf.Graph()
    with g.as_default():
        # Build the model.

        model = build_model(model_config,
                            embeddings=None,
                            mode=mode,
                            inference_batch=BATCH_SIZE_INFERENCE)

        generator = AttentionExplanationGenerator(
            model,
            vocab=token2id,
            max_explanation_length=model_config.padded_length - 1)

        # run training
        init = tf.global_variables_initializer()
        with tf.Session() as session:

            session.run(init)

            model['saver'].restore(session, FLAGS.model_filename + ".ckpt")

            print("Model restored! Last step run: ",
                  session.run(model['global_step']))

            print("-- Evaluating model")
            test_num_examples = test_labels.shape[0]
            test_batches_indexes = np.arange(test_num_examples)
            test_num_correct = 0
            y_true = []
            y_pred = []

            with open(FLAGS.result_filename + ".predictions",
                      mode="w") as out_file:
                writer = csv.writer(out_file, delimiter="\t")
                for indexes in batch(test_batches_indexes, FLAGS.batch_size):

                    test_batch_pairIDs = test_pairIDs[indexes]

                    test_batch_premises = test_padded_premises[indexes]
                    test_batch_hypotheses = test_padded_hypotheses[indexes]
                    test_batch_labels = test_labels[indexes]
                    test_batch_explanations = test_padded_explanations[indexes]
                    batch_img_names = [test_img_names[i] for i in indexes]
                    batch_img_features = image_reader.get_features(
                        batch_img_names)

                    test_batch_original_premises = test_original_premises[
                        indexes]
                    test_batch_original_hypotheses = test_original_hypotheses[
                        indexes]
                    test_batch_original_explanations = test_original_explanations[
                        indexes]

                    pred_attns, pred_explanations = run_inference_attn(
                        session, test_batch_hypotheses, batch_img_features,
                        generator, 1.0)

                    # don't decode the first token which corresponds to the prepended label
                    # nor the last because it is <end>
                    pred_explanations_decoded = [
                        decode(pred_explanations[i][1:-1], id2token)
                        for i in range(len(indexes))
                    ]

                    #batch_bleu = corpus_bleu(test_batch_original_explanations, pred_explanations_decoded)
                    #print("Current BLEU score: ", batch_bleu)

                    # add explanations in result file
                    for i in range(len(indexes)):

                        writer.writerow([
                            id2label[test_batch_labels[i]],
                            " ".join([
                                id2token[id] for id in test_batch_premises[i]
                                if id != token2id["#pad#"]
                            ]),
                            " ".join([
                                id2token[id] for id in test_batch_hypotheses[i]
                                if id != token2id["#pad#"]
                            ]),
                            batch_img_names[i],
                            test_batch_original_premises[i],
                            test_batch_original_hypotheses[i],
                            " ".join([
                                id2token[id]
                                for id in test_batch_explanations[i]
                                if id != token2id["#pad#"]
                            ]),
                            pred_explanations_decoded[i],
                            list(np.where(pred_attns[i] > 0.05)[0]),
                            #pred_attns[i][0],
                            test_batch_pairIDs[i]
                        ])

            data = pd.read_csv(FLAGS.result_filename + ".predictions",
                               sep="\t",
                               header=None,
                               names=[
                                   "gold_label", "premise_toks",
                                   "hypothesis_toks", "jpg", "premise",
                                   "hypothesis", "original_explanation",
                                   "generated_explanation", "top_rois"
                               ])