import json

import numpy as np
import pandas as pd
from scipy import stats
from scipy.spatial import distance


def evaluate_classification(embed_type, model_path, dataSetPath, otherDatasetPath):
    # get_model() and embed_sentences() are assumed to be defined in the surrounding module.
    model = get_model(embed_type, model_path)
    # Question ids that appear in the first dataset; rows referencing them are skipped below.
    ids = set()
    with open(dataSetPath, 'r', encoding="UTF-8") as data_file:
        data = json.load(data_file)
        for row in data:
            ids.add(row['url_1'].split('/')[-1])
            ids.add(row['url_2'].split('/')[-1])

    skipped = 0
    label2distance = {}  # relationship label -> list of cosine distances

    with open(otherDatasetPath, 'r', encoding="UTF-8") as other:
        df = pd.read_csv(other)
#        df = df.sample(frac=0.05, replace=False, random_state=1)
        
        print(df.count())  # non-null counts per column
        for _, row in df.iterrows():
            if row['q1_Id'] in ids or row['q2_Id'] in ids:
                skipped += 1
                continue
            srcEmbed = embed_sentences([row['q1_AnswersBody']], model, embed_type)
            dstEmbed = embed_sentences([row['q2_AnswersBody']], model, embed_type)
            linkedDist = distance.cosine(srcEmbed, dstEmbed)
            lbl = row['class'].strip().replace('"','')
            if lbl not in label2distance:
                label2distance[lbl] = []
            label2distance[lbl].append(linkedDist)

    print('skipped:' + str(skipped))
    
    # Report the mean cosine distance and sample count per label.
    for key in label2distance:
        print(key)
        print(np.mean(np.asarray(label2distance[key])))
        print(len(label2distance[key]))
    direct = label2distance['direct']
    duplicate = label2distance['duplicate']
    indirect = label2distance['indirect']
    isolated = label2distance['isolated']

    print('direct-isolated')
    print(stats.ttest_ind(direct, isolated))
    print('dup - isolated')
    print(stats.ttest_ind(duplicate, isolated))
    print('indirect - isolated')
    print(stats.ttest_ind(indirect, isolated))
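
A minimal invocation sketch for the function above. The file paths and the 'USE' embed type are hypothetical placeholders; the CSV is assumed to carry the q1_Id, q2_Id, q1_AnswersBody, q2_AnswersBody and class columns read in the loop.

# Hypothetical driver; every argument value below is a placeholder.
if __name__ == '__main__':
    evaluate_classification(
        embed_type='USE',                      # embedding backend handed to get_model()
        model_path='/path/to/model',           # local model directory
        dataSetPath='linked_posts.json',       # JSON rows with url_1/url_2 pairs to exclude
        otherDatasetPath='labeled_pairs.csv')  # CSV with question ids, answer bodies and class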
Example #2
    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """The `model_fn` for TPUEstimator."""
        config = util.initialize_from_env(use_tpu=FLAGS.use_tpu,
                                          config_params=FLAGS.config_params,
                                          config_file=FLAGS.config_filename)

        input_ids = features["flattened_input_ids"]
        input_mask = features["flattened_input_mask"]
        text_len = features["text_len"]
        speaker_ids = features["speaker_ids"]
        genre = features["genre"]
        gold_starts = features["span_starts"]
        gold_ends = features["span_ends"]
        cluster_ids = features["cluster_ids"]
        sentence_map = features["sentence_map"]
        # span_mention = features["span_mention"]

        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        model = util.get_model(config, model_sign="corefqa")

        if FLAGS.use_tpu:
            tf.logging.info(
                "****************************** Training on TPU ******************************"
            )

            def tpu_scaffold():
                return tf.train.Scaffold()

            scaffold_fn = tpu_scaffold
        else:
            scaffold_fn = None

        if mode == tf.estimator.ModeKeys.TRAIN:
            tf.logging.info(
                "****************************** tf.estimator.ModeKeys.TRAIN ******************************"
            )
            tf.logging.info("********* Features *********")
            for name in sorted(features.keys()):
                tf.logging.info("  name = %s, shape = %s" %
                                (name, features[name].shape))

            total_loss, topk_span_starts, topk_span_ends, top_antecedent_scores = model.get_predictions_and_loss(
                input_ids, input_mask, text_len, speaker_ids, genre,
                is_training, gold_starts, gold_ends, cluster_ids,
                sentence_map)  # , span_mention)

            # On TPU, wrap the optimizer so gradients are aggregated across shards.
            if config["tpu"]:
                optimizer = tf.train.AdamOptimizer(
                    learning_rate=config['learning_rate'],
                    beta1=0.9,
                    beta2=0.999,
                    epsilon=1e-08)
                optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
                train_op = optimizer.minimize(total_loss,
                                              tf.train.get_global_step())
                output_spec = tf.contrib.tpu.TPUEstimatorSpec(
                    mode=tf.estimator.ModeKeys.TRAIN,
                    loss=total_loss,
                    train_op=train_op,
                    scaffold_fn=scaffold_fn)
            else:
                optimizer = RAdam(learning_rate=config['learning_rate'],
                                  epsilon=1e-8,
                                  beta1=0.9,
                                  beta2=0.999)
                train_op = optimizer.minimize(total_loss,
                                              tf.train.get_global_step())

                training_logging_hook = tf.train.LoggingTensorHook(
                    {"loss": total_loss}, every_n_iter=1)
                output_spec = tf.contrib.tpu.TPUEstimatorSpec(
                    mode=tf.estimator.ModeKeys.TRAIN,
                    loss=total_loss,
                    train_op=train_op,
                    scaffold_fn=scaffold_fn,
                    training_hooks=[training_logging_hook])

        elif mode == tf.estimator.ModeKeys.EVAL:
            tf.logging.info(
                "****************************** tf.estimator.ModeKeys.EVAL ******************************"
            )
            tf.logging.info(
                "@@@@@ Only tf.estimator.ModeKeys.PREDICT is supported here. @@@@@")
            tf.logging.info(
                "@@@@@ Evaluate your checkpoints after training via the PREDICT path. @@@@@"
            )
            # No EstimatorSpec is built for EVAL, so fail fast instead of
            # returning an undefined `output_spec` below.
            raise NotImplementedError(
                "EVAL mode is not supported by this model_fn.")

        elif mode == tf.estimator.ModeKeys.PREDICT:
            tf.logging.info(
                "****************************** tf.estimator.ModeKeys.PREDICT ******************************"
            )
            total_loss, topk_span_starts, topk_span_ends, top_antecedent_scores = model.get_predictions_and_loss(
                input_ids, input_mask, text_len, speaker_ids, genre,
                is_training, gold_starts, gold_ends, cluster_ids,
                sentence_map)  #, span_mention)
            top_antecedent = tf.math.argmax(top_antecedent_scores, axis=-1)
            predictions = {
                "total_loss": total_loss,
                "topk_span_starts": topk_span_starts,
                "topk_span_ends": topk_span_ends,
                "top_antecedent_scores": top_antecedent_scores,
                "top_antecedent": top_antecedent,
                "cluster_ids": cluster_ids,
                "gold_starts": gold_starts,
                "gold_ends": gold_ends
            }

            output_spec = tf.contrib.tpu.TPUEstimatorSpec(
                mode=tf.estimator.ModeKeys.PREDICT,
                predictions=predictions,
                scaffold_fn=scaffold_fn)
        else:
            raise ValueError("Please check the mode!")
        return output_spec
Example #3
                str(np.array(map_elts_some).mean()))
            print(
                str(i) + ": mrrse_some: " +
                str(scipy.stats.sem(mrr_elts_some)))
            print(
                str(i) + ": mapse_some: " +
                str(scipy.stats.sem(map_elts_some)))
            print(str(i) + ": empty " + str(empty))
            print(str(i) + ": total " + str(len(countExpected)))


if __name__ == '__main__':
    if len(sys.argv) > 5:
        embedType = sys.argv[5]

    util.get_model(embedType)
    hierarchyPath = sys.argv[1]
    docPath = sys.argv[2]
    classPath = sys.argv[3]
    usagePath = sys.argv[4]
    #    hierarchyMaps = build_sibling_maps(hierarchyPath)
    (index, docList, docsToClasses, embeddedDocText,
     classesToDocs) = util.build_index_docs(docPath, embedType)
    top_k = 10
    query_distances, query_neighbors = index.search(embeddedDocText, top_k + 1)
    classesToDocstringNeighbors = compute_neighbor_docs(
        query_distances, query_neighbors, index, docList, docsToClasses,
        embeddedDocText)
    docstringsToDocstringNeighbors = util.compute_neighbor_docstrings(
        query_neighbors, docList)
    #    print(str(classesToDocstringNeighbors))
Example #4
def main(_):
    config = util.initialize_from_env(use_tpu=FLAGS.use_tpu,
                                      config_params=FLAGS.config_params,
                                      config_file=FLAGS.config_filename,
                                      print_info=True)

    tf.logging.set_verbosity(tf.logging.INFO)
    num_train_steps = config["num_docs"] * config["num_epochs"]

    keep_checkpoint_max = max(
        math.ceil(num_train_steps / config["save_checkpoints_steps"]),
        FLAGS.keep_checkpoint_max)

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict` must be True."
        )

    tf.gfile.MakeDirs(FLAGS.output_dir)
    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
        tf.config.experimental_connect_to_cluster(tpu_cluster_resolver)
        tf.tpu.experimental.initialize_tpu_system(tpu_cluster_resolver)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        evaluation_master=FLAGS.master,
        keep_checkpoint_max=keep_checkpoint_max,
        save_checkpoints_steps=config["save_checkpoints_steps"],
        session_config=tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=True),
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    model_fn = model_fn_builder(config)
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        eval_on_tpu=FLAGS.use_tpu,
        warm_start_from=tf.estimator.WarmStartSettings(
            config["init_checkpoint"], vars_to_warm_start="bert*"),
        model_fn=model_fn,
        config=run_config,
        train_batch_size=1,
        eval_batch_size=1,
        predict_batch_size=1)

    seq_length = config["max_segment_len"] * config["max_training_sentences"]

    if FLAGS.do_train:
        estimator.train(input_fn=file_based_input_fn_builder(
            config["train_path"],
            seq_length,
            config,
            is_training=True,
            drop_remainder=True),
                        max_steps=num_train_steps)

    if FLAGS.do_eval:
        best_dev_f1, best_dev_prec, best_dev_rec, test_f1_when_dev_best, test_prec_when_dev_best, test_rec_when_dev_best = 0, 0, 0, 0, 0, 0
        best_ckpt_path = ""
        checkpoints_iterator = [
            os.path.join(FLAGS.eval_dir,
                         "model.ckpt-{}".format(str(int(ckpt_idx))))
            for ckpt_idx in range(0, num_train_steps +
                                  1, config["save_checkpoints_steps"])
        ]
        model = util.get_model(config, model_sign="corefqa")
        # Skip the untrained step-0 checkpoint.
        for checkpoint_path in checkpoints_iterator[1:]:
            dev_coref_evaluator = metrics.CorefEvaluator()
            for result in estimator.predict(file_based_input_fn_builder(
                    config["dev_path"],
                    seq_length,
                    config,
                    is_training=False,
                    drop_remainder=False),
                                            checkpoint_path=checkpoint_path,
                                            yield_single_examples=False):
                predicted_clusters, gold_clusters, mention_to_predicted, mention_to_gold = model.evaluate(
                    result["topk_span_starts"], result["topk_span_ends"],
                    result["top_antecedent"], result["cluster_ids"],
                    result["gold_starts"], result["gold_ends"])
                dev_coref_evaluator.update(predicted_clusters, gold_clusters,
                                           mention_to_predicted,
                                           mention_to_gold)
            dev_prec, dev_rec, dev_f1 = dev_coref_evaluator.get_prf()
            tf.logging.info("***** Current ckpt path is ***** : {}".format(
                checkpoint_path))
            tf.logging.info("***** EVAL ON DEV SET *****")
            tf.logging.info(
                "***** [DEV EVAL] ***** : precision: {:.4f}, recall: {:.4f}, f1: {:.4f}"
                .format(dev_prec, dev_rec, dev_f1))
            if dev_f1 > best_dev_f1:
                best_ckpt_path = checkpoint_path
                best_dev_f1 = dev_f1
                best_dev_prec = dev_prec
                best_dev_rec = dev_rec
                test_coref_evaluator = metrics.CorefEvaluator()
                for result in estimator.predict(
                        file_based_input_fn_builder(config["test_path"],
                                                    seq_length,
                                                    config,
                                                    is_training=False,
                                                    drop_remainder=False),
                        checkpoint_path=checkpoint_path,
                        yield_single_examples=False):
                    predicted_clusters, gold_clusters, mention_to_predicted, mention_to_gold = model.evaluate(
                        result["topk_span_starts"], result["topk_span_ends"],
                        result["top_antecedent"], result["cluster_ids"],
                        result["gold_starts"], result["gold_ends"])
                    test_coref_evaluator.update(predicted_clusters,
                                                gold_clusters,
                                                mention_to_predicted,
                                                mention_to_gold)

                test_pre, test_rec, test_f1 = test_coref_evaluator.get_prf()
                test_f1_when_dev_best, test_prec_when_dev_best, test_rec_when_dev_best = test_f1, test_pre, test_rec
                tf.logging.info("***** EVAL ON TEST SET *****")
                tf.logging.info(
                    "***** [TEST EVAL] ***** : precision: {:.4f}, recall: {:.4f}, f1: {:.4f}"
                    .format(test_pre, test_rec, test_f1))

        tf.logging.info("*" * 20)
        tf.logging.info(
            "- @@@@@ the path to the BEST DEV result is : {}".format(
                best_ckpt_path))
        tf.logging.info(
            "- @@@@@ BEST DEV F1 : {:.4f}, Precision : {:.4f}, Recall : {:.4f},"
            .format(best_dev_f1, best_dev_prec, best_dev_rec))
        tf.logging.info(
            "- @@@@@ TEST when DEV best F1 : {:.4f}, Precision : {:.4f}, Recall : {:.4f},"
            .format(test_f1_when_dev_best, test_prec_when_dev_best,
                    test_rec_when_dev_best))

    if FLAGS.do_predict:
        coref_evaluator = metrics.CorefEvaluator()
        model = util.get_model(config, model_sign="corefqa")
        for result in estimator.predict(file_based_input_fn_builder(
                config["eval_path"],
                seq_length,
                config,
                is_training=False,
                drop_remainder=False),
                                        yield_single_examples=False):

            predicted_clusters, gold_clusters, mention_to_predicted, mention_to_gold = model.evaluate(
                result["topk_span_starts"], result["topk_span_ends"],
                result["top_antecedent"], result["cluster_ids"],
                result["gold_starts"], result["gold_ends"])
            coref_evaluator.update(predicted_clusters, gold_clusters,
                                   mention_to_predicted, mention_to_gold)

        p, r, f = coref_evaluator.get_prf()
        tf.logging.info(
            "Average precision: {:.4f}, Average recall: {:.4f}, Average F1 {:.4f}"
            .format(p, r, f))
Example #5
    if pars.prune:
        save_model_name = save_model_name + '_sparse_' + str(pars.sparse) + '_seed_' + str(pars.random_seed)
    if pars.emb_bag and not pars.qr_emb:
        save_model_name = save_model_name + '_emb_bag'
    if pars.qr_emb:
        save_model_name = save_model_name + '_qr'

    save_model_name = save_model_name + '_' + datetime.now().strftime("%Y%m%d%H%M%S")

    logger = get_logger(save_model_name[14:])
    logger.info(pars)

    logger.info("GET DATASET")
    field_size, train_dict, valid_dict, test_dict = get_dataset(pars)

    model = get_model(field_size=field_size, cuda=pars.use_cuda and torch.cuda.is_available(), feature_sizes=train_dict['feature_sizes'], pars=pars, logger=logger)
    #summary(model, [(train_dict['index'].shape[1], 1), (train_dict['value'].shape[1], )], dtypes=[torch.long, torch.float], device=torch.device("cpu"))

    if pars.use_cuda and torch.cuda.is_available():
        torch.cuda.empty_cache()
        #logger.info(torch.cuda.memory_summary(device=None, abbreviated=False))
        model = model.cuda()

    model.fit(train_dict['index'], train_dict['value'], train_dict['label'], valid_dict['index'],
              valid_dict['value'], valid_dict['label'],
              prune=pars.prune, prune_fm=pars.prune_fm, prune_r=pars.prune_r, prune_deep=pars.prune_deep,
              save_path=save_model_name, emb_r=pars.emb_r, emb_corr=pars.emb_corr, early_stopping=False)

    # measurements
    model = get_model(field_size=field_size, cuda=pars.time_on_cuda, feature_sizes=train_dict['feature_sizes'], pars=pars, logger=logger)
    model = load_model_dic(model, save_model_name, sparse=pars.prune)
Example #6
    def mention_proposal_model_fn(features, labels, mode, params):
        """The `model_fn` for TPUEstimator."""
        input_ids = features["flattened_input_ids"]
        input_mask = features["flattened_input_mask"]
        text_len = features["text_len"]
        speaker_ids = features["speaker_ids"]
        gold_starts = features["span_starts"]
        gold_ends = features["span_ends"]
        cluster_ids = features["cluster_ids"]
        sentence_map = features["sentence_map"]

        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        model = util.get_model(config, model_sign="mention_proposal")

        if config.use_tpu:

            def tpu_scaffold():
                return tf.train.Scaffold()

            scaffold_fn = tpu_scaffold
        else:
            scaffold_fn = None

        if mode == tf.estimator.ModeKeys.TRAIN:
            tf.logging.info(
                "****************************** tf.estimator.ModeKeys.TRAIN ******************************"
            )
            tf.logging.info("********* Features *********")
            for name in sorted(features.keys()):
                tf.logging.info("  name = %s, shape = %s" %
                                (name, features[name].shape))

            instance = (input_ids, input_mask, sentence_map, text_len,
                        speaker_ids, gold_starts, gold_ends, cluster_ids)
            total_loss, start_scores, end_scores, span_scores = model.get_mention_proposal_and_loss(
                instance, is_training)
            gold_start_sequence_labels, gold_end_sequence_labels, gold_span_sequence_labels = model.get_gold_mention_sequence_labels_from_pad_index(
                gold_starts, gold_ends, text_len)

            if config.use_tpu:
                optimizer = tf.train.AdamOptimizer(
                    learning_rate=config.learning_rate,
                    beta1=0.9,
                    beta2=0.999,
                    epsilon=1e-08)
                optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
                train_op = optimizer.minimize(total_loss,
                                              tf.train.get_global_step())
                output_spec = tf.contrib.tpu.TPUEstimatorSpec(
                    mode=mode,
                    loss=total_loss,
                    train_op=train_op,
                    scaffold_fn=scaffold_fn)
            else:
                optimizer = RAdam(learning_rate=config.learning_rate,
                                  epsilon=1e-8,
                                  beta1=0.9,
                                  beta2=0.999)
                train_op = optimizer.minimize(total_loss,
                                              tf.train.get_global_step())

                train_logging_hook = tf.train.LoggingTensorHook(
                    {"loss": total_loss}, every_n_iter=1)
                output_spec = tf.contrib.tpu.TPUEstimatorSpec(
                    mode=mode,
                    loss=total_loss,
                    train_op=train_op,
                    scaffold_fn=scaffold_fn,
                    training_hooks=[train_logging_hook])

        elif mode == tf.estimator.ModeKeys.EVAL:
            tf.logging.info(
                "****************************** tf.estimator.ModeKeys.EVAL ******************************"
            )

            instance = (input_ids, input_mask, sentence_map, text_len,
                        speaker_ids, gold_starts, gold_ends, cluster_ids)
            total_loss, start_scores, end_scores, span_scores = model.get_mention_proposal_and_loss(
                instance, is_training)
            gold_start_sequence_labels, gold_end_sequence_labels, gold_span_sequence_labels = model.get_gold_mention_sequence_labels_from_pad_index(
                gold_starts, gold_ends, text_len)

            def metric_fn(start_scores, end_scores, span_scores,
                          gold_span_label):
                start_scores = tf.reshape(start_scores,
                                          [-1, config.window_size])
                end_scores = tf.reshape(end_scores, [-1, config.window_size])
                start_scores = tf.tile(tf.expand_dims(start_scores, 2),
                                       [1, 1, config.window_size])
                end_scores = tf.tile(tf.expand_dims(end_scores, 2),
                                     [1, 1, config.window_size])
                sce_span_scores = (start_scores + end_scores + span_scores) / 3
                pred_span_label = tf.cast(
                    tf.reshape(
                        tf.math.greater_equal(sce_span_scores,
                                              config.mention_threshold), [-1]),
                    tf.bool)

                gold_span_label = tf.cast(
                    tf.reshape(gold_span_label, [-1]), tf.bool)

                return {
                    "precision":
                    tf.compat.v1.metrics.precision(gold_span_label,
                                                   pred_span_label),
                    "recall":
                    tf.compat.v1.metrics.recall(gold_span_label,
                                                pred_span_label)
                }

            # Pass the gold labels explicitly so metric_fn receives all four tensors.
            eval_metrics = (metric_fn, [
                start_scores, end_scores, span_scores, gold_span_sequence_labels
            ])
            output_spec = tf.contrib.tpu.TPUEstimatorSpec(
                mode=tf.estimator.ModeKeys.EVAL,
                loss=total_loss,
                eval_metrics=eval_metrics,
                scaffold_fn=scaffold_fn)

        elif mode == tf.estimator.ModeKeys.PREDICT:
            tf.logging.info(
                "****************************** tf.estimator.ModeKeys.PREDICT ******************************"
            )

            instance = (input_ids, input_mask, sentence_map, text_len,
                        speaker_ids, gold_starts, gold_ends, cluster_ids)
            total_loss, start_scores, end_scores, span_scores = model.get_mention_proposal_and_loss(
                instance, is_training)
            gold_start_sequence_labels, gold_end_sequence_labels, gold_span_sequence_labels = model.get_gold_mention_sequence_labels_from_pad_index(
                gold_starts, gold_ends, text_len)
            predictions = {
                "total_loss": total_loss,
                "start_scores": start_scores,
                "start_gold": gold_starts,
                "end_gold": gold_ends,
                "end_scores": end_scores,
                "span_scores": span_scores
            }
            output_spec = tf.contrib.tpu.TPUEstimatorSpec(
                mode=tf.estimator.ModeKeys.PREDICT,
                predictions=predictions,
                scaffold_fn=scaffold_fn)
        else:
            raise ValueError("Please check the mode!")

        return output_spec
Example #7
def embed_sentences(sentences, embed_type, model_dir=None):
    """Encode one or more sentences into embedding vectors with the selected model."""
    model = get_model(embed_type, model_dir)
    # Wrap a bare string so the USE encoder always receives a list of sentences.
    if embed_type == 'USE' and isinstance(sentences, str):
        sentences = [sentences]
    sentence_embeddings = model.encode(sentences)
    return sentence_embeddings
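
A brief usage sketch, assuming get_model is importable from the surrounding module and that 'USE' is one of its supported embed types; the sentences are placeholders, and the cosine comparison mirrors how the embeddings are consumed in the earlier snippets.

from scipy.spatial import distance

# Hypothetical sentences and embed type.
vecs = embed_sentences(["How do I sort a list in Python?",
                        "Sorting a Python list"], embed_type='USE')
print(distance.cosine(vecs[0], vecs[1]))  # smaller distance = more similar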
Example #8
                if distanceOrder[x][1] == correctAnswer:
                    recipRanks.append(reciprocal)
                    break

        # if len(embed_dic)!=0:
        with open(file_path, 'wb') as handle:
            pickle.dump(embed_dic, handle, protocol=pickle.HIGHEST_PROTOCOL)

    meanRecipRank = sum(recipRanks) / len(recipRanks)
    print('MRR: standard error of the mean ', stat.sem(recipRanks))
    print("Mean reciprocal rank is:", meanRecipRank)
    print(
        f"Average distance from question to best answer (highest votes): euclid = {statistics.mean(euclid_distances_to_best_answer)}, "
        f"cosine = {statistics.mean(cosine_distances_to_best_answer)}")
    print(
        f"Average distance from question to worst answer (lowest votes):euclid = {statistics.mean(euclid_distances_to_worst_answer)}, "
        f"cosine = {statistics.mean(cosine_distances_to_worst_answer)}")


if __name__ == "__main__":
    # stackQandAPath = input("Please enter path to stackoverflow question and answer data")
    # oldPath = input("Please enter path to legacy data for paired t test calculation.")
    # stackQandAPath = '/Users/ibrahimabdelaziz/Downloads/stackoverflow_data_ranking_sample.json'
    # model =  ''
    # embed_dir = ''
    stackQandAPath = sys.argv[1]
    embed_type = sys.argv[2]
    model_path = sys.argv[3]
    model = get_model(embed_type, model_path)
    beginAnalysis(stackQandAPath, model, embed_type)
Example #9
                        help="data directory where all the files exist",
                        required=True)
    parser.add_argument("--embed_type", help="embed type", required=True)
    parser.add_argument('--model_dir', help="model dir", required=False)
    parser.add_argument('--output_dir', help="output dir", required=False)
    args = parser.parse_args()

    ##############################################################################
    #
    # Load the stored model and evaluate its performance on STS benchmark dataset
    #
    ##############################################################################
    data_dir = args.data_dir

    if args.model_dir:
        model = util.get_model(args.embed_type,
                               local_model_path=args.model_dir)
    else:
        model = util.get_model(args.embed_type)

    test_hierarchy_samples = create_hirerachy_examples('hierarchy_test.json',
                                                       data_dir,
                                                       model,
                                                       validate=None,
                                                       is_test=True)
    test_linked_posts = create_linked_posts(
        'stackoverflow_data_linkedposts__testing.json',
        data_dir,
        model,
        validate=None,
        is_test=True)
    test_class_posts = create_train_class_posts('class_posts_test_data.json',
Example #10
if __name__ == '__main__':
    parser = get_parser()
    pars = parser.parse_args()

    logger = get_logger('Quantization')
    logger.info(pars)

    field_size, train_dict, valid_dict, test_dict = get_dataset(pars)

    if not pars.save_model_path:
        logger.info("no model path given: -save_model_path")
        sys.exit()

    model = get_model(field_size=field_size,
                      cuda=pars.use_cuda and torch.cuda.is_available(),
                      feature_sizes=train_dict['feature_sizes'],
                      pars=pars,
                      logger=logger)
    model = load_model_dic(model, pars.save_model_path, sparse=pars.prune)

    #summary(model, [(train_dict['index'].shape[1], 1), (train_dict['value'].shape[1], )], dtypes=[torch.long, torch.float], device=torch.device("cpu"))

    if pars.use_cuda:
        model.cuda()

    logger.info('Original model:')
    model.print_size_of_model()
    model.run_benchmark(test_dict['index'],
                        test_dict['value'],
                        test_dict['label'],
                        cuda=pars.use_cuda)
Example #11
    def __init__(self, args, port):
        cfg = get_cfg()
        cfg.merge_from_file(args.config)
        self.cfg = cfg
        self.port = port
        assert os.path.exists(
            'saved_models'
        ), "Create a directory to save the trained models (default: ./saved_models)"
        self.model_dir = os.path.join('saved_models', cfg.NAME)
        self.writer = SummaryWriter(
            log_dir=os.path.join(self.model_dir, "summary"))
        self.iteration = 0
        print("Arguments used: {}".format(args), flush=True)

        self.trainset, self.testset = get_datasets(cfg)
        self.model = get_model(cfg)
        print("Using model: {}".format(self.model.__class__), flush=True)

        if torch.cuda.is_available() and torch.cuda.device_count() > 1:
            self.model, self.optimiser = self.init_distributed(cfg)
        # TODO: do not use distributed package in this case
        elif torch.cuda.is_available():
            self.model, self.optimiser = self.init_distributed(cfg)
        else:
            raise RuntimeError("CUDA not available.")

        # self.model, self.optimiser, self.start_epoch, start_iter = \
        #   load_weightsV2(self.model, self.optimiser, args.wts, self.model_dir)
        self.start_epoch = 0  # normally set by the commented-out load_weightsV2 call above
        self.lr_schedulers = get_lr_schedulers(self.optimiser, cfg,
                                               self.start_epoch)
        self.batch_size = self.cfg.TRAINING.BATCH_SIZE

        args.world_size = 1
        print(args)
        self.args = args
        self.epoch = 0
        self.best_loss_train = math.inf
        self.losses = AverageMeterDict()
        self.ious = AverageMeterDict()

        num_samples = None if cfg.DATALOADER.NUM_SAMPLES == -1 else cfg.DATALOADER.NUM_SAMPLES
        if torch.cuda.device_count() > 1:
            # shuffle parameter does not seem to shuffle the data for distributed sampler
            self.train_sampler = torch.utils.data.distributed.DistributedSampler(
                torch.utils.data.RandomSampler(self.trainset,
                                               replacement=True,
                                               num_samples=num_samples),
                shuffle=True)
        else:
            self.train_sampler = torch.utils.data.RandomSampler(self.trainset, replacement=True, num_samples=num_samples) \
              if num_samples is not None else None
        shuffle = True if self.train_sampler is None else False
        self.trainloader = DataLoader(self.trainset,
                                      batch_size=self.batch_size,
                                      num_workers=cfg.DATALOADER.NUM_WORKERS,
                                      shuffle=shuffle,
                                      sampler=self.train_sampler)

        print(
            summary(self.model,
                    tuple((3, cfg.INPUT.TW, 256, 256)),
                    batch_size=1))
Example #12
if __name__ == '__main__':
    parser = get_parser()
    pars = parser.parse_args()

    logger = get_logger('Knowledge Distillation')
    logger.info(pars)

    if not pars.save_model_path:
        logger.error("no model path given: -save_model_path")
        sys.exit()

    field_size, train_dict, valid_dict, test_dict = get_dataset(pars)

    # teacher
    model = get_model(field_size=field_size, cuda=pars.use_cuda and torch.cuda.is_available(), feature_sizes=train_dict['feature_sizes'], pars=pars, logger=logger)
    model = load_model_dic(model, pars.save_model_path)

    # student
    number_of_deep_nodes = 400
    h_depth = 2
    student = get_model(field_size=field_size, cuda=pars.use_cuda and torch.cuda.is_available(), feature_sizes=train_dict['feature_sizes'], deep_nodes=number_of_deep_nodes, h_depth=h_depth,
                        pars=pars, logger=logger)

    logger.info(model)
    logger.info(student)

    if pars.use_cuda and torch.cuda.is_available():
        torch.cuda.empty_cache()
        student = student.cuda()
        model = model.cuda()
Example #13
def build_class_mapping(mapPath):
    """Read a space-separated mapping file into a dict of class name -> mapped class name."""
    classMap = {}
    with open(mapPath, 'r') as inputFile:
        for line in inputFile:
            lineComponents = line.rstrip().split(' ')
            if len(lineComponents) < 2:
                classMap[lineComponents[0]] = lineComponents[0]
            else:
                classMap[lineComponents[0]] = lineComponents[1]
    return classMap
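
A small sketch of the mapping-file format build_class_mapping expects: space-separated "source target" pairs, one per line, with single-token lines mapping a class to itself. The file name and class names below are placeholders.

# class_map.txt (hypothetical contents):
#   java.util.ArrayList java.util.List
#   java.lang.String
class_map = build_class_mapping('class_map.txt')
# -> {'java.util.ArrayList': 'java.util.List', 'java.lang.String': 'java.lang.String'}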


def check(data, v):
    print(v)
    assert v in data


if __name__ == '__main__':
    if len(sys.argv) > 4:
        embedType = sys.argv[4]
        if len(sys.argv) > 5:
            model_dir = sys.argv[5]

    util.get_model(embedType, model_dir)

    docPath = sys.argv[1]
    classPath = sys.argv[2]
    usagePath = sys.argv[3]
    with open(usagePath) as f:
        util.evaluate_regression(f, docPath, embedType)