Example #1
def get_labels(config_path, load_path):
    config = parse_config("config", config_path)

    # Restore the best model
    labeler = SequenceLabeler.load(load_path)

    # Label the data listed in the 'path_label' section of the config
    if config["path_label"] is not None:
        i = 0
        for path_test in config["path_label"].strip().split(":"):
            data_test = read_input_files(path_test)
            results_test, processed_data, incorrect_counter, sent_count = process_sentences_labelling(
                data_test,
                labeler,
                is_training=False,
                learningrate=0.0,
                config=config,
                name="test" + str(i),
                ReturnData=True)
            i += 1
            # Write the labelled sentences out for external evaluation
            with open('/content/drive/My Drive/beamsearch_remove_60',
                      'w') as evaluator_file:
                for sentence in processed_data:
                    evaluator_file.write(sentence + "\n")
            print('Number of incorrect tokens: ', incorrect_counter)
            print('Number of sentences: ', sent_count)
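
A minimal usage sketch (the config and model paths below are hypothetical; note that the function writes its output to the hard-coded Google Drive path seen above):

# Hypothetical paths, for illustration only
get_labels("conf/labeler.conf", "models/best_model")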
Example #2
# parse_config, read_input_files, SequenceLabeler and process_sentences are
# project helpers assumed to be importable from the surrounding codebase.
import gc
import math
import os
import random
import sys

import numpy


def run_experiment(config_path):
    config = parse_config("config", config_path)
    temp_model_path = config_path + ".model"
    if "random_seed" in config:
        random.seed(config["random_seed"])
        numpy.random.seed(config["random_seed"])

    for key, val in config.items():
        print(str(key) + ": " + str(val))

    data_train, data_dev, data_test = None, None, None
    if config["path_train"] != None and len(config["path_train"]) > 0:
        data_train = read_input_files(config["path_train"],
                                      config["max_train_sent_length"])
    if config["path_dev"] != None and len(config["path_dev"]) > 0:
        data_dev = read_input_files(config["path_dev"])
    if config["path_test"] != None and len(config["path_test"]) > 0:
        data_test = []
        for path_test in config["path_test"].strip().split(":"):
            data_test += read_input_files(path_test)

    if config["load"] != None and len(config["load"]) > 0:
        labeler = SequenceLabeler.load(config["load"])
    else:
        labeler = SequenceLabeler(config)
        labeler.build_vocabs(data_train, data_dev, data_test,
                             config["preload_vectors"])
        labeler.construct_network()
        labeler.initialize_session()
        if config["preload_vectors"] != None:
            labeler.preload_word_embeddings(config["preload_vectors"])

    print("parameter_count: " + str(labeler.get_parameter_count()))
    print("parameter_count_without_word_embeddings: " +
          str(labeler.get_parameter_count_without_word_embeddings()))

    if data_train is not None:
        model_selector, model_selector_type = config["model_selector"].split(":")
        best_selector_value = 0.0
        best_epoch = -1
        learningrate = config["learningrate"]
        for epoch in range(config["epochs"]):
            print("EPOCH: " + str(epoch))
            print("current_learningrate: " + str(learningrate))
            random.shuffle(data_train)

            results_train = process_sentences(data_train,
                                              labeler,
                                              is_training=True,
                                              learningrate=learningrate,
                                              config=config,
                                              name="train")

            if data_dev is not None:
                results_dev = process_sentences(data_dev,
                                                labeler,
                                                is_training=False,
                                                learningrate=0.0,
                                                config=config,
                                                name="dev")

                if math.isnan(results_dev["dev_cost_sum"]) or math.isinf(
                        results_dev["dev_cost_sum"]):
                    sys.stderr.write("ERROR: Cost is NaN or Inf. Exiting.\n")
                    break

                if (epoch == 0 or
                    (model_selector_type == "high"
                     and results_dev[model_selector] > best_selector_value) or
                    (model_selector_type == "low"
                     and results_dev[model_selector] < best_selector_value)):
                    best_epoch = epoch
                    best_selector_value = results_dev[model_selector]
                    labeler.saver.save(
                        labeler.session,
                        temp_model_path,
                        latest_filename=os.path.basename(temp_model_path) +
                        ".checkpoint")
                print("best_epoch: " + str(best_epoch))

                if config["stop_if_no_improvement_for_epochs"] > 0 and (
                        epoch - best_epoch
                ) >= config["stop_if_no_improvement_for_epochs"]:
                    break

                if (epoch - best_epoch) > 3:
                    learningrate *= config["learningrate_decay"]

            while config["garbage_collection"] == True and gc.collect() > 0:
                pass

        if data_dev is not None and best_epoch >= 0:
            # loading the best model so far
            labeler.saver.restore(labeler.session, temp_model_path)

            os.remove(temp_model_path + ".checkpoint")
            os.remove(temp_model_path + ".data-00000-of-00001")
            os.remove(temp_model_path + ".index")
            os.remove(temp_model_path + ".meta")

    if config["save"] is not None and len(config["save"]) > 0:
        labeler.save(config["save"])

    if config["path_test"] is not None:
        i = 0
        for path_test in config["path_test"].strip().split(":"):
            data_test = read_input_files(path_test)
            results_test = process_sentences(data_test,
                                             labeler,
                                             is_training=False,
                                             learningrate=0.0,
                                             config=config,
                                             name="test" + str(i))
            i += 1
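
The config["model_selector"] value used above follows a "metric:direction" convention: the metric names a key in the dev results, and the direction says whether higher or lower values are better. A self-contained sketch of that comparison logic (the "dev_f1" metric name is hypothetical; "dev_cost_sum" appears in the code above):

def is_improvement(selector, current_value, best_value):
    # selector has the form "metric:direction", e.g. "dev_cost_sum:low"
    metric, direction = selector.split(":")
    if direction == "high":
        return current_value > best_value
    return current_value < best_value

assert is_improvement("dev_f1:high", 0.82, 0.80)     # higher F1 is better
assert is_improvement("dev_cost_sum:low", 1.5, 2.0)  # lower cost is better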
Example #3
# parse_config, read_input_files, SequenceLabeler and process_sentences are
# project helpers assumed to be importable from the surrounding codebase.
import gc
import math
import os
import random
import sys

import numpy


def run_experiment(config_path):
    config = parse_config("config", config_path)
    temp_model_path = config_path + ".model"
    if "random_seed" in config:
        random.seed(config["random_seed"])
        numpy.random.seed(config["random_seed"])

    for key, val in config.items():
        print(str(key) + ": " + str(val))

    data_train, data_dev, data_test = None, None, None
    if config["path_train"] != None and len(config["path_train"]) > 0:
        if config['alternating_training']:
            # implements dataset-switching, i.e. first trains on the 'main' dataset, then on the augmented dataset in similar sized chunks
            data_train, split_points = read_input_files(
                config["path_train"],
                config["max_train_sent_length"],
                return_splits=True)
            main_train = data_train[:split_points[0]]
            data_train = data_train[split_points[0]:]
            random.shuffle(data_train)  # shuffle all augmented data
            data_train = main_train + data_train
            minibatch_size = split_points[0]
            minibatches = []
            for batch_start_index in range(0, len(data_train), minibatch_size):
                minibatches += [
                    data_train[batch_start_index:batch_start_index +
                               minibatch_size]
                ]
            if len(minibatches[-1]) < 0.5 * minibatch_size:
                minibatches[-2] = minibatches[-2] + minibatches[-1]
                minibatches = minibatches[:
                                          -1]  # merge last minibatch with previous, if too small
        else:
            data_train = read_input_files(config["path_train"],
                                          config["max_train_sent_length"])
            minibatches = [data_train]
        print("minibatch sizes: " +
              ", ".join([str(len(i)) for i in minibatches]))

    if config["path_dev"] != None and len(config["path_dev"]) > 0:
        data_dev = read_input_files(config["path_dev"])
    if config["path_test"] != None and len(config["path_test"]) > 0:
        data_test = []
        for path_test in config["path_test"].strip().split(":"):
            data_test += read_input_files(path_test)

    if config["load"] != None and len(config["load"]) > 0:
        labeler = SequenceLabeler.load(config["load"])
    else:
        labeler = SequenceLabeler(config)
        labeler.build_vocabs(data_train, data_dev, data_test,
                             config["preload_vectors"])
        labeler.construct_network()
        labeler.initialize_session()
        if config["preload_vectors"] != None:
            labeler.preload_word_embeddings(config["preload_vectors"])

    print("parameter_count: " + str(labeler.get_parameter_count()))
    print("parameter_count_without_word_embeddings: " +
          str(labeler.get_parameter_count_without_word_embeddings()))

    if data_train is not None:
        model_selector, model_selector_type = config["model_selector"].split(":")
        no_improvement_for = 0
        best_selector_value = 0.0
        best_epoch = -1
        learningrate = config["learningrate"]
        for epoch in range(config["epochs"]):
            print("EPOCH: " + str(epoch))
            for batchno, minibatch in enumerate(minibatches):
                print("BATCH: " + str(batchno))
                print("current_learningrate: " + str(learningrate))
                random.shuffle(minibatch)
                results_train = process_sentences(minibatch,
                                                  labeler,
                                                  is_training=True,
                                                  learningrate=learningrate,
                                                  config=config,
                                                  name="train")

                if data_dev is not None:
                    results_dev = process_sentences(data_dev,
                                                    labeler,
                                                    is_training=False,
                                                    learningrate=0.0,
                                                    config=config,
                                                    name="dev")
                    no_improvement_for += 1

                    if math.isnan(results_dev["dev_cost_sum"]) or math.isinf(
                            results_dev["dev_cost_sum"]):
                        sys.stderr.write(
                            "ERROR: Cost is NaN or Inf. Exiting.\n")
                        break

                    if ((epoch == 0 and batchno == 0) or
                            (model_selector_type == "high" and
                             results_dev[model_selector] > best_selector_value) or
                            (model_selector_type == "low" and
                             results_dev[model_selector] < best_selector_value)):
                        best_epoch = epoch
                        best_batch = batchno
                        no_improvement_for = 0
                        best_selector_value = results_dev[model_selector]
                        labeler.saver.save(
                            labeler.session,
                            temp_model_path,
                            latest_filename=os.path.basename(temp_model_path) +
                            ".checkpoint")
                    print("best_epoch and best_batch: " + str(best_epoch) +
                          "-" + str(best_batch))
                    print("no improvement for: " + str(no_improvement_for))

                if no_improvement_for > config["learningrate_delay"]:
                    learningrate *= config["learningrate_decay"]

                if (config["stop_if_no_improvement_for_epochs"] > 0 and
                        no_improvement_for >= config["stop_if_no_improvement_for_epochs"]):
                    break

            if (config["stop_if_no_improvement_for_epochs"] > 0 and
                    no_improvement_for >= config["stop_if_no_improvement_for_epochs"]):
                break

            while config["garbage_collection"] == True and gc.collect() > 0:
                pass

        if data_dev is not None and best_epoch >= 0:
            # loading the best model so far
            labeler.saver.restore(labeler.session, temp_model_path)

            os.remove(temp_model_path + ".checkpoint")
            os.remove(temp_model_path + ".data-00000-of-00001")
            os.remove(temp_model_path + ".index")
            os.remove(temp_model_path + ".meta")

    if config["save"] is not None and len(config["save"]) > 0:
        labeler.save(config["save"])

    if config["path_test"] is not None:
        i = 0
        for path_test in config["path_test"].strip().split(":"):
            data_test = read_input_files(path_test)
            results_test = process_sentences(data_test,
                                             labeler,
                                             is_training=False,
                                             learningrate=0.0,
                                             config=config,
                                             name="test" + str(i))
            i += 1
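
The alternating-training branch above splits the shuffled augmented data into minibatches the size of the main dataset and merges an undersized final chunk into its predecessor. A standalone sketch of that chunking step, with toy data (the helper name is hypothetical):

def chunk_with_tail_merge(data, size):
    # Consecutive chunks of `size` items; merge the tail into the previous
    # chunk if it holds fewer than half a chunk's worth of items.
    chunks = [data[start:start + size] for start in range(0, len(data), size)]
    if len(chunks) > 1 and len(chunks[-1]) < 0.5 * size:
        chunks[-2] = chunks[-2] + chunks[-1]
        chunks = chunks[:-1]
    return chunks

print([len(c) for c in chunk_with_tail_merge(list(range(23)), 10)])  # [10, 13]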
Example #4
def load_model(config, data_train, data_dev, data_test):
    if config["load"] != None and len(config["load"]) > 0:
        labeler = SequenceLabeler.load(config["load"])
    else:
        labeler = SequenceLabeler(config)
        labeler.build_vocabs(data_train, data_dev, data_test,
                             config["preload_vectors"])
        labeler.construct_network()
        labeler.initialize_session()
        if config["preload_vectors"] is not None:
            labeler.preload_word_embeddings(config["preload_vectors"])
    print("parameter_count: " + str(labeler.get_parameter_count()))
    print("parameter_count_without_word_embeddings: " +
          str(labeler.get_parameter_count_without_word_embeddings()))
    return labeler
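
A minimal usage sketch, assuming the project helpers from the earlier examples are in scope and a single test file per config entry (the config path is hypothetical):

config = parse_config("config", "conf/labeler.conf")  # hypothetical path
data_train = read_input_files(config["path_train"],
                              config["max_train_sent_length"])
data_dev = read_input_files(config["path_dev"])
data_test = read_input_files(config["path_test"])
labeler = load_model(config, data_train, data_dev, data_test)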