Example #1
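                # Fragment: the scoring branch of a larger main() function
                # (Example #2 below shows the full flow). num_classes,
                # max_seq_length, tag_to_idx_map_file, embed_vector_size,
                # model_file_path and local_data_file_path are defined
                # earlier in the original script.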
                print("Starting the model prediction ...")

                reader = DataReader(num_classes,
                                    max_seq_length=max_seq_length,
                                    tag_to_idx_map_file=tag_to_idx_map_file,
                                    vector_size=embed_vector_size)
                entityExtractor = EntityExtractor(reader,
                                                  embedding_pickle_file)

                # load the model
                print("Loading the model from file {} ...".format(
                    model_file_path))
                entityExtractor.load(model_file_path)
                entityExtractor.print_summary()

                predicted_tags = entityExtractor.predict_2(
                    local_data_file_path)
                if not os.path.exists("C:\dl4nlp\output"):
                    os.makedirs("C:\dl4nlp\output")

                with open('C:\dl4nlp\output\prediction.out', 'w') as f:
                    for ind, line in enumerate(predicted_tags):
                        f.write("{}\t{}\n".format(ind, line))

            else:
                print("undefined mode")

    K.clear_session()
    K.set_session(None)
    print("Done.")
Example #2
def main():
    print("Running on advisor conversation")

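    # Mode flags: each stage below runs only when its flag is set.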
    b_download_embedding_files = False
    b_train = True
    b_evaluate = True
    b_score = False

    # Specify where to store the downloaded files

    from sys import platform
    if platform == "win32":
        home_dir = "C:\\dl4nlp"
    else:
        home_dir = os.path.join(os.path.expanduser('~'), "dl4nlp")

    print("home_dir = {}".format(home_dir))

    # The hyper-parameters of the word embedding trained model
    window_size = 5
    embed_vector_size = 50
    min_count = 400
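    # These values are encoded in the embedding file/folder names below, so
    # they must match the word2vec model that was actually trained.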

    # Define the data files
    data_folder = os.path.join("sample_data", "advisorConversations")
    train_file_path = os.path.join(data_folder,
                                   "advisorConversations_train.txt")
    test_file_path = os.path.join(data_folder, "advisorConversations_test.txt")
    data_file_path = os.path.join(data_folder,
                                  "unlabeled_advisorConversations.txt")
    resources_pickle_file = os.path.join(home_dir, "models", "resources.pkl")
    embedding_pickle_file = os.path.join(
        home_dir, "models",
        "w2vmodel_advisorConversations_vs_{}_ws_{}_mc_{}.pkl".format(
            embed_vector_size, window_size, min_count))
    print("embedding_pickle_file= {}".format(embedding_pickle_file))

    if b_download_embedding_files:
        # Specify the string to look for in blob names from your container
        embedding_folder_name = "word2vec_advisorConversations_model_vs_{}_ws_{}_mc_{}_parquet_files".\
            format(embed_vector_size, window_size, min_count)
        print("embedding_folder_name= {}".format(embedding_folder_name))

        embedding_full_path = os.path.join(home_dir, "models",
                                           embedding_folder_name)
        print("embedding_full_path= {}".format(embedding_full_path))

        # Download the parquet files from Blob storage
        download_embedding_parquet_files_from_storage(embedding_full_path,
                                                      embedding_folder_name,
                                                      num_parquet_files=1000)

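        # Consolidate the downloaded embedding vectors into a single pickle
        # file (embedding_pickle_file) for reuse by training and scoring.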
        save_embeddings_to_pickle_file(embedding_full_path,
                                       embedding_pickle_file,
                                       embed_vector_size)
        print("Done")

    # The hyperparameters of the LSTM trained model
    # network_type = 'unidirectional'
    network_type = 'bidirectional'
    # embed_vector_size = 50
    num_layers = 2
    num_hidden_units = 150
    num_epochs = 10
    batch_size = 50
    dropout = 0.2
    reg_alpha = 0.0

    model_file_path = os.path.join(
        home_dir, 'models',
        'lstm_{}_model_units_{}_lyrs_{}_epchs_{}_vs_{}_ws_{}_mc_{}.h5'.format(
            network_type, num_hidden_units, num_layers, num_epochs,
            embed_vector_size, window_size, min_count))

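    # Reset any existing Keras/TF state, then run training, evaluation and
    # scoring inside a single session and graph.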
    K.clear_session()
    with K.get_session() as sess:
        K.set_session(sess)
        graphr = sess.graph
        with graphr.as_default():

            if b_train:
                print("Training the model... num_epochs = {}, num_layers = {}, num_hidden_units = {}".format(
                    num_epochs, num_layers, num_hidden_units))

                reader = DataReader()
                entityExtractor = EntityExtractor(reader,
                                                  embedding_pickle_file)

                entityExtractor.train(
                    train_file_path,
                    output_resources_pickle_file=resources_pickle_file,
                    network_type=network_type,
                    num_epochs=num_epochs,
                    batch_size=batch_size,
                    dropout=dropout,
                    reg_alpha=reg_alpha,
                    num_hidden_units=num_hidden_units,
                    num_layers=num_layers)

                # Save the model
                entityExtractor.save(model_file_path)

            if b_evaluate:
                # Evaluate the model
                print("Evaluating the model...")

                reader = DataReader(
                    input_resources_pickle_file=resources_pickle_file)
                entityExtractor = EntityExtractor(reader)

                # load the model
                print("Loading the model from file {} ...".format(
                    model_file_path))
                entityExtractor.load(model_file_path)
                entityExtractor.print_summary()

                if not os.path.exists(os.path.join(home_dir, "output")):
                    os.makedirs(os.path.join(home_dir, "output"))

                # make sure that the input test data file is in IOB format
                output_prediction_file = os.path.join(home_dir, "output",
                                                      "prediction_output.tsv")

                evaluation_report, confusion_matrix = entityExtractor.evaluate_model(
                    test_file_path, output_prediction_file)
                print(evaluation_report)
                print(confusion_matrix)

            if b_score:
                print("Starting the model prediction ...")

                reader = DataReader(
                    input_resources_pickle_file=resources_pickle_file)
                entityExtractor = EntityExtractor(reader)

                # load the model
                print("Loading the model from file {} ...".format(
                    model_file_path))
                entityExtractor.load(model_file_path)
                entityExtractor.print_summary()

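                # Tag the unlabeled conversations with the loaded model.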
                predicted_tags = entityExtractor.predict_2(data_file_path)

                if not os.path.exists(os.path.join(home_dir, "output")):
                    os.makedirs(os.path.join(home_dir, "output"))

                output_prediction_file = os.path.join(home_dir, "output",
                                                      "prediction_output.tsv")
                with open(output_prediction_file, 'w') as f:
                    for ind, line in enumerate(predicted_tags):
                        f.write("{}\t{}\n".format(ind, line))

    K.clear_session()
    K.set_session(None)
    print("Done.")