Example no. 1
0
def run_fasttext_setting(setting_file, home_dir,
                         train_data_file, test_data_file,
                         overwrite_params=None):
    """Train and evaluate a fastText model on a train/test holdout split.

    Args:
        setting_file: path to the properties file describing this experiment.
        home_dir: base directory prepended to relative paths from the settings.
        train_data_file: path to the training data file (tab-delimited).
        test_data_file: path to the test data file (tab-delimited).
        overwrite_params: optional dict of settings that override values
            loaded from the setting file.
    """
    properties = exp_util.load_properties(setting_file)

    # this is the folder to save output to
    outfolder = home_dir + exp_util.load_setting("output_folder", properties, overwrite_params)

    print("\n" + str(datetime.datetime.now()))
    print("loading embedding models...")
    # this the Gensim compatible embedding file
    dnn_embedding_file = home_dir + exp_util.load_setting("embedding_file", properties,
                                                          overwrite_params)  # "H:/Python/glove.6B/glove.840B.300d.bin.gensim"
    # the literal setting value 'none' means: train without pre-trained embeddings
    if dnn_embedding_file.endswith('none'):
        dnn_embedding_file = None

    ######## dnn #######
    print("loading dataset...")
    df, train_size, test_size = exp_util.\
        load_and_merge_train_test_csvRakuten(train_data_file, test_data_file, delimiter="\t")
    class_col = int(exp_util.load_setting("class_column", properties, overwrite_params))
    y = df[:, class_col]

    target_classes = len(set(y))
    print("\ttotal classes=" + str(target_classes))
    print('[STARTED] running settings with label=' + exp_util.load_setting("label", properties, overwrite_params))

    print("fitting model...")

    # one entry per text input column: {index: {text_col, text_length, text_dim}}
    input_text_info = {}
    count = 0
    for x in exp_util.load_setting("training_text_data_columns", properties, overwrite_params).split("|"):
        config = x.split(",")
        field_info = {}  # renamed from 'map' to avoid shadowing the builtin
        field_info["text_col"] = config[0]
        field_info["text_length"] = int(config[2])
        field_info["text_dim"] = util.DNN_EMBEDDING_DIM
        input_text_info[count] = field_info
        # BUG FIX: 'count' was never incremented, so every configured column
        # overwrote input_text_info[0] and only the last column was ever used.
        count += 1

    dnn_classifier.fit_fasttext_holdout(df=df,
                                        split_at_row=train_size,
                                        class_col=class_col,
                                        outfolder=outfolder,
                                        task=exp_util.describe_task(properties, overwrite_params, setting_file),
                                        text_norm_option=1,
                                        text_input_info=input_text_info,
                                        embedding_file=dnn_embedding_file)
    print("Completed running on this setting file")
    print(datetime.datetime.now())
Example no. 2
0
def run_fasttext_model(setting_file: str, properties: dict, df: numpy.ndarray,
                       y, train_size: int, class_col: int, outfolder: str,
                       dnn_embedding_file, text_field_mapping: dict,
                       overwrite_params: dict = None):
    """Fit a fastText model on a holdout split of an already-loaded dataset.

    Args:
        setting_file: path to the setting file (used only to describe the task).
        properties: parsed experiment settings.
        df: full dataset matrix (train rows first, then test rows).
        y: label column, used only to report the number of distinct classes.
        train_size: row index at which the train/test split occurs.
        class_col: index of the label column in ``df``.
        outfolder: folder to save output to.
        dnn_embedding_file: embedding file passed through to the classifier.
        text_field_mapping: maps setting-file field names to column indices.
        overwrite_params: optional dict of settings overriding ``properties``.
            BUG FIX: this was referenced in the body but never declared,
            causing a NameError on every call; added as a backward-compatible
            trailing keyword parameter.
    """

    print("\n" + str(datetime.datetime.now()))

    target_classes = len(set(y))
    print("\ttotal classes=" + str(target_classes))
    print('[STARTED] running settings with label=' +
          exp_util.load_setting("label", properties, overwrite_params))

    print("fitting model...")

    # one entry per text input column: {index: {text_col, text_length, text_dim}}
    input_text_info = {}
    count = 0
    for x in exp_util.load_setting("text_fieldnames", properties,
                                   overwrite_params).split("|"):
        config = x.split(",")
        field_info = {}  # renamed from 'map' to avoid shadowing the builtin
        field_info["text_col"] = text_field_mapping[config[0]]
        field_info["text_length"] = int(config[1])
        field_info["text_dim"] = util.DNN_EMBEDDING_DIM
        input_text_info[count] = field_info

        count += 1

    dnn_classifier.fit_fasttext_holdout(df=df,
                                        split_at_row=train_size,
                                        class_col=class_col,
                                        outfolder=outfolder,
                                        task=exp_util.describe_task(
                                            properties, overwrite_params,
                                            setting_file),
                                        text_norm_option=1,
                                        text_input_info=input_text_info,
                                        embedding_file=dnn_embedding_file)
    print("Completed running on this setting file")
    print(datetime.datetime.now())
Example no. 3
0
def run_dnn_setting(setting_file,
                    home_dir,
                    overwrite_params=None,
                    embedding_format=None):
    """Run n-fold cross-validation DNN experiments described by a setting file.

    Args:
        setting_file: path to the properties file describing this experiment.
        home_dir: base directory prepended to relative paths from the settings.
        overwrite_params: optional dict of settings overriding the setting file.
        embedding_format: format hint forwarded to the embedding loader.
    """
    properties = exp_util.load_properties(setting_file)

    word_weights_file = exp_util.load_setting('word_weights_file', properties,
                                              overwrite_params)
    # BUG FIX: compare against None with 'is', not '==' (PEP 8; '==' may
    # invoke arbitrary __eq__ on the other operand).
    if word_weights_file is None:
        word_weights = None
    else:
        print("using word weights to revise embedding vectors...")
        word_weights = load_word_weights(word_weights_file)

    csv_training_text_data = home_dir + exp_util.load_setting(
        'training_text_data', properties, overwrite_params)

    # this is the folder to save output to
    outfolder = home_dir + exp_util.load_setting("output_folder", properties,
                                                 overwrite_params)

    print("\n" + str(datetime.datetime.now()))
    print("loading embedding models...")
    # this the Gensim compatible embedding file
    dnn_embedding_file = home_dir + exp_util.load_setting(
        "embedding_file", properties,
        overwrite_params)  # "H:/Python/glove.6B/glove.840B.300d.bin.gensim"
    emb_model = embedding_util.load_emb_model(embedding_format,
                                              dnn_embedding_file)

    n_fold = int(exp_util.load_setting("n_fold", properties, overwrite_params))

    # in order to test different DNN architectures, I implemented a parser that analyses a string following
    # specific syntax, creates different architectures. This one here takes word embedding, pass it to 3
    # cnn layer then concatenate the output by max pooling finally into a softmax
    #
    # So you can add mulitple descriptors in to a list, and the program will go through each model structure, apply them
    # to the same dataset for experiments
    #
    # the descriptor is passed as a param to 'Classifer', which parses the string to create a model
    # see 'classifier_learn.py - learn_dnn method for details
    model_descriptors = [
        "input=2d bilstm=100-False|dense=?-softmax|emb",
        #"input=2d cnn[2,3,4](conv1d=100)|maxpooling1d=4|flatten|dense=?-softmax|emb",
        "input=2d han_2dinput"
    ]

    ######## dnn #######
    print("loading dataset...")
    df = pd.read_csv(
        csv_training_text_data,
        header=0,
        delimiter=";",
        quoting=0,
        encoding="utf-8",
    )
    df = df.fillna('')
    # BUG FIX: DataFrame.as_matrix() was deprecated in pandas 0.23 and removed
    # in 1.0; .values is the long-supported equivalent.
    df = df.values
    class_col = int(
        exp_util.load_setting("class_column", properties, overwrite_params))
    y = df[:, class_col]

    target_classes = len(set(y))
    print("\ttotal classes=" + str(target_classes))

    print('[STARTED] running settings with label=' +
          exp_util.load_setting("label", properties, overwrite_params))

    for model_descriptor in model_descriptors:
        print("\tML model=" + model_descriptor)

        # drop the leading "input=2d " token; keep only the layer description
        model_descriptor = model_descriptor.split(" ")[1]

        dnn_branches = []
        dnn_branch_input_shapes = []
        # one entry per text input column: {index: {text_col, text_length, text_dim}}
        input_text_info = {}
        count = 0
        for x in exp_util.load_setting("training_text_data_columns",
                                       properties,
                                       overwrite_params).split("|"):
            config = x.split(",")
            field_info = {}  # renamed from 'map' to avoid shadowing the builtin

            field_info["text_col"] = config[0]
            field_info["text_length"] = int(config[2])
            field_info["text_dim"] = util.DNN_EMBEDDING_DIM
            input_text_info[count] = field_info

            # config[1] selects the per-column branch architecture
            if config[1] == 'simple':
                dnn_branch = dnn_classifier.create_dnn_branch(
                    field_info["text_length"],
                    util.DNN_EMBEDDING_DIM,
                    model_descriptor='simple')
            else:
                dnn_branch = dnn_classifier.create_dnn_branch(
                    field_info["text_length"],
                    util.DNN_EMBEDDING_DIM,
                    model_descriptor=model_descriptor)
            dnn_branches.append(dnn_branch[0])
            dnn_branch_input_shapes.append(dnn_branch[1])
            count += 1
        # now create DNN branches based on the required input text column sources

        print("creating merged model (if multiple input branches)")
        final_model = \
            dnn_classifier.merge_dnn_branch(dnn_branches, dnn_branch_input_shapes,
                                            target_classes)
        print("fitting model...")

        dnn_classifier.fit_dnn(df=df,
                               nfold=n_fold,
                               class_col=class_col,
                               final_model=final_model,
                               outfolder=outfolder,
                               task=exp_util.describe_task(
                                   properties, overwrite_params, setting_file),
                               model_descriptor=model_descriptor,
                               text_norm_option=1,
                               text_input_info=input_text_info,
                               embedding_model=emb_model,
                               embedding_model_format=embedding_format,
                               word_weights=word_weights)
    print("Completed running all models on this setting file")
    print(datetime.datetime.now())
Example no. 4
0
def run_fasttext_setting(setting_file, home_dir, overwrite_params=None):
    """Run an n-fold cross-validated fastText experiment from a setting file.

    Args:
        setting_file: path to the properties file describing this experiment.
        home_dir: base directory prepended to relative paths from the settings.
        overwrite_params: optional dict of settings overriding the setting file.
    """
    properties = exp_util.load_properties(setting_file)

    csv_training_text_data = home_dir + exp_util.load_setting(
        'training_text_data', properties, overwrite_params)

    # this is the folder to save output to
    outfolder = home_dir + exp_util.load_setting("output_folder", properties,
                                                 overwrite_params)

    print("\n" + str(datetime.datetime.now()))
    print("loading embedding models...")
    # this the Gensim compatible embedding file
    dnn_embedding_file = home_dir + exp_util.load_setting(
        "embedding_file", properties,
        overwrite_params)  # "H:/Python/glove.6B/glove.840B.300d.bin.gensim"
    # the literal setting value 'none' means: train without pre-trained embeddings
    if dnn_embedding_file.endswith('none'):
        dnn_embedding_file = None

    n_fold = int(exp_util.load_setting("n_fold", properties, overwrite_params))

    ######## dnn #######
    print("loading dataset...")
    df = pd.read_csv(
        csv_training_text_data,
        header=0,
        delimiter=";",
        quoting=0,
        encoding="utf-8",
    )
    df = df.fillna('')
    # BUG FIX: DataFrame.as_matrix() was deprecated in pandas 0.23 and removed
    # in 1.0; .values is the long-supported equivalent.
    df = df.values
    class_col = int(
        exp_util.load_setting("class_column", properties, overwrite_params))
    y = df[:, class_col]

    target_classes = len(set(y))
    print("\ttotal classes=" + str(target_classes))

    print('[STARTED] running settings with label=' +
          exp_util.load_setting("label", properties, overwrite_params))

    print("fitting model...")

    # one entry per text input column: {index: {text_col, text_length, text_dim}}
    input_text_info = {}
    count = 0
    for x in exp_util.load_setting("training_text_data_columns", properties,
                                   overwrite_params).split("|"):
        config = x.split(",")
        field_info = {}  # renamed from 'map' to avoid shadowing the builtin
        field_info["text_col"] = config[0]
        field_info["text_length"] = int(config[2])
        field_info["text_dim"] = util.DNN_EMBEDDING_DIM
        input_text_info[count] = field_info
        # BUG FIX: 'count' was never incremented, so every configured column
        # overwrote input_text_info[0] and only the last column was ever used.
        count += 1

    dnn_classifier.fit_fasttext(df=df,
                                nfold=n_fold,
                                class_col=class_col,
                                outfolder=outfolder,
                                task=exp_util.describe_task(
                                    properties, overwrite_params,
                                    setting_file),
                                text_norm_option=1,
                                text_input_info=input_text_info,
                                embedding_file=dnn_embedding_file)
    print("Completed running on this setting file")
    print(datetime.datetime.now())
Example no. 5
0
def run_single_setting(setting_file,
                       home_dir,
                       remove_rare_classes,
                       remove_no_desc_instances,
                       overwrite_params=None,
                       gensimFormat=None):
    """Run n-fold DNN experiments with pre-trained embeddings loaded via gensim.

    Args:
        setting_file: path to the properties file describing this experiment.
        home_dir: base directory prepended to relative paths from the settings.
        remove_rare_classes: if truthy, drop classes with fewer than n_fold
            instances (so stratified folding remains possible).
        remove_no_desc_instances: if truthy, drop rows with an empty
            description field (column 5 — see remove_empty_desc_instances).
        overwrite_params: optional dict of settings overriding the setting file.
        gensimFormat: force gensim-native (True) or word2vec-binary (False)
            embedding loading; autodetected from the file name when None.
    """
    properties = exp_util.load_properties(setting_file)

    csv_training_text_data = home_dir + exp_util.load_setting(
        'training_text_data', properties, overwrite_params)
    # this is the folder containing other numeric features that are already pre-extracted
    csv_training_other_feaures = home_dir + exp_util.load_setting(
        'training_other_features', properties, overwrite_params)

    # this is the folder to save output to
    outfolder = home_dir + exp_util.load_setting("output_folder", properties,
                                                 overwrite_params)

    print("\n" + str(datetime.datetime.now()))
    print("loading embedding models...")
    # this the Gensim compatible embedding file
    dnn_embedding_file = home_dir + exp_util.load_setting(
        "embedding_file", properties,
        overwrite_params)  # "H:/Python/glove.6B/glove.840B.300d.bin.gensim"
    if gensimFormat is None:
        gensimFormat = ".gensim" in dnn_embedding_file
    if gensimFormat:
        pretrained_embedding_models = gensim.models.KeyedVectors.load(
            dnn_embedding_file, mmap='r')
    else:
        pretrained_embedding_models = gensim.models.KeyedVectors. \
            load_word2vec_format(dnn_embedding_file, binary=True)

    n_fold = int(exp_util.load_setting("n_fold", properties, overwrite_params))

    # in order to test different DNN architectures, I implemented a parser that analyses a string following
    # specific syntax, creates different architectures. This one here takes word embedding, pass it to 3
    # cnn layer then concatenate the output by max pooling finally into a softmax
    #
    # So you can add mulitple descriptors in to a list, and the program will go through each model structure, apply them
    # to the same dataset for experiments
    #
    # the descriptor is passed as a param to 'Classifer', which parses the string to create a model
    # see 'classifier_learn.py - learn_dnn method for details
    model_descriptors = [
        #"input=2d bilstm=100-False|dense=?-softmax|emb",
        "input=2d cnn[2,3,4](conv1d=100)|maxpooling1d=4|flatten|dense=?-softmax|emb"
    ]

    ######## dnn #######
    print("loading dataset...")
    # BUG FIX: DataFrame.as_matrix() was deprecated in pandas 0.23 and removed
    # in 1.0; .values is the long-supported equivalent.
    df = pd.read_csv(
        csv_training_text_data,
        header=0,
        delimiter=";",
        quoting=0,
        encoding="utf-8",
    ).values
    # BUG FIX: astype(str) returns a new array; the original discarded the
    # result, leaving the matrix unconverted.
    df = df.astype(str)
    if remove_no_desc_instances:
        print(
            "you have chosen to remove instances whose description are empty")
        df = exp_util.remove_empty_desc_instances(df, 5)

    y = df[:,
           int(
               exp_util.
               load_setting("class_column", properties, overwrite_params))]

    target_classes = len(set(y))
    print("\ttotal classes=" + str(target_classes))
    remove_instance_indexes = []
    if remove_rare_classes:
        print(
            "you have chosen to remove classes whose instances are less than n_fold"
        )
        # count instances per label, then collect the indexes of rows whose
        # label occurs fewer than n_fold times
        instance_labels = list(y)
        class_dist = {x: instance_labels.count(x) for x in instance_labels}
        remove_labels = []
        for k, v in class_dist.items():
            if v < n_fold:
                remove_labels.append(k)
        remove_instance_indexes = []
        for i in range(len(y)):
            label = y[i]
            if label in remove_labels:
                remove_instance_indexes.append(i)
        y = numpy.delete(y, remove_instance_indexes)
        target_classes = len(set(y))

    print('[STARTED] running settings with label=' +
          exp_util.load_setting("label", properties, overwrite_params))

    for model_descriptor in model_descriptors:
        print("\tML model=" + model_descriptor)

        input_shape = model_descriptor.split(" ")[0]
        model_descriptor = model_descriptor.split(" ")[1]

        if input_shape.endswith("2d"):
            input_as_2D = True
        else:
            input_as_2D = False

        # HAN/LSTM branches need zero-padding masked out of the sequence
        if "han" in model_descriptor or "lstm" in model_descriptor:
            dnn_embedding_mask_zero = True
        else:
            dnn_embedding_mask_zero = False

        input_column_sources = \
            [x for x in exp_util.load_setting("training_text_data_columns", properties, overwrite_params).split("|")]
        # now create DNN branches based on the required input text column sources

        dnn_branches = []
        dnn_branch_input_shapes = []
        dnn_branch_input_features = []
        for string in input_column_sources:
            print("\tcreating model branch=" + string)
            config = string.split(",")
            col_index = config[0]

            # NOTE(review): column 13 is special-cased as trainable —
            # presumably a column whose vocabulary is absent from the
            # pre-trained embeddings; confirm against the data schema.
            embedding_trainable = False
            if col_index == '13':
                embedding_trainable = True

            text_data = cc.create_text_input_data(config[0], df)

            col_text_length = int(config[2])

            # keep the text rows aligned with y after rare-class removal
            text_data = numpy.delete(text_data, remove_instance_indexes)
            data = ["" if type(x) is float else str(x) for x in text_data]

            dnn_branch = dnn_classifier.create_dnn_branch_textinput(
                pretrained_embedding_models,
                input_text_data=data,
                input_text_sentence_length=col_text_length,
                input_text_word_embedding_dim=util.DNN_EMBEDDING_DIM,
                model_descriptor=model_descriptor,
                embedding_trainable=embedding_trainable,
                embedding_mask_zero=dnn_embedding_mask_zero)

            dnn_branches.append(dnn_branch[0])
            dnn_branch_input_shapes.append(dnn_branch[1])
            dnn_branch_input_features.append(dnn_branch[2])

        print("creating merged model (if multiple input branches)")
        final_model = \
            dnn_classifier.merge_dnn_branch(dnn_branches, dnn_branch_input_shapes,
                                            target_classes)
        print("fitting model...")
        dnn_classifier.fit_dnn(inputs=dnn_branch_input_features,
                               nfold=n_fold,
                               y_train=y,
                               final_model=final_model,
                               outfolder=outfolder,
                               task=exp_util.describe_task(
                                   properties, overwrite_params, setting_file),
                               model_descriptor=model_descriptor)
    # BUG FIX: these completion messages were inside the model loop, printing
    # once per descriptor; the sibling run_dnn_setting prints them once at the
    # end, which is clearly the intent.
    print("Completed running all models on this setting file")
    print(datetime.datetime.now())
Example no. 6
0
def run_dnn_models(properties: dict, df: numpy.ndarray, y, train_size: int,
                   class_col: int, out_folder: str, embeddingmodel,
                   embeddingformat, word_weights: list,
                   text_field_mapping: dict, setting_file: str = None,
                   overwrite_params: dict = None):
    """Run holdout DNN experiments over every configured model descriptor.

    Args:
        properties: parsed experiment settings.
        df: full dataset matrix (train rows first, then test rows).
        y: label column, used only to report the number of distinct classes.
        train_size: row index at which the train/test split occurs.
        class_col: index of the label column in ``df``.
        out_folder: folder to save output to.
        embeddingmodel: pre-loaded embedding model for the classifier.
        embeddingformat: format of the embedding model.
        word_weights: word weights used to revise embedding vectors.
        text_field_mapping: maps setting-file field names to column indices.
        setting_file: path used to describe the task. BUG FIX: referenced in
            the body but never declared; added as a backward-compatible
            trailing keyword parameter.
        overwrite_params: optional dict of settings overriding ``properties``.
            BUG FIX: same undeclared-name problem, fixed the same way.
    """
    # in order to test different DNN architectures, I implemented a parser that analyses a string following
    # specific syntax, creates different architectures. This one here takes word embedding, pass it to 3
    # cnn layer then concatenate the output by max pooling finally into a softmax
    #
    # So you can add mulitple descriptors in to a list, and the program will go through each model structure, apply them
    # to the same dataset for experiments
    #
    # the descriptor is passed as a param to 'Classifer', which parses the string to create a model
    # see 'classifier_learn.py - learn_dnn method for details
    model_descriptors = [
        "input=2d bilstm=100-False|dense=?-softmax|emb",
        #"input=2d cnn[2,3,4](conv1d=100)|maxpooling1d=4|flatten|dense=?-softmax|emb",
        "input=2d han_2dinput"
    ]

    ######## dnn #######

    target_classes = len(set(y))
    print("\ttotal classes=" + str(target_classes))

    print('[STARTED] running settings with label=' +
          exp_util.load_setting("label", properties, overwrite_params))

    for model_descriptor in model_descriptors:
        print("\tML model=" + model_descriptor)

        # drop the leading "input=2d " token; keep only the layer description
        model_descriptor = model_descriptor.split(" ")[1]

        dnn_branches = []
        dnn_branch_input_shapes = []
        # one entry per text input column: {index: {text_col, text_length, text_dim}}
        input_text_info = {}
        count = 0
        for x in exp_util.load_setting("text_fieldnames", properties,
                                       overwrite_params).split("|"):
            config = x.split(",")
            field_info = {}  # renamed from 'map' to avoid shadowing the builtin

            field_info["text_col"] = text_field_mapping[config[0]]
            field_info["text_length"] = int(config[1])
            field_info["text_dim"] = util.DNN_EMBEDDING_DIM
            input_text_info[count] = field_info

            dnn_branch = dnn_classifier.create_dnn_branch(
                field_info["text_length"],
                util.DNN_EMBEDDING_DIM,
                model_descriptor=model_descriptor)
            dnn_branches.append(dnn_branch[0])
            dnn_branch_input_shapes.append(dnn_branch[1])
            count += 1
        # now create DNN branches based on the required input text column sources

        print("creating merged model (if multiple input branches)")
        final_model = \
            dnn_classifier.merge_dnn_branch(dnn_branches, dnn_branch_input_shapes,
                                            target_classes)
        print("fitting model...")

        dnn_classifier.fit_dnn_holdout(df=df,
                                       split_at_row=train_size,
                                       class_col=class_col,
                                       final_model=final_model,
                                       outfolder=out_folder,
                                       task=exp_util.describe_task(
                                           properties, overwrite_params,
                                           setting_file),
                                       model_descriptor=model_descriptor,
                                       text_norm_option=1,
                                       text_input_info=input_text_info,
                                       embedding_model=embeddingmodel,
                                       embedding_model_format=embeddingformat,
                                       word_weights=word_weights)
    print("Completed running all models on this setting file")
    print(datetime.datetime.now())