コード例 #1
0
def train_with_extended_embeddingvocab(csv_basic_feature, csv_other_feature,
                                       dnn_embedding_file, n_fold,
                                       model_descriptor,
                                       extra_data_for_embeddingvocab,
                                       extra_data_for_embeddingvocab_text_col,
                                       outfolder):
    """Train the DNN text+autodict classifier, optionally extending the
    embedding vocabulary with extra text data.

    Args:
        csv_basic_feature: CSV holding the basic features (incl. profile text).
        csv_other_feature: CSV holding the additional (autodict) features.
        dnn_embedding_file: Gensim-compatible embedding file for the DNN.
        n_fold: number of cross-validation folds.
        model_descriptor: DNN architecture descriptor string.
        extra_data_for_embeddingvocab: optional source of extra text used to
            enlarge the embedding vocabulary; pass None to skip.
        extra_data_for_embeddingvocab_text_col: text column within that source.
        outfolder: folder where classifier output is written.
    """
    print(datetime.datetime.now())

    # BUG FIX: the original gated this on len(sys.argv) > 4, silently coupling
    # the function to the command line; gate on the actual parameter instead.
    tweets_exta = None
    if extra_data_for_embeddingvocab is not None:
        tweets_exta = generate_extra_data_for_embeddingvocab(
            extra_data_for_embeddingvocab,
            extra_data_for_embeddingvocab_text_col)

    X, y = fc.create_text_and_autodict(csv_basic_feature, csv_other_feature)
    # .values replaces the deprecated DataFrame.as_matrix() (removed in
    # pandas 1.0) and yields the same ndarray.
    df = pd.read_csv(csv_basic_feature, header=0, delimiter=",",
                     quoting=0).values
    # NOTE: the original called df.astype(str) and discarded the result (a
    # no-op); it is removed. NaN cells therefore still surface as floats and
    # are blanked out below.
    # column 22 presumably holds the user profile text -- TODO confirm.
    profiles = df[:, 22]
    profiles = ["" if isinstance(x, float) else x for x in profiles]
    cls = cm.Classifer("stakeholdercls",
                       "_dnn_text+autodictext_",
                       X,
                       y,
                       outfolder,
                       categorical_targets=6,
                       algorithms=["dnn"],
                       nfold=n_fold,
                       text_data=profiles,
                       dnn_embedding_file=dnn_embedding_file,
                       dnn_descriptor=model_descriptor,
                       dnn_text_data_extra_for_embedding_vcab=tweets_exta)
    cls.run()
コード例 #2
0
def run_cml_setting(setting_file, home_dir,
                    train_data_file, test_data_file,
                    overwrite_params=None,
                    embedding_format=None):
    """Run classic ML (linear SVM) over aggregated embedding features, using
    the train/test holdout split defined by the two input files.

    Args:
        setting_file: properties file describing data, columns and output.
        home_dir: prefix prepended to relative paths from the settings.
        train_data_file: CSV of training rows (tab separated).
        test_data_file: CSV of test rows (tab separated).
        overwrite_params: optional dict overriding individual settings.
        embedding_format: format tag understood by embedding_util/tfe.
    """
    properties = exp_util.load_properties(setting_file)

    # NOTE(review): loaded but never used below -- confirm whether this
    # setting can be dropped.
    csv_training_text_data = home_dir + exp_util.load_setting('training_text_data', properties, overwrite_params)

    # this is the folder to save output to
    outfolder = home_dir + exp_util.load_setting("output_folder", properties, overwrite_params)

    print("\n" + str(datetime.datetime.now()))
    print("loading embedding models...")
    # this the Gensim compatible embedding file
    dnn_embedding_file = home_dir + exp_util.load_setting("embedding_file", properties,
                                                          overwrite_params)  # "H:/Python/glove.6B/glove.840B.300d.bin.gensim"
    emb_model = embedding_util.load_emb_model(embedding_format, dnn_embedding_file)

    print("loading dataset...")
    # df contains the training rows first, then the test rows
    df, train_size, test_size = exp_util.\
        load_and_merge_train_test_csvRakuten(train_data_file, test_data_file, delimiter="\t")
    class_col = int(exp_util.load_setting("class_column", properties, overwrite_params))
    y = df[:, class_col]

    target_classes = len(set(y))
    print("\ttotal classes=" + str(target_classes))
    print('[STARTED] running settings with label=' + exp_util.load_setting("label", properties, overwrite_params))

    print("fitting model...")

    # Parse the per-column text configuration ('|' separated entries of
    # comma separated values: column, <unused>, length).
    input_text_info = {}
    for count, x in enumerate(exp_util.load_setting("training_text_data_columns", properties, overwrite_params).split("|")):
        config = x.split(",")
        # 'field' instead of the original 'map', which shadowed the builtin.
        field = {}
        field["text_col"] = config[0]
        field["text_length"] = int(config[2])
        field["text_dim"] = util.DNN_EMBEDDING_DIM
        input_text_info[count] = field

    print("creating feature matrix")
    X_all = []
    for k, v in input_text_info.items():
        X_sub = tfe.get_aggr_embedding_vectors(df=df,
                                               text_col=v["text_col"],
                                               text_norm_option=1,
                                               aggr_option=1,
                                               emb_format=embedding_format,
                                               emb_model=emb_model,
                                               emb_dim=int(v["text_dim"]))
        X_all.append(X_sub)
    # one aggregated-embedding block per text column, concatenated column-wise
    X_all = numpy.concatenate(X_all, axis=1)

    setting_file = setting_file[setting_file.rfind("/") + 1:]
    models = ["svm_l"]
    for model_name in models:
        print("\tML model=" + model_name)
        print("fitting model...")

        cls = cml.Classifer(setting_file, model_name, X_all[0:train_size, :], y[0:train_size], outfolder,
                            categorical_targets=target_classes,
                            nfold=None, algorithms=[model_name])
        # BUG FIX: the result was indexed with the hard-coded key "svm_l";
        # use the loop variable so extending the models list keeps working.
        trained_model = cls.run()[model_name]
        cls.eval_holdout(trained_model, model_name, X_all[train_size:, :], y[train_size:])

    print("Completed running all models on this setting file")
    print(datetime.datetime.now())
コード例 #3
0
ファイル: exp_wop_cml.py プロジェクト: ziqizhang/wop
            # -- fragment: the enclosing loop/function header is outside this view --
            config = string.split(",")
            # config[1] presumably names the field; config[0] selects the
            # text input column -- TODO confirm against the full file.
            col_name = config[1]
            text_data = cc.create_text_input_data(config[0], df)
            # drop instances flagged elsewhere for removal (keeps X aligned with y)
            text_data = numpy.delete(text_data, remove_instance_indexes)
            # NaN cells surface as floats; blank them so tfidf sees strings only
            data = ["" if type(x) is float else x for x in text_data]
            X_ngram, vocab = tfe.get_ngram_tfidf(data)
            features_from_separate_fields.append(X_ngram)
        # one tfidf block per text field, concatenated column-wise
        X_all = numpy.concatenate(features_from_separate_fields, axis=1)

        print("\tfeature extraction completed.")
        print(datetime.datetime.now())
        print("\nRunning nb")
        # run the "nb" algorithm over the combined n-gram features
        cls = cm.Classifer(properties['label'],
                           "nb",
                           X_all,
                           y,
                           outfolder,
                           categorical_targets=target_classes,
                           nfold=n_fold,
                           algorithms=["nb"])
        cls.run()

        print(datetime.datetime.now())
        print("\nRunning knn")
        # run the "knn" algorithm on the same features (the matching
        # cls.run() call lies past the end of this view)
        cls = cm.Classifer(properties['label'],
                           "knn",
                           X_all,
                           y,
                           outfolder,
                           categorical_targets=target_classes,
                           nfold=n_fold,
                           algorithms=["knn"])
コード例 #4
0
ファイル: exp_svm_runner.py プロジェクト: ziqizhang/wop
        # -- fragment: the enclosing loop (defining k etc.) is outside this view --
        # cls = cm.Classifer(k+"stakeholdercls", "_profiletext_", X, y, outfolder,
        #                     categorical_targets=6,nfold=n_fold,algorithms=["svm_l"])
        # cls.run()
        #
        #
        # #setting 2: use profile text and other features
        # print(datetime.datetime.now())
        # X, y = fc.create_features_text_and_other(csv_basic_feature, csv_other_feature)
        # cls = cm.Classifer(k+"stakeholdercls", "_profiletext+other_", X, y, outfolder,
        #                    categorical_targets=6,nfold=n_fold,algorithms=["svm_l"])
        # cls.run()


        ####### svm, pca #######
        # setting 3: use profile text feature only, but using pca
        print(datetime.datetime.now())
        X, y = fc.create_features_text(csv_basic_feature)
        # "pca-svm_l": the PCA-preprocessed linear-SVM algorithm variant
        cls = cm.Classifer(k+"stakeholdercls", "_profiletext_", X, y, outfolder,
                           categorical_targets=6, nfold=n_fold, algorithms=["pca-svm_l"])
        cls.run()

        # setting 4: use profile text and other features, but also using pca
        print(datetime.datetime.now())
        X, y = fc.create_features_text_and_other(csv_basic_feature, csv_other_feature)
        cls = cm.Classifer(k+"stakeholdercls", "_profiletext+other_", X, y, outfolder,
                           categorical_targets=6, nfold=n_fold, algorithms=["pca-svm_l"])
        cls.run()



コード例 #5
0
ファイル: exp_uddin.py プロジェクト: ziqizhang/msm4phi
    # Convert feature vectors to float32 (the original comment said float64,
    # but the code casts to numpy.float32)
    X = X.astype(numpy.float32)

    return X, y


if __name__ == "__main__":
    # argv[1]: folder holding the basic (numeric) feature CSVs,
    #          e.g. msm4phi/paper2/data/training_data/basic_features.csv
    csv_feature_folder = sys.argv[1]

    # argv[2]: folder that receives the classifier output
    outfolder = sys.argv[2]
    n_fold = 10

    print(datetime.datetime.now())
    X, y = create_features(csv_feature_folder)

    # behaviour-only features, run with the "svm_l" algorithm
    print(">>>>> _uddin2018_ >>>>>")
    print(datetime.datetime.now())
    classifier = cm.Classifer("stakeholdercls", "_uddin18_", X, y, outfolder,
                              categorical_targets=6, nfold=n_fold,
                              algorithms=["svm_l"])
    classifier.run()
コード例 #6
0
ファイル: exp_wop_scalable.py プロジェクト: ziqizhang/wop
def run_cml_setting(setting_file,
                    home_dir,
                    overwrite_params=None,
                    embedding_format=None):
    """Run classic ML (linear SVM) with n-fold cross validation over
    aggregated embedding features built from the configured text columns.

    Args:
        setting_file: properties file describing data, columns and output.
        home_dir: prefix prepended to relative paths from the settings.
        overwrite_params: optional dict overriding individual settings.
        embedding_format: format tag understood by embedding_util/tfe.
    """
    properties = exp_util.load_properties(setting_file)

    word_weights_file = exp_util.load_setting('word_weights_file', properties,
                                              overwrite_params)
    # Identity comparison with None (the original used '== None').
    # NOTE(review): word_weights is never used below -- confirm intent.
    if word_weights_file is None:
        word_weights = None
    else:
        print("using word weights to revise embedding vectors")
        word_weights = load_word_weights(word_weights_file)

    csv_training_text_data = home_dir + exp_util.load_setting(
        'training_text_data', properties, overwrite_params)

    # this is the folder to save output to
    outfolder = home_dir + exp_util.load_setting("output_folder", properties,
                                                 overwrite_params)

    print("\n" + str(datetime.datetime.now()))
    print("loading embedding models...")
    # this the Gensim compatible embedding file
    dnn_embedding_file = home_dir + exp_util.load_setting(
        "embedding_file", properties,
        overwrite_params)  # "H:/Python/glove.6B/glove.840B.300d.bin.gensim"
    emb_model = embedding_util.load_emb_model(embedding_format,
                                              dnn_embedding_file)

    n_fold = int(exp_util.load_setting("n_fold", properties, overwrite_params))

    ######## dnn #######
    print("loading dataset...")
    df = pd.read_csv(
        csv_training_text_data,
        header=0,
        delimiter=";",
        quoting=0,
        encoding="utf-8",
    )
    df = df.fillna('')
    # .values replaces the deprecated DataFrame.as_matrix() (removed in
    # pandas 1.0) and yields the same ndarray.
    df = df.values
    class_col = int(
        exp_util.load_setting("class_column", properties, overwrite_params))
    y = df[:, class_col]

    target_classes = len(set(y))
    print("\ttotal classes=" + str(target_classes))

    print('[STARTED] running settings with label=' +
          exp_util.load_setting("label", properties, overwrite_params))

    # Parse the per-column text configuration ('|' separated entries of
    # comma separated values: column, <unused>, length).
    input_text_info = {}
    for count, x in enumerate(
            exp_util.load_setting("training_text_data_columns", properties,
                                  overwrite_params).split("|")):
        config = x.split(",")
        # 'field' instead of the original 'map', which shadowed the builtin.
        field = {}
        field["text_col"] = config[0]
        field["text_length"] = int(config[2])
        field["text_dim"] = util.DNN_EMBEDDING_DIM
        input_text_info[count] = field

    print("creating feature matrix")
    X_all = []
    for k, v in input_text_info.items():
        X_sub = tfe.get_aggr_embedding_vectors(df=df,
                                               text_col=v["text_col"],
                                               text_norm_option=1,
                                               aggr_option=0,
                                               emb_format=embedding_format,
                                               emb_model=emb_model,
                                               emb_dim=int(v["text_dim"]))
        X_all.append(X_sub)
    # one aggregated-embedding block per text column, concatenated column-wise
    X_all = numpy.concatenate(X_all, axis=1)

    setting_file = setting_file[setting_file.rfind("/") + 1:]
    models = ["svm_l"]
    for model_name in models:
        print("\tML model=" + model_name)
        print("fitting model...")

        cls = cml.Classifer(setting_file,
                            model_name,
                            X_all,
                            y,
                            outfolder,
                            categorical_targets=target_classes,
                            nfold=n_fold,
                            algorithms=[model_name])
        cls.run()

    print("Completed running all models on this setting file")
    print(datetime.datetime.now())
コード例 #7
0
                            # (fragment: the pd.read_csv(...) call opens
                            # before the start of this view)
                            header=0,
                            delimiter=",",
                            quoting=0)
        # NOTE(review): as_matrix() was removed in pandas 1.0 -- .values is
        # the drop-in replacement.
        df = dfraw.as_matrix()
        # result of astype is discarded here (no-op on the original array)
        df.astype(str)
        # column 16 presumably holds the profile text -- TODO confirm
        profiles = df[:, 16]
        # NaN cells surface as floats; replace them with empty strings
        profiles = ["" if type(x) is float else x for x in profiles]

        outfolder = sys.argv[4]
        # nfold=None: no cross validation -- a pre-trained model is applied
        cls = cm.Classifer(
            "stakeholdercls",
            "_dnn_text+autodictext_",
            X,
            y,
            outfolder,
            categorical_targets=6,
            algorithms=["dnn"],
            nfold=None,
            text_data=profiles,
            dnn_embedding_file=None,
            dnn_descriptor=
            "cnn[2,3,4](conv1d=100)|maxpooling1d=4|flatten|dense=6-softmax|glv"
        )
        labels = cls.predict(pretrained_model_file)

        # write the input CSV back out with a predicted "label" column added
        headers = list(dfraw.columns.values)
        headers.append("label")
        with open(outfolder + "/" + csv_basic_feature, 'w',
                  newline='') as csvfile:
            # (fragment: the csv.writer(...) call continues past this view)
            csvwriter = csv.writer(csvfile,
                                   delimiter=',',
                                   quotechar='"',
コード例 #8
0
    # wrap the assembled feature rows in an ndarray before returning
    X = numpy.array(X)

    return X, y


if __name__ == "__main__":
    # argv[1]: folder holding the basic (numeric) feature CSVs,
    #          e.g. msm4phi/paper2/data/training_data/basic_features.csv
    csv_feature_folder = sys.argv[1]

    # argv[2]: training data CSV; argv[3]: folder receiving the output
    training_data_csv = sys.argv[2]
    outfolder = sys.argv[3]
    n_fold = 10

    print(datetime.datetime.now())
    X, y = create_features(csv_feature_folder, training_data_csv)

    # behaviour-only features, run with the "gbrt" algorithm
    print(">>>>> _penn11_ >>>>>")
    print(datetime.datetime.now())
    classifier = cm.Classifer("stakeholdercls", "_penn11_", X, y, outfolder,
                              categorical_targets=6, nfold=n_fold,
                              algorithms=["gbrt"])
    classifier.run()
コード例 #9
0
        # -- fragment: the enclosing loop (yielding v) starts outside this view --
        csv_training_text_data = v[0]
        csv_training_other_feaures = v[1]
        print(csv_training_text_data)

        for model_descriptor in model_descriptors:
            print("\t"+model_descriptor)

            #SETTING0 dnn applied to profile only
            X, y = fc.create_features_text(csv_training_text_data)
            # NOTE(review): as_matrix() was removed in pandas 1.0 -- .values
            # is the drop-in replacement.
            df = pd.read_csv(csv_training_text_data, header=0, delimiter=",", quoting=0).as_matrix()
            # result of astype is discarded here (no-op on the original array)
            df.astype(str)
            # column 22 presumably holds the profile text -- TODO confirm
            profiles = df[:, 22]
            # NaN cells surface as floats; replace them with empty strings
            profiles = ["" if type(x) is float else x for x in profiles]
            cls = cm.Classifer("stakeholdercls", "_dnn_text_", X, y, outfolder,
                               categorical_targets=6, algorithms=["dnn"], nfold=n_fold,
                               text_data=profiles, dnn_embedding_file=dnn_embedding_file,
                               dnn_descriptor=model_descriptor)
            cls.run()

            print(datetime.datetime.now())
            #X would be the 'metafeature' to pass to the dnn model. Note it MUST NOT contain text and should be
            #ready-to-use features
            X, y = fc.create_features_gazetteer(csv_training_text_data, csv_training_other_feaures)
            df = pd.read_csv(csv_training_text_data, header=0, delimiter=",", quoting=0).as_matrix()
            df.astype(str)
            profiles = df[:, 22]
            profiles = ["" if type(x) is float else x for x in profiles]
            # (fragment: this call continues past the end of this view)
            cls = cm.Classifer("stakeholdercls", "_dnn_text+other_", X, y, outfolder,
                               categorical_targets=6, algorithms=["dnn"],nfold=n_fold,
                               text_data=profiles, dnn_embedding_file=dnn_embedding_file,
                               dnn_descriptor=model_descriptor,
コード例 #10
0
    # Convert feature vectors to float32 (the original comment said float64,
    # but the code casts to numpy.float32)
    X = X.astype(numpy.float32)

    return X, y


if __name__ == "__main__":
    # argv[1]: folder holding the basic (numeric) feature CSVs,
    #          e.g. msm4phi/paper2/data/training_data/basic_features.csv
    csv_feature_folder = sys.argv[1]

    # argv[2]: folder that receives the classifier output
    outfolder = sys.argv[2]
    n_fold = 10

    print(datetime.datetime.now())
    X, y = create_features(csv_feature_folder)

    # behaviour-only features, run with the "gpc" algorithm
    print(">>>>> _preotiuc15_ >>>>>")
    print(datetime.datetime.now())
    classifier = cm.Classifer("stakeholdercls", "_preotiuc15_", X, y, outfolder,
                              categorical_targets=6, nfold=n_fold,
                              algorithms=["gpc"])
    classifier.run()
コード例 #11
0
def run_cml_models(setting_file: str, properties: dict, df: numpy.ndarray, y,
                   train_size: int, class_col: int, out_folder: str,
                   embeddingmodel_file: str, embeddingmodel, embeddingformat,
                   text_field_mapping: dict):
    """Train and holdout-evaluate classic ML models (linear SVM) on
    aggregated embedding features.

    The first ``train_size`` rows of ``df``/``y`` are used for training and
    the remainder for holdout evaluation.

    Args:
        setting_file: path to the setting file (basename used as identifier).
        properties: parsed settings dict.
        df: full data matrix, training rows first.
        y: label vector aligned with df.
        train_size: number of leading rows forming the training split.
        class_col: index of the class column (not used in this body).
        out_folder: output folder for the classifier.
        embeddingmodel_file: embedding model path (basename goes into the id).
        embeddingmodel: loaded embedding model.
        embeddingformat: format tag passed through to tfe.
        text_field_mapping: logical field name -> df column index.
    """
    # NOTE(review): 'overwrite_params' is not a parameter of this function;
    # it must exist as a module-level name or this raises NameError -- confirm.
    print('[STARTED] running settings with label=' +
          exp_util.load_setting("label", properties, overwrite_params))

    # Parse "fieldname,length" descriptors, one per '|' separated entry.
    input_text_info = {}
    count = 0
    for x in exp_util.load_setting("text_fieldnames", properties,
                                   overwrite_params).split("|"):
        config = x.split(",")
        # 'field' instead of the original 'map', which shadowed the builtin.
        field = {}
        field["text_col"] = text_field_mapping[config[0]]
        field["text_length"] = int(config[1])
        field["text_dim"] = util.DNN_EMBEDDING_DIM
        input_text_info[count] = field

        count += 1

    print("creating feature matrix")
    X_all = []
    for k, v in input_text_info.items():
        X_sub = tfe.get_aggr_embedding_vectors(df=df,
                                               text_col=v["text_col"],
                                               text_norm_option=1,
                                               aggr_option=0,
                                               emb_format=embeddingformat,
                                               emb_model=embeddingmodel,
                                               emb_dim=int(v["text_dim"]))
        X_all.append(X_sub)
    # one aggregated-embedding block per text field, concatenated column-wise
    X_all = numpy.concatenate(X_all, axis=1)

    # holdout split: leading train_size rows train, the rest evaluate
    X_train = X_all[0:train_size]
    X_test = X_all[train_size:]
    y_train = y[0:train_size]
    y_test = y[train_size:]

    setting_file = setting_file[setting_file.rfind("/") + 1:]

    models = ["svm_l"]
    for model_name in models:
        identifier = model_name + "|" + embeddingmodel_file[embeddingmodel_file
                                                            .rfind("/") + 1:]
        print("\tML model and embedding=" + model_name)
        print("fitting model...")

        # BUG FIX: categorical_targets previously received the whole label
        # vector y; every sibling call site passes the number of classes.
        cls = cml.Classifer(setting_file,
                            identifier,
                            X_train,
                            y_train,
                            out_folder,
                            categorical_targets=len(set(y)),
                            nfold=None,
                            algorithms=[model_name])
        trained_model = cls.run()[model_name]
        cls.eval_holdout(trained_model, model_name, X_test, y_test)

    print("Completed running all models on this setting file")
    print(datetime.datetime.now())
コード例 #12
0
        # -- fragment: the enclosing loop (defining k etc.) is outside this view --
        # cls = cm.Classifer(k+"stakeholdercls", "_text_only_", X, y, outfolder,
        #                      categorical_targets=6,nfold=n_fold,algorithms=["sgd","svm_l","lr","rf","svm_rbf",
        #                                                                     "pca-sgd", "pca-svm_l", "pca-lr", "pca-rf",
        #                                                                     "pca-svm_rbf"])
        # cls.run()
        #
        #
        # #text+behaviour
        print(">>>>> _text+behaviour_only_ >>>>>")
        print(datetime.datetime.now())
        X, y = fc.create_text_and_behaviour(csv_text_and_behaviour, csv_preprocessed_feature)
        # cls = cm.Classifer(k+"stakeholdercls", "_text+behaviour_", X, y, outfolder,
        #                     categorical_targets=6,nfold=n_fold,algorithms=["sgd","svm_l","lr","rf","svm_rbf",
        #                                                                    "pca-sgd", "pca-svm_l", "pca-lr", "pca-rf",
        #                                                                    "pca-svm_rbf"])
        # only the "lr" algorithm is run for this setting
        cls = cm.Classifer(k + "stakeholdercls", "_text+behaviour_", X, y, outfolder,
                           categorical_targets=6, nfold=n_fold, algorithms=["lr"])
        cls.run()

        #text+dict
        print(">>>>> _text+dict_only_ >>>>>")
        print(datetime.datetime.now())
        X, y = fc.create_text_and_autodict(csv_text_and_behaviour, csv_preprocessed_feature)
        # NOTE(review): df is loaded here but not used within this view;
        # as_matrix() was removed in pandas 1.0 -- .values is the replacement.
        df = pd.read_csv(csv_text_and_behaviour, header=0, delimiter=",", quoting=0).as_matrix()
        cls = cm.Classifer(k + "stakeholdercls", "_text+dict_", X, y, outfolder,
                           categorical_targets=6, nfold=n_fold, algorithms=["sgd", "svm_l", "lr", "rf", "svm_rbf",
                                                                            "pca-sgd", "pca-svm_l", "pca-lr", "pca-rf",
                                                                            "pca-svm_rbf"])
        cls.run()

        #text+behaviour+dict
        print(">>>>> _text+dict+behaviour_only_ >>>>>")