def train_with_extended_embeddingvocab(csv_basic_feature, csv_other_feature,
                                       dnn_embedding_file, n_fold,
                                       model_descriptor,
                                       extra_data_for_embeddingvocab,
                                       extra_data_for_embeddingvocab_text_col,
                                       outfolder):
    """Train the 'dnn' classifier on text+autodict features, optionally
    extending the embedding vocabulary with extra text data.

    csv_basic_feature / csv_other_feature: input csv files given to the
        feature creator; the former is also re-read for the raw profile text.
    dnn_embedding_file: Gensim-compatible embedding file used by the dnn.
    n_fold: number of cross-validation folds.
    model_descriptor: dnn architecture descriptor string.
    extra_data_for_embeddingvocab(+_text_col): source (and its text column)
        of extra data used only to extend the embedding vocabulary.
    outfolder: folder where results are written.
    """
    print(datetime.datetime.now())

    tweets_exta = None
    # NOTE(review): gating on len(sys.argv) inside a fully parameterized
    # function is fragile; kept for backward compatibility -- confirm callers.
    if len(sys.argv) > 4:
        tweets_exta = generate_extra_data_for_embeddingvocab(
            extra_data_for_embeddingvocab,
            extra_data_for_embeddingvocab_text_col)

    X, y = fc.create_text_and_autodict(csv_basic_feature, csv_other_feature)

    # .values replaces DataFrame.as_matrix(), which is deprecated and was
    # removed in pandas >= 1.0
    df = pd.read_csv(csv_basic_feature, header=0, delimiter=",",
                     quoting=0).values
    # The original called df.astype(str) and discarded the result (a no-op),
    # so that call is dropped; NaN cells are handled below instead.
    # Column 22 holds the profile text; missing cells come back as float NaN.
    profiles = df[:, 22]
    profiles = ["" if isinstance(x, float) else x for x in profiles]

    cls = cm.Classifer("stakeholdercls", "_dnn_text+autodictext_", X, y,
                       outfolder,
                       categorical_targets=6, algorithms=["dnn"],
                       nfold=n_fold,
                       text_data=profiles,
                       dnn_embedding_file=dnn_embedding_file,
                       dnn_descriptor=model_descriptor,
                       dnn_text_data_extra_for_embedding_vcab=tweets_exta)
    cls.run()
def run_cml_setting(setting_file, home_dir, train_data_file, test_data_file,
                    overwrite_params=None, embedding_format=None):
    """Run classic ML (linear svm) on embedding-aggregated text features,
    training on the train portion and evaluating on the held-out test portion.

    setting_file: properties file describing columns, label, output folder.
    home_dir: prefix prepended to relative paths from the setting file.
    train_data_file / test_data_file: tab-separated files merged (train first)
        so features are extracted consistently, then re-split by train_size.
    overwrite_params: optional overrides for values in the setting file.
    embedding_format: format tag for embedding_util.load_emb_model.
    """
    properties = exp_util.load_properties(setting_file)
    csv_training_text_data = home_dir + exp_util.load_setting(
        'training_text_data', properties, overwrite_params)
    # this is the folder to save output to
    outfolder = home_dir + exp_util.load_setting("output_folder", properties,
                                                 overwrite_params)

    print("\n" + str(datetime.datetime.now()))
    print("loading embedding models...")
    # this is the Gensim compatible embedding file
    dnn_embedding_file = home_dir + exp_util.load_setting(
        "embedding_file", properties, overwrite_params)
    emb_model = embedding_util.load_emb_model(embedding_format,
                                              dnn_embedding_file)

    print("loading dataset...")
    df, train_size, test_size = exp_util. \
        load_and_merge_train_test_csvRakuten(train_data_file, test_data_file,
                                             delimiter="\t")

    class_col = int(exp_util.load_setting("class_column", properties,
                                          overwrite_params))
    y = df[:, class_col]
    # NOTE(review): class count is computed over train+test combined
    target_classes = len(set(y))
    print("\ttotal classes=" + str(target_classes))

    print('[STARTED] running settings with label=' +
          exp_util.load_setting("label", properties, overwrite_params))
    print("fitting model...")

    # parse "col,name,length|..." descriptors of the text columns to use
    input_text_info = {}
    count = 0
    for x in exp_util.load_setting("training_text_data_columns", properties,
                                   overwrite_params).split("|"):
        config = x.split(",")
        # renamed from 'map' to avoid shadowing the builtin
        col_info = {}
        col_info["text_col"] = config[0]
        # config[1] (display name) is intentionally unused here
        col_info["text_length"] = int(config[2])
        col_info["text_dim"] = util.DNN_EMBEDDING_DIM
        input_text_info[count] = col_info
        count += 1

    print("creating feature matrix")
    X_all = []
    for k, v in input_text_info.items():
        X_sub = tfe.get_aggr_embedding_vectors(df=df,
                                               text_col=v["text_col"],
                                               text_norm_option=1,
                                               aggr_option=1,
                                               emb_format=embedding_format,
                                               emb_model=emb_model,
                                               emb_dim=int(v["text_dim"]))
        X_all.append(X_sub)
    X_all = numpy.concatenate(X_all, axis=1)

    # keep only the base name for use as the result identifier
    setting_file = setting_file[setting_file.rfind("/") + 1:]

    models = ["svm_l"]
    for model_name in models:
        print("\tML model=" + model_name)
        print("fitting model...")
        cls = cml.Classifer(setting_file, model_name,
                            X_all[0:train_size, :], y[0:train_size],
                            outfolder,
                            categorical_targets=target_classes,
                            nfold=None, algorithms=[model_name])
        # look up by model_name instead of the hard-coded "svm_l" key, so the
        # loop keeps working if 'models' is ever extended
        trained_model = cls.run()[model_name]
        # evaluate on the held-out test portion
        cls.eval_holdout(trained_model, model_name,
                         X_all[train_size:, :], y[train_size:])
    print("Completed running all models on this setting file")
    print(datetime.datetime.now())
# Split the per-column config string: element 0 is the text column index,
# element 1 its human-readable name.
config = string.split(",")
col_name = config[1]
# Build the raw text input for this column and drop rows flagged for removal.
text_data = cc.create_text_input_data(config[0], df)
text_data = numpy.delete(text_data, remove_instance_indexes)
# pandas represents missing text cells as float NaN; replace them with "".
data = ["" if type(x) is float else x for x in text_data]
# n-gram tf-idf features for this text field
X_ngram, vocab = tfe.get_ngram_tfidf(data)
features_from_separate_fields.append(X_ngram)
# stack the per-field feature matrices column-wise into one matrix
X_all = numpy.concatenate(features_from_separate_fields, axis=1)
print("\tfeature extraction completed.")
print(datetime.datetime.now())

# Naive Bayes over the combined feature matrix
print("\nRunning nb")
cls = cm.Classifer(properties['label'], "nb", X_all, y, outfolder,
                   categorical_targets=target_classes, nfold=n_fold,
                   algorithms=["nb"])
cls.run()
print(datetime.datetime.now())

# k-nearest-neighbours over the same features
print("\nRunning knn")
cls = cm.Classifer(properties['label'], "knn", X_all, y, outfolder,
                   categorical_targets=target_classes, nfold=n_fold,
                   algorithms=["knn"])
# cls = cm.Classifer(k+"stakeholdercls", "_profiletext_", X, y, outfolder, # categorical_targets=6,nfold=n_fold,algorithms=["svm_l"]) # cls.run() # # # #setting 2: use profile text and other features # print(datetime.datetime.now()) # X, y = fc.create_features_text_and_other(csv_basic_feature, csv_other_feature) # cls = cm.Classifer(k+"stakeholdercls", "_profiletext+other_", X, y, outfolder, # categorical_targets=6,nfold=n_fold,algorithms=["svm_l"]) # cls.run() ####### svm, pca ####### # setting 3: use profile text feature only, but using pca print(datetime.datetime.now()) X, y = fc.create_features_text(csv_basic_feature) cls = cm.Classifer(k+"stakeholdercls", "_profiletext_", X, y, outfolder, categorical_targets=6, nfold=n_fold, algorithms=["pca-svm_l"]) cls.run() # setting 4: use profile text and other features, but also using pca print(datetime.datetime.now()) X, y = fc.create_features_text_and_other(csv_basic_feature, csv_other_feature) cls = cm.Classifer(k+"stakeholdercls", "_profiletext+other_", X, y, outfolder, categorical_targets=6, nfold=n_fold, algorithms=["pca-svm_l"]) cls.run()
# Convert feature vectors to float64 type X = X.astype(numpy.float32) return X, y if __name__ == "__main__": #this is the file pointing to the basic features, i.e., just the numeric values #msm4phi/paper2/data/training_data/basic_features.csv csv_feature_folder = sys.argv[1] #this is the folder to save output to outfolder = sys.argv[2] n_fold = 10 print(datetime.datetime.now()) X, y = create_features(csv_feature_folder) #behaviour only print(">>>>> _uddin2018_ >>>>>") print(datetime.datetime.now()) cls = cm.Classifer("stakeholdercls", "_uddin18_", X, y, outfolder, categorical_targets=6, nfold=n_fold, algorithms=["svm_l"]) cls.run()
def run_cml_setting(setting_file, home_dir, overwrite_params=None,
                    embedding_format=None):
    """Run classic ML (linear svm) with n-fold cross validation on
    embedding-aggregated text features from a single training csv.

    setting_file: properties file describing columns, label, output folder.
    home_dir: prefix prepended to relative paths from the setting file.
    overwrite_params: optional overrides for values in the setting file.
    embedding_format: format tag for embedding_util.load_emb_model.
    """
    properties = exp_util.load_properties(setting_file)

    word_weights_file = exp_util.load_setting('word_weights_file', properties,
                                              overwrite_params)
    # 'is None' instead of '== None' (identity check for the None singleton)
    if word_weights_file is None:
        word_weights = None
    else:
        print("using word weights to revise embedding vectors")
        word_weights = load_word_weights(word_weights_file)
    # NOTE(review): word_weights is not referenced again in this function --
    # confirm whether it should be passed on somewhere.

    csv_training_text_data = home_dir + exp_util.load_setting(
        'training_text_data', properties, overwrite_params)
    # this is the folder to save output to
    outfolder = home_dir + exp_util.load_setting("output_folder", properties,
                                                 overwrite_params)

    print("\n" + str(datetime.datetime.now()))
    print("loading embedding models...")
    # this is the Gensim compatible embedding file
    dnn_embedding_file = home_dir + exp_util.load_setting(
        "embedding_file", properties, overwrite_params)
    emb_model = embedding_util.load_emb_model(embedding_format,
                                              dnn_embedding_file)

    n_fold = int(exp_util.load_setting("n_fold", properties, overwrite_params))

    print("loading dataset...")
    df = pd.read_csv(csv_training_text_data, header=0, delimiter=";",
                     quoting=0, encoding="utf-8")
    df = df.fillna('')
    # .values replaces DataFrame.as_matrix(), which is deprecated and was
    # removed in pandas >= 1.0
    df = df.values

    class_col = int(exp_util.load_setting("class_column", properties,
                                          overwrite_params))
    y = df[:, class_col]
    target_classes = len(set(y))
    print("\ttotal classes=" + str(target_classes))

    print('[STARTED] running settings with label=' +
          exp_util.load_setting("label", properties, overwrite_params))

    # parse "col,name,length|..." descriptors of the text columns to use
    input_text_info = {}
    count = 0
    for x in exp_util.load_setting("training_text_data_columns", properties,
                                   overwrite_params).split("|"):
        config = x.split(",")
        # renamed from 'map' to avoid shadowing the builtin
        col_info = {}
        col_info["text_col"] = config[0]
        col_info["text_length"] = int(config[2])
        col_info["text_dim"] = util.DNN_EMBEDDING_DIM
        input_text_info[count] = col_info
        count += 1

    print("creating feature matrix")
    X_all = []
    for k, v in input_text_info.items():
        X_sub = tfe.get_aggr_embedding_vectors(df=df,
                                               text_col=v["text_col"],
                                               text_norm_option=1,
                                               aggr_option=0,
                                               emb_format=embedding_format,
                                               emb_model=emb_model,
                                               emb_dim=int(v["text_dim"]))
        X_all.append(X_sub)
    X_all = numpy.concatenate(X_all, axis=1)

    # keep only the base name for use as the result identifier
    setting_file = setting_file[setting_file.rfind("/") + 1:]

    models = ["svm_l"]
    for model_name in models:
        print("\tML model=" + model_name)
        print("fitting model...")
        cls = cml.Classifer(setting_file, model_name, X_all, y, outfolder,
                            categorical_targets=target_classes,
                            nfold=n_fold, algorithms=[model_name])
        cls.run()
    print("Completed running all models on this setting file")
    print(datetime.datetime.now())
                    header=0, delimiter=",", quoting=0)
df = dfraw.as_matrix()
# NOTE(review): the astype(str) result is discarded -- this is a no-op as
# written; NaN text cells are handled by the list comprehension below.
df.astype(str)
# column 16 holds the profile text; missing cells come back as float NaN
profiles = df[:, 16]
profiles = ["" if type(x) is float else x for x in profiles]

outfolder = sys.argv[4]
# build the classifier only to load/apply a pre-trained model (nfold=None,
# no embedding file): predict() is called instead of run()
cls = cm.Classifer(
    "stakeholdercls", "_dnn_text+autodictext_", X, y, outfolder,
    categorical_targets=6, algorithms=["dnn"], nfold=None,
    text_data=profiles, dnn_embedding_file=None,
    dnn_descriptor=
    "cnn[2,3,4](conv1d=100)|maxpooling1d=4|flatten|dense=6-softmax|glv")
labels = cls.predict(pretrained_model_file)

# append the predicted label as a new column and write the csv back out
headers = list(dfraw.columns.values)
headers.append("label")
with open(outfolder + "/" + csv_basic_feature, 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"',
# pack the accumulated per-instance feature rows into a numpy matrix
    X = numpy.array(X)
    return X, y

if __name__ == "__main__":
    # this is the file pointing to the basic features, i.e., just the numeric values
    # msm4phi/paper2/data/training_data/basic_features.csv
    csv_feature_folder = sys.argv[1]
    # this is the folder to save output to
    training_data_csv = sys.argv[2]
    outfolder = sys.argv[3]
    n_fold = 10

    print(datetime.datetime.now())
    X, y = create_features(csv_feature_folder, training_data_csv)  # behaviour only

    # replicate the penn11 setting: gradient boosted trees on these features
    print(">>>>> _penn11_ >>>>>")
    print(datetime.datetime.now())
    cls = cm.Classifer("stakeholdercls", "_penn11_", X, y, outfolder,
                       categorical_targets=6, nfold=n_fold,
                       algorithms=["gbrt"])
    cls.run()
# v holds the per-dataset file pair: [0] = text data csv, [1] = other features
csv_training_text_data = v[0]
csv_training_other_feaures = v[1]
print(csv_training_text_data)
for model_descriptor in model_descriptors:
    print("\t"+model_descriptor)

    # SETTING0 dnn applied to profile only
    X, y = fc.create_features_text(csv_training_text_data)
    df = pd.read_csv(csv_training_text_data, header=0, delimiter=",",
                     quoting=0).as_matrix()
    # NOTE(review): astype(str) result is discarded -- a no-op as written
    df.astype(str)
    # column 22 holds the profile text; missing cells come back as float NaN
    profiles = df[:, 22]
    profiles = ["" if type(x) is float else x for x in profiles]

    cls = cm.Classifer("stakeholdercls", "_dnn_text_", X, y, outfolder,
                       categorical_targets=6, algorithms=["dnn"],
                       nfold=n_fold,
                       text_data=profiles,
                       dnn_embedding_file=dnn_embedding_file,
                       dnn_descriptor=model_descriptor)
    cls.run()
    print(datetime.datetime.now())

    # X would be the 'metafeature' to pass to the dnn model. Note it MUST NOT
    # contain text and should be ready-to-use features
    X, y = fc.create_features_gazetteer(csv_training_text_data,
                                        csv_training_other_feaures)
    df = pd.read_csv(csv_training_text_data, header=0, delimiter=",",
                     quoting=0).as_matrix()
    df.astype(str)
    profiles = df[:, 22]
    profiles = ["" if type(x) is float else x for x in profiles]

    cls = cm.Classifer("stakeholdercls", "_dnn_text+other_", X, y, outfolder,
                       categorical_targets=6, algorithms=["dnn"],
                       nfold=n_fold,
                       text_data=profiles,
                       dnn_embedding_file=dnn_embedding_file,
                       dnn_descriptor=model_descriptor,
# Convert feature vectors to float64 type X = X.astype(numpy.float32) return X, y if __name__ == "__main__": #this is the file pointing to the basic features, i.e., just the numeric values #msm4phi/paper2/data/training_data/basic_features.csv csv_feature_folder = sys.argv[1] #this is the folder to save output to outfolder = sys.argv[2] n_fold = 10 print(datetime.datetime.now()) X, y = create_features(csv_feature_folder) #behaviour only print(">>>>> _preotiuc15_ >>>>>") print(datetime.datetime.now()) cls = cm.Classifer("stakeholdercls", "_preotiuc15_", X, y, outfolder, categorical_targets=6, nfold=n_fold, algorithms=["gpc"]) cls.run()
def run_cml_models(setting_file: str, properties: dict, df: numpy.ndarray, y,
                   train_size: int, class_col: int, out_folder: str,
                   embeddingmodel_file: str, embeddingmodel, embeddingformat,
                   text_field_mapping: dict):
    """Train classic ML models (linear svm) on embedding-aggregated text
    features and evaluate on the holdout split at row index train_size.

    setting_file: properties file path; only its base name is used in ids.
    properties: parsed setting file (passed to exp_util.load_setting).
    df: merged train+test data matrix; y: its label array.
    train_size: rows [0, train_size) are train, the rest are holdout test.
    class_col: label column index (unused here -- labels come in via y).
    out_folder: where results are written.
    embeddingmodel_file / embeddingmodel / embeddingformat: the embedding
        file path (for the identifier), the loaded model, and its format tag.
    text_field_mapping: maps field names from the setting to column indices.
    """
    # NOTE(review): overwrite_params is read from enclosing/module scope,
    # not a parameter -- confirm it is defined at module level.
    print('[STARTED] running settings with label=' +
          exp_util.load_setting("label", properties, overwrite_params))

    # parse "fieldname,length|..." descriptors of the text fields to use
    input_text_info = {}
    count = 0
    for x in exp_util.load_setting("text_fieldnames", properties,
                                   overwrite_params).split("|"):
        config = x.split(",")
        # renamed from 'map' to avoid shadowing the builtin
        col_info = {}
        col_info["text_col"] = text_field_mapping[config[0]]
        col_info["text_length"] = int(config[1])
        col_info["text_dim"] = util.DNN_EMBEDDING_DIM
        input_text_info[count] = col_info
        count += 1

    print("creating feature matrix")
    X_all = []
    for k, v in input_text_info.items():
        X_sub = tfe.get_aggr_embedding_vectors(df=df,
                                               text_col=v["text_col"],
                                               text_norm_option=1,
                                               aggr_option=0,
                                               emb_format=embeddingformat,
                                               emb_model=embeddingmodel,
                                               emb_dim=int(v["text_dim"]))
        X_all.append(X_sub)
    X_all = numpy.concatenate(X_all, axis=1)

    # split features/labels back into train and holdout portions
    X_train = X_all[0:train_size]
    X_test = X_all[train_size:]
    y_train = y[0:train_size]
    y_test = y[train_size:]

    # keep only the base name for use as the result identifier
    setting_file = setting_file[setting_file.rfind("/") + 1:]

    models = ["svm_l"]
    for model_name in models:
        identifier = model_name + "|" + \
            embeddingmodel_file[embeddingmodel_file.rfind("/") + 1:]
        print("\tML model and embedding=" + model_name)
        print("fitting model...")
        # bug fix: the original passed the whole label array y as
        # categorical_targets; every other call site in this file passes the
        # number of distinct classes, so compute that here
        cls = cml.Classifer(setting_file, identifier, X_train, y_train,
                            out_folder,
                            categorical_targets=len(set(y)),
                            nfold=None, algorithms=[model_name])
        trained_model = cls.run()[model_name]
        cls.eval_holdout(trained_model, model_name, X_test, y_test)
    print("Completed running all models on this setting file")
    print(datetime.datetime.now())
# cls = cm.Classifer(k+"stakeholdercls", "_text_only_", X, y, outfolder, # categorical_targets=6,nfold=n_fold,algorithms=["sgd","svm_l","lr","rf","svm_rbf", # "pca-sgd", "pca-svm_l", "pca-lr", "pca-rf", # "pca-svm_rbf"]) # cls.run() # # # #text+behaviour print(">>>>> _text+behaviour_only_ >>>>>") print(datetime.datetime.now()) X, y = fc.create_text_and_behaviour(csv_text_and_behaviour, csv_preprocessed_feature) # cls = cm.Classifer(k+"stakeholdercls", "_text+behaviour_", X, y, outfolder, # categorical_targets=6,nfold=n_fold,algorithms=["sgd","svm_l","lr","rf","svm_rbf", # "pca-sgd", "pca-svm_l", "pca-lr", "pca-rf", # "pca-svm_rbf"]) cls = cm.Classifer(k + "stakeholdercls", "_text+behaviour_", X, y, outfolder, categorical_targets=6, nfold=n_fold, algorithms=["lr"]) cls.run() #text+dict print(">>>>> _text+dict_only_ >>>>>") print(datetime.datetime.now()) X, y = fc.create_text_and_autodict(csv_text_and_behaviour, csv_preprocessed_feature) df = pd.read_csv(csv_text_and_behaviour, header=0, delimiter=",", quoting=0).as_matrix() cls = cm.Classifer(k + "stakeholdercls", "_text+dict_", X, y, outfolder, categorical_targets=6, nfold=n_fold, algorithms=["sgd", "svm_l", "lr", "rf", "svm_rbf", "pca-sgd", "pca-svm_l", "pca-lr", "pca-rf", "pca-svm_rbf"]) cls.run() #text+behaviour+dict print(">>>>> _text+dict+behaviour_only_ >>>>>")