def run_fasttext_setting_holdout(setting_file, home_dir, train_data_file, test_data_file,
                                 overwrite_params=None):
    # holdout evaluation: train on train_data_file, evaluate on test_data_file
    properties = exp_util.load_properties(setting_file)

    # this is the folder to save output to
    outfolder = home_dir + exp_util.load_setting("output_folder", properties,
                                                 overwrite_params)

    print("\n" + str(datetime.datetime.now()))
    print("loading embedding models...")
    # this is the Gensim-compatible embedding file
    dnn_embedding_file = home_dir + exp_util.load_setting(
        "embedding_file", properties, overwrite_params)
    # e.g., "H:/Python/glove.6B/glove.840B.300d.bin.gensim"
    if dnn_embedding_file.endswith('none'):
        dnn_embedding_file = None

    ######## dnn #######
    print("loading dataset...")
    df, train_size, test_size = exp_util. \
        load_and_merge_train_test_csvRakuten(train_data_file, test_data_file,
                                             delimiter="\t")
    class_col = int(exp_util.load_setting("class_column", properties, overwrite_params))
    y = df[:, class_col]

    target_classes = len(set(y))
    print("\ttotal classes=" + str(target_classes))

    print('[STARTED] running settings with label=' +
          exp_util.load_setting("label", properties, overwrite_params))

    print("fitting model...")
    input_text_info = {}
    count = 0
    for x in exp_util.load_setting("training_text_data_columns", properties,
                                   overwrite_params).split("|"):
        config = x.split(",")
        map = {}
        map["text_col"] = config[0]
        map["text_length"] = int(config[2])
        map["text_dim"] = util.DNN_EMBEDDING_DIM
        input_text_info[count] = map
        count += 1  # advance the index so each column gets its own entry

    dnn_classifier.fit_fasttext_holdout(df=df,
                                        split_at_row=train_size,
                                        class_col=class_col,
                                        outfolder=outfolder,
                                        task=exp_util.describe_task(
                                            properties, overwrite_params, setting_file),
                                        text_norm_option=1,
                                        text_input_info=input_text_info,
                                        embedding_file=dnn_embedding_file)
    print("Completed running on this setting file")
    print(datetime.datetime.now())
def run_fasttext_model(setting_file: str, properties: dict, df: numpy.ndarray, y,
                       train_size: int, class_col: int, outfolder: str,
                       dnn_embedding_file, text_field_mapping: dict,
                       overwrite_params: dict = None):
    print("\n" + str(datetime.datetime.now()))
    target_classes = len(set(y))
    print("\ttotal classes=" + str(target_classes))

    print('[STARTED] running settings with label=' +
          exp_util.load_setting("label", properties, overwrite_params))

    print("fitting model...")
    input_text_info = {}
    count = 0
    for x in exp_util.load_setting("text_fieldnames", properties,
                                   overwrite_params).split("|"):
        config = x.split(",")
        map = {}
        map["text_col"] = text_field_mapping[config[0]]
        map["text_length"] = int(config[1])
        map["text_dim"] = util.DNN_EMBEDDING_DIM
        input_text_info[count] = map
        count += 1

    dnn_classifier.fit_fasttext_holdout(df=df,
                                        split_at_row=train_size,
                                        class_col=class_col,
                                        outfolder=outfolder,
                                        task=exp_util.describe_task(
                                            properties, overwrite_params, setting_file),
                                        text_norm_option=1,
                                        text_input_info=input_text_info,
                                        embedding_file=dnn_embedding_file)
    print("Completed running on this setting file")
    print(datetime.datetime.now())
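# A minimal sketch of the inputs run_fasttext_model expects. The values below are
# illustrative assumptions, not taken from a real settings file: judging from the
# parsing loop above, 'text_fieldnames' is a '|'-separated list of
# "fieldname,max_length" pairs, and text_field_mapping resolves each field name
# to its column index in the data matrix.
#
#   text_field_mapping = {"Name": 1, "Description": 2}          # hypothetical
#   properties["text_fieldnames"] = "Name,100|Description,500"  # hypothetical
#
# which would yield:
#   input_text_info = {0: {"text_col": 1, "text_length": 100, "text_dim": ...},
#                      1: {"text_col": 2, "text_length": 500, "text_dim": ...}}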
def run_dnn_setting(setting_file, home_dir, train_data_file, test_data_file,
                    overwrite_params=None, embedding_format=None):
    properties = exp_util.load_properties(setting_file)

    word_weights_file = exp_util.load_setting('word_weights_file', properties,
                                              overwrite_params)
    if word_weights_file is None:
        word_weights = None
    else:
        word_weights = load_word_weights(word_weights_file)

    # this is the folder to save output to
    outfolder = home_dir + exp_util.load_setting("output_folder", properties,
                                                 overwrite_params)

    print("\n" + str(datetime.datetime.now()))
    print("loading embedding models...")
    # this is the Gensim-compatible embedding file
    dnn_embedding_file = home_dir + exp_util.load_setting(
        "embedding_file", properties, overwrite_params)
    # e.g., "H:/Python/glove.6B/glove.840B.300d.bin.gensim"
    emb_model = embedding_util.load_emb_model(embedding_format, dnn_embedding_file)

    # In order to test different DNN architectures, a parser analyses a descriptor
    # string that follows a specific syntax and creates the corresponding
    # architecture. The (commented-out) CNN descriptor, for example, takes word
    # embeddings, passes them through three conv1d layers, concatenates their
    # outputs by max pooling, and ends in a softmax layer.
    #
    # You can add multiple descriptors to the list, and the program will build each
    # model structure in turn and apply it to the same dataset for experiments.
    #
    # The descriptor is passed as a param to 'Classifer', which parses the string
    # to create a model; see the 'learn_dnn' method in 'classifier_learn.py' for details.
    model_descriptors = [
        "input=2d bilstm=100-False|dense=?-softmax|emb",
        # "input=2d cnn[2,3,4](conv1d=100)|maxpooling1d=4|flatten|dense=?-softmax|emb",
        "input=2d han_2dinput"]
    # model_descriptors = [
    #     "input=2d han_2dinput"]

    # input=3d han_full|glv,
    # input=2d lstm=100-False|dense=?-softmax|glv
    # "scnn[2,3,4](conv1d=100,maxpooling1d=4)|maxpooling1d=4|flatten|dense=6-softmax|glv",
    # "scnn[2,3,4](conv1d=100)|maxpooling1d=4|flatten|dense=6-softmax|glv"]

    ######## dnn #######
    print("loading dataset...")
    df, train_size, test_size = exp_util. \
        load_and_merge_train_test_csvRakuten(train_data_file, test_data_file,
                                             delimiter="\t")
    class_col = int(exp_util.load_setting("class_column", properties, overwrite_params))
    y = df[:, class_col]

    target_classes = len(set(y))
    print("\ttotal classes=" + str(target_classes))

    print('[STARTED] running settings with label=' +
          exp_util.load_setting("label", properties, overwrite_params))

    for model_descriptor in model_descriptors:
        print("\tML model=" + model_descriptor)
        model_descriptor = model_descriptor.split(" ")[1]

        dnn_branches = []
        dnn_branch_input_shapes = []
        input_text_info = {}
        count = 0
        for x in exp_util.load_setting("training_text_data_columns", properties,
                                       overwrite_params).split("|"):
            config = x.split(",")
            map = {}
            map["text_col"] = config[0]
            map["text_length"] = int(config[2])
            map["text_dim"] = util.DNN_EMBEDDING_DIM
            input_text_info[count] = map
            dnn_branch = dnn_classifier.create_dnn_branch(map["text_length"],
                                                          util.DNN_EMBEDDING_DIM,
                                                          model_descriptor=model_descriptor)
            dnn_branches.append(dnn_branch[0])
            dnn_branch_input_shapes.append(dnn_branch[1])
            count += 1

        # now create DNN branches based on the required input text column sources
        print("creating merged model (if multiple input branches)")
        final_model = \
            dnn_classifier.merge_dnn_branch(dnn_branches, dnn_branch_input_shapes,
                                            target_classes)
        print("fitting model..." + str(datetime.datetime.now()))
        setting = os.path.splitext(os.path.basename(setting_file))[0]
        setting = setting[setting.index("_") + 1:]  # keep everything after the first '_'
        desc = 'setting=' + setting
        desc += '|embedding='
        desc += os.path.splitext(os.path.basename(
            exp_util.load_setting('embedding_file', properties, overwrite_params)))[0]
        dnn_classifier.fit_dnn_holdout(df=df,
                                       split_at_row=train_size,
                                       class_col=class_col,
                                       final_model=final_model,
                                       outfolder=outfolder,
                                       task=desc,
                                       model_descriptor=model_descriptor,
                                       text_norm_option=1,
                                       text_input_info=input_text_info,
                                       embedding_model=emb_model,
                                       embedding_model_format=embedding_format,
                                       word_weights=word_weights)
    print("Completed running all models on this setting file")
    print(datetime.datetime.now())
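# A hedged reading of the descriptor syntax used above, inferred only from the
# layer names in the strings (the authoritative interpretation is the parser in
# the 'learn_dnn' method of 'classifier_learn.py'):
#
#   "input=2d bilstm=100-False|dense=?-softmax|emb"
#    - input=2d         : the branch takes 2D (sequence x embedding) input
#    - bilstm=100-False : presumably a BiLSTM with 100 units and a boolean flag
#    - dense=?-softmax  : a softmax dense layer; '?' is presumably filled in with
#                         the number of target classes at build time (compare the
#                         commented "dense=6-softmax" variants above)
#    - emb              : use the pretrained word-embedding layer
#
# '|'-separated segments appear to be stacked in order, so a new architecture can
# be tried by appending another descriptor string to model_descriptors.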
def run_cml_setting_holdout(setting_file, home_dir, train_data_file, test_data_file,
                            overwrite_params=None, embedding_format=None):
    # holdout evaluation: train on train_data_file, evaluate on test_data_file
    properties = exp_util.load_properties(setting_file)

    # this is the folder to save output to
    outfolder = home_dir + exp_util.load_setting("output_folder", properties,
                                                 overwrite_params)

    print("\n" + str(datetime.datetime.now()))
    print("loading embedding models...")
    # this is the Gensim-compatible embedding file
    dnn_embedding_file = home_dir + exp_util.load_setting("embedding_file", properties,
                                                          overwrite_params)
    # e.g., "H:/Python/glove.6B/glove.840B.300d.bin.gensim"
    emb_model = embedding_util.load_emb_model(embedding_format, dnn_embedding_file)

    print("loading dataset...")
    df, train_size, test_size = exp_util. \
        load_and_merge_train_test_csvRakuten(train_data_file, test_data_file,
                                             delimiter="\t")
    class_col = int(exp_util.load_setting("class_column", properties, overwrite_params))
    y = df[:, class_col]

    target_classes = len(set(y))
    print("\ttotal classes=" + str(target_classes))

    print('[STARTED] running settings with label=' +
          exp_util.load_setting("label", properties, overwrite_params))

    print("fitting model...")
    input_text_info = {}
    count = 0
    for x in exp_util.load_setting("training_text_data_columns", properties,
                                   overwrite_params).split("|"):
        config = x.split(",")
        map = {}
        map["text_col"] = config[0]
        map["text_length"] = int(config[2])
        map["text_dim"] = util.DNN_EMBEDDING_DIM
        input_text_info[count] = map
        count += 1

    print("creating feature matrix")
    X_all = []
    for k, v in input_text_info.items():
        X_sub = tfe.get_aggr_embedding_vectors(df=df,
                                               text_col=v["text_col"],
                                               text_norm_option=1,
                                               aggr_option=1,
                                               emb_format=embedding_format,
                                               emb_model=emb_model,
                                               emb_dim=int(v["text_dim"]))
        X_all.append(X_sub)
    X_all = numpy.concatenate(X_all, axis=1)

    setting_file = setting_file[setting_file.rfind("/") + 1:]
    models = ["svm_l"]
    for model_name in models:
        print("\tML model=" + model_name)
        print("fitting model...")
        cls = cml.Classifer(setting_file, model_name,
                            X_all[0:train_size, :], y[0:train_size],
                            outfolder,
                            categorical_targets=target_classes,
                            nfold=None,
                            algorithms=[model_name])
        trained_model = cls.run()[model_name]
        cls.eval_holdout(trained_model, model_name,
                         X_all[train_size:, :], y[train_size:])
    print("Completed running all models on this setting file")
    print(datetime.datetime.now())
def run_cml_setting(setting_file, home_dir, overwrite_params=None,
                    embedding_format=None):
    properties = exp_util.load_properties(setting_file)

    word_weights_file = exp_util.load_setting('word_weights_file', properties,
                                              overwrite_params)
    if word_weights_file is None:
        word_weights = None
    else:
        print("using word weights to revise embedding vectors")
        word_weights = load_word_weights(word_weights_file)

    csv_training_text_data = home_dir + exp_util.load_setting(
        'training_text_data', properties, overwrite_params)
    # this is the folder to save output to
    outfolder = home_dir + exp_util.load_setting("output_folder", properties,
                                                 overwrite_params)

    print("\n" + str(datetime.datetime.now()))
    print("loading embedding models...")
    # this is the Gensim-compatible embedding file
    dnn_embedding_file = home_dir + exp_util.load_setting(
        "embedding_file", properties, overwrite_params)
    # e.g., "H:/Python/glove.6B/glove.840B.300d.bin.gensim"
    emb_model = embedding_util.load_emb_model(embedding_format, dnn_embedding_file)
    n_fold = int(exp_util.load_setting("n_fold", properties, overwrite_params))

    ######## dnn #######
    print("loading dataset...")
    df = pd.read_csv(
        csv_training_text_data,
        header=0,
        delimiter=";",
        quoting=0,
        encoding="utf-8",
    )
    df = df.fillna('')
    df = df.to_numpy()  # DataFrame.as_matrix was removed in pandas 1.0
    class_col = int(
        exp_util.load_setting("class_column", properties, overwrite_params))
    y = df[:, class_col]

    target_classes = len(set(y))
    print("\ttotal classes=" + str(target_classes))

    print('[STARTED] running settings with label=' +
          exp_util.load_setting("label", properties, overwrite_params))

    input_text_info = {}
    count = 0
    for x in exp_util.load_setting("training_text_data_columns", properties,
                                   overwrite_params).split("|"):
        config = x.split(",")
        map = {}
        map["text_col"] = config[0]
        map["text_length"] = int(config[2])
        map["text_dim"] = util.DNN_EMBEDDING_DIM
        input_text_info[count] = map
        count += 1

    print("creating feature matrix")
    X_all = []
    for k, v in input_text_info.items():
        X_sub = tfe.get_aggr_embedding_vectors(df=df,
                                               text_col=v["text_col"],
                                               text_norm_option=1,
                                               aggr_option=0,
                                               emb_format=embedding_format,
                                               emb_model=emb_model,
                                               emb_dim=int(v["text_dim"]))
        X_all.append(X_sub)
    X_all = numpy.concatenate(X_all, axis=1)

    setting_file = setting_file[setting_file.rfind("/") + 1:]
    models = ["svm_l"]
    for model_name in models:
        print("\tML model=" + model_name)
        print("fitting model...")
        cls = cml.Classifer(setting_file, model_name, X_all, y, outfolder,
                            categorical_targets=target_classes,
                            nfold=n_fold,
                            algorithms=[model_name])
        cls.run()
    print("Completed running all models on this setting file")
    print(datetime.datetime.now())
def run_fasttext_setting(setting_file, home_dir, overwrite_params=None):
    properties = exp_util.load_properties(setting_file)

    csv_training_text_data = home_dir + exp_util.load_setting(
        'training_text_data', properties, overwrite_params)
    # this is the folder to save output to
    outfolder = home_dir + exp_util.load_setting("output_folder", properties,
                                                 overwrite_params)

    print("\n" + str(datetime.datetime.now()))
    print("loading embedding models...")
    # this is the Gensim-compatible embedding file
    dnn_embedding_file = home_dir + exp_util.load_setting(
        "embedding_file", properties, overwrite_params)
    # e.g., "H:/Python/glove.6B/glove.840B.300d.bin.gensim"
    if dnn_embedding_file.endswith('none'):
        dnn_embedding_file = None
    n_fold = int(exp_util.load_setting("n_fold", properties, overwrite_params))

    ######## dnn #######
    print("loading dataset...")
    df = pd.read_csv(
        csv_training_text_data,
        header=0,
        delimiter=";",
        quoting=0,
        encoding="utf-8",
    )
    df = df.fillna('')
    df = df.to_numpy()  # DataFrame.as_matrix was removed in pandas 1.0
    class_col = int(
        exp_util.load_setting("class_column", properties, overwrite_params))
    y = df[:, class_col]

    target_classes = len(set(y))
    print("\ttotal classes=" + str(target_classes))

    print('[STARTED] running settings with label=' +
          exp_util.load_setting("label", properties, overwrite_params))

    print("fitting model...")
    input_text_info = {}
    count = 0
    for x in exp_util.load_setting("training_text_data_columns", properties,
                                   overwrite_params).split("|"):
        config = x.split(",")
        map = {}
        map["text_col"] = config[0]
        map["text_length"] = int(config[2])
        map["text_dim"] = util.DNN_EMBEDDING_DIM
        input_text_info[count] = map
        count += 1  # advance the index so each column gets its own entry

    dnn_classifier.fit_fasttext(df=df,
                                nfold=n_fold,
                                class_col=class_col,
                                outfolder=outfolder,
                                task=exp_util.describe_task(
                                    properties, overwrite_params, setting_file),
                                text_norm_option=1,
                                text_input_info=input_text_info,
                                embedding_file=dnn_embedding_file)
    print("Completed running on this setting file")
    print(datetime.datetime.now())
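# A sketch of the 'training_text_data_columns' format assumed by the parsing loops
# above (the concrete values are hypothetical): entries are '|'-separated, and each
# entry looks like "col_index,<label>,max_length" — config[0] is read as the column
# index in the data matrix and config[2] as the maximum text length, while the
# middle field is not used by this code.
#
#   training_text_data_columns=1,name,100|2,desc,500   # hypothetical setting value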
def run_mtl_setting(setting_file, home_dir, overwrite_params=None,
                    embedding_format=None):
    properties = exp_util.load_properties(setting_file)

    word_weights_file = exp_util.load_setting('word_weights_file', properties,
                                              overwrite_params)
    if word_weights_file is None:
        word_weights = None
    else:
        print("using word weights to revise embedding vectors...")
        word_weights = load_word_weights(word_weights_file)

    csv_training_text_data = home_dir + exp_util.load_setting(
        'training_text_data', properties, overwrite_params)
    # this is the folder to save output to
    outfolder = home_dir + exp_util.load_setting("output_folder", properties,
                                                 overwrite_params)

    print("\n" + str(datetime.datetime.now()))
    print("loading embedding models...")
    # this is the Gensim-compatible embedding file
    dnn_embedding_file = home_dir + exp_util.load_setting(
        "embedding_file", properties, overwrite_params)
    # e.g., "H:/Python/glove.6B/glove.840B.300d.bin.gensim"
    emb_model = embedding_util.load_emb_model(embedding_format, dnn_embedding_file)
    n_fold = int(exp_util.load_setting("n_fold", properties, overwrite_params))

    # In order to test different DNN architectures, a parser analyses a descriptor
    # string that follows a specific syntax and creates the corresponding
    # architecture. The (commented-out) CNN descriptor, for example, takes word
    # embeddings, passes them through three conv1d layers, concatenates their
    # outputs by max pooling, and ends in a softmax layer.
    #
    # You can add multiple descriptors to the list, and the program will build each
    # model structure in turn and apply it to the same dataset for experiments.
    #
    # The descriptor is passed as a param to 'Classifer', which parses the string
    # to create a model; see the 'learn_dnn' method in 'classifier_learn.py' for details.
    model_descriptors = [
        "input=2d bilstm=100-False|dense=?-softmax|emb",
        # "input=2d cnn[2,3,4](conv1d=100)|maxpooling1d=4|flatten|dense=?-softmax|emb",
        "input=2d han_2dinput"
    ]
    # model_descriptors = [
    #     "input=2d han_2dinput"]

    # input=3d han_full|glv,
    # input=2d lstm=100-False|dense=?-softmax|glv
    # "scnn[2,3,4](conv1d=100,maxpooling1d=4)|maxpooling1d=4|flatten|dense=6-softmax|glv",
    # "scnn[2,3,4](conv1d=100)|maxpooling1d=4|flatten|dense=6-softmax|glv"]

    ######## dnn #######
    print("loading dataset...")
    df = pd.read_csv(
        csv_training_text_data,
        header=0,
        delimiter=";",
        quoting=0,
        encoding="utf-8",
    )
    df = df.fillna('')
    df = df.to_numpy()  # DataFrame.as_matrix was removed in pandas 1.0

    # stats about the main task
    maintask_class_col = int(
        exp_util.load_setting("class_column", properties, overwrite_params))
    main_y = df[:, maintask_class_col]
    target_classes = len(set(main_y))
    print("\ttotal classes=" + str(target_classes))

    # stats about the auxiliary tasks
    auxtask_class_col = exp_util.load_setting("class_auxiliary", properties,
                                              overwrite_params)
    if auxtask_class_col is None:
        print("Not MTL, quit.")
        exit(1)
    auxtask_class_cols = []
    aux_classes = []
    for i in auxtask_class_col.split(","):
        i = int(i)
        aux_y = df[:, i]
        aux_cls = len(set(aux_y))
        print("\t\t auxiliary task with classes=" + str(aux_cls))
        auxtask_class_cols.append(i)
        aux_classes.append(aux_cls)

    print('[STARTED] running settings with label=' +
          exp_util.load_setting("label", properties, overwrite_params))
    for model_descriptor in model_descriptors:
        print("\tML shared model=" + model_descriptor)
        model_descriptor = model_descriptor.split(" ")[1]

        dnn_branches = []
        dnn_branch_input_shapes = []
        input_text_info = {}
        count = 0
        for x in exp_util.load_setting("training_text_data_columns", properties,
                                       overwrite_params).split("|"):
            config = x.split(",")
            map = {}
            map["text_col"] = config[0]
            map["text_length"] = int(config[2])
            map["text_dim"] = util.DNN_EMBEDDING_DIM
            input_text_info[count] = map
            dnn_branch = dnn_classifier.create_dnn_branch(
                map["text_length"],
                util.DNN_EMBEDDING_DIM,
                model_descriptor=model_descriptor)
            dnn_branches.append(dnn_branch[0])
            dnn_branch_input_shapes.append(dnn_branch[1])
            count += 1

        # now create DNN branches based on the required input text column sources
        print("creating MTL model (if multiple input branches)")
        final_model = \
            mtl_classifier.create_mtl_layers(dnn_branches, dnn_branch_input_shapes,
                                             target_classes, aux_classes)
        print("fitting model...")
        mtl_classifier.fit_dnn_mtl(df=df,
                                   nfold=n_fold,
                                   main_class_col=maintask_class_col,
                                   aux_class_cols=auxtask_class_cols,
                                   final_model=final_model,
                                   outfolder=outfolder,
                                   task=exp_util.describe_task(
                                       properties, overwrite_params, setting_file),
                                   model_descriptor=model_descriptor,
                                   text_norm_option=1,
                                   text_input_info=input_text_info,
                                   embedding_model=emb_model,
                                   embedding_model_format=embedding_format,
                                   word_weights=word_weights)
    print("Completed running all models on this setting file")
    print(datetime.datetime.now())
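# A hypothetical 'class_auxiliary' setting for the MTL run above: a comma-separated
# list of column indices, one per auxiliary task, e.g.
#
#   class_auxiliary=4,7   # hypothetical: columns 4 and 7 hold auxiliary labels
#
# Each listed column contributes one auxiliary output head; its number of classes
# is taken from the distinct values found in that column.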
def run_single_setting(setting_file, home_dir, remove_rare_classes,
                       remove_no_desc_instances, overwrite_params=None,
                       gensimFormat=None):
    properties = exp_util.load_properties(setting_file)

    csv_training_text_data = home_dir + exp_util.load_setting(
        'training_text_data', properties, overwrite_params)
    # this is the folder containing other numeric features that are already pre-extracted
    csv_training_other_features = home_dir + exp_util.load_setting(
        'training_other_features', properties, overwrite_params)
    # this is the folder to save output to
    outfolder = home_dir + exp_util.load_setting("output_folder", properties,
                                                 overwrite_params)

    print("\n" + str(datetime.datetime.now()))
    print("loading embedding models...")
    # this is the Gensim-compatible embedding file
    dnn_embedding_file = home_dir + exp_util.load_setting(
        "embedding_file", properties, overwrite_params)
    # e.g., "H:/Python/glove.6B/glove.840B.300d.bin.gensim"
    if gensimFormat is None:
        gensimFormat = ".gensim" in dnn_embedding_file
    if gensimFormat:
        pretrained_embedding_models = gensim.models.KeyedVectors.load(
            dnn_embedding_file, mmap='r')
    else:
        pretrained_embedding_models = gensim.models.KeyedVectors. \
            load_word2vec_format(dnn_embedding_file, binary=True)

    n_fold = int(exp_util.load_setting("n_fold", properties, overwrite_params))

    # In order to test different DNN architectures, a parser analyses a descriptor
    # string that follows a specific syntax and creates the corresponding
    # architecture. The descriptor below takes word embeddings, passes them through
    # three conv1d layers, concatenates their outputs by max pooling, and ends in
    # a softmax layer.
    #
    # You can add multiple descriptors to the list, and the program will build each
    # model structure in turn and apply it to the same dataset for experiments.
    #
    # The descriptor is passed as a param to 'Classifer', which parses the string
    # to create a model; see the 'learn_dnn' method in 'classifier_learn.py' for details.
    model_descriptors = [
        # "input=2d bilstm=100-False|dense=?-softmax|emb",
        "input=2d cnn[2,3,4](conv1d=100)|maxpooling1d=4|flatten|dense=?-softmax|emb"
    ]
    # "input=2d han_2dinput"]

    # model_descriptors = [
    #     "input=2d han_2dinput"]

    # input=3d han_full|glv,
    # input=2d lstm=100-False|dense=?-softmax|glv
    # "scnn[2,3,4](conv1d=100,maxpooling1d=4)|maxpooling1d=4|flatten|dense=6-softmax|glv",
    # "scnn[2,3,4](conv1d=100)|maxpooling1d=4|flatten|dense=6-softmax|glv"]

    ######## dnn #######
    print("loading dataset...")
    df = pd.read_csv(
        csv_training_text_data,
        header=0,
        delimiter=";",
        quoting=0,
        encoding="utf-8",
    ).to_numpy()  # DataFrame.as_matrix was removed in pandas 1.0
    df = df.astype(str)  # astype returns a copy, so the result must be kept

    if remove_no_desc_instances:
        print("you have chosen to remove instances whose descriptions are empty")
        df = exp_util.remove_empty_desc_instances(df, 5)

    y = df[:, int(
        exp_util.load_setting("class_column", properties, overwrite_params))]
    target_classes = len(set(y))
    print("\ttotal classes=" + str(target_classes))

    remove_instance_indexes = []
    if remove_rare_classes:
        print("you have chosen to remove classes whose instances are less than n_fold")
        instance_labels = list(y)
        class_dist = {x: instance_labels.count(x) for x in instance_labels}
        remove_labels = []
        for k, v in class_dist.items():
            if v < n_fold:
                remove_labels.append(k)
        for i in range(len(y)):
            label = y[i]
            if label in remove_labels:
                remove_instance_indexes.append(i)
        y = numpy.delete(y, remove_instance_indexes)
        target_classes = len(set(y))

    print('[STARTED] running settings with label=' +
          exp_util.load_setting("label", properties, overwrite_params))

    for model_descriptor in model_descriptors:
        print("\tML model=" + model_descriptor)

        input_shape = model_descriptor.split(" ")[0]
        model_descriptor = model_descriptor.split(" ")[1]

        if input_shape.endswith("2d"):
            input_as_2D = True
        else:
            input_as_2D = False

        if "han" in model_descriptor or "lstm" in model_descriptor:
            dnn_embedding_mask_zero = True
        else:
            dnn_embedding_mask_zero = False

        input_column_sources = \
            [x for x in exp_util.load_setting("training_text_data_columns", properties,
                                              overwrite_params).split("|")]

        # now create DNN branches based on the required input text column sources
        dnn_branches = []
        dnn_branch_input_shapes = []
        dnn_branch_input_features = []
        for string in input_column_sources:
            print("\tcreating model branch=" + string)
            config = string.split(",")
            col_index = config[0]
            embedding_trainable = False
            if col_index == '13':
                embedding_trainable = True

            text_data = cc.create_text_input_data(config[0], df)
            col_text_length = int(config[2])
            text_data = numpy.delete(text_data, remove_instance_indexes)
            data = ["" if type(x) is float else str(x) for x in text_data]

            dnn_branch = dnn_classifier.create_dnn_branch_textinput(
                pretrained_embedding_models,
                input_text_data=data,
                input_text_sentence_length=col_text_length,
                input_text_word_embedding_dim=util.DNN_EMBEDDING_DIM,
                model_descriptor=model_descriptor,
                embedding_trainable=embedding_trainable,
                embedding_mask_zero=dnn_embedding_mask_zero)
            dnn_branches.append(dnn_branch[0])
            dnn_branch_input_shapes.append(dnn_branch[1])
            dnn_branch_input_features.append(dnn_branch[2])

        print("creating merged model (if multiple input branches)")
        final_model = \
            dnn_classifier.merge_dnn_branch(dnn_branches, dnn_branch_input_shapes,
                                            target_classes)
        print("fitting model...")
        dnn_classifier.fit_dnn(inputs=dnn_branch_input_features,
                               nfold=n_fold,
                               y_train=y,
                               final_model=final_model,
                               outfolder=outfolder,
                               task=exp_util.describe_task(
                                   properties, overwrite_params, setting_file),
                               model_descriptor=model_descriptor)
    print("Completed running all models on this setting file")
    print(datetime.datetime.now())
def run_setting(
        setting_file,
        home_dir,
        train_data_file,
        test_data_file,
        model_choice,  # dnn (cnn, bilstm, han); cml (svm); fasttext
        dataset_type: str,  # mwpd, wdc, rakuten, icecat, fakerev
        dataset_text_field_mapping: dict,
        overwrite_params=None,
        embedding_format=None):
    properties = exp_util.load_properties(setting_file)

    word_weights_file = exp_util.load_setting('word_weights_file', properties,
                                              overwrite_params)
    if word_weights_file is None:
        word_weights = None
    else:
        print("using word weights to revise embedding vectors...")
        word_weights = load_word_weights(word_weights_file)

    print("loading dataset...")
    if dataset_type == "mwpd":
        df, train_size, test_size = exp_util. \
            load_and_merge_train_test_data_jsonMPWD(train_data_file, test_data_file)
    elif dataset_type == "rakuten":
        df, train_size, test_size = exp_util. \
            load_and_merge_train_test_csvRakuten(train_data_file, test_data_file,
                                                 delimiter="\t")
    elif dataset_type == "icecat":
        df, train_size, test_size = exp_util. \
            load_and_merge_train_test_data_jsonIceCAT(train_data_file, test_data_file)
    elif dataset_type == "fakerev":
        df, train_size, test_size = exp_util. \
            load_and_merge_train_test_data_productfakerev(train_data_file,
                                                          test_data_file)
    else:  # wdc
        df, train_size, test_size = exp_util. \
            load_and_merge_train_test_data_jsonWDC(train_data_file, test_data_file)
    # numpy.nan_to_num(df)

    class_fieldname = exp_util.load_setting("class_fieldname", properties,
                                            overwrite_params)
    class_col = dataset_text_field_mapping[class_fieldname]
    y = df[:, class_col]

    # this is the folder to save output to
    outfolder = home_dir + exp_util.load_setting("output_folder", properties,
                                                 overwrite_params)

    print("\n" + str(datetime.datetime.now()))
    print("loading embedding models...")
    # this is the Gensim-compatible embedding file
    dnn_embedding_file = exp_util.load_setting("embedding_file", properties,
                                               overwrite_params)
    if dnn_embedding_file is not None and dnn_embedding_file.lower() != 'none':
        dnn_embedding_file = home_dir + dnn_embedding_file
        # e.g., "H:/Python/glove.6B/glove.840B.300d.bin.gensim"
        print("\t will use this embedding data model: " + dnn_embedding_file)

    if embedding_format == 'none':
        emb_model = None
    else:
        emb_model = embedding_util.load_emb_model(embedding_format,
                                                  dnn_embedding_file)

    if model_choice == 'dnn':
        run_dnn_models(setting_file, properties, df, y, train_size, class_col,
                       outfolder, emb_model, embedding_format, word_weights,
                       dataset_text_field_mapping, overwrite_params)
    elif model_choice == 'cml':
        run_cml_models(setting_file, properties, df, y, train_size, class_col,
                       outfolder, dnn_embedding_file, emb_model, embedding_format,
                       dataset_text_field_mapping, overwrite_params)
    else:
        run_fasttext_model(setting_file, properties, df, y, train_size, class_col,
                           outfolder, dnn_embedding_file,
                           dataset_text_field_mapping, overwrite_params)
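# A hypothetical invocation of the dispatcher above (every path, file name, and
# mapping here is an assumption for illustration only, not a value from the repo):
#
#   run_setting("settings/exp1_dnn.properties", "/home/user/data/",
#               "train.json", "test.json",
#               model_choice="dnn",          # or "cml" / "fasttext"
#               dataset_type="mwpd",         # or "wdc", "rakuten", "icecat", "fakerev"
#               dataset_text_field_mapping={"Name": 1, "Description": 2, "lvl1": 4},
#               embedding_format="gensim")   # hypothetical format flag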
def run_cml_models(setting_file: str, properties: dict, df: numpy.ndarray, y,
                   train_size: int, class_col: int, out_folder: str,
                   embeddingmodel_file: str, embeddingmodel, embeddingformat,
                   text_field_mapping: dict, overwrite_params: dict = None):
    print('[STARTED] running settings with label=' +
          exp_util.load_setting("label", properties, overwrite_params))

    input_text_info = {}
    count = 0
    for x in exp_util.load_setting("text_fieldnames", properties,
                                   overwrite_params).split("|"):
        config = x.split(",")
        map = {}
        map["text_col"] = text_field_mapping[config[0]]
        map["text_length"] = int(config[1])
        map["text_dim"] = util.DNN_EMBEDDING_DIM
        input_text_info[count] = map
        count += 1

    print("creating feature matrix")
    X_all = []
    for k, v in input_text_info.items():
        X_sub = tfe.get_aggr_embedding_vectors(df=df,
                                               text_col=v["text_col"],
                                               text_norm_option=1,
                                               aggr_option=0,
                                               emb_format=embeddingformat,
                                               emb_model=embeddingmodel,
                                               emb_dim=int(v["text_dim"]))
        X_all.append(X_sub)
    X_all = numpy.concatenate(X_all, axis=1)
    X_train = X_all[0:train_size]
    X_test = X_all[train_size:]
    y_train = y[0:train_size]
    y_test = y[train_size:]

    setting_file = setting_file[setting_file.rfind("/") + 1:]
    models = ["svm_l"]
    for model_name in models:
        identifier = model_name + "|" + \
            embeddingmodel_file[embeddingmodel_file.rfind("/") + 1:]
        print("\tML model and embedding=" + identifier)
        print("fitting model...")
        cls = cml.Classifer(setting_file, identifier, X_train, y_train, out_folder,
                            categorical_targets=len(set(y)),
                            nfold=None,
                            algorithms=[model_name])
        trained_model = cls.run()[model_name]
        cls.eval_holdout(trained_model, model_name, X_test, y_test)
    print("Completed running all models on this setting file")
    print(datetime.datetime.now())
def run_dnn_models(setting_file: str, properties: dict, df: numpy.ndarray, y,
                   train_size: int, class_col: int, out_folder: str,
                   embeddingmodel, embeddingformat, word_weights: list,
                   text_field_mapping: dict, overwrite_params: dict = None):
    # In order to test different DNN architectures, a parser analyses a descriptor
    # string that follows a specific syntax and creates the corresponding
    # architecture. The (commented-out) CNN descriptor, for example, takes word
    # embeddings, passes them through three conv1d layers, concatenates their
    # outputs by max pooling, and ends in a softmax layer.
    #
    # You can add multiple descriptors to the list, and the program will build each
    # model structure in turn and apply it to the same dataset for experiments.
    #
    # The descriptor is passed as a param to 'Classifer', which parses the string
    # to create a model; see the 'learn_dnn' method in 'classifier_learn.py' for details.
    model_descriptors = [
        "input=2d bilstm=100-False|dense=?-softmax|emb",
        # "input=2d cnn[2,3,4](conv1d=100)|maxpooling1d=4|flatten|dense=?-softmax|emb",
        "input=2d han_2dinput"
    ]
    # model_descriptors = [
    #     "input=2d han_2dinput"]

    # input=3d han_full|glv,
    # input=2d lstm=100-False|dense=?-softmax|glv
    # "scnn[2,3,4](conv1d=100,maxpooling1d=4)|maxpooling1d=4|flatten|dense=6-softmax|glv",
    # "scnn[2,3,4](conv1d=100)|maxpooling1d=4|flatten|dense=6-softmax|glv"]

    ######## dnn #######
    target_classes = len(set(y))
    print("\ttotal classes=" + str(target_classes))

    print('[STARTED] running settings with label=' +
          exp_util.load_setting("label", properties, overwrite_params))
    for model_descriptor in model_descriptors:
        print("\tML model=" + model_descriptor)
        model_descriptor = model_descriptor.split(" ")[1]

        dnn_branches = []
        dnn_branch_input_shapes = []
        input_text_info = {}
        count = 0
        for x in exp_util.load_setting("text_fieldnames", properties,
                                       overwrite_params).split("|"):
            config = x.split(",")
            map = {}
            map["text_col"] = text_field_mapping[config[0]]
            map["text_length"] = int(config[1])
            map["text_dim"] = util.DNN_EMBEDDING_DIM
            input_text_info[count] = map
            # if config[1] == 'simple':
            #     dnn_branch = dnn_classifier.create_dnn_branch(map["text_length"],
            #                                                   util.DNN_EMBEDDING_DIM,
            #                                                   model_descriptor='simple')
            # else:
            dnn_branch = dnn_classifier.create_dnn_branch(
                map["text_length"],
                util.DNN_EMBEDDING_DIM,
                model_descriptor=model_descriptor)
            dnn_branches.append(dnn_branch[0])
            dnn_branch_input_shapes.append(dnn_branch[1])
            count += 1

        # now create DNN branches based on the required input text column sources
        print("creating merged model (if multiple input branches)")
        final_model = \
            dnn_classifier.merge_dnn_branch(dnn_branches, dnn_branch_input_shapes,
                                            target_classes)
        print("fitting model...")
        dnn_classifier.fit_dnn_holdout(df=df,
                                       split_at_row=train_size,
                                       class_col=class_col,
                                       final_model=final_model,
                                       outfolder=out_folder,
                                       task=exp_util.describe_task(
                                           properties, overwrite_params, setting_file),
                                       model_descriptor=model_descriptor,
                                       text_norm_option=1,
                                       text_input_info=input_text_info,
                                       embedding_model=embeddingmodel,
                                       embedding_model_format=embeddingformat,
                                       word_weights=word_weights)
    print("Completed running all models on this setting file")
    print(datetime.datetime.now())