def init(params, metric, output_path, draw):
    """Build the graph, train initial embeddings, record timing, and persist results.

    Returns the graph together with the trained embeddings and weights.
    """
    def load_data(params):
        # Resolve the network file relative to DATA_PATH, then dispatch to
        # the loader function named in the config.
        params["network_file"] = os.path.join(DATA_PATH, params["network_file"])
        return getattr(dh, params["func"])(params)

    time_path = output_path + "_time"
    G = load_data(params["load_data"])

    embedding_cls = __import__(
        "init_embedding." + params["init_train"]["func"],
        fromlist=["init_embedding"]).NodeEmbedding
    trainer = embedding_cls(params["init_train"], G)
    print("after module_embedding")

    started = datetime.datetime.now()
    embeddings, weights = trainer.train()
    finished = datetime.datetime.now()
    dh.append_to_file(time_path, str(finished - started) + "\n")

    with open(output_path + "_init", "w") as f:
        payload = {"embeddings": embeddings.tolist(),
                   "weights": weights.tolist()}
        f.write(json.dumps(payload))

    metric(embeddings)
    draw(embeddings)
    return G, embeddings, weights
def multilabel_classification(X, params):
    """Evaluate embeddings X with multi-label classification.

    Averages accuracy / micro-F1 / macro-F1 over params["times"] random
    train/test splits and returns the three averaged scores in a dict.
    """
    X_scaled = scale(X)
    n = len(X)
    # Ground truth may optionally be loaded with an explicit label mode.
    if "label_mode" in params:
        y = dh.load_multilabel_ground_truth(
            params["ground_truth"], n, params["label_mode"])
    else:
        y = dh.load_multilabel_ground_truth(params["ground_truth"], n)
    # FIX: Python-2 print statements converted to function calls; this file
    # also uses f-strings, so `print x` was a syntax error under Python 3.
    print(y.shape)
    print(X.shape)
    y = y[:n]
    acc = 0.0
    micro_f1 = 0.0
    macro_f1 = 0.0
    # FIX: `xrange` is Python-2 only; use `range`.
    for _ in range(params["times"]):
        X_train, X_test, y_train, y_test = train_test_split(
            X_scaled, y, test_size=params["test_size"])
        clf = getattr(mll, params["model"]["func"])(X_train, y_train,
                                                    params["model"])
        ret = mll.infer(clf, X_test, y_test)
        acc += ret[1]
        y_score = ret[0]
        micro_f1 += f1_score(y_test, y_score, average='micro')
        macro_f1 += f1_score(y_test, y_score, average='macro')
    acc /= float(params["times"])
    micro_f1 /= float(params["times"])
    macro_f1 /= float(params["times"])
    return {"acc": acc, "micro_f1": micro_f1, "macro_f1": macro_f1}
def extract_tree(params):
    """Load the network, convert it to a matrix, and extract its hierarchy tree."""
    network_path = os.path.join(DATA_PATH, params["network_file"])
    graph = dh.load_graph(network_path)
    adjacency = dh.transfer_to_matrix(graph)
    hierarchy_conf = params["extract_hierarchy_model"]
    hierarchy_mod = __import__('extract_hierarchy.' + hierarchy_conf["func"],
                               fromlist=["extract_hierarchy"])
    return adjacency, hierarchy_mod.extract_hierarchy(graph, hierarchy_conf)
def metric(params, info, pre_res, **kwargs):
    """Run node-classification metrics on previously stored embeddings.

    Loads the node count and the embedding matrix, evaluates them via
    ``classification`` (accuracy, micro-F1, macro-F1) and merges those
    scores into the result dict returned by ``params_handler``.
    """
    res = params_handler(params, info, pre_res)

    # load node number: one node per line in node.txt.
    # FIX: the file is now closed via a context manager instead of manually.
    node_path = os.path.join(DATA_PATH, params["data"], "node.txt")
    with open(node_path, 'r') as node_file:
        node_num = len(node_file.readlines())

    # load embeddings.
    # FIX: the original `if file_type == "txt"` branch and its `else` branch
    # were byte-identical, so the branching was dead code; the file type is
    # simply forwarded to the loader.
    embedding_path = os.path.join(DATA_PATH, "experiment",
                                  params["embeddings_file"])
    X = dh.load_embedding(embedding_path, params["file_type"], node_num)

    # results include: accuracy, micro f1, macro f1
    metric_res = classification(X, params)
    res.update(metric_res)
    return res
def metric(params, info, pre_res, **kwargs):
    """Visualize Gaussian distributions as 2-D ellipses and record the plot path."""
    res = params_handler(params, info)
    mus, row2name = dh.load_dict(os.path.join(params["res_home"], "mus.dat"))
    sigs, row2name = dh.load_dict(os.path.join(params["res_home"], "sigs.dat"))
    std_sigs = np.sqrt(sigs)
    assert len(mus) > 0, "The mus file has no data"

    num_dists, dim = len(mus), len(mus[0])

    # sigs is spherical or diagonal; broadcast a spherical (single-column)
    # sigma out to the full dimensionality.
    if len(std_sigs[0]) == 1:
        std_sigs = np.ones_like(mus) * std_sigs.reshape(num_dists, 1)

    # dimension reduction to 2-D so the ellipses can be drawn
    if dim > 2:
        mus, std_sigs = ct.reduce_dist_dim(mus, std_sigs, 2)

    res["ellipse_path"] = os.path.join(params["res_home"], "dist_ellipse.pdf")
    dg.draw_ellipse(mus, std_sigs, row2name, res["ellipse_path"],
                    params["timesOfSigma"])
    return res
def train_model(params):
    """Extract the hierarchy tree and embed it root-first via DFS.

    Returns the per-node coordinates and radii, and appends them as JSON to
    the training output file (also symlinked as "new_train_res").
    """
    g_mat, tree = extract_tree(params)

    handlers = {
        "get_network": gn(g_mat, params["get_network_hierarchy"]),
        "embedding_model": __import__(
            'node_embedding.' + params["embedding_model"]["func"],
            fromlist=["node_embedding"]).NodeEmbedding,
        "transfer_embeddings": __import__(
            'transfer_embeddings.' + params["transfer_embeddings"]["func"],
            fromlist=["transfer_embeddings"]).TransferEmbedding,
    }

    # The traversal starts at the last tree index with a zero coordinate
    # and the configured initial radius.
    root = len(tree) - 1
    res_coordinates = [None] * len(tree)
    res_coordinates[root] = np.zeros(
        params["embedding_model"]["embedding_size"], dtype=np.float32)
    res_radius = [None] * len(tree)
    res_radius[root] = float(params["init_radius"])

    dfs(root, tree, handlers, params, res_radius, res_coordinates)

    res_path = params["train_output"]
    dh.symlink(res_path, os.path.join(RES_PATH, "new_train_res"))
    dh.append_to_file(
        res_path,
        json.dumps({
            "radius": np.array(res_radius).tolist(),
            "coordinates": np.array(res_coordinates).tolist()
        }))
    return res_coordinates, res_radius
def main_old():
    """CLI entry point: run init / dynamic train loop / metric / draw per --operation."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('--operation', type=str, default="all",
                        help="[all | init | train | metric | draw]")
    parser.add_argument('--conf', type=str, default="default")
    args = parser.parse_args()

    params = dh.load_json_file(os.path.join(CONF_PATH, args.conf + ".json"))

    metric_path_pre = os.path.join(RES_PATH, args.conf)
    # FIX: `== False` comparison replaced with idiomatic `not`.
    if not os.path.exists(metric_path_pre):
        os.mkdir(metric_path_pre)
    output_path = os.path.join(metric_path_pre, dh.get_time_str())
    metric_path = output_path + "_metric"

    def metric(embeddings):
        # Run every configured metric and append its result to the log file.
        if "metrics" not in params:
            return
        for metric in params["metrics"]:
            res = getattr(Metric, metric["func"])(embeddings, metric)
            dh.append_to_file(metric_path, str(res) + "\n")
            # FIX: Python-2 `print res` converted for Python 3.
            print(res)

    dh.symlink(metric_path, os.path.join(metric_path_pre, "new_metric"))

    if "drawers" in params:
        draw_path = output_path + "_draw"
        if not os.path.exists(draw_path):
            os.mkdir(draw_path)
    draw_cnt = [0]

    def draw(embeddings):
        # One image per configured drawer; draw_cnt numbers successive calls.
        if "drawers" not in params:
            return
        for drawer in params['drawers']:
            getattr(Metric, drawer["func"])(embeddings, drawer, draw_path,
                                            draw_cnt[0])
        draw_cnt[0] += 1

    if args.operation == "all":
        G, embeddings, weights = __import__(
            "init." + params["init"]["func"],
            fromlist=["init"]).init(params["init"], metric, output_path, draw)
        __import__("dynamic_loop." + params["main_loop"]["func"],
                   fromlist=["dynamic_loop"]).loop(params["main_loop"], G,
                                                   embeddings, weights,
                                                   metric, output_path, draw)
    elif args.operation == "init":
        G, embeddings, weights = __import__(
            "init." + params["init"]["func"],
            fromlist=["init"]).init(params["init"], metric, output_path, draw)
    elif args.operation == "draw":
        pass
    else:
        # FIX: Python-2 `print "Not Support!"` converted for Python 3.
        print("Not Support!")
def metric(embeddings):
    """Evaluate `embeddings` with every metric configured under params["metrics"].

    Results are appended to `metric_path` and echoed to stdout.  Relies on
    the enclosing scope for `params`, `metric_path`, `Metric` and `dh`.
    """
    if "metrics" not in params:
        return
    for metric in params["metrics"]:
        res = getattr(Metric, metric["func"])(embeddings, metric)
        dh.append_to_file(metric_path, str(res) + "\n")
        # FIX: Python-2 `print res` converted to a function call; this file
        # uses Python-3-only syntax elsewhere.
        print(res)
def test(self, model_path: str, comment: str = None,
         num_words: int = NUM_WORDS,
         max_comment_length: int = MAX_COMMENT_LENGTH) -> None:
    """Test entered comments for abusiveness.

    If `comment` is given, a single prediction is printed; otherwise an
    interactive prompt loop runs until Ctrl+C.
    """
    def predict(comment: str):
        # predict() cleans its own input, so callers pass the raw text.
        comment = dh.clean_comment(comment)
        print(
            'Prediction:',
            tc.predict_proba(np.array([comment]),
                             num_words=num_words,
                             sequence_length=max_comment_length,
                             batch_size=1))

    tc = TextClassifier()
    dh = DataHandler()
    print(f'Loading model from {model_path}')
    tc.load(model_path)

    if comment is not None:
        # FIX: the comment was previously cleaned here AND again inside
        # predict(); clean exactly once (inside predict()).
        predict(comment)
    else:
        try:
            print('Enter a message. '
                  'I will tell you whether it is abusive or not')
            print('To exit, please, press Ctrl+C')
            while True:
                comment = input('--> ')
                predict(comment)
        except KeyboardInterrupt:
            return
def loop(params, G, embeddings, weights, metric, output_path, draw):
    """Dynamic-embedding main loop: grow the graph and retrain until no new nodes arrive."""
    params["get_next"]["input_file"] = os.path.join(
        DATA_PATH, params["get_next"]["input_file"])
    next_cls = __import__("get_next." + params["get_next"]["func"],
                          fromlist=["get_next"]).GetNext
    gn = next_cls(params["get_next"])

    params_new = params["new_embedding"]
    embedding_cls = __import__("init_embedding." + params_new["func"],
                               fromlist=["new_embedding"]).NodeEmbedding

    def new_embedding(G, init_embeddings, init_weights, n):
        # Retrain on the grown graph; this strategy accepts the previous
        # embeddings/weights but does not pass them to the trainer.
        trainer = embedding_cls(params_new, G)
        return trainer.train()

    time_path = output_path + "_time"
    dynamic_embeddings = []
    while True:
        num_new = gn.get_next(G)
        if num_new == 0:
            break
        started = datetime.datetime.now()
        embeddings, weights = new_embedding(G, embeddings, weights, num_new)
        finished = datetime.datetime.now()
        dh.append_to_file(time_path, str(finished - started) + "\n")
        res = metric(embeddings)
        draw(embeddings)
        dynamic_embeddings.append({
            "embeddings": embeddings.tolist(),
            "weights": weights.tolist()
        })

    with open(output_path + "_dynamic", "w") as f:
        f.write(json.dumps(dynamic_embeddings))
def metric(embeddings):
    """Run each configured node-classification metric and log its result.

    Relies on the enclosing scope for `params`, `metric_path`, `Metric`
    and `dh`.
    """
    if "metrics" not in params:
        return
    for conf in params["metrics"]:
        print("[] Start node classification...")
        result = getattr(Metric, conf["func"])(embeddings, conf)
        dh.append_to_file(metric_path, str(result) + "\n")
        print("[+] Metric: " + str(result))
def infer(params, info, pre_res, **kwargs):
    """Build tag one-hot entity embeddings and pickle them to the configured path."""
    res = params_handler(params, info, pre_res)
    folder = info["network_folder"]
    embeds = dh.get_tagonehot(os.path.join(folder["name"], folder["mix_edge"]))
    dh.save_as_pickle(embeds, res["entity_embedding_path"])
    return res
def new_embedding(G, init_embeddings, init_weights, u):
    """Warm-start retraining on graph G; returns the new (embeddings, weights).

    Relies on the enclosing scope for `module_new_batch`,
    `module_new_embedding` and `params_new`.
    """
    unigrams_in = dh.in_degree_distribution(G)
    unigrams_out = dh.out_degree_distribution(G)
    batches = module_new_batch(G, u, params_new["batch_strategy"])
    trainer = module_new_embedding(params_new["embedding_model"],
                                   init_embeddings, init_weights,
                                   unigrams_in, unigrams_out)
    # 1001 is presumably the training iteration count — TODO confirm
    # against NodeEmbedding.train's signature.
    return trainer.train(batches.get_batch, 1001)
def optimize(params, info, pre_res, **kwargs):
    """Train the tag-conditioned embedding model.

    Loads the entity and tag graphs, records their node counts, builds the
    feature matrix, constructs the configured model and batch generator,
    trains, and stores the trained model path under ``res["model_path"]``.

    Returns the result dict produced by ``params_handler``.
    """
    #pdb.set_trace()
    res = params_handler(params, info, pre_res)
    # Entity graph: built from the edge list, the mixed edge list and the
    # entity list inside the configured network folder.
    G_entity = dh.load_entity_as_graph(os.path.join(info["network_folder"]["name"], info["network_folder"]["edge"]), \
            os.path.join(info["network_folder"]["name"], info["network_folder"]["mix_edge"]), \
            os.path.join(info["network_folder"]["name"], info["network_folder"]["entity"]))
    # G.node[id]["tags"] = binary lst tag
    G_tag = dh.load_edge_as_graph(params["walk_file"], \
            os.path.join(info["network_folder"]["name"], info["network_folder"]["tag"]))
    # walk file
    # Record the node counts on both the model config and the shared info
    # dict so downstream modules can read them.
    params["embedding_model"]["en_num"] = len(G_entity.nodes())
    params["embedding_model"]["tag_num"] = len(G_tag.nodes())
    info["en_num"] = params["embedding_model"]["en_num"]
    info["tag_num"] = params["embedding_model"]["tag_num"]
    # get features
    gf_handler = __import__("get_features." + params["get_features"]["func"],
                            fromlist=["sget_features"])
    if "dim" not in params["get_features"]:
        # NOTE(review): this reads params["tag_num"], but the count above
        # was stored under params["embedding_model"]["tag_num"] — confirm a
        # top-level "tag_num" key exists, otherwise this raises KeyError.
        params["get_features"]["dim"] = params["tag_num"]
    features = gf_handler.get_features(params["get_features"], info)  # return numpy
    # model init
    print("[+] The embedding model is model.%s" % (params["embedding_model"]["func"]))
    info["logger"].info("[+] The embedding model is model.%s" % (params["embedding_model"]["func"]))
    params["embedding_model"]["aggregator"]["feature_num"] = params[
        "get_features"]["dim"]
    model_handler = __import__("model." + params["embedding_model"]["func"],
                               fromlist=["model"])
    model = model_handler.TagConditionedEmbedding(params["embedding_model"],
                                                  features)
    model.build_graph()
    # batch generator
    print("[+] The batch strategy is batch_strategy.%s" % (params["batch_strategy"]))
    info["logger"].info("[+] The batch strategy is batch_strategy.%s\n" % (params["batch_strategy"]))
    bs_handler = __import__("batch_strategy." + params["batch_strategy"],
                            fromlist=["batch_strategy"])
    bs = bs_handler.BatchStrategy(G_tag, G_entity, params)
    # train model
    res["model_path"] = model.train(bs.get_batch)
    # infer model
    return res
def infer(params, info, pre_res, **kwargs):
    """Restore the trained model and pickle the inferred entity embeddings."""
    res, G_entity, G_tag, features = params_handler(params, info, pre_res)

    model_mod = __import__("model." + params["embedding_model"]["func"],
                           fromlist=["model"])
    model = model_mod.TagConditionedEmbedding(params["embedding_model"],
                                              features)

    bs_mod = __import__("batch_strategy." + params["batch_strategy"],
                        fromlist=["batch_strategy"])
    batches = bs_mod.BatchStrategy(G_tag, G_entity, params)

    embeds = model.infer(batches.get_all(),
                         params["embedding_model"]["model_path"])
    dh.save_as_pickle(embeds, res["entity_embedding_path"])
    return res
def init(params, metric, output_path, draw):
    """Run training via ut.sage_main and log the elapsed wall-clock time.

    Returns (None, None, None): this initializer does not expose the graph,
    embeddings, or weights to the caller.
    """
    params['output_path'] = output_path
    time_path = output_path + "_time"

    start_time = datetime.datetime.now()
    ut.sage_main(params)
    train_time = datetime.datetime.now() - start_time

    print("the train_time is" + str(train_time))
    dh.append_to_file(time_path, str(train_time) + "\n")
    return None, None, None
def get_account_info_from_csv(self, email):
    """Assemble an account-information dict for `email` from the test-data CSV.

    Returns a dict with 'name', 'address', 'city_state_postal_code',
    'country' and 'phone', each field read via DataHandler().test_data.
    """
    # FIX: the original constructed a fresh DataHandler for every single
    # field lookup (nine instances); one instance serves all reads.
    data = DataHandler()

    name = data.test_data('first_name', email) + " " + \
        data.test_data('last_name', email)
    city_state_postal = data.test_data('city', email) + ", " \
        + data.test_data('state', email) + " " \
        + str(data.test_data('postal_code', email))

    return {
        'name': name,
        'address': str(data.test_data('address_1', email)),
        'city_state_postal_code': city_state_postal,
        'country': data.test_data('country', email),
        'phone': str(data.test_data('mobile_phone', email)),
    }
def visualization(X, params):
    """Draw a 2-D scatter plot of embeddings X colored by ground-truth label.

    Saves the figure as a PDF and returns {"scatter_path": path}.
    """
    ground_truth_path = os.path.join(DATA_PATH, params["data"],
                                     params["ground_truth"])
    labels = dh.load_ground_truth(ground_truth_path)
    labels = labels[:len(X)]

    # Project down to 2-D when the embedding dimension is larger.
    if len(X[0]) > 2:
        X = ct.reduce_embedding_dim(X, 2)
    X = scale(X)

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.set_title('Scatter Plot')
    plt.xlabel('X')
    plt.ylabel('Y')
    colors = ct.label2color(labels)
    ax.scatter(X[:, 0], X[:, 1], c=colors, cmap='viridis', marker='s')
    #plt.legend('x1')

    scatter_path = os.path.join(params["res_home"],
                                params["embeddings_file"] + "scatter.pdf")
    plt.savefig(scatter_path)
    plt.show()
    return {"scatter_path": scatter_path}
def main():
    """Parse CLI args, initialise logging, and run each configured module in order."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("--conf", type=str, default="toy")
    parser.add_argument("--log", type=int, default=0,
                        help="0 if log print out in screen else 1")
    parser.add_argument("--level", type=str, default="INFO",
                        help="log level = INFO | DEBUG")
    args = parser.parse_args()

    params = dh.load_json(os.path.join(CONF_PATH, args.conf + ".json"))
    info = init(args, params["static_info"], params)
    info["logger"].debug("log level is DEBUG")
    info["logger"].info("init finished! \n %s \n" % (info))

    # Each module's result is stored under its name and handed to the next
    # module as pre_res.
    res = {}
    for module in params["run_modules"]:
        info["logger"].info("run module: %s" % (module["func"]))
        name = module["func"]
        conf = module["params"]
        handler = __import__(name + "." + conf["func"], fromlist=[name])
        res[name] = getattr(handler, name)(conf, info=info, pre_res=res,
                                           mdl_name=name)
def init(params, metric, output_path, draw):
    """Load pre-computed embeddings from disk, then evaluate and draw them.

    Returns (None, None, None) because no graph or weights are produced.
    """
    embeddings_path = os.path.join(RES_PATH, params["embeddings_path"])
    stored = dh.load_json_file(embeddings_path)
    embeddings = np.array(stored["embeddings"])
    metric(embeddings)
    draw(embeddings)
    return None, None, None
def loop(params, G, embeddings, weights, metric, output_path, draw):
    """Replay stored dynamic embeddings, running metric() and draw() on each snapshot."""
    snapshots = dh.load_json_file(
        os.path.join(RES_PATH, params["embeddings_path"]))
    for snapshot in snapshots:
        current = np.array(snapshot["embeddings"])
        metric(current)
        draw(current)
def train(self, data_path: str, model_path: str = None,
          glove_path: str = r'./data/glove.840B.300d.txt',
          embedding_dim: int = 300, num_words: int = NUM_WORDS,
          max_comment_length: int = MAX_COMMENT_LENGTH, epochs: int = 10,
          batch_size: int = 512) -> None:
    """Train a text classifier on the data at `data_path`.

    Fits on the train split with GloVe vectors, saves the model to
    `model_path`, then prints the ROC-AUC score on the held-out test split.
    """
    print(f'Loading data from {data_path}')
    data = DataHandler(data_path)
    classifier = TextClassifier()

    print('Fitting to data')
    classifier.fit(data.X_train, data.y_train,
                   num_words=num_words,
                   glove_path=glove_path,
                   embedding_dim=embedding_dim,
                   sequence_length=max_comment_length,
                   validation_data=(data.X_val, data.y_val),
                   epochs=epochs,
                   batch_size=batch_size)
    classifier.save(model_path)

    test_preds = classifier.predict_proba(data.X_test,
                                          num_words=num_words,
                                          sequence_length=max_comment_length,
                                          batch_size=batch_size)
    print('ROC_AUC score for test data:',
          roc_auc_score(data.y_test, test_preds))
def main():
    """Expand a multi-model config into single runs and DFS over traversal params."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('--conf', type=str, default="default")
    args = parser.parse_args()

    params = dh.load_json_file(
        os.path.join(MULTI_CONF_PATH, args.conf + ".json"))
    out_path = os.path.join(RES_PATH,
                            "multi_res_" + str(int(time.time() * 1000.0)))

    # Shared settings: everything except the per-model list.
    single_params = {k: v for k, v in params.items() if k != "models"}

    for model_conf in params["models"]:
        # Fixed (non-traversal) settings override the shared ones.
        for key, value in model_conf.items():
            if key == "traversal":
                continue
            single_params[key] = value
        if "traversal" not in model_conf:
            traversal_items = []
        else:
            traversal_items = [item for item in model_conf["traversal"].items()]
        with open(out_path, "a") as f:
            dfs(traversal_items, single_params, f)

    # Refresh the "MultiRes" symlink to point at the newest results file.
    try:
        os.symlink(out_path, os.path.join(RES_PATH, "MultiRes"))
    except OSError:
        os.remove(os.path.join(RES_PATH, "MultiRes"))
        os.symlink(out_path, os.path.join(RES_PATH, "MultiRes"))
def classification(X, params):
    """Node classification at test sizes 0.1..0.9; returns scores keyed by test size.

    For each test size, accuracy / micro-F1 / macro-F1 are averaged over
    params["times"] stratified splits.
    """
    res = {}
    X_scaled = scale(X)
    y = dh.load_ground_truth(params["ground_truth"])
    y = y[:len(X)]

    ts = 0.0
    for _ in range(9):
        ts += 0.1
        acc, micro_f1, macro_f1 = 0.0, 0.0, 0.0
        for _ in range(params["times"]):
            X_train, X_test, y_train, y_test = train_test_split(
                X_scaled, y, test_size=ts, stratify=y,
                random_state=params["np_seed"])
            clf = getattr(mll, params["model"]["func"])(X_train, y_train,
                                                        params["model"])
            ret = mll.infer(clf, X_test, y_test)
            acc += ret[1]
            y_score = ret[0]
            micro_f1 += f1_score(y_test, y_score, average='micro')
            macro_f1 += f1_score(y_test, y_score, average='macro')
        runs = float(params["times"])
        acc /= runs
        micro_f1 /= runs
        macro_f1 /= runs
        print("test_size:", ts)
        scores = {"acc": acc, "micro_f1": micro_f1, "macro_f1": macro_f1}
        res["%.2f" % ts] = scores
        print(scores)
    return res
def main():
    """CLI entry: extract the hierarchy tree, train, and/or run metrics."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('--operation', type=str, default="all",
                        help="[all | extract_tree | train | metric]")
    parser.add_argument('--conf', type=str, default="default")
    parser.add_argument('--metric_input', type=str, default="new_train_res")
    parser.add_argument('--train_output', type=str,
                        default=str(int(time.time() * 1000.0)))
    parser.add_argument('--metric_output', type=str,
                        default=str(int(time.time() * 1000.0)))
    args = parser.parse_args()

    params = dh.load_json_file(os.path.join(CONF_PATH, args.conf + ".json"))
    params["metric_input"] = os.path.join(RES_PATH, args.metric_input)
    params["train_output"] = os.path.join(RES_PATH,
                                          "train_res_" + args.train_output)
    params["metric_output"] = os.path.join(RES_PATH,
                                           "metric_res_" + args.metric_output)

    if args.operation == "all":
        train_model(params)
        metric(params)
    elif args.operation == "extract_tree":
        extract_tree(params)
    elif args.operation == "train":
        train_model(params)
    elif args.operation == "metric":
        metric(params)
    else:
        # FIX: Python-2 `print "Not Support!"` converted for Python 3.
        print("Not Support!")
def extract_hierarchy(G, params):
    """Build the hierarchy tree from the configured tree file.

    `G` is part of the extractor interface but unused by this file-based
    implementation.
    """
    g, n, m = dh.load_tree(os.path.join(DATA_PATH, params["file_path"]))
    tree = [None] * n
    for node_id in g:
        tree[node_id] = Node(node_id, set(g[node_id].keys()), set())
    # Traversal starts from node n - 1 and fills in the derived fields.
    dfs(n - 1, tree)
    return tree
def main():
    """CLI entry: train and/or evaluate a single model configuration."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('--operation', type=str, default="all",
                        help="[all | train | metric | draw]")
    parser.add_argument('--conf', type=str, default="default")
    parser.add_argument('--iteration', type=int, default=10001)
    parser.add_argument('--model', type=str, default="model_simple")
    args = parser.parse_args()

    params = dh.load_json_file(
        os.path.join(SINGLE_CONF_PATH, args.conf + ".json"))
    params["iteration"] = args.iteration
    params["model"] = args.model

    if args.operation == "all":
        train_model(params)
        metric(params)
    elif args.operation == "train":
        train_model(params)
    elif args.operation == "metric":
        metric(params)
    elif args.operation == "draw":
        pass
    else:
        # FIX: Python-2 `print "Not Support!"` converted for Python 3.
        print("Not Support!")
def metric(params):
    """Run every configured metric against the ground-truth graph; return the results."""
    G_truth = dh.load_ground_truth(
        os.path.join(DATA_PATH, params["ground_truth_file"]))
    return [getattr(Metric, conf["func"])(G_truth, conf)
            for conf in params["metric_function"]]
def classification(X, params):
    """Node classification with a fixed-size train/test split.

    The first params["n_train"] rows train the classifier; the remainder
    are the test set.  Accuracy / micro-F1 / macro-F1 are averaged over
    params["times"] runs and returned as a dict.
    """
    X_scaled = scale(X)
    ground_truth_path = os.path.join(DATA_PATH, params["data"],
                                     params["ground_truth"])
    y = dh.load_ground_truth(ground_truth_path)
    y = y[:len(X)]

    acc = 0.0
    micro_f1 = 0.0
    macro_f1 = 0.0
    n_train = params["n_train"]
    print("number_of_train_set", n_train)
    for _ in range(params["times"]):
        # FIX: the scaled features were computed but never used — the split
        # read the raw X.  Split X_scaled instead, consistent with the other
        # classification() variant in this codebase.
        X_train, X_test = X_scaled[:n_train, :], X_scaled[n_train:, :]
        y_train, y_test = y[:n_train], y[n_train:]
        clf = getattr(mll, params["model"]["func"])(X_train, y_train,
                                                    params["model"])
        ret = mll.infer(clf, X_test, y_test)
        acc += ret[1]
        y_score = ret[0]
        micro_f1 += f1_score(y_test, y_score, average='micro')
        macro_f1 += f1_score(y_test, y_score, average='macro')
    acc /= float(params["times"])
    micro_f1 /= float(params["times"])
    macro_f1 /= float(params["times"])
    res = {"acc": acc, "micro_f1": micro_f1, "macro_f1": macro_f1}
    print(res)
    return res
def main():
    """Parse CLI args, initialise the run context, and execute each configured module."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("--conf", type=str, default="lc")
    parser.add_argument("--level", type=str, default="INFO",
                        help="log level = INFO | DEBUG")
    args = parser.parse_args()

    params = dh.load_json(os.path.join(CONF_PATH, args.conf + ".json"))
    info = init(args, params["static_info"], params)
    info["logger"].info("init finished! \n %s \n" % (info))
    info["logger"].debug("log level is DEBUG")

    res = {}
    for module in params["run_modules"]:
        mdl_name = module["func"]
        mdl_params = module["params"]
        print(mdl_name)
        # Drop into the debugger between modules when running at DEBUG level.
        if info["debug_level"] == 'DEBUG':
            pdb.set_trace()
        #if mdl_name in ["metric"]:
        #    continue
        handler = __import__(mdl_name + "." + mdl_params["func"],
                             fromlist=[mdl_name])
        res[mdl_name] = getattr(handler, mdl_name)(mdl_params, info=info,
                                                   pre_res=res,
                                                   mdl_name=mdl_name)