def save_error(self, site, error):
    print('Error', site, error, "\n\n")
    self.processed_sites.append(
        dict(
            success=False,
            donor=site,
            error=error,
        ))
    save_pickle(self.tmp_file_name, self.processed_sites)
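# Every snippet in this collection relies on a save_pickle helper that is not shown,
# and the argument order differs between the codebases (some call save_pickle(path, data),
# others save_pickle(data, path) or save_pickle(data=..., file_path=...)).
# A minimal sketch of one plausible implementation, assuming it simply wraps
# pickle.dump / pickle.load -- illustrative only, not taken from any of the sources:
import pickle
from pathlib import Path


def save_pickle(data, file_path):
    """Serialize `data` to `file_path` with pickle."""
    Path(file_path).parent.mkdir(parents=True, exist_ok=True)
    with open(file_path, 'wb') as f:
        pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)


def load_pickle(file_path):
    """Load and return a previously pickled object."""
    with open(file_path, 'rb') as f:
        return pickle.load(f)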
def train_val_split(self, X: list, y: list, valid_size: float,
                    data_name=None, data_dir=None, save=True):
    logger.info('split train data into train and valid')
    Xy = []
    for i in range(len(X)):
        Xy.append((X[i], y[i]))
    train, valid = train_test_split(
        Xy, test_size=valid_size, random_state=42)
    if save:
        train_path = data_dir / "{}.train.pkl".format(data_name)
        valid_path = data_dir / "{}.valid.pkl".format(data_name)
        save_pickle(data=train, file_path=train_path)
        save_pickle(data=valid, file_path=valid_path)
    return train, valid
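# What train_val_split above does, shown stand-alone with scikit-learn (illustrative
# data; the method itself additionally pickles both splits when save=True and expects
# data_dir to be a pathlib.Path so the `/` join works):
from sklearn.model_selection import train_test_split

X = [[0.1], [0.2], [0.3], [0.4]]
y = [0, 1, 0, 1]
Xy = list(zip(X, y))                       # pair each sample with its label
train, valid = train_test_split(Xy, test_size=0.25, random_state=42)
print(len(train), len(valid))              # 3 1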
def create_mask(manifest_path, output_path, max_distance, n_jobs, xy):
    mask = Mask.load_manifest(manifest_path=manifest_path)
    print_("Manifest loaded.")
    mask.split_manifest(XY=xy)
    print_("Data prepared.")
    pairs_collector = mask.find_pairs(n_jobs=n_jobs, search_range=max_distance)
    print_("Pairs located.")
    extracted_mask = mask.extract(pairs_collector)
    print_("Pairs extracted.")
    save_pickle(output_path, extracted_mask)
    print_("Done.")
def save_data():
    global processed_links, links_to_process, links_with_comments_form, processed_domains
    stage = {
        'processed_links': processed_links,
        'links_to_process': links_to_process,
        'links_with_comments_form': links_with_comments_form,
        'processed_domains': processed_domains,
    }
    print(
        "Saved Data, links_with_comments_form: {}, processed_links: {}, links_to_process: {}, processed_domains: {}"
        .format(len(links_with_comments_form), len(processed_links),
                len(links_to_process), len(processed_domains)))
    save_pickle(tmp_file, stage)
    with open('domains.txt', 'w') as file:
        for link in processed_domains:
            file.write("{}\n".format(link))
def save_traintest(self):
    graphs = self.data[GRAPH]  # load all the graphs
    # labels = self.labels
    # graphs_names = self.graphs_names

    # debug purposes: reshuffle all the data before the splitting
    random_indices = list(range(len(graphs)))
    random.shuffle(random_indices)
    graphs = [graphs[i] for i in random_indices]
    labels = self.labels[random_indices]
    graphs_names = [self.graphs_names[i] for i in random_indices]

    if True:
        train_list_file = '/media/tunguyen/TuTu_Passport/MTAAV/HAN-sec-new/__save_results/reverse__TuTu__vocabtutu__iapi__tfidf__topk=3/9691/train_list.txt'
        test_list_file = '/media/tunguyen/TuTu_Passport/MTAAV/HAN-sec-new/__save_results/reverse__TuTu__vocabtutu__iapi__tfidf__topk=3/9691/test_list.txt'
        train_list_file = '/media/tunguyen/TuTu_Passport/MTAAV/HAN-sec-new/data/TuTu_train_list.txt'
        test_list_file = '/media/tunguyen/TuTu_Passport/MTAAV/HAN-sec-new/data/TuTu_test_list.txt'

        train_files = []
        test_files = []
        g_train = []
        l_train = []
        n_train = []
        g_test = []
        l_test = []
        n_test = []
        with open(train_list_file, 'r') as f:
            train_files = [l.strip() for l in f.readlines()]
        with open(test_list_file, 'r') as f:
            test_files = [l.strip() for l in f.readlines()]
        for i in range(len(labels)):
            graph_jsonpath = graphs_names[i]
            # print(graph_jsonpath)
            if graph_jsonpath in train_files:
                g_train.append(graphs[i])
                l_train.append(labels[i])
                n_train.append(graphs_names[i])
            if graph_jsonpath in test_files:
                g_test.append(graphs[i])
                l_test.append(labels[i])
                n_test.append(graphs_names[i])
        l_train = torch.Tensor(l_train).type(torch.LongTensor)
        l_test = torch.Tensor(l_test).type(torch.LongTensor)
        if self.is_cuda is True:
            l_train = l_train.cuda()
            l_test = l_test.cuda()

    print('[app][save_traintest] len labels', len(labels))
    print('[app][save_traintest] len l_test', len(l_test))
    print('[app][save_traintest] len l_train', len(l_train))
    tot_bgn = (labels == self.mapping['benign']).sum().item()
    tot_mal = (labels == self.mapping['malware']).sum().item()
    print('[app][save_traintest] tot_bgn', tot_bgn, 'tot_mal', tot_mal)

    if not os.path.isdir(self.odir):
        os.makedirs(self.odir)
    save_pickle(g_train, os.path.join(self.odir, 'train'))
    save_pickle(l_train, os.path.join(self.odir, 'train_labels'))
    save_pickle(g_test, os.path.join(self.odir, 'test'))
    save_pickle(l_test, os.path.join(self.odir, 'test_labels'))
def train(self, save_path='', k_fold=10, train_list_file=None, test_list_file=None):
    if self.pretrained_weight is not None:
        self.model = load_checkpoint(self.model, self.pretrained_weight, self.is_cuda)
    save_dir = save_path.split('/checkpoint')[0]
    loss_fcn = torch.nn.CrossEntropyLoss()

    # initialize graphs
    self.accuracies = np.zeros(k_fold)
    graphs = self.data[GRAPH]  # load all the graphs

    # debug purposes: reshuffle all the data before the splitting
    random_indices = list(range(len(graphs)))
    random.shuffle(random_indices)
    graphs = [graphs[i] for i in random_indices]
    labels = self.labels[random_indices]
    graphs_names = [self.graphs_names[i] for i in random_indices]

    split_train_test = True if train_list_file is None and test_list_file is None else False
    print('[app][train] split_train_test', split_train_test)
    '''
    if split_train_test is True:
        print('[app][train] train_list_file', train_list_file)
        print('[app][train] test_list_file', test_list_file)
        #############################
        # Create new train/test set
        # Split train and test
        #############################
        train_size = int(self.TRAIN_SIZE * len(graphs))
        g_train = graphs[:train_size]
        l_train = labels[:train_size]
        n_train = graphs_names[:train_size]
        g_test = graphs[train_size:]
        l_test = labels[train_size:]
        n_test = graphs_names[train_size:]
    else:
        #############################
        # Load train and test graphs from list
        #############################
        train_files = []
        test_files = []
        g_train = []
        l_train = []
        n_train = []
        g_test = []
        l_test = []
        n_test = []
        with open(train_list_file, 'r') as f:
            train_files = [l.strip() for l in f.readlines()]
        with open(test_list_file, 'r') as f:
            test_files = [l.strip() for l in f.readlines()]
        for i in range(len(labels)):
            graph_jsonpath = graphs_names[i]
            # print(graph_jsonpath)
            if graph_jsonpath in train_files:
                g_train.append(graphs[i])
                l_train.append(labels[i])
                n_train.append(graphs_names[i])
            if graph_jsonpath in test_files:
                g_test.append(graphs[i])
                l_test.append(labels[i])
                n_test.append(graphs_names[i])
        l_train = torch.Tensor(l_train).type(torch.LongTensor)
        l_test = torch.Tensor(l_test).type(torch.LongTensor)
        if self.is_cuda is True:
            l_train = l_train.cuda()
            l_test = l_test.cuda()
    '''
    # NOTE: with the split logic above commented out, g_train / l_train / g_test /
    # l_test / n_train / n_test are never assigned in this version of train(), so the
    # code below raises NameError unless they are defined elsewhere.
    print('[app][train] len labels', len(labels))
    print('[app][train] len g_train', len(g_train))
    # print('[app][train] g_train', g_train)

    if not os.path.isdir(self.odir):
        os.makedirs(self.odir)
    save_pickle(g_train, os.path.join(self.odir, 'train'))
    save_pickle(l_train, os.path.join(self.odir, 'train_labels'))
    save_pickle(g_test, os.path.join(self.odir, 'test'))
    save_pickle(l_test, os.path.join(self.odir, 'test_labels'))
    # save graph name list to txt file
    save_txt(n_train, os.path.join(self.odir, 'train_list.txt'))
    save_txt(n_test, os.path.join(self.odir, 'test_list.txt'))

    K = k_fold
    for k in range(K):
        self.model = self.ModelObj(g=self.data_graph[0],
                                   config_params=self.model_config,
                                   n_classes=self.data_nclasses,
                                   n_rels=self.data_nrels,
                                   n_entities=self.data_nentities,
                                   is_cuda=self.is_cuda,
                                   batch_size=1,
                                   model_src_path=self.model_src_path)
        print('*** [app][__init__] Model layers ***')
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                print('\t', name, param.data.type())
        print('>>> [app][__init__] self.model.fc.weight.type', self.model.fc.weight.type())

        optimizer = torch.optim.Adam(
            self.model.parameters(),
            lr=self.learning_config['lr'],
            weight_decay=self.learning_config['weight_decay'])

        start = int(len(g_train) / K) * k
        end = int(len(g_train) / K) * (k + 1)
        print('\n\n\n[app][train] Process new k=' + str(k) + ' | ' +
              str(start) + '-' + str(end))

        # training batch
        train_batch_graphs = g_train[:start] + g_train[end:]
        train_batch_labels = l_train[list(range(0, start)) +
                                     list(range(end + 1, len(g_train)))]
        # NOTE: range(end + 1, ...) skips index `end`, so these labels are one short of
        # (and misaligned with) train_batch_graphs; range(end, len(g_train)) would keep
        # graphs and labels in sync.
        train_batch_samples = list(
            map(list, zip(train_batch_graphs, train_batch_labels)))
        train_batches = DataLoader(
            train_batch_samples,
            batch_size=self.learning_config['batch_size'],
            shuffle=True,
            collate_fn=collate)

        # testing batch
        val_batch_graphs = g_train[start:end]
        val_batch_labels = l_train[start:end]
        # print('[app][train] val_batch_graphs', val_batch_graphs)
        print('[app][train] len val_batch_graphs', len(val_batch_graphs))
        print('[app][train] val_batch_graphs[0].number_of_nodes()', val_batch_graphs[0].number_of_nodes())
        print('[app][train] val_batch_graphs[-1].number_of_nodes()', val_batch_graphs[-1].number_of_nodes())
        val_batch = dgl.batch(val_batch_graphs)

        print('[app][train] train_batches size: ', len(train_batches))
        print('[app][train] train_batch_graphs size: ', len(train_batch_graphs))
        print('[app][train] val_batch_graphs size: ', len(val_batch_graphs))
        print('[app][train] train_batches', train_batches)
        print('[app][train] val_batch_labels', val_batch_labels)

        dur = []
        for epoch in range(self.learning_config['epochs']):
            self.model.train()
            if epoch >= 3:
                t0 = time.time()
            losses = []
            training_accuracies = []
            for iter_idx, (bg, label) in enumerate(train_batches):
                # print('~~~ [app][train] bg', bg)
                logits = self.model(bg)
                if self.learning_config['cuda']:
                    label = label.cuda()
                loss = loss_fcn(logits, label)
                losses.append(loss.item())
                _, indices = torch.max(logits, dim=1)
                # print('~~~~ logits', logits)
                # print('------------------')
                print('\t [app][train] indices', indices)
                # print('\t label', label)
                correct = torch.sum(indices == label)
                training_accuracies.append(correct.item() * 1.0 / len(label))

                optimizer.zero_grad()
                loss.backward(retain_graph=True)
                # loss.backward()
                optimizer.step()

            if epoch >= 3:
                dur.append(time.time() - t0)

            val_acc, val_loss, _ = self.model.eval_graph_classification(
                val_batch_labels, val_batch)
            print(
                "[app][train] Epoch {:05d} | Time(s) {:.4f} | train_acc {:.4f} | train_loss {:.4f} | val_acc {:.4f} | val_loss {:.4f}"
                .format(epoch, np.mean(dur) if dur else 0,
                        np.mean(training_accuracies), np.mean(losses),
                        val_acc, val_loss))

            is_better = self.early_stopping(val_loss, self.model, save_path)
            if is_better:
                self.accuracies[k] = val_acc
            if self.early_stopping.early_stop:
                # Print model's state_dict
                # print("*** Model's state_dict:")
                # for param_tensor in self.model.state_dict():
                #     print(param_tensor, "\t", self.model.state_dict()[param_tensor].size())
                # # Print optimizer's state_dict
                # print("*** Optimizer's state_dict:")
                # for var_name in optimizer.state_dict():
                #     print(var_name, "\t", optimizer.state_dict()[var_name])
                # Save state dict
                # torch.save(self.model.state_dict(), save_dir+'/model_state.pt')
                # Save model
                # torch.save({
                #     'epoch': epoch,
                #     'model_state_dict': self.model.state_dict(),
                #     'optimizer_state_dict': optimizer.state_dict(),
                #     'val_loss': val_loss,
                # }, save_dir+'/saved')
                print("[app][train] Early stopping")
                break

        self.early_stopping.reset()
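# save_txt(...) used by the train() variants here is project-specific and not shown;
# a minimal sketch, assuming it simply writes one item per line:
def save_txt(items, file_path):
    with open(file_path, 'w') as f:
        for item in items:
            f.write('{}\n'.format(item))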
edge_df.to_pickle(f"../data/edge_angle/{molecule}.pkl") if __name__ == "__main__": with utils.timer("make_feature_per_molecule"): for mode in ["train", "test"]: meta_df = pd.read_pickle(f"../pickle/{mode}.pkl").set_index("id") molecules = meta_df["molecule_name"].unique().tolist() st_df = pd.read_pickle("../pickle/structures.pkl") ## train or validのstructureに絞る st_df = st_df[st_df.molecule_name.isin(molecules)]\ [["molecule_name","atom_index","atom","x","y","z"]] # 分子単位に処理 st_gr = st_df.groupby("molecule_name") st_dict = {} for molecule in tqdm(molecules): st_dict[molecule] = st_gr.get_group(molecule) all_file_num = len(molecules) with Pool(4) as p: res = p.map(make_per_molecule, molecules) with utils.timer("concatenate_molecules_feature"): for mode in ["train", "test"]: meta_df = pd.read_pickle(f"../pickle/{mode}.pkl").set_index("id") molecules = meta_df["molecule_name"].unique().tolist() df_list = [] for molecule in tqdm(molecules): df_list.append( utils.load_pickle(f"../data/edge_angle/{molecule}.pkl")) all_df = pd.concat(df_list).reset_index(drop=True) utils.save_pickle(all_df, f"../pickle/{mode}_edge_angle.pkl")
def donors_loop(self, donors):
    Comment = SeleniumChecker()
    total = len(donors)
    count = 0
    for donor in donors:
        count += 1
        if not donor:
            continue
        if donor in self.processed_donors:
            continue
        try:
            tries = 1
            print(getpid(), '{} of {}'.format(count, total), donor)
            Comment.get(donor)
            if not Comment.find_form():
                self.save_error(donor, 'Form not found')
            else:
                comment = random.choice(self.comments)
                author = random.choice(self.usernames)
                email = random.choice(self.emails)
                acceptor = next(self.acceptors)
                posted_data, screenshot_before, screenshot_after = self.post(
                    Comment, donor, acceptor, comment, author, email)
                # login to google account, wp, facebook
                # press button in iframe
                # https://stackoverflow.com/questions/27793187/unable-to-click-on-a-button-inside-an-iframe
                # http://jkuat.ac.ke/campuses/nairobicbd/student-email-portal-e-learning/
                # https://brownsenglish.edu.au/blog/english-student-testimonial-kim/
                # https://georgelakoff.com/2017/07/28/time-to-solve-the-student-debt-crisis/
                while Comment.check_text(
                        'You are being asked to login because') != -1:
                    tries += 1
                    if tries > 7:
                        break
                    Comment.wait()
                    Comment.get(donor)
                    Comment.find_form()
                    comment = random.choice(self.comments)
                    author = random.choice(self.usernames)
                    email = random.choice(self.emails)
                    posted_data, screenshot_before, screenshot_after = self.post(
                        Comment, donor, acceptor, comment, author, email)
                    Comment.unwait()
                while Comment.check_text(
                        'Duplicate comment detected') != -1:
                    tries += 1
                    if tries > 7:
                        break
                    Comment.wait()
                    Comment.get(donor)
                    Comment.find_form()
                    comment = random.choice(self.comments)
                    author = random.choice(self.usernames)
                    email = random.choice(self.emails)
                    acceptor = next(self.acceptors)
                    posted_data, screenshot_before, screenshot_after = self.post(
                        Comment, donor, acceptor, comment, author, email)
                    Comment.unwait()
                while Comment.check_text(
                        'Forbidden. Sender blacklisted.') != -1:
                    tries += 1
                    if tries > 7:
                        break
                    Comment.wait()
                    Comment.get(donor)
                    Comment.find_form()
                    comment = random.choice(self.comments)
                    author = random.choice(self.usernames)
                    email = random.choice(self.emails)
                    acceptor = next(self.acceptors)
                    posted_data, screenshot_before, screenshot_after = self.post(
                        Comment, donor, acceptor, comment, author, email)
                    Comment.unwait()
                self.processed_donors.append(donor)
                self.processed_sites.append(
                    dict(
                        success=True,
                        donor=donor,
                        acceptor=acceptor,
                        comment=comment,
                        author=author,
                        email=email,
                        posted_data=posted_data,
                        screenshot_before=screenshot_before,
                        screenshot_after=screenshot_after,
                    ))
                save_pickle(self.tmp_file_name, self.processed_sites)
        except Exception as e:
            self.save_error(donor, str(e))
def _save_feature(self, train_df, test_df):
    new_cols = self._get_new_cols(train_df, test_df)
    self.train_path.parent.mkdir(exist_ok=True)
    self.test_path.parent.mkdir(exist_ok=True)
    utils.save_pickle(train_df[new_cols], self.train_path)
    utils.save_pickle(test_df[new_cols], self.test_path)
right_on="atom_index", how="left") del df["atom_index"] df = df.merge(atom_df, left_on="atom_index_1", right_on="atom_index", how="left") del df["atom_index"] df.rename(columns={"atom_x": "atom_0", "atom_y": "atom_1"}, inplace=True) df["bond_atom"] = df["atom_0"].map(atom_dict) + df["atom_1"].map(atom_dict) df["bond_atom"] = df["bond_atom"].map(map_func) df["molecule_name"] = molecule_name return df atom_dict = {0: "H", 1: "C", 2: "N", 3: "O", 4: "F"} #mode = "train" mode = "test" meta_df = pd.read_pickle(f"../pickle/{mode}.pkl") molecules = meta_df["molecule_name"].unique() with Pool(4) as p: res = p.map(make_bond, molecules) all_df = pd.concat(res, axis=0).reset_index(drop=True) utils.save_pickle(all_df, f"../pickle/{mode}_bond_v2.pkl") # save only connect info only_bond_df = all_df.loc[all_df.bond_type != -1, ["molecule_name", "atom_index_0", "atom_index_1"]] utils.save_pickle(only_bond_df, f"../pickle/{mode}_bond_v2_only_bond.pkl")
def train(self, save_path='', k_fold=10, train_list_file=None, test_list_file=None):
    if self.pretrained_weight is not None:
        self.model = load_checkpoint(self.model, self.pretrained_weight)
    loss_fcn = torch.nn.CrossEntropyLoss()

    # initialize graphs
    self.accuracies = np.zeros(k_fold)
    graphs = self.data[GRAPH]  # load all the graphs

    # debug purposes: reshuffle all the data before the splitting
    random_indices = list(range(len(graphs)))
    random.shuffle(random_indices)
    graphs = [graphs[i] for i in random_indices]
    labels = self.labels[random_indices]
    graphs_names = [self.graphs_names[i] for i in random_indices]

    split_train_test = True if train_list_file is None and test_list_file is None else False
    print('split_train_test', split_train_test)
    print('train_list_file', train_list_file)
    print('test_list_file', test_list_file)

    if split_train_test is True:
        #############################
        # Create new train/test set
        # Split train and test
        #############################
        train_size = int(self.TRAIN_SIZE * len(graphs))
        g_train = graphs[:train_size]
        l_train = labels[:train_size]
        n_train = graphs_names[:train_size]
        g_test = graphs[train_size:]
        l_test = labels[train_size:]
        n_test = graphs_names[train_size:]
    else:
        #############################
        # Load train and test graphs from list
        #############################
        train_files = []
        test_files = []
        g_train = []
        l_train = []
        n_train = []
        g_test = []
        l_test = []
        n_test = []
        with open(train_list_file, 'r') as f:
            train_files = [l.strip() for l in f.readlines()]
        with open(test_list_file, 'r') as f:
            test_files = [l.strip() for l in f.readlines()]
        for i in range(len(labels)):
            graph_jsonpath = graphs_names[i]
            # print(graph_jsonpath)
            if graph_jsonpath in train_files:
                g_train.append(graphs[i])
                l_train.append(labels[i])
                n_train.append(graphs_names[i])
            if graph_jsonpath in test_files:
                g_test.append(graphs[i])
                l_test.append(labels[i])
                n_test.append(graphs_names[i])
        l_train = torch.Tensor(l_train).type(torch.LongTensor)
        l_test = torch.Tensor(l_test).type(torch.LongTensor)
        if self.is_cuda is True:
            l_train = l_train.cuda()
            l_test = l_test.cuda()

    # print('len g_train', len(g_train))
    # print('g_train', g_train)

    if not os.path.isdir(self.odir):
        os.makedirs(self.odir)
    save_pickle(g_train, os.path.join(self.odir, 'train'))
    save_pickle(l_train, os.path.join(self.odir, 'train_labels'))
    save_pickle(g_test, os.path.join(self.odir, 'test'))
    save_pickle(l_test, os.path.join(self.odir, 'test_labels'))
    # save graph name list to txt file
    save_txt(n_train, os.path.join(self.odir, 'train_list.txt'))
    save_txt(n_test, os.path.join(self.odir, 'test_list.txt'))

    K = k_fold
    for k in range(K):  # K-fold cross validation
        # create GNN model
        # self.model = Model(g=self.data[GRAPH],
        #                    config_params=self.model_config,
        #                    n_classes=self.data[N_CLASSES],
        #                    n_rels=self.data[N_RELS] if N_RELS in self.data else None,
        #                    n_entities=self.data[N_ENTITIES] if N_ENTITIES in self.data else None,
        #                    is_cuda=self.learning_config['cuda'],
        #                    seq_dim=self.seq_max_length,
        #                    batch_size=1)
        optimizer = torch.optim.Adam(
            self.model.parameters(),
            lr=self.learning_config['lr'],
            weight_decay=self.learning_config['weight_decay'])
        if self.learning_config['cuda']:
            self.model.cuda()

        start = int(len(g_train) / K) * k
        end = int(len(g_train) / K) * (k + 1)
        print('\n\n\nProcess new k=' + str(k) + ' | ' + str(start) + '-' + str(end))

        # testing batch
        val_batch_graphs = g_train[start:end]
        val_batch_labels = l_train[start:end]
        val_batch = dgl.batch(val_batch_graphs)

        # training batch
        train_batch_graphs = g_train[:start] + g_train[end:]
        train_batch_labels = l_train[list(range(0, start)) +
                                     list(range(end + 1, len(g_train)))]
        # NOTE: range(end + 1, ...) skips index `end`, so these labels are one short of
        # (and misaligned with) train_batch_graphs; range(end, len(g_train)) would keep
        # graphs and labels in sync.
        train_batch_samples = list(
            map(list, zip(train_batch_graphs, train_batch_labels)))
        train_batches = DataLoader(
            train_batch_samples,
            batch_size=self.learning_config['batch_size'],
            shuffle=True,
            collate_fn=collate)

        print('train_batches size: ', len(train_batches))
        print('train_batch_graphs size: ', len(train_batch_graphs))
        print('val_batch_graphs size: ', len(val_batch_graphs))
        print('train_batches', train_batches)
        print('val_batch_labels', val_batch_labels)

        dur = []
        for epoch in range(self.learning_config['epochs']):
            self.model.train()
            if epoch >= 3:
                t0 = time.time()
            losses = []
            training_accuracies = []
            for iter_idx, (bg, label) in enumerate(train_batches):
                logits = self.model(bg)
                if self.learning_config['cuda']:
                    label = label.cuda()
                loss = loss_fcn(logits, label)
                losses.append(loss.item())
                _, indices = torch.max(logits, dim=1)
                correct = torch.sum(indices == label)
                training_accuracies.append(correct.item() * 1.0 / len(label))

                optimizer.zero_grad()
                loss.backward(retain_graph=True)
                optimizer.step()

            if epoch >= 3:
                dur.append(time.time() - t0)

            val_acc, val_loss, _ = self.model.eval_graph_classification(
                val_batch_labels, val_batch)
            print(
                "Epoch {:05d} | Time(s) {:.4f} | train_acc {:.4f} | train_loss {:.4f} | val_acc {:.4f} | val_loss {:.4f}"
                .format(epoch, np.mean(dur) if dur else 0,
                        np.mean(training_accuracies), np.mean(losses),
                        val_acc, val_loss))

            is_better = self.early_stopping(val_loss, self.model, save_path)
            if is_better:
                self.accuracies[k] = val_acc
            if self.early_stopping.early_stop:
                print("Early stopping")
                break

        self.early_stopping.reset()
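# Both train() variants above pass collate_fn=collate to the DataLoader but do not
# define it in these snippets; a minimal sketch of the usual DGL graph-classification
# collate function, assuming each sample is a [graph, label] pair as built above:
import dgl
import torch


def collate(samples):
    # samples: list of [graph, label] pairs
    graphs, labels = map(list, zip(*samples))
    batched_graph = dgl.batch(graphs)                        # merge graphs into one batched graph
    labels = torch.stack([torch.as_tensor(l) for l in labels])  # stack labels into a 1-D tensor
    return batched_graph, labels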
def gen_tokenizer(save_path, pretrained_model='google/electra-small-discriminator'):
    tokenizer = ElectraTokenizer.from_pretrained(pretrained_model)
    tokenizer.add_tokens(['[E]', '[/E]'])
    save_pickle(save_path, data=tokenizer)
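# Hypothetical round-trip for gen_tokenizer above (the path is illustrative, and it
# assumes a matching load_pickle helper exists in the project). Pickling a tokenizer
# works, but tokenizer.save_pretrained(dir) / ElectraTokenizer.from_pretrained(dir)
# is the more portable Transformers-native alternative:
gen_tokenizer('tokenizer.pkl')
tokenizer = load_pickle('tokenizer.pkl')
print(tokenizer.convert_tokens_to_ids(['[E]', '[/E]']))  # ids of the two added marker tokens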
def __init__(self, data, model_config, learning_config, pretrained_weight,
             early_stopping=True, patience=100, json_path=None,
             pickle_folder=None, vocab_path=None, mapping_path=None,
             odir=None, model_src_path=None):
    if model_src_path is not None:
        # the directory itself (not model_src_path + 'model.py') must be on sys.path
        # for the import below to resolve
        sys.path.insert(0, model_src_path)
        print('*** model_src_path', model_src_path)
        from model_edgnn import Model
    else:
        from models.model import Model

    self.data = data
    self.model_config = model_config
    # max length of a sequence (max nodes among graphs)
    self.learning_config = learning_config
    self.pretrained_weight = pretrained_weight
    self.is_cuda = learning_config['cuda']

    # with open(vocab_path+'/../mapping.json', 'r') as f:
    with open(mapping_path, 'r') as f:
        self.mapping = json.load(f)

    self.labels = self.data[LABELS]
    self.graphs_names = self.data[GNAMES]
    self.data_graph = self.data[GRAPH]

    # save nid and eid to nodes & edges
    # print('self.data_graph[0]', self.data_graph[0])
    if 'nid' not in self.data_graph[0].ndata:
        # if True:
        for k, g in enumerate(self.data_graph):
            g = self.write_nid_eid(g)
            self.data_graph[k] = g
        # print('self.data_graph', self.data_graph)
        save_pickle(self.data_graph, os.path.join(pickle_folder, GRAPH))

    data_nclasses = self.data[N_CLASSES]
    if N_RELS in self.data:
        data_nrels = self.data[N_RELS]
    else:
        data_nrels = None
    if N_ENTITIES in self.data:
        data_nentities = self.data[N_ENTITIES]
    else:
        data_nentities = None

    self.model = Model(g=self.data_graph[0],
                       config_params=model_config,
                       n_classes=data_nclasses,
                       n_rels=data_nrels,
                       n_entities=data_nentities,
                       is_cuda=self.is_cuda,
                       batch_size=1,
                       json_path=json_path,
                       vocab_path=vocab_path,
                       model_src_path=model_src_path)
    if self.is_cuda is True:
        # self.model.cuda()
        print('Use cuda')
        self.model.to(torch.device('cuda'))

    print('*** Model parameters ***')
    pp = 0
    for p in list(self.model.parameters()):
        nn = 1
        for s in list(p.size()):
            # print('p', p)
            print('\t s, nn, nn*s', s, nn, nn * s)
            nn = nn * s
        pp += nn
    print('Total params', pp)

    if early_stopping:
        self.early_stopping = EarlyStopping(patience=patience, verbose=True)

    # Output folder to save train / test data
    if odir is None:
        odir = 'output/' + time.strftime("%Y-%m-%d_%H-%M-%S")
    self.odir = odir
def __init__(self, data, model_config, learning_config, pretrained_weight,
             early_stopping=True, patience=100, json_path=None,
             pickle_folder=None, vocab_path=None, mapping_path=None,
             odir=None, model_src_path=None, append_nid_eid=False,
             gdot_path=None):
    if model_src_path is not None:
        sys.path.insert(1, model_src_path)
        print('*** model_src_path', model_src_path)
        from model_edgnn_o import Model
    else:
        from models.model_edgnn_o import Model

    self.data = data
    self.model_config = model_config
    # max length of a sequence (max nodes among graphs)
    self.learning_config = learning_config
    self.pretrained_weight = pretrained_weight
    self.is_cuda = learning_config['cuda']

    # with open(vocab_path+'/../mapping.json', 'r') as f:
    with open(mapping_path, 'r') as f:
        self.mapping = json.load(f)

    # print('[App][__init__] GNAMES', GNAMES)
    # print('[App][__init__] self.data', self.data)
    self.graphs_names = self.data[GNAMES]
    # print('[App][__init__] self.graphs_names', self.graphs_names)
    self.data_graph = self.data[GRAPH]

    # save nid and eid to nodes & edges
    if append_nid_eid is True:
        print('self.data_graph[0]', self.data_graph[0])
        if 'nid' not in self.data_graph[0].ndata:
            print('*** Not found nid. Appending...')
            # if True:
            for k, g in enumerate(self.data_graph):
                g = self.write_nid_eid(g)
                self.data_graph[k] = g
            # print('self.data_graph', self.data_graph)
            save_pickle(self.data_graph, os.path.join(pickle_folder, GRAPH))

    # data_nclasses = self.data[N_CLASSES]
    data_nclasses = 2
    if N_RELS in self.data:
        data_nrels = self.data[N_RELS]
    else:
        data_nrels = None
    if N_ENTITIES in self.data:
        data_nentities = self.data[N_ENTITIES]
    else:
        data_nentities = None

    self.model = Model(g=self.data_graph[0],
                       config_params=self.model_config,
                       n_classes=data_nclasses,
                       n_rels=data_nrels,
                       n_entities=data_nentities,
                       is_cuda=self.is_cuda,
                       batch_size=1,
                       model_src_path=model_src_path,
                       gdot_path=gdot_path)
    if self.is_cuda is True:
        # self.model.cuda()
        print('* Use cuda')
        self.model.to(torch.device('cuda'))

    # print('*** Model parameters ***')
    # pp = 0
    # print('self.model', self.model)
    # # self.e_src_attn_layer = self.model.edgnn_layers[1].e_src_attn_fc
    # # self.e_group_attn_layer = self.model.edgnn_layers[1].e_group_attn_fc
    # # # self.dst_attn_layer = self.model.edgnn_layers[1].dst_attn_fc
    # # print('self.e_group_attn_layer', self.e_group_attn_layer)
    # # print('self.e_src_attn_layer', self.e_src_attn_layer)
    # # # print('self.dst_attn_layer', self.dst_attn_layer)
    # # self.e_group_attn_layer.register_forward_hook(self.hook)
    # # self.e_src_attn_layer.register_forward_hook(self.hook2)
    # # self.dst_attn_layer.register_forward_hook(self.hook)
    # for p in list(self.model.parameters()):
    #     nn = 1
    #     for s in list(p.size()):
    #         # print('p', p)
    #         # print('\t s, nn, nn*s', s, nn, nn*s)
    #         nn = nn*s
    #     pp += nn
    # print('Total params', pp)

    if early_stopping:
        self.early_stopping = EarlyStopping(patience=patience, verbose=True)

    # Output folder to save train / test data
    if odir is None:
        odir = 'output/' + time.strftime("%Y-%m-%d_%H-%M-%S")
    self.odir = odir
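# EarlyStopping used in both constructors above is project-specific and not shown.
# A minimal sketch inferred from how it is used in these snippets (called with
# (val_loss, model, save_path), returns whether the loss improved, and exposes
# early_stop and reset()); the checkpoint-saving detail is an assumption:
import torch


class EarlyStopping:
    def __init__(self, patience=100, verbose=False):
        self.patience = patience
        self.verbose = verbose
        self.reset()

    def reset(self):
        self.best_loss = None
        self.counter = 0
        self.early_stop = False

    def __call__(self, val_loss, model, save_path):
        if self.best_loss is None or val_loss < self.best_loss:
            self.best_loss = val_loss
            self.counter = 0
            if save_path:
                torch.save(model.state_dict(), save_path)  # assumed checkpoint behaviour
            if self.verbose:
                print('Validation loss improved to {:.4f}'.format(val_loss))
            return True  # callers store the fold accuracy when this is True
        self.counter += 1
        if self.counter >= self.patience:
            self.early_stop = True
        return False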