Example #1
    def save_error(self, site, error):
        print('Error', site, error, "\n\n")
        self.processed_sites.append(
            dict(
                success=False,
                donor=site,
                error=error,
            ))
        save_pickle(self.tmp_file_name, self.processed_sites)
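Note: the save_pickle helper itself is not defined in any of these examples, and the calling convention varies (Example #1 passes the path first, Example #2 passes data= and file_path= keywords). A minimal sketch of such a helper, assuming a (file_path, data) argument order:

import pickle


def save_pickle(file_path, data):
    # Serialize `data` to `file_path` using the standard pickle module.
    with open(file_path, 'wb') as f:
        pickle.dump(data, f)


def load_pickle(file_path):
    # Counterpart loader for objects written by save_pickle.
    with open(file_path, 'rb') as f:
        return pickle.load(f)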
Example #2
    def train_val_split(self, X: list, y: list, valid_size: float,
                        data_name=None, data_dir=None, save=True):
        logger.info('split train data into train and valid')
        Xy = []
        for i in range(len(X)):
            Xy.append((X[i], y[i]))
        train, valid = train_test_split(
            Xy, test_size=valid_size, random_state=42)
        if save:
            train_path = data_dir / "{}.train.pkl".format(data_name)
            valid_path = data_dir / "{}.valid.pkl".format(data_name)
            save_pickle(data=train, file_path=train_path)
            save_pickle(data=valid, file_path=valid_path)
        return train, valid
Example #3
def create_mask(manifest_path, output_path, max_distance, n_jobs, xy):
    mask = Mask.load_manifest(manifest_path=manifest_path)
    print_("Manifest loaded.")

    mask.split_manifest(XY=xy)
    print_("Data prepared.")

    pairs_collector = mask.find_pairs(n_jobs=n_jobs, search_range=max_distance)
    print_("Pairs located.")

    extracted_mask = mask.extract(pairs_collector)
    print_("Pairs extracted.")

    save_pickle(output_path, extracted_mask)
    print_("Done.")
Example #4
def save_data():
    global processed_links, links_to_process, links_with_comments_form, processed_domains
    stage = {
        'processed_links': processed_links,
        'links_to_process': links_to_process,
        'links_with_comments_form': links_with_comments_form,
        'processed_domains': processed_domains,
    }
    print(
        "Saved Data, links_with_comments_form: {}, processed_links: {}, links_to_process: {}, processed_domains: {}"
        .format(len(links_with_comments_form), len(processed_links),
                len(links_to_process), len(processed_domains)))
    save_pickle(tmp_file, stage)

    with open('domains.txt', 'w') as file:
        for link in processed_domains:
            file.write("{}\n".format(link))
Example #5
    def save_traintest(self):
        graphs = self.data[GRAPH]  # load all the graphs
        # labels = self.labels
        # graphs_names = self.graphs_names
        # debug purposes: reshuffle all the data before the splitting
        random_indices = list(range(len(graphs)))
        random.shuffle(random_indices)
        graphs = [graphs[i] for i in random_indices]
        labels = self.labels[random_indices]
        graphs_names = [self.graphs_names[i] for i in random_indices]

        if True:
            train_list_file = '/media/tunguyen/TuTu_Passport/MTAAV/HAN-sec-new/__save_results/reverse__TuTu__vocabtutu__iapi__tfidf__topk=3/9691/train_list.txt'
            test_list_file = '/media/tunguyen/TuTu_Passport/MTAAV/HAN-sec-new/__save_results/reverse__TuTu__vocabtutu__iapi__tfidf__topk=3/9691/test_list.txt'

            train_list_file = '/media/tunguyen/TuTu_Passport/MTAAV/HAN-sec-new/data/TuTu_train_list.txt'
            test_list_file = '/media/tunguyen/TuTu_Passport/MTAAV/HAN-sec-new/data/TuTu_test_list.txt'

            train_files = []
            test_files = []
            g_train = []
            l_train = []
            n_train = []
            g_test = []
            l_test = []
            n_test = []
            with open(train_list_file, 'r') as f:
                train_files = [l.strip() for l in f.readlines()]
            with open(test_list_file, 'r') as f:
                test_files = [l.strip() for l in f.readlines()]

            for i in range(len(labels)):
                graph_jsonpath = graphs_names[i]
                # print(graph_jsonpath)
                if graph_jsonpath in train_files:
                    g_train.append(graphs[i])
                    l_train.append(labels[i])
                    n_train.append(graphs_names[i])
                if graph_jsonpath in test_files:
                    g_test.append(graphs[i])
                    l_test.append(labels[i])
                    n_test.append(graphs_names[i])

            l_train = torch.Tensor(l_train).type(torch.LongTensor)
            l_test = torch.Tensor(l_test).type(torch.LongTensor)
            if self.is_cuda is True:
                l_train = l_train.cuda()
                l_test = l_test.cuda()

        print('[app][save_traintest] len labels', len(labels))
        print('[app][save_traintest] len l_test', len(l_test))
        print('[app][save_traintest] len l_train', len(l_train))
        tot_bgn = (labels == self.mapping['benign']).sum().item()
        tot_mal = (labels == self.mapping['malware']).sum().item()
        print('[app][save_traintest] tot_bgn', tot_bgn, 'tot_mal', tot_mal)

        if not os.path.isdir(self.odir):
            os.makedirs(self.odir)
        save_pickle(g_train, os.path.join(self.odir, 'train'))
        save_pickle(l_train, os.path.join(self.odir, 'train_labels'))
        save_pickle(g_test, os.path.join(self.odir, 'test'))
        save_pickle(l_test, os.path.join(self.odir, 'test_labels'))
Example #6
    def train(self,
              save_path='',
              k_fold=10,
              train_list_file=None,
              test_list_file=None):
        if self.pretrained_weight is not None:
            self.model = load_checkpoint(self.model, self.pretrained_weight,
                                         self.is_cuda)
        save_dir = save_path.split('/checkpoint')[0]

        loss_fcn = torch.nn.CrossEntropyLoss()

        # initialize graphs
        self.accuracies = np.zeros(k_fold)
        graphs = self.data[GRAPH]  # load all the graphs

        # debug purposes: reshuffle all the data before the splitting
        random_indices = list(range(len(graphs)))
        random.shuffle(random_indices)
        graphs = [graphs[i] for i in random_indices]
        labels = self.labels[random_indices]
        graphs_names = [self.graphs_names[i] for i in random_indices]

        split_train_test = train_list_file is None and test_list_file is None
        print('[app][train] split_train_test', split_train_test)
        if split_train_test is True:
            print('[app][train] train_list_file', train_list_file)
            print('[app][train] test_list_file', test_list_file)
            #############################
            # Create new train/test set
            # Split train and test
            #############################
            train_size = int(self.TRAIN_SIZE * len(graphs))
            g_train = graphs[:train_size]
            l_train = labels[:train_size]
            n_train = graphs_names[:train_size]

            g_test = graphs[train_size:]
            l_test = labels[train_size:]
            n_test = graphs_names[train_size:]
            
        else:
            #############################
            # Load train and test graphs from list
            #############################
            train_files = []
            test_files = []
            g_train = []
            l_train = []
            n_train = []
            g_test = []
            l_test = []
            n_test = []
            with open(train_list_file, 'r') as f:
                train_files = [l.strip() for l in f.readlines()]
            with open(test_list_file, 'r') as f:
                test_files = [l.strip() for l in f.readlines()]
            
            for i in range(len(labels)):
                graph_jsonpath = graphs_names[i]
                # print(graph_jsonpath)
                if graph_jsonpath in train_files:
                    g_train.append(graphs[i])
                    l_train.append(labels[i])
                    n_train.append(graphs_names[i])
                if graph_jsonpath in test_files:
                    g_test.append(graphs[i])
                    l_test.append(labels[i])
                    n_test.append(graphs_names[i])

            l_train = torch.Tensor(l_train).type(torch.LongTensor)
            l_test = torch.Tensor(l_test).type(torch.LongTensor)
            if self.is_cuda is True:
                l_train = l_train.cuda()
                l_test = l_test.cuda()

        print('[app][train] len labels', len(labels))
        print('[app][train] len g_train', len(g_train))
        # print('[app][train] g_train', g_train)

        if not os.path.isdir(self.odir):
            os.makedirs(self.odir)
        save_pickle(g_train, os.path.join(self.odir, 'train'))
        save_pickle(l_train, os.path.join(self.odir, 'train_labels'))
        save_pickle(g_test, os.path.join(self.odir, 'test'))
        save_pickle(l_test, os.path.join(self.odir, 'test_labels'))

        # save graph name list to txt file
        save_txt(n_train, os.path.join(self.odir, 'train_list.txt'))
        save_txt(n_test, os.path.join(self.odir, 'test_list.txt'))

        K = k_fold
        for k in range(K):
            self.model = self.ModelObj(g=self.data_graph[0],
                                       config_params=self.model_config,
                                       n_classes=self.data_nclasses,
                                       n_rels=self.data_nrels,
                                       n_entities=self.data_nentities,
                                       is_cuda=self.is_cuda,
                                       batch_size=1,
                                       model_src_path=self.model_src_path)

            print('*** [app][__init__] Model layers ***')
            for name, param in self.model.named_parameters():
                if param.requires_grad:
                    print('\t', name, param.data.type())

            print('>>> [app][__init__] self.model.fc.weight.type',
                  self.model.fc.weight.type())

            optimizer = torch.optim.Adam(
                self.model.parameters(),
                lr=self.learning_config['lr'],
                weight_decay=self.learning_config['weight_decay'])

            start = int(len(g_train) / K) * k
            end = int(len(g_train) / K) * (k + 1)
            print('\n\n\n[app][train] Process new k=' + str(k) + ' | ' +
                  str(start) + '-' + str(end))

            # training batch
            train_batch_graphs = g_train[:start] + g_train[end:]
            # index range must match the graph slice g_train[:start] + g_train[end:]
            train_batch_labels = l_train[list(range(0, start)) +
                                         list(range(end, len(g_train)))]
            train_batch_samples = list(
                map(list, zip(train_batch_graphs, train_batch_labels)))
            train_batches = DataLoader(
                train_batch_samples,
                batch_size=self.learning_config['batch_size'],
                shuffle=True,
                collate_fn=collate)

            # testing batch
            val_batch_graphs = g_train[start:end]
            val_batch_labels = l_train[start:end]
            # print('[app][train] val_batch_graphs', val_batch_graphs)
            print('[app][train] len val_batch_graphs', len(val_batch_graphs))
            print('[app][train] val_batch_graphs[0].number_of_nodes()',
                  val_batch_graphs[0].number_of_nodes())
            print('[app][train] val_batch_graphs[-1].number_of_nodes()',
                  val_batch_graphs[-1].number_of_nodes())
            val_batch = dgl.batch(val_batch_graphs)

            print('[app][train] train_batches size: ', len(train_batches))
            print('[app][train] train_batch_graphs size: ',
                  len(train_batch_graphs))
            print('[app][train] val_batch_graphs size: ',
                  len(val_batch_graphs))
            print('[app][train] train_batches', train_batches)
            print('[app][train] val_batch_labels', val_batch_labels)

            dur = []
            for epoch in range(self.learning_config['epochs']):
                self.model.train()
                if epoch >= 3:
                    t0 = time.time()
                losses = []
                training_accuracies = []
                for iter_idx, (bg, label) in enumerate(train_batches):
                    # print('~~~ [app][train] bg', bg)
                    logits = self.model(bg)
                    if self.learning_config['cuda']:
                        label = label.cuda()
                    loss = loss_fcn(logits, label)
                    losses.append(loss.item())
                    _, indices = torch.max(logits, dim=1)
                    # print('~~~~ logits', logits)
                    # print('------------------')
                    print('\t [app][train] indices', indices)
                    # print('\t label', label)
                    correct = torch.sum(indices == label)
                    training_accuracies.append(correct.item() * 1.0 /
                                               len(label))

                    optimizer.zero_grad()
                    loss.backward(retain_graph=True)
                    # loss.backward()
                    optimizer.step()

                if epoch >= 3:
                    dur.append(time.time() - t0)

                val_acc, val_loss, _ = self.model.eval_graph_classification(
                    val_batch_labels, val_batch)
                print(
                    "[app][train] Epoch {:05d} | Time(s) {:.4f} | train_acc {:.4f} | train_loss {:.4f} | val_acc {:.4f} | val_loss {:.4f}"
                    .format(epoch,
                            np.mean(dur) if dur else 0,
                            np.mean(training_accuracies), np.mean(losses),
                            val_acc, val_loss))

                is_better = self.early_stopping(val_loss, self.model,
                                                save_path)
                if is_better:
                    self.accuracies[k] = val_acc

                if self.early_stopping.early_stop:
                    # Print model's state_dict
                    # print("*** Model's state_dict:")
                    # for param_tensor in self.model.state_dict():
                    #     print(param_tensor, "\t", self.model.state_dict()[param_tensor].size())

                    # # Print optimizer's state_dict
                    # print("*** Optimizer's state_dict:")
                    # for var_name in optimizer.state_dict():
                    #     print(var_name, "\t", optimizer.state_dict()[var_name])

                    # Save state dict
                    # torch.save(self.model.state_dict(), save_dir+'/model_state.pt')

                    # Save model
                    # torch.save({
                    #     'epoch': epoch,
                    #     'model_state_dict': self.model.state_dict(),
                    #     'optimizer_state_dict': optimizer.state_dict(),
                    #     'val_loss': val_loss,
                    # }, save_dir+'/saved')

                    print("[app][train] Early stopping")
                    break

            self.early_stopping.reset()
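For reference, the k-fold windows above come from start = int(len(g_train) / K) * k and end = int(len(g_train) / K) * (k + 1), with fold k held out for validation and the rest used for training. A standalone sketch of that slicing on a hypothetical list of 10 items with K = 5:

# Hypothetical illustration of the fold slicing used in train() above.
g_train = list(range(10))  # stand-in for the shuffled training graphs
K = 5
for k in range(K):
    start = int(len(g_train) / K) * k
    end = int(len(g_train) / K) * (k + 1)
    val_fold = g_train[start:end]                  # held-out fold k
    train_folds = g_train[:start] + g_train[end:]  # remaining folds
    print(k, val_fold, train_folds)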
Example #7
    edge_df.to_pickle(f"../data/edge_angle/{molecule}.pkl")


if __name__ == "__main__":
    with utils.timer("make_feature_per_molecule"):
        for mode in ["train", "test"]:
            meta_df = pd.read_pickle(f"../pickle/{mode}.pkl").set_index("id")
            molecules = meta_df["molecule_name"].unique().tolist()
            st_df = pd.read_pickle("../pickle/structures.pkl")
            ## keep only the structures belonging to the train or valid molecules
            st_df = st_df[st_df.molecule_name.isin(molecules)]\
                    [["molecule_name","atom_index","atom","x","y","z"]]
            # process per molecule
            st_gr = st_df.groupby("molecule_name")
            st_dict = {}
            for molecule in tqdm(molecules):
                st_dict[molecule] = st_gr.get_group(molecule)
            all_file_num = len(molecules)
            with Pool(4) as p:
                res = p.map(make_per_molecule, molecules)
    with utils.timer("concatenate_molecules_feature"):
        for mode in ["train", "test"]:
            meta_df = pd.read_pickle(f"../pickle/{mode}.pkl").set_index("id")
            molecules = meta_df["molecule_name"].unique().tolist()
            df_list = []
            for molecule in tqdm(molecules):
                df_list.append(
                    utils.load_pickle(f"../data/edge_angle/{molecule}.pkl"))
            all_df = pd.concat(df_list).reset_index(drop=True)
            utils.save_pickle(all_df, f"../pickle/{mode}_edge_angle.pkl")
Example #8
    def donors_loop(self, donors):
        Comment = SeleniumChecker()
        total = len(donors)
        count = 0

        for donor in donors:
            count += 1

            if not donor: continue
            if donor in self.processed_donors: continue

            try:
                tries = 1
                print(getpid(), '{} of {}'.format(count, total), donor)
                Comment.get(donor)

                if not Comment.find_form():
                    self.save_error(donor, 'Form not found')
                else:
                    comment = random.choice(self.comments)
                    author = random.choice(self.usernames)
                    email = random.choice(self.emails)
                    acceptor = next(self.acceptors)

                    posted_data, screenshot_before, screenshot_after = self.post(
                        Comment, donor, acceptor, comment, author, email)

                    #login to google account, wp, facebook

                    # press button in iframe
                    # https://stackoverflow.com/questions/27793187/unable-to-click-on-a-button-inside-an-iframe

                    #http://jkuat.ac.ke/campuses/nairobicbd/student-email-portal-e-learning/
                    #https://brownsenglish.edu.au/blog/english-student-testimonial-kim/
                    #https://georgelakoff.com/2017/07/28/time-to-solve-the-student-debt-crisis/

                    while Comment.check_text(
                            'You are being asked to login because') != -1:
                        tries += 1
                        if tries > 7: break
                        Comment.wait()
                        Comment.get(donor)
                        Comment.find_form()
                        comment = random.choice(self.comments)
                        author = random.choice(self.usernames)
                        email = random.choice(self.emails)
                        posted_data, screenshot_before, screenshot_after = self.post(
                            Comment, donor, acceptor, comment, author, email)
                        Comment.unwait()

                    while Comment.check_text(
                            'Duplicate comment detected') != -1:
                        tries += 1
                        if tries > 7: break
                        Comment.wait()
                        Comment.get(donor)
                        Comment.find_form()
                        comment = random.choice(self.comments)
                        author = random.choice(self.usernames)
                        email = random.choice(self.emails)
                        acceptor = next(self.acceptors)
                        posted_data, screenshot_before, screenshot_after = self.post(
                            Comment, donor, acceptor, comment, author, email)
                        Comment.unwait()

                    while Comment.check_text(
                            'Forbidden. Sender blacklisted.') != -1:
                        tries += 1
                        if tries > 7: break
                        Comment.wait()
                        Comment.get(donor)
                        Comment.find_form()
                        comment = random.choice(self.comments)
                        author = random.choice(self.usernames)
                        email = random.choice(self.emails)
                        acceptor = next(self.acceptors)
                        posted_data, screenshot_before, screenshot_after = self.post(
                            Comment, donor, acceptor, comment, author, email)
                        Comment.unwait()

                    self.processed_donors.append(donor)
                    self.processed_sites.append(
                        dict(
                            success=True,
                            donor=donor,
                            acceptor=acceptor,
                            comment=comment,
                            author=author,
                            email=email,
                            posted_data=posted_data,
                            screenshot_before=screenshot_before,
                            screenshot_after=screenshot_after,
                        ))
                    save_pickle(self.tmp_file_name, self.processed_sites)
            except Exception as e:
                self.save_error(donor, str(e))
Example #9
    def _save_feature(self, train_df, test_df):
        new_cols = self._get_new_cols(train_df, test_df)
        self.train_path.parent.mkdir(exist_ok=True)
        self.test_path.parent.mkdir(exist_ok=True)
        utils.save_pickle(train_df[new_cols], self.train_path)
        utils.save_pickle(test_df[new_cols], self.test_path)
Example #10
                  right_on="atom_index",
                  how="left")
    del df["atom_index"]
    df = df.merge(atom_df,
                  left_on="atom_index_1",
                  right_on="atom_index",
                  how="left")
    del df["atom_index"]
    df.rename(columns={"atom_x": "atom_0", "atom_y": "atom_1"}, inplace=True)
    df["bond_atom"] = df["atom_0"].map(atom_dict) + df["atom_1"].map(atom_dict)
    df["bond_atom"] = df["bond_atom"].map(map_func)
    df["molecule_name"] = molecule_name
    return df


atom_dict = {0: "H", 1: "C", 2: "N", 3: "O", 4: "F"}
#mode = "train"
mode = "test"
meta_df = pd.read_pickle(f"../pickle/{mode}.pkl")
molecules = meta_df["molecule_name"].unique()

with Pool(4) as p:
    res = p.map(make_bond, molecules)

all_df = pd.concat(res, axis=0).reset_index(drop=True)
utils.save_pickle(all_df, f"../pickle/{mode}_bond_v2.pkl")
# save only connect info
only_bond_df = all_df.loc[all_df.bond_type != -1,
                          ["molecule_name", "atom_index_0", "atom_index_1"]]
utils.save_pickle(only_bond_df, f"../pickle/{mode}_bond_v2_only_bond.pkl")
Example #11
    def train(self,
              save_path='',
              k_fold=10,
              train_list_file=None,
              test_list_file=None):
        if self.pretrained_weight is not None:
            self.model = load_checkpoint(self.model, self.pretrained_weight)

        loss_fcn = torch.nn.CrossEntropyLoss()

        # initialize graphs
        self.accuracies = np.zeros(k_fold)
        graphs = self.data[GRAPH]  # load all the graphs

        # debug purposes: reshuffle all the data before the splitting
        random_indices = list(range(len(graphs)))
        random.shuffle(random_indices)
        graphs = [graphs[i] for i in random_indices]
        labels = self.labels[random_indices]
        graphs_names = [self.graphs_names[i] for i in random_indices]

        split_train_test = train_list_file is None and test_list_file is None
        print('split_train_test', split_train_test)
        print('train_list_file', train_list_file)
        print('test_list_file', test_list_file)

        if split_train_test is True:
            #############################
            # Create new train/test set
            # Split train and test
            #############################
            train_size = int(self.TRAIN_SIZE * len(graphs))
            g_train = graphs[:train_size]
            l_train = labels[:train_size]
            n_train = graphs_names[:train_size]

            g_test = graphs[train_size:]
            l_test = labels[train_size:]
            n_test = graphs_names[train_size:]

        else:
            #############################
            # Load train and test graphs from list
            #############################
            train_files = []
            test_files = []
            g_train = []
            l_train = []
            n_train = []
            g_test = []
            l_test = []
            n_test = []
            with open(train_list_file, 'r') as f:
                train_files = [l.strip() for l in f.readlines()]
            with open(test_list_file, 'r') as f:
                test_files = [l.strip() for l in f.readlines()]

            for i in range(len(labels)):
                graph_jsonpath = graphs_names[i]
                # print(graph_jsonpath)
                if graph_jsonpath in train_files:
                    g_train.append(graphs[i])
                    l_train.append(labels[i])
                    n_train.append(graphs_names[i])
                if graph_jsonpath in test_files:
                    g_test.append(graphs[i])
                    l_test.append(labels[i])
                    n_test.append(graphs_names[i])

            l_train = torch.Tensor(l_train).type(torch.LongTensor)
            l_test = torch.Tensor(l_test).type(torch.LongTensor)
            if self.is_cuda is True:
                l_train = l_train.cuda()
                l_test = l_test.cuda()

        # print('len g_train', len(g_train))
        # print('g_train', g_train)

        if not os.path.isdir(self.odir):
            os.makedirs(self.odir)
        save_pickle(g_train, os.path.join(self.odir, 'train'))
        save_pickle(l_train, os.path.join(self.odir, 'train_labels'))
        save_pickle(g_test, os.path.join(self.odir, 'test'))
        save_pickle(l_test, os.path.join(self.odir, 'test_labels'))

        # save graph name list to txt file
        save_txt(n_train, os.path.join(self.odir, 'train_list.txt'))
        save_txt(n_test, os.path.join(self.odir, 'test_list.txt'))

        K = k_fold
        for k in range(K):  # K-fold cross validation

            # create GNN model
            # self.model = Model(g=self.data[GRAPH],
            #                    config_params=self.model_config,
            #                    n_classes=self.data[N_CLASSES],
            #                    n_rels=self.data[N_RELS] if N_RELS in self.data else None,
            #                    n_entities=self.data[N_ENTITIES] if N_ENTITIES in self.data else None,
            #                    is_cuda=self.learning_config['cuda'],
            #                    seq_dim=self.seq_max_length,
            #                    batch_size=1)

            optimizer = torch.optim.Adam(
                self.model.parameters(),
                lr=self.learning_config['lr'],
                weight_decay=self.learning_config['weight_decay'])

            if self.learning_config['cuda']:
                self.model.cuda()

            start = int(len(g_train) / K) * k
            end = int(len(g_train) / K) * (k + 1)
            print('\n\n\nProcess new k=' + str(k) + ' | ' + str(start) + '-' +
                  str(end))

            # testing batch
            val_batch_graphs = g_train[start:end]
            val_batch_labels = l_train[start:end]
            val_batch = dgl.batch(val_batch_graphs)

            # training batch
            train_batch_graphs = g_train[:start] + g_train[end:]
            # index range must match the graph slice g_train[:start] + g_train[end:]
            train_batch_labels = l_train[list(range(0, start)) +
                                         list(range(end, len(g_train)))]
            train_batch_samples = list(
                map(list, zip(train_batch_graphs, train_batch_labels)))
            train_batches = DataLoader(
                train_batch_samples,
                batch_size=self.learning_config['batch_size'],
                shuffle=True,
                collate_fn=collate)

            print('train_batches size: ', len(train_batches))
            print('train_batch_graphs size: ', len(train_batch_graphs))
            print('val_batch_graphs size: ', len(val_batch_graphs))
            print('train_batches', train_batches)
            print('val_batch_labels', val_batch_labels)

            dur = []
            for epoch in range(self.learning_config['epochs']):
                self.model.train()
                if epoch >= 3:
                    t0 = time.time()
                losses = []
                training_accuracies = []
                for iter_idx, (bg, label) in enumerate(train_batches):
                    logits = self.model(bg)
                    if self.learning_config['cuda']:
                        label = label.cuda()
                    loss = loss_fcn(logits, label)
                    losses.append(loss.item())
                    _, indices = torch.max(logits, dim=1)
                    correct = torch.sum(indices == label)
                    training_accuracies.append(correct.item() * 1.0 /
                                               len(label))

                    optimizer.zero_grad()
                    loss.backward(retain_graph=True)
                    optimizer.step()

                if epoch >= 3:
                    dur.append(time.time() - t0)

                val_acc, val_loss, _ = self.model.eval_graph_classification(
                    val_batch_labels, val_batch)
                print(
                    "Epoch {:05d} | Time(s) {:.4f} | train_acc {:.4f} | train_loss {:.4f} | val_acc {:.4f} | val_loss {:.4f}"
                    .format(epoch,
                            np.mean(dur) if dur else 0,
                            np.mean(training_accuracies), np.mean(losses),
                            val_acc, val_loss))

                is_better = self.early_stopping(val_loss, self.model,
                                                save_path)
                if is_better:
                    self.accuracies[k] = val_acc

                if self.early_stopping.early_stop:
                    print("Early stopping")
                    break

            self.early_stopping.reset()
Example #12
def gen_tokenizer(save_path,
                  pretrained_model='google/electra-small-discriminator'):
    tokenizer = ElectraTokenizer.from_pretrained(pretrained_model)
    tokenizer.add_tokens(['[E]', '[/E]'])
    save_pickle(save_path, data=tokenizer)
Example #13
    def __init__(self,
                 data,
                 model_config,
                 learning_config,
                 pretrained_weight,
                 early_stopping=True,
                 patience=100,
                 json_path=None,
                 pickle_folder=None,
                 vocab_path=None,
                 mapping_path=None,
                 odir=None,
                 model_src_path=None):
        if model_src_path is not None:
            sys.path.insert(0, model_src_path)
            print('*** model_src_path', model_src_path)
            from model_edgnn import Model
        else:
            from models.model import Model

        self.data = data
        self.model_config = model_config
        # max length of a sequence (max nodes among graphs)
        self.learning_config = learning_config
        self.pretrained_weight = pretrained_weight
        self.is_cuda = learning_config['cuda']

        # with open(vocab_path+'/../mapping.json', 'r') as f:
        with open(mapping_path, 'r') as f:
            self.mapping = json.load(f)

        self.labels = self.data[LABELS]
        self.graphs_names = self.data[GNAMES]

        self.data_graph = self.data[GRAPH]

        # save nid and eid to nodes & edges
        # print('self.data_graph[0]', self.data_graph[0])
        if 'nid' not in self.data_graph[0].ndata:
            # if True:
            for k, g in enumerate(self.data_graph):
                g = self.write_nid_eid(g)
                self.data_graph[k] = g
            # print('self.data_graph', self.data_graph)
        save_pickle(self.data_graph, os.path.join(pickle_folder, GRAPH))

        data_nclasses = self.data[N_CLASSES]
        if N_RELS in self.data:
            data_nrels = self.data[N_RELS]
        else:
            data_nrels = None

        if N_ENTITIES in self.data:
            data_nentities = self.data[N_ENTITIES]
        else:
            data_nentities = None

        self.model = Model(g=self.data_graph[0],
                           config_params=model_config,
                           n_classes=data_nclasses,
                           n_rels=data_nrels,
                           n_entities=data_nentities,
                           is_cuda=self.is_cuda,
                           batch_size=1,
                           json_path=json_path,
                           vocab_path=vocab_path,
                           model_src_path=model_src_path)

        if self.is_cuda is True:
            # self.model.cuda()
            print('Use cuda')
            self.model.to(torch.device('cuda'))

        print('*** Model parameters ***')
        pp = 0
        for p in list(self.model.parameters()):
            nn = 1
            for s in list(p.size()):
                # print('p', p)
                print('\t s, nn, nn*s', s, nn, nn * s)
                nn = nn * s
            pp += nn
        print('Total params', pp)

        if early_stopping:
            self.early_stopping = EarlyStopping(patience=patience,
                                                verbose=True)

        # Output folder to save train / test data
        if odir is None:
            odir = 'output/' + time.strftime("%Y-%m-%d_%H-%M-%S")
        self.odir = odir
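The parameter-count loop above multiplies each tensor's shape out by hand; torch provides numel() for the same total. A minimal sketch on a throwaway module (not the Model used above):

import torch

# Count trainable parameters with numel() on a small stand-in module.
model = torch.nn.Linear(4, 3)
total = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('Total params', total)  # 4*3 weights + 3 biases = 15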
Example #14
    def __init__(self,
                 data,
                 model_config,
                 learning_config,
                 pretrained_weight,
                 early_stopping=True,
                 patience=100,
                 json_path=None,
                 pickle_folder=None,
                 vocab_path=None,
                 mapping_path=None,
                 odir=None,
                 model_src_path=None,
                 append_nid_eid=False,
                 gdot_path=None):
        if model_src_path is not None:
            sys.path.insert(1, model_src_path)
            print('*** model_src_path', model_src_path)
            from model_edgnn_o import Model
        else:
            from models.model_edgnn_o import Model

        self.data = data
        self.model_config = model_config
        # max length of a sequence (max nodes among graphs)
        self.learning_config = learning_config
        self.pretrained_weight = pretrained_weight
        self.is_cuda = learning_config['cuda']

        # with open(vocab_path+'/../mapping.json', 'r') as f:
        with open(mapping_path, 'r') as f:
            self.mapping = json.load(f)

        # print('[App][__init__] GNAMES', GNAMES)
        # print('[App][__init__] self.data', self.data)
        self.graphs_names = self.data[GNAMES]
        # print('[App][__init__] self.graphs_names', self.graphs_names)

        self.data_graph = self.data[GRAPH]

        # save nid and eid to nodes & edges
        if append_nid_eid is True:
            print('self.data_graph[0]', self.data_graph[0])
            if 'nid' not in self.data_graph[0].ndata:
                print('*** Not found nid. Appending...')
                # if True:
                for k, g in enumerate(self.data_graph):
                    g = self.write_nid_eid(g)
                    self.data_graph[k] = g
                # print('self.data_graph', self.data_graph)
            save_pickle(self.data_graph, os.path.join(pickle_folder, GRAPH))

        # data_nclasses = self.data[N_CLASSES]
        data_nclasses = 2
        if N_RELS in self.data:
            data_nrels = self.data[N_RELS]
        else:
            data_nrels = None

        if N_ENTITIES in self.data:
            data_nentities = self.data[N_ENTITIES]
        else:
            data_nentities = None

        self.model = Model(g=self.data_graph[0],
                           config_params=self.model_config,
                           n_classes=data_nclasses,
                           n_rels=data_nrels,
                           n_entities=data_nentities,
                           is_cuda=self.is_cuda,
                           batch_size=1,
                           model_src_path=model_src_path,
                           gdot_path=gdot_path)

        if self.is_cuda is True:
            # self.model.cuda()
            print('* Use cuda')
            self.model.to(torch.device('cuda'))

        # print('*** Model parameters ***')
        # pp = 0
        # print('self.model', self.model)

        # # self.e_src_attn_layer = self.model.edgnn_layers[1].e_src_attn_fc
        # # self.e_group_attn_layer = self.model.edgnn_layers[1].e_group_attn_fc
        # # # self.dst_attn_layer = self.model.edgnn_layers[1].dst_attn_fc
        # # print('self.e_group_attn_layer', self.e_group_attn_layer)
        # # print('self.e_src_attn_layer', self.e_src_attn_layer)
        # # # print('self.dst_attn_layer', self.dst_attn_layer)
        # # self.e_group_attn_layer.register_forward_hook(self.hook)
        # # self.e_src_attn_layer.register_forward_hook(self.hook2)
        # # self.dst_attn_layer.register_forward_hook(self.hook)

        # for p in list(self.model.parameters()):
        #     nn = 1
        #     for s in list(p.size()):
        #         # print('p', p)
        #         # print('\t s, nn, nn*s', s, nn, nn*s)
        #         nn = nn*s
        #     pp += nn
        # print('Total params', pp)

        if early_stopping:
            self.early_stopping = EarlyStopping(patience=patience,
                                                verbose=True)

        # Output folder to save train / test data
        if odir is None:
            odir = 'output/' + time.strftime("%Y-%m-%d_%H-%M-%S")
        self.odir = odir