Example #1
def create_ngram2subject(subject2name, save_dir):
    ngram2subject = {}
    name2subject = {}
    for subject_id, subject_names in subject2name.items():
        for subject_name in subject_names:
            if subject_name not in name2subject:
                name2subject[subject_name] = [subject_id]
            else:
                name2subject[subject_name].append(subject_id)

            name_ngrams = get_name_ngrams(subject_name)

            for ngram_tuple in name_ngrams:
                ngram = ' '.join(ngram_tuple)
                if ngram in ngram2subject.keys():
                    ngram2subject[ngram].append((subject_id, subject_name))
                else:
                    ngram2subject[ngram] = [(subject_id, subject_name)]

    print('num of subject names: ', len(name2subject))
    print('examples of name2subject: ', list(name2subject.items())[:10])
    print('num of ngram: ', len(ngram2subject))
    print('examples of ngram2subject: ', list(ngram2subject.items())[:10])

    print('save ngram2subject in pickle format...')
    pickle_save(ngram2subject, os.path.join(save_dir, 'ngram2subject.pkl'))
    print('save name2subject in pickle format...')
    pickle_save(name2subject, os.path.join(save_dir, 'name2subject.pkl'))
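All of the snippets on this page call a project-specific pickle_save helper (often paired with a pickle_load), whose definition is never shown; note that the argument order even varies between projects (some call pickle_save(obj, path), others pickle_save(path, obj), and Example #15 uses keyword arguments). A minimal sketch of such a helper pair, assuming the (obj, path) order used in Example #1, is:

import pickle


def pickle_save(obj, path):
    # Minimal sketch, not the original helper: serialize obj to path.
    with open(path, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)


def pickle_load(path):
    # Load and return a pickled object from path.
    with open(path, 'rb') as f:
        return pickle.load(f)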
Example #2
    def __init__(self, hparams, **kwargs):
        if hparams.sortish_sampler and hparams.gpus > 1:
            hparams.replace_sampler_ddp = False
        super().__init__(hparams, num_labels=None, mode=self.mode, **kwargs)
        use_task_specific_params(self.model, "summarization")
        save_git_info(self.hparams.output_dir)
        self.metrics_save_path = Path(self.output_dir) / "metrics.json"
        self.hparams_save_path = Path(self.output_dir) / "hparams.pkl"
        pickle_save(self.hparams, self.hparams_save_path)
        self.step_count = 0
        self.metrics = defaultdict(list)

        self.dataset_kwargs: dict = dict(
            data_dir=self.hparams.data_dir,
            max_source_length=self.hparams.max_source_length,
            prefix=self.model.config.prefix or "",
        )
        n_observations_per_split = {
            "train": self.hparams.n_train,
            "val": self.hparams.n_val,
            "test": self.hparams.n_test,
        }
        self.n_obs = {
            k: v if v >= 0 else None
            for k, v in n_observations_per_split.items()
        }

        self.target_lens = {
            "train": self.hparams.max_target_length,
            "val": self.hparams.val_max_target_length,
            "test": self.hparams.test_max_target_length,
        }
        assert self.target_lens["train"] <= self.target_lens[
            "val"], f"target_lens: {self.target_lens}"
        assert self.target_lens["train"] <= self.target_lens[
            "test"], f"target_lens: {self.target_lens}"

        if self.hparams.freeze_embeds:
            self.freeze_embeds()
        if self.hparams.freeze_encoder:
            freeze_params(self.model.get_encoder())
            assert_all_frozen(self.model.get_encoder())

        self.hparams.git_sha = get_git_info()["repo_sha"]
        self.num_workers = hparams.num_workers
        self.decoder_start_token_id = None  # default to config
        if self.model.config.decoder_start_token_id is None and isinstance(
                self.tokenizer, MBartTokenizer):
            self.decoder_start_token_id = self.tokenizer.lang_code_to_id[
                hparams.tgt_lang]
            self.model.config.decoder_start_token_id = self.decoder_start_token_id
        self.dataset_class = (Seq2SeqDataset if hasattr(
            self.tokenizer, "prepare_seq2seq_batch") else LegacySeq2SeqDataset)
        self.eval_beams = self.model.config.num_beams if self.hparams.eval_beams is None else self.hparams.eval_beams
        assert self.eval_beams >= 1, f"got self.eval_beams={self.eval_beams}. Need an integer >= 1"
        if self.hparams.eval_max_gen_length is not None:
            self.eval_max_length = self.hparams.eval_max_gen_length
        else:
            self.eval_max_length = self.model.config.max_length
        self.val_metric = self.default_val_metric if self.hparams.val_metric is None else self.hparams.val_metric
Example #3
def save_string_int_dict():
    # Save and return the mapping dicts for non-terminals and terminals.
    # For terminals, keep only the 30,000 most frequent tokens.
    tt_token_to_int = {}
    tt_int_to_token = {}
    nt_token_to_int = {}
    nt_int_to_token = {}

    import pickle
    pickle.dump([terminal_count],
                open('js_dataset/rename_variable/terminal_counter.pkl', 'wb'))

    most_common_tuple = terminal_count.most_common(most_common_termial_num)
    for index, (token, times) in enumerate(most_common_tuple):
        tt_token_to_int[token] = index
        tt_int_to_token[index] = token
    for index, token in enumerate(list(non_terminal_set)):
        nt_token_to_int[token] = index
        nt_int_to_token[index] = token

    tt_int_to_token[len(tt_int_to_token)] = unknown_token  # add UNK to the terminal vocabulary
    tt_token_to_int[unknown_token] = len(tt_token_to_int)

    utils.pickle_save(
        data_parameter_dir,
        [tt_token_to_int, tt_int_to_token, nt_token_to_int, nt_int_to_token
         ])  # save the mapping dicts to disk
    return tt_token_to_int, tt_int_to_token, nt_token_to_int, nt_int_to_token
Example #4
def save_len_file(
    tokenizer_name, data_dir, max_source_length=1024, max_target_length=1024, consider_target=False, **kwargs
):
    """Save max(src_len, tgt_len) for each example to allow dynamic batching."""
    tok = AutoTokenizer.from_pretrained(tokenizer_name)
    train_ds = Seq2SeqDataset(tok, data_dir, max_source_length, max_target_length, type_path="train", **kwargs)
    pad = tok.pad_token_id

    def get_lens(ds):
        dl = tqdm(
            DataLoader(ds, batch_size=512, num_workers=8, shuffle=False, collate_fn=ds.collate_fn),
            desc=str(ds.len_file),
        )
        max_lens = []
        for batch in dl:
            src_lens = batch["input_ids"].ne(pad).sum(1).tolist()
            tgt_lens = batch["labels"].ne(pad).sum(1).tolist()
            if consider_target:
                for src, tgt in zip(src_lens, tgt_lens):
                    max_lens.append(max(src, tgt))
            else:
                max_lens.extend(src_lens)
        return max_lens

    train_lens = get_lens(train_ds)
    val_ds = Seq2SeqDataset(tok, data_dir, max_source_length, max_target_length, type_path="val", **kwargs)
    val_lens = get_lens(val_ds)
    pickle_save(train_lens, train_ds.len_file)
    pickle_save(val_lens, val_ds.len_file)
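save_len_file only needs a tokenizer name and a data directory; the other arguments have defaults. A hypothetical invocation (the model name and data path below are illustrative, not taken from any example on this page) could look like:

# Writes a length file per split next to the dataset so a length-aware
# sampler can later batch examples of similar size.
save_len_file(
    "facebook/bart-large-cnn",   # any name accepted by AutoTokenizer.from_pretrained
    "data/cnn_dm",               # directory with the train/val source and target files
    max_source_length=1024,
    max_target_length=56,
    consider_target=True,
)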
Example #5
def create_train_pickle():
    print("start create train pickle file...")
    train_data = open(fileConfig.data_dir + fileConfig.file_weibo_train_data,
                      "r",
                      encoding="utf-8")
    out_datas = []
    for line in tqdm(train_data, "deal train file..."):
        train_infos = line.split('\t')
        assert len(train_infos) == 7
        out_datas.append({
            comConfig.col_uid: train_infos[0],
            comConfig.col_mid: train_infos[1],
            comConfig.col_time: train_infos[2],
            comConfig.col_forward_count: train_infos[3],
            comConfig.col_comment_count: train_infos[4],
            comConfig.col_like_count: train_infos[5],
            comConfig.col_content: train_infos[6]
        })
    print("save train and test infos...")
    test_data_len = 200000
    train_datas = out_datas[:len(out_datas) - test_data_len]
    test_datas = out_datas[len(out_datas) - test_data_len:]
    utils.check_dir(fileConfig.pickle_dir)
    utils.pickle_save(train_datas,
                      fileConfig.pickle_dir + fileConfig.file_train_pickle)
    utils.pickle_save(test_datas,
                      fileConfig.pickle_dir + fileConfig.file_test_pickle)
Example #6
 def save(self, path, name):
     if self.X is not None:
         np.save(os.path.join(path, "dataset_X_{}.npy".format(name)),
                 self.X)
     if self.y is not None:
         np.save(os.path.join(path, "dataset_y_{}.npy".format(name)),
                 self.y)
     if self.image_features is not None:
         pickle_save(
             self.image_features,
             os.path.join(path,
                          "dataset_image_features_{}.pkl".format(name)))
     if self.X_integral is not None:
         np.save(
             os.path.join(path, "dataset_X_integral_{}.npy".format(name)),
             self.X_integral)
     if self.X_features is not None:
         np.save(
             os.path.join(path, "dataset_X_features_{}.npy".format(name)),
             self.X_features)
     if self.X_features_sorted is not None:
         np.save(
             os.path.join(path,
                          "dataset_X_features_sorted_{}.npy".format(name)),
             self.X_features_sorted)
     if self.X_features_sorted_indices is not None:
         np.save(
             os.path.join(
                 path,
                 "dataset_X_features_sorted_indices_{}.npy".format(name)),
             self.X_features_sorted_indices)
Example #7
def compute_city(cityid):
    reader = shapefile.Reader(path.join(LATLNGS_SHP_DIR, str(cityid)))
    writer = shapefile.Writer(shapefile.POINT)
    writer.autoBalance = 1
    writer.field('price', 'N')
    writer.field('income', 'N')
    writer.field('age', 'N')

    total = len(reader.shapeRecords())
    count = 0
    for sr in reader.shapeRecords():
        point = sr.shape.points[0]
        price = sr.record[0]
        income = get_variable(point[1], point[0], BLOCK_DATA_CACHE[cityid],
                              INCOME_VARIABLE)
        age = get_variable(point[1], point[0], BLOCK_DATA_CACHE[cityid],
                           AGE_VARIABLE)
        writer.point(point[0], point[1])
        writer.record(price, income, age)
        count += 1
        if count % 100 == 0:
            print 'Processed %d out of %d' % (count, total)
            pickle_save(CENSUS_DATA_CACHE,
                        path.join(CACHE_DIR, 'census_data_cache'))
            pickle_save(BLOCK_DATA_CACHE, path.join(CACHE_DIR, 'block_data'))

    writer.save(path.join(LATLNGS_SHP_DIR, str(cityid) + '_age'))
Example #8
def get_name_type_for_subject(entity2name_path, entity2type_path, triple_path,
                              save_dir):
    entity2name = pickle_load(entity2name_path)
    entity2type = pickle_load(entity2type_path)
    triples = pickle_load(triple_path)

    has_name_count = 0
    has_type_count = 0
    subject2name = {}

    subject2type = {}

    for index, subject in enumerate(triples.keys()):
        if subject in entity2name:
            subject2name[subject] = entity2name[subject]
            has_name_count += 1

        if subject in entity2type:
            subject2type[subject] = entity2type[subject]
            has_type_count += 1

    print(has_name_count, len(triples.keys()))
    print(has_type_count, len(triples.keys()))

    pickle_save(subject2name, os.path.join(save_dir, 'trim_subject2name.pkl'))
    pickle_save(subject2type, os.path.join(save_dir, 'trim_subject2type.pkl'))
Example #9
 def train(self):
     self.feature_model = Feature()
     feature_list = []
     label_list = []
     sen_list = []
     self.loading_none_spliter_rule(feature_list, label_list, sen_list)
     self.loading_forcing_spliter_rule()
     self.load_normal_data(feature_list, label_list, sen_list)
     self.classifier = LogisticRegression(verbose=False)
     print "Learning..."
     self.classifier.fit(feature_list, label_list)
     print "Saving..."
     utils.pickle_save(self, self.model_path)
     print "Done"
     print "Test..."
     #f = open("wrong.dat","w")
     predicted_labels = self.classifier.predict(feature_list)
     ll = len(predicted_labels)
     cc = 0
     for i in xrange(ll):
         if label_list[i] == 0 and predicted_labels[i] == 1:
             cc += 1
             #print sen_list[i]
             #f.write("%s\n"%sen_list[i])
     #f.close()
     print cc, ll, cc * 1.0 / ll
Example #10
def main(args, model=None) -> SummarizationModule:
    Path(args.output_dir).mkdir(exist_ok=True)
    if len(os.listdir(args.output_dir)) > 3 and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))

    # summarization model
    model: SummarizationModule = SummarizationModule(args)

    dataset = Path(args.data_dir).name
    if (args.logger_name == "default" or args.fast_dev_run
            or str(args.output_dir).startswith("/tmp")
            or str(args.output_dir).startswith("/var")):
        logger = True  # don't pollute wandb logs unnecessarily
    elif args.logger_name == "wandb":
        from pytorch_lightning.loggers import WandbLogger

        project = os.environ.get("WANDB_PROJECT", dataset)
        logger = WandbLogger(name=model.output_dir.name, project=project)

    elif args.logger_name == "wandb_shared":
        from pytorch_lightning.loggers import WandbLogger

        logger = WandbLogger(name=model.output_dir.name,
                             project=f"hf_{dataset}")

    if args.early_stopping_patience >= 0:
        es_callback = get_early_stopping_callback(model.val_metric,
                                                  args.early_stopping_patience)
    else:
        es_callback = False

    trainer: pl.Trainer = generic_train(
        model,
        args,
        logging_callback=Seq2SeqLoggingCallback(),
        checkpoint_callback=get_checkpoint_callback(args.output_dir,
                                                    model.val_metric,
                                                    args.save_top_k),
        early_stopping_callback=es_callback,
        logger=logger,
    )
    pickle_save(model.hparams, model.output_dir / "hparams.pkl")
    if not args.do_predict:
        return model

    model.hparams.test_checkpoint = ""
    checkpoints = list(
        sorted(
            glob.glob(os.path.join(args.output_dir, "*.ckpt"),
                      recursive=True)))
    if checkpoints:
        model.hparams.test_checkpoint = checkpoints[-1]
        trainer.resume_from_checkpoint = checkpoints[-1]
    trainer.logger.log_hyperparams(model.hparams)

    # test() without a model tests using the best checkpoint automatically
    trainer.test()
    return model
Example #11
def show_results(evaluations, metric):
    """ Plot evaluation results with error bar."""
    fig, ax = plt.subplots()
    colors = cm.Dark2(np.linspace(0, 1, len(evaluations)))
    results = {}
    for i, evaluation in enumerate(evaluations):
        res = evaluation.get_results(metric)
        mean, std = np.nanmean(res, axis=0), np.nanstd(res, axis=0)
        ax.errorbar(np.arange(mean.shape[0]), mean, yerr=std, color=colors[i], label=evaluation.name, fmt='-o')
        results[evaluation.name] = res

    # store the results on disk
    pwd = os.path.dirname(os.path.realpath(__file__))
    folder = '/results/' if evaluations[0].cache_folder == '' else \
        '/results/{}/'.format(evaluations[0].cache_folder)
    folder = pwd + folder
    pickle_save(folder+'/measurement_lost_{}_{}_results.pkl'.format(evaluations[0].models[0].measurement_lost, metric), results)

    # Now add the legend with some customizations.
    legend = ax.legend(loc='upper right')

    # Set the fontsize
    for label in legend.get_texts():
        label.set_fontsize('small')

    plt.show()
Example #12
def nt_seq_to_int(time_steps=50, status='TRAIN'):
    # Further process the NT sequences: first map every token to an integer.
    # For train and valid data, extend all ast-seqs into one flat list for easier format conversion during training.
    # For test data, append each ast-seq so every AST keeps its own independent sequence.
    tt_token_to_int, tt_int_to_token, nt_token_to_int, nt_int_to_token = \
        pickle.load(open('js_dataset/rename_variable/rename_parameter.pkl', 'rb'))
    total_num_nt_pair = 0
    if status == 'TRAIN':
        sub_data_dir = sub_train_data_dir
        num_sub_data = num_sub_train_data
        sub_int_data_dir = sub_int_train_dir
    elif status == 'VALID':
        sub_data_dir = sub_valid_data_dir
        num_sub_data = num_sub_valid_data
        sub_int_data_dir = sub_int_valid_dir
    elif status == 'TEST':
        sub_data_dir = sub_test_data_dir
        num_sub_data = num_sub_test_data
        sub_int_data_dir = sub_int_test_dir
    else:
        print('ERROR! Unknown command!!')
        sys.exit(1)

    def get_subset_data():  # read and yield each part's nt_sequence for processing
        for i in range(1, num_sub_data + 1):
            data_path = sub_data_dir + 'part{}.json'.format(i)
            data = utils.pickle_load(data_path)
            yield (i, data)

    subset_generator = get_subset_data()
    for index, data in subset_generator:
        data_seq = []
        for one_ast in data:  # truncate each nt_seq, encode it as integers, then save
            if len(one_ast) < time_steps:  # discard ASTs shorter than time_steps
                continue
            try:
                nt_int_seq = [
                    (nt_token_to_int[n],
                     tt_token_to_int.get(t, tt_token_to_int[unknown_token]))
                    for n, t in one_ast
                ]
            except KeyError:
                print('key error')
                continue
            # in train and valid, all ast-seqs are extended into one list; in test, each ast-seq is kept separate
            if status == 'TEST':
                data_seq.append(nt_int_seq)
                total_num_nt_pair += len(nt_int_seq)
            else:
                data_seq.extend(nt_int_seq)
                total_num_nt_pair += len(nt_int_seq)

        one_sub_int_data_dir = sub_int_data_dir + 'int_part{}.json'.format(
            index)
        utils.pickle_save(one_sub_int_data_dir, data_seq)
    # old:14,976,250  new:157,237,460  size of training dataset comparison
    # old: 1,557,285  new: 81,078,099  size of test dataset comparison
    print('There are {} nt_pair in {} dataset...'.format(
        total_num_nt_pair, status))
Example #13
def main(args, model=None) -> SummarizationModule:
    Path(args.output_dir).mkdir(exist_ok=True)
    check_output_dir(args, expected_items=3)
    if model is None:
        if "summarization" in args.task:
            model: SummarizationModule = SummarizationModule(args)
        else:
            model: SummarizationModule = TranslationModule(args)
    dataset = Path(args.data_dir).name
    if (
        args.logger_name == "default"
        or args.fast_dev_run
        or str(args.output_dir).startswith("/tmp")
        or str(args.output_dir).startswith("/var")
    ):
        from pytorch_lightning.loggers import CSVLogger
        logger = CSVLogger('chen_logs', name='SCHWEIGEN')  # don't pollute wandb logs unnecessarily
    elif args.logger_name == "wandb":
        from pytorch_lightning.loggers import WandbLogger

        project = os.environ.get("WANDB_PROJECT", dataset)
        logger = WandbLogger(name=model.output_dir.name, project=project)

    elif args.logger_name == "wandb_shared":
        from pytorch_lightning.loggers import WandbLogger

        logger = WandbLogger(name=model.output_dir.name, project=f"hf_{dataset}")

    if args.early_stopping_patience >= 0:
        es_callback = get_early_stopping_callback(model.val_metric, args.early_stopping_patience)
    else:
        es_callback = False

    lower_is_better = args.val_metric == "loss"
    trainer: pl.Trainer = generic_train(
        model,
        args,
        logging_callback=Seq2SeqLoggingCallback(),
        checkpoint_callback=get_checkpoint_callback(
            args.output_dir, model.val_metric, args.save_top_k, lower_is_better
        ),
        early_stopping_callback=es_callback,
        logger=logger,
    )
    pickle_save(model.hparams, model.output_dir / "hparams.pkl")
    if not args.do_predict:
        return model

    model.hparams.test_checkpoint = ""
    checkpoints = list(sorted(glob.glob(os.path.join(args.output_dir, "*.ckpt"), recursive=True)))
    if checkpoints:
        model.hparams.test_checkpoint = checkpoints[-1]
        trainer.resume_from_checkpoint = checkpoints[-1]
    trainer.logger.log_hyperparams(model.hparams)

    # test() without a model tests using the best checkpoint automatically
    trainer.test()
    return model
Example #14
def debug_build_features(count_only=False):
    all_features = Trainer.build_features(2, 2)
    print(len(all_features))
    if count_only:
        return
    for feature in all_features:
        print(feature)
    pickle_save(all_features, "all_features.pkl")
    feature_reloaded = pickle_load('all_features.pkl')
    assert feature_reloaded == all_features
Example #15
def generate_all_ids():
    """ Generate and store IDs of Teams and Players """
    # Teams
    dict_team_ids = get_team_ids_dictionary()
    utils.pickle_save(data_obj=dict_team_ids, filename='ids_of_teams.pkl')

    # Players
    dict_player_ids = get_player_ids_dictionary()
    utils.pickle_save(data_obj=dict_player_ids, filename='ids_of_players.pkl')
    return None
Example #16
    def _save_state(self):
        obj = {}
        obj["layer_index"] = self.layer_index
        obj["iter"] = self.iter
        obj["loss"] = self.loss
        obj["mlp_best"] = self.mlp_best
        obj["mlp_crrnt"] = self.mlp_crrnt
        #obj["iters_without_impr"] = self.iters_without_impr
        obj["train_sets"] = self.train_sets.get_state()

        utils.pickle_save(obj, self.wdir + "/layerwisetrainer_state")
Example #17
    def __init__(self, hparams, **kwargs):
        super().__init__(hparams, num_labels=None, mode=self.mode, **kwargs)
        use_task_specific_params(self.model, "summarization")
        save_git_info(self.hparams.output_dir)
        self.metrics_save_path = Path(self.output_dir) / "metrics.json"
        self.hparams_save_path = Path(self.output_dir) / "hparams.pkl"
        pickle_save(self.hparams, self.hparams_save_path)
        self.step_count = 0
        self.metrics = defaultdict(list)

        self.dataset_kwargs: dict = dict(
            data_dir=self.hparams.data_dir,
            max_source_length=self.hparams.max_source_length,
            prefix=self.model.config.prefix or "",
        )
        n_observations_per_split = {
            "train": self.hparams.n_train,
            "val": self.hparams.n_val,
            "test": self.hparams.n_test,
        }
        self.n_obs = {
            k: v if v >= 0 else None
            for k, v in n_observations_per_split.items()
        }

        self.target_lens = {
            "train": self.hparams.max_target_length,
            "val": self.hparams.val_max_target_length,
            "test": self.hparams.test_max_target_length,
        }
        assert self.target_lens["train"] <= self.target_lens[
            "val"], f"target_lens: {self.target_lens}"
        assert self.target_lens["train"] <= self.target_lens[
            "test"], f"target_lens: {self.target_lens}"

        if self.hparams.freeze_embeds:
            self.freeze_embeds()
        if self.hparams.freeze_encoder:
            freeze_params(self.model.get_encoder())
            assert_all_frozen(self.model.get_encoder())

        self.hparams.git_sha = get_git_info()["repo_sha"]
        self.num_workers = hparams.num_workers
        self.decoder_start_token_id = None
        if self.model.config.decoder_start_token_id is None and isinstance(
                self.tokenizer, MBartTokenizer):
            self.decoder_start_token_id = self.tokenizer.lang_code_to_id[
                hparams.tgt_lang]
            self.model.config.decoder_start_token_id = self.decoder_start_token_id
        if isinstance(self.tokenizer, MBartTokenizer) or isinstance(
                self.tokenizer, MarianTokenizer):
            self.dataset_class = TranslationDataset
        else:
            self.dataset_class = Seq2SeqDataset
Example #18
    def _save_state(self):
        obj = {}
        obj["layer_index"] = self.layer_index
        obj["iter"] = self.iter
        obj["loss"] = self.loss
        obj["mlp_best"] = self.mlp_best
        obj["mlp_crrnt"] = self.mlp_crrnt
        #obj["iters_without_impr"] = self.iters_without_impr
        obj["train_sets"] = self.train_sets.get_state()

        utils.pickle_save(obj, self.wdir+"/layerwisetrainer_state")
Example #19
def main(args, model=None) -> SummarizationModule:
    Path(args.output_dir).mkdir(exist_ok=True)
    if len(os.listdir(args.output_dir)) > 3 and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if model is None:
        if args.task == "summarization":
            model: SummarizationModule = SummarizationModule(args)
        else:
            model: SummarizationModule = TranslationModule(args)

    dataset = Path(args.data_dir).name
    if (args.logger == "default" or args.fast_dev_run
            or str(args.output_dir).startswith("/tmp")
            or str(args.output_dir).startswith("/var")):
        logger = True  # don't pollute wandb logs unnecessarily
    elif args.logger == "wandb":
        from pytorch_lightning.loggers import WandbLogger

        logger = WandbLogger(name=model.output_dir.name, project=dataset)

    elif args.logger == "wandb_shared":
        from pytorch_lightning.loggers import WandbLogger

        logger = WandbLogger(name=model.output_dir.name,
                             project=f"hf_{dataset}")
    trainer: pl.Trainer = generic_train(
        model,
        args,
        logging_callback=Seq2SeqLoggingCallback(),
        checkpoint_callback=get_checkpoint_callback(args.output_dir,
                                                    model.val_metric),
        logger=logger,
        # TODO: early stopping callback seems messed up
    )
    pickle_save(model.hparams, model.output_dir / "hparams.pkl")
    if not args.do_predict:
        return model

    model.hparams.test_checkpoint = ""
    checkpoints = list(
        sorted(
            glob.glob(os.path.join(args.output_dir, "*.ckpt"),
                      recursive=True)))
    if checkpoints:
        model.hparams.test_checkpoint = checkpoints[-1]
        trainer.resume_from_checkpoint = checkpoints[-1]
    trainer.logger.log_hyperparams(model.hparams)
    trainer.test(
        model
    )  # this breaks in DDP, known lightning issue. See evaluate_checkpoint to recover metrics.
    return model
Example #20
    def _get_trajs(self):
        """Get trajectories either from result folder or generate them."""

        distances_file_name, traj_file_name = '/distances.npy', '/trajectories.pkl'
        distances_path, traj_path = self.folder_path+distances_file_name, self.folder_path+traj_file_name
        if os.path.isfile(distances_path) and os.path.isfile(traj_path):
            print("loading trajectories...")
            distances, trajs = np.load(distances_path), pickle_load(traj_path)
        else:
            distances, trajs = self._generate_trajs()
            np.save(distances_path, distances)
            pickle_save(traj_path, trajs)
        return distances, trajs
Example #21
    def _save_state(self):
        obj = {}
        obj["iter"] = self.iter
        obj["done"] = self.done
        obj["loss"] = self.loss
        obj["rate"] = self.rate
        obj["mlp_best"] = self.mlp_best
        obj["halving"] = self.halving
        obj["wasAccepted"] = self.wasAccepted
        obj["train_sets"] = self.train_sets.get_state()
        obj["valid_sets"] = self.valid_sets.get_state()

        utils.pickle_save(obj, self.wdir + "/trainer_state")
Example #22
def add_temporary_to_cls():
    global temp_cls, cls, cls_path
    for k, v in temp_cls.items():
        #print('{}: {}'.format(k, v))
        cls[k] += v
    num = check_how_many_we_have(cls)
    print(
        "Found %d proper scenes: single object or multiple objects with same motion trend "
        % num)
    if num >= 500:
        tkMessageBox.showinfo("Congratulations",
                              "You have more than 500 proper scenes already!")
    pickle_save(cls_path, cls)
Example #23
    def _save_state(self):
        obj = {}
        obj["iter"] = self.iter
        obj["done"] = self.done
        obj["loss"] = self.loss
        obj["rate"] = self.rate
        obj["mlp_best"] = self.mlp_best
        obj["halving"] = self.halving
        obj["wasAccepted"] = self.wasAccepted
        obj["train_sets"] = self.train_sets.get_state()
        obj["valid_sets"] = self.valid_sets.get_state()

        utils.pickle_save(obj, self.wdir+"/trainer_state")
Example #24
def dataset_training_pair(subset_size=5000):
    """读取原始AST数据集,并将其分割成多个subset data
    对每个AST,生成多个training pair"""
    data_path = js_train_data_dir
    total_size = 100000
    print('begin to generate training pairs from dataset:{}'.format(data_path))

    file = open(data_path, 'r')
    nt_train_pairs_list = []
    tt_train_pairs_list = []
    num_nt_train_pair = 0
    num_tt_train_pair = 0
    for i in range(1, total_size + 1):
        try:
            line = file.readline()  # read a line from the file (one AST)
            ast = json.loads(line)  # transform it to json format
            nt_train_pairs, tt_train_pairs = generate_train_pair(ast, nt_n_dim, nt_t_dim, tt_n_dim, tt_t_dim)
        except UnicodeDecodeError as error:  # arise by readline
            print(error)
        except JSONDecodeError as error:  # arise by json_load
            print(error)
        except RecursionError as error:
            print(error)
        except BaseException:
            print('other unknown error, please check the code')
        else:
            nt_train_pairs_list.extend(nt_train_pairs)
            tt_train_pairs_list.extend(tt_train_pairs)

        if i % subset_size == 0:  # once the number of ASTs read reaches the subset size
            # check whether the generated training pairs are well-formed
            # check_correct(nt_train_pairs_list, nt_n_dim, nt_t_dim)
            # check_correct(tt_train_pairs_list, tt_n_dim, tt_t_dim)

            nt_pair_path = nt_train_pair_dir + \
                'part{}'.format(i // subset_size) + '.json'
            tt_pair_path = tt_train_pair_dir + \
                'part{}'.format(i // subset_size) + '.json'
            pickle_save(nt_pair_path, nt_train_pairs_list)
            pickle_save(tt_pair_path, tt_train_pairs_list)

            print('There are {} nt_train_pairs in {}th subset'.format(len(nt_train_pairs_list), i))
            print('There are {} tt_train_pairs in {}th subset'.format(len(tt_train_pairs_list), i))

            num_nt_train_pair += len(nt_train_pairs_list)
            num_tt_train_pair += len(tt_train_pairs_list)
            nt_train_pairs_list = []
            tt_train_pairs_list = []

    print("Number of non-terminal training pairs: {}".format(num_nt_train_pair))  # 89512876 - 3
    print("Number of terminal training pairs: {}".format(num_tt_train_pair))  # 82839660
Example #25
    def __init__(self, hparams, **kwargs):
        super().__init__(hparams, num_labels=None, mode=self.mode, **kwargs)
        use_task_specific_params(self.model, "summarization")
        # save_git_info(self.hparams.output_dir)
        self.metrics_save_path = Path(self.output_dir) / "metrics.json"
        self.hparams_save_path = Path(self.output_dir) / "hparams.pkl"
        pickle_save(self.hparams, self.hparams_save_path)
        self.step_count = 0
        self.metrics = defaultdict(list)

        self.dataset_kwargs: dict = dict(
            data_dir=self.hparams.data_dir,
            max_source_length=self.hparams.max_source_length,
            prefix=self.model.config.prefix or "",
        )
        n_observations_per_split = {
            "train": self.hparams.n_train,
            "val": self.hparams.n_val,
            "test": self.hparams.n_test,
        }
        self.n_obs = {
            k: v if v >= 0 else None
            for k, v in n_observations_per_split.items()
        }

        self.target_lens = {
            "train": self.hparams.max_target_length,
            "val": self.hparams.val_max_target_length,
            "test": self.hparams.test_max_target_length,
        }
        assert self.target_lens["train"] <= self.target_lens[
            "val"], f"target_lens: {self.target_lens}"
        assert self.target_lens["train"] <= self.target_lens[
            "test"], f"target_lens: {self.target_lens}"

        if self.hparams.freeze_embeds:
            self.freeze_embeds()
        if self.hparams.freeze_encoder:
            freeze_params(self.model.get_encoder())
            assert_all_frozen(self.model.get_encoder())

        # self.hparams.git_sha = get_git_info()["repo_sha"]
        self.num_workers = hparams.num_workers
        self.decoder_start_token_id = None

        # Entailment model
        self.entailment_tokenizer = AutoTokenizer.from_pretrained(
            'textattack/roberta-base-MNLI')
        self.entailment_model = AutoModelForSequenceClassification.from_pretrained(
            'textattack/roberta-base-MNLI')
        self.entailment_model = self.entailment_model.to('cuda')
Example #26
def data_process(train_or_test, subset_size=5000):
    """读取原始AST数据集,并将其分割成多个subset data
    对每个AST,将其转换成二叉树的形式,然后进行中序遍历生成一个nt-sequence"""
    sys.setrecursionlimit(10000)  # 设置递归最大深度
    print('setrecursionlimit == 10000')
    saved_to_path = sub_data_dir

    if train_or_test == 'train':  # split the training dataset
        data_path = js_train_data_dir
        total_size = 100000
        base_num = 0
    elif train_or_test == 'test':  # split the test dataset
        data_path = js_test_data_dir
        total_size = 50000
        base_num = num_sub_train_data
    else:
        raise KeyError

    file = open(data_path, 'r')
    subset_list = []
    nt_seq = []
    for i in range(1, total_size + 1):
        try:
            line = file.readline()  # read a line from the file (one AST)
            ast = json.loads(line)  # transform it to json format
            binary_tree = bulid_binary_tree(ast)  # AST to binary tree
            nt_seq = ast_to_seq(binary_tree, 'process')  # binary to nt_sequence
        except UnicodeDecodeError as error:  # arise by readline
            print(error)
        except JSONDecodeError as error:  # arise by json_load
            print(error)
        except RecursionError as error:
            print(error)
        except BaseException as error:
            print('UNKNOWN ERROR', error)
        else:
            subset_list.append(nt_seq)  # add the generated nt-sequence to the list

        if i % subset_size == 0:  # once the number of ASTs read reaches the subset size
            sub_path = saved_to_path + \
                'sub_part{}'.format(base_num + (i // subset_size)) + '.json'
            utils.pickle_save(sub_path, subset_list)  # save the subset
            subset_list = []

    if train_or_test == 'train':  # for training data, save the token mapping; not needed for test data
        save_string_int_dict()
        print('training data separating finished...')
        print('encoding information has been saved in {}'.format(data_parameter_dir))
    else:
        print('testing data separating finished...')
Example #27
    def __init__(self, hparams, **kwargs):
        super().__init__(hparams, num_labels=None, mode=self.mode, **kwargs)
        #use_task_specific_params(self.model, "summarization")
        #save_git_info(self.hparams.output_dir)
        #self.metrics_save_path = Path("/results/metrics.json")
        self.metrics_save_path = Path(self.output_dir) / "metrics.json"
        self.hparams_save_path = Path(self.output_dir) / "hparams.pkl"
        pickle_save(self.hparams, self.hparams_save_path)
        self.step_count = 0
        self.metrics = defaultdict(list)

        self.dataset_kwargs: dict = dict(
            data_dir=self.hparams.data_dir,
            max_source_length=self.hparams.max_source_length,
            prefix=self.model.config.prefix or "",
        )
        n_observations_per_split = {
            "train": self.hparams.n_train,
            "val": self.hparams.n_val,
            "test": self.hparams.n_test,
        }
        self.n_obs = {
            k: v if v >= 0 else None
            for k, v in n_observations_per_split.items()
        }

        self.target_lens = {
            "train": self.hparams.max_target_length,
            "val": self.hparams.val_max_target_length,
            "test": self.hparams.test_max_target_length,
        }
        assert self.target_lens["train"] <= self.target_lens[
            "val"], f"target_lens: {self.target_lens}"
        assert self.target_lens["train"] <= self.target_lens[
            "test"], f"target_lens: {self.target_lens}"

        if self.hparams.freeze_embeds:
            self.freeze_embeds()
        if self.hparams.freeze_encoder:
            freeze_params(self.model.get_encoder())
            assert_all_frozen(self.model.get_encoder())

        #self.hparams.git_sha = get_git_info()["repo_sha"]
        try:
            self.num_workers = hparams.num_workers
        except AttributeError:
            self.num_workers = 2

        self.decoder_start_token_id = None
        self.dataset_class = Seq2SeqDataset
Example #28
def dataset_split(is_training=True, subset_size=5000):
    """读取原始AST数据集,并将其分割成多个subset data
    对每个AST,将其转换成二叉树的形式,然后进行中序遍历生成一个nt-sequence"""
    sys.setrecursionlimit(10000)  # 设置递归最大深度
    print('setrecursionlimit == 10000')

    if is_training:  # split the training dataset
        data_path = js_train_data_dir
        total_size = 100000
        saved_to_path = sub_train_data_dir
    else:  # split the test dataset
        data_path = js_test_data_dir
        total_size = 50000
        saved_to_path = sub_test_data_dir

    file = open(data_path, 'r')
    subset_list = []
    nt_seq = []
    for i in range(1, total_size + 1):
        try:
            line = file.readline()  # read a line from the file (one AST)
            ast = json.loads(line)  # transform it to json format
            rename_ast = rename_variable(ast)
            binary_tree = bulid_binary_tree(rename_ast)  # AST to binary tree
            nt_seq = ast_to_seq(binary_tree)  # binary to nt_sequence
        except UnicodeDecodeError as error:  # arise by readline
            print(error)
        except JSONDecodeError as error:  # arise by json_load
            print(error)
        except RecursionError as error:
            print(error)
        except BaseException:
            print('other unknown error, please check the code')
        else:
            subset_list.append(nt_seq)  # add the generated nt-sequence to the list

        if i % subset_size == 0:  # once the number of ASTs read reaches the subset size
            sub_path = saved_to_path + \
                'part{}'.format(i // subset_size) + '.json'
            utils.pickle_save(sub_path, subset_list)  # save the subset
            subset_list = []

    if is_training:  # for training data, save the token mapping; not needed for test data
        save_string_int_dict()
        print('training data separating finished...')
        print('encoding information has been saved in {}'.format(
            data_parameter_dir))
    else:
        print('testing data separating finished...')
Example #29
def align(params, load_prefix, save_path):
    """
  Align depth and color images. Save everything into a single pickle object.

  Parameters
  ----------
  params: Camera intrinsic parameters.

  load_prefix: Path to load data. Will load color stream from
  `load_prefix`_color.avi, depth stream from `load_prefix`_depth.pkl, and body
  stream from `load_prefix`_body.pkl.

  save_path: Path to save result data.

  """
    color_src = cv2.VideoCapture(load_prefix + '_color.avi')
    depth_src = pickle_load(load_prefix + '_depth.pkl')
    body_src = pickle_load(load_prefix + '_body.pkl')

    depth_height = depth_src[0].shape[0]
    depth_width = depth_src[0].shape[1]

    h_coord = np.tile(np.reshape(np.arange(1, depth_width + 1), [1, -1]),
                      [depth_height, 1]) - params['cx_d']
    v_coord = np.tile(np.reshape(np.arange(1, depth_height + 1), [-1, 1]),
                      [1, depth_width]) - params['cy_d']

    pcloud_frames = []
    depth_frames = []
    color_frames = []
    body_frames = []
    for depth, body in tqdm(zip(depth_src, body_src)):
        _, color = color_src.read()
        pcloud = depth_to_world(depth, params, h_coord, v_coord)
        pcloud_frames.append(pcloud)
        color = world_to_color(params, pcloud, color)
        color_frames.append(color)
        body_frames.append(body)
        depth_frames.append(depth)

    data = {
        'pclouds': pcloud_frames,
        'depths': depth_frames,
        'colors': color_frames,
        'bodies': body_frames
    }

    pickle_save(save_path, data)
Example #30
def trajectories_by_importance(execution_traces, state_importance, args):
    if args.load_trajectories:
        all_trajectories = pickle_load(join(args.results_dir, 'Trajectories.pkl'))
        if args.verbose: print(f"HIGHLIGHTS {15 * '-' + '>'} Trajectories Loaded")
    else:
        all_trajectories = get_all_trajectories(execution_traces, args.trajectory_length, state_importance)
        pickle_save(all_trajectories, join(args.results_dir, 'Trajectories.pkl'))
        if args.verbose: print(f"HIGHLIGHTS {15 * '-' + '>'} Trajectories Generated")

    sorted_by_method = sorted([(x.importance[args.trajectory_importance], x) for x in all_trajectories],
                              key=lambda y: y[0], reverse=True)
    sorted_trajectories = [x[1] for x in sorted_by_method]
    trajectories_scores = [x[0] for x in sorted_by_method]
    summary_trajectories = trajectory_highlights(sorted_trajectories, trajectories_scores, args.allowed_similar_states,
                                                 args.num_trajectories, args.highlights_selection_method)
    return all_trajectories, summary_trajectories
Example #31
def fit_ir(qs, save_paths={}):
  cd = calibration_dataset(qs)
  df = pd.DataFrame(cd, columns=['x', 'y'])
  
  if save_paths:
    np.save(save_paths['calibration'], cd)
  
  model = IsotonicRegression(
    y_min=0,
    y_max=1,
    increasing=True,
    out_of_bounds='clip'
    ).fit(df['x'], df['y'])
  
  if save_paths:
    pickle_save(model, save_paths['model'])

  return model
Example #32
def init_app(data_folder):
    cls_path = os.path.join(data_folder, 'classification.pkl')
    if not os.path.isfile(cls_path):
        print("initialize classification result file")
        print("file is {}".format(cls_path))
        cls = dict()
        cls['processed'] = []
        for name in button_names:
            cls[name] = []
        pickle_save(cls_path, cls)
    else:
        print("load classification result file")
        cls = pickle_load(os.path.join(data_folder, 'classification.pkl'))
    all_files = get_all_scenes_files(data_folder)
    processed_files = cls['processed']
    remaining_files = [f for f in all_files if f not in processed_files]
    print("%d files remain to be processed" % len(remaining_files))
    show_progress(cls)
    return cls, cls_path, processed_files, remaining_files, all_files
Example #33
def group_price_by_block(cityid):
    reader = shapefile.Reader(path.join(LATLNGS_SHP_DIR, str(cityid)))

    changed = False
    block_total = {}
    block_data = BLOCK_DATA_CACHE[cityid]

    for sr in reader.shapeRecords():
        (lng, lat) = sr.shape.points[0]
        data = block_data.get((lat, lng))

        if not data:
            continue

        if 'price' not in data and 'income' not in data:
            block_data[(lat, lng)]['price'] = sr.record[0]
            block_data[(lat, lng)]['income'] = sr.record[1]
            changed = True
        if 'Block' in data:
            if get_block_id(data) in block_total:
                block_total[get_block_id(data)]['total'] += float(sr.record[0])
                block_total[get_block_id(data)]['count'] += 1
            else:
                block_total[get_block_id(data)] = {
                    'total': float(sr.record[0]),
                    'count': 1
                }

    block_averages = {k: v['total'] / v['count'] for k, v in block_total.iteritems()}
    print len(block_data)
    print len(block_averages)

    for key, data in block_data.iteritems():
        if 'Block' in data:
            data['block_price_average'] = block_averages[get_block_id(data)]
            changed = True

    if changed:
        print 'CHANGED! updating block cache file'
        pickle_save(BLOCK_DATA_CACHE, path.join(CACHE_DIR, 'block_data'))
Example #34
def save_model_with_weights(fname, model, scaler, metadata=None):
    """
    Save a keras model config, weights and scaler into a .zip file
    metadata is a dict of arbitrary information
    """
    if metadata is None:
        metadata = {}

    # We'll bundle the config JSON and the weights HDF5 into a .zip
    with zipfile.ZipFile(fname, mode='w') as zipf:
        # First, save weights to a temporary hdf5 file and then zip it
        with tempfile.NamedTemporaryFile() as wf:
            model.save_weights(wf.name, overwrite=True)
            zipf.write(wf.name, arcname='weights.hdf5')
        with tempfile.NamedTemporaryFile() as wf:
            utils.pickle_save(wf.name, scaler)
            zipf.write(wf.name, arcname='scaler.pickle')
        # add the metadata (e.g. the window size) to the model config
        config = model.get_config()
        config['metadata'] = metadata
        json_str = json.dumps(config)
        zipf.writestr('config.json', json_str)
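Example #34 only covers the saving side. A loading counterpart does not appear on this page; the sketch below is an assumption, not code from the source project, and presumes a Keras model whose config round-trips through Model.from_config:

import json
import os
import pickle
import tempfile
import zipfile

from keras.models import Model


def load_model_with_weights(fname):
    # Assumed inverse of save_model_with_weights above (a sketch, not from the repo).
    with zipfile.ZipFile(fname) as zipf:
        config = json.loads(zipf.read('config.json'))
        metadata = config.pop('metadata', {})  # strip the key added at save time
        model = Model.from_config(config)      # presumes a functional-API model config
        with tempfile.TemporaryDirectory() as tmp:
            zipf.extract('weights.hdf5', tmp)
            model.load_weights(os.path.join(tmp, 'weights.hdf5'))
        with zipf.open('scaler.pickle') as sf:
            scaler = pickle.load(sf)
    return model, scaler, metadata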
Example #35
def compute_city(cityid):
    reader = shapefile.Reader(path.join(LATLNGS_SHP_DIR, str(cityid)))
    writer = shapefile.Writer(shapefile.POINT)
    writer.autoBalance = 1
    writer.field('price', 'N')
    writer.field('income', 'N')
    writer.field('age', 'N')
    
    total = len(reader.shapeRecords())
    count = 0
    for sr in reader.shapeRecords():
        point = sr.shape.points[0]
        price = sr.record[0]
        income = get_variable(point[1], point[0], BLOCK_DATA_CACHE[cityid], INCOME_VARIABLE)
        age = get_variable(point[1], point[0], BLOCK_DATA_CACHE[cityid], AGE_VARIABLE)
        writer.point(point[0], point[1])
        writer.record(price, income, age)
        count += 1
        if count % 100 == 0:
            print 'Processed %d out of %d' % (count, total)
            pickle_save(CENSUS_DATA_CACHE, path.join(CACHE_DIR, 'census_data_cache'))
            pickle_save(BLOCK_DATA_CACHE, path.join(CACHE_DIR, 'block_data'))
    
    writer.save(path.join(LATLNGS_SHP_DIR, str(cityid) + '_age'))
Example #36
        if sr.record[0] in zipcodeList:
            polygon = sr.shape.points
            zip_poly[str(sr.record[0])] = map(lambda x: [x[1], x[0]], polygon)
    print 'Finished reading polygon file!'

    print 'Writing city zipcodes file'
    city_zipcode_data = {}
    for key, data in all_cars_data.iteritems():
        zipcode = data[94]
        cityid = int(data[95])

        formatted_zipcode = format_zipcode(zipcode)

        if (formatted_zipcode not in city_zipcode_data) and (formatted_zipcode in zip_poly):
            city_zipcode_data[formatted_zipcode] = {}
            poly = zip_poly[formatted_zipcode]
            city_zipcode_data[formatted_zipcode]['polygon'] = poly
            print "adding %s" % formatted_zipcode
        else:
            pass

    print 'Saving pickle files'
    pickle_save(city_zipcode_data, path.join(CACHE_DIR, 'zipcodes/pickled/allCities'))

    print 'Finished!'
Example #37
CITY_IDS_CUSTOM = {}


def compute_hotspots(city_id):
    print 'Computing Moran index for', city_id
    shpfile = os.path.join(LATLNGS_SHP_DIR, str(city_id) + '.shp')
    moran_index = arcpy.SpatialAutocorrelation_stats(shpfile, 'price', 'NO_REPORT',
                                   'INVERSE_DISTANCE_SQUARED', 'EUCLIDEAN DISTANCE',
                                   'NONE', '100', '#')
    print 'FINISHED %s!' % city_id
    return moran_index


if __name__ == '__main__':
    try:
        moran_indices = pickle_load(os.path.join(CACHE_DIR, 'moran_indices'))
    except Exception as e:
        print 'Error loading moran_indices: %r, creating a new one' % e
        moran_indices = {}

    for cityid in CITY_IDS_CUSTOM or CITY_IDS:
        if cityid in moran_indices:
            print '%s already exists, skipping' % cityid
            continue
        try:
            moran_indices[cityid] = compute_hotspots(cityid).getOutput(0)
        except Exception as e:
            print 'Cannot compute Morans I for %s: %s' % (cityid, str(e))
        else:
            pickle_save(moran_indices, os.path.join(CACHE_DIR, 'moran_indices'))
Example #38
    print 'Writing city zipcodes file'
    city_zipcode_data = {}
    for key, data in all_cars_data.iteritems():
        zipcode = data[94]
        cityid = int(data[95])
        if cityid == 137:
            if cityid not in city_zipcode_data:
                city_zipcode_data[cityid] = {}
            
            formatted_zipcode = format_zipcode(zipcode)
            
            if formatted_zipcode not in city_zipcode_data[cityid]:
                city_zipcode_data[cityid][formatted_zipcode] = {}
            
            city_zipcode_data[cityid][formatted_zipcode]['price'] = data[2]
            city_zipcode_data[cityid][formatted_zipcode]['income'] = all_census_data[zipcode][11]
            if formatted_zipcode in zip_poly:
                poly = zip_poly[formatted_zipcode]
            else:
                poly = []
                print 'zip_poly does not contain %s!' % formatted_zipcode
            city_zipcode_data[cityid][formatted_zipcode]['polygon'] = poly
        
    print 'Saving pickle files'
    for cityid in city_zipcode_data:
        pickle_save(city_zipcode_data[cityid], path.join(CACHE_DIR, 'zipcodes/pickled/' + str(cityid)))
        print 'Wrote pickled zipcode file for', cityid
        
    print 'Finished!'