def create_ngram2subject(subject2name, save_dir):
    ngram2subject = {}
    name2subject = {}
    for subject_id, subject_names in subject2name.items():
        for subject_name in subject_names:
            if subject_name not in name2subject:
                name2subject[subject_name] = [subject_id]
            else:
                name2subject[subject_name].append(subject_id)
            name_ngrams = get_name_ngrams(subject_name)
            for ngram_tuple in name_ngrams:
                ngram = ' '.join(ngram_tuple)
                if ngram in ngram2subject:
                    ngram2subject[ngram].append((subject_id, subject_name))
                else:
                    ngram2subject[ngram] = [(subject_id, subject_name)]
    print('num of subject names: ', len(name2subject))
    print('examples of name2subject: ', list(name2subject.items())[:10])
    print('num of ngram: ', len(ngram2subject))
    print('examples of ngram2subject: ', list(ngram2subject.items())[:10])
    print('save ngram2subject in pickle format...')
    pickle_save(ngram2subject, os.path.join(save_dir, 'ngram2subject.pkl'))
    print('save name2subject in pickle format...')
    pickle_save(name2subject, os.path.join(save_dir, 'name2subject.pkl'))
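# Note: every snippet in this collection assumes small `pickle_save` / `pickle_load`
# helpers that are not shown here. Below is a minimal sketch of what they presumably
# look like; it is an assumption, not the actual `utils` implementation (and note that
# the argument order of pickle_save varies between snippets: some pass (obj, path),
# others pass (path, obj)).
import os
import pickle


def pickle_save(obj, path):
    """Serialize `obj` to `path` with pickle."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f)


def pickle_load(path):
    """Load a pickled object from `path`."""
    with open(path, 'rb') as f:
        return pickle.load(f)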
def __init__(self, hparams, **kwargs):
    if hparams.sortish_sampler and hparams.gpus > 1:
        hparams.replace_sampler_ddp = False
    super().__init__(hparams, num_labels=None, mode=self.mode, **kwargs)
    use_task_specific_params(self.model, "summarization")
    save_git_info(self.hparams.output_dir)
    self.metrics_save_path = Path(self.output_dir) / "metrics.json"
    self.hparams_save_path = Path(self.output_dir) / "hparams.pkl"
    pickle_save(self.hparams, self.hparams_save_path)
    self.step_count = 0
    self.metrics = defaultdict(list)
    self.dataset_kwargs: dict = dict(
        data_dir=self.hparams.data_dir,
        max_source_length=self.hparams.max_source_length,
        prefix=self.model.config.prefix or "",
    )
    n_observations_per_split = {
        "train": self.hparams.n_train,
        "val": self.hparams.n_val,
        "test": self.hparams.n_test,
    }
    self.n_obs = {k: v if v >= 0 else None for k, v in n_observations_per_split.items()}
    self.target_lens = {
        "train": self.hparams.max_target_length,
        "val": self.hparams.val_max_target_length,
        "test": self.hparams.test_max_target_length,
    }
    assert self.target_lens["train"] <= self.target_lens["val"], f"target_lens: {self.target_lens}"
    assert self.target_lens["train"] <= self.target_lens["test"], f"target_lens: {self.target_lens}"
    if self.hparams.freeze_embeds:
        self.freeze_embeds()
    if self.hparams.freeze_encoder:
        freeze_params(self.model.get_encoder())
        assert_all_frozen(self.model.get_encoder())
    self.hparams.git_sha = get_git_info()["repo_sha"]
    self.num_workers = hparams.num_workers
    self.decoder_start_token_id = None  # default to config
    if self.model.config.decoder_start_token_id is None and isinstance(self.tokenizer, MBartTokenizer):
        self.decoder_start_token_id = self.tokenizer.lang_code_to_id[hparams.tgt_lang]
        self.model.config.decoder_start_token_id = self.decoder_start_token_id
    self.dataset_class = (
        Seq2SeqDataset if hasattr(self.tokenizer, "prepare_seq2seq_batch") else LegacySeq2SeqDataset
    )
    self.eval_beams = self.model.config.num_beams if self.hparams.eval_beams is None else self.hparams.eval_beams
    assert self.eval_beams >= 1, f"got self.eval_beams={self.eval_beams}. Need an integer >= 1"
    if self.hparams.eval_max_gen_length is not None:
        self.eval_max_length = self.hparams.eval_max_gen_length
    else:
        self.eval_max_length = self.model.config.max_length
    self.val_metric = self.default_val_metric if self.hparams.val_metric is None else self.hparams.val_metric
def save_string_int_dict():
    # Save and return the mapping dicts for non-terminals and terminals.
    # For terminals, only the most frequent 30000 tokens are kept.
    tt_token_to_int = {}
    tt_int_to_token = {}
    nt_token_to_int = {}
    nt_int_to_token = {}
    import pickle
    pickle.dump([terminal_count], open('js_dataset/rename_variable/terminal_counter.pkl', 'wb'))
    most_common_tuple = terminal_count.most_common(most_common_termial_num)
    for index, (token, times) in enumerate(most_common_tuple):
        tt_token_to_int[token] = index
        tt_int_to_token[index] = token
    for index, token in enumerate(list(non_terminal_set)):
        nt_token_to_int[token] = index
        nt_int_to_token[index] = token
    tt_int_to_token[len(tt_int_to_token)] = unknown_token  # add UNK to the terminal vocabulary
    tt_token_to_int[unknown_token] = len(tt_token_to_int)
    utils.pickle_save(
        data_parameter_dir,
        [tt_token_to_int, tt_int_to_token, nt_token_to_int, nt_int_to_token])  # save the mapping dicts to disk
    return tt_token_to_int, tt_int_to_token, nt_token_to_int, nt_int_to_token
def save_len_file(
    tokenizer_name, data_dir, max_source_length=1024, max_target_length=1024, consider_target=False, **kwargs
):
    """Save max(src_len, tgt_len) for each example to allow dynamic batching."""
    tok = AutoTokenizer.from_pretrained(tokenizer_name)
    train_ds = Seq2SeqDataset(tok, data_dir, max_source_length, max_target_length, type_path="train", **kwargs)
    pad = tok.pad_token_id

    def get_lens(ds):
        dl = tqdm(
            DataLoader(ds, batch_size=512, num_workers=8, shuffle=False, collate_fn=ds.collate_fn),
            desc=str(ds.len_file),
        )
        max_lens = []
        for batch in dl:
            src_lens = batch["input_ids"].ne(pad).sum(1).tolist()
            tgt_lens = batch["labels"].ne(pad).sum(1).tolist()
            if consider_target:
                for src, tgt in zip(src_lens, tgt_lens):
                    max_lens.append(max(src, tgt))
            else:
                max_lens.extend(src_lens)
        return max_lens

    train_lens = get_lens(train_ds)
    val_ds = Seq2SeqDataset(tok, data_dir, max_source_length, max_target_length, type_path="val", **kwargs)
    val_lens = get_lens(val_ds)
    pickle_save(train_lens, train_ds.len_file)
    pickle_save(val_lens, val_ds.len_file)
def create_train_pickle():
    print("start create train pickle file...")
    train_data = open(fileConfig.data_dir + fileConfig.file_weibo_train_data, "r", encoding="utf-8")
    out_datas = []
    for line in tqdm(train_data, "deal train file..."):
        train_infos = line.split('\t')
        assert len(train_infos) == 7
        out_datas.append({
            comConfig.col_uid: train_infos[0],
            comConfig.col_mid: train_infos[1],
            comConfig.col_time: train_infos[2],
            comConfig.col_forward_count: train_infos[3],
            comConfig.col_comment_count: train_infos[4],
            comConfig.col_like_count: train_infos[5],
            comConfig.col_content: train_infos[6]
        })
    print("save train and test infos...")
    test_data_len = 200000
    train_datas = out_datas[:len(out_datas) - test_data_len]
    test_datas = out_datas[len(out_datas) - test_data_len:]
    utils.check_dir(fileConfig.pickle_dir)
    utils.pickle_save(train_datas, fileConfig.pickle_dir + fileConfig.file_train_pickle)
    utils.pickle_save(test_datas, fileConfig.pickle_dir + fileConfig.file_test_pickle)
def save(self, path, name):
    if self.X is not None:
        np.save(os.path.join(path, "dataset_X_{}.npy".format(name)), self.X)
    if self.y is not None:
        np.save(os.path.join(path, "dataset_y_{}.npy".format(name)), self.y)
    if self.image_features is not None:
        pickle_save(
            self.image_features,
            os.path.join(path, "dataset_image_features_{}.pkl".format(name)))
    if self.X_integral is not None:
        np.save(
            os.path.join(path, "dataset_X_integral_{}.npy".format(name)),
            self.X_integral)
    if self.X_features is not None:
        np.save(
            os.path.join(path, "dataset_X_features_{}.npy".format(name)),
            self.X_features)
    if self.X_features_sorted is not None:
        np.save(
            os.path.join(path, "dataset_X_features_sorted_{}.npy".format(name)),
            self.X_features_sorted)
    if self.X_features_sorted_indices is not None:
        np.save(
            os.path.join(path, "dataset_X_features_sorted_indices_{}.npy".format(name)),
            self.X_features_sorted_indices)
def compute_city(cityid):
    reader = shapefile.Reader(path.join(LATLNGS_SHP_DIR, str(cityid)))
    writer = shapefile.Writer(shapefile.POINT)
    writer.autoBalance = 1
    writer.field('price', 'N')
    writer.field('income', 'N')
    writer.field('age', 'N')
    total = len(reader.shapeRecords())
    count = 0
    for sr in reader.shapeRecords():
        point = sr.shape.points[0]
        price = sr.record[0]
        income = get_variable(point[1], point[0], BLOCK_DATA_CACHE[cityid], INCOME_VARIABLE)
        age = get_variable(point[1], point[0], BLOCK_DATA_CACHE[cityid], AGE_VARIABLE)
        writer.point(point[0], point[1])
        writer.record(price, income, age)
        count += 1
        if count % 100 == 0:
            print 'Processed %d out of %d' % (count, total)
    pickle_save(CENSUS_DATA_CACHE, path.join(CACHE_DIR, 'census_data_cache'))
    pickle_save(BLOCK_DATA_CACHE, path.join(CACHE_DIR, 'block_data'))
    writer.save(path.join(LATLNGS_SHP_DIR, str(cityid) + '_age'))
def get_name_type_for_subject(entity2name_path, entity2type_path, triple_path, save_dir):
    entity2name = pickle_load(entity2name_path)
    entity2type = pickle_load(entity2type_path)
    triples = pickle_load(triple_path)
    has_name_count = 0
    has_type_count = 0
    subject2name = {}
    subject2type = {}
    for index, subject in enumerate(triples.keys()):
        if subject in entity2name:
            subject2name[subject] = entity2name[subject]
            has_name_count += 1
        if subject in entity2type:
            subject2type[subject] = entity2type[subject]
            has_type_count += 1
    print(has_name_count, len(triples.keys()))
    print(has_type_count, len(triples.keys()))
    pickle_save(subject2name, os.path.join(save_dir, 'trim_subject2name.pkl'))
    pickle_save(subject2type, os.path.join(save_dir, 'trim_subject2type.pkl'))
def train(self):
    self.feature_model = Feature()
    feature_list = []
    label_list = []
    sen_list = []
    self.loading_none_spliter_rule(feature_list, label_list, sen_list)
    self.loading_forcing_spliter_rule()
    self.load_normal_data(feature_list, label_list, sen_list)
    self.classifier = LogisticRegression(verbose=False)
    print "Learning..."
    self.classifier.fit(feature_list, label_list)
    print "Saving..."
    utils.pickle_save(self, self.model_path)
    print "Done"
    print "Test..."
    #f = open("wrong.dat","w")
    predicted_labels = self.classifier.predict(feature_list)
    ll = len(predicted_labels)
    cc = 0
    for i in xrange(ll):
        if label_list[i] == 0 and predicted_labels[i] == 1:
            cc += 1
            #print sen_list[i]
            #f.write("%s\n" % sen_list[i])
    #f.close()
    print cc, ll, cc * 1.0 / ll
def main(args, model=None) -> SummarizationModule:
    Path(args.output_dir).mkdir(exist_ok=True)
    if len(os.listdir(args.output_dir)) > 3 and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(args.output_dir))
    # summarization model
    model: SummarizationModule = SummarizationModule(args)
    dataset = Path(args.data_dir).name
    if (args.logger_name == "default" or args.fast_dev_run
            or str(args.output_dir).startswith("/tmp")
            or str(args.output_dir).startswith("/var")):
        logger = True  # don't pollute wandb logs unnecessarily
    elif args.logger_name == "wandb":
        from pytorch_lightning.loggers import WandbLogger
        project = os.environ.get("WANDB_PROJECT", dataset)
        logger = WandbLogger(name=model.output_dir.name, project=project)
    elif args.logger_name == "wandb_shared":
        from pytorch_lightning.loggers import WandbLogger
        logger = WandbLogger(name=model.output_dir.name, project=f"hf_{dataset}")
    if args.early_stopping_patience >= 0:
        es_callback = get_early_stopping_callback(model.val_metric, args.early_stopping_patience)
    else:
        es_callback = False
    trainer: pl.Trainer = generic_train(
        model,
        args,
        logging_callback=Seq2SeqLoggingCallback(),
        checkpoint_callback=get_checkpoint_callback(args.output_dir, model.val_metric, args.save_top_k),
        early_stopping_callback=es_callback,
        logger=logger,
    )
    pickle_save(model.hparams, model.output_dir / "hparams.pkl")
    if not args.do_predict:
        return model
    model.hparams.test_checkpoint = ""
    checkpoints = list(sorted(glob.glob(os.path.join(args.output_dir, "*.ckpt"), recursive=True)))
    if checkpoints:
        model.hparams.test_checkpoint = checkpoints[-1]
        trainer.resume_from_checkpoint = checkpoints[-1]
    trainer.logger.log_hyperparams(model.hparams)
    # test() without a model tests using the best checkpoint automatically
    trainer.test()
    return model
def show_results(evaluations, metric):
    """Plot evaluation results with error bar."""
    fig, ax = plt.subplots()
    colors = cm.Dark2(np.linspace(0, 1, len(evaluations)))
    results = {}
    for i, evaluation in enumerate(evaluations):
        res = evaluation.get_results(metric)
        mean, std = np.nanmean(res, axis=0), np.nanstd(res, axis=0)
        ax.errorbar(np.arange(mean.shape[0]), mean, yerr=std, color=colors[i],
                    label=evaluation.name, fmt='-o')
        results[evaluation.name] = res
    # store the results on disk
    pwd = os.path.dirname(os.path.realpath(__file__))
    folder = '/results/' if evaluations[0].cache_folder == '' else \
        '/results/{}/'.format(evaluations[0].cache_folder)
    folder = pwd + folder
    pickle_save(folder + '/measurement_lost_{}_{}_results.pkl'.format(
        evaluations[0].models[0].measurement_lost, metric), results)
    # Now add the legend with some customizations.
    legend = ax.legend(loc='upper right')
    # Set the fontsize
    for label in legend.get_texts():
        label.set_fontsize('small')
    plt.show()
def nt_seq_to_int(time_steps=50, status='TRAIN'):
    # Further process the NT sequences: first convert each token into an integer.
    # For train and valid data, all ast-seqs are extended into a single list, which makes
    # the format conversion during training easier.
    # For test data, each ast-seq is appended separately so every AST keeps its own sequence.
    tt_token_to_int, tt_int_to_token, nt_token_to_int, nt_int_to_token = \
        pickle.load(open('js_dataset/rename_variable/rename_parameter.pkl', 'rb'))
    total_num_nt_pair = 0
    if status == 'TRAIN':
        sub_data_dir = sub_train_data_dir
        num_sub_data = num_sub_train_data
        sub_int_data_dir = sub_int_train_dir
    elif status == 'VALID':
        sub_data_dir = sub_valid_data_dir
        num_sub_data = num_sub_valid_data
        sub_int_data_dir = sub_int_valid_dir
    elif status == 'TEST':
        sub_data_dir = sub_test_data_dir
        num_sub_data = num_sub_test_data
        sub_int_data_dir = sub_int_test_dir
    else:
        print('ERROR! Unknown command!!')
        sys.exit(1)

    def get_subset_data():
        # Read and yield the nt_sequence of each part for further processing.
        for i in range(1, num_sub_data + 1):
            data_path = sub_data_dir + 'part{}.json'.format(i)
            data = utils.pickle_load(data_path)
            yield (i, data)

    subset_generator = get_subset_data()
    for index, data in subset_generator:
        data_seq = []
        for one_ast in data:
            # Truncate each nt_seq, encode it into integers, then save it.
            if len(one_ast) < time_steps:  # discard ASTs shorter than time_steps
                continue
            try:
                nt_int_seq = [
                    (nt_token_to_int[n], tt_token_to_int.get(t, tt_token_to_int[unknown_token]))
                    for n, t in one_ast
                ]
            except KeyError:
                print('key error')
                continue
            # For train and valid, all ast-seqs are extended together; for test,
            # each ast-seq is kept separate.
            if status == 'TEST':
                data_seq.append(nt_int_seq)
                total_num_nt_pair += len(nt_int_seq)
            else:
                data_seq.extend(nt_int_seq)
                total_num_nt_pair += len(nt_int_seq)
        one_sub_int_data_dir = sub_int_data_dir + 'int_part{}.json'.format(index)
        utils.pickle_save(one_sub_int_data_dir, data_seq)
    # old: 14,976,250  new: 157,237,460  training dataset size comparison
    # old: 1,557,285   new: 81,078,099   test dataset size comparison
    print('There are {} nt_pair in {} dataset...'.format(total_num_nt_pair, status))
def main(args, model=None) -> SummarizationModule:
    Path(args.output_dir).mkdir(exist_ok=True)
    check_output_dir(args, expected_items=3)
    if model is None:
        if "summarization" in args.task:
            model: SummarizationModule = SummarizationModule(args)
        else:
            model: SummarizationModule = TranslationModule(args)
    dataset = Path(args.data_dir).name
    if (
        args.logger_name == "default"
        or args.fast_dev_run
        or str(args.output_dir).startswith("/tmp")
        or str(args.output_dir).startswith("/var")
    ):
        from pytorch_lightning.loggers import CSVLogger
        logger = CSVLogger('chen_logs', name='SCHWEIGEN')  # don't pollute wandb logs unnecessarily
    elif args.logger_name == "wandb":
        from pytorch_lightning.loggers import WandbLogger
        project = os.environ.get("WANDB_PROJECT", dataset)
        logger = WandbLogger(name=model.output_dir.name, project=project)
    elif args.logger_name == "wandb_shared":
        from pytorch_lightning.loggers import WandbLogger
        logger = WandbLogger(name=model.output_dir.name, project=f"hf_{dataset}")
    if args.early_stopping_patience >= 0:
        es_callback = get_early_stopping_callback(model.val_metric, args.early_stopping_patience)
    else:
        es_callback = False
    lower_is_better = args.val_metric == "loss"
    trainer: pl.Trainer = generic_train(
        model,
        args,
        logging_callback=Seq2SeqLoggingCallback(),
        checkpoint_callback=get_checkpoint_callback(
            args.output_dir, model.val_metric, args.save_top_k, lower_is_better
        ),
        early_stopping_callback=es_callback,
        logger=logger,
    )
    pickle_save(model.hparams, model.output_dir / "hparams.pkl")
    if not args.do_predict:
        return model
    model.hparams.test_checkpoint = ""
    checkpoints = list(sorted(glob.glob(os.path.join(args.output_dir, "*.ckpt"), recursive=True)))
    if checkpoints:
        model.hparams.test_checkpoint = checkpoints[-1]
        trainer.resume_from_checkpoint = checkpoints[-1]
    trainer.logger.log_hyperparams(model.hparams)
    # test() without a model tests using the best checkpoint automatically
    trainer.test()
    return model
def debug_build_features(count_only=False):
    all_features = Trainer.build_features(2, 2)
    print(len(all_features))
    if count_only:
        return
    for feature in all_features:
        print(feature)
    pickle_save(all_features, "all_features.pkl")
    feature_reloaded = pickle_load('all_features.pkl')
    assert feature_reloaded == all_features
def generate_all_ids():
    """Generate and store IDs of Teams and Players."""
    # Teams
    dict_team_ids = get_team_ids_dictionary()
    utils.pickle_save(data_obj=dict_team_ids, filename='ids_of_teams.pkl')
    # Players
    dict_player_ids = get_player_ids_dictionary()
    utils.pickle_save(data_obj=dict_player_ids, filename='ids_of_players.pkl')
    return None
def _save_state(self):
    obj = {}
    obj["layer_index"] = self.layer_index
    obj["iter"] = self.iter
    obj["loss"] = self.loss
    obj["mlp_best"] = self.mlp_best
    obj["mlp_crrnt"] = self.mlp_crrnt
    #obj["iters_without_impr"] = self.iters_without_impr
    obj["train_sets"] = self.train_sets.get_state()
    utils.pickle_save(obj, self.wdir + "/layerwisetrainer_state")
def __init__(self, hparams, **kwargs):
    super().__init__(hparams, num_labels=None, mode=self.mode, **kwargs)
    use_task_specific_params(self.model, "summarization")
    save_git_info(self.hparams.output_dir)
    self.metrics_save_path = Path(self.output_dir) / "metrics.json"
    self.hparams_save_path = Path(self.output_dir) / "hparams.pkl"
    pickle_save(self.hparams, self.hparams_save_path)
    self.step_count = 0
    self.metrics = defaultdict(list)
    self.dataset_kwargs: dict = dict(
        data_dir=self.hparams.data_dir,
        max_source_length=self.hparams.max_source_length,
        prefix=self.model.config.prefix or "",
    )
    n_observations_per_split = {
        "train": self.hparams.n_train,
        "val": self.hparams.n_val,
        "test": self.hparams.n_test,
    }
    self.n_obs = {k: v if v >= 0 else None for k, v in n_observations_per_split.items()}
    self.target_lens = {
        "train": self.hparams.max_target_length,
        "val": self.hparams.val_max_target_length,
        "test": self.hparams.test_max_target_length,
    }
    assert self.target_lens["train"] <= self.target_lens["val"], f"target_lens: {self.target_lens}"
    assert self.target_lens["train"] <= self.target_lens["test"], f"target_lens: {self.target_lens}"
    if self.hparams.freeze_embeds:
        self.freeze_embeds()
    if self.hparams.freeze_encoder:
        freeze_params(self.model.get_encoder())
        assert_all_frozen(self.model.get_encoder())
    self.hparams.git_sha = get_git_info()["repo_sha"]
    self.num_workers = hparams.num_workers
    self.decoder_start_token_id = None
    if self.model.config.decoder_start_token_id is None and isinstance(self.tokenizer, MBartTokenizer):
        self.decoder_start_token_id = self.tokenizer.lang_code_to_id[hparams.tgt_lang]
        self.model.config.decoder_start_token_id = self.decoder_start_token_id
    if isinstance(self.tokenizer, MBartTokenizer) or isinstance(self.tokenizer, MarianTokenizer):
        self.dataset_class = TranslationDataset
    else:
        self.dataset_class = Seq2SeqDataset
def main(args, model=None) -> SummarizationModule:
    Path(args.output_dir).mkdir(exist_ok=True)
    if len(os.listdir(args.output_dir)) > 3 and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if model is None:
        if args.task == "summarization":
            model: SummarizationModule = SummarizationModule(args)
        else:
            model: SummarizationModule = TranslationModule(args)
    dataset = Path(args.data_dir).name
    if (args.logger == "default" or args.fast_dev_run
            or str(args.output_dir).startswith("/tmp")
            or str(args.output_dir).startswith("/var")):
        logger = True  # don't pollute wandb logs unnecessarily
    elif args.logger == "wandb":
        from pytorch_lightning.loggers import WandbLogger
        logger = WandbLogger(name=model.output_dir.name, project=dataset)
    elif args.logger == "wandb_shared":
        from pytorch_lightning.loggers import WandbLogger
        logger = WandbLogger(name=model.output_dir.name, project=f"hf_{dataset}")
    trainer: pl.Trainer = generic_train(
        model,
        args,
        logging_callback=Seq2SeqLoggingCallback(),
        checkpoint_callback=get_checkpoint_callback(args.output_dir, model.val_metric),
        logger=logger,
        # TODO: early stopping callback seems messed up
    )
    pickle_save(model.hparams, model.output_dir / "hparams.pkl")
    if not args.do_predict:
        return model
    model.hparams.test_checkpoint = ""
    checkpoints = list(sorted(glob.glob(os.path.join(args.output_dir, "*.ckpt"), recursive=True)))
    if checkpoints:
        model.hparams.test_checkpoint = checkpoints[-1]
        trainer.resume_from_checkpoint = checkpoints[-1]
    trainer.logger.log_hyperparams(model.hparams)
    trainer.test(model)  # this breaks in DDP, known lightning issue. See evaluate_checkpoint to recover metrics.
    return model
def _get_trajs(self):
    """Get trajectories either from result folder or generate them."""
    distances_file_name, traj_file_name = '/distances.npy', '/trajectories.pkl'
    distances_path, traj_path = self.folder_path + distances_file_name, self.folder_path + traj_file_name
    if os.path.isfile(distances_path) and os.path.isfile(traj_path):
        print("loading trajectories...")
        distances, trajs = np.load(distances_path), pickle_load(traj_path)
    else:
        distances, trajs = self._generate_trajs()
        np.save(distances_path, distances)
        pickle_save(traj_path, trajs)
    return distances, trajs
def _save_state(self):
    obj = {}
    obj["iter"] = self.iter
    obj["done"] = self.done
    obj["loss"] = self.loss
    obj["rate"] = self.rate
    obj["mlp_best"] = self.mlp_best
    obj["halving"] = self.halving
    obj["wasAccepted"] = self.wasAccepted
    obj["train_sets"] = self.train_sets.get_state()
    obj["valid_sets"] = self.valid_sets.get_state()
    utils.pickle_save(obj, self.wdir + "/trainer_state")
def add_temporary_to_cls():
    global temp_cls, cls, cls_path
    for k, v in temp_cls.items():
        #print('{}: {}'.format(k, v))
        cls[k] += v
    num = check_how_many_we_have(cls)
    print("Found %d proper scenes: single object or multiple objects with same motion trend" % num)
    if num >= 500:
        tkMessageBox.showinfo("Congratulations", "You have more than 500 proper scenes already!")
    pickle_save(cls_path, cls)
def dataset_training_pair(subset_size=5000):
    """Read the raw AST dataset and split it into several subsets.
    For each AST, generate multiple training pairs."""
    data_path = js_train_data_dir
    total_size = 100000
    print('begin to generate training pairs from dataset:{}'.format(data_path))
    file = open(data_path, 'r')
    nt_train_pairs_list = []
    tt_train_pairs_list = []
    num_nt_train_pair = 0
    num_tt_train_pair = 0
    for i in range(1, total_size + 1):
        try:
            line = file.readline()  # read a line (one AST) from the file
            ast = json.loads(line)  # transform it to json format
            nt_train_pairs, tt_train_pairs = generate_train_pair(ast, nt_n_dim, nt_t_dim, tt_n_dim, tt_t_dim)
        except UnicodeDecodeError as error:  # raised by readline
            print(error)
        except JSONDecodeError as error:  # raised by json.loads
            print(error)
        except RecursionError as error:
            print(error)
        except BaseException:
            print('other unknown error, please check the code')
        else:
            nt_train_pairs_list.extend(nt_train_pairs)
            tt_train_pairs_list.extend(tt_train_pairs)
        if i % subset_size == 0:  # once the number of ASTs read equals the given subset size
            # check whether the generated training pairs conform to the expected spec
            # check_correct(nt_train_pairs_list, nt_n_dim, nt_t_dim)
            # check_correct(tt_train_pairs_list, tt_n_dim, tt_t_dim)
            nt_pair_path = nt_train_pair_dir + 'part{}'.format(i // subset_size) + '.json'
            tt_pair_path = tt_train_pair_dir + 'part{}'.format(i // subset_size) + '.json'
            pickle_save(nt_pair_path, nt_train_pairs_list)
            pickle_save(tt_pair_path, tt_train_pairs_list)
            print('There are {} nt_train_pairs in {}th subset'.format(len(nt_train_pairs_list), i))
            print('There are {} tt_train_pairs in {}th subset'.format(len(tt_train_pairs_list), i))
            num_nt_train_pair += len(nt_train_pairs_list)
            num_tt_train_pair += len(tt_train_pairs_list)
            nt_train_pairs_list = []
            tt_train_pairs_list = []
    print("Number of non-terminal training pairs: {}".format(num_nt_train_pair))  # 89512876 - 3
    print("Number of terminal training pairs: {}".format(num_tt_train_pair))  # 82839660
def __init__(self, hparams, **kwargs):
    super().__init__(hparams, num_labels=None, mode=self.mode, **kwargs)
    use_task_specific_params(self.model, "summarization")
    # save_git_info(self.hparams.output_dir)
    self.metrics_save_path = Path(self.output_dir) / "metrics.json"
    self.hparams_save_path = Path(self.output_dir) / "hparams.pkl"
    pickle_save(self.hparams, self.hparams_save_path)
    self.step_count = 0
    self.metrics = defaultdict(list)
    self.dataset_kwargs: dict = dict(
        data_dir=self.hparams.data_dir,
        max_source_length=self.hparams.max_source_length,
        prefix=self.model.config.prefix or "",
    )
    n_observations_per_split = {
        "train": self.hparams.n_train,
        "val": self.hparams.n_val,
        "test": self.hparams.n_test,
    }
    self.n_obs = {k: v if v >= 0 else None for k, v in n_observations_per_split.items()}
    self.target_lens = {
        "train": self.hparams.max_target_length,
        "val": self.hparams.val_max_target_length,
        "test": self.hparams.test_max_target_length,
    }
    assert self.target_lens["train"] <= self.target_lens["val"], f"target_lens: {self.target_lens}"
    assert self.target_lens["train"] <= self.target_lens["test"], f"target_lens: {self.target_lens}"
    if self.hparams.freeze_embeds:
        self.freeze_embeds()
    if self.hparams.freeze_encoder:
        freeze_params(self.model.get_encoder())
        assert_all_frozen(self.model.get_encoder())
    # self.hparams.git_sha = get_git_info()["repo_sha"]
    self.num_workers = hparams.num_workers
    self.decoder_start_token_id = None
    # Entailment model
    self.entailment_tokenizer = AutoTokenizer.from_pretrained('textattack/roberta-base-MNLI')
    self.entailment_model = AutoModelForSequenceClassification.from_pretrained('textattack/roberta-base-MNLI')
    self.entailment_model = self.entailment_model.to('cuda')
def data_process(train_or_test, subset_size=5000):
    """Read the raw AST dataset and split it into several subsets.
    Each AST is converted into a binary tree, then an in-order traversal
    generates an nt-sequence."""
    sys.setrecursionlimit(10000)  # raise the maximum recursion depth
    print('setrecursionlimit == 10000')
    saved_to_path = sub_data_dir
    if train_or_test == 'train':  # split the training dataset
        data_path = js_train_data_dir
        total_size = 100000
        base_num = 0
    elif train_or_test == 'test':  # split the test dataset
        data_path = js_test_data_dir
        total_size = 50000
        base_num = num_sub_train_data
    else:
        raise KeyError
    file = open(data_path, 'r')
    subset_list = []
    nt_seq = []
    for i in range(1, total_size + 1):
        try:
            line = file.readline()  # read a line (one AST) from the file
            ast = json.loads(line)  # transform it to json format
            binary_tree = bulid_binary_tree(ast)  # AST to binary tree
            nt_seq = ast_to_seq(binary_tree, 'process')  # binary tree to nt_sequence
        except UnicodeDecodeError as error:  # raised by readline
            print(error)
        except JSONDecodeError as error:  # raised by json.loads
            print(error)
        except RecursionError as error:
            print(error)
        except BaseException as error:
            print('UNKNOWN ERROR', error)
        else:
            subset_list.append(nt_seq)  # append the generated nt sequence to the list
        if i % subset_size == 0:  # once the number of ASTs read equals the given subset size
            sub_path = saved_to_path + 'sub_part{}'.format(base_num + (i // subset_size)) + '.json'
            utils.pickle_save(sub_path, subset_list)  # save the subset dataset
            subset_list = []
    if train_or_test == 'train':
        # when processing the training dataset the mapping dicts must be saved;
        # the test dataset does not need them
        save_string_int_dict()
        print('training data separating finished...')
        print('encoding information has been saved in {}'.format(data_parameter_dir))
    else:
        print('testing data separating finished...')
def __init__(self, hparams, **kwargs):
    super().__init__(hparams, num_labels=None, mode=self.mode, **kwargs)
    #use_task_specific_params(self.model, "summarization")
    #save_git_info(self.hparams.output_dir)
    #self.metrics_save_path = Path("/results/metrics.json")
    self.metrics_save_path = Path(self.output_dir) / "metrics.json"
    self.hparams_save_path = Path(self.output_dir) / "hparams.pkl"
    pickle_save(self.hparams, self.hparams_save_path)
    self.step_count = 0
    self.metrics = defaultdict(list)
    self.dataset_kwargs: dict = dict(
        data_dir=self.hparams.data_dir,
        max_source_length=self.hparams.max_source_length,
        prefix=self.model.config.prefix or "",
    )
    n_observations_per_split = {
        "train": self.hparams.n_train,
        "val": self.hparams.n_val,
        "test": self.hparams.n_test,
    }
    self.n_obs = {k: v if v >= 0 else None for k, v in n_observations_per_split.items()}
    self.target_lens = {
        "train": self.hparams.max_target_length,
        "val": self.hparams.val_max_target_length,
        "test": self.hparams.test_max_target_length,
    }
    assert self.target_lens["train"] <= self.target_lens["val"], f"target_lens: {self.target_lens}"
    assert self.target_lens["train"] <= self.target_lens["test"], f"target_lens: {self.target_lens}"
    if self.hparams.freeze_embeds:
        self.freeze_embeds()
    if self.hparams.freeze_encoder:
        freeze_params(self.model.get_encoder())
        assert_all_frozen(self.model.get_encoder())
    #self.hparams.git_sha = get_git_info()["repo_sha"]
    try:
        self.num_workers = hparams.num_workers
    except AttributeError:
        self.num_workers = 2
    self.decoder_start_token_id = None
    self.dataset_class = Seq2SeqDataset
def dataset_split(is_training=True, subset_size=5000):
    """Read the raw AST dataset and split it into several subsets.
    Each AST is converted into a binary tree, then an in-order traversal
    generates an nt-sequence."""
    sys.setrecursionlimit(10000)  # raise the maximum recursion depth
    print('setrecursionlimit == 10000')
    if is_training:  # split the training dataset
        data_path = js_train_data_dir
        total_size = 100000
        saved_to_path = sub_train_data_dir
    else:  # split the test dataset
        data_path = js_test_data_dir
        total_size = 50000
        saved_to_path = sub_test_data_dir
    file = open(data_path, 'r')
    subset_list = []
    nt_seq = []
    for i in range(1, total_size + 1):
        try:
            line = file.readline()  # read a line (one AST) from the file
            ast = json.loads(line)  # transform it to json format
            rename_ast = rename_variable(ast)
            binary_tree = bulid_binary_tree(rename_ast)  # AST to binary tree
            nt_seq = ast_to_seq(binary_tree)  # binary tree to nt_sequence
        except UnicodeDecodeError as error:  # raised by readline
            print(error)
        except JSONDecodeError as error:  # raised by json.loads
            print(error)
        except RecursionError as error:
            print(error)
        except BaseException:
            print('other unknown error, please check the code')
        else:
            subset_list.append(nt_seq)  # append the generated nt sequence to the list
        if i % subset_size == 0:  # once the number of ASTs read equals the given subset size
            sub_path = saved_to_path + 'part{}'.format(i // subset_size) + '.json'
            utils.pickle_save(sub_path, subset_list)  # save the subset dataset
            subset_list = []
    if is_training:
        # when processing the training dataset the mapping dicts must be saved;
        # the test dataset does not need them
        save_string_int_dict()
        print('training data separating finished...')
        print('encoding information has been saved in {}'.format(data_parameter_dir))
    else:
        print('testing data separating finished...')
def align(params, load_prefix, save_path):
    """
    Align depth and color images. Save everything into a single pickle object.

    Parameters
    ----------
    params: Camera intrinsic parameters.
    load_prefix: Path to load data. Will load color stream from `load_prefix`_color.avi,
        depth stream from `load_prefix`_depth.pkl, and body stream from `load_prefix`_body.pkl.
    save_path: Path to save result data.
    """
    color_src = cv2.VideoCapture(load_prefix + '_color.avi')
    depth_src = pickle_load(load_prefix + '_depth.pkl')
    body_src = pickle_load(load_prefix + '_body.pkl')
    depth_height = depth_src[0].shape[0]
    depth_width = depth_src[0].shape[1]
    h_coord = np.tile(np.reshape(np.arange(1, depth_width + 1), [1, -1]),
                      [depth_height, 1]) - params['cx_d']
    v_coord = np.tile(np.reshape(np.arange(1, depth_height + 1), [-1, 1]),
                      [1, depth_width]) - params['cy_d']
    pcloud_frames = []
    depth_frames = []
    color_frames = []
    body_frames = []
    for depth, body in tqdm(zip(depth_src, body_src)):
        _, color = color_src.read()
        pcloud = depth_to_world(depth, params, h_coord, v_coord)
        pcloud_frames.append(pcloud)
        color = world_to_color(params, pcloud, color)
        color_frames.append(color)
        body_frames.append(body)
        depth_frames.append(depth)
    data = {
        'pclouds': pcloud_frames,
        'depths': depth_frames,
        'colors': color_frames,
        'bodies': body_frames
    }
    pickle_save(save_path, data)
def trajectories_by_importance(execution_traces, state_importance, args):
    if args.load_trajectories:
        all_trajectories = pickle_load(join(args.results_dir, 'Trajectories.pkl'))
        if args.verbose:
            print(f"HIGHLIGHTS {15 * '-' + '>'} Trajectories Loaded")
    else:
        all_trajectories = get_all_trajectories(execution_traces, args.trajectory_length, state_importance)
        pickle_save(all_trajectories, join(args.results_dir, 'Trajectories.pkl'))
        if args.verbose:
            print(f"HIGHLIGHTS {15 * '-' + '>'} Trajectories Generated")
    sorted_by_method = sorted(
        [(x.importance[args.trajectory_importance], x) for x in all_trajectories],
        key=lambda y: y[0], reverse=True)
    sorted_trajectories = [x[1] for x in sorted_by_method]
    trajectories_scores = [x[0] for x in sorted_by_method]
    summary_trajectories = trajectory_highlights(
        sorted_trajectories, trajectories_scores, args.allowed_similar_states,
        args.num_trajectories, args.highlights_selection_method)
    return all_trajectories, summary_trajectories
def fit_ir(qs, save_paths={}):
    cd = calibration_dataset(qs)
    df = pd.DataFrame(cd, columns=['x', 'y'])
    if save_paths:
        np.save(save_paths['calibration'], cd)
    model = IsotonicRegression(
        y_min=0, y_max=1, increasing=True, out_of_bounds='clip'
    ).fit(df['x'], df['y'])
    if save_paths:
        pickle_save(model, save_paths['model'])
    return model
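# Hypothetical usage sketch for fit_ir above, not part of the original code. It assumes
# `calibration_dataset(qs)` returns an (N, 2) array of raw scores and binary labels; the
# file names are illustrative only. The fitted sklearn IsotonicRegression then maps new
# raw scores to calibrated probabilities via predict(), clipping out-of-range inputs.
# model = fit_ir(qs, save_paths={'calibration': 'calibration.npy', 'model': 'ir_model.pkl'})
# calibrated = model.predict([0.1, 0.7, 0.95])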
def init_app(data_folder):
    cls_path = os.path.join(data_folder, 'classification.pkl')
    if not os.path.isfile(cls_path):
        print("initialize classification result file")
        print("file is {}".format(cls_path))
        cls = dict()
        cls['processed'] = []
        for name in button_names:
            cls[name] = []
        pickle_save(cls_path, cls)
    else:
        print("load classification result file")
        cls = pickle_load(os.path.join(data_folder, 'classification.pkl'))
    all_files = get_all_scenes_files(data_folder)
    processed_files = cls['processed']
    remaining_files = [f for f in all_files if f not in processed_files]
    print("%d files remain to be processed" % len(remaining_files))
    show_progress(cls)
    return cls, cls_path, processed_files, remaining_files, all_files
def group_price_by_block(cityid):
    reader = shapefile.Reader(path.join(LATLNGS_SHP_DIR, str(cityid)))
    changed = False
    block_total = {}
    block_data = BLOCK_DATA_CACHE[cityid]
    for sr in reader.shapeRecords():
        (lng, lat) = sr.shape.points[0]
        data = block_data.get((lat, lng))
        if not data:
            continue
        if 'price' not in data and 'income' not in data:
            block_data[(lat, lng)]['price'] = sr.record[0]
            block_data[(lat, lng)]['income'] = sr.record[1]
            changed = True
        if 'Block' in data:
            if get_block_id(data) in block_total:
                block_total[get_block_id(data)]['total'] += float(sr.record[0])
                block_total[get_block_id(data)]['count'] += 1
            else:
                block_total[get_block_id(data)] = {
                    'total': float(sr.record[0]),
                    'count': 1
                }
    block_averages = {k: v['total'] / v['count'] for k, v in block_total.iteritems()}
    print len(block_data)
    print len(block_averages)
    for key, data in block_data.iteritems():
        if 'Block' in data:
            data['block_price_average'] = block_averages[get_block_id(data)]
            changed = True
    if changed:
        print 'CHANGED! updating block cache file'
        pickle_save(BLOCK_DATA_CACHE, path.join(CACHE_DIR, 'block_data'))
def save_model_with_weights(fname, model, scaler, metadata=None):
    """
    Save a keras model config, weights and scaler into a .zip file.

    metadata is a dict of arbitrary information.
    """
    if metadata is None:
        metadata = {}
    # We'll bundle the config JSON and the weights HDF5 into a .zip
    with zipfile.ZipFile(fname, mode='w') as zipf:
        # First, save weights to a temporary hdf5 file and then zip it
        with tempfile.NamedTemporaryFile() as wf:
            model.save_weights(wf.name, overwrite=True)
            zipf.write(wf.name, arcname='weights.hdf5')
        with tempfile.NamedTemporaryFile() as wf:
            utils.pickle_save(wf.name, scaler)
            zipf.write(wf.name, arcname='scaler.pickle')
        # add the winsize to the model config
        config = model.get_config()
        config['metadata'] = metadata
        json_str = json.dumps(config)
        zipf.writestr('config.json', json_str)
    if sr.record[0] in zipcodeList:
        polygon = sr.shape.points
        zip_poly[str(sr.record[0])] = map(lambda x: [x[1], x[0]], polygon)
print 'Finished reading polygon file!'
print 'Writing city zipcodes file'
city_zipcode_data = {}
for key, data in all_cars_data.iteritems():
    zipcode = data[94]
    cityid = int(data[95])
    formatted_zipcode = format_zipcode(zipcode)
    if (formatted_zipcode not in city_zipcode_data) and (formatted_zipcode in zip_poly):
        city_zipcode_data[formatted_zipcode] = {}
        poly = zip_poly[formatted_zipcode]
        city_zipcode_data[formatted_zipcode]['polygon'] = poly
        print "adding %s" % formatted_zipcode
    else:
        pass
print 'Saving pickle files'
pickle_save(city_zipcode_data, path.join(CACHE_DIR, 'zipcodes/pickled/allCities'))
print 'Finished!'
CITY_IDS_CUSTOM = {}


def compute_hotspots(city_id):
    print 'Computing Moran index for', city_id
    shpfile = os.path.join(LATLNGS_SHP_DIR, str(city_id) + '.shp')
    moran_index = arcpy.SpatialAutocorrelation_stats(
        shpfile, 'price', 'NO_REPORT', 'INVERSE_DISTANCE_SQUARED',
        'EUCLIDEAN DISTANCE', 'NONE', '100', '#')
    print 'FINISHED %s!' % city_id
    return moran_index


if __name__ == '__main__':
    try:
        moran_indices = pickle_load(os.path.join(CACHE_DIR, 'moran_indices'))
    except Exception as e:
        print 'Error loading moran_indices: %r, creating a new one' % e
        moran_indices = {}
    for cityid in CITY_IDS_CUSTOM or CITY_IDS:
        if cityid in moran_indices:
            print '%s already exists, skipping' % cityid
            continue
        try:
            moran_indices[cityid] = compute_hotspots(cityid).getOutput(0)
        except Exception as e:
            print 'Cannot compute Morans I for %s: %s' % (cityid, str(e))
        else:
            pickle_save(moran_indices, os.path.join(CACHE_DIR, 'moran_indices'))
print 'Writing city zipcodes file'
city_zipcode_data = {}
for key, data in all_cars_data.iteritems():
    zipcode = data[94]
    cityid = int(data[95])
    if cityid == 137:
        if cityid not in city_zipcode_data:
            city_zipcode_data[cityid] = {}
        formatted_zipcode = format_zipcode(zipcode)
        if formatted_zipcode not in city_zipcode_data[cityid]:
            city_zipcode_data[cityid][formatted_zipcode] = {}
            city_zipcode_data[cityid][formatted_zipcode]['price'] = data[2]
            city_zipcode_data[cityid][formatted_zipcode]['income'] = all_census_data[zipcode][11]
            if formatted_zipcode in zip_poly:
                poly = zip_poly[formatted_zipcode]
            else:
                poly = []
                print 'zip_poly does not contain %s!' % formatted_zipcode
            city_zipcode_data[cityid][formatted_zipcode]['polygon'] = poly
print 'Saving pickle files'
for cityid in city_zipcode_data:
    pickle_save(city_zipcode_data[cityid], path.join(CACHE_DIR, 'zipcodes/pickled/' + str(cityid)))
    print 'Wrote pickled zipcode file for', cityid
print 'Finished!'