def post_process(rec_prmtop, lig_prmtop, complex_prmtop, sampling_nc_file,
                 nr_resampled_complexes, sander_tmp_dir,
                 rec_pdb_out, lig_pdb_out, bpmf_pkl_out):
    post_pro = PostProcess(rec_prmtop, lig_prmtop, complex_prmtop,
                           sampling_nc_file, SOLVENT_PHASES,
                           nr_resampled_complexes, False, TEMPERATURE,
                           sander_tmp_dir)
    post_pro.write_rececptor_pdb(rec_pdb_out)  # (sic) method name as spelled by the PostProcess API
    post_pro.write_resampled_ligand_pdb(lig_pdb_out)
    post_pro.pickle_bpmf(bpmf_pkl_out)
    return None

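# A hypothetical call to post_process() above, assuming PostProcess (along with the
# SOLVENT_PHASES and TEMPERATURE constants) is importable from the surrounding
# project; every file path here is a placeholder for illustration only.
post_process('receptor.prmtop', 'ligand.prmtop', 'complex.prmtop',
             'sampling.nc', nr_resampled_complexes=100,
             sander_tmp_dir='/tmp/sander',
             rec_pdb_out='receptor.pdb', lig_pdb_out='ligand.pdb',
             bpmf_pkl_out='bpmf.pkl')
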
def toggle_effect(self, *args, **kwargs):
    """
    Override Effect.toggle_effect to copy shaders to the group's post_process.
    """
    super(Group, self).toggle_effect(*args, **kwargs)
    if self.is_group and (self.active_effects or self.post_process):
        if not self.post_process:
            from postprocess import PostProcess
            self.post_process = PostProcess(self.parent)
        self.post_process.set_visible(1)
        self.post_process.toggle_effect(*args, **kwargs)

def __init__(self, task_id, file_name, file_id, credentials, download_dir, listener):
    self.task_id = task_id
    self.file_id = file_id
    self.credentials = credentials
    self.dir = download_dir
    self.listener = listener
    self.earth_engine_status = EarthEngineStatus(task_id=task_id, listener=self)
    self.drive_download = DriveDownload(
        credentials=self.credentials,
        file_name=file_name,
        file_id=self.file_id,
        download_dir=self.dir,
        listener=self)
    self.post_process = PostProcess(
        file_name=file_name,
        download_dir=download_dir,
        listener=self)
    self.current_step = None

def main():
    # build dataset
    batch_size = 1
    height = 48
    width = 48
    dataset = TestDataGenerator(PuppetDataset, 4, batch_size, height=height, width=width)
    evaluate = Eval()
    postprocess = PostProcess(height, width)

    # generate and display
    image_group, guide_mask_group, annkp_group = dataset.next()
    outobjects_group = []
    for x in range(batch_size):
        image = image_group[x]
        # select the last-level mask; the other levels are unused
        mask = guide_mask_group[x][-1]
        display_my_masks(image, mask)
        # use the ground-truth mask directly as the predicted mask to process
        outobjects_group.append(postprocess.process(mask))
    evaluate.evaluate(annkp_group, outobjects_group)

def validate(model, test_data, golden_file, beam_size=8, alpha=0.6, max_time_step=100):
    """For development only."""
    pp = PostProcess()

    ref_stream = []
    for line in open(golden_file + '.input_clean'):
        if line.startswith('# ::tokens '):
            o = json.loads(line[len('# ::tokens '):].strip())
            ref_stream.append(' '.join(o).lower())
    # gold model output
    graph, gold_sys_stream, _, abstract = read_file(golden_file + '.preproc')
    ref_streams = [ref_stream]

    sys_stream = []
    for batch in test_data:
        res = generate_batch(model, batch, beam_size, alpha, max_time_step)
        sys_stream.extend(res['token'])
    assert len(sys_stream) == len(ref_stream)
    sys_stream = [pp.post_process(o, abstract[i], graph[i])
                  for i, o in enumerate(sys_stream)]

    bleu = sacrebleu.corpus_bleu(sys_stream, ref_streams,
                                 force=True, lowercase=True,
                                 tokenize='none').score
    chrf = sacrebleu.corpus_chrf(sys_stream, ref_stream)
    return bleu, chrf

def validate(model, test_data, beam_size=8, alpha=0.6, max_time_step=100):
    """For development only."""
    pp = PostProcess()

    ref_stream = []
    sys_stream = []
    for batch in test_data:
        res = generate_batch(model, batch, beam_size, alpha, max_time_step)
        sys_stream.extend(res['token'])
        ref_stream.extend(batch['target'])
    assert len(sys_stream) == len(ref_stream)
    sys_stream = [pp.post_process(o) for o in sys_stream]
    ref_stream = [' '.join(o) for o in ref_stream]
    ref_streams = [ref_stream]

    bleu = sacrebleu.corpus_bleu(sys_stream, ref_streams,
                                 force=True, lowercase=False,
                                 tokenize='none').score
    chrf = sacrebleu.corpus_chrf(sys_stream, ref_stream)
    return bleu, chrf

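# For context: a minimal stand-in for the PostProcess detokenizer that both
# validate() functions above rely on. This is a sketch under the assumption that
# post_process() mainly joins generated tokens back into a plain string; the
# three-argument form used in the first validate() would additionally restore
# abstracted entities from `abstract` and `graph`, which is omitted here.
class PostProcessSketch:
    def post_process(self, tokens):
        return ' '.join(tokens)
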
import os

import numpy as np

from adapter import Adapt
from attention import NeuralModel
from postprocess import PostProcess
from preprocess import preprocess
from recommend import CFUtil
from utils.utils import transform

if __name__ == "__main__":
    np.random.seed(100)
    path = os.path.abspath('.')
    pst = PostProcess(path)
    samples, users, movies = preprocess()
    samples = samples[0:20000]
    users = transform(users)
    movies = transform(movies)
    # pst.saveSamples(samples, 'samples.csv')
    # pst.saveReviews(users, 'users.csv')
    # pst.saveReviews(movies, 'movies.csv')

    # load data
    # samples = pst.loadSamples('samples.csv')
    # users = pst.loadReviews('users.csv')
    # movies = pst.loadReviews('movies.csv')

def main():
    args = get_arguments()
    random.seed(args.random_seed)
    np.random.seed(args.random_seed)  # sklearn uses np to generate random values

    # Create folders and set the logging format.
    args.model_dir = os.path.join(args.out_dir, 'ckpt-{}'.format(args.class_weight_scheme))
    args.log_dir = os.path.join(args.out_dir, 'log')
    args.ensemble_dir = os.path.join(args.out_dir, 'ensemble-{}'.format(args.class_weight_scheme))
    if args.class_weight_scheme == 'customize':
        args.model_dir = os.path.join(args.model_dir, 'weight{}'.format(args.additional_weight))
        args.ensemble_dir = os.path.join(args.ensemble_dir, 'weight{}'.format(args.additional_weight))
    prepare_folders(args)
    logger = set_logging(args)
    logger.info("Here are the arguments of this run:")
    logger.info("{}".format(args))
    utils.check_args_conflict(args)

    # Set the files which contain the training and test data. Using "trecis2019-A" means we want to tune parameters.
    args.data_prefix = "trecis2019-B"
    # Note that for the 2019-B submission, all '2019' means '2019-B' and '2018' means '2018 + 2019-A'.
    label_file = os.path.join(args.data_dir, 'ITR-H.types.v{}.json'.format(
        4 if args.data_prefix == "trecis2019-B" else 3))
    tweet_file_list = [os.path.join(args.data_dir, 'all-tweets.txt')]
    tweet_file_list_2019 = [os.path.join(args.data_dir, 'all-tweets-2019.txt')]
    train_file_list = [os.path.join(args.data_dir, 'TRECIS-CTIT-H-Training.json')]
    train_file_list += [os.path.join(args.data_dir, 'TRECIS-2018-TestEvents-Labels',
                                     'assr{}.test'.format(i)) for i in range(1, 7)]
    if args.data_prefix == "trecis2019-B":
        train_file_list += [os.path.join(args.data_dir, '2019ALabels',
                                         '2019A-assr{}.json'.format(i)) for i in range(1, 6)]
        train_file_list += [os.path.join(args.data_dir, '2019ALabels', '2019-assr2.json')]
    test_raw_tweets_json_folder = 'download_tweets'

    # Some output files which have been formalized for further usage.
    formal_train_file = os.path.join(args.data_dir, 'train.txt{}'.format('_small' if args.sanity_check else ''))
    formal_test_file = os.path.join(args.data_dir, 'test.txt{}')
    tweet_text_out_file = os.path.join(args.out_dir, 'tweets-clean-text.txt')
    tweet_id_out_file = os.path.join(args.out_dir, 'tweets-id.txt')
    tweet_text_out_file_2019 = os.path.join(args.out_dir, 'tweets-clean-text-2019.txt')
    tweet_id_out_file_2019 = os.path.join(args.out_dir, 'tweets-id-2019.txt')
    predict_priority_score_out_file = os.path.join(args.out_dir, 'predict_priority_score.txt')

    # Set files for submission.
    args.model_name = '{0}{1}'.format(args.model, '-event' if args.event_wise else '')
    args.dev_label_file = os.path.join(args.ensemble_dir, 'dev_label.txt')
    args.dev_predict_file = os.path.join(args.ensemble_dir, 'dev_predict_{}.txt'.format(args.model_name))
    args.test_predict_file = os.path.join(args.ensemble_dir, 'test_predict_{}.txt'.format(args.model_name))
    args.submission_folder = utils.prepare_submission_folder(args)
    args.submission_file = os.path.join(args.submission_folder, 'submission_{}'.format(args.model_name))

    # As the original files provided by TREC are quite messy, we formalize them into train and test files.
    utils.formalize_files(train_file_list, formal_train_file, args)
    utils.formalize_test_file(test_raw_tweets_json_folder, formal_test_file, prefix=args.data_prefix)
    logger.info("The training data file is {0} and the testing data file is {1}".format(
        formal_train_file, formal_test_file))

    # Step0. Extract some info which can be used later (also useful for generating submission files).
    label2id, majority_label, short2long_label = utils.get_label2id(label_file, formal_train_file, args.cv_num)
    id2label = utils.get_id2label(label2id)
    class_weight = utils.get_class_weight(args, label2id, id2label, formal_train_file)

    # When getting the submission, there is no need to run all the following steps; only read the
    # `test_predict_file` and pick some classes as the final output according to a policy
    # (such as top-2 or auto-threshold).
    # You MUST run `--predict_mode` in advance to get the `test_predict_file` prepared.
    if args.get_submission:
        postpro = PostProcess(args, label2id, id2label, class_weight,
                              majority_label, short2long_label,
                              formal_train_file, formal_test_file,
                              test_raw_tweets_json_folder,
                              predict_priority_score_out_file)
        postpro.pick_labels_and_write_final_result()
        quit()

    # Step1. Preprocess and extract features for all tweets.
    tweetid_list, tweet_content_list = utils.get_tweetid_content(tweet_file_list)
    utils.write_tweet_and_ids(tweetid_list, tweet_content_list, tweet_text_out_file, tweet_id_out_file)
    tweetid_list_2019, tweet_content_list_2019 = utils.get_tweetid_content(tweet_file_list_2019)
    utils.write_tweet_and_ids(tweetid_list_2019, tweet_content_list_2019,
                              tweet_text_out_file_2019, tweet_id_out_file_2019)
    # Note that before `extract_features()`, we should manually run the `extract_features.sh` in `feature_tools`.
    # quit()
    # The `extract_features.sh` only needs to be run once for the same dataset.
    preprocess = Preprocess(args, tweetid_list, tweet_content_list, label2id, tweet_id_out_file)
    preprocess.extract_features()
    preprocess_2019 = Preprocess(args, tweetid_list_2019, tweet_content_list_2019, label2id,
                                 tweet_id_out_file_2019, test=True)
    preprocess_2019.extract_features()

    if args.train_regression:
        data_x, data_score = preprocess.extract_train_data(formal_train_file, get_score=True)
        train_regression = TrainRegression(args, data_x, data_score)
        if args.cross_validate:
            train_regression.cross_validate()
            quit()

    if args.cross_validate:
        # Step2. Train and cross-validate (for tuning hyper-parameters).
        # If we want to do ensemble in the future, we need the predictions on dev data
        # produced by setting `--cross_validate`.
        if args.event_wise:
            data_x, data_y, event2idx_list, line_num = preprocess.extract_train_data(formal_train_file)
            data_predict_collect = np.zeros([line_num, len(label2id)])
            metrics_collect = []
            metric_names = None
            for event_type in utils.idx2event_type:
                it_data_x, it_data_y = data_x[event_type], data_y[event_type]
                train = Train(args, it_data_x, it_data_y, id2label,
                              preprocess.feature_len, class_weight, event_type)
                metrics, predict_score = train.train()
                for i, idx in enumerate(event2idx_list[event_type]):
                    data_predict_collect[idx] = predict_score[i]
                metrics_collect.append((metrics, it_data_x.shape[0]))
                if metric_names is None:
                    metric_names = train.metric_names
            utils.get_final_metrics(metrics_collect, metric_names)
        else:
            data_x, data_y = preprocess.extract_train_data(formal_train_file)
            train = Train(args, data_x, data_y, id2label, preprocess.feature_len, class_weight)
            _, data_predict_collect = train.train()
        if args.predict_mode:
            utils.write_predict_and_label(args, formal_train_file, label2id, data_predict_collect)

    if args.predict_mode:
        # Step3. Get the 2019 test data, retrain the model on all training data, then predict on the 2019 test data.
        if args.event_wise:
            data_x, data_y, _, _ = preprocess.extract_train_data(formal_train_file)
            test_x, event2idx_list, line_num = preprocess_2019.extract_formalized_test_data(formal_test_file)
            test_predict_collect = np.zeros([line_num, len(label2id)])
            for event_type in utils.idx2event_type:
                it_data_x, it_data_y, it_test_x = data_x[event_type], data_y[event_type], test_x[event_type]
                if len(it_test_x) == 0:
                    print("[WARNING] No events belong to {} in the test data".format(event_type))
                    continue
                train = Train(args, it_data_x, it_data_y, id2label,
                              preprocess_2019.feature_len, class_weight, event_type)
                train.train_on_all()
                predict_score = train.predict_on_test(it_test_x)
                for i, idx in enumerate(event2idx_list[event_type]):
                    test_predict_collect[idx] = predict_score[i]
        else:
            data_x, data_y = preprocess.extract_train_data(formal_train_file)
            test_x = preprocess_2019.extract_formalized_test_data(formal_test_file)
            train = Train(args, data_x, data_y, id2label, preprocess_2019.feature_len, class_weight)
            train.train_on_all()
            test_predict_collect = train.predict_on_test(test_x)
        utils.write_predict_res_to_file(args, test_predict_collect)

    if args.train_regression:
        test_x = preprocess_2019.extract_formalized_test_data(formal_test_file)
        if args.event_wise:
            # For the event-wise setting, many additional things are extracted; we only need test_x.
            test_x = test_x[0]
        train_regression.train()
        predict_priority_score = train_regression.predict_on_test(test_x)
        utils.write_predict_score_to_file(predict_priority_score, predict_priority_score_out_file)

    if args.ensemble is not None:
        # TODO(junpeiz): Average the priority score for ensemble.
        # Step4 (optional). Ensemble the different models.
        if args.event_wise:
            raise NotImplementedError("We don't want to ensemble for event-wise models")
        else:
            out_file = os.path.join(args.out_dir, 'ensemble_out.txt')
            # Note the file list contains predictions from all models, with and without the '-event' suffix,
            # so we need to train both event-wise and non-event-wise models, or delete the stale files in the folder.
            dev_predict_file_list = utils.get_predict_file_list(args.ensemble_dir, 'dev_predict_')
            test_predict_file_list = utils.get_predict_file_list(args.ensemble_dir, 'test_predict_')
            train_x = utils.get_ensemble_feature(dev_predict_file_list)
            train_y = utils.get_ensemble_label(args.dev_label_file)
            print("The shape of ensemble train_x is {0}".format(train_x.shape))
            utils.ensemble_cross_validate(train_x, train_y, id2label, train.mlb, args.ensemble)
            test_x = utils.get_ensemble_feature(test_predict_file_list)
            predict = utils.ensemble_train_and_predict(train_x, train.mlb.transform(train_y), test_x,
                                                       id2label, args.ensemble)
            predict = [id2label[x] for x in predict]
            with open(out_file, 'w', encoding='utf8') as f:
                for it_predict in predict:
                    f.write("{}\n".format(it_predict))
            print("The ensemble result has been written to {}".format(out_file))

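# A minimal sketch of how `utils.get_final_metrics(metrics_collect, metric_names)`
# above could aggregate the per-event results, assuming each `metrics` entry is a
# sequence aligned with `metric_names`. Since `metrics_collect` holds
# (metrics, sample_count) pairs, a natural aggregate is the sample-weighted average
# of each metric. This illustrates the aggregation step only; it is not the
# project's actual implementation.
def get_final_metrics_sketch(metrics_collect, metric_names):
    total = sum(count for _, count in metrics_collect)
    for i, name in enumerate(metric_names):
        weighted = sum(m[i] * count for m, count in metrics_collect) / total
        print("{}: {:.4f}".format(name, weighted))
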
def __init__(self, generator, height=48, width=48, batch_size=1):
    super(EvalPuppet, self).__init__()
    self.batch_size = batch_size
    self.generator = generator
    self.postprocess = PostProcess(height, width)
    self.evaluate = Eval()

def get_imp_keywords_for_year(self, year):
    year_docs_word_tuple_dict, year_words_list = self.get_keywords_for_year(year)
    post_process = PostProcess()
    return year_docs_word_tuple_dict, post_process.find_common_words(year_words_list, 1000)

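# An illustrative guess at the helper used above, not the project's actual code:
# PostProcess.find_common_words(words, n) plausibly returns the n most frequent
# words, which collections.Counter expresses directly.
from collections import Counter

def find_common_words_sketch(words, n=1000):
    return [word for word, _ in Counter(words).most_common(n)]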