def do_niv(X_test, X_train, T_train, Y_train, n_niv_params, dataset_name,
           fold_idx):
    niv_filename = 'niv_' + dataset_name
    fold_name = 'fold' + str(fold_idx + 1)
    niv_vars = load_json(niv_filename)
    survived_vars = niv_vars.get(fold_name) if niv_vars else None
    if survived_vars:
        print('Stored NIV:', survived_vars)
        X_test = X_test[survived_vars]
        X_train = X_train[survived_vars]
    else:
        niv_start_time = time.time()
        print('Start NIV variable selection')
        survived_vars = niv_variable_selection(X_train, Y_train, T_train,
                                               n_niv_params)
        print('NIV:', list(survived_vars))
        X_train = X_train[survived_vars]
        X_test = X_test[survived_vars]
        niv_end_time = time.time()
        print('NIV time:', niv_end_time - niv_start_time)
        if niv_vars:
            niv_vars.update({fold_name: survived_vars.tolist()})
        else:
            niv_vars = {fold_name: survived_vars.tolist()}
        save_json(niv_filename, niv_vars)
    return X_test, X_train
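# Minimal usage sketch, not from the original source: 'X', 'T' and 'Y' are
# assumed pandas objects, and do_niv plus its helpers (load_json, save_json,
# niv_variable_selection) are assumed to be in scope; the dataset name and
# n_niv_params value are illustrative.
def run_niv_folds(X, T, Y, n_splits=5):
    from sklearn.model_selection import KFold
    for fold_idx, (train_idx, test_idx) in enumerate(
            KFold(n_splits=n_splits, shuffle=True, random_state=0).split(X)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        T_train, Y_train = T.iloc[train_idx], Y.iloc[train_idx]
        # do_niv caches the selected variables per fold in niv_<dataset>.json
        X_test, X_train = do_niv(X_test, X_train, T_train, Y_train,
                                 n_niv_params=20, dataset_name='example',
                                 fold_idx=fold_idx)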
def merge_datasets(dataset_filenames, out_filename):
    # The assumption here is that the abstract text and detected entities are
    # identical across datasets; only the relations differ. For ChemProt and
    # DrugProt this is the case, see compare_datasets.py.
    datasets = []
    for filename in dataset_filenames:
        datasets.append(utils.read_json(filename))

    # map all datasets to CPR-X relation labels
    for ds in datasets:
        map_to_cpr(ds)

    merged = datasets[0]
    for ds in datasets[1:]:
        for article_id, article in ds.items():
            # if this article is unique to this dataset, just add it;
            # its relations have already been converted to CPR-X
            if article_id not in merged:
                merged[article_id] = article
            else:
                print(f'merging {article_id}')
                merge_article(merged[article_id], article)

    # stats
    total_relations = 0
    for article_id, article in merged.items():
        for sent in article['abstract']:
            total_relations += len(sent['relations'])
    print(f'number of relations in merged dataset: {total_relations}')

    utils.save_json(out_filename, merged)
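# Hypothetical invocation; the file names are illustrative, not from the
# original repository:
if __name__ == '__main__':
    merge_datasets(['chemprot_train.json', 'drugprot_train.json'],
                   'merged_train.json')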
def main(base_model_name, weights_file, image_source, predictions_file,
         img_format='jpg'):
    # load samples
    if os.path.isfile(image_source):
        image_dir, samples = image_file_to_json(image_source)
    else:
        image_dir = image_source
        samples = image_dir_to_json(image_dir, img_type=img_format)

    # build model and load weights
    nima = Nima(base_model_name, weights=None)
    nima.build()
    nima.nima_model.load_weights(weights_file)

    # initialize data generator
    data_generator = TestDataGenerator(samples, image_dir, 64, 10,
                                       nima.preprocessing_function(),
                                       img_format=img_format)

    # get predictions
    predictions = predict(nima.nima_model, data_generator)

    # calc mean scores and add to samples
    for i, sample in enumerate(samples):
        sample['mean_score_prediction'] = calc_mean_score(predictions[i])

    print(json.dumps(samples, indent=2))

    if predictions_file is not None:
        save_json(samples, predictions_file)
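# Hypothetical entry point, not from the original source: the weights path
# and image directory are placeholders, and 'MobileNet' is just one possible
# base model name.
if __name__ == '__main__':
    main(base_model_name='MobileNet',
         weights_file='weights/mobilenet_aesthetic.hdf5',
         image_source='test_images/',
         predictions_file='predictions.json')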
def save_area_btn(self):
    _area_dict = {
        'name': 'area1',
        'areaName': '',
        'area': '{"abs": [], "points": []}',
        'alertType': '1',
        'day': '1111111',
        'hour': '0,24',
        'minute': '0'
    }
    _name = "123"
    _H_start = '0'
    _H_end = '24'
    _sec = '1'
    _points = self.main.win.get_draw_point()
    _weekday = '1111111'
    _abs = []
    _1080points = normalize_points(list(_points),
                                   self.main.canvasHandler.get_size())
    # close the polygon so each consecutive pair of points forms an edge
    _1080points.append(_1080points[0])
    for i in range(len(_1080points) - 1):
        x1, y1 = _1080points[i]
        x2, y2 = _1080points[i + 1]
        if x1 - x2 == 0:
            # vertical edge: the slope is infinite
            a = float('inf')
            b = 0
        else:
            # slope and intercept of the edge line y = a * x + b
            a = round((y1 - y2) / (x1 - x2), 3)
            b = round(y1 - x1 * a, 3)
        _abs.append((a, b))
    _1080points.pop()
    _area = json.dumps({'abs': _abs, 'points': _1080points})
    _area_dict['area'] = _area
    save_json('./area.txt', _area_dict)
    self.main.win.add_point(_points[0])
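# Worked example of the edge-coefficient computation above (not from the
# original source): for the edge from (0, 0) to (2, 4),
# a = (0 - 4) / (0 - 2) = 2.0 and b = 0 - 0 * 2.0 = 0.0, i.e. the line y = 2x.
def edge_coefficients(p1, p2):
    x1, y1 = p1
    x2, y2 = p2
    if x1 == x2:
        return float('inf'), 0
    a = round((y1 - y2) / (x1 - x2), 3)
    return a, round(y1 - x1 * a, 3)

assert edge_coefficients((0, 0), (2, 4)) == (2.0, 0.0)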
def config(self):
    self._set_backup_folder()
    self._set_bucket()
    self._set_time_interval()
    self._set_salt()
    self._set_control_key()
    self._create_password_test_file()
    self._stat_cache = StatCache(self._stat_cache_dir, self._backup_folder)
    self._object_db = ObjectDB(self._object_db_path)
    config = {
        "backup_folder": self._backup_folder,
        "bucket": self._bucket,
        "time_interval": self._time_interval,
    }
    save_json(config, self._CONFIG_FILEPATH)
    print(config)
def select_relations(visualgenome_path, house_objects_path, model_path):
    ''' Select relations describing how often attributes belong to objects of the house domain '''
    attribute_frequency = load_json(
        join(visualgenome_path, 'attribute_frequencies.json'))
    groups = classification(attribute_frequency.values(), model_path)
    save_json(groups, join(visualgenome_path, 'attribute_classes.json'))
    if 'others' in groups:
        del groups['others']
    attribute_knowledge, relations = extract_knowledge(attribute_frequency,
                                                       groups)
    house_objects = {
        v.replace(' ', '_'): k['dbpedia_uri']
        for v, k in load_json(house_objects_path).items()
    }
    save_json(attribute_knowledge,
              join(visualgenome_path, 'attribute_knowledge.json'))
    create_triples(relations, house_objects, visualgenome_path)
def create_dataset(option, frame_raw_path, frame_parsed_path):
    ''' Create a dataset of frame triples according to a Validator (by core, by synset or by embeddings) '''
    obj_validator = None
    if option == 'core':
        frame_types_path = join(
            dirname(__file__),
            '../../resource/frames/annotations_frame_types/')
        obj_validator = Validator_By_Core(frame_types_path)
    elif option == 'synset':
        frame_elements_path = join(
            dirname(__file__),
            '../../resource/frames/annotations_frame_elements/')
        obj_validator = Validator_By_Synset(frame_elements_path)
    elif option == 'embeddings':
        embeddings_path = join(
            dirname(__file__),
            '../../resource/embeddings/googlenews_negative300')
        obj_validator = Validator_By_Embeddings(embeddings_path)
    else:
        logging.error('Unknown "%s" option of frame validator' % option)

    if obj_validator:
        frame_instances = read_folder_frames(frame_raw_path,
                                             delete_repetition=False)
        filtered_frames = filter_instances(frame_instances, obj_validator)
        prototypical_frames = find_prototypical_instances(filtered_frames)
        save_json(prototypical_frames,
                  join(frame_parsed_path, 'frame_instances.json'))
        logging.info('Selected %s prototypical frames' %
                     len(prototypical_frames))
def get_relations(object_name, relation, conceptnet_path, limit=100):
    ''' Get relations of an object through the ConceptNet RESTful API '''
    base_query = ('http://api.conceptnet.io/query'
                  '?node=/c/en/%s&rel=/r/%s&offset=%d&limit=%d')
    data = {}
    flag = True
    index = 0
    while flag:
        try:
            data = requests.get(base_query %
                                (object_name, relation, index, limit)).json()
            save_json(
                data,
                join(conceptnet_path,
                     '%s_%s_%d.json' % (object_name, relation, index)))
        except Exception:
            query = base_query % (object_name, relation, index, limit)
            logging.error('Corrupted JSON file in "%s"' % query)
        # follow pagination until the API stops returning a next page
        if 'view' in data and 'nextPage' in data['view']:
            index += limit
        else:
            flag = False
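# Hypothetical usage, not from the original source: fetch every page of
# /r/AtLocation edges for 'kitchen', writing one JSON file per page. Requires
# network access and an existing 'conceptnet_dumps/' directory.
if __name__ == '__main__':
    get_relations('kitchen', 'AtLocation', 'conceptnet_dumps/', limit=100)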
def create_dataset(visualgenome_raw_path, visualgenome_parsed_path):
    ''' Create a dataset of objects and their attributes using the VisualGenome dataset '''
    visualgenome_data = load_json(
        join(visualgenome_raw_path, 'attributes.json'))
    attribute_synsets = load_json(
        join(visualgenome_raw_path, 'attribute_synsets.json'))
    frequency_data = {}
    for image in visualgenome_data:
        objects = set()
        for attribute_data in image['attributes']:
            # only keep annotations that have attributes and exactly one
            # unambiguous synset
            if 'attributes' in attribute_data and len(
                    set(attribute_data['synsets'])) == 1:
                object_name = attribute_data['synsets'][0]
                assigned = assign_attribute(object_name,
                                            attribute_data['attributes'],
                                            attribute_synsets, frequency_data)
                if assigned and object_name not in objects:
                    objects.add(object_name)
                    frequency_data[object_name]['images'] += 1
    logging.info('Size: %s objects selected' % len(frequency_data))
    save_json(frequency_data,
              join(visualgenome_parsed_path, 'attribute_frequencies.json'))
def save_metrics(self, latest_metrics, type_path) -> None:
    self.metrics[type_path].append(latest_metrics)
    save_json(self.metrics, self.metrics_save_path)
def generate_summaries_or_translations(
    data_dir: str,
    out_dir: str,
    model_path: str,
    config_path: str,
    batch_size: int = 8,
    device: str = DEFAULT_DEVICE,
    fp16=False,
    task="summarization",
    prefix=None,
    max_source_length=1024,
    max_target_length=142,
    eval_beams=5,
    eval_max_gen_length=142,
    n_obs=-1,
    type_path="test",
    num_return_sequences=1,
    distill=None,
    num_layers=None,
    do_encoder=False,
    do_decoder=False,
    **generate_kwargs,
) -> Tuple[List, int, Dict]:
    out_dir = Path(out_dir)
    save_path = out_dir.joinpath(
        f"rank_{utils.distributed_utils.get_rank()}_output.json")

    if num_return_sequences > eval_beams:
        eval_beams = num_return_sequences

    # Define BART model.
    # Config from https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-large-cnn/config.json
    # Vocab modified to 50265 to be consistent with the facebook/bart-large default.
    config = BartConfig(**json.load(open(config_path, "r")))
    config.fp16 = fp16
    model = BartForConditionalGeneration.from_pretrained(
        model_path, config=config).to(device)

    # if distilling, change model
    if distill == "sft":
        model = distill_sft(model, num_layers, do_encoder, do_decoder)

    if fp16:
        model = model.half()
    model.eval()

    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
    logger.info(f"Inferred tokenizer type: {tokenizer.__class__}"
                )  # if this is wrong, check config.model_type

    start_time = time.time()
    # update config with task specific params
    use_task_specific_params(model, task)
    if prefix is None:
        prefix = getattr(model.config, "prefix", "") or ""
    ds = Seq2SeqDataset(tokenizer,
                        data_dir,
                        max_source_length,
                        max_target_length,
                        type_path=type_path,
                        n_obs=n_obs,
                        prefix=prefix)

    # shuffle=True gives a more accurate progress bar: if all the longest
    # samples come first, the estimate is too high at the beginning
    is_distributed = utils.distributed_utils.get_world_size() > 1
    sampler = ds.make_sortish_sampler(batch_size,
                                      distributed=is_distributed,
                                      add_extra_examples=False,
                                      shuffle=True)
    data_loader = DataLoader(ds,
                             sampler=sampler,
                             batch_size=batch_size,
                             collate_fn=ds.collate_fn)

    results = []
    with torch.no_grad():
        for batch in tqdm(data_loader):
            t0 = time.time()
            summaries = model.generate(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
                use_cache=True,
                num_return_sequences=num_return_sequences,
                num_beams=eval_beams,
                max_length=eval_max_gen_length,
                num_beam_groups=1,
                output_scores=False,
                return_dict_in_generate=False,
                encoder_no_repeat_ngram_size=0,
                diversity_penalty=0.0,
                **generate_kwargs,
            )
            preds = tokenizer.batch_decode(summaries,
                                           skip_special_tokens=True,
                                           clean_up_tokenization_spaces=False)
            ids = batch["ids"]
            if num_return_sequences > 1:
                # batch size chunks, each of size num_return_sequences
                preds = chunks(preds, num_return_sequences)
            eval_time = time.time() - t0
            for i, pred in enumerate(preds):
                # only store latency for element 0 of every batch
                store_time = eval_time if i == 0 else None
                results.append(
                    dict(pred=pred, id=ids[i].item(), eval_time=store_time))
    save_json(results, save_path)
    runtime = int(time.time() - start_time)  # seconds
    num_replicas = sampler.num_replicas if is_distributed else 1
    n_obs = len(results)
    return results, num_replicas, dict(n_obs=n_obs,
                                       eval_only_runtime=runtime,
                                       seconds_per_sample=round(
                                           runtime / n_obs, 4))
def run_generate(verbose=True):
    """
    Takes input text, generates output, and then using reference calculates the BLEU scores.

    The results are saved to a file and returned to the caller, and printed out unless ``verbose=False`` is passed.

    Args:
        verbose (:obj:`bool`, `optional`, defaults to :obj:`True`): print results to stdout

    Returns:
        a tuple: ``(scores, params)``
        - ``scores``: a dict of scores data ``{'bleu': 39.6501, 'n_obs': 2000, 'runtime': 186, 'seconds_per_sample': 0.093}``
        - ``params``: a dict of custom params, e.g. ``{'num_beams': 5, 'length_penalty': 0.8}``
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("model_path", type=str,
                        help="like facebook/bart-large-cnn or path to ckpt")
    parser.add_argument("config_path", type=str, help="path to config")
    parser.add_argument("data_dir", type=str, help="like cnn_dm/test.source")
    parser.add_argument("save_path", type=str, help="where to save summaries")
    parser.add_argument("--type_path", type=str, required=False,
                        default="test", help="like cnn_dm/test.target")
    parser.add_argument("--device", type=str, required=False,
                        default=DEFAULT_DEVICE, help="cuda, cuda:1, cpu etc.")
    parser.add_argument("--prefix", type=str, required=False, default=None,
                        help="will be added to the beginning of src examples")
    parser.add_argument("--task", type=str, default="summarization",
                        help="used for task_specific_params + metrics")
    parser.add_argument("--bs", type=int, default=8, required=False,
                        help="batch size")
    parser.add_argument("--n_obs", type=int, default=None, required=False,
                        help="How many observations. Defaults to all.")
    parser.add_argument("--num_return_sequences", type=int, default=1,
                        required=False, help="How many sequences to return")
    parser.add_argument("--fp16", action="store_true")
    parser.add_argument("--dump-args", action="store_true",
                        help="print the custom hparams with the results")
    parser.add_argument(
        "--info",
        nargs="?",
        type=str,
        const=datetime_now(),
        help=
        "use in conjunction w/ --dump-args to print with the results whatever other info you'd like, e.g. lang=en-ru. If no value is passed, the current datetime string will be used.",
    )
    parser.add_argument("--eval_max_gen_length", type=int, default=None,
                        help="never generate more than n tokens")
    parser.add_argument(
        "--eval_beams", type=int, default=None, required=False,
        help="# beams to use. 0 corresponds to not using beam search.")
    parser.add_argument(
        "--max_source_length",
        default=1024,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--max_target_length",
        default=142,
        type=int,
        help=
        "The maximum total output sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--sync_timeout",
        type=int,
        default=600,
        required=False,
        help=
        "How long the master process should wait for other processes to finish.",
    )
    parser.add_argument("--debug", action="store_true")
    parser.add_argument('--json-summary', type=str,
                        default="results/dllogger.json",
                        help='If provided, the json summary will be written to '
                        'the specified file.')
    parser.add_argument(
        '--distill', type=str, default=None,
        help="string indicating how the model is distilled, only sft supported",
        choices=["sft", None])
    parser.add_argument(
        '--layers', type=str, default=None,
        help="string indicating which teacher layers remain, split by '-' (ex. 0-6-11)")
    parser.add_argument('--do_encoder', action="store_true", default=False,
                        help="if true, the encoder is distilled")
    parser.add_argument('--do_decoder', action="store_true", default=False,
                        help="if true, the decoder is distilled")
    dist = parser.add_argument_group('distributed setup')
    dist.add_argument('--local_rank', type=int,
                      default=os.getenv('LOCAL_RANK', 0),
                      help='Used for multi-process training.')

    start_time = time.time()

    # Unspecified args like --num_beams=2 --decoder_start_token_id=4 are passed to model.generate
    args, rest = parser.parse_known_args()
    parsed_args = parse_numeric_n_bool_cl_kwargs(rest)

    if args.local_rank <= 0:
        print(args)
        print(rest)

    # Initialize device and distributed backend
    utils.distributed_utils.init_distributed(args.device == "cuda")
    if utils.distributed_utils.get_world_size() > 1:
        utils.distributed_utils.set_affinity(args.local_rank)
        torch.cuda.set_device(args.local_rank)

    if Path(args.json_summary).exists():
        warnings.warn(
            f"json_summary {args.json_summary} will be overwritten unless you type ctrl-c."
        )

    if utils.distributed_utils.get_rank() == 0:
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=args.json_summary),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                   step_format=format_step)
        ])
    else:
        dllogger.init(backends=[])

    if parsed_args and verbose:
        print(f"parsed the following generate kwargs: {parsed_args}")

    Path(args.save_path).parent.mkdir(exist_ok=True)
    json_save_path = Path(args.save_path + "/tmp")
    Path(json_save_path).mkdir(exist_ok=True)  # this handles locking.

    if args.layers:
        num_layers = len(args.layers.split('-'))
    else:
        num_layers = None

    results, num_replicas, runtime_metrics = generate_summaries_or_translations(
        args.data_dir,
        json_save_path,
        args.model_path,
        args.config_path,
        batch_size=args.bs,
        device=args.device,
        fp16=args.fp16,
        task=args.task,
        prefix=args.prefix,
        eval_beams=args.eval_beams,
        max_source_length=args.max_source_length,
        max_target_length=args.max_target_length,
        eval_max_gen_length=args.eval_max_gen_length,
        n_obs=args.n_obs,
        type_path=args.type_path,
        num_return_sequences=args.num_return_sequences,
        distill=args.distill,
        num_layers=num_layers,
        do_encoder=args.do_encoder,
        do_decoder=args.do_decoder,
        **parsed_args,
    )

    if args.local_rank <= 0:
        save_path = Path(args.save_path)
        save_path.mkdir(exist_ok=True)
        partial_results = gather_results_from_each_node(
            num_replicas, json_save_path, args.sync_timeout)
        preds, time_list = combine_partial_results(partial_results)
        if args.num_return_sequences > 1:
            save_path = save_path.joinpath("pseudolabel_results.json")
            print(
                f"Saving aggregated results at {save_path}, intermediate in {json_save_path}/"
            )
            save_json(preds, save_path)
            return
        tgt_file = Path(args.data_dir).joinpath(args.type_path + ".target")
        labels = [x.rstrip() for x in open(tgt_file).readlines()][:len(preds)]

        # Calculate metrics, save metrics, and save _generations.txt
        calc_bleu = "translation" in args.task
        score_fn = calculate_bleu if calc_bleu else calculate_rouge
        metric_name = "bleu" if calc_bleu else "rouge"
        metrics: Dict = score_fn(preds, labels)
        metrics["n_obs"] = len(preds)
        runtime = time.time() - start_time
        metrics["seconds_per_sample"] = round(runtime / metrics["n_obs"], 4)
        metrics["n_gpus"] = num_replicas
        metrics.update(runtime_metrics)

        # latency percentiles over the sorted per-batch timings
        time_list.sort()
        metrics["inference_latency_mean"] = np.mean(time_list)
        metrics["inference_latency_conf_50"] = max(
            time_list[:int(len(time_list) * 0.50)])
        metrics["inference_latency_conf_90"] = max(
            time_list[:int(len(time_list) * 0.90)])
        metrics["inference_latency_conf_95"] = max(
            time_list[:int(len(time_list) * 0.95)])
        metrics["inference_latency_conf_99"] = max(
            time_list[:int(len(time_list) * 0.99)])
        metrics["inference_latency_conf_100"] = max(
            time_list[:int(len(time_list) * 1)])
        metrics["inference_throughput_mean"] = len(preds) * 1.0 / sum(time_list)

        metrics_save_path = save_path.joinpath(
            f"{args.type_path}_{metric_name}.json")
        save_json(metrics, metrics_save_path, indent=None)
        dllogger.log(step=tuple(), data=metrics)
        print(metrics)
        write_txt_file(
            preds, save_path.joinpath(f"{args.type_path}_generations.txt"))
        if args.debug:
            write_txt_file(labels,
                           save_path.joinpath(f"{args.type_path}.target"))
        else:
            shutil.rmtree(json_save_path)

    dllogger.flush()
#!/usr/bin/env python3
import sys

from utils.cr_utils import get_sample
from utils.utils import save_json

# draw a sample of 10,000 works that have references and save it
sample = get_sample(10000, {'has-references': True})
save_json(sample, sys.argv[1])

references = [r for item in sample for r in item.get('reference', [])]
# references with a publisher-asserted DOI
doi_publ = [r for r in references if r.get('doi-asserted-by') == 'publisher']
# references with a Crossref-asserted DOI that are structured (have year or author)
doi_cr_str = [
    r for r in references if r.get('doi-asserted-by') == 'crossref' and (
        'year' in r or 'author' in r)
]
# references with a Crossref-asserted DOI that are unstructured
doi_cr_uns = [
    r for r in references if r.get('doi-asserted-by') == 'crossref'
    and 'year' not in r and 'author' not in r
]
# unmatched structured references
no_match_str = [
    r for r in references if 'DOI' not in r and ('year' in r or 'author' in r)
]
# unmatched unstructured references
no_match_uns = [
    r for r in references
    if 'DOI' not in r and 'year' not in r and 'author' not in r
]
# print the fraction of references falling into each category
print(','.join([
    str(len(e) / len(references))
    for e in [doi_publ, doi_cr_uns, doi_cr_str, no_match_uns, no_match_str]
]))
def save_dataset(ref_strings, file_path):
    save_json(ref_strings, file_path)
    logging.info('Dataset written to {}'.format(file_path))
with Pool(config.THREADS) as p:
    results = p.map(
        matcher.match,
        [r['reference']['unstructured'] for r in refs_unstructured])
for d, r in zip(refs_unstructured, results):
    d.update({'sbmv_unstructured': {'DOI': r[0], 'score': r[1]}})

matcher = matching.openurl_query_matcher.Matcher()
with Pool(config.THREADS) as p:
    results = p.map(matcher.match,
                    [r['reference'] for r in refs_structured])
for d, r in zip(refs_structured, results):
    d.update({'open_url': r[0]})

matcher = matching.stq_matcher.Matcher()
with Pool(config.THREADS) as p:
    results = p.map(
        matcher.match,
        [r['reference']['unstructured'] for r in refs_unstructured])
for d, r in zip(refs_unstructured, results):
    d.update({'simple_text_query': r[0]})

for d in data:
    d.update({'gt': ''})

save_json(data, args.output)
#!/usr/bin/env python3
import sys
from multiprocessing import Pool

import matching.cr_search_validation_matcher
from evaluation.link_metrics import LinkMetricsResults
from utils.utils import read_json, save_json

dataset = read_json(sys.argv[1])['dataset']

matcher = matching.cr_search_validation_matcher.Matcher(0.4, 0.34, [])
with Pool(10) as p:
    results = p.map(matcher.match,
                    [item.get('ref_string') for item in dataset])
for item, target in zip(dataset, results):
    item['target_test']['DOI'] = target[0]

save_json(dataset, sys.argv[2])

link_results = LinkMetricsResults(dataset)
print(','.join(
    [str(link_results.get(m)) for m in ['precision', 'recall', 'F1']]))
def main(ent_file, rel_file, abs_file, out_file, verbose=False):
    processed_data = process_dataset(ent_file, rel_file, abs_file, verbose)
    utils.save_json(out_file, processed_data)
def save_sample_data(sample_data, file_path):
    save_json(sample_data, file_path)
    logging.info('Sample data written to {}'.format(file_path))
# hard-coded mapping from sentiment label to class index
cat2idx = {
    'Extremely Negative': 0,
    'Negative': 1,
    'Neutral': 2,
    'Positive': 3,
    'Extremely Positive': 4,
}

train_dic, dev_dic, test_dic = {}, {}, {}
train_dic["texts"] = list(train["OriginalTweet"])
train_dic["categories"] = [cat2idx[cat] for cat in train['Sentiment']]
dev_dic["texts"] = list(dev["OriginalTweet"])
dev_dic["categories"] = [cat2idx[cat] for cat in dev['Sentiment']]
test_dic["texts"] = list(test["OriginalTweet"])
test_dic["categories"] = [cat2idx[cat] for cat in test['Sentiment']]

train_idx = list(train.index)
dev_idx = list(dev.index)

os.makedirs(f"{_data_root}/info", exist_ok=True)

save_json(f"{_data_root}/train.json", train_dic)
save_json(f"{_data_root}/dev.json", dev_dic)
save_json(f"{_data_root}/test.json", test_dic)
save_json(f"{_data_root}/info/cat2idx.json", cat2idx)
save_json(f"{_data_root}/info/train_idx.json", train_idx)
save_json(f"{_data_root}/info/dev_idx.json", dev_idx)
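# The snippets above all rely on small save_json/load_json helpers. A minimal
# sketch of what such helpers typically look like; note that the argument
# order varies between the projects above (some call save_json(data, path),
# others save_json(path, data)), so this is an assumption, not the exact
# utility from any one of them.
import json

def save_json(data, file_path):
    # serialize 'data' as pretty-printed UTF-8 JSON at 'file_path'
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

def load_json(file_path):
    # return the parsed JSON content of 'file_path'
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)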