def from_args(cls, args, options=''):
    """Initialize this class from some cli arguments. Used in train, test."""
    # Register every custom option on the parser before parsing.
    for opt in options:
        args.add_argument(*opt.flags, default=None, type=opt.type)
    if not isinstance(args, tuple):
        args = args.parse_args()

    if args.device is not None:
        os.environ["CUDA_VISIBLE_DEVICES"] = args.device

    if args.resume is not None:
        # Resuming: the config lives next to the checkpoint.
        resume = Path(args.resume)
        cfg_fname = resume.parent / 'config.json'
    else:
        # Fresh run: a config file is mandatory.
        msg_no_cfg = "Configuration file need to be specified. Add '-c config.json', for example."
        assert args.config is not None, msg_no_cfg
        resume = None
        cfg_fname = Path(args.config)

    config = read_json(cfg_fname)
    if args.config and resume:
        # update new config for fine-tuning
        config.update(read_json(args.config))

    # parse custom cli options into dictionary
    modification = {
        opt.target: getattr(args, _get_opt_name(opt.flags))
        for opt in options
    }
    return cls(config, resume, modification)
def __init__(self, root, phase, tokenizer, max_len):
    """Text-classification dataset.

    :param root: dataset root directory containing `info/cat2idx.json`
        and `<phase>.json`.
    :param phase: split name, e.g. 'train' / 'val' / 'test'.
    :param tokenizer: tokenizer exposing `pad_token_id` (may be None).
    :param max_len: maximum token sequence length.
    """
    self.root = root
    self.phase = phase
    self.tokenizer = tokenizer
    self.max_len = max_len
    self.cat2idx = read_json(f"{root}/info/cat2idx.json")
    # BUG FIX: cat2idx maps category -> index, so items() yields
    # (category, index) pairs. The original comprehension unpacked them as
    # (idx, cat) and therefore built a copy of cat2idx rather than its
    # inverse; swap the unpacking to get the true index -> category map.
    self.idx2cat = {idx: cat for cat, idx in self.cat2idx.items()}
    self.n_outputs = len(self.cat2idx)
    data = read_json(f"{root}/{phase}.json")
    self.texts = data['texts']
    # The test split ships without labels.
    self.categories = data['categories'] if phase != "test" else None
    # Fall back to 0 when the tokenizer defines no pad token.
    self.pad_token_id = 0 if self.tokenizer.pad_token_id is None else self.tokenizer.pad_token_id
def test_full(self, datadir):
    """Full split-by-reference-attribute evaluation over the test dataset."""
    data = read_json(datadir + 'test_dataset.json')

    results = SplitByRefAttrResults(data.get('dataset'), 'style',
                                    data.get('dataset_dois'))
    metrics = results.get(dfk.EVAL_SPLIT_METRICS)
    assert metrics.shape == (2, 13)

    # Restrict the target-document list to three DOIs and re-evaluate.
    target_dois = [
        '10.1103/physrevb.67.134406', '10.1159/000408205',
        '10.1002/chin.199827068'
    ]
    results = SplitByRefAttrResults(data.get('dataset'), 'style', target_dois)
    metrics = results.get(dfk.EVAL_SPLIT_METRICS).sort_values(by='style')

    assert metrics.shape == (2, 13)
    assert metrics['style'].tolist() == ['apa', 'ieee']
    assert metrics['correct ref links (fraction)'].tolist() == \
        approx([2/5, 1/2])
    assert metrics['correct missing ref links (fraction)'].tolist() == \
        approx([1/10, 1/10])
    assert metrics['incorrect ref links (fraction)'].tolist() == \
        approx([2/5, 1/10])
    assert metrics['incorrect existing ref links (fraction)'].tolist() == \
        approx([0, 3/10])
    assert metrics['incorrect missing ref links (fraction)'].tolist() == \
        approx([1/10, 0])
    assert metrics['accuracy'].tolist() == approx([5 / 10, 3 / 5])
    assert metrics['average precision over target docs'].tolist() == \
        approx([1, 2/3])
    assert metrics['average recall over target docs'].tolist() == \
        approx([2/3, 1])
    assert metrics['average F1 over target docs'].tolist() == \
        approx([2/3, 2/3])
    assert metrics['precision'].tolist() == approx([1 / 2, 5 / 9])
    assert metrics['recall'].tolist() == approx([4 / 9, 5 / 6])
def _extract_1stframe(self, dir_path, json_path, relabel):
    """Build tracklets from `dir_path`, keeping only the FIRST frame path of
    each tracklet, and cache the result in `json_path`.

    :param dir_path: directory of per-identity subdirectories.
    :param json_path: cache file for the generated split.
    :param relabel: when True, remap raw person ids to dense labels.
    :return: list of (first_image_path, pid, camid) tuples.
    """
    # Reuse a previously generated split when available.
    if osp.exists(json_path):
        print("=> {} generated before, awesome!".format(json_path))
        split = read_json(json_path)
        return split['tracklets']

    print(
        "=> Automatically generating split (might take a while for the first time, have a coffe)"
    )
    pdirs = glob.glob(osp.join(dir_path, '*'))  # avoid .DS_Store
    print("Processing {} with {} person identities".format(
        dir_path, len(pdirs)))

    # Collect every raw person id, then map ids to dense labels.
    pid_container = {int(osp.basename(pdir)) for pdir in pdirs}
    pid2label = {pid: label for label, pid in enumerate(pid_container)}

    tracklets = []
    for pdir in pdirs:
        pid = int(osp.basename(pdir))
        if relabel:
            pid = pid2label[pid]
        for tdir in glob.glob(osp.join(pdir, '*')):
            raw_img_paths = glob.glob(osp.join(tdir, '*.jpg'))
            num_imgs = len(raw_img_paths)
            if num_imgs < self.min_seq_len:
                continue

            img_paths = []
            for img_idx in range(num_imgs):
                # some tracklet starts from 0002 instead of 0001
                img_idx_name = 'F' + str(img_idx + 1).zfill(4)
                res = glob.glob(
                    osp.join(tdir, '*' + img_idx_name + '*.jpg'))
                if len(res) == 0:
                    print(
                        "Warn: index name {} in {} is missing, jump to next"
                        .format(img_idx_name, tdir))
                    continue
                img_paths.append(res[0])

            # Camera id is encoded at a fixed character position that
            # depends on the naming scheme (with/without underscores).
            img_name = osp.basename(img_paths[0])
            if img_name.find('_') == -1:
                # old naming format: 0001C6F0099X30823.jpg
                camid = int(img_name[5]) - 1
            else:
                # new naming format: 0001_C6_F0099_X30823.jpg
                camid = int(img_name[6]) - 1
            img_paths = tuple(img_paths)
            # Only the first frame of the tracklet is retained.
            tracklets.append((img_paths[0], pid, camid))

    print("Saving split to {}".format(json_path))
    write_json({'tracklets': tracklets}, json_path)
    return tracklets
def test_format_ref_string(self, datadir):
    """Reference strings rendered from a record in several citation styles."""
    record = read_json(datadir + 'test_record.json')

    assert format_ref_string(record, 'apa') == (
        'Tkaczyk, D., Szostek, P., Fedoryszak, M., Dendek, P. J., & '
        'Bolikowski, Ł. (2015). CERMINE: automatic extraction of '
        'structured metadata from scientific literature. '
        'International Journal on Document Analysis and Recognition '
        '(IJDAR), 18(4), 317–335.')

    assert format_ref_string(record, 'chicago-author-date') == (
        'Tkaczyk, Dominika, Paweł Szostek, Mateusz Fedoryszak, Piotr '
        'Jan Dendek, and Łukasz Bolikowski. 2015. “CERMINE: Automatic '
        'Extraction of Structured Metadata from Scientific '
        'Literature.” International Journal on Document Analysis and '
        'Recognition (IJDAR) 18 (4) (July 3): 317–335.')

    assert format_ref_string(record, 'modern-language-association') == (
        'Tkaczyk, Dominika et al. “CERMINE: Automatic Extraction of '
        'Structured Metadata from Scientific Literature.” '
        'International Journal on Document Analysis and Recognition '
        '(IJDAR) 18.4 (2015): 317–335.')

    assert format_ref_string(record, 'american-chemical-society') == (
        '(1) Tkaczyk, D.; Szostek, P.; Fedoryszak, M.; Dendek, P. J.; '
        'Bolikowski, Ł. International Journal on Document Analysis '
        'and Recognition (IJDAR) 2015, 18, 317–335.')

    assert format_ref_string(record, 'degraded_one_author') == (
        'Tkaczyk, Dominika. CERMINE: automatic extraction of '
        'structured metadata from scientific literature. '
        'International Journal on Document Analysis and Recognition '
        '(IJDAR). 2015. 18. 4. 317-335')

    assert format_ref_string(record, 'degraded_title_scrambled') == (
        'Tkaczyk, Dominika, Szostek, Paweł, Fedoryszak, Mateusz, '
        'Dendek, Piotr Jan, Bolikowski, Łukasz. automatic metadata '
        'from scientific of literature CERMINE: extraction '
        'structured. International Journal on Document Analysis and '
        'Recognition (IJDAR). 2015. 18. 4. 317-335')
def test_degraded_one_author(self, datadir):
    """Degradation that keeps only the first author of the record."""
    record = read_json(datadir + 'test_record.json')
    expected = (
        'Tkaczyk, Dominika. CERMINE: automatic extraction of '
        'structured metadata from scientific literature. '
        'International Journal on Document Analysis and Recognition '
        '(IJDAR). 2015. 18. 4. 317-335')
    assert degraded_one_author(record) == expected
def execute(self, conf_path: str, input_path: str, output_path: str,
            on_adls: bool):
    """
    Clean a list of JSON files and write them back out.

    Args:
        conf_path: File path of the params.json
        input_path: Folder path to read raw files
        output_path: Folder path to write files
        on_adls: If the data are on the Azure Data Lake set true to use
            the correct package

    Returns:
        List of the parsed JSON payloads, in the order they were
        processed (the data are also written at the desired location).
    """
    # FIX: removed a no-op `self.params.get("json")` whose result was
    # discarded, and corrected the docstring, which claimed nothing was
    # returned while the method returns `res`.
    self.load_params(conf_path)
    self.data_lake = uts.connect_to_data_lake_store(
        self.params) if on_adls else None
    res = []
    for file in self.params.get("json"):
        json_file_name = "{}.json".format(file)
        read_path = path.join(input_path, json_file_name)
        self.logger.info(
            "Reading and parsing JSON from: {}".format(read_path))
        data = uts.read_json(read_path, self.data_lake,
                             advanced_parsing=True)
        write_path = path.join(output_path, json_file_name)
        self.logger.info(
            "Writing the parsed JSON to: {}".format(write_path))
        uts.write_json(data, write_path, self.data_lake)
        res.append(data)
    return res
def merge_datasets(dataset_filenames, out_filename):
    """Merge several relation datasets into one and save it as JSON.

    The assumption here is that the abstract text and entities detected
    are identical across files; only the relations differ. For chemprot
    and drugprot this is the case, see compare_datasets.py.
    """
    datasets = [utils.read_json(filename) for filename in dataset_filenames]

    # Normalize every dataset's relation labels to CPR-X first.
    for ds in datasets:
        map_to_cpr(ds)

    merged = datasets[0]
    for ds in datasets[1:]:
        for article_id, article in ds.items():
            if article_id not in merged:
                # Article unique to this dataset: adopt it wholesale
                # (its relations were already converted to CPR-X above).
                merged[article_id] = article
            else:
                print(f'merging {article_id}')
                merge_article(merged[article_id], article)

    # Report how many relations survived the merge.
    total_relation = sum(
        len(sent['relations'])
        for article in merged.values()
        for sent in article['abstract'])
    print(f'number of relation of merged dataset: {total_relation}')

    utils.save_json(out_filename, merged)
def get_dataset(num_gpus=1, mode='train'):
    """Build the tf.data input pipeline for the given split.

    :param num_gpus: multiplier applied to the configured batch size.
    :param mode: 'train' or 'val'; selects json/image paths and parse fn.
    :return: batched, prefetched tf.data.Dataset of image ids.
    """
    assert mode in ['train', 'val']
    global id_bboxs_dict, img_path, params, id_kps_dict

    # Select the annotation file and image directory for this split.
    if mode == 'train':
        json_file = params['train_json_file']
        img_path = params['train_img_path']
    else:
        json_file = params['val_json_file']
        img_path = params['val_img_path']

    img_ids, id_bboxs_dict, id_kps_dict = read_json(json_file)

    if mode == 'train':
        random.shuffle(img_ids)
        dataset = tf.data.Dataset.from_tensor_slices(img_ids)
    else:
        dataset = tf.data.Dataset.from_tensor_slices(img_ids)
    dataset = dataset.shuffle(buffer_size=1000).repeat(1)

    # Train and val use different parsing functions.
    parse_fn = tf_parse_func if mode == 'train' else tf_parse_func_for_val
    dataset = dataset.map(parse_fn,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.batch(params['batch_size'] * num_gpus,
                            drop_remainder=True)
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    return dataset
def test_similar_search_query(self, datadir):
    """Search query built from title, journal and author surnames."""
    record = read_json(datadir + 'test_record.json')
    expected = (
        'CERMINE: automatic extraction of structured metadata from '
        'scientific literature International Journal on Document '
        'Analysis and Recognition (IJDAR) Tkaczyk Szostek Fedoryszak '
        'Dendek Bolikowski')
    assert similar_search_query(record) == expected
def generate(parser):
    """Dump real (non-generated) audio samples for evaluation.

    :param parser: argparse parser pre-populated by the caller with
        at least `outdir` and `n_gen`; `--val` and `-c` are added here.
    """
    parser.add_argument("--val", dest="val", action='store_true')
    parser.add_argument("-c", dest="config", type=str)
    args = parser.parse_args()
    config = read_json(args.config)

    # We load a dummy data loader for post-processing
    transform_config = config['transform_config']
    loader_config = config['loader_config']
    processor = AudioProcessor(**transform_config)
    postprocess = processor.get_postprocessor()

    assert os.path.exists(args.outdir), "Output path does not exist"

    # Create output evaluation dir, tagged with split, count and time.
    trval = 'val' if args.val else 'train'
    output_dir = mkdir_in_path(args.outdir, f"true_sample_{config['name']}")
    output_dir = mkdir_in_path(
        output_dir,
        f"{trval}_{args.n_gen}_{datetime.now().strftime('%Y-%m-%d_%H_%M')}")

    dbname = loader_config['dbname']
    loader = get_data_loader(dbname)(
        name=dbname + '_' + transform_config['transform'],
        preprocessing=processor,
        **loader_config)

    # Draw samples from the validation set or at random from training data.
    if args.val:
        data, _ = loader.get_validation_set(args.n_gen)
    else:
        data = random.sample(loader.data, k=args.n_gen)

    audio_out = map(postprocess, data)
    saveAudioBatch(audio_out,
                   path=output_dir,
                   basename='true_sample',
                   sr=config["transform_config"]["sample_rate"])
    print("FINISHED!\n")
def test_degraded_no_stopwords(self, datadir):
    """Degradation that strips stopwords from the title."""
    record = read_json(datadir + 'test_record.json')
    expected = (
        'Tkaczyk, Dominika, Szostek, Paweł, Fedoryszak, Mateusz, '
        'Dendek, Piotr Jan, Bolikowski, Łukasz. CERMINE: automatic '
        'extraction structured metadata scientific literature. '
        'International Journal on Document Analysis and Recognition '
        '(IJDAR). 2015. 18. 4. 317-335')
    assert degraded_no_stopwords(record) == expected
def test_full(self, datadir):
    """Mean per-document metrics over three target DOIs."""
    dataset = read_json(datadir + 'test_dataset.json').get('dataset')
    target_dois = [
        '10.1103/physrevb.67.134406',
        '10.1159/000408205',
        '10.1002/chin.199827068',
    ]
    results = ByDocumentMetricsResults(dataset, target_dois)
    assert results.get(dfk.EVAL_MEAN_PREC) == approx(7/9)
    assert results.get(dfk.EVAL_MEAN_REC) == approx(5/6)
    assert results.get(dfk.EVAL_MEAN_F1) == approx(13/18)
def test_generate_target_gt(self, datadir):
    """Ground-truth target contains the DOI plus any requested attributes."""
    record = read_json(datadir + 'test_record.json')

    # No extra attributes: only the DOI is present.
    assert generate_target_gt(record, []) == \
        {'DOI': '10.1007/s10032-015-0249-8'}

    # Requested attributes are copied alongside the DOI.
    expected = {'DOI': '10.1007/s10032-015-0249-8',
                'publisher': 'Springer Nature',
                'type': 'journal-article'}
    assert generate_target_gt(record, ['publisher', 'type']) == expected
def test_degraded_title_scrambled(self, datadir):
    """Degradation that shuffles the title words (seeded for determinism)."""
    record = read_json(datadir + 'test_record.json')
    random.seed(10)
    expected = (
        'Tkaczyk, Dominika, Szostek, Paweł, Fedoryszak, Mateusz, '
        'Dendek, Piotr Jan, Bolikowski, Łukasz. metadata extraction '
        'scientific automatic literature structured of from CERMINE:. '
        'International Journal on Document Analysis and Recognition '
        '(IJDAR). 2015. 18. 4. 317-335')
    assert degraded_title_scrambled(record) == expected
def __init__(self, split_id=1):
    """Load one train/test split of the iLIDS-VID dataset.

    :param split_id: index into the pre-generated split file.
    :raises ValueError: if split_id is out of range.
    """
    self._download_data()
    self._check_before_run()
    self._prepare_split()

    splits = read_json(self.split_path)
    if split_id >= len(splits):
        raise ValueError(
            "split_id exceeds range, received {}, but expected between 0 and {}"
            .format(split_id, len(splits) - 1))
    split = splits[split_id]
    train_dirs, test_dirs = split['train'], split['test']
    print("# train identites: {}, # test identites {}".format(
        len(train_dirs), len(test_dirs)))

    # Train uses both cameras; query/gallery each use one camera of the
    # test identities.
    train, num_train_tracklets, num_train_pids, num_imgs_train = \
        self._process_data(train_dirs, cam1=True, cam2=True)
    query, num_query_tracklets, num_query_pids, num_imgs_query = \
        self._process_data(test_dirs, cam1=True, cam2=False)
    gallery, num_gallery_tracklets, num_gallery_pids, num_imgs_gallery = \
        self._process_data(test_dirs, cam1=False, cam2=True)

    num_imgs_per_tracklet = num_imgs_train + num_imgs_query + num_imgs_gallery
    min_num = np.min(num_imgs_per_tracklet)
    max_num = np.max(num_imgs_per_tracklet)
    avg_num = np.mean(num_imgs_per_tracklet)
    num_total_pids = num_train_pids + num_query_pids
    num_total_tracklets = num_train_tracklets + num_query_tracklets + num_gallery_tracklets

    print("=> iLIDS-VID loaded")
    print("Dataset statistics:")
    print("  ------------------------------")
    print("  subset   | # ids | # tracklets")
    print("  ------------------------------")
    print("  train    | {:5d} | {:8d}".format(num_train_pids,
                                              num_train_tracklets))
    print("  query    | {:5d} | {:8d}".format(num_query_pids,
                                              num_query_tracklets))
    print("  gallery  | {:5d} | {:8d}".format(num_gallery_pids,
                                              num_gallery_tracklets))
    print("  ------------------------------")
    print("  total    | {:5d} | {:8d}".format(num_total_pids,
                                              num_total_tracklets))
    print(
        "  number of images per tracklet: {} ~ {}, average {:.1f}".format(
            min_num, max_num, avg_num))
    print("  ------------------------------")

    self.train = train
    self.query = query
    self.gallery = gallery
    self.num_train_pids = num_train_pids
    self.num_query_pids = num_query_pids
    self.num_gallery_pids = num_gallery_pids
def __init__(self, annot_path, video_id_path, metadata_path, fps,
             window_size, out_path):
    '''
    Given videos, we create segments (of frames) and their corresponding
    labels. A segment is a start/end frame numbers (for a video) and label
    is whether compression occurs or not in that segment. We use the
    annotations in secs of the videos to calc the label.

    :param annot_path: str, path to the cpr annotations
    :param video_id_path: str, path for the video ids by train/val/test splits
    :param metadata_path: str, path to the metadata of the videos
    :param fps: int, fps of frames videos were converted to
    :param window_size: int, num of frames in a sliding window
    :param out_path: str, path to output the segment and labels json
    '''
    self.fps = fps
    self.window_size = window_size
    self.annot_json = read_json(annot_path)
    video_id_by_split = read_json(video_id_path)
    # NOTE(review): despite the name, this attribute holds the PARSED
    # metadata JSON, not a path — kept as-is because callers may rely on it.
    self.metadata_path = read_json(metadata_path)

    # Build {split: {'segments': ..., 'labels': ...}} for every split.
    all_data = {}
    for split_type in video_id_by_split.keys():
        video_id_list = video_id_by_split[split_type]
        segments, labels = self._create_segments_labels(video_id_list)
        all_data[split_type] = {'segments': segments, 'labels': labels}

    # Persist everything in a single JSON file.
    out_path = os.path.join(out_path, 'segments_and_labels.json')
    write_json(all_data, out_path, indent=None)
def __init__(self, root='/data/datasets/', split_id=0):
    """Load one train/test split of the iLIDS-VID dataset rooted at `root`.

    :param root: parent directory that contains (or will contain) iLIDS-VID.
    :param split_id: index into the pre-generated split file.
    :raises ValueError: if split_id is out of range.
    """
    self.root = osp.join(root, 'iLIDS-VID')
    self.dataset_url = 'http://www.eecs.qmul.ac.uk/~xiatian/iLIDS-VID/iLIDS-VID.tar'
    self.data_dir = osp.join(self.root, 'i-LIDS-VID')
    self.split_dir = osp.join(self.root, 'train-test people splits')
    self.split_mat_path = osp.join(self.split_dir,
                                   'train_test_splits_ilidsvid.mat')
    self.split_path = osp.join(self.root, 'splits.json')
    self.cam_1_path = osp.join(self.root, 'i-LIDS-VID/sequences/cam1')
    self.cam_2_path = osp.join(self.root, 'i-LIDS-VID/sequences/cam2')

    # self._download_data()
    self._check_before_run()
    self._prepare_split()

    splits = read_json(self.split_path)
    if split_id >= len(splits):
        raise ValueError(
            "split_id exceeds range, received {}, but expected between 0 and {}"
            .format(split_id, len(splits) - 1))
    split = splits[split_id]
    train_dirs, test_dirs = split['train'], split['test']
    print("# train identites: {}, # test identites {}".format(
        len(train_dirs), len(test_dirs)))

    # Train uses both cameras; query/gallery each use one camera of the
    # test identities.
    train, num_train_tracklets, num_train_pids, num_imgs_train = \
        self._process_train_data(train_dirs, cam1=True, cam2=True)
    query, num_query_tracklets, num_query_pids, num_imgs_query = \
        self._process_test_data(test_dirs, cam1=True, cam2=False)
    gallery, num_gallery_tracklets, num_gallery_pids, num_imgs_gallery = \
        self._process_test_data(test_dirs, cam1=False, cam2=True)

    num_imgs_per_tracklet = num_imgs_train + num_imgs_query + num_imgs_gallery
    min_num = np.min(num_imgs_per_tracklet)
    max_num = np.max(num_imgs_per_tracklet)
    avg_num = np.mean(num_imgs_per_tracklet)
    num_total_pids = num_train_pids + num_query_pids
    num_total_tracklets = num_train_tracklets + num_query_tracklets + num_gallery_tracklets

    print("=> iLIDS-VID loaded")
    print("Dataset statistics:")
    print("  ------------------------------")
    print("  subset   | # ids | # tracklets")
    print("  ------------------------------")
    print("  train    | {:5d} | {:8d}".format(num_train_pids,
                                              num_train_tracklets))
    print("  query    | {:5d} | {:8d}".format(num_query_pids,
                                              num_query_tracklets))
    print("  gallery  | {:5d} | {:8d}".format(num_gallery_pids,
                                              num_gallery_tracklets))
    print("  ------------------------------")
    print("  total    | {:5d} | {:8d}".format(num_total_pids,
                                              num_total_tracklets))
    print("  number of images per tracklet: {} ~ {}, average {:.1f}".format(
        min_num, max_num, avg_num))
    print("  ------------------------------")

    self.train = train
    self.query = query
    self.gallery = gallery
    self.num_train_pids = num_train_pids
    self.num_query_pids = num_query_pids
    self.num_gallery_pids = num_gallery_pids
def __init__(self, args, options=""):
    """
    - class to parse configuration json file.
    Handles hyperparameters for training, initializations of modules,
    checkpoint saving and logging module.

    input:
        args: Dict containing configurations, hyperparameters for training.
            contents of `parameters.json` file for example.
        options: Dict keychain:value, specifying position values to be
            replaced from config dict.
    """
    # parse default and custom cli options
    for opt in options:
        args.add_argument(*opt.flags, default=None, type=opt.type)
    args = args.parse_args()

    self.cfg_fname = Path(args.config)
    # load json file as python dictionary
    config = read_json(self.cfg_fname)
    config["src_data"] = args.src_data
    config["tgt_data"] = args.tgt_data
    config["src_data_prefix"] = args.src_data_prefix
    config["tgt_data_prefix"] = args.tgt_data_prefix

    # load config file and apply custom cli options
    self._config = _update_config(config, options, args)

    # set save directory where trained embedding and log will be saved
    if args.save_name:
        save_dir_name = args.save_name
    else:
        save_dir_name = config["src_data_prefix"] + "_" + config["tgt_data_prefix"]
    save_dir = Path(args.save) / save_dir_name

    timestamp = datetime.now().strftime(r'%m%d_%H%M%S')
    exper_name = self.config['name']
    print(f"Result will be saved in {save_dir}")
    self._save_dir = save_dir / 'best' / exper_name / timestamp
    self._log_dir = save_dir / 'log' / exper_name / timestamp
    self.save_dir.mkdir(parents=True, exist_ok=True)
    self.log_dir.mkdir(parents=True, exist_ok=True)

    # save updated config file to the checkpoint dir
    write_json(self.config, self.save_dir / "parameters.json")

    # configure logging module
    setup_logging(self.log_dir)
    self.log_levels = {
        0: logging.WARNING,
        1: logging.INFO,
        2: logging.DEBUG
    }
def test_full(self, datadir):
    """Link metrics restricted to documents of a given `type` attribute."""
    dataset = read_json(datadir + 'test_dataset.json').get('dataset')

    # journal-article subset: imperfect linking.
    results = DocAttrLinkMetricsResults(dataset, 'type', 'journal-article')
    assert results.get(dfk.EVAL_PREC) == approx(1 / 2)
    assert results.get(dfk.EVAL_REC) == approx(2 / 3)
    assert results.get(dfk.EVAL_F1) == approx(4 / 7)

    # reference-entry subset: perfect linking.
    results = DocAttrLinkMetricsResults(dataset, 'type', 'reference-entry')
    assert results.get(dfk.EVAL_PREC) == approx(1)
    assert results.get(dfk.EVAL_REC) == approx(1)
    assert results.get(dfk.EVAL_F1) == approx(1)
def test_get_journal_title(self, datadir):
    """Journal title extraction, including empty and missing fields."""
    record = read_json(datadir + 'test_record.json')
    assert get_journal_title(record) == (
        'International Journal on Document Analysis and Recognition '
        '(IJDAR)')

    # Empty container-title list falls back to the empty string.
    record['container-title'] = []
    assert get_journal_title(record) == ''

    # Missing container-title key also falls back to the empty string.
    del record['container-title']
    assert get_journal_title(record) == ''
def test_get_authors(self, datadir):
    """Author string extraction, including truncation and empty lists."""
    record = read_json(datadir + 'test_record.json')
    authors = ('Tkaczyk, Dominika, Szostek, Paweł, Fedoryszak, '
               'Mateusz, Dendek, Piotr Jan, Bolikowski, Łukasz')
    assert get_authors(record) == authors

    # An oversized author list is truncated to ten repetitions.
    record['author'] = record['author'] * 50
    assert get_authors(record) == ', '.join([authors] * 10)

    # No authors yields the empty string.
    record['author'] = []
    assert get_authors(record) == ''
def main(json_filename1, json_filename2, brat_diff_dir=None):
    """Compare two relation datasets article by article.

    Prints overlap statistics and, for articles present in both files,
    any differences in abstract/entities/relations. When `brat_diff_dir`
    is given, the differing relations are also exported in brat format.
    """
    dataset1 = utils.read_json(json_filename1)
    dataset2 = utils.read_json(json_filename2)

    # calculate article overlaps
    ds1_size, ds2_size, unique_size, common_size = article_id_overlap_check(
        dataset1.keys(), dataset2.keys())
    print(f'number of articles in {basename(json_filename1)}: {ds1_size}')
    print(f'number of articles in {basename(json_filename2)}: {ds2_size}')
    print(f'number of unique articles in both files: {unique_size}')
    print(f'number of common articles in both files: {common_size}')

    # if an article is mentioned in both dataset, compare their abstract,
    # relations, and entities
    print('trying to find article ids in both dataset')
    print(
        f'comparing {basename(json_filename1)} to {basename(json_filename2)}')
    for article_id, data in dataset1.items():
        abstract = data['abstract']
        if article_id not in dataset2:
            continue
        abstract_other = dataset2[article_id]['abstract']
        diff = compare_abstract(abstract, abstract_other)
        if not any(len(x) > 0 for x in diff):
            continue
        print(f'article id {article_id}')
        print_diff(diff)
        if brat_diff_dir:
            # Export the article plus both sides' unmatched relations,
            # suffixing relation types with the dataset they came from.
            txt, ann = json_to_brat.article_brat_repr(
                data, include_entities=True)
            rels_in1not2, rels_in2not1 = diff[2], diff[3]
            for i, rel in enumerate(rels_in1not2):
                ann.append(
                    json_to_brat.rel_brat_repr(rel, i, type_suffix='_1'))
            for i, rel in enumerate(rels_in2not1):
                ann.append(
                    json_to_brat.rel_brat_repr(rel,
                                               i + len(rels_in1not2),
                                               type_suffix='_2'))
            json_to_brat.write_brat(article_id, brat_diff_dir, txt, ann)
def test_full(self, datadir):
    """Metrics split by the document `type` attribute."""
    dataset = read_json(datadir + 'test_dataset.json').get('dataset')
    results = SplitByDocAttrResults(dataset, 'type')
    metrics = results.get(dfk.EVAL_SPLIT_METRICS).sort_values(by='type')

    assert metrics.shape == (3, 4)
    assert metrics['type'].tolist() == [
        'book-chapter', 'journal-article', 'reference-entry'
    ]
    assert metrics['precision'].tolist() == approx([1 / 3, 1 / 2, 1])
    assert metrics['recall'].tolist() == approx([1 / 4, 2 / 3, 1])
    assert metrics['F1'].tolist() == approx([2 / 7, 4 / 7, 1])
def from_config_file(cls, config_file, identifier, verbosity):
    """
    Initialize this class from config file. Used in train, test.
    :param config_file: config file.
    :param identifier: identifier used as the run id.
    :param verbosity: logging verbosity level.
    :return: ConfigParser.
    """
    message = "Configuration file need to be specified. Add '-c ./config/config.json', for example."
    assert config_file is not None, message
    config = read_json(config_file)
    # Return ConfigParser object, with the identifier as run id.
    return cls(config, identifier, verbosity)
def __init__(self, config=None, resume=None, modification=None, run_id=None):
    """
    class to parse configuration json file. Handles hyperparameters for
    training, initializations of modules, checkpoint saving and logging
    module.
    :param config: Dict containing configurations, hyperparameters for
        training. contents of `config.json` file for example.
    :param resume: String, path to the checkpoint being loaded.
    :param modification: Dict keychain:value, specifying position values
        to be replaced from config dict.
    :param run_id: Unique Identifier for training processes. Used to save
        checkpoints and training log. Timestamp is being used as default
    """
    # Fall back to the bundled default config when none is supplied.
    if config is None:
        config = read_json(UTILS_DIR / 'config.json')

    # load config file and apply modification
    self._config = _update_config(config, modification)
    self.resume = resume

    # set save_dir where trained model and log will be saved.
    save_dir = ROOT_DIR / self.config['trainer']['save_dir']
    # set the data_dir
    self.data_dir = ROOT_DIR / self.config['data_loader']['args']['data_dir']

    exper_name = self.config['name']
    if run_id is None:
        # use timestamp as default run-id
        run_id = datetime.now().strftime(r'%m%d_%H%M%S')
    self._save_dir = save_dir / 'models' / exper_name / run_id
    self._log_dir = save_dir / 'log' / exper_name / run_id

    # make directory for saving checkpoints and log; an empty run_id
    # reuses an existing directory instead of failing.
    exist_ok = run_id == ''
    self.save_dir.mkdir(parents=True, exist_ok=exist_ok)
    self.log_dir.mkdir(parents=True, exist_ok=exist_ok)

    # save updated config file to the checkpoint dir
    write_json(self.config, self.save_dir / 'config.json')

    # configure logging module
    # setup_logging(self.log_dir)
    self.log_levels = {
        0: logging.WARNING,
        1: logging.INFO,
        2: logging.DEBUG
    }
def get_dataset_definition(dataset_name, train_with):
    """Load a dataset definition and fill in missing label counts.

    For every label whose `num_lab` is unset, the count is read from the
    Kaldi model via `hmm-info` and the per-label prior counts are computed.

    :param dataset_name: 'librispeech' or 'TIMIT'.
    :param train_with: key into the definition's `datasets` section.
    :return: the completed dataset definition dict.
    :raises NotImplementedError: for unknown datasets or label kinds.
    """
    if dataset_name == "librispeech":
        dataset_config_path = "cfg/dataset_definition/librispeech.json"
    elif dataset_name == "TIMIT":
        dataset_config_path = "cfg/dataset_definition/TIMIT.json"
    else:
        raise NotImplementedError(dataset_name)
    dataset_definition = read_json(dataset_config_path)

    # Keep only the labels that the chosen training set actually provides.
    dataset_definition['data_info']['labels'] = {
        k: v
        for k, v in dataset_definition['data_info']['labels'].items()
        if k in dataset_definition['datasets'][train_with]['labels']
    }

    for label in dataset_definition['data_info']['labels']:
        label_info = dataset_definition['data_info']['labels'][label]
        if label_info['num_lab'] is not None:
            continue

        # FIX: the lab_cd and lab_mono/phn/phnframe branches were
        # copy-pasted duplicates differing only in which line of the
        # `hmm-info` output carries the count; select that line index
        # once and share the rest of the logic.
        if label == "lab_cd":
            count_line = 1  # second line: number of pdfs
        elif label in ("lab_mono", "lab_phn", "lab_phnframe"):
            count_line = 0  # first line: number of phones
        else:
            raise NotImplementedError(label)

        label_cfg = dataset_definition['datasets'][train_with]['labels'][label]
        folder_lab_count = label_cfg['label_folder']
        hmm_info = run_shell_info(f"hmm-info {folder_lab_count}/final.mdl")
        label_info['num_lab'] = int(
            hmm_info.split("\n")[count_line].rsplit(" ", 1)[1])
        label_info['lab_count'] = get_lab_count(
            label_opts=label_cfg['label_opts'],
            num_label=label_info["num_lab"],
            folder_lab_count=folder_lab_count)

    return dataset_definition
def test_full_summary(self, datadir):
    """Per-document metrics table over the full DOI list."""
    data = read_json(datadir + 'test_dataset.json')
    results = ByDocumentMetricsResults(data.get('dataset'),
                                       data.get('dataset_dois'))
    doc_metrics = results.get(dfk.EVAL_DOC_METRICS).sort_values(by='doc')

    assert doc_metrics.shape == (10, 4)
    assert doc_metrics['doc'].tolist() == sorted(data.get('dataset_dois'))

    # Spot-check the first and last documents of the sorted table.
    precision = doc_metrics['precision'].tolist()
    recall = doc_metrics['recall'].tolist()
    f1 = doc_metrics['F1'].tolist()
    assert precision[0] == approx(1/3)
    assert precision[9] == approx(1)
    assert recall[0] == approx(1)
    assert recall[9] == approx(0)
    assert f1[0] == approx(1/2)
    assert f1[9] == approx(0)
def test_full(self, datadir):
    """Reference-level counts and fractions over the test dataset."""
    dataset = read_json(datadir + 'test_dataset.json').get('dataset')
    results = ReferenceMetricsResults(dataset)

    # Raw counts.
    assert results.get(dfk.EVAL_REF_TOTAL) == approx(20)
    assert results.get(dfk.EVAL_CORR_LINK_C) == approx(9)
    assert results.get(dfk.EVAL_CORR_NO_LINK_C) == approx(2)
    assert results.get(dfk.EVAL_INCORR_LINK_C) == approx(5)
    assert results.get(dfk.EVAL_INCORR_EXISTS_C) == approx(3)
    assert results.get(dfk.EVAL_INCORR_MISSING_C) == approx(1)

    # Fractions of the total.
    assert results.get(dfk.EVAL_CORR_LINK_F) == approx(0.45)
    assert results.get(dfk.EVAL_CORR_NO_LINK_F) == approx(0.1)
    assert results.get(dfk.EVAL_INCORR_LINK_F) == approx(0.25)
    assert results.get(dfk.EVAL_INCORR_EXISTS_F) == approx(0.15)
    assert results.get(dfk.EVAL_INCORR_MISSING_F) == approx(0.05)
    assert results.get(dfk.EVAL_ACCURACY) == approx(0.55)
def get_dataset(mode='train'):
    """Build the training tf.data input pipeline.

    Note: despite the `mode` parameter, this variant always reads the
    training annotations/images from `params`.

    :return: batched, prefetched tf.data.Dataset of image ids.
    """
    global id_bboxs_dict, img_path, params, id_kps_dict
    json_file = params['train_json_file']
    img_path = params['train_img_path']
    img_ids, id_bboxs_dict, id_kps_dict = read_json(json_file)

    # Shuffle ids up front, then again at the dataset level.
    random.shuffle(img_ids)
    dataset = (tf.data.Dataset.from_tensor_slices(img_ids)
               .shuffle(buffer_size=1000)
               .repeat(1)
               .map(tf_parse_func,
                    num_parallel_calls=tf.data.experimental.AUTOTUNE)
               .batch(params['batch_size'], drop_remainder=True)
               .prefetch(buffer_size=tf.data.experimental.AUTOTUNE))
    return dataset