コード例 #1
0
ファイル: parse_config.py プロジェクト: erytheis/BattLeDIM
    def from_args(cls, args, options=''):
        """
        Initialize this class from some cli arguments. Used in train, test.

        :param args: an argparse.ArgumentParser (or an already-parsed tuple,
            in which case parsing is skipped).
        :param options: iterable of custom option descriptors, each carrying
            `flags`, `type` and `target` attributes (the default '' simply
            iterates as empty).
        :return: a new instance built from (config, resume, modification).
        """
        # register the custom CLI options on the parser before parsing
        for opt in options:
            args.add_argument(*opt.flags, default=None, type=opt.type)
        if not isinstance(args, tuple):
            args = args.parse_args()

        if args.device is not None:
            os.environ["CUDA_VISIBLE_DEVICES"] = args.device
        if args.resume is not None:
            # resuming: the config is expected next to the checkpoint
            resume = Path(args.resume)
            cfg_fname = resume.parent / 'config.json'
        else:
            msg_no_cfg = "Configuration file need to be specified. Add '-c config.json', for example."
            assert args.config is not None, msg_no_cfg
            resume = None
            cfg_fname = Path(args.config)

        config = read_json(cfg_fname)
        if args.config and resume:
            # update new config for fine-tuning
            config.update(read_json(args.config))

        # parse custom cli options into dictionary: opt.target -> parsed value
        modification = {
            opt.target: getattr(args, _get_opt_name(opt.flags))
            for opt in options
        }
        return cls(config, resume, modification)
コード例 #2
0
ファイル: dataset.py プロジェクト: KhelKim/pytorch-template
    def __init__(self, root, phase, tokenizer, max_len):
        """
        Text-classification dataset split loaded from JSON files under `root`.

        :param root: dataset root containing `info/cat2idx.json` and `<phase>.json`.
        :param phase: split name, e.g. "train", "val" or "test".
        :param tokenizer: tokenizer exposing `pad_token_id` (may be None).
        :param max_len: maximum token sequence length.
        """
        self.root = root
        self.phase = phase
        self.tokenizer = tokenizer
        self.max_len = max_len

        # category name -> index mapping, and its true inverse.
        self.cat2idx = read_json(f"{root}/info/cat2idx.json")
        # BUG FIX: the original unpacked items() as (idx, cat); items() yields
        # (cat, idx) pairs, so idx2cat was just cat2idx rebuilt, not inverted.
        self.idx2cat = {idx: cat for cat, idx in self.cat2idx.items()}
        self.n_outputs = len(self.cat2idx)

        data = read_json(f"{root}/{phase}.json")
        self.texts = data['texts']
        # the test split ships without labels
        self.categories = data['categories'] if phase != "test" else None

        # fall back to 0 when the tokenizer defines no pad token id
        self.pad_token_id = 0 if self.tokenizer.pad_token_id is None else self.tokenizer.pad_token_id
コード例 #3
0
    def test_full(self, datadir):
        """End-to-end check of SplitByRefAttrResults per-style metrics."""
        data = read_json(datadir + 'test_dataset.json')
        # with the full DOI list only the result shape is checked
        r = SplitByRefAttrResults(data.get('dataset'), 'style',
                                  data.get('dataset_dois'))
        split_r = r.get(dfk.EVAL_SPLIT_METRICS)
        assert split_r.shape == (2, 13)

        # restrict target docs to three DOIs and verify every metric column
        r = SplitByRefAttrResults(data.get('dataset'), 'style', [
            '10.1103/physrevb.67.134406', '10.1159/000408205',
            '10.1002/chin.199827068'
        ])
        split_r = r.get(dfk.EVAL_SPLIT_METRICS)
        split_r = split_r.sort_values(by='style')
        assert split_r.shape == (2, 13)
        assert split_r['style'].tolist() == ['apa', 'ieee']
        assert split_r['correct ref links (fraction)'].tolist() \
            == approx([2/5, 1/2])
        assert split_r['correct missing ref links (fraction)'].tolist() \
            == approx([1/10, 1/10])
        assert split_r['incorrect ref links (fraction)'].tolist() \
            == approx([2/5, 1/10])
        assert split_r['incorrect existing ref links (fraction)'].tolist() \
            == approx([0, 3/10])
        assert split_r['incorrect missing ref links (fraction)'].tolist() \
            == approx([1/10, 0])
        assert split_r['accuracy'].tolist() == approx([5 / 10, 3 / 5])
        assert split_r['average precision over target docs'].tolist() \
            == approx([1, 2/3])
        assert split_r['average recall over target docs'].tolist() \
            == approx([2/3, 1])
        assert split_r['average F1 over target docs'].tolist() \
            == approx([2/3, 2/3])
        assert split_r['precision'].tolist() == approx([1 / 2, 5 / 9])
        assert split_r['recall'].tolist() == approx([4 / 9, 5 / 6])
コード例 #4
0
ファイル: data_manager.py プロジェクト: Chezacar/AmurTiger2.0
    def _extract_1stframe(self, dir_path, json_path, relabel):
        """
        Build (or load from cache) the tracklet split that keeps only the
        first frame of every tracklet under `dir_path`.

        :param dir_path: directory with one sub-directory per person id,
            each containing tracklet directories of *.jpg frames.
        :param json_path: cache file; if it exists it is read and returned.
        :param relabel: if True, remap raw person ids to 0..N-1 labels.
        :return: list of (first_image_path, pid, camid) tuples.
        """
        # fast path: split already generated on a previous run
        if osp.exists(json_path):
            print("=> {} generated before, awesome!".format(json_path))
            split = read_json(json_path)
            return split['tracklets']

        print(
            "=> Automatically generating split (might take a while for the first time, have a coffe)"
        )
        pdirs = glob.glob(osp.join(dir_path, '*'))  # avoid .DS_Store
        print("Processing {} with {} person identities".format(
            dir_path, len(pdirs)))

        # collect all raw person ids, then map them to contiguous labels
        pid_container = set()
        for pdir in pdirs:
            pid = int(osp.basename(pdir))
            pid_container.add(pid)
        pid2label = {pid: label for label, pid in enumerate(pid_container)}

        tracklets = []
        for pdir in pdirs:
            pid = int(osp.basename(pdir))
            if relabel: pid = pid2label[pid]
            tdirs = glob.glob(osp.join(pdir, '*'))
            for tdir in tdirs:
                raw_img_paths = glob.glob(osp.join(tdir, '*.jpg'))
                num_imgs = len(raw_img_paths)

                if num_imgs < self.min_seq_len:
                    continue

                img_paths = []
                for img_idx in range(num_imgs):
                    # some tracklet starts from 0002 instead of 0001
                    img_idx_name = 'F' + str(img_idx + 1).zfill(4)
                    res = glob.glob(
                        osp.join(tdir, '*' + img_idx_name + '*.jpg'))
                    if len(res) == 0:
                        print(
                            "Warn: index name {} in {} is missing, jump to next"
                            .format(img_idx_name, tdir))
                        continue
                    img_paths.append(res[0])
                # FIX: guard against tracklets where every indexed frame was
                # missing -- the original crashed with IndexError on
                # img_paths[0] in that case.
                if not img_paths:
                    continue
                img_name = osp.basename(img_paths[0])
                if img_name.find('_') == -1:
                    # old naming format: 0001C6F0099X30823.jpg
                    camid = int(img_name[5]) - 1
                else:
                    # new naming format: 0001_C6_F0099_X30823.jpg
                    camid = int(img_name[6]) - 1
                img_paths = tuple(img_paths)
                # only the first frame of each tracklet is kept
                tracklets.append((img_paths[0], pid, camid))

        print("Saving split to {}".format(json_path))
        split_dict = {
            'tracklets': tracklets,
        }
        write_json(split_dict, json_path)

        return tracklets
コード例 #5
0
 def test_format_ref_string(self, datadir):
     """format_ref_string renders the fixture record in several CSL styles,
     including the deliberately degraded pseudo-styles."""
     record = read_json(datadir + 'test_record.json')
     # standard citation styles
     assert format_ref_string(record, 'apa') == \
         'Tkaczyk, D., Szostek, P., Fedoryszak, M., Dendek, P. J., & ' + \
         'Bolikowski, Ł. (2015). CERMINE: automatic extraction of ' + \
         'structured metadata from scientific literature. ' + \
         'International Journal on Document Analysis and Recognition ' + \
         '(IJDAR), 18(4), 317–335.'
     assert format_ref_string(record, 'chicago-author-date') == \
         'Tkaczyk, Dominika, Paweł Szostek, Mateusz Fedoryszak, Piotr ' + \
         'Jan Dendek, and Łukasz Bolikowski. 2015. “CERMINE: Automatic ' + \
         'Extraction of Structured Metadata from Scientific ' + \
         'Literature.” International Journal on Document Analysis and ' + \
         'Recognition (IJDAR) 18 (4) (July 3): 317–335.'
     assert format_ref_string(record, 'modern-language-association') == \
         'Tkaczyk, Dominika et al. “CERMINE: Automatic Extraction of ' + \
         'Structured Metadata from Scientific Literature.” ' + \
         'International Journal on Document Analysis and Recognition ' + \
         '(IJDAR) 18.4 (2015): 317–335.'
     assert format_ref_string(record, 'american-chemical-society') == \
         '(1) Tkaczyk, D.; Szostek, P.; Fedoryszak, M.; Dendek, P. J.; ' + \
         'Bolikowski, Ł. International Journal on Document Analysis ' + \
         'and Recognition (IJDAR) 2015, 18, 317–335.'
     # degraded pseudo-styles used for robustness experiments
     assert format_ref_string(record, 'degraded_one_author') == \
         'Tkaczyk, Dominika. CERMINE: automatic extraction of ' + \
         'structured metadata from scientific literature. ' + \
         'International Journal on Document Analysis and Recognition ' + \
         '(IJDAR). 2015. 18. 4. 317-335'
     assert format_ref_string(record, 'degraded_title_scrambled') == \
         'Tkaczyk, Dominika, Szostek, Paweł, Fedoryszak, Mateusz, ' + \
         'Dendek, Piotr Jan, Bolikowski, Łukasz. automatic metadata ' + \
         'from scientific of literature CERMINE: extraction ' + \
         'structured. International Journal on Document Analysis and ' + \
         'Recognition (IJDAR). 2015. 18. 4. 317-335'
コード例 #6
0
 def test_degraded_one_author(self, datadir):
     """degraded_one_author keeps only the first author of the record."""
     record = read_json(datadir + 'test_record.json')
     assert degraded_one_author(record) == \
         'Tkaczyk, Dominika. CERMINE: automatic extraction of ' + \
         'structured metadata from scientific literature. ' + \
         'International Journal on Document Analysis and Recognition ' + \
         '(IJDAR). 2015. 18. 4. 317-335'
コード例 #7
0
    def execute(self, conf_path: str, input_path: str, output_path: str,
                on_adls: bool):
        """
        Clean a list of JSON files and writing them
        Args:
            conf_path: File path of the params.json
            input_path: Folder path to read raw files
            output_path: Folder path to write files
            on_adls: If the data are on the Azure Data Lake set true to use the correct package

        Returns: list of the parsed JSON payloads, one per configured file
            (the data are also directly written at the desired location)

        """
        self.load_params(conf_path)
        # FIX: removed the original bare `self.params.get("json")` statement;
        # its result was discarded, so it was dead code.
        self.data_lake = uts.connect_to_data_lake_store(
            self.params) if on_adls else None

        res = []
        # each entry under the "json" param is one file to clean
        for file in self.params.get("json"):
            json_file_name = "{}.json".format(file)
            read_path = path.join(input_path, json_file_name)
            self.logger.info(
                "Reading and parsing JSON from: {}".format(read_path))
            data = uts.read_json(read_path,
                                 self.data_lake,
                                 advanced_parsing=True)

            write_path = path.join(output_path, json_file_name)
            self.logger.info(
                "Writing the parsed JSON to: {}".format(write_path))
            uts.write_json(data, write_path, self.data_lake)
            res.append(data)

        return res
コード例 #8
0
def merge_datasets(dataset_filenames, out_filename):
    """Merge several relation datasets into one file and save it as JSON.

    The abstract text and detected entities are assumed identical across
    the inputs; only the relations differ. This holds for chemprot and
    drugprot -- see compare_datasets.py.
    """
    # load every input and normalise its relations to CPR-X
    datasets = [utils.read_json(name) for name in dataset_filenames]
    for ds in datasets:
        map_to_cpr(ds)

    # fold the remaining datasets into the first one
    merged = datasets[0]
    for ds in datasets[1:]:
        for article_id, article in ds.items():
            if article_id in merged:
                print(f'merging {article_id}')
                merge_article(merged[article_id], article)
            else:
                # article unique to this dataset: adopt it wholesale
                # (its relations were already converted to CPR-X above)
                merged[article_id] = article

    # report how many relations survived the merge
    total_relation = sum(
        len(sent['relations'])
        for article in merged.values()
        for sent in article['abstract'])
    print(f'number of relation of merged dataset: {total_relation}')

    utils.save_json(out_filename, merged)
コード例 #9
0
def get_dataset(num_gpus=1, mode='train'):
    """
    Build a tf.data pipeline over image ids for training or validation.

    :param num_gpus: multiplier for the batch size (one batch per GPU).
    :param mode: 'train' or 'val'; selects JSON/image paths from `params`
        and which parse function is mapped over the ids.
    :return: batched, prefetched tf.data.Dataset of parsed samples.
    """
    assert mode in ['train', 'val']
    # module-level state consumed by the parse functions
    global id_bboxs_dict, img_path, params, id_kps_dict

    if mode == 'train':
        json_file = params['train_json_file']
        img_path = params['train_img_path']
    else:
        json_file = params['val_json_file']
        img_path = params['val_img_path']

    img_ids, id_bboxs_dict, id_kps_dict = read_json(json_file)
    if mode == 'train':
        # pre-shuffle the id list once before handing it to tf.data
        random.shuffle(img_ids)
    # FIX: the original if/else built the identical Dataset in both
    # branches; build it once.
    dataset = tf.data.Dataset.from_tensor_slices(img_ids)

    # NOTE(review): this shuffle also applies to the validation set --
    # confirm that is intended.
    dataset = dataset.shuffle(buffer_size=1000).repeat(1)

    parse_fn = tf_parse_func if mode == 'train' else tf_parse_func_for_val
    dataset = dataset.map(parse_fn,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)

    dataset = dataset.batch(params['batch_size'] * num_gpus,
                            drop_remainder=True)
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

    return dataset
コード例 #10
0
 def test_similar_search_query(self, datadir):
     """similar_search_query concatenates title, journal and author surnames."""
     record = read_json(datadir + 'test_record.json')
     assert similar_search_query(record) == \
         'CERMINE: automatic extraction of structured metadata from ' + \
         'scientific literature International Journal on Document ' + \
         'Analysis and Recognition (IJDAR) Tkaczyk Szostek Fedoryszak ' + \
         'Dendek Bolikowski'
コード例 #11
0
def generate(parser):
    """
    Dump a batch of post-processed "true" (dataset) audio samples to disk.

    :param parser: argparse.ArgumentParser pre-populated by the caller;
        this function adds --val and -c on top of it.
    """
    parser.add_argument("--val", dest="val", action='store_true')
    parser.add_argument("-c", dest="config", type=str)
    args = parser.parse_args()
    # NOTE(review): args.outdir and args.n_gen are read below but never
    # registered here -- they must be added to the parser by the caller;
    # confirm.

    config = read_json(args.config)
    # We load a dummy data loader for post-processing
    transform_config = config['transform_config']
    loader_config = config['loader_config']
    processor = AudioProcessor(**transform_config)
    postprocess = processor.get_postprocessor()
    assert os.path.exists(args.outdir), "Output path does not exist"
    # Create output evaluation dir (one level per run, timestamped)
    trval = 'val' if args.val else 'train'
    output_dir = mkdir_in_path(args.outdir, f"true_sample_{config['name']}")
    output_dir = mkdir_in_path(
        output_dir,
        f"{trval}_{args.n_gen}_{datetime.now().strftime('%Y-%m-%d_%H_%M')}")

    dbname = loader_config['dbname']
    loader = get_data_loader(dbname)(name=dbname + '_' +
                                     transform_config['transform'],
                                     preprocessing=processor,
                                     **loader_config)

    # validation set is taken in order; train samples are drawn at random
    if args.val:
        data, _ = loader.get_validation_set(args.n_gen)
    else:
        data = random.sample(loader.data, k=args.n_gen)
    audio_out = map(postprocess, data)
    saveAudioBatch(audio_out,
                   path=output_dir,
                   basename='true_sample',
                   sr=config["transform_config"]["sample_rate"])
    print("FINISHED!\n")
コード例 #12
0
 def test_degraded_no_stopwords(self, datadir):
     """degraded_no_stopwords drops stopwords from the title only."""
     record = read_json(datadir + 'test_record.json')
     assert degraded_no_stopwords(record) == \
         'Tkaczyk, Dominika, Szostek, Paweł, Fedoryszak, Mateusz, ' + \
         'Dendek, Piotr Jan, Bolikowski, Łukasz. CERMINE: automatic ' + \
         'extraction structured metadata scientific literature. ' + \
         'International Journal on Document Analysis and Recognition ' + \
         '(IJDAR). 2015. 18. 4. 317-335'
コード例 #13
0
 def test_full(self, datadir):
     """ByDocumentMetricsResults mean precision/recall/F1 over target DOIs."""
     data = read_json(datadir + 'test_dataset.json').get('dataset')
     r = ByDocumentMetricsResults(data, ['10.1103/physrevb.67.134406',
                                         '10.1159/000408205',
                                         '10.1002/chin.199827068'])
     assert r.get(dfk.EVAL_MEAN_PREC) == approx(7/9)
     assert r.get(dfk.EVAL_MEAN_REC) == approx(5/6)
     assert r.get(dfk.EVAL_MEAN_F1) == approx(13/18)
コード例 #14
0
 def test_generate_target_gt(self, datadir):
     """generate_target_gt returns the DOI plus any requested extra fields."""
     record = read_json(datadir + 'test_record.json')
     # no extra attributes: only the DOI is returned
     assert generate_target_gt(record, []) == \
         {'DOI': '10.1007/s10032-015-0249-8'}
     # requested attributes are copied from the record
     assert generate_target_gt(record, ['publisher', 'type']) == \
         {'DOI': '10.1007/s10032-015-0249-8',
          'publisher': 'Springer Nature',
          'type': 'journal-article'}
コード例 #15
0
 def test_degraded_title_scrambled(self, datadir):
     """degraded_title_scrambled permutes the title words (seeded for
     deterministic output)."""
     record = read_json(datadir + 'test_record.json')
     random.seed(10)
     assert degraded_title_scrambled(record) == \
         'Tkaczyk, Dominika, Szostek, Paweł, Fedoryszak, Mateusz, ' + \
         'Dendek, Piotr Jan, Bolikowski, Łukasz. metadata extraction ' + \
         'scientific automatic literature structured of from CERMINE:. ' + \
         'International Journal on Document Analysis and Recognition ' + \
         '(IJDAR). 2015. 18. 4. 317-335'
コード例 #16
0
ファイル: data_manager.py プロジェクト: xiayang14551/RAPA
    def __init__(self, split_id=1):
        """
        Load the iLIDS-VID dataset for the requested train/test split.

        :param split_id: index into the list of pre-generated splits
            stored at self.split_path.
        :raises ValueError: if split_id is out of range.
        """
        self._download_data()
        self._check_before_run()

        self._prepare_split()
        splits = read_json(self.split_path)
        if split_id >= len(splits):
            raise ValueError(
                "split_id exceeds range, received {}, but expected between 0 and {}"
                .format(split_id,
                        len(splits) - 1))
        split = splits[split_id]
        train_dirs, test_dirs = split['train'], split['test']
        print("# train identites: {}, # test identites {}".format(
            len(train_dirs), len(test_dirs)))

        # query and gallery are both built from test_dirs, one camera each
        train, num_train_tracklets, num_train_pids, num_imgs_train = \
          self._process_data(train_dirs, cam1=True, cam2=True)
        query, num_query_tracklets, num_query_pids, num_imgs_query = \
          self._process_data(test_dirs, cam1=True, cam2=False)
        gallery, num_gallery_tracklets, num_gallery_pids, num_imgs_gallery = \
          self._process_data(test_dirs, cam1=False, cam2=True)

        # per-tracklet image-count statistics (for the summary print below)
        num_imgs_per_tracklet = num_imgs_train + num_imgs_query + num_imgs_gallery
        min_num = np.min(num_imgs_per_tracklet)
        max_num = np.max(num_imgs_per_tracklet)
        avg_num = np.mean(num_imgs_per_tracklet)

        # gallery ids are the same identities as query (both from test_dirs),
        # so they are not counted again here
        num_total_pids = num_train_pids + num_query_pids
        num_total_tracklets = num_train_tracklets + num_query_tracklets + num_gallery_tracklets

        print("=> iLIDS-VID loaded")
        print("Dataset statistics:")
        print("  ------------------------------")
        print("  subset   | # ids | # tracklets")
        print("  ------------------------------")
        print("  train    | {:5d} | {:8d}".format(num_train_pids,
                                                  num_train_tracklets))
        print("  query    | {:5d} | {:8d}".format(num_query_pids,
                                                  num_query_tracklets))
        print("  gallery  | {:5d} | {:8d}".format(num_gallery_pids,
                                                  num_gallery_tracklets))
        print("  ------------------------------")
        print("  total    | {:5d} | {:8d}".format(num_total_pids,
                                                  num_total_tracklets))
        print(
            "  number of images per tracklet: {} ~ {}, average {:.1f}".format(
                min_num, max_num, avg_num))
        print("  ------------------------------")

        self.train = train
        self.query = query
        self.gallery = gallery

        self.num_train_pids = num_train_pids
        self.num_query_pids = num_query_pids
        self.num_gallery_pids = num_gallery_pids
コード例 #17
0
    def __init__(self, annot_path, video_id_path, metadata_path, fps,
                 window_size, out_path):
        '''

        Given videos, we create segments (of frames) and their corresponding labels.
        A segment is a start/end frame numbers (for a video) and label is whether compression
        occurs or not in that segment.  We use the annotations in secs of the videos to calc
        the label.

        :param annot_path: str, path to the cpr annotations
        :param video_id_path: str, path for the video ids by train/val/test splits
        :param metadata_path: str, path to the metadata of the videos
        :param fps: int, fps of frames videos were converted to
        :param window_size: int, num of frames in a sliding window
        :param out_path: str, path to output the segment and labels json

        '''

        self.fps = fps
        self.window_size = window_size

        self.annot_json = read_json(annot_path)
        video_id_by_split = read_json(video_id_path)
        # NOTE(review): despite the name, this attribute holds the PARSED
        # metadata JSON, not a path -- confirm before renaming.
        self.metadata_path = read_json(metadata_path)

        # store each split here
        all_data = {}

        # loop thru each data split
        for split_type in video_id_by_split.keys():
            video_id_list = video_id_by_split[
                split_type]  # retrieve a video id
            segments, labels = self._create_segments_labels(
                video_id_list)  # create the segments/labels
            data = {
                'segments': segments,
                'labels': labels
            }  # store both in a dict
            all_data[split_type] = data  # store for entire video

        # write all to disk
        out_path = os.path.join(out_path, 'segments_and_labels.json')
        write_json(all_data, out_path, indent=None)
コード例 #18
0
ファイル: data_manager.py プロジェクト: victor-gui/TKP
    def __init__(self, root='/data/datasets/', split_id=0):
        """
        Load the iLIDS-VID dataset rooted at `root` for the given split.

        :param root: base datasets directory; iLIDS-VID is expected under it.
        :param split_id: index into the pre-generated splits in splits.json.
        :raises ValueError: if split_id is out of range.
        """
        # dataset layout on disk
        self.root = osp.join(root, 'iLIDS-VID')
        self.dataset_url = 'http://www.eecs.qmul.ac.uk/~xiatian/iLIDS-VID/iLIDS-VID.tar'
        self.data_dir = osp.join(self.root, 'i-LIDS-VID')
        self.split_dir = osp.join(self.root, 'train-test people splits')
        self.split_mat_path = osp.join(self.split_dir, 'train_test_splits_ilidsvid.mat')
        self.split_path = osp.join(self.root, 'splits.json')
        self.cam_1_path = osp.join(self.root, 'i-LIDS-VID/sequences/cam1')
        self.cam_2_path = osp.join(self.root, 'i-LIDS-VID/sequences/cam2')
        # download is disabled; the data is expected to exist already
        # self._download_data()
        self._check_before_run()

        self._prepare_split()
        splits = read_json(self.split_path)
        if split_id >= len(splits):
            raise ValueError("split_id exceeds range, received {}, but expected between 0 and {}".format(split_id, len(splits)-1))
        split = splits[split_id]
        train_dirs, test_dirs = split['train'], split['test']
        print("# train identites: {}, # test identites {}".format(len(train_dirs), len(test_dirs)))

        # query and gallery are both built from test_dirs, one camera each
        train, num_train_tracklets, num_train_pids, num_imgs_train = \
          self._process_train_data(train_dirs, cam1=True, cam2=True)
        query, num_query_tracklets, num_query_pids, num_imgs_query = \
          self._process_test_data(test_dirs, cam1=True, cam2=False)
        gallery, num_gallery_tracklets, num_gallery_pids, num_imgs_gallery = \
          self._process_test_data(test_dirs, cam1=False, cam2=True)

        # per-tracklet image-count statistics (for the summary print below)
        num_imgs_per_tracklet = num_imgs_train + num_imgs_query + num_imgs_gallery
        min_num = np.min(num_imgs_per_tracklet)
        max_num = np.max(num_imgs_per_tracklet)
        avg_num = np.mean(num_imgs_per_tracklet)

        # gallery ids are the same identities as query (both from test_dirs)
        num_total_pids = num_train_pids + num_query_pids
        num_total_tracklets = num_train_tracklets + num_query_tracklets + num_gallery_tracklets

        print("=> iLIDS-VID loaded")
        print("Dataset statistics:")
        print("  ------------------------------")
        print("  subset   | # ids | # tracklets")
        print("  ------------------------------")
        print("  train    | {:5d} | {:8d}".format(num_train_pids, num_train_tracklets))
        print("  query    | {:5d} | {:8d}".format(num_query_pids, num_query_tracklets))
        print("  gallery  | {:5d} | {:8d}".format(num_gallery_pids, num_gallery_tracklets))
        print("  ------------------------------")
        print("  total    | {:5d} | {:8d}".format(num_total_pids, num_total_tracklets))
        print("  number of images per tracklet: {} ~ {}, average {:.1f}".format(min_num, max_num, avg_num))
        print("  ------------------------------")

        self.train = train
        self.query = query
        self.gallery = gallery

        self.num_train_pids = num_train_pids
        self.num_query_pids = num_query_pids
        self.num_gallery_pids = num_gallery_pids
コード例 #19
0
    def __init__(self, args, options=""):
        """
        Parse the configuration JSON file. Handles hyperparameters for
        training, initialization of modules, checkpoint saving and the
        logging module.

        :param args: argparse.ArgumentParser carrying at least --config,
            src/tgt data arguments and a save path; parsed here.
        :param options: iterable of custom option descriptors, each with
            `flags`, `type` and `target` attributes, merged into the
            config via _update_config.
        """
        # parse default and custom cli options
        for opt in options:
            args.add_argument(*opt.flags, default=None, type=opt.type)
        args = args.parse_args()

        self.cfg_fname = Path(args.config)

        # load json file as python dictionary
        config = read_json(self.cfg_fname)

        # CLI data paths override whatever the config file held
        config["src_data"] = args.src_data
        config["tgt_data"] = args.tgt_data

        config["src_data_prefix"] = args.src_data_prefix
        config["tgt_data_prefix"] = args.tgt_data_prefix

        # load config file and apply custom cli options
        self._config = _update_config(config, options, args)

        # set save directory where trained embedding and log will be saved
        save_dir_name = args.save_name if args.save_name else config[
            "src_data_prefix"] + "_" + config["tgt_data_prefix"]
        save_dir = Path(args.save) / save_dir_name

        timestamp = datetime.now().strftime(r'%m%d_%H%M%S')

        # NOTE(review): self.config / self.save_dir / self.log_dir appear
        # to be properties over the underscored attributes, defined
        # elsewhere in the class -- confirm.
        exper_name = self.config['name']

        print(f"Result will be saved in {save_dir}")

        self._save_dir = save_dir / 'best' / exper_name / timestamp
        self._log_dir = save_dir / 'log' / exper_name / timestamp

        self.save_dir.mkdir(parents=True, exist_ok=True)
        self.log_dir.mkdir(parents=True, exist_ok=True)

        # save updated config file to the checkpoint dir
        write_json(self.config, self.save_dir / "parameters.json")

        # configure logging module
        setup_logging(self.log_dir)
        # numeric verbosity level -> logging level
        self.log_levels = {
            0: logging.WARNING,
            1: logging.INFO,
            2: logging.DEBUG
        }
コード例 #20
0
    def test_full(self, datadir):
        """DocAttrLinkMetricsResults precision/recall/F1 per document type."""
        data = read_json(datadir + 'test_dataset.json').get('dataset')
        # metrics restricted to journal articles
        r = DocAttrLinkMetricsResults(data, 'type', 'journal-article')
        assert r.get(dfk.EVAL_PREC) == approx(1 / 2)
        assert r.get(dfk.EVAL_REC) == approx(2 / 3)
        assert r.get(dfk.EVAL_F1) == approx(4 / 7)

        # metrics restricted to reference entries
        r = DocAttrLinkMetricsResults(data, 'type', 'reference-entry')
        assert r.get(dfk.EVAL_PREC) == approx(1)
        assert r.get(dfk.EVAL_REC) == approx(1)
        assert r.get(dfk.EVAL_F1) == approx(1)
コード例 #21
0
    def test_get_journal_title(self, datadir):
        """get_journal_title reads container-title and degrades gracefully."""
        record = read_json(datadir + 'test_record.json')
        assert get_journal_title(record) == \
            'International Journal on Document Analysis and Recognition ' + \
            '(IJDAR)'

        # empty container-title list -> empty string
        record['container-title'] = []
        assert get_journal_title(record) == ''

        # missing key entirely -> empty string
        del record['container-title']
        assert get_journal_title(record) == ''
コード例 #22
0
    def test_get_authors(self, datadir):
        """get_authors joins author names and truncates very long lists."""
        record = read_json(datadir + 'test_record.json')
        authors = 'Tkaczyk, Dominika, Szostek, Paweł, Fedoryszak, ' + \
            'Mateusz, Dendek, Piotr Jan, Bolikowski, Łukasz'
        assert get_authors(record) == authors

        # a 50x-duplicated author list is truncated: only 10 copies remain
        record['author'] = record['author'] * 50
        assert get_authors(record) == ', '.join([authors] * 10)

        # no authors -> empty string
        record['author'] = []
        assert get_authors(record) == ''
コード例 #23
0
def main(json_filename1, json_filename2, brat_diff_dir=None):
    """
    Compare two relation datasets stored as JSON.

    Prints the article-id overlap between both files and, for every article
    present in both, diffs their abstracts/relations. When `brat_diff_dir`
    is given, differing articles are additionally dumped in brat format
    with the conflicting relations tagged by source (_1 / _2 suffixes).
    """

    dataset1 = utils.read_json(json_filename1)
    dataset2 = utils.read_json(json_filename2)

    # calculate article overlaps
    ds1_size, ds2_size, unique_size, common_size = article_id_overlap_check(
        dataset1.keys(), dataset2.keys())
    print(f'number of articles in {basename(json_filename1)}: {ds1_size}')
    print(f'number of articles in {basename(json_filename2)}: {ds2_size}')
    print(f'number of unique articles in both files: {unique_size}')
    print(f'number of common articles in both files: {common_size}')

    # if an article is mentioned in both dataset, compare their abstract, relations, and entities
    print('trying to find article ids in both dataset')
    print(
        f'comparing {basename(json_filename1)} to {basename(json_filename2)}')
    for article_id, data in dataset1.items():
        abstract = data['abstract']
        if article_id in dataset2:
            abstract_other = dataset2[article_id]['abstract']
            diff = compare_abstract(abstract, abstract_other)
            # only report articles where some component of the diff is non-empty
            if any([len(x) > 0 for x in diff]):
                print(f'article id {article_id}')
                print_diff(diff)
                if brat_diff_dir:
                    # write a brat view of the article with the relations
                    # unique to each file appended, suffixed _1 and _2
                    txt, ann = json_to_brat.article_brat_repr(
                        data, include_entities=True)
                    rels_in1not2, rels_in2not1 = diff[2], diff[3]
                    for i, rel in enumerate(rels_in1not2):
                        ann.append(
                            json_to_brat.rel_brat_repr(rel,
                                                       i,
                                                       type_suffix='_1'))
                    for i, rel in enumerate(rels_in2not1):
                        ann.append(
                            json_to_brat.rel_brat_repr(rel,
                                                       i + len(rels_in1not2),
                                                       type_suffix='_2'))
                    json_to_brat.write_brat(article_id, brat_diff_dir, txt,
                                            ann)
コード例 #24
0
 def test_full(self, datadir):
     """SplitByDocAttrResults metrics split by document type."""
     data = read_json(datadir + 'test_dataset.json').get('dataset')
     r = SplitByDocAttrResults(data, 'type')
     r = r.get(dfk.EVAL_SPLIT_METRICS)
     r = r.sort_values(by='type')
     # one row per document type, four metric columns
     assert r.shape == (3, 4)
     assert r['type'].tolist() == [
         'book-chapter', 'journal-article', 'reference-entry'
     ]
     assert r['precision'].tolist() == approx([1 / 3, 1 / 2, 1])
     assert r['recall'].tolist() == approx([1 / 4, 2 / 3, 1])
     assert r['F1'].tolist() == approx([2 / 7, 4 / 7, 1])
コード例 #25
0
ファイル: configer.py プロジェクト: kaczmarj/Bioinfor-DeepATT
    def from_config_file(cls, config_file, identifier, verbosity):
        """
        Alternate constructor: build an instance from a JSON config file.
        Used in train, test.

        :param config_file: path to the JSON configuration file; required.
        :param identifier: run identifier, forwarded as the run id.
        :param verbosity: verbosity level, forwarded unchanged.
        :return: ConfigParser.
        """
        assert config_file is not None, (
            "Configuration file need to be specified. "
            "Add '-c ./config/config.json', for example.")
        # parse the file and hand everything to the regular constructor
        return cls(read_json(config_file), identifier, verbosity)
コード例 #26
0
ファイル: parse_config.py プロジェクト: erytheis/BattLeDIM
    def __init__(self,
                 config=None,
                 resume=None,
                 modification=None,
                 run_id=None):
        """
        class to parse configuration json file. Handles hyperparameters for training, initializations of modules, checkpoint saving
        and logging module.
        :param config: Dict containing configurations, hyperparameters for training. contents of `config.json` file for example.
        :param resume: String, path to the checkpoint being loaded.
        :param modification: Dict keychain:value, specifying position values to be replaced from config dict.
        :param run_id: Unique Identifier for training processes. Used to save checkpoints and training log. Timestamp is being used as default
        """

        # fall back to the bundled default config when none is supplied
        if config is None:
            config_path = UTILS_DIR / 'config.json'
            config = read_json(config_path)

        # load config file and apply modification
        self._config = _update_config(config, modification)
        self.resume = resume

        # set save_dir where trained model and log will be saved.
        save_dir = ROOT_DIR / self.config['trainer']['save_dir']

        # set the data_dir
        self.data_dir = ROOT_DIR / self.config['data_loader']['args'][
            'data_dir']

        exper_name = self.config['name']
        if run_id is None:  # use timestamp as default run-id
            run_id = datetime.now().strftime(r'%m%d_%H%M%S')
        self._save_dir = save_dir / 'models' / exper_name / run_id
        self._log_dir = save_dir / 'log' / exper_name / run_id

        # make directory for saving checkpoints and log.
        # an explicitly empty run_id reuses the existing directory;
        # otherwise a pre-existing directory is an error
        exist_ok = run_id == ''
        self.save_dir.mkdir(parents=True, exist_ok=exist_ok)
        self.log_dir.mkdir(parents=True, exist_ok=exist_ok)

        # save updated config file to the checkpoint dir
        write_json(self.config, self.save_dir / 'config.json')

        # configure logging module
        # setup_logging(self.log_dir)
        # numeric verbosity level -> logging level
        self.log_levels = {
            0: logging.WARNING,
            1: logging.INFO,
            2: logging.DEBUG
        }
コード例 #27
0
def get_dataset_definition(dataset_name, train_with):
    """Load a dataset-definition JSON and resolve any unknown label counts.

    :param dataset_name: Name of the dataset; "librispeech" or "TIMIT".
    :param train_with: Key into the definition's 'datasets' section; its
        label set filters and parameterizes the returned definition.
    :return: The dataset-definition dict with 'data_info'/'labels' restricted
        to the labels of `train_with`, and missing 'num_lab' / 'lab_count'
        entries filled in by querying the Kaldi model with `hmm-info`.
    :raises NotImplementedError: For an unknown dataset name, or a label with
        an unresolved count that this function does not know how to query.
    """
    if dataset_name == "librispeech":
        dataset_config_path = "cfg/dataset_definition/librispeech.json"
    elif dataset_name == "TIMIT":
        dataset_config_path = "cfg/dataset_definition/TIMIT.json"
    else:
        raise NotImplementedError(dataset_name)

    dataset_definition = read_json(dataset_config_path)
    train_labels = dataset_definition['datasets'][train_with]['labels']

    # Keep only the label metadata relevant to the chosen training label set.
    dataset_definition['data_info']['labels'] = {
        k: v
        for k, v in dataset_definition['data_info']['labels'].items()
        if k in train_labels
    }

    # Which line of `hmm-info` output carries the count for each label type:
    # line 1 holds the pdf count (context-dependent states, lab_cd), line 0
    # holds the phone count (lab_mono / lab_phn / lab_phnframe).  The two
    # original branches were identical apart from this index.
    hmm_info_line = {
        "lab_cd": 1,
        "lab_mono": 0,
        "lab_phn": 0,
        "lab_phnframe": 0,
    }

    for label, label_info in dataset_definition['data_info']['labels'].items():
        if label_info['num_lab'] is not None:
            continue  # count already known; nothing to resolve
        if label not in hmm_info_line:
            raise NotImplementedError(label)

        folder_lab_count = train_labels[label]['label_folder']
        hmm_info = run_shell_info(
            f"hmm-info {folder_lab_count}/final.mdl")
        # The relevant line ends in "... <count>"; take the trailing integer.
        count_line = hmm_info.split("\n")[hmm_info_line[label]]
        label_info['num_lab'] = int(count_line.rsplit(" ", 1)[1])

        label_info['lab_count'] = get_lab_count(
            label_opts=train_labels[label]['label_opts'],
            num_label=label_info["num_lab"],
            folder_lab_count=folder_lab_count)

    return dataset_definition
コード例 #28
0
 def test_full_summary(self, datadir):
     """End-to-end check of per-document metrics on the sample dataset."""
     raw = read_json(datadir + 'test_dataset.json')
     results = ByDocumentMetricsResults(raw.get('dataset'),
                                        raw.get('dataset_dois'))
     per_doc = results.get(dfk.EVAL_DOC_METRICS).sort_values(by='doc')
     # One row per document, four metric columns, documents in sorted order.
     assert per_doc.shape == (10, 4)
     assert per_doc['doc'].tolist() == sorted(raw.get('dataset_dois'))
     precisions = per_doc['precision'].tolist()
     recalls = per_doc['recall'].tolist()
     f1_scores = per_doc['F1'].tolist()
     # Spot-check the first and last documents of the sorted frame.
     assert precisions[0] == approx(1/3)
     assert precisions[9] == approx(1)
     assert recalls[0] == approx(1)
     assert recalls[9] == approx(0)
     assert f1_scores[0] == approx(1/2)
     assert f1_scores[9] == approx(0)
コード例 #29
0
 def test_full(self, datadir):
     """End-to-end check of reference-level metrics on the sample dataset."""
     dataset = read_json(datadir + 'test_dataset.json').get('dataset')
     results = ReferenceMetricsResults(dataset)
     # (metric key, expected value) pairs: raw counts, their fractions of the
     # 20-reference total, and the overall accuracy.
     expected = [
         (dfk.EVAL_REF_TOTAL, 20),
         (dfk.EVAL_CORR_LINK_C, 9),
         (dfk.EVAL_CORR_NO_LINK_C, 2),
         (dfk.EVAL_INCORR_LINK_C, 5),
         (dfk.EVAL_INCORR_EXISTS_C, 3),
         (dfk.EVAL_INCORR_MISSING_C, 1),
         (dfk.EVAL_CORR_LINK_F, 0.45),
         (dfk.EVAL_CORR_NO_LINK_F, 0.1),
         (dfk.EVAL_INCORR_LINK_F, 0.25),
         (dfk.EVAL_INCORR_EXISTS_F, 0.15),
         (dfk.EVAL_INCORR_MISSING_F, 0.05),
         (dfk.EVAL_ACCURACY, 0.55),
     ]
     for key, value in expected:
         assert results.get(key) == approx(value)
コード例 #30
0
def get_dataset(mode='train'):
    """Build a shuffled, batched tf.data pipeline of image ids for a split.

    :param mode: Dataset split to load ('train' by default).  Selects
        `params['{mode}_json_file']` and `params['{mode}_img_path']`.
        Previously this argument was silently ignored and the train split
        was always loaded; the default keeps the old behavior.
    :return: A `tf.data.Dataset` of parsed samples, batched with a static
        batch dimension and prefetched.

    Side effects: rebinds the module-level globals `id_bboxs_dict`,
    `img_path` and `id_kps_dict` — presumably read by `tf_parse_func`;
    TODO confirm against that function's body.
    """
    global id_bboxs_dict, img_path, params, id_kps_dict

    # Bug fix: honor `mode` instead of hard-coding the train split.
    json_file = params[f'{mode}_json_file']
    img_path = params[f'{mode}_img_path']

    img_ids, id_bboxs_dict, id_kps_dict = read_json(json_file)
    random.shuffle(img_ids)

    dataset = tf.data.Dataset.from_tensor_slices(img_ids)
    # Pipeline-level reshuffle on top of the in-memory shuffle above;
    # repeat(1) yields exactly one epoch.
    dataset = dataset.shuffle(buffer_size=1000).repeat(1)
    dataset = dataset.map(tf_parse_func,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)
    # drop_remainder=True keeps the batch dimension static.
    dataset = dataset.batch(params['batch_size'], drop_remainder=True)
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

    return dataset