def make_index(self):
    # Small-corpus path: build the whole index in one pass, then
    # persist it as a pickle next to the vocabulary JSON.
    one_index = self.data_parser.iter_pages(self.pages_per_file)
    self._update_vocab(one_index)
    dump_pickle(one_index, self.index_path)
    # alternatively, keep the index human-readable:
    # dump_json(one_index, self.index_path)

    self.data_parser.save_dicts()
    self.page_count = self.data_parser.page_count

    dump_json(self.vocab, self.vocab_path, indent=4)


def make_index_large(self):
    # Large-corpus path: stream page batches, write one partial index
    # file per batch, then merge the parts at the end.
    for one_index in self.data_parser.iter_pages(self.pages_per_file):
        self._update_vocab(one_index)
        self._write_file(one_index)
        self.file_count += 1

    self.data_parser.save_page_position()
    self.page_count = self.data_parser.page_count

    dump_json(self.vocab, self.vocab_path)
    self._merge_index()
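
All of the snippets on this page lean on small serialization helpers (dump_json, load_json, dump_pickle) that are never shown. A minimal sketch of what they plausibly look like, inferred only from the call sites above (the out_path and indent parameters match the keyword usage in the examples; everything else is an assumption):

import json
import pickle


def dump_json(obj, out_path, indent=None):
    # Assumed helper: write obj as JSON; indent=4 matches the
    # pretty-printed vocab/cfg files produced above.
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(obj, f, ensure_ascii=False, indent=indent)


def load_json(path):
    # Assumed helper: inverse of dump_json.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


def dump_pickle(obj, out_path):
    # Assumed helper: binary dump for index structures that would be
    # slow or lossy as JSON.
    with open(out_path, 'wb') as f:
        pickle.dump(obj, f)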
Example 3
def query_one_dataset(project_name, top_k=None):
    dataset_dir = os.path.join(data_root, project_name)
    os.makedirs(dataset_dir, exist_ok=True)
    prefix = '{}_'.format(top_k) if top_k else ''
    project_cfg_path = os.path.join(dataset_dir, '{}{}_cfg.json'.format(prefix, project_name))

    if os.path.exists(project_cfg_path):  # cached cfg already exists
        print('read {} from json!'.format(project_name))
        project = load_json(project_cfg_path)
    else:
        print('read {} from mysql!'.format(project_name))
        project = create_dataset_from_sql(project_name, top_k)
        dump_json(project, out_path=project_cfg_path)

    return project
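
The function above is a simple read-through cache: the cfg JSON on disk short-circuits the MySQL query on repeat calls. A hypothetical usage, assuming data_root is configured and a project named 'my_project' exists in the database:

# First call builds 10_my_project_cfg.json from MySQL;
# the second is served straight from the cached JSON.
project = query_one_dataset('my_project', top_k=10)
project = query_one_dataset('my_project', top_k=10)  # read from disk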
Example 4
def cvt_json_super(in_json, A_subs, a_subs):
    # Collapse fine-grained COCO category ids into two super-classes:
    # ids in A_subs become 0 ('A'), ids in a_subs become 1 ('a').
    train_dict = load_json(in_json)
    for ann in train_dict['annotations']:
        if ann['category_id'] in A_subs:
            ann['category_id'] = 0
        elif ann['category_id'] in a_subs:
            ann['category_id'] = 1
    train_dict['categories'] = [
        {
            "id": 0,
            "name": "A"
        },
        {
            "id": 1,
            "name": "a"
        }]
    dump_json(train_dict, in_json.replace('.json', '_super.json'))
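
A hypothetical call, assuming a COCO-style instances.json whose fine-grained category ids 1-3 should collapse into super-class 'A' and 4-5 into 'a' (the groupings are whatever the caller's taxonomy dictates):

# Writes instances_super.json containing only category ids 0 ('A') and 1 ('a').
cvt_json_super('instances.json', A_subs={1, 2, 3}, a_subs={4, 5})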
Example 5
def project_detail(project_name, top_k=None):
    dataset_dir = os.path.join(data_root, project_name)
    print('dataset_dir', dataset_dir)
    os.makedirs(dataset_dir, exist_ok=True)
    prefix = '{}_'.format(top_k) if top_k else ''
    project_cfg_path = os.path.join(
        dataset_dir, '{}{}_cfg.json'.format(prefix, project_name))

    if os.path.exists(project_cfg_path):  # cached cfg already exists
        print('read {} from json!'.format(project_name))
        project = load_json(project_cfg_path)
    else:
        print('read {} from mysql!'.format(project_name))
        # read project cfgs
        sql = "select id, name, taskType, taskRules from d_projects where name='{}'".format(
            project_name)
        res = db.session.execute(sql)
        project = res.fetchone()  # .next() is Python 2 only; fetchone() is portable
        project = parse_projects(project)

        # read data from mysql
        # TODO: store the largest hitId per project to allow incremental dataset updates
        sql = "select d_hits.id as img_id, d_hits.data as path, d_hits_result.result as anns from d_hits, d_hits_result " \
              "where d_hits.projectId='{}' and d_hits.id=d_hits_result.hitId and d_hits.status='done'".format(project['id'])
        res = db.session.execute(sql)
        dataset = create_dataset_from_sql_res(res)
        filtered_cats, filtered_cats_num, train_num, val_num, test_num = split_and_save_coco_dataset(
            dataset, dataset_dir, top_k)

        # update project
        project['cats'] = filtered_cats
        project['cats_num'] = filtered_cats_num
        project['classes'] = len(filtered_cats)
        project['train'] = train_num
        project['valid'] = val_num
        project['test'] = test_num

        dump_json(project, out_path=project_cfg_path)

    return project
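
Both SQL statements above splice project_name and project['id'] directly into the query string, which is unsafe if those values ever come from user input. A sketch of the same project lookup using SQLAlchemy bound parameters instead (a swapped-in technique, not what the original code does):

from sqlalchemy import text

# The driver escapes :name itself, so quotes in project_name
# cannot break out of the SQL string.
sql = text("select id, name, taskType, taskRules "
           "from d_projects where name = :name")
res = db.session.execute(sql, {'name': project_name})
project = parse_projects(res.fetchone())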
Example 6
def save_coco_dataset(train_data,
                      val_data,
                      test_data,
                      cats,
                      dataset_dir,
                      use_prefix=False):
    if use_prefix:
        prefix = '{}_'.format(len(cats))
    else:
        prefix = ''

    # cvt to coco
    dataset_name = os.path.basename(dataset_dir)
    train_coco = convert_to_coco(train_data,
                                 cats,
                                 info=dataset_name + ' train ' +
                                 prefix.replace('_', ''))
    val_coco = convert_to_coco(val_data,
                               cats,
                               info=dataset_name + ' val ' +
                               prefix.replace('_', ''))
    test_coco = convert_to_coco(test_data,
                                cats,
                                info=dataset_name + ' test ' +
                                prefix.replace('_', ''))

    # save
    dump_json(train_coco,
              out_path=os.path.join(dataset_dir,
                                    '{}{}_train.json'.format(prefix, dataset_name)))
    dump_json(val_coco,
              out_path=os.path.join(dataset_dir,
                                    '{}{}_val.json'.format(prefix, dataset_name)))
    dump_json(test_coco,
              out_path=os.path.join(dataset_dir,
                                    '{}{}_test.json'.format(prefix, dataset_name)))
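
A hypothetical invocation, assuming cats comes from a project cfg and dataset_dir lives under data_root; with use_prefix=True and, say, 20 categories, this writes 20_my_project_train.json, 20_my_project_val.json and 20_my_project_test.json:

save_coco_dataset(train_data, val_data, test_data,
                  cats=project['cats'],
                  dataset_dir=os.path.join(data_root, 'my_project'),
                  use_prefix=True)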
def save_cfg(self, path):
    cfg = self.data_parser.get_cfg()
    # Export only scalar (str/int) attributes; complex members such as
    # parsers and index structures are not JSON-serializable.
    for k, v in self.__dict__.items():
        if type(v) in (str, int):
            cfg[k] = v
    dump_json(cfg, path, indent=4)


def save_dicts(self):
    dump_pickle(self.page_positions, self.page_positions_path)
    dump_pickle(self.page_len_list, self.page_len_path)
    dump_pickle(self.page_word_index, self.page_word_index_path)
    dump_json(self.stem_word_dict, self.stemmer_path, indent=4)