def __init__(self, data_path="", output_dir="", debug=False):
    self.name = "WikiParser"
    self.data_path = data_path
    self.STOP_WORDS_PATH = os.path.join("index", "stop_words.json")
    self.stemmer_path = output_dir + "/stem.json"
    # Snowball is used here; the Porter or another stemmer would also work
    self.stemmer = nltk.stem.SnowballStemmer('english')
    if os.path.exists(self.stemmer_path):
        self.stem_word_dict = load_json(self.stemmer_path)
        self.stem_word_dict = defaultdict(lambda: False, self.stem_word_dict)
    else:
        self.stem_word_dict = defaultdict(lambda: False)
    self.stop_words = set(load_json(self.STOP_WORDS_PATH)["stop_words"])
    self.tokenizer = BasicTokenizer(never_split=[])
    self.punctuation = re.compile(r"[{}]+".format(punctuation))
    self.page_count = 0  # counter for the current wiki page number
    self.page_positions = dict()  # position (file offset) of each page in the source file
    self.page_positions_path = output_dir + "/page_positions.pickle"
    self.page_len_path = output_dir + "/page_len.pickle"
    self.page_len_list = []  # a list storing the length of each page
    self.page_len = 0
    self.page_word_index_path = output_dir + "/page_word_idnex.pickle"
    self.page_word_index = []
    self.debug = debug
def initialize(obj_name, to_file=False):
    """We need to make sure we have the database schema first."""
    db_schema = load_json("resources/database-structure-definition.json")
    if not validate_schema(db_schema, obj_name):
        return
    if to_file:
        global dao_file
        dao_file = open('app/dao/{}.py'.format(obj_name), 'w')
    create_object_comment_block(obj_name)
    create_class(obj_name)
    create_init_func(obj_name)
    create_insert_object_to_db(obj_name, db_schema[obj_name])
    create_update_object_to_db(obj_name, db_schema[obj_name])
    create_delete_object_to_db(obj_name, db_schema[obj_name])
    create_list_objects_of_type(obj_name)
    create_methods_by_unique_key(obj_name, db_schema[obj_name])
    create_get_objs_for_key(obj_name, db_schema[obj_name])
    printt_dao("")
    printt_dao("{}DAO = {}DAO()".format(uncapitalize(obj_name), obj_name))
    if to_file:
        dao_file.close()
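# Minimal usage sketch for the DAO generator above. The "User" object name and
# running from the project root are assumptions for illustration, not taken from
# the source:
#
#     initialize("User")                 # print the generated DAO code to stdout
#     initialize("User", to_file=True)   # write it to app/dao/User.py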
def __init__(self, index_config_path, ranker_name):
    self.index_cfg = load_json(index_config_path)
    self.index_pst = defaultdict(lambda: [[], []],
                                 load_pickle(self.index_cfg["index_path"]))
    self.page_count = self.index_cfg["page_count"]
    self.vocab = load_json(self.index_cfg["vocab_path"])
    self.total_word = 0
    for v in self.vocab.values():
        self.total_word += v
    # self.tokenizer = BasicTokenizer(never_split=[])
    self.ranker = None
    self._start_ranker(ranker_name)
    self.parser = parser_strategy[self.index_cfg["name"]]()
    # source file
    self.fstream = open(self.index_cfg["data_path"], "r", encoding="utf8")
    self.page_positions = load_pickle(self.index_cfg["page_positions_path"])
    self.punc = " " + punctuation + "\n"
def test():
    index_cfg = load_json(settings.cfg_path)
    doc_word_index = load_pickle(index_cfg["page_word_index_path"])  # list
    page_count = index_cfg["page_count"]
    doc_len = load_pickle(index_cfg["page_len_path"])
    fstream = open(index_cfg["data_path"], "r", encoding="utf8")
    page_positions = load_pickle(index_cfg["page_positions_path"])
    for i, x in tqdm(enumerate(doc_word_index)):
        if doc_len[i] == 0:
            fstream.seek(page_positions[i], 0)
            print(fstream.readline())
    return
def load_black_Json_Result(self, dict_name):
    CD_dict = str(dict_name).split(".")[0]
    # print(CD_dict)
    black_outputs_path = self.black_Result_dir
    json_content = load_json(black_outputs_path)
    analyze_json(json_content)
    # results for the main model
    model_content = output_value(json_content, "model")
    model_BDResult = output_value(model_content, "BDResult")
    model_CDResult = output_value(model_content, "CDResult")
    model_CDResult_dict = output_value(model_CDResult, CD_dict)
    black_origin_outputs = dict_list_to_np(model_BDResult)
    black_adv_outputs = dict_list_to_np(model_CDResult_dict)
    if not self.IS_COMPARE_MODEL:
        return black_origin_outputs, black_adv_outputs
    # the same results for the compared model ("compare_model")
    model_defense_content = output_value(json_content, "compare_model")
    model_defense_BDResult = output_value(model_defense_content, "BDResult")
    model_defense_CDResult = output_value(model_defense_content, "CDResult")
    model_defense_CDResult_dict = output_value(model_defense_CDResult, CD_dict)
    black_defense_origin_outputs = dict_list_to_np(model_defense_BDResult)
    black_defense_adv_outputs = dict_list_to_np(model_defense_CDResult_dict)
    return (black_origin_outputs, black_adv_outputs,
            black_defense_origin_outputs, black_defense_adv_outputs)
def query_one_dataset(project_name, top_k=None):
    dataset_dir = os.path.join(data_root, project_name)
    os.makedirs(dataset_dir, exist_ok=True)
    prefix = '{}_'.format(top_k) if top_k else ''
    project_cfg_path = os.path.join(dataset_dir,
                                    '{}{}_cfg.json'.format(prefix, project_name))
    if os.path.exists(project_cfg_path):  # has dataset_file
        print('read {} from json!'.format(project_name))
        project = load_json(project_cfg_path)
    else:
        print('read {} from mysql!'.format(project_name))
        project = create_dataset_from_sql(project_name, top_k)
        dump_json(project,
                  out_path=os.path.join(dataset_dir,
                                        '{}{}_cfg.json'.format(prefix, project_name)))
    return project
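# Minimal usage sketch, assuming a hypothetical project named "traffic_signs" and
# a data_root that already points at the dataset directory; top_k is assumed to
# limit the dataset to the top_k categories (its exact meaning is defined in
# create_dataset_from_sql, which is not shown here):
#
#     project = query_one_dataset("traffic_signs")
#     project_top5 = query_one_dataset("traffic_signs", top_k=5)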
def cvt_json_suepr(in_json, A_subs, a_subs):
    train_dict = load_json(in_json)
    for ann in train_dict['annotations']:
        if ann['category_id'] in A_subs:
            ann['category_id'] = 0
        elif ann['category_id'] in a_subs:
            ann['category_id'] = 1
    train_dict['categories'] = [
        {"id": 0, "name": "A"},
        {"id": 1, "name": "a"},
    ]
    dump_json(train_dict, in_json.replace('.json', '_super.json'))
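# Minimal usage sketch with hypothetical category ids: ids 1-3 are remapped to
# super-category "A" (id 0) and ids 4-5 to "a" (id 1); the result is written next
# to the input as train_super.json:
#
#     cvt_json_suepr('annotations/train.json', A_subs={1, 2, 3}, a_subs={4, 5})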
def update_page_word_index():
    # used for VSM tf-idf: precompute each document's tf-idf vector norm
    index_cfg = load_json(settings.cfg_path)
    doc_word_index = load_pickle(index_cfg["page_word_index_path"])  # list
    page_count = index_cfg["page_count"]
    doc_len = load_pickle(index_cfg["page_len_path"])
    index = defaultdict(lambda: [[], []], load_pickle(index_cfg["index_path"]))
    for i, x in tqdm(enumerate(doc_word_index), total=len(doc_word_index)):
        sum_v = 0
        for k, v in x.items():
            sum_v += (v * log10(page_count / (len(index[k][0]) + 1))) ** 2
        if doc_len[i]:
            sum_v = (sum_v / doc_len[i]) ** 0.5
        else:
            sum_v = 0
        x["_sum_tfidf"] = sum_v
    dump_pickle(doc_word_index, index_cfg["page_word_index_path"])
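# A minimal worked sketch of the per-document norm computed above. The helper is
# hypothetical (not part of the original code) but uses the same smoothed
# idf = log10(page_count / (df + 1)) and the same length normalisation.
def _tfidf_norm_example():
    from math import log10
    page_count = 1000              # total number of pages in the index
    doc_tf = {"cat": 3, "sat": 1}  # term -> raw count in this document
    df = {"cat": 9, "sat": 99}     # term -> number of pages containing the term
    doc_len = 4                    # length of this document
    sum_v = sum((tf * log10(page_count / (df[t] + 1))) ** 2
                for t, tf in doc_tf.items())
    return (sum_v / doc_len) ** 0.5  # the value stored as x["_sum_tfidf"] above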
def project_detail(project_name, top_k=None):
    dataset_dir = os.path.join(data_root, project_name)
    print('dataset_dir', dataset_dir)
    os.makedirs(dataset_dir, exist_ok=True)
    prefix = '{}_'.format(top_k) if top_k else ''
    project_cfg_path = os.path.join(
        dataset_dir, '{}{}_cfg.json'.format(prefix, project_name))
    if os.path.exists(project_cfg_path):  # has dataset_file
        print('read {} from json!'.format(project_name))
        project = load_json(project_cfg_path)
    else:
        print('read {} from mysql!'.format(project_name))
        # read project cfgs
        sql = "select id, name, taskType, taskRules from d_projects where name='{}'".format(
            project_name)
        res = db.session.execute(sql)
        project = res.next()
        project = parse_projects(project)
        # read data from mysql
        # todo: store the largest hitId for a project, better for updating the dataset
        sql = "select d_hits.id as img_id, d_hits.data as path, d_hits_result.result as anns from d_hits, d_hits_result " \
              "where d_hits.projectId='{}' and d_hits.id=d_hits_result.hitId and d_hits.status='done'".format(project['id'])
        res = db.session.execute(sql)
        dataset = create_dataset_from_sql_res(res)
        filted_cats, filted_cats_num, train_num, val_num, test_num = split_and_save_coco_dataset(
            dataset, dataset_dir, top_k)
        # update project
        project['cats'] = filted_cats
        project['cats_num'] = filted_cats_num
        project['classes'] = len(filted_cats)
        project['train'] = train_num
        project['valid'] = val_num
        project['test'] = test_num
        dump_json(project, out_path=os.path.join(
            dataset_dir, '{}{}_cfg.json'.format(prefix, project_name)))
    return project
def apply_syntax_colorization(text, language=None):
    if not language:
        return text  # no language specified: return the text unmodified
    db_schema = load_json("resources/language-syntax/python-syntax.json")
    keywords = db_schema["keywords"]
    ops = db_schema["operators"]
    braces = db_schema["braces"]
    for op in ops:
        text = text.replace(op, "{}{}{}".format(cs.PASTEL_PURPLE, op, cs.DEFAULT))
    for key in keywords:
        text = text.replace(key, "{}{}{}".format(cs.PASTEL_PINK, key, cs.DEFAULT))
    return text
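# Layout of python-syntax.json as inferred from the keys read above; the entries
# are illustrative guesses, not the contents of the actual resource file:
#
#     {
#       "keywords": ["def", "return", "if", "else", "for", "while"],
#       "operators": ["+", "-", "*", "/", "=="],
#       "braces": ["(", ")", "[", "]", "{", "}"]
#     }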
def initialize(obj_name, to_file=False):
    """We need to make sure we have the database schema first."""
    db_schema = load_json("resources/database-structure-definition.json")
    if not validate_schema(db_schema, obj_name):
        return
    if to_file:
        global bo_file
        bo_file = open('app/bo/{}.py'.format(obj_name), 'w')
    create_object_comment_block(obj_name)
    get_external_imports(db_schema, obj_name)
    create_class(obj_name)
    create_init(obj_name)
    create_getters_setters(obj_name, db_schema[obj_name])
    create_crud_functions(obj_name)
    create_list_referenced_methods(db_schema, obj_name)
    if to_file:
        bo_file.close()
def initialize(obj_name, to_file=False):
    """We need to make sure we have the database schema first."""
    db_schema = load_json("resources/database-structure-definition.json")
    if not validate_schema(db_schema, obj_name):
        return
    if to_file:
        global do_file
        do_file = open('app/do/{}.py'.format(obj_name), 'w')
    create_object_comment_block(obj_name)
    create_class(obj_name)
    create_init_method(obj_name, db_schema[obj_name])
    create_to_json(obj_name, db_schema[obj_name])
    create_to_string(obj_name, db_schema[obj_name])
    create_equals_to(obj_name, db_schema[obj_name])
    create_to_obj(obj_name, db_schema[obj_name])
    printt_do("")
    if to_file:
        do_file.close()
def getSchema(self, objName):
    dbSchema = load_json("resources/database-structure-definition.json")
    if not validate_schema(dbSchema, objName):
        raise Exception('The object name provided does not exist.')
    return dbSchema
def split_coco(coco_path, train_path, test_path, val_path,
               train_ratio=0.7, val_ratio=0.3):
    origin = load_json(coco_path)
    train_dataset = {}
    test_dataset = {}
    val_dataset = {}
    train_dataset['info'] = origin['info'] + '_train'
    test_dataset['info'] = origin['info'] + '_test'
    val_dataset['info'] = origin['info'] + '_val'
    train_dataset['licenses'] = test_dataset['licenses'] = \
        val_dataset['licenses'] = origin['licenses']
    train_dataset['categories'] = test_dataset['categories'] = \
        val_dataset['categories'] = origin['categories']
    train_dataset['images'] = []
    test_dataset['images'] = []
    val_dataset['images'] = []
    train_dataset['annotations'] = []
    test_dataset['annotations'] = []
    val_dataset['annotations'] = []
    images = origin['images']
    annotations = origin['annotations']
    annos = {}       # dict { image_id: [annotation indexes] }
    cat_images = {}  # dict { category_id: [image_ids] }
    for index, annotation in enumerate(annotations):
        if annos.get(annotation['image_id'], -1) == -1:
            annos[annotation['image_id']] = []  # create image_id key
        if cat_images.get(annotation['category_id'], -1) == -1:
            cat_images[annotation['category_id']] = []  # create category_id key
        annos[annotation['image_id']].append(index)
        cat_images[annotation['category_id']].append(annotation['image_id'])
    id_images = {}  # dict { image_id: image dict }
    for image in images:
        id_images[image['id']] = image
    for (cat_id, per_cat_images) in cat_images.items():  # split each category separately
        random.shuffle(per_cat_images)
        dataset_size = len(per_cat_images)
        train_size = int(dataset_size * train_ratio)
        val_size = int(dataset_size * val_ratio)
        train_images = per_cat_images[0:train_size]
        test_images = per_cat_images[train_size:]
        # the validation set is sampled from both splits, in proportion to train_ratio
        val_from_train = int(val_size * train_ratio)
        val_from_test = val_size - val_from_train
        val_images = random.sample(train_images, val_from_train) + \
            random.sample(test_images, val_from_test)
        for image_id in train_images:
            train_dataset['images'].append(id_images[image_id])
            anno_indexs = annos[image_id]  # annotations in this image
            for anno_index in anno_indexs:
                # keep only annotations of the current category
                if annotations[anno_index]['category_id'] == cat_id:
                    train_dataset['annotations'].append(annotations[anno_index])
        for image_id in test_images:
            test_dataset['images'].append(id_images[image_id])
            anno_indexs = annos[image_id]
            for anno_index in anno_indexs:
                if annotations[anno_index]['category_id'] == cat_id:
                    test_dataset['annotations'].append(annotations[anno_index])
        for image_id in val_images:
            val_dataset['images'].append(id_images[image_id])
            anno_indexs = annos[image_id]
            for anno_index in anno_indexs:
                if annotations[anno_index]['category_id'] == cat_id:
                    val_dataset['annotations'].append(annotations[anno_index])
    write_json(train_dataset, train_path)
    write_json(test_dataset, test_path)
    write_json(val_dataset, val_path)
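# Minimal usage sketch with hypothetical paths: per category, 70% of the images
# go to train and the rest to test, while a 30% validation set is sampled from
# both splits:
#
#     split_coco('annotations/instances.json',
#                'annotations/train.json',
#                'annotations/test.json',
#                'annotations/val.json',
#                train_ratio=0.7, val_ratio=0.3)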
def load_database_definition():
    """Load the database definition and return it as a JSON dictionary."""
    return load_json("resources/database-structure-definition.json")
def load_properties():
    global app_properties
    app_properties = load_json("resources/properties.json")