def __init__(self, data_path="", output_dir="", debug=False):
        self.name = "WikiParser"
        self.data_path = data_path
        self.STOP_WORDS_PATH = os.path.join("index", "stop_words.json")
        self.stemmer_path = os.path.join(output_dir, "stem.json")
        self.stemmer = nltk.stem.SnowballStemmer(
            'english')  # You can use porter or other stemmer

        if os.path.exists(self.stemmer_path):
            self.stem_word_dict = load_json(self.stemmer_path)
            self.stem_word_dict = defaultdict(lambda: False,
                                              self.stem_word_dict)
        else:
            self.stem_word_dict = defaultdict(lambda: False)
        self.stop_words = set(load_json(self.STOP_WORDS_PATH)["stop_words"])
        self.tokenizer = BasicTokenizer(never_split=[])
        self.punctuation = re.compile(r"[{}]+".format(punctuation))

        self.page_count = 0  # index of the current wiki page
        self.page_positions = dict()  # byte offset of each page in the source file
        self.page_positions_path = os.path.join(output_dir, "page_positions.pickle")
        self.page_len_path = os.path.join(output_dir, "page_len.pickle")  # list storing the length of each page
        self.page_len_list = []
        self.page_len = 0

        self.page_word_index_path = os.path.join(output_dir, "page_word_index.pickle")
        self.page_word_index = []

        self.debug = debug
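All of the examples in this listing lean on a small set of serialization helpers (load_json, dump_json, load_pickle, dump_pickle) whose implementations are not shown here. A minimal sketch, assuming they are thin wrappers around the standard json and pickle modules, might look like this:

import json
import pickle

def load_json(path):
    # Read a UTF-8 JSON file into a Python object.
    with open(path, "r", encoding="utf8") as f:
        return json.load(f)

def dump_json(obj, out_path):
    # Write a Python object as pretty-printed UTF-8 JSON.
    with open(out_path, "w", encoding="utf8") as f:
        json.dump(obj, f, ensure_ascii=False, indent=2)

def load_pickle(path):
    # Deserialize a pickled object from disk.
    with open(path, "rb") as f:
        return pickle.load(f)

def dump_pickle(obj, path):
    # Serialize an object to disk with pickle.
    with open(path, "wb") as f:
        pickle.dump(obj, f)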
Example #2
def initialize(obj_name, to_file=False):
    """We need to make sure we have the database schema first."""
    db_schema = load_json("resources/database-structure-definition.json")

    if not validate_schema(db_schema, obj_name):
        return

    if to_file:
        global dao_file
        dao_file = open('app/dao/{}.py'.format(obj_name), 'w')

    create_object_comment_block(obj_name)
    create_class(obj_name)
    create_init_func(obj_name)
    create_insert_object_to_db(obj_name, db_schema[obj_name])
    create_update_object_to_db(obj_name, db_schema[obj_name])
    create_delete_object_to_db(obj_name, db_schema[obj_name])
    create_list_objects_of_type(obj_name)
    create_methods_by_unique_key(obj_name, db_schema[obj_name])
    create_get_objs_for_key(obj_name, db_schema[obj_name])
    printt_dao("")
    printt_dao("{}DAO = {}DAO()".format(uncapitalize(obj_name), obj_name))

    if to_file:
        dao_file.close()
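Examples #2, #11 and #12 in this listing follow the same code-generation pattern: validate the object name against the schema, optionally open a target file through a module-level handle, then emit the class piece by piece. A hedged usage sketch ("User" is a placeholder object name that would have to exist in database-structure-definition.json):

# Generate app/dao/User.py from the schema definition (object name is illustrative).
initialize("User", to_file=True)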
Example #3
    def __init__(self, index_config_path, ranker_name):
        self.index_cfg = load_json(index_config_path)
        self.index_pst = defaultdict(lambda: [[], []],
                                     load_pickle(self.index_cfg["index_path"]))
        self.page_count = self.index_cfg["page_count"]
        self.vocab = load_json(self.index_cfg["vocab_path"])
        self.total_word = sum(self.vocab.values())

        # self.tokenizer = BasicTokenizer(never_split=[])
        self.ranker = None
        self._start_ranker(ranker_name)
        self.parser = parser_strategy[self.index_cfg["name"]]()

        # source file
        self.fstream = open(self.index_cfg["data_path"], "r", encoding="utf8")
        self.page_positions = load_pickle(
            self.index_cfg["page_positions_path"])

        self.punc = " " + punctuation + "\n"
Example #4
def test():
    # sanity check: find the first page recorded with zero length and print its raw source line
    index_cfg = load_json(settings.cfg_path)
    doc_word_index = load_pickle(index_cfg["page_word_index_path"])  # list
    page_count = index_cfg["page_count"]
    doc_len = load_pickle(index_cfg["page_len_path"])

    fstream = open(index_cfg["data_path"], "r", encoding="utf8")
    page_positions = load_pickle(index_cfg["page_positions_path"])

    for i, x in tqdm(enumerate(doc_word_index)):
        if doc_len[i] == 0:
            fstream.seek(page_positions[i], 0)
            print(fstream.readline())
            return
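The seek in Example #4 only works because the parser records the byte offset of every page before consuming it (the page_positions dict from Example #1). A minimal sketch of that bookkeeping, assuming one wiki page per line and using a hypothetical standalone helper:

def build_page_positions(data_path):
    # Remember the byte offset of each page so it can later be re-read
    # with fstream.seek(offset); assumes one wiki page per line.
    page_positions = {}
    with open(data_path, "r", encoding="utf8") as f:
        page_id = 0
        while True:
            pos = f.tell()       # offset before the page is consumed
            line = f.readline()
            if not line:
                break
            page_positions[page_id] = pos
            page_id += 1
    return page_positions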
Example #5
    def load_black_Json_Result(self, dict_name):
        black_origin_outputs = None
        black_adv_outputs = None
        black_defense_origin_outputs = None
        black_defense_adv_outputs = None
        CD_dict = str(dict_name).split(".")[0]
        #print(CD_dict)
        if self.IS_COMPARE_MODEL:
            black_outputs_path = self.black_Result_dir
            json_content = load_json(black_outputs_path)
            analyze_json(json_content)
            model_content = output_value(json_content, "model")
            model_BDResult = output_value(model_content, "BDResult")
            model_CDResult = output_value(model_content, "CDResult")
            model_CDResult_dict = output_value(model_CDResult, CD_dict)
            black_origin_outputs = dict_list_to_np(model_BDResult)
            black_adv_outputs = dict_list_to_np(model_CDResult_dict)

            model_defense_content = output_value(json_content, "compare_model")
            model_defense_BDResult = output_value(model_defense_content, "BDResult")
            model_defense_CDResult = output_value(model_defense_content, "CDResult")
            model_defense_CDResult_dict = output_value(model_defense_CDResult, CD_dict)
            black_defense_origin_outputs = dict_list_to_np(model_defense_BDResult)
            black_defense_adv_outputs = dict_list_to_np(model_defense_CDResult_dict)

            return black_origin_outputs, black_adv_outputs, black_defense_origin_outputs, black_defense_adv_outputs
        else:
            black_outputs_path = self.black_Result_dir
            json_content = load_json(black_outputs_path)
            analyze_json(json_content)
            model_content = output_value(json_content, "model")
            model_BDResult = output_value(model_content, "BDResult")
            model_CDResult = output_value(model_content, "CDResult")
            model_CDResult_dict = output_value(model_CDResult, CD_dict)
            black_origin_outputs = dict_list_to_np(model_BDResult)
            black_adv_outputs = dict_list_to_np(model_CDResult_dict)
            return black_origin_outputs, black_adv_outputs
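The helpers used above (analyze_json, output_value, dict_list_to_np) are project-specific and not reproduced in this listing. Judging only from the call sites, output_value behaves like a keyed lookup and dict_list_to_np flattens a dict of lists into an array; the following stand-ins are pure assumptions, shown only for readability:

import numpy as np

def output_value(json_content, key):
    # Hypothetical stand-in: return the value stored under `key`.
    return json_content[key]

def dict_list_to_np(dict_of_lists):
    # Hypothetical stand-in: concatenate the per-key lists into one array.
    values = []
    for v in dict_of_lists.values():
        values.extend(v)
    return np.array(values)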
Example #6
def query_one_dataset(project_name, top_k=None):
    dataset_dir = os.path.join(data_root, project_name)
    os.makedirs(dataset_dir, exist_ok=True)
    prefix = '{}_'.format(top_k) if top_k else ''
    project_cfg_path = os.path.join(dataset_dir, '{}{}_cfg.json'.format(prefix, project_name))

    if os.path.exists(project_cfg_path):  # has dataset_file
        print('read {} from json!'.format(project_name))
        project = load_json(project_cfg_path)
    else:
        print('read {} from mysql!'.format(project_name))
        project = create_dataset_from_sql(project_name, top_k)
        dump_json(project, out_path=os.path.join(dataset_dir, '{}{}_cfg.json'.format(prefix, project_name)))

    return project
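query_one_dataset caches the dataset config as JSON on first use and reads the cached file on later calls. A hedged usage sketch (the project name and top_k value are placeholders):

# First call builds <top_k>_<project>_cfg.json from MySQL; repeated calls
# with the same arguments read the cached JSON instead.
project = query_one_dataset("my_project", top_k=20)
print(sorted(project.keys()))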
Example #7
def cvt_json_suepr(in_json, A_subs, a_subs):
    train_dict = load_json(in_json)
    for ann in train_dict['annotations']:
        if ann['category_id'] in A_subs:
            ann['category_id'] = 0
        elif ann['category_id'] in a_subs:
            ann['category_id'] = 1
    train_dict['categories'] = [
        {
            "id": 0,
            "name": "A"
        },
        {
            "id": 1,
            "name": "a"
        }]
    dump_json(train_dict, in_json.replace('.json', '_super.json'))
Example #8
def update_page_word_index():
    # precompute each document's tf-idf vector norm for the VSM ranker
    index_cfg = load_json(settings.cfg_path)
    doc_word_index = load_pickle(index_cfg["page_word_index_path"])  # list
    page_count = index_cfg["page_count"]
    doc_len = load_pickle(index_cfg["page_len_path"])
    index = defaultdict(lambda: [[], []], load_pickle(index_cfg["index_path"]))

    for i, x in tqdm(enumerate(doc_word_index), total=len(doc_word_index)):
        sum_v = 0
        for k, v in x.items():
            sum_v += (v * log10(page_count / (len(index[k][0]) + 1)))**2
        if doc_len[i]:
            sum_v = (sum_v / doc_len[i])**0.5
        else:
            sum_v = 0
        x["_sum_tfidf"] = sum_v
    dump_pickle(doc_word_index, index_cfg["page_word_index_path"])
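The cached "_sum_tfidf" value is the length-normalized norm of the document's tf-idf vector: (sum_k (tf_k * idf_k)^2 / doc_len) ** 0.5, with idf_k = log10(page_count / (df_k + 1)). A small self-contained illustration with toy counts (all numbers are made up):

from math import log10

N = 1000                        # page_count
tf = {"wiki": 3, "parser": 1}   # term frequency in one document
df = {"wiki": 40, "parser": 5}  # number of pages containing each term
doc_len = 4

sum_v = sum((tf[k] * log10(N / (df[k] + 1))) ** 2 for k in tf)
norm = (sum_v / doc_len) ** 0.5 if doc_len else 0
print(round(norm, 3))  # the value that would be cached as "_sum_tfidf"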
Example #9
def project_detail(project_name, top_k=None):
    dataset_dir = os.path.join(data_root, project_name)
    print('dataset_dir', dataset_dir)
    os.makedirs(dataset_dir, exist_ok=True)
    prefix = '{}_'.format(top_k) if top_k else ''
    project_cfg_path = os.path.join(
        dataset_dir, '{}{}_cfg.json'.format(prefix, project_name))

    if os.path.exists(project_cfg_path):  # has dataset_file
        print('read {} from json!'.format(project_name))
        project = load_json(project_cfg_path)
    else:
        print('read {} from mysql!'.format(project_name))
        # read project cfgs
        sql = "select id, name, taskType, taskRules from d_projects where name='{}'".format(
            project_name)
        res = db.session.execute(sql)
        project = res.next()
        project = parse_projects(project)

        # read data from mysql
        # todo: store the largest hitId for a project, better for update dataset
        sql = "select d_hits.id as img_id, d_hits.data as path, d_hits_result.result as anns from d_hits, d_hits_result " \
              "where d_hits.projectId='{}' and d_hits.id=d_hits_result.hitId and d_hits.status='done'".format(project['id'])
        res = db.session.execute(sql)
        dataset = create_dataset_from_sql_res(res)
        filted_cats, filted_cats_num, train_num, val_num, test_num = split_and_save_coco_dataset(
            dataset, dataset_dir, top_k)

        # update project
        project['cats'] = filted_cats
        project['cats_num'] = filted_cats_num
        project['classes'] = len(filted_cats)
        project['train'] = train_num
        project['valid'] = val_num
        project['test'] = test_num

        dump_json(project,
                  out_path=os.path.join(
                      dataset_dir,
                      '{}{}_cfg.json'.format(prefix, project_name)))

    return project
Example #10
def apply_syntax_colorization(text, language=None):
    if not language:
        return text  # no language specified: return the text unchanged

    db_schema = load_json("resources/language-syntax/python-syntax.json")

    keywords = db_schema["keywords"]
    ops = db_schema["operators"]
    braces = db_schema["braces"]

    for op in ops:
        text = text.replace(op, "{}{}{}".format(cs.PASTEL_PURPLE, op,
                                                cs.DEFAULT))

    for key in keywords:
        text = text.replace(key, "{}{}{}".format(cs.PASTEL_PINK, key,
                                                 cs.DEFAULT))

    return text
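Note that plain str.replace recolors keywords even when they appear inside longer identifiers (for example "in" inside "print"). A hedged alternative for the keyword pass, using word boundaries instead (a sketch, not the listing's implementation):

import re

def colorize_keywords(text, keywords, color, reset):
    # Only recolor whole-word keyword matches, not substrings of identifiers.
    pattern = re.compile(r"\b(" + "|".join(map(re.escape, keywords)) + r")\b")
    return pattern.sub(lambda m: "{}{}{}".format(color, m.group(1), reset), text)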
Example #11
def initialize(obj_name, to_file=False):
	"""We need to make sure we have the database schema first."""
	db_schema = load_json("resources/database-structure-definition.json")

	if not validate_schema(db_schema, obj_name):
		return

	if to_file:
		global bo_file
		bo_file = open('app/bo/{}.py'.format(obj_name), 'w')

	create_object_comment_block(obj_name)
	get_external_imports(db_schema, obj_name)
	create_class(obj_name)
	create_init(obj_name)
	create_getters_setters(obj_name, db_schema[obj_name])
	create_crud_functions(obj_name)
	create_list_referenced_methods(db_schema, obj_name)

	if to_file:
		bo_file.close()
Example #12
def initialize(obj_name, to_file=False):
    """We need to make sure we have the database schema first."""
    db_schema = load_json("resources/database-structure-definition.json")

    if not validate_schema(db_schema, obj_name):
        return

    if to_file:
        global do_file
        do_file = open('app/do/{}.py'.format(obj_name), 'w')

    create_object_comment_block(obj_name)
    create_class(obj_name)
    create_init_method(obj_name, db_schema[obj_name])
    create_to_json(obj_name, db_schema[obj_name])
    create_to_string(obj_name, db_schema[obj_name])
    create_equals_to(obj_name, db_schema[obj_name])
    create_to_obj(obj_name, db_schema[obj_name])
    printt_do("")

    if to_file:
        do_file.close()
Example #13
    def getSchema(self, objName):
        dbSchema = load_json("resources/database-structure-definition.json")
        if not validate_schema(dbSchema, objName):
            raise Exception('The object name provided does not exist.')
        return dbSchema
Example #14
def split_coco(coco_path,
               train_path,
               test_path,
               val_path,
               train_ratio=0.7,
               val_ratio=0.3):
    origin = load_json(coco_path)
    train_dataset = {}
    test_dataset = {}
    val_dataset = {}

    train_dataset['info'] = origin['info'] + '_train'
    test_dataset['info'] = origin['info'] + '_test'
    val_dataset['info'] = origin['info'] + '_val'

    train_dataset['licenses'] = test_dataset['licenses'] = val_dataset[
        'licenses'] = origin['licenses']
    train_dataset['categories'] = test_dataset['categories'] = val_dataset[
        'categories'] = origin['categories']
    train_dataset['images'] = []
    test_dataset['images'] = []
    val_dataset['images'] = []
    train_dataset['annotations'] = []
    test_dataset['annotations'] = []
    val_dataset['annotations'] = []

    images = origin['images']
    annotations = origin['annotations']

    annos = {}  # dict { image_id : anns idx }
    cat_images = {}  # dict { cat_id: image_ids }
    for index, annotation in enumerate(annotations):
        if annotation['image_id'] not in annos:
            annos[annotation['image_id']] = []  # first annotation for this image
        if annotation['category_id'] not in cat_images:
            cat_images[annotation['category_id']] = []  # first image for this category
        annos[annotation['image_id']].append(index)
        cat_images[annotation['category_id']].append(annotation['image_id'])

    id_images = {}  # dict { image_id: image_dict }
    for image in images:
        id_images[image['id']] = image

    for (cat_id, per_cat_images) in cat_images.items():  # each cat split

        random.shuffle(per_cat_images)

        dataset_size = len(per_cat_images)
        train_size = int(dataset_size * train_ratio)
        val_size = int(dataset_size * val_ratio)
        train_images = per_cat_images[0:train_size]
        test_images = per_cat_images[train_size:]

        # add val
        val_from_train = int(val_size * train_ratio)
        val_from_test = val_size - val_from_train
        val_images = random.sample(train_images,
                                   val_from_train) + random.sample(
                                       test_images, val_from_test)

        for image_id in train_images:
            train_dataset['images'].append(id_images[image_id])
            anno_indexs = annos[image_id]  # annotations attached to this image
            for anno_index in anno_indexs:
                # only keep annotations of the category currently being split
                if annotations[anno_index]['category_id'] == cat_id:
                    train_dataset['annotations'].append(annotations[anno_index])

        for image_id in test_images:
            test_dataset['images'].append(id_images[image_id])
            anno_indexs = annos[image_id]
            for anno_index in anno_indexs:
                if annotations[anno_index]['category_id'] == cat_id:
                    test_dataset['annotations'].append(annotations[anno_index])

        for image_id in val_images:
            val_dataset['images'].append(id_images[image_id])
            anno_indexs = annos[image_id]
            for anno_index in anno_indexs:
                if annotations[anno_index]['category_id'] == cat_id:
                    val_dataset['annotations'].append(annotations[anno_index])

    write_json(train_dataset, train_path)
    write_json(test_dataset, test_path)
    write_json(val_dataset, val_path)
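A hedged usage sketch for the per-category split above (file paths are placeholders); note that with these ratios the validation images are drawn from both the train and test portions:

# Split one COCO-format annotation file into train / test / val files.
split_coco("annotations/instances_all.json",
           "annotations/instances_train.json",
           "annotations/instances_test.json",
           "annotations/instances_val.json",
           train_ratio=0.7,
           val_ratio=0.2)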
Example #15
def load_database_definition():
    """Loads the database definition and return definition as JSON dictionary."""
    return load_json("resources/database-structure-definition.json")
Example #16
def load_properties():
    global app_properties
    app_properties = load_json("resources/properties.json")
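load_properties fills the module-level app_properties dict rather than returning a value; a hedged usage sketch (the property key is illustrative):

load_properties()
print(app_properties.get("app_name"))  # look up a property after loading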