Example #1
    def load_data(self):
        self.data_path = "data/annotated/" + self.name + ".json"
        make_path(self.data_path)

        if os.path.exists(self.data_path):
            self.ann_data = load_json(self.data_path)
        else:
            self.ann_data = []
        self.raw_data = load_json(config.processed_path)

        self.total_num = len(self.raw_data)
        self.annotated_num = len(self.ann_data)
        self.position = self.annotated_num  # index of the page currently shown
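The make_path helper is not shown on this page; a minimal sketch, assuming its only job is to make sure the parent directory of the annotation file exists before the file is written:

import os

def make_path(path):
    # Create the parent directory of `path` if it does not exist yet,
    # so the annotation file can later be written there.
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)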
Example #2
def do_niv(X_test, X_train, T_train, Y_train, n_niv_params, dataset_name,
           fold_idx):
    niv_filename = 'niv_' + dataset_name
    fold_name = 'fold' + str(fold_idx + 1)
    niv_vars = load_json(niv_filename)
    survived_vars = niv_vars.get(fold_name) if niv_vars else None

    if survived_vars:
        print('Stored NIV:', survived_vars)
        X_test = X_test[survived_vars]
        X_train = X_train[survived_vars]
    else:
        niv_start_time = time.time()
        print('Start NIV variable selection')

        survived_vars = niv_variable_selection(X_train, Y_train, T_train,
                                               n_niv_params)
        print('NIV:', list(survived_vars))

        X_train = X_train[survived_vars]
        X_test = X_test[survived_vars]

        niv_end_time = time.time()
        print('NIV time:', niv_end_time - niv_start_time)

        if niv_vars:
            niv_vars.update({fold_name: survived_vars.tolist()})
        else:
            niv_vars = {fold_name: survived_vars.tolist()}
        save_json(niv_filename, niv_vars)

    return X_test, X_train
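The cache written by save_json above maps each fold name to the list of column names that survived NIV selection. A small sketch of how a stored entry is reused on a later run, assuming pandas DataFrames; the column names are hypothetical:

import pandas as pd

# Hypothetical contents of the 'niv_<dataset>' JSON cache after fold 1 has run.
niv_vars = {"fold1": ["age", "income"]}

# On the next run the stored columns are applied directly instead of re-running NIV.
X_train = pd.DataFrame({"age": [30, 41], "income": [52000, 61000], "clicks": [3, 7]})
X_train = X_train[niv_vars["fold1"]]  # keep only the NIV-selected columns
print(list(X_train.columns))          # ['age', 'income']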
Example #3
def select_relations(visualgenome_path, house_objects_path, model_path):
    ''' Select relations describing how often attributes belong to objects in the house domain '''
    attribute_frequency = load_json(
        join(visualgenome_path, 'attribute_frequencies.json'))
    groups = classification(attribute_frequency.values(), model_path)
    save_json(groups, join(visualgenome_path, 'attribute_classes.json'))
    if 'others' in groups: del groups['others']
    attribute_knowledge, relations = extract_knowledge(attribute_frequency,
                                                       groups)
    house_objects = {
        v.replace(' ', '_'): k['dbpedia_uri']
        for v, k in load_json(house_objects_path).items()
    }
    save_json(attribute_knowledge,
              join(visualgenome_path, 'attribute_knowledge.json'))
    create_triples(relations, house_objects, visualgenome_path)
Example #4
    def is_already_config(self):
        if not os.path.isfile(self._CONFIG_FILEPATH):
            return False
        config = load_json(self._CONFIG_FILEPATH)
        config_keys = config.keys()
        if "backup_folder" not in config_keys \
                or "bucket" not in config_keys \
                or "time_interval" not in config_keys:
            return False
        if not os.path.isfile(self._control_key_salt_dir):
            return False

        print("Backup program is already configured")
        self._backup_folder = config["backup_folder"]
        self._bucket = config["bucket"]
        self._time_interval = config["time_interval"]
        self._stat_cache = StatCache(self._stat_cache_dir, self._backup_folder)
        self._object_db = ObjectDB(self._object_db_path)

        self._set_salt()
        while True:
            self._set_control_key()
            if self._correct_password_entered():
                break

        return True
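A minimal configuration file that would pass the three key checks above; the file name and values are hypothetical, and the real path is whatever self._CONFIG_FILEPATH points to:

import json

config = {
    "backup_folder": "/home/user/backups",  # hypothetical values
    "bucket": "my-backup-bucket",
    "time_interval": 3600,
}

with open("backup_config.json", "w", encoding="utf-8") as f:
    json.dump(config, f, indent=2)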
Example #5
def eval(path1, path2):
    data1 = load_json(path1)
    data2 = load_json(path2)

    count1 = find_ann_num(data1)
    count2 = find_ann_num(data2)
    count_all = min(count1, count2)

    data1 = data1[:count_all]
    data2 = data2[:count_all]

    count = 0
    f_all = 0

    c_e = 0
    c_g = 0
    c_p = 0
    for gt_data, pred_data in zip(data1, data2):
        assert gt_data["context_tokens"] == pred_data["context_tokens"]
        doc_list = gt_data["context_tokens"].split(" ")

        gt_ann = gt_data["ann"]
        pred_ann = pred_data["ann"]

        gt_tokens = get_token_list(doc_list, gt_ann)
        pred_tokens = get_token_list(doc_list, pred_ann)

        f1_matrix = get_f1_matrix(gt_tokens, pred_tokens)

        if len(pred_tokens) != 0:
            p = np.average(np.max(f1_matrix, axis=0))
            r = np.average(np.max(f1_matrix, axis=1))
            f = 2 * p * r / (p + r) if p > 0 and r > 0 else 0

            f_all += f
            count += 1

        c_e += get_em(gt_tokens, pred_tokens)
        c_g += len(gt_tokens)
        c_p += len(pred_tokens)

    print(c_e / c_g)  # exact-match recall (matches / ground-truth answers)
    print(c_e / c_p)  # exact-match precision (matches / predicted answers)

    print(f_all / count)  # mean answer-level F1
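get_f1_matrix is not shown here; a plausible sketch, assuming it returns a (num_gt x num_pred) matrix of token-overlap F1 scores, which matches the axis convention used above (column maxima give precision, row maxima give recall):

from collections import Counter

import numpy as np


def token_f1(gt, pred):
    # Standard token-overlap F1 between two token lists.
    common = Counter(gt) & Counter(pred)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred)
    recall = num_same / len(gt)
    return 2 * precision * recall / (precision + recall)


def get_f1_matrix(gt_answers, pred_answers):
    # Rows index ground-truth answers, columns index predicted answers.
    return np.array([[token_f1(gt, pred) for pred in pred_answers]
                     for gt in gt_answers])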
Example #6
 def __init__(self, player):
     self.button_events = btn_events(self)
     self.win = main_window(self, self.button_events)
     self.player = player
     # self.player.play()
     self.win.show()
     self.canvasHandler = canvasHandler(self, self.win, self.win.ui.label,
                                        self.player)
     self.canvasHandler.frameSignal.connect(self.win.display_video)
     """
     Initialize the electronic fence and add the areas that have been drawn.
     For the next step, see display_video() in ./ui_controller/view.py.
     """
     self.crossing_detector = crossing_detector("A")
     ok, area = load_json('./area.txt')
     self.crossing_detector.add_area_dict(area)
     ok, area = load_json('./area2.txt')
     self.crossing_detector.add_area_dict(area)
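Example #7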
def preprocess(path, pid=None):
    data = load_json(path)
    list_data = []
    for i in data:
        list_data += i["paragraphs"]
    data = list_data
    data = sample(data, config.num_sample)

    count = 0
    tokenizer = StanfordTokenizer()
    examples = []

    tqdm_text = get_tqdm_text(pid, 3)
    for j in tqdm(data, desc=tqdm_text, position=pid):
        c = j["context"].replace("''", '" ').replace("``", '" ')  #.lower()

        tc = tokenizer.tokenize(c)
        if len(tc) > config.max_len:
            continue

        if isinstance(tokenizer, StanfordTokenizer):
            c_idx = tokenizer.character_level_idx()
        else:
            c_idx = convert_idx(tc)

        y1s, y2s = [], []
        answer_texts = []

        qas_sorted = sorted(j["qas"],
                            key=lambda x: x["answers"][0]["answer_start"])
        for k in qas_sorted:
            # q = k["question"].replace("''", '" ').replace("``", '" ').lower() # we don't use question here
            ans = k["answers"][0]
            a_s = ans["answer_start"]
            a = ans["text"].replace("''", '" ').replace("``", '" ')  #.lower()
            a_e = a_s + len(a)
            answer_span = []

            for idx, span in enumerate(c_idx):
                if not (a_e <= span[0] or a_s >= span[1]):
                    answer_span.append(idx)

            assert len(answer_span) > 0, "Didn't find answer span"
            # y1s.append(answer_span[0])
            # y2s.append(answer_span[-1])
            answer_texts.append((a, answer_span[0], answer_span[-1]))
            count += 1

        examples.append({
            "context_tokens": " ".join(tc),
            "answers": answer_texts
            # "ans_starts": y1s,
            # "ans_ends": y2s
        })

    print(count / len(data))
    return examples
Example #8
def test(args, MODEL_LOC, LABEL_JSON_LOC):
    print_statement('LOAD EMBEDDINGS')
    label_map = load_json(LABEL_JSON_LOC, reverse=True, name='Label Mapping')
    with open('dataset/ind2token', 'rb') as f:
        ind2token = pickle.load(f)
    with open('dataset/token2ind', 'rb') as f:
        token2ind = pickle.load(f)
    with open('dataset/embeddings_vector', 'rb') as f:
        embeddings_vector = pickle.load(f)
    print_value('Embed shape', embeddings_vector.shape)
    print_value('Vocab size', len(ind2token))
    batch_size = args.batch_size
    embedding_size = embeddings_vector.shape[1]
    model = TextCNN(batch_size=batch_size,
                    c_out=args.c_out,
                    output_size=args.num_classes,
                    vocab_size=len(ind2token),
                    embedding_size=embedding_size,
                    embeddings_vector=torch.from_numpy(embeddings_vector),
                    kernel_sizes=args.kernel_sizes,
                    trainable=args.embed_trainable,
                    p=args.p)
    model.to(args.device)
    ckpt = torch.load(MODEL_LOC, map_location=args.device)
    model.load_state_dict(ckpt["state_dict"])
    model.eval()
    print_statement('MODEL TESTING')
    qcdataset = QCDataset(token2ind, ind2token, split='test', batch_first=True)
    dataloader_test = DataLoader(qcdataset,
                                 batch_size=args.batch_size,
                                 shuffle=True,
                                 collate_fn=qcdataset.collate_fn)
    ct = ClassificationTool(len(label_map))
    accs = []
    length = []
    for batch_inputs, batch_targets in dataloader_test:
        batch_inputs = batch_inputs.to(args.device)
        batch_targets = batch_targets.to(args.device)
        with torch.no_grad():
            output = model(batch_inputs)
        acc = torch.sum(output.argmax(dim=1) == batch_targets)
        accs.append(acc)
        length.append(len(batch_targets))
        ct.update(output, batch_targets)
    test_acc = float(np.sum(accs)) / sum(length)
    print('Testing on {} data:'.format(sum(length)))
    print('+ Overall ACC: {:.3f}'.format(test_acc))
    PREC, REC, F1 = ct.get_result()
    for i, classname in enumerate(label_map.values()):
        print('* {} PREC: {:.3f}, {} REC: {:.3f}, {} F1: {:.3f}'.format(
            classname[:3], PREC[i], classname[:3], REC[i], classname[:3],
            F1[i]))
Example #9
def create_dataset(visualgenome_raw_path, visualgenome_parsed_path):
    ''' Create a dataset of objects and their attributes from the VisualGenome dataset '''
    visualgenome_data = load_json(
        join(visualgenome_raw_path, 'attributes.json'))
    attribute_synsets = load_json(
        join(visualgenome_raw_path, 'attribute_synsets.json'))
    frequency_data = {}

    for image in visualgenome_data:
        objects = set()
        for attribute_data in image['attributes']:
            if 'attributes' in attribute_data and len(
                    set(attribute_data['synsets'])) == 1:
                object_name = attribute_data['synsets'][0]
                assigned = assign_attribute(object_name,
                                            attribute_data['attributes'],
                                            attribute_synsets, frequency_data)
                if assigned and object_name not in objects:
                    objects.add(object_name)
                    frequency_data[object_name]['images'] += 1

    logging.info('Size: %s objects selected' % len(frequency_data))
    save_json(frequency_data,
              join(visualgenome_parsed_path, 'attribute_frequencies.json'))
Example #10
def select_relations(frame_parsed_path, house_objects_path):
    ''' Select unique frame relations about house objects '''
    house_objects = {v.replace(' ', '_'):k['dbpedia_uri'] for v,k in load_json(house_objects_path).items()}
    house_object_uris = set(house_objects.values())
    house_object_names = set(house_objects.keys())
    frame_instances = load_json(join(frame_parsed_path, 'frame_instances.json'))
    netlemma = map_netlemma()
    wn31db = map_wn31db()
    triple_uris = []
    triple_labels = []   

    for frame_id in list(frame_instances.keys()):  # copy the keys: invalid frames are deleted below
        valid_frame = False
        frame = frame_instances[frame_id]
        frame_uris, frame_labels = create_triples(frame['type'], frame['elements'], wn31db, netlemma)
        for i in range(len(frame_uris)):
            object_uri = re.match('<(.+)> <(.+)> <(.+)>', frame_uris[i]).group(1)
            if object_uri in house_object_uris and frame_uris[i] not in triple_uris:
                triple_uris.append(frame_uris[i])
                triple_labels.append(frame_labels[i])
                valid_frame = True
            else:
                object_name = re.match('<(.+)> <(.+)> <(.+)>', frame_labels[i]).group(1)
                if object_name in house_object_names and frame_uris[i] not in triple_uris:
                    triple_uris.append(frame_uris[i])
                    triple_labels.append(frame_labels[i])
                    valid_frame = True
        
        if not valid_frame:
            del frame_instances[frame_id]

    calculate_statistics(triple_uris, frame_parsed_path)
    save_file(join(frame_parsed_path, 'selected_triples.nt'), triple_uris)
    save_file(join(frame_parsed_path, 'selected_triples_label.nt'), triple_labels)
    save_file(join(frame_parsed_path, 'selected_verbalized.txt'), [verbalize_frame(f['type'], f['elements'].items(), netlemma) for f in frame_instances.values()])
    logging.info('Total valid relations with URIs: %s' % len(triple_uris))
Example #11
def load_model(expdir, model_type, ckpt_name='bestmodel'):
    """Load a pre-trained model.

    Args:
        expdir (str): directory where model checkpoint is saved.
        model_type (str): either "CNN_classifier" or "MLP_regressor", depending
                  on what type of model we wish to load.
        ckpt_name (str, optional): identifier of model checkpoint we wish to
                  load.
    Returns:
        model (tf.keras.Model): a pre-trained model.
    """
    param_file = os.path.join(expdir, 'params.json')
    model_params = utils.load_json(param_file)
    ckpt_path = os.path.join(expdir, 'ckpts/{}-1'.format(ckpt_name))
    if model_type == 'CNN_classifier':
        return load_cnn_classifier(model_params, ckpt_path)
    else:
        return load_mlp_regressor(model_params, ckpt_path)
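A short usage sketch; the experiment directory is hypothetical and is expected to contain params.json plus the checkpoint files:

model = load_model("experiments/cnn_run0", "CNN_classifier")  # hypothetical expdir
model.summary()  # standard tf.keras.Model inspection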
Example #12
def validate_relations(concepnet_path):
    ''' Validate that the searched objects are the subjects of the relations '''
    validated_relations = []

    for file_name in os.listdir(concepnet_path):
        data = load_json(join(concepnet_path, file_name))
        result = re.match('.+node=(.+)&.+', data['@id'])
        if result:
            object_id = result.group(1)
        else:
            object_id = None

        for element in data['edges']:
            if object_id == element['start']['@id']:
                object1 = element['start']['@id'].split('/')[-1]
                object2 = element['end']['@id'].split('/')[-1]
                relation = element['rel']['@id'].split('/')[-1]
                validated_relations.append((object1, relation, object2))

    return validated_relations
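A minimal sketch of the saved ConceptNet response shape that validate_relations() expects, reconstructed only from the fields accessed above; the concepts themselves are placeholders:

conceptnet_doc = {
    "@id": "/query?node=/c/en/chair&other=/c/en/furniture",
    "edges": [
        {
            "start": {"@id": "/c/en/chair"},
            "rel": {"@id": "/r/IsA"},
            "end": {"@id": "/c/en/furniture"},
        },
    ],
}
# The regex pulls '/c/en/chair' out of '@id', so only edges whose 'start'
# node equals the queried object are kept.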
Example #13
 def _reset(self, data_path, save):
     if not save:
         print("extract arch2vec on DARTS search space ...")
         dataset = load_json(data_path)
         print("length of the dataset: {}".format(len(dataset)))
         self.f_path = os.path.join(self.dir_name, 'arch2vec-darts.pt')
         if os.path.exists(self.f_path):
             print('{} is already saved'.format(self.f_path))
             exit()
         print('save to {}'.format(self.f_path))
         counter = 0
         self.model.eval()
         for k, v in dataset.items():
             adj = torch.Tensor(v[0]).unsqueeze(0).cuda()
             ops = torch.Tensor(one_hot_darts(v[1])).unsqueeze(0).cuda()
             adj, ops, prep_reverse = preprocessing(adj, ops, **cfg['prep'])
             with torch.no_grad():
                 x, _ = self.model._encoder(ops, adj)
                 self.embedding[counter] = {
                     'feature': x.squeeze(0).mean(dim=0).cpu(),
                     'genotype': process(v[2])
                 }
             print("{}/{}".format(counter, len(dataset)))
             counter += 1
         torch.save(self.embedding, self.f_path)
         print("finished arch2vec extraction")
         exit()
     else:
         self.f_path = os.path.join(self.dir_name, 'arch2vec-darts.pt')
         print("load arch2vec from: {}".format(self.f_path))
         self.embedding = torch.load(self.f_path)
         for ind in range(len(self.embedding)):
             self.features.append(self.embedding[ind]['feature'])
             self.genotype.append(self.embedding[ind]['genotype'])
         self.features = torch.stack(self.features, dim=0)
         print('loading finished. pretrained embeddings shape: {}'.format(
             self.features.shape))
Example #14
    def build_lipreadingnet(self,
                            config_path,
                            weights='',
                            extract_feats=False):
        if os.path.exists(config_path):
            args_loaded = load_json(config_path)
            print('Lipreading configuration file loaded.')
            tcn_options = {
                'num_layers': args_loaded['tcn_num_layers'],
                'kernel_size': args_loaded['tcn_kernel_size'],
                'dropout': args_loaded['tcn_dropout'],
                'dwpw': args_loaded['tcn_dwpw'],
                'width_mult': args_loaded['tcn_width_mult']
            }
        else:
            raise FileNotFoundError(
                'Lipreading configuration file not found: {}'.format(config_path))
        net = Lipreading(tcn_options=tcn_options,
                         backbone_type=args_loaded['backbone_type'],
                         relu_type=args_loaded['relu_type'],
                         width_mult=args_loaded['width_mult'],
                         extract_feats=extract_feats)

        if len(weights) > 0:
            print('Loading weights for lipreading stream')
            net.load_state_dict(torch.load(weights))
        return net
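A minimal configuration file holding exactly the keys build_lipreadingnet() reads; the values and file name are placeholders rather than the defaults of any particular lipreading model:

import json

lipreading_cfg = {
    "backbone_type": "resnet",  # placeholder values
    "relu_type": "prelu",
    "width_mult": 1.0,
    "tcn_num_layers": 4,
    "tcn_kernel_size": [3],
    "tcn_dropout": 0.2,
    "tcn_dwpw": False,
    "tcn_width_mult": 1,
}

with open("lipreading_config.json", "w") as f:  # hypothetical path
    json.dump(lipreading_cfg, f, indent=2)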
Example #15
 def _reset(self, data_path, save):
     if not save:
         print("extract arch2vec from {}".format(os.path.join(self.dir_name, self.model_path)))
         if not os.path.exists(os.path.join(self.dir_name, self.model_path)):
             exit()
         dataset = load_json(data_path)
         self.model = Model(input_dim=5, hidden_dim=128, latent_dim=16, num_hops=5, num_mlp_layers=2, dropout=0, **cfg['GAE']).cuda()
         self.model.load_state_dict(torch.load(os.path.join(self.dir_name, self.model_path).format(args.dim))['model_state'])
         self.model.eval()
         with torch.no_grad():
             print("length of the dataset: {}".format(len(dataset)))
             self.f_path = os.path.join(self.dir_name, 'arch2vec-{}'.format(self.model_path))
             if os.path.exists(self.f_path):
                 print('{} is already saved'.format(self.f_path))
                 exit()
             print('save to {}'.format(self.f_path))
             for ind in range(len(dataset)):
                 adj = torch.Tensor(dataset[str(ind)]['module_adjacency']).unsqueeze(0).cuda()
                 ops = torch.Tensor(dataset[str(ind)]['module_operations']).unsqueeze(0).cuda()
                 adj, ops, prep_reverse = preprocessing(adj, ops, **cfg['prep'])
                 test_acc = dataset[str(ind)]['test_accuracy']
                 valid_acc = dataset[str(ind)]['validation_accuracy']
                 time = dataset[str(ind)]['training_time']
                 x,_ = self.model._encoder(ops, adj)
                 self.embedding[ind] = {'feature': x.squeeze(0).mean(dim=0).cpu(), 'valid_accuracy': float(valid_acc), 'test_accuracy': float(test_acc), 'time': float(time)}
             torch.save(self.embedding, self.f_path)
              print("finished arch2vec extraction")
             exit()
     else:
         self.f_path = os.path.join(self.dir_name, self.emb_path)
         print("load arch2vec from: {}".format(self.f_path))
         self.embedding = torch.load(self.f_path)
         for ind in range(len(self.embedding)):
             self.features.append(self.embedding[ind]['feature'])
         self.features = torch.stack(self.features, dim=0)
         print('loading finished. pretrained embeddings shape: {}'.format(self.features.shape))
Example #16
def get_select_people_complex_payload(crm_complex_data, grp1_label_list,
                                      grp2_label_list):
    """
    Generate a two-group "search people" complex JSON payload.
    :param crm_complex_data: dict of CRM label data keyed by label name.
    :param grp1_label_list: labels for group 1; the groupId for these labels should be the same in the yml.
    :param grp2_label_list: labels for group 2, e.g. ['latestRiskScore', 'installDate']
    :return: payload JSON, or None if the input is invalid.
    """
    if not isinstance(crm_complex_data, dict) or len(crm_complex_data) <= 0:
        utils.warn('please pass a valid dict parameter')
        return None
    group1_label_data_list, group2_label_data_list = [], []
    for label in grp1_label_list:
        group1_label_data_list.append(crm_complex_data[label])
    for label in grp2_label_list:
        group2_label_data_list.append(crm_complex_data[label])
    search_complex_payload = SearchLabelGroupReq([
        get_logical_condition(group1_label_data_list),
        get_logical_condition(group2_label_data_list)
    ])
    json_str = utils.dump_obj(search_complex_payload)
    payload_json = utils.load_json(json_str)
    return payload_json
Example #17
def select_relations(conceptnet_raw_path, concepnet_parsed_path,
                     house_objects_path):  #
    ''' Select some relations from Conceptnet JSON files '''
    validated_relations = validate_relations(conceptnet_raw_path)
    wn31db = map_wn31db()
    relations_with_uris = []
    objects_with_uris = {
        v.replace(' ', '_'): k['dbpedia_uri']
        for v, k in load_json(house_objects_path).items()
    }
    triple_labels = []
    triple_uris = []

    for object1, relation, object2 in validated_relations:
        if object2 not in objects_with_uris:
            objects_with_uris[object2] = to_dbpedia(
                get_uri('/c/en/' + object2, 10), wn31db)

    for object1, relation, object2 in validated_relations:
        if objects_with_uris[
                object2]:  # object2 must have a URI (object1 has one by default)
            relation_uri = 'http://ns.inria.fr/deko/ontology/deko.owl#' + relation
            relations_with_uris.append((objects_with_uris[object1], relation,
                                        objects_with_uris[object2]))
            triple_uris.append('<%s> <%s> <%s>' %
                               (objects_with_uris[object1], relation_uri,
                                objects_with_uris[object2]))
            triple_labels.append('<%s> <%s> <%s>' %
                                 (object1, relation, object2))

    calculate_statistics(relations_with_uris, concepnet_parsed_path)
    save_file(join(concepnet_parsed_path, 'selected_triples.nt'), triple_uris)
    save_file(join(concepnet_parsed_path, 'selected_triples_label.nt'),
              triple_labels)
    logging.info('Total valid relations with URIs: %s' %
                 len(relations_with_uris))
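Example #18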
def load_samples(samples_file):
    return load_json(samples_file)
Example #19
def load_config(config_file):
    config = load_json(config_file)
    return config
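The examples on this page rely on several project-specific variants of load_json and save_json (some return a status tuple, some take extra keyword arguments, and the argument order of save_json differs between Example #2 and Example #3). A minimal sketch of the common form, assuming plain wrappers around the standard json module:

import json


def load_json(path):
    # Read and parse a JSON file.
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def save_json(obj, path):
    # Serialize obj to a JSON file.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(obj, f, ensure_ascii=False, indent=2)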
Example #20
    def make_dictionary(self, question_dir, vocab_file, ent_setup,
                        remove_notfound):
        if os.path.exists(vocab_file):
            print("loading vocabularies from " + vocab_file + " ...")
            vocabularies = list(
                map(lambda x: x.strip(),
                    codecs.open(vocab_file, encoding="utf-8").readlines()))
        else:
            print("no " + vocab_file +
                  " found, constructing the vocabulary list ...")

            fnames = glob.glob(question_dir + "/training/*.question")
            dataset_dev = load_json(question_dir + "dev1.0.json")
            dataset_test = load_json(question_dir + "test1.0.json")

            # first TRAINING ****************************************
            vocab_set = set()
            n = 0.
            for fname in fnames:

                fp = open(fname)
                fp.readline()
                fp.readline()
                document = fp.readline().split()
                fp.readline()
                query = fp.readline().split()
                fp.close()

                vocab_set |= set(document) | set(query)

                # show progress
                n += 1
                if n % 10000 == 0:
                    print('%3d%%' % int(100 * n / len(fnames)))

            # DEV + TEST *******************************************
            assert ent_setup == "ent-anonym" or ent_setup == "ent"
            for datum in dataset_test[DATA_KEY] + dataset_dev[DATA_KEY]:
                document = to_entities(datum[DOC_KEY][TITLE_KEY] + "\n" +
                                       datum[DOC_KEY][CONTEXT_KEY])
                document = document.lower()

                assert document
                for qa in datum[DOC_KEY][QAS_KEY]:
                    doc_raw = document.split()
                    question = to_entities(qa[QUERY_KEY]).lower()
                    assert question
                    qry_raw = question.split()
                    ans_raw = ""
                    for ans in qa[ANS_KEY]:
                        if ans[ORIG_KEY] == "dataset":
                            ans_raw = ("@entity" +
                                       "_".join(ans[TXT_KEY].split())).lower()
                    assert ans_raw
                    if remove_notfound:
                        if ans_raw not in doc_raw:
                            found_umls = False
                            for ans in qa[ANS_KEY]:
                                if ans[ORIG_KEY] == "UMLS":
                                    umls_answer = ("@entity" + "_".join(
                                        ans[TXT_KEY].split())).lower()
                                    if umls_answer in doc_raw:
                                        found_umls = True
                                        ans_raw = umls_answer
                            if not found_umls:
                                continue
                    if ent_setup == "ent-anonym":
                        entity_dict = {}
                        entity_id = 0
                        lst = doc_raw + qry_raw
                        lst.append(ans_raw)
                        for word in lst:
                            if (word.startswith('@entity')) and (
                                    word not in entity_dict):
                                entity_dict[word] = '@entity' + str(entity_id)
                                entity_id += 1
                        qry_raw = [
                            entity_dict[w] if w in entity_dict else w
                            for w in qry_raw
                        ]
                        doc_raw = [
                            entity_dict[w] if w in entity_dict else w
                            for w in doc_raw
                        ]
                        ans_raw = entity_dict[ans_raw]
                    vocab_set |= set(qry_raw)
                    vocab_set |= set(doc_raw)
                    vocab_set.add(ans_raw)
                    # show progress
                    n += 1
                    if n % 10000 == 0:
                        print(n)

            entities = set(e for e in vocab_set if e.startswith('@entity'))
            # @placeholder, @begin and @end are included in the vocabulary list
            tokens = vocab_set.difference(entities)
            tokens.add(SYMB_BEGIN)
            tokens.add(SYMB_END)

            vocabularies = list(entities) + list(tokens)

            print("writing vocabularies to " + vocab_file + " ...")
            vocab_fp = codecs.open(vocab_file, "w", encoding="utf-8")
            vocab_fp.write('\n'.join(vocabularies))
            vocab_fp.close()

        vocab_size = len(vocabularies)
        word_dictionary = dict(zip(vocabularies, range(vocab_size)))
        char_set = set([c for w in vocabularies for c in list(w)])
        char_set.add(' ')
        char_dictionary = dict(zip(list(char_set), range(len(char_set))))
        num_entities = len(
            [v for v in vocabularies if v.startswith('@entity')])
        print("vocab_size = %d" % vocab_size)
        print("num characters = %d" % len(char_set))
        print("%d anonymized entities" % num_entities)
        print("%d other tokens (including @placeholder, %s and %s)" %
              (vocab_size - num_entities, SYMB_BEGIN, SYMB_END))

        return word_dictionary, char_dictionary, num_entities
Example #21
    def parse_file(self, file_path, dictionary, use_chars, ent_setup,
                   remove_notfound):
        """
        Parse a *.json dataset file into a list of questions, where each element is the tuple
        (doc_words, qry_words, ans, cand, doc_chars, qry_chars, cloze, qry_id).
        """
        questions = []
        w_dict, c_dict = dictionary[0], dictionary[1]
        relabeling_dicts = {}
        raw = load_json(file_path)
        for datum in raw[DATA_KEY]:
            document = to_entities(datum[DOC_KEY][TITLE_KEY] + "\n" +
                                   datum[DOC_KEY][CONTEXT_KEY])
            document = document.lower()

            assert document
            for qa in datum[DOC_KEY][QAS_KEY]:
                if ent_setup in ["ent-anonym", "ent"]:
                    doc_raw = document.split()
                    question = to_entities(qa[QUERY_KEY]).lower()
                    qry_id = qa[ID_KEY]
                    assert question
                    ans_raw = ""
                    for ans in qa[ANS_KEY]:
                        if ans[ORIG_KEY] == "dataset":
                            ans_raw = ("@entity" +
                                       "_".join(ans[TXT_KEY].split())).lower()
                    assert ans_raw
                    if remove_notfound:
                        if ans_raw not in doc_raw:
                            found_umls = False
                            for ans in qa[ANS_KEY]:
                                if ans[ORIG_KEY] == "UMLS":
                                    umls_answer = ("@entity" + "_".join(
                                        ans[TXT_KEY].split())).lower()
                                    if umls_answer in doc_raw:
                                        found_umls = True
                                        ans_raw = umls_answer
                            if not found_umls:
                                continue
                    qry_raw = question.split()
                    if ent_setup == "ent-anonym":
                        entity_dict = {}
                        entity_id = 0
                        lst = doc_raw + qry_raw
                        lst.append(ans_raw)
                        for word in lst:
                            if (word.startswith('@entity')) and (
                                    word not in entity_dict):
                                entity_dict[word] = '@entity' + str(entity_id)
                                entity_id += 1
                        qry_raw = [
                            entity_dict[w] if w in entity_dict else w
                            for w in qry_raw
                        ]
                        doc_raw = [
                            entity_dict[w] if w in entity_dict else w
                            for w in doc_raw
                        ]
                        ans_raw = entity_dict[ans_raw]
                        inv_entity_dict = {
                            ent_id: ent_ans
                            for ent_ans, ent_id in entity_dict.items()
                        }
                        assert len(entity_dict) == len(inv_entity_dict)
                        relabeling_dicts[qa[ID_KEY]] = inv_entity_dict

                    cand_e = [w for w in doc_raw if w.startswith('@entity')]
                    cand_raw = [[e] for e in cand_e]
                    # wrap the query with special symbols
                    qry_raw.insert(0, SYMB_BEGIN)
                    qry_raw.append(SYMB_END)
                    try:
                        cloze = qry_raw.index('@placeholder')
                    except ValueError:
                        print('@placeholder not found in ', qry_raw,
                              '. Fixing...')
                        at = qry_raw.index('@')
                        qry_raw = qry_raw[:at] + [''.join(qry_raw[at:at + 2])
                                                  ] + qry_raw[at + 2:]
                        cloze = qry_raw.index('@placeholder')

                    # tokens/entities --> indexes
                    doc_words = list(map(lambda w: w_dict[w], doc_raw))

                    # tokens/entities --> indexes
                    qry_words = list(map(lambda w: w_dict[w], qry_raw))
                    if use_chars:
                        qry_chars = list(
                            map(
                                lambda w: list(
                                    map(lambda c: c_dict.get(c, c_dict[' ']),
                                        list(w)[:MAX_WORD_LEN])), qry_raw))
                    else:
                        qry_chars = []
                    ans = list(map(lambda w: w_dict.get(w, 0),
                                   ans_raw.split()))
                    cand = [
                        list(map(lambda w: w_dict.get(w, 0), c))
                        for c in cand_raw
                    ]

                    if use_chars:
                        doc_chars = list(
                            map(
                                lambda w: list(
                                    map(lambda c: c_dict.get(c, c_dict[' ']),
                                        list(w)[:MAX_WORD_LEN])), doc_raw))
                    else:
                        doc_chars = []

                    questions.append((doc_words, qry_words, ans, cand,
                                      doc_chars, qry_chars, cloze, qry_id))

                elif ent_setup == "no-ent":
                    # collect candidate ents using @entity marks
                    cand_e = [
                        w for w in to_entities(
                            datum[DOC_KEY][TITLE_KEY] + "\n" +
                            datum[DOC_KEY][CONTEXT_KEY]).lower().split()
                        if w.startswith('@entity')
                    ]
                    cand_raw = [e[len("@entity"):].split("_") for e in cand_e]
                    document = remove_entity_marks(datum[DOC_KEY][TITLE_KEY] +
                                                   "\n" +
                                                   datum[DOC_KEY][CONTEXT_KEY])
                    document = document.lower()
                    doc_raw = document.split()
                    question = remove_entity_marks(qa[QUERY_KEY]).lower()
                    qry_id = qa[ID_KEY]
                    assert question
                    qry_raw = question.split()
                    ans_raw = ""
                    for ans in qa[ANS_KEY]:
                        if ans[ORIG_KEY] == "dataset":
                            ans_raw = ans[TXT_KEY].lower()
                    assert ans_raw
                    if remove_notfound:
                        if ans_raw not in doc_raw:
                            found_umls = False
                            for ans in qa[ANS_KEY]:
                                if ans[ORIG_KEY] == "UMLS":
                                    umls_answer = ans[TXT_KEY].lower()
                                    if umls_answer in doc_raw:
                                        found_umls = True
                                        ans_raw = umls_answer
                            if not found_umls:
                                continue

                    relabeling_dicts[qa[ID_KEY]] = None
                    # wrap the query with special symbols
                    qry_raw.insert(0, SYMB_BEGIN)
                    qry_raw.append(SYMB_END)
                    try:
                        cloze = qry_raw.index('@placeholder')
                    except ValueError:
                        print('@placeholder not found in ', qry_raw,
                              '. Fixing...')
                        at = qry_raw.index('@')
                        qry_raw = qry_raw[:at] + [''.join(qry_raw[at:at + 2])
                                                  ] + qry_raw[at + 2:]
                        cloze = qry_raw.index('@placeholder')

                    # tokens/entities --> indexes
                    doc_words = list(map(lambda w: w_dict[w], doc_raw))

                    # tokens/entities --> indexes
                    qry_words = list(map(lambda w: w_dict[w], qry_raw))
                    if use_chars:
                        qry_chars = list(
                            map(
                                lambda w: list(
                                    map(lambda c: c_dict.get(c, c_dict[' ']),
                                        list(w)[:MAX_WORD_LEN])), qry_raw))
                    else:
                        qry_chars = []
                    ans = list(map(lambda w: w_dict.get(w, 0),
                                   ans_raw.split()))
                    cand = [
                        list(map(lambda w: w_dict.get(w, 0), c))
                        for c in cand_raw
                    ]

                    if use_chars:
                        doc_chars = list(
                            map(
                                lambda w: list(
                                    map(lambda c: c_dict.get(c, c_dict[' ']),
                                        list(w)[:MAX_WORD_LEN])), doc_raw))
                    else:
                        doc_chars = []

                    questions.append((doc_words, qry_words, ans, cand,
                                      doc_chars, qry_chars, cloze, qry_id))

                else:
                    raise ValueError

        return questions, relabeling_dicts
Example #22
    def make_dictionary(self, question_dir, vocab_file, ent_setup,
                        remove_notfound):
        vocab_file = "{}_stp{}_remove{}_py3".format(vocab_file, ent_setup,
                                                    remove_notfound)
        if os.path.exists(vocab_file):
            print("loading vocabularies from " + vocab_file + " ...")
            vocabularies = list(
                map(lambda x: x.strip(),
                    codecs.open(vocab_file, encoding="utf-8").readlines()))
        else:
            print("no " + vocab_file +
                  " found, constructing the vocabulary list ...")
            vocab_set = set()
            n = 0.
            dataset_train = load_json(question_dir + "train1.0.json")
            dataset_dev = load_json(question_dir + "dev1.0.json")
            dataset_test = load_json(question_dir + "test1.0.json")

            if ent_setup in ["ent-anonym",
                             "ent"]:  # treats each entity as a single token
                # train here (remove_notfound=True|False), dev/test below
                for datum in dataset_train[DATA_KEY]:
                    document = to_entities(datum[DOC_KEY][TITLE_KEY] + "\n" +
                                           datum[DOC_KEY][CONTEXT_KEY])
                    document = document.lower()

                    assert document
                    for qa in datum[DOC_KEY][QAS_KEY]:
                        doc_raw = document.split()
                        question = to_entities(qa[QUERY_KEY]).lower()
                        assert question
                        qry_raw = question.split()
                        ans_raw = ""
                        for ans in qa[ANS_KEY]:
                            if ans[ORIG_KEY] == "dataset":
                                ans_raw = (
                                    "@entity" +
                                    "_".join(ans[TXT_KEY].split())).lower()
                        assert ans_raw
                        if remove_notfound:
                            if ans_raw not in doc_raw:
                                found_umls = False
                                for ans in qa[ANS_KEY]:
                                    if ans[ORIG_KEY] == "UMLS":
                                        umls_answer = ("@entity" + "_".join(
                                            ans[TXT_KEY].split())).lower()
                                        if umls_answer in doc_raw:
                                            found_umls = True
                                            ans_raw = umls_answer
                                if not found_umls:
                                    continue
                        if ent_setup == "ent-anonym":  # anonymize
                            entity_dict = {}
                            entity_id = 0
                            lst = doc_raw + qry_raw
                            lst.append(ans_raw)
                            for word in lst:
                                if (word.startswith('@entity')) and (
                                        word not in entity_dict):
                                    entity_dict[word] = '@entity' + str(
                                        entity_id)
                                    entity_id += 1
                            qry_raw = [
                                entity_dict[w] if w in entity_dict else w
                                for w in qry_raw
                            ]
                            doc_raw = [
                                entity_dict[w] if w in entity_dict else w
                                for w in doc_raw
                            ]
                            ans_raw = entity_dict[ans_raw]
                        vocab_set |= set(qry_raw)
                        vocab_set |= set(doc_raw)
                        vocab_set.add(ans_raw)
                        # show progress
                        n += 1
                        if n % 10000 == 0:
                            print(n)
                # treat dev/test separately to allow remove_notfound=False
                for datum in dataset_test[DATA_KEY] + dataset_dev[DATA_KEY]:
                    document = to_entities(datum[DOC_KEY][TITLE_KEY] + "\n" +
                                           datum[DOC_KEY][CONTEXT_KEY])
                    document = document.lower()

                    assert document
                    for qa in datum[DOC_KEY][QAS_KEY]:
                        doc_raw = document.split()
                        question = to_entities(qa[QUERY_KEY]).lower()
                        assert question
                        qry_raw = question.split()
                        ans_raw = ""
                        for ans in qa[ANS_KEY]:
                            if ans[ORIG_KEY] == "dataset":
                                ans_raw = (
                                    "@entity" +
                                    "_".join(ans[TXT_KEY].split())).lower()
                        assert ans_raw
                        if ent_setup == "ent-anonym":
                            entity_dict = {}
                            entity_id = 0
                            lst = doc_raw + qry_raw
                            lst.append(ans_raw)
                            for word in lst:
                                if (word.startswith('@entity')) and (
                                        word not in entity_dict):
                                    entity_dict[word] = '@entity' + str(
                                        entity_id)
                                    entity_id += 1
                            qry_raw = [
                                entity_dict[w] if w in entity_dict else w
                                for w in qry_raw
                            ]
                            doc_raw = [
                                entity_dict[w] if w in entity_dict else w
                                for w in doc_raw
                            ]
                            ans_raw = entity_dict[ans_raw]
                        vocab_set |= set(qry_raw)
                        vocab_set |= set(doc_raw)
                        vocab_set.add(ans_raw)
                        # show progress
                        n += 1
                        if n % 10000 == 0:
                            print(n)

                entities = set(e for e in vocab_set if e.startswith('@entity'))
                # @placeholder, @begin and @end are included in the vocabulary list
                tokens = vocab_set.difference(entities)
                tokens.add(SYMB_BEGIN)
                tokens.add(SYMB_END)

                vocabularies = list(entities) + list(tokens)

            elif ent_setup == "no-ent":  # ignore entity markings
                # train here (remove_notfound=True|False), dev/test below
                for datum in dataset_train[DATA_KEY]:
                    document = remove_entity_marks(datum[DOC_KEY][TITLE_KEY] +
                                                   "\n" +
                                                   datum[DOC_KEY][CONTEXT_KEY])
                    document = document.lower()
                    assert document
                    doc_raw = document.split()
                    for qa in datum[DOC_KEY][QAS_KEY]:
                        question = remove_entity_marks(qa[QUERY_KEY]).lower()
                        assert question
                        qry_raw = question.split()
                        ans_raw = ""
                        for ans in qa[ANS_KEY]:
                            if ans[ORIG_KEY] == "dataset":
                                ans_raw = ans[TXT_KEY].lower()
                        assert ans_raw
                        if remove_notfound:
                            if ans_raw not in doc_raw:
                                found_umls = False
                                for ans in qa[ANS_KEY]:
                                    if ans[ORIG_KEY] == "UMLS":
                                        umls_answer = ans[TXT_KEY].lower()
                                        if umls_answer in document:
                                            found_umls = True
                                            ans_raw = umls_answer
                                if not found_umls:
                                    continue
                        vocab_set |= set(qry_raw)
                        vocab_set |= set(doc_raw)
                        vocab_set.add(ans_raw)
                        # show progress
                        n += 1
                        if n % 10000 == 0:
                            print(n)
                # treat dev/test separately to allow remove_notfound=False
                for datum in dataset_test[DATA_KEY] + dataset_dev[DATA_KEY]:
                    document = remove_entity_marks(datum[DOC_KEY][TITLE_KEY] +
                                                   "\n" +
                                                   datum[DOC_KEY][CONTEXT_KEY])
                    document = document.lower()
                    assert document
                    doc_raw = document.split()
                    for qa in datum[DOC_KEY][QAS_KEY]:
                        question = remove_entity_marks(qa[QUERY_KEY]).lower()
                        assert question
                        qry_raw = question.split()
                        ans_raw = ""
                        for ans in qa[ANS_KEY]:
                            if ans[ORIG_KEY] == "dataset":
                                ans_raw = ans[TXT_KEY].lower()
                        assert ans_raw
                        vocab_set |= set(qry_raw)
                        vocab_set |= set(doc_raw)
                        vocab_set.add(ans_raw)
                        # show progress
                        n += 1
                        if n % 10000 == 0:
                            print(n)

                entities = set(e for e in vocab_set if e.startswith('@entity'))
                # @placeholder, @begin and @end are included in the vocabulary list
                tokens = vocab_set.difference(entities)
                tokens.add(SYMB_BEGIN)
                tokens.add(SYMB_END)

                vocabularies = list(entities) + list(tokens)

            else:
                raise ValueError

            print("writing vocabularies to " + vocab_file + " ...")
            vocab_fp = codecs.open(vocab_file, "w", encoding="utf-8")
            vocab_fp.write('\n'.join(vocabularies))
            vocab_fp.close()

        vocab_size = len(vocabularies)
        word_dictionary = dict(zip(vocabularies, range(vocab_size)))
        char_set = set([c for w in vocabularies for c in list(w)])
        char_set.add(' ')
        char_dictionary = dict(zip(list(char_set), range(len(char_set))))
        num_entities = len(
            [v for v in vocabularies if v.startswith('@entity')])
        print("vocab_size = %d" % vocab_size)
        print("num characters = %d" % len(char_set))
        print("%d anonymized entities" % num_entities)
        print("%d other tokens (including @placeholder, %s and %s)" %
              (vocab_size - num_entities, SYMB_BEGIN, SYMB_END))

        return word_dictionary, char_dictionary, num_entities
Example #23
    parser.add_argument('--data', type=str, default='data.json',
                        help='Data file (default: data.json)')
    parser.add_argument('--name', type=str, default='darts')
    parser.add_argument('--cfg', type=int, default=4,
                        help='configuration (default: 4)')
    parser.add_argument('--bs', type=int, default=32,
                        help='batch size (default: 32)')
    parser.add_argument('--epochs', type=int, default=10,
                        help='training epochs (default: 10)')
    parser.add_argument('--dropout', type=float, default=0.3,
                        help='decoder implicit regularization (default: 0.3)')
    parser.add_argument('--normalize', action='store_true', default=True,
                        help='use input normalization')
    parser.add_argument('--input_dim', type=int, default=11)
    parser.add_argument('--hidden_dim', type=int, default=128)
    parser.add_argument('--dim', type=int, default=16,
                        help='feature dimension (default: 16)')
    parser.add_argument('--hops', type=int, default=5)
    parser.add_argument('--mlps', type=int, default=2)
    parser.add_argument('--latent_points', type=int, default=10000,
                        help='latent points for validity check (default: 10000)')
    args = parser.parse_args()
    cfg = configs[args.cfg]
    dataset = load_json(args.data)
    print('using {}'.format(args.data))
    print('feat dim {}'.format(args.dim))
    pretraining_gae(dataset, cfg)
Example #24
def smooth_exp(data_path, emb_path, supervised_emb_path, output_path,
               data_type, random_path, path_step, straight_path):
    print('experiments:')
    ## load raw architecture
    dataset = load_json(data_path)
    ## load feature & test_acc
    feature, test_acc = read_feature(emb_path)
    feature_sup = np.squeeze(np.load(supervised_emb_path))
    feature_nums = len(dataset)
    ## get start points
    start_idx = np.random.choice(feature_nums, random_path,
                                 replace=False).tolist()
    if straight_path > 0:
        straight_idx = get_straight(dataset, num=straight_path)
        start_idx = np.stack(start_idx + straight_idx)
    ## smooth experiments
    ops = []
    adj = []
    ops_sup = []
    adj_sup = []
    for k, ind in enumerate(start_idx):
        ops_k = []
        adj_k = []
        prev_node = feature[ind].reshape(1, -1)
        mask = np.zeros(feature_nums, dtype=int)
        ## supervised
        ops_k_sup = []
        adj_k_sup = []
        prev_node_sup = feature_sup[ind].reshape(1, -1)
        mask_sup = np.zeros(feature_nums, dtype=int)
        for i in tqdm(range(path_step),
                      desc='smooth experiment {} of {}'.format(
                          k + 1, len(start_idx))):
            dis = linalg.norm(feature - prev_node, axis=1)
            mdis = ma.masked_array(dis, mask)
            idx = np.argmin(mdis)
            mask[idx] = 1
            prev_node = feature[idx].reshape(1, -1)
            ops_k.append(
                torch.LongTensor(dataset[str(idx)]['module_operations']))
            adj_k.append(
                torch.LongTensor(dataset[str(idx)]['module_adjacency']))
            ## supervised
            dis_sup = linalg.norm(feature_sup - prev_node_sup, axis=1)
            mdis_sup = ma.masked_array(dis_sup, mask_sup)
            idx_sup = np.argmin(mdis_sup)
            mask_sup[idx_sup] = 1
            prev_node_sup = feature_sup[idx_sup].reshape(1, -1)
            ops_k_sup.append(
                torch.LongTensor(dataset[str(idx_sup)]['module_operations']))
            adj_k_sup.append(
                torch.LongTensor(dataset[str(idx_sup)]['module_adjacency']))
        ops_k = torch.stack(ops_k)
        adj_k = torch.stack(adj_k)
        ops.append(ops_k)
        adj.append(adj_k)
        ops_k_sup = torch.stack(ops_k_sup)
        adj_k_sup = torch.stack(adj_k_sup)
        ops_sup.append(ops_k_sup)
        adj_sup.append(adj_k_sup)

    ## convert to graph
    for i in tqdm(range(len(start_idx)), desc='draw graphs'):
        G = adj2graph(ops[i], adj[i])
        names = []
        temp_path = '.temp'
        G_sup = adj2graph(ops_sup[i], adj_sup[i])
        names_sup = []
        temp_path_sup = '.temp_sup'
        if not os.path.exists(temp_path):
            os.makedirs(temp_path)
        if not os.path.exists(temp_path_sup):
            os.makedirs(temp_path_sup)
        for j in range(path_step):
            namej = plot_DAG(G[j], temp_path, str(j), data_type, backbone=True)
            names.append(namej)
            namej_sup = plot_DAG(G_sup[j],
                                 temp_path_sup,
                                 str(j),
                                 data_type,
                                 backbone=True)
            names_sup.append(namej_sup)
        ## pave to single image
        if not os.path.exists(os.path.join(output_path, 'unsupervised')):
            os.makedirs(os.path.join(output_path, 'unsupervised'))
        images = [[Image.open(name) for name in names]]
        join_images(*images, bg_color='white', alignment=(0, 0)).save(
            os.path.join(output_path, 'unsupervised',
                         '{}_unsupervised.png'.format(start_idx[i])))

        if not os.path.exists(os.path.join(output_path, 'supervised')):
            os.makedirs(os.path.join(output_path, 'supervised'))
        images = [[Image.open(name) for name in names_sup]]
        join_images(*images, bg_color='white', alignment=(0, 0)).save(
            os.path.join(output_path, 'supervised',
                         '{}_supervised.png'.format(start_idx[i])))

        if not os.path.exists(os.path.join(output_path, 'compare')):
            os.makedirs(os.path.join(output_path, 'compare'))
        images = [[Image.open(name) for name in names],
                  [Image.open(name) for name in names_sup]]
        join_images(*images, bg_color='white', alignment=(0, 0)).save(
            os.path.join(output_path, 'compare',
                         '{}_compare.png'.format(start_idx[i])))
Example #25
def smooth_exp_nas201(data_path, emb_path, supervised_emb_path, output_path,
                      random_path, path_step):
    print('experiments (NAS 201):')
    ## load raw architecture
    dataset = load_json(data_path)
    ## load feature & test_acc
    feature_raw = torch.load(emb_path)
    feature = []
    for i in tqdm(range(len(feature_raw)), desc='load feature'):
        feature.append(feature_raw[i]['feature'].detach().numpy())
    feature = np.stack(feature)
    feature_sup = np.load(supervised_emb_path)
    feature_nums = len(dataset)
    ## get start points
    start_idx = np.random.choice(feature_nums, random_path,
                                 replace=False).tolist()
    ## smooth experiments
    ops = []
    ops_sup = []

    for k, ind in enumerate(start_idx):
        ops_k = []
        prev_node = feature[ind].reshape(1, -1)
        mask = np.zeros(feature_nums, dtype=int)
        ## supervised
        ops_k_sup = []
        prev_node_sup = feature_sup[ind].reshape(1, -1)
        mask_sup = np.zeros(feature_nums, dtype=int)
        for i in tqdm(range(path_step),
                      desc='smooth experiment {} of {}'.format(
                          k + 1, len(start_idx))):
            dis = linalg.norm(feature - prev_node, axis=1)
            mdis = ma.masked_array(dis, mask)
            idx = np.argmin(mdis)
            mask[idx] = 1
            prev_node = feature[idx].reshape(1, -1)
            ops_k.append(
                np.argmax(np.array(dataset[str(idx)]['module_operations']),
                          axis=1))
            ## supervised
            dis_sup = linalg.norm(feature_sup - prev_node_sup, axis=1)
            mdis_sup = ma.masked_array(dis_sup, mask_sup)
            idx_sup = np.argmin(mdis_sup)
            mask_sup[idx_sup] = 1
            prev_node_sup = feature_sup[idx_sup].reshape(1, -1)
            ops_k_sup.append(
                np.argmax(np.array(dataset[str(idx_sup)]['module_operations']),
                          axis=1))
        ops_k = np.stack(ops_k)
        ops.append(ops_k)
        ops_k_sup = np.stack(ops_k_sup)
        ops_sup.append(ops_k_sup)

    ## convert to graph
    num2ops = {
        0: 'in',
        1: '1x1',
        2: '3x3',
        3: 'pool',
        4: 'skip',
        5: 'none',
        6: 'out'
    }
    x = [130, 300, 280, 40, 150, 320]
    y = [550, 500, 350, 400, 250, 200]
    img = mpimg.imread(
        os.path.join(os.path.dirname(os.path.abspath(__file__)), 'nas201.jpg'))
    for i in tqdm(range(len(start_idx)), desc='draw graphs'):
        names = []
        temp_path = '.temp'
        names_sup = []
        temp_path_sup = '.temp_sup'
        if not os.path.exists(temp_path):
            os.makedirs(temp_path)
        if not os.path.exists(temp_path_sup):
            os.makedirs(temp_path_sup)
        ops0_prev = []
        ops1_prev = []
        for j in range(path_step):
            namej = os.path.join(temp_path, str(j) + '.jpg')
            names.append(namej)
            ops0 = [num2ops[o] for o in ops[i][j]]  # 'o' avoids shadowing the coordinate list x
            fig, ax = plt.subplots()
            ax.imshow(img)
            ax.set_xticks([])
            ax.set_yticks([])
            ax.spines['top'].set_visible(False)
            ax.spines['bottom'].set_visible(False)
            ax.spines['left'].set_visible(False)
            ax.spines['right'].set_visible(False)
            for k in range(6):
                if len(ops0_prev) == 0 or ops0[k + 1] == ops0_prev[k + 1]:
                    plt.text(x[k],
                             y[k],
                             ops0[k + 1],
                             fontsize=18,
                             color='blue')
                else:
                    plt.text(x[k], y[k], ops0[k + 1], fontsize=18, color='red')
            plt.savefig(namej, bbox_inches='tight')
            plt.close()
            ops0_prev = ops0
            namej_sup = os.path.join(temp_path_sup, str(j) + '.jpg')
            names_sup.append(namej_sup)
            ops1 = [num2ops[o] for o in ops_sup[i][j]]
            fig, ax = plt.subplots()
            ax.imshow(img)
            ax.set_xticks([])
            ax.set_yticks([])
            ax.spines['top'].set_visible(False)
            ax.spines['bottom'].set_visible(False)
            ax.spines['left'].set_visible(False)
            ax.spines['right'].set_visible(False)
            for k in range(6):
                if len(ops1_prev) == 0 or ops1[k + 1] == ops1_prev[k + 1]:
                    plt.text(x[k],
                             y[k],
                             ops1[k + 1],
                             fontsize=18,
                             color='blue')
                else:
                    plt.text(x[k], y[k], ops1[k + 1], fontsize=18, color='red')
            plt.savefig(namej_sup, bbox_inches='tight')
            plt.close()
            ops1_prev = ops1
        ## stitch the per-step plots into a single image
        if not os.path.exists(os.path.join(output_path, 'unsupervised')):
            os.makedirs(os.path.join(output_path, 'unsupervised'))
        images = [[Image.open(name) for name in names]]
        join_images(*images, bg_color='white', alignment=(0, 0)).save(
            os.path.join(output_path, 'unsupervised',
                         '{}_unsupervised.png'.format(start_idx[i])))

        if not os.path.exists(os.path.join(output_path, 'supervised')):
            os.makedirs(os.path.join(output_path, 'supervised'))
        images = [[Image.open(name) for name in names_sup]]
        join_images(*images, bg_color='white', alignment=(0, 0)).save(
            os.path.join(output_path, 'supervised',
                         '{}_supervised.png'.format(start_idx[i])))

        if not os.path.exists(os.path.join(output_path, 'compare')):
            os.makedirs(os.path.join(output_path, 'compare'))
        images = [[Image.open(name) for name in names],
                  [Image.open(name) for name in names_sup]]
        join_images(*images, bg_color='white', alignment=(0, 0)).save(
            os.path.join(output_path, 'compare',
                         '{}_compare.png'.format(start_idx[i])))
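For orientation, a call to smooth_exp_nas201 might look like the sketch below; the file names are placeholders that only mirror the parameter names (the raw NAS-Bench-201 JSON, the pretrained arch2vec embeddings as a .pt file, and a .npy array of supervised embeddings).

# Hypothetical invocation; all paths and values are placeholders.
smooth_exp_nas201(data_path='data/nasbench201.json',
                  emb_path='pretrained/nasbench201-arch2vec.pt',
                  supervised_emb_path='pretrained/nasbench201_supervised.npy',
                  output_path='output/smoothness',
                  random_path=5,   # number of random starting architectures
                  path_step=8)     # greedy nearest-neighbour steps per path

Each starting architecture produces one row of path_step cell diagrams, with an operation drawn in red whenever it differs from the previous step.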
Ejemplo n.º 26
0
def create_dataset(conceptnet_raw_path, house_objects_path, relations_path):
    ''' Create a dataset of objects and their relations from Conceptnet '''
    objects = load_json(house_objects_path).keys()
    objects = [x.replace(' ', '_') for x in objects]
    relations = [line.rstrip() for line in open(relations_path)]
    collect_relations(objects, relations, conceptnet_raw_path, 7)
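A hedged usage sketch follows; the paths are placeholders, and the only format assumption is the one visible in the code itself: relations_path is a plain-text file with one ConceptNet relation name per line.

# Hypothetical invocation; the paths are placeholders.
# relations.txt lists one ConceptNet relation per line, e.g. AtLocation, UsedFor, PartOf.
create_dataset(conceptnet_raw_path='data/conceptnet_raw/',
               house_objects_path='data/house_objects.json',
               relations_path='data/relations.txt')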
Ejemplo n.º 27
0
def plot_data(dataset_names, only_table=False):
    for dataset_name in dataset_names:
        print('*** Dataset name:', dataset_name)
        qini_dict = load_json(dataset_name + '_qini')
        var_sel_dict = load_json(dataset_name + '_val_sel')
        plot_all(dataset_name, qini_dict, var_sel_dict, only_table)
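Usage is straightforward once the '<dataset>_qini' and '<dataset>_val_sel' JSON files exist; a sketch with placeholder dataset names:

# Hypothetical invocation; the dataset names are placeholders for whatever
# '<name>_qini' and '<name>_val_sel' result files are on disk.
plot_data(['hillstrom', 'criteo'], only_table=False)
plot_data(['hillstrom'], only_table=True)  # presumably emits only the summary table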
Ejemplo n.º 28
0
                        help='silent: 0, progress bar: 1, detailed: 2')
    args = parser.parse_args()

    # Create log object.
    if args.mode == 'train':
        sys.stdout = Logger(TRAIN_LOG_LOC)
    else:
        sys.stdout = Logger(TEST_LOG_LOC)

    print_statement('HYPERPARAMETER SETTING', verbose=args.verbose)
    print_flags(args, verbose=args.verbose)

    # Load data.
    print_statement('DATA PROCESSING', verbose=args.verbose)
    label_map = load_json(LABEL_JSON_LOC,
                          reverse=True,
                          name='Label Mapping',
                          verbose=args.verbose)
    train_data = load_json(TRAIN_JSON_LOC,
                           label_map,
                           name='Training Set',
                           verbose=args.verbose)
    val_data = load_json(VAL_JSON_LOC,
                         label_map,
                         name='Validation Set',
                         verbose=args.verbose)
    test_data = load_json(TEST_JSON_LOC,
                          label_map,
                          name='Test Set',
                          verbose=args.verbose)

    # Train model.
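The load_json used in this snippet is clearly a project-specific wrapper rather than a thin json.load: it takes a label map, a reverse flag, a display name and a verbosity switch. Purely as an assumption to make those calls readable, a wrapper with that shape might look like this:

import json

def load_json(path, label_map=None, reverse=False, name=None, verbose=0):
    """Hypothetical wrapper mirroring the calls above, not the project's actual code."""
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    if reverse and isinstance(data, dict):
        # e.g. turn {"label": index} into {index: "label"} for the label mapping file
        data = {v: k for k, v in data.items()}
    if label_map is not None:
        # assumes each record is a (text, label) pair; replace labels with ids
        data = [(text, label_map.get(label, label)) for text, label in data]
    if verbose and name:
        print('{}: {} entries loaded'.format(name, len(data)))
    return data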
Ejemplo n.º 29
0
    def _reset(self, data_path, save):
        if not save:
            print("extract arch2vec embedding table...")
            dataset = load_json(data_path)
            self.model = Model(input_dim=args.input_dim,
                               hidden_dim=args.hidden_dim,
                               latent_dim=args.latent_dim,
                               num_hops=args.hops,
                               num_mlp_layers=args.mlps,
                               dropout=args.dropout,
                               **cfg['GAE']).cuda()
            model_ckpt_path = os.path.join(self.dir_name,
                                           '{}'.format(args.model_path))
            if not os.path.exists(model_ckpt_path):
                print("File {} does not exist.".format(model_ckpt_path))
                exit()
            self.model.load_state_dict(
                torch.load(model_ckpt_path)['model_state'])
            self.model.eval()
            print("length of the dataset: {}".format(len(dataset)))
            self.f_path = os.path.join(
                self.dir_name, '{}-arch2vec.pt'.format(args.dataset_name))
            if os.path.exists(self.f_path):
                print('ATTENTION!!! {} is already saved.'.format(self.f_path))
                exit()
            print('save to {} ...'.format(self.f_path))
            for ind in range(len(dataset)):
                adj = torch.Tensor(
                    dataset[str(ind)]['module_adjacency']).unsqueeze(0).cuda()
                ops = torch.Tensor(dataset[str(ind)]
                                   ['module_operations']).unsqueeze(0).cuda()
                adj, ops, prep_reverse = preprocessing(adj, ops, **cfg['prep'])
                test_acc = dataset[str(ind)]['test_accuracy']
                valid_acc = dataset[str(ind)]['validation_accuracy']
                other_info = {
                    'valid_accuracy_avg':
                    dataset[str(ind)]['validation_accuracy_avg'],
                    'test_accuracy_avg':
                    dataset[str(ind)]['test_accuracy_avg']
                }
                time = dataset[str(ind)]['training_time']
                x, _ = self.model._encoder(ops, adj)
                self.embedding[ind] = {
                    'feature': x.mean(dim=1).squeeze(0).cpu(),
                    'valid_accuracy': float(valid_acc),
                    'test_accuracy': float(test_acc),
                    'time': float(time),
                    'other_info': other_info
                }
            torch.save(self.embedding, self.f_path)
            print("finished arch2vec extraction")
            exit()
        else:
            self.f_path = os.path.join(
                self.dir_name, '{}-arch2vec.pt'.format(args.dataset_name))
            print("load pretrained arch2vec in path: {}".format(self.f_path))
            self.embedding = torch.load(self.f_path)
            random.seed(args.seed)
            random.shuffle(self.embedding)
            for ind in range(len(self.embedding)):
                self.features.append(self.embedding[ind]['feature'])
            self.features = torch.stack(self.features, dim=0)
            print('loading finished. pretrained embeddings shape: {}'.format(
                self.features.shape))
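The table written in the first branch is a plain dict keyed by architecture index, so it can be inspected on its own; a small sketch, with the file name following the '{}-arch2vec.pt' pattern above and a hypothetical dataset name:

import torch

# Hypothetical file name following the '{}-arch2vec.pt' pattern above.
embedding = torch.load('nasbench201-arch2vec.pt')
features = torch.stack([embedding[i]['feature'] for i in range(len(embedding))])
test_accs = [embedding[i]['test_accuracy'] for i in range(len(embedding))]
print('feature matrix:', features.shape, '| best test accuracy:', max(test_accs))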
def load_config(config_file):
    config = load_json(config_file)
    return config
def evaluate_helmet_image_sgcc_score(predicted_file_json_path: str,
                                     gold_json_file_path: str,
                                     iou_threshold: float,
                                     false_detection_weight: float,
                                     missed_detection_weight: float,
                                     object_detection_weight: float):
    """ calculate the sgcc helmet image score by the predicted and gold json file """
    try:
        gt_data = load_json(gold_json_file_path)
        pred_data = load_json(predicted_file_json_path)

        # load the names of categories
        class_name_list = []
        for class_item in gt_data['categories']:
            if isinstance(class_item['name'], list):
                class_name_list.append(class_item['name'][0])
            else:
                class_name_list.append(class_item['name'])

        class_name_dict = {'wear_helmet_label': 1, 'no_helmet_label': 0}

        # iterate over the images, one picture per batch
        false_detection_count = 0
        detection_no_wear_total_count = 0
        missed_detection_count = 0
        gold_no_wear_total_count = 0
        object_detection_correct_count = 0
        object_detection_total_count = 0
        for i in range(len(gt_data['images'])):
            image_id = gt_data['images'][i]['id']
            # load gold annotations, ann_gt = n * [cls_id, x1, y1, x2, y2]
            labels_gt, ann_gt = get_ann(image_id, gt_data['annotations'])
            # load predicted annotations, ann_pred = n * [x1, y1, x2, y2, pred_score, cls_id]
            _, ann_pred = get_ann(image_id, pred_data)
            # sort the predicted annotations by confidence score in descending order
            if len(ann_pred):
                ann_pred = ann_pred[(-ann_pred[:, 4]).argsort()]

            ann_pred = torch.Tensor(ann_pred)
            ann_gt = torch.Tensor(ann_gt)

            # predicted no_wear boxes and labels
            if len(ann_pred) == 0:
                pred_no_wear_indices, pred_no_wear_labels, pred_no_wear_boxes = [], [], []
            else:
                pred_no_wear_indices = torch.where(
                    ann_pred[:, -1] == class_name_dict['no_helmet_label'])
                pred_no_wear_labels = ann_pred[:, -1][pred_no_wear_indices]
                pred_no_wear_boxes = ann_pred[:, :4][pred_no_wear_indices]

            # target no_wear boxes and labels
            if len(ann_gt) == 0:
                target_no_wear_indices, target_no_wear_labels, target_no_wear_boxes = [], [], []
            else:
                target_no_wear_indices = torch.where(
                    ann_gt[:, 0] == class_name_dict['no_helmet_label'])
                target_no_wear_labels = ann_gt[:, 0][target_no_wear_indices]
                target_no_wear_boxes = ann_gt[:, 1:][target_no_wear_indices]

            false_detection_number, detection_no_wear_number = helmet_image_false_detection(
                pred_no_wear_labels=pred_no_wear_labels,
                pred_no_wear_boxes=pred_no_wear_boxes,
                target_no_wear_labels=target_no_wear_labels,
                target_no_wear_boxes=target_no_wear_boxes,
                iou_threshold=iou_threshold)
            false_detection_count += false_detection_number
            detection_no_wear_total_count += detection_no_wear_number

            missed_detection_number, gold_no_wear_number = helmet_image_missed_detection(
                pred_no_wear_labels=pred_no_wear_labels,
                pred_no_wear_boxes=pred_no_wear_boxes,
                target_no_wear_labels=target_no_wear_labels,
                target_no_wear_boxes=target_no_wear_boxes,
                iou_threshold=iou_threshold)
            missed_detection_count += missed_detection_number
            gold_no_wear_total_count += gold_no_wear_number

            object_detection_correct_number, object_detection_total_number = helmet_image_object_detection(
                pred_no_wear_labels=pred_no_wear_labels,
                pred_no_wear_boxes=pred_no_wear_boxes,
                target_no_wear_labels=target_no_wear_labels,
                target_no_wear_boxes=target_no_wear_boxes,
                iou_threshold=iou_threshold)
            object_detection_correct_count += object_detection_correct_number
            object_detection_total_count += object_detection_total_number

        false_detection_rate = (false_detection_count /
                                detection_no_wear_total_count) if (
                                    detection_no_wear_total_count != 0) else 0
        missed_detection_rate = (missed_detection_count /
                                 gold_no_wear_total_count) if (
                                     gold_no_wear_total_count != 0) else 0
        object_detection_correct_rate = (
            object_detection_correct_count / object_detection_total_count) if (
                object_detection_total_count != 0) else 0

        logger.info("false_detection_rate: {} / {} = {}".format(
            false_detection_count, detection_no_wear_total_count,
            false_detection_rate))
        logger.info("missed_detection_rate: {} / {} = {}".format(
            missed_detection_count, gold_no_wear_total_count,
            missed_detection_rate))
        logger.info("object_detection_correct_rate: {} / {} = {}".format(
            object_detection_correct_count, object_detection_total_count,
            object_detection_correct_rate))

        sgcc_helmet_image_score = 1 - (
            false_detection_weight * false_detection_rate +
            missed_detection_weight * missed_detection_rate +
            object_detection_weight * (1 - object_detection_correct_rate))

        logger.info("evaluation for {} and {}\n".format(
            predicted_file_json_path, gold_json_file_path))
        ap_table = [[
            "false detection rate", "missed detection rate",
            "object detection correct rate", "sgcc helmet image score"
        ]]
        ap_table += [[
            false_detection_rate, missed_detection_rate,
            object_detection_correct_rate, sgcc_helmet_image_score
        ]]
        logger.info("\n{}\n".format(AsciiTable(ap_table).table))

        return float('{:.8f}'.format(sgcc_helmet_image_score)), "evaluation succeeded"
    except AssertionError:
        _, _, tb = sys.exc_info()
        traceback.print_tb(tb)
        tb_info = traceback.extract_tb(tb)
        filename, line, func, text = tb_info[-1]

        logger.info('an error occurred on line {} in statement {}'.format(
            line, text))

        return -1, "format error"
    except Exception as e:
        # the AssertionError handler must come first; placing a bare
        # `except Exception` before it would make it unreachable
        logger.info('evaluation failed: {}'.format(e))
        return -1, "format error"
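Finally, a hedged example of calling the scorer; the paths, threshold and weights are placeholders, and the only fixed relationship is the one computed above: score = 1 - (w_fd * false_detection_rate + w_md * missed_detection_rate + w_od * (1 - object_detection_correct_rate)).

# Hypothetical invocation; paths, threshold and weights are placeholders.
score, message = evaluate_helmet_image_sgcc_score(
    predicted_file_json_path='predictions.json',
    gold_json_file_path='gold_annotations.json',
    iou_threshold=0.5,
    false_detection_weight=0.3,
    missed_detection_weight=0.5,
    object_detection_weight=0.2)
# returns (score, 'evaluation succeeded') on success or (-1, 'format error') on failure
print(score, message)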