Example #1
def extract_raw_refs_from_pdfs():
    metas = util.load_json(cfg.paths['papers-metadata'])
    refs = util.parallelize(extract_raw_refs_from_pdf, metas, N_THREADS)
    refs = {m['uid']: r for m, r in zip(metas, refs)}
    util.save_json(cfg.paths['raw-papers-refs'], refs)

    print('saved raw papers refs to "{}"'.format(cfg.paths['raw-papers-refs']))
Example #2
def process():
    cands = load_results()
    model = AttentionModel()
    mc_ctx = MultiChoiceQuestionManger(subset='val')

    results = []
    t = time()
    for i, res_i in enumerate(cands):
        if i % 100 == 0:
            avg_time = (time() - t) / 100.
            print('%d/%d (%0.2f sec/sample)' % (i, len(cands), avg_time))
            t = time()

        image_id = res_i['image_id']
        aug_id = res_i['aug_id']
        question = res_i['target']
        # question_id = int(aug_id / 1000)
        question_id = res_i['question_id']
        gt_answer = mc_ctx.get_gt_answer(question_id)
        pred_ans, sc = model.get_score(image_id, question)
        is_valid = compare_answer(pred_ans, gt_answer)
        if not is_valid:
            continue
        t_i = {
            'image_id': int(image_id),
            'aug_id': aug_id,
            'question_id': question_id,
            'question': question,
            'score': float(sc)
        }
        results.append(t_i)
    save_json('result/bs_vis_scores_mlb2-att.json', results)
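All of the snippets on this page revolve around the same pair of JSON helpers, whether imported directly or through util. A minimal sketch of what they are assumed to do (each project's actual implementation may differ; a few variants take extra arguments, such as the indent in Examples #27 and #28, or the (key, data, dir) signature in Example #13):

import json

def save_json(path, data, indent=None):
    # serialize data to path as JSON
    with open(path, 'w') as f:
        json.dump(data, f, indent=indent)

def load_json(path):
    # parse and return the JSON document stored at path
    with open(path) as f:
        return json.load(f)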
Example #3
def get_journals():
    pos, cnt = 1, 0
    util.mkdir(JOURNAL_FOLDER)
    while True:
        html = util.get_page(JOURNAL_URL + str(pos))
        links = util.find_journals(html)
        once_cnt = 0
        for link in links:
            if link[0] == '' or '?' in link[0]:
                continue
            data = {}
            data['type'] = 'journal'
            data['short'] = link[0]
            data['name'] = link[1]
            data['url'] = 'http://dblp.uni-trier.de/db/journals/' + data['short']
            util.save_json(
                os.path.join(JOURNAL_FOLDER, util.hex_hash(data['short'])),
                data)
            cnt += 1
            once_cnt += 1
        if once_cnt == 0:
            break
        pos += 100
        print('Journal', cnt)
Example #4
def get_conferences():
    files = util.listdir(CONFERENCE_FOLDER)
    util.mkdir(CONFERENCE_CRALWED_FOLDER)
    cnt = 0
    conf = util.load_json('conf_name.json')
    for file_name in files:
        save_path = os.path.join(CONFERENCE_CRALWED_FOLDER, file_name)
        if util.exists(save_path):
            continue
        data = util.load_json(os.path.join(CONFERENCE_FOLDER, file_name))
        if data['short'] not in conf.keys():
            continue
        html = util.get_page(data['url'])
        subs = get_subs(data['short'], html)
        data['name'] = conf[data['short']]
        data['sub'] = {}
        for sub in subs:
            if sub not in conf.keys():
                continue
            html = util.get_page('http://dblp.uni-trier.de/db/conf/' + sub)
            data['sub'][sub] = {}
            data['sub'][sub]['pub'] = get_publications(html)
            data['sub'][sub]['name'] = conf[sub]
        cnt += 1
        print(cnt, len(files), data['short'])
        util.save_json(save_path, data)
Example #5
def process(model_type='mlb'):
    cands = load_results()
    if model_type == 'mlb':
        model = AttentionModel()
    else:
        model = VanillaModel()
    mc_ctx = MultiChoiceQuestionManger(subset='val')

    results = {}
    t = time()
    for i, res_key in enumerate(cands):
        if i % 100 == 0:
            avg_time = (time() - t) / 100.
            print('%d/%d (%0.2f sec/sample)' % (i, len(cands), avg_time))
            t = time()
        res_i = cands[res_key]
        image_id = res_i['image_id']
        question = res_i['target']
        question_id = res_i['question_id']
        gt_answer = mc_ctx.get_gt_answer(question_id)
        pred_ans, scores = model.get_score(image_id, question)
        sc = float(scores)
        is_valid = compare_answer(pred_ans, gt_answer)
        # if not is_valid:
        #     continue
        results[res_key] = {
            'pred_answer': pred_ans,
            'pred_score': sc,
            'gt_answer': gt_answer,
            'is_valid': is_valid
        }
    save_json('result/%s_scores_final_v2.json' % model_type, results)
Example #6
 def save(self, folder_name="autosave"):
     base_path = os.path.join(self.save_path, folder_name)
     print('save %s to %s' % (self.scene_handler.scene.name, base_path))
     scn = self.scene_handler.scene
     util.mkdir_if_absent(base_path)
     util.save_json(self.dict_repr(), os.path.join(base_path, 'game'))
     util.save_json(scn.dict_repr(), os.path.join(base_path, scn.name))
Example #7
def process():
    def _parse_image_id(image):
        return int(image.split('.')[0].split('_')[-1])

    model = AttentionModel()
    ans2top_ans = AnswerTokenToTopAnswer()

    task_data_dir = '/usr/data/fl302/code/utils/bs_data_maker'
    task_data_file = os.path.join(task_data_dir, 'task_data_for_verif.json')
    task_data = load_json(task_data_file)
    is_valid = []
    num = len(task_data)
    for i, info in enumerate(task_data):
        print('%d/%d' % (i, num))
        image = info['image']
        image_id = _parse_image_id(image)
        question = info['target']
        answer = info['answer']
        scores = model.inference(image_id, question)
        scores[:, -1] = -10.
        top_ans_id = ans2top_ans.direct_query(answer)
        if top_ans_id == 2000:
            raise Exception('Warning: answer oov')
        scores = scores.flatten()
        pred_top_ans_id = scores.argmax()
        is_valid.append(int(pred_top_ans_id == top_ans_id))

    n_valid = sum(is_valid)
    print('valid: %d/%d' % (n_valid, num))
    save_json(os.path.join(task_data_dir, 'task_data_verif_state.json'),
              is_valid)
Example #9
def score_replay_buffer():
    d = load_json('vqa_replay_buffer/low/vqa_replay.json')
    vqa_data = VQAData()

    # create model
    sess, model = create_model()

    memory = d['memory']
    new_memory = {}
    for i, quest_key in enumerate(memory.keys()):
        pathes = memory[quest_key]
        if len(pathes) == 0:
            continue
        if i % 100 == 0:
            print('Processed %d/%d items' % (i, len(memory)))
        new_memory[quest_key] = {}
        # if it has valid questions
        quest_id = int(quest_key)
        pathes = memory[quest_key].keys()
        quest, quest_len = process_questions(pathes)
        image, top_ans = vqa_data.get_data(quest_id)
        vqa_inputs = [image, quest, quest_len, top_ans]
        scores = sess.run(model.prob,
                          feed_dict=model.fill_feed_dict(vqa_inputs))
        confs = scores[:, top_ans]
        ref = memory[quest_key]
        for path_key, new_conf in zip(pathes, confs):
            new_memory[quest_key][path_key] = (ref[path_key], float(new_conf))
    # save
    save_json('vqa_replay_buffer/vqa_replay_low_rescore.json',
              {'memory': new_memory})
Example #10
def process_worker(subset, id, proc_range, to_sentence):
    # params
    k = 50
    res_file = 'result/quest_vaq_nn_%s_worker%d.json' % (subset, id)

    # load distances
    val_qids, nn_ids = load_image_nn(subset=subset)
    # slice
    val_qids = val_qids[proc_range]
    nn_ids = nn_ids[proc_range]

    # create nn model
    nn_model = QuestionPool()

    # run
    num = len(val_qids)
    results = []
    for i, (v_qid, v_nn) in enumerate(zip(val_qids, nn_ids)):
        # run nn search
        t = time()
        tr_qid, tr_path = nn_model.get_candidates(v_nn[:k])
        sent = to_sentence.index_to_question(tr_path)
        print(sent)
        print('P%d: Processing %d/%d, time %0.2f sec.' %
              (id, i, num, time() - t))
        res_i = {'question_id': int(v_qid), 'question': sent}
        results.append(res_i)
    save_json(res_file, results)
    print('P%d: Done' % id)
    return
Example #11
def process():
    cands = load_results()
    model = N2MNWrapper()
    mc_ctx = MultiChoiceQuestionManger(subset='val')

    results = []
    t = time()
    for i, res_i in enumerate(cands):
        if i % 100 == 0:
            avg_time = (time() - t) / 100.
            print('%d/%d (%0.2f sec/sample)' % (i, len(cands), avg_time))
            t = time()

        image_id = res_i['image_id']
        aug_id = res_i['question_id']
        question = res_i['question']
        question_id = int(aug_id / 1000)
        gt_answer = mc_ctx.get_gt_answer(question_id)
        pred_answers, scores = model.inference(image_id, [question])
        sc = scores[0]
        pred_ans = pred_answers[0]
        is_valid = compare_answer(pred_ans, gt_answer)
        if not is_valid:
            continue
        t_i = {
            'image_id': int(image_id),
            'question_id': aug_id,
            'question': question,
            'score': float(sc)
        }
        results.append(t_i)
    save_json('result/vae_ia_van_n2mn_flt_full.json', results)
Example #13
def precompute(num_p, offset=0, primes_path="primes.txt", store_dir=store_dir):
    Path(store_dir).mkdir(exist_ok=True)

    primes = load_primes(primes_path)
    for p in tqdm(primes[offset:num_p], desc="Precomputing"):
        rc_dict = precompute_p(p)
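        # note: this project's util.save_json apparently takes (key, data, directory) rather than (path, data)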
        util.save_json(p, rc_dict, store_dir)
Example #14
def merge_result(res1, res2):
    results = []
    unk_counts = []
    batch_vqa_scores = []
    for quest_id in res1:
        items1 = res1[quest_id]
        items2 = res2[quest_id]
        res_i, scores_i, n_i = _merge_item(items1, items2)
        results += res_i
        batch_vqa_scores.append(scores_i.mean())
        unk_counts.append(n_i)
    # save results
    res_file = 'result/bs_gen_%s.json' % 'vae_ia_merge'
    score_file = 'result/bs_vqa_scores_%s.mat' % 'vae_ia_merge'
    save_json(res_file, results)
    batch_vqa_scores = np.array(batch_vqa_scores, dtype=np.float32)
    mean_vqa_score = batch_vqa_scores.mean()
    mean_unk_count = np.mean(unk_counts)

    savemat(score_file, {
        'scores': batch_vqa_scores,
        'mean_score': mean_vqa_score
    })
    print('BS mean VQA score: %0.3f' % mean_vqa_score)
    print('BS mean #questions: %0.3f' % mean_unk_count)
    return res_file, mean_vqa_score, mean_unk_count
Example #15
def process():
    # load data
    blacklist = make_blacklist()
    save_json('data/kptest_blacklist.json', blacklist)
    qrpe = load_qrpe_data(blacklist)
    vtfp = load_vtfp_data(blacklist)
    meta = qrpe + vtfp
    # process data
    images, image_ids, questions = [], [], []
    encoder = SentenceEncoder()
    for item in meta:
        image_id = item['image_id']
        image = item['image']
        tokens = encoder.encode_sentence(item['question'])
        images.append(image)
        image_ids.append(image_id)
        questions.append(tokens)
    # put to array
    from post_process_variation_questions import put_to_array
    arr, arr_len = put_to_array(questions)

    save_json('data/QRD_irrelevant_meta.json', {
        'images': images,
        'image_ids': image_ids
    })
    image_ids = np.array(image_ids, dtype=np.int32)
    save_hdf5('data/QRD_irrelevant_data.data', {
        'image_ids': image_ids,
        'quest': arr,
        'quest_len': arr_len
    })
Example #16
def process_test():
    from util import save_hdf5, save_json
    # load data
    meta = load_bsir_dataset()
    # process data
    labels, images, image_ids, questions = [], [], [], []
    encoder = SentenceEncoder()
    for item in meta:
        image_id = item['image_id']
        image = item['image']
        tokens = encoder.encode_sentence(item['question'])
        images.append(image)
        image_ids.append(image_id)
        questions.append(tokens)
        labels.append(item['label'])
    # put to array
    from post_process_variation_questions import put_to_array
    arr, arr_len = put_to_array(questions)

    save_json('data/QRD_irrelevant_meta_test.json', {
        'images': images,
        'image_ids': image_ids
    })
    image_ids = np.array(image_ids, dtype=np.int32)
    labels = np.array(labels, dtype=np.float32)
    save_hdf5(
        'data/QRD_irrelevant_data_test.data', {
            'image_ids': image_ids,
            'quest': arr,
            'quest_len': arr_len,
            'labels': labels
        })
Example #17
def _encode_w2v(images, encoder, subset):
    quest_coding = []
    cands_coding = []
    labels = []
    quest_ids = []
    cands_meta = []
    for i, info in enumerate(images):
        if not i % 1000:
            tf.logging.info("%s: processed %d of %d items." %
                            (subset.upper(), i, len(images)))

        quest_id = info.question_id
        q_w2v = encoder.encode(info.question)
        ca_w2v, label = _encode_answer_candidates(info, encoder)
        quest_coding.append(q_w2v)
        cands_coding.append(ca_w2v)
        labels.append(label)
        quest_ids.append(quest_id)
        _m = {'quest_id': quest_id, 'cands': info.choices}
        cands_meta.append(_m)
    # ready to pack data
    quest_coding = np.concatenate(quest_coding, axis=0).astype(np.float32)
    cands_coding = np.concatenate(cands_coding, axis=0).astype(np.float32)
    labels = np.array(labels, dtype=np.int32)
    quest_ids = np.array(quest_ids, dtype=np.int32)
    save_hdf5(
        'data3/vqa_mc_w2v_coding_%s.data' % subset, {
            'quest_w2v': quest_coding,
            'cands_w2v': cands_coding,
            'labels': labels,
            'quest_ids': quest_ids
        })
    save_json('data3/vqa_mc_cands_%s.meta' % subset, cands_meta)
Example #18
def search_pub_and_save(uid, query, base_dir):
    data = {
        'uid': uid,
        'query': query,
        'success': True,
        'message': '',
        'result': {},
    }
    path = get_search_result_path(uid, query, base_dir)
    if os.path.isfile(path):
        data = util.load_json(path)
        if data['success'] and data['result']:
            print('skipping query "{}" with results'.format(query))
            return
    print('executing query "{}"'.format(query))
    try:
        result = search_pub(query)
        data['result'] = result
    except Exception as e:
        error_msg = str(e)
        print('ERROR for query "{}": "{}"'.format(query, error_msg))
        data['success'] = False
        data['message'] = error_msg
    util.save_json(path, data)
    print('saved query "{}" to {}'.format(query, path))
    global sleep_time
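    # crude backoff: any query that comes back empty stretches the shared delay by 10%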
    if not data['result']:
        sleep_time *= 1.1
    print('sleeping for {:.2f} seconds'.format(sleep_time))
    time.sleep(sleep_time)
    return data
Example #19
def parse_raw_refs():
    refs = util.load_json(cfg.paths['raw-papers-refs'])
    data = util.parallelize(_parse_raw_refs, list(refs.values()), N_THREADS)
    refs = {k: v for k, v in zip(refs.keys(), data)}
    util.save_json(cfg.paths['papers-refs'], refs)

    print('saved papers refs to "{}"'.format(cfg.paths['papers-refs']))
Example #20
def process():
    cands = load_results()
    model = N2MNWrapper()
    mc_ctx = MultiChoiceQuestionManger(subset='val')

    results = {}
    t = time()
    for i, res_key in enumerate(cands):
        if i % 100 == 0:
            avg_time = (time() - t) / 100.
            print('%d/%d (%0.2f sec/sample)' % (i, len(cands), avg_time))
            t = time()
        res_i = cands[res_key]
        image_id = res_i['image_id']
        question = res_i['target']
        question_id = res_i['question_id']
        gt_answer = mc_ctx.get_gt_answer(question_id)
        pred_answers, scores = model.inference(image_id, [question])
        sc = float(scores[0])
        pred_ans = pred_answers[0]
        is_valid = compare_answer(pred_ans, gt_answer)
        # if not is_valid:
        #     continue
        results[res_key] = {
            'pred_answer': pred_ans,
            'pred_score': sc,
            'gt_answer': gt_answer,
            'is_valid': is_valid
        }
    save_json('result/n2mn_scores_final_v2.json', results)
Example #21
def ivqa_decoding_beam_search(checkpoint_path=None, subset='kpval'):
    model_config = ModelConfig()
    res_file = 'result/quest_vaq_greedy_%s_%s.json' % (
        FLAGS.model_type.upper(), subset)
    # Get model
    model_fn = get_model_creation_fn(FLAGS.model_type)
    create_fn = create_reader(FLAGS.model_type, phase='test')

    # Create the vocabulary.
    to_sentence = SentenceGenerator(trainset='trainval')

    # get data reader
    reader = create_fn(batch_size=100,
                       subset=subset,
                       version=FLAGS.test_version)

    if checkpoint_path is None:
        ckpt_dir = FLAGS.checkpoint_dir % (FLAGS.version, FLAGS.model_type)
        # ckpt_dir = '/import/vision-ephemeral/fl302/models/v2_kpvaq_VAQ-RL/'
        ckpt = tf.train.get_checkpoint_state(ckpt_dir)
        checkpoint_path = ckpt.model_checkpoint_path

    # Build model
    g = tf.Graph()
    with g.as_default():
        # Build the model.
        model = model_fn(model_config, 'beam')
        model.build()
        # Restore from checkpoint
        restorer = Restorer(g)
        sess = tf.Session()
        restorer.restore(sess, checkpoint_path)

    num_batches = reader.num_batches

    print('Running beam search inference...')
    results = []
    for i in range(num_batches):
        outputs = reader.get_test_batch()

        # inference
        quest_ids, image_ids = outputs[-2:]
        scores, pathes = model.greedy_inference(outputs[:-2], sess)

        scores, pathes = post_process_prediction(scores, pathes)
        question = to_sentence.index_to_question(pathes[0])
        print('%d/%d: %s' % (i, num_batches, question))

        for quest_id, image_id, path in zip(quest_ids, image_ids, pathes):
            sentence = to_sentence.index_to_question(path)
            res_i = {
                'image_id': int(image_id),
                'question_id': int(quest_id),
                'question': sentence
            }
            results.append(res_i)

    save_json(res_file, results)
    return res_file
Example #22
 def save_data(self):
   save_payload = {
     "trades": self.swaps,
     "locks": self.locks,
     "history": self.history,
     "addresses": self.addresses
   }
   save_json(self.get_path(), save_payload)
Example #23
 def save(self):
     if self.num_call % self.save_interval == 0:
         print('Saving VQA replay buffers')
         t = time()
         sv_file = os.path.join(self.sv_dir, 'vqa_replay.json')
         save_json(sv_file, {'num_call': self.num_call,
                             'memory': self.memory})
         print('File %s saved to disk, total time: %0.2fs' % (sv_file, time() - t))
Example #24
def process(method, inf_type='rand'):
    if inf_type == 'rand':
        res_file = 'result/tmp_bs_RL2_final_%s.json' % method
    else:
        res_file = 'result/tmp_bs_RL2_final_%s_BEAM.json' % method
    if os.path.exists(res_file):
        print('File %s already exist, skipped' % res_file)
        return

    # cands = load_results()
    model = _TYPE2Model[method]()
    mc_ctx = MultiChoiceQuestionManger(subset='val')

    task_data = load_lm_outputs(method, inf_type)

    belief_sets = {}
    t = time()
    num = len(task_data)
    for i, ans_key in enumerate(task_data.keys()):
        # time it
        avg_time = (time() - t)
        print('%d/%d (%0.2f sec/sample)' % (i, num, avg_time))
        t = time()

        # extract basis info
        cands = task_data[ans_key]
        quest_id = cands[0]['question_id']

        # gt_answer = mc_ctx.get_gt_answer(quest_id)
        image_id = mc_ctx.get_image_id(quest_id)
        image = mc_ctx.get_image_file(quest_id)

        # process
        gt_question = mc_ctx.get_question(quest_id)

        i_scores, i_questions = [], []
        for item in cands:
            target = item['question']
            pred_ans, vqa_score = model.get_score(image_id, target)
            # consistency check: keep only questions whose predicted answer matches
            is_valid = compare_answer(pred_ans, ans_key)
            if not is_valid:
                continue
            i_questions.append(target)
            i_scores.append([float(vqa_score), item['score']])
        print('%d/%d' % (len(i_questions), len(cands)))
        bs_i = {
            'image': image,
            'image_id': image_id,
            'question': gt_question,
            'answer': ans_key,
            'belief_sets': i_questions,
            'belief_strength': i_scores
        }

        belief_sets[ans_key] = bs_i
    save_json(res_file, belief_sets)
Example #25
def vaq_decoding_greedy(checkpoint_path=None, subset='kpval'):
    model_config = ModelConfig()
    res_file = 'result/quest_vaq_greedy_%s.json' % FLAGS.model_type.upper()

    # Get model
    model_fn = get_model_creation_fn(FLAGS.model_type)
    create_fn = create_reader(FLAGS.model_type, phase='test')
    # Create the vocabulary.
    to_sentence = SentenceGenerator(trainset='trainval')

    # build data reader
    reader = create_fn(batch_size=32, subset=subset)

    if checkpoint_path is None:
        ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir %
                                             FLAGS.model_type)
        checkpoint_path = ckpt.model_checkpoint_path

    g = tf.Graph()
    with g.as_default():
        # Build the model.
        model = model_fn(model_config, 'greedy')
        model.build()
        saver = tf.train.Saver()

        sess = tf.Session()
        tf.logging.info('Restore from model %s' %
                        os.path.basename(checkpoint_path))
        saver.restore(sess, checkpoint_path)

    num_batches = reader.num_batches

    print('Running greedy inference...')
    results = []
    for i in range(num_batches):
        outputs = reader.get_test_batch()

        # inference
        quest_ids, image_ids = outputs[-2:]
        scores, pathes = model.greedy_inference(outputs[:-2], sess)

        scores, pathes = post_process_prediction(scores, pathes)
        question = to_sentence.index_to_question(pathes[0])
        print('%d/%d: %s' % (i, num_batches, question))

        for quest_id, image_id, path in zip(quest_ids, image_ids, pathes):
            sentence = to_sentence.index_to_question(path)
            res_i = {
                'image_id': int(image_id),
                'question_id': int(quest_id),
                'question': sentence
            }
            results.append(res_i)

    save_json(res_file, results)
    return res_file
Example #26
def make_dummpy_result_file(pop_q):
    from util import load_json, save_json
    results = load_json('result/quest_vaq_greedy_VAQ-SAT_kptest.json')
    for item in results:
        item['image_id'] = int(item['image_id'])
        item['question_id'] = int(item['question_id'])
        item['question'] = pop_q
    sv_file = 'result/tmp_pop_q_kptest.json'
    save_json(sv_file, results)
    return sv_file
Example #27
    def save_framesummary(self, frame):
        data = {}
        data['id'] = frame.id
        data['name'] = frame.name
        data['topic_ids'] = frame.topic_ids
        data['bins'] = [(id, bin_name_from_id(id)) for id in frame.bin_ids]
        data['counts'] = frame.counts
        data['heatmapGrid'] = frame.heatmap.grid
        data['heatmapWeights'] = self.get_frame_heatmap_weights(frame)

        file_path = join(self.target_dir, 'framesummary', str(frame.id))
        save_json(file_path, data, indent=0)
Example #28
    def save_topicframe(self, frame, topic_frame):
        data = {}
        data['topicId'] = topic_frame.topic_id
        data['frameId'] = topic_frame.frame_id
        data['tokenList'] = topic_frame.token_list
        data['tokenWeights'] = self.get_token_weights(frame, topic_frame)
        data['heatmapWeights'] = self.get_topicframe_heatmap_weights(
            frame, topic_frame)
        data['counts'] = topic_frame.counts

        file_path = join(self.target_dir, 'topicframe',
                         str(topic_frame.frame_id), str(topic_frame.topic_id))
        save_json(file_path, data, indent=0)
Example #29
def get_authors():
	files = util.listdir(AUTHOR_FOLDER)
	util.mkdir(AUTHOR_CRALWED_FOLDER)
	for file_name in files:
		save_path = os.path.join(AUTHOR_CRALWED_FOLDER, file_name)
		if util.exists(save_path):
			continue
		data = util.load_json(os.path.join(AUTHOR_FOLDER, file_name))
		html = util.get_page(data['url'])
		full_name = get_full_name(html)
		data['name'] = full_name
		print(data['short'], full_name)
		data['links'] = get_links(data['short'], html)
		util.save_json(save_path, data)
Example #30
def download_missing_paper_pdfs():
    if not os.path.isdir(cfg.paths['pdfs-dir']):
        os.makedirs(cfg.paths['pdfs-dir'])

    metas = util.load_json(cfg.paths['papers-metadata'])
    metas = util.parallelize(download_paper_pdf_if_needed, metas, N_THREADS)
    util.save_json(cfg.paths['papers-metadata'], metas)

    print('\n----')
    print('saved updated papers metadata to "{}"'.format(
        cfg.paths['papers-metadata']))
    n = sum(int(os.path.isfile(m['pdf-path'])) for m in metas)
    print('{}/{} ({:.3f}%) items with pdfs'.format(n, len(metas),
                                                   100 * n / len(metas)))
Example #32
def build_question_set():
    sv_file = 'data/kprestval_pos_tags.json'
    st = StanfordPOSTagger('english-bidirectional-distsim.tagger')
    meta = load_and_process_metadata('val')
    images = split_data_by_seed(meta, 'kprestval')
    num = len(images)
    pos_tags_dict = {}
    for i, info in enumerate(images):
        question_id = info.question_id
        question = info.question.lower()
        _pos_tags = st.tag(word_tokenize(question))
        pos_tags_dict[question_id] = _pos_tags
        print('\nPOS TAGGER: %d/%d' % (i, num))
        print(_pos_tags)
    save_json(sv_file, {'pos_tags': pos_tags_dict})
Example #33
def convert_vqa_annotations():
    path_src = '../../data/VQA/Annotations/mscoco_val2014_annotations.json'
    d = load_json(path_src)
    anno = {}
    for info in d['annotations']:
        quest_id = int(info['question_id'])
        tmp = {}
        for _a in info['answers']:
            a = _a['answer']
            if a in tmp:
                tmp[a] += 1.0
            else:
                tmp[a] = 1.0
        anno[quest_id] = tmp
    save_json(_ANNO_FILE, anno)
Example #34
def get_conferences():
	files = util.listdir(CONFERENCE_FOLDER)
	util.mkdir(CONFERENCE_CRALWED_FOLDER)
	cnt = 0
	conf = util.load_json('conf_name.json')
	for file_name in files:
		save_path = os.path.join(CONFERENCE_CRALWED_FOLDER, file_name)
		if util.exists(save_path):
			continue
		data = util.load_json(os.path.join(CONFERENCE_FOLDER, file_name))
		if data['short'] not in conf.keys():
			continue
		html = util.get_page(data['url'])
		subs = get_subs(data['short'], html)
		data['name'] = conf[data['short']]
		data['sub'] = {}
		if len(subs) == 0:
			data['sub']['#'] = get_publications(html)
			util.save_json(save_path, data)
		cnt += 1
Example #35
def get_journals():
	files = util.listdir(JOURNAL_FOLDER)
	util.mkdir(JOURNAL_CRALWED_FOLDER)
	cnt = 0
	jour = util.load_json('jour_name.json')
	for file_name in files:
		save_path = os.path.join(JOURNAL_CRALWED_FOLDER, file_name)
		if util.exists(save_path):
			continue
		data = util.load_json(os.path.join(JOURNAL_FOLDER, file_name))
		if data['short'] not in jour.keys():
			continue
		html = util.get_page(data['url'])
		subs = get_subs(data['short'], html)
		data['name'] = jour[data['short']]
		data['sub'] = {}
		if len(subs) == 0:
			data['sub']['#'] = get_publications(html)
			util.save_json(save_path, data)
		cnt += 1
		print(cnt, len(files), data['short'])
Example #37
def get_conferences():
    pos, cnt = 1, 0
    util.mkdir(CONFERENCE_FOLDER)
    while True:
        html = util.get_page(CONFERENCE_URL + str(pos))
        links = util.find_conferences(html)
        once_cnt = 0
        for link in links:
            if link[0] == '' or '?' in link[0]:
                continue
            data = {}
            data['type'] = 'conference'
            data['short'] = link[0]
            data['name'] = link[1]
            data['url'] = 'http://dblp.uni-trier.de/db/conf/' + data['short']
            util.save_json(os.path.join(CONFERENCE_FOLDER, util.hex_hash(data['short'])), data)
            cnt += 1
            once_cnt += 1
        if once_cnt == 0:
            break
        pos += 100
        print('Conference', cnt)
Example #38
def get_authors():
    pos, cnt = 545504, 0
    util.mkdir(AUTHOR_FOLDER)
    while True:
        html = util.get_page(AUTHOR_URL + str(pos))
        links = util.find_authors(html)
        once_cnt = 0
        for link in links:
            if link[0] == '' or '?' in link[0]:
                continue
            data = {}
            data['type'] = 'author'
            data['short'] = link[0]
            data['name'] = link[1]
            data['url'] = 'http://dblp.uni-trier.de/pers/hd/a/' + data['short']
            util.save_json(os.path.join(AUTHOR_FOLDER, util.hex_hash(data['short'])), data)
            cnt += 1
            once_cnt += 1
        if once_cnt == 0:
            break
        pos += 300
        print('Author', pos, cnt)
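Examples #3, #37 and #38 are the same crawl pattern instantiated three times: step through a paginated dblp index in a fixed stride and stop at the first page that yields no usable links. A generic version of that loop might look like the sketch below (assumptions: find_items returns (short, name) pairs like util.find_journals does, and url_prefix carries the per-type path, since authors live under /pers/hd/a/ rather than /db/):

def crawl_index(page_url, url_prefix, find_items, folder, item_type, stride, pos=1):
    # Walk a paginated index, saving one JSON record per item,
    # until a page contributes nothing new (the stop rule used above).
    util.mkdir(folder)
    cnt = 0
    while True:
        html = util.get_page(page_url + str(pos))
        once_cnt = 0
        for short, name in find_items(html):
            if short == '' or '?' in short:
                continue  # skip empty or query-style links
            data = {'type': item_type, 'short': short,
                    'name': name, 'url': url_prefix + short}
            util.save_json(os.path.join(folder, util.hex_hash(short)), data)
            cnt += 1
            once_cnt += 1
        if once_cnt == 0:
            break
        pos += stride
        print(item_type.capitalize(), cnt)

get_journals above would then be crawl_index(JOURNAL_URL, 'http://dblp.uni-trier.de/db/journals/', util.find_journals, JOURNAL_FOLDER, 'journal', 100).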
Example #39
def get_journals():
    files = util.listdir(JOURNAL_FOLDER)
    util.mkdir(JOURNAL_CRALWED_FOLDER)
    cnt = 0
    jour = util.load_json('jour_name.json')
    for file_name in files:
        save_path = os.path.join(JOURNAL_CRALWED_FOLDER, file_name)
        if util.exists(save_path):
            continue
        data = util.load_json(os.path.join(JOURNAL_FOLDER, file_name))
        if data['short'] not in jour.keys():
            continue
        html = util.get_page(data['url'])
        subs = get_subs(data['short'], html)
        data['name'] = jour[data['short']]
        data['sub'] = {}
        for sub in subs:
            html = util.get_page('http://dblp.uni-trier.de/db/journals/' + sub)
            data['sub'][sub] = {}
            data['sub'][sub]['pub'] = get_publications(html)
            data['sub'][sub]['name'] = jour[sub]
        cnt += 1
        print(cnt, len(files), data['short'])
        util.save_json(save_path, data)
Example #40
            ip_choices.remove(ip)
            nodeStr = lb_node % (ip, node_port)
            nodesStr += "%s\n    " % nodeStr

        if health_mon == "true":
            hm = hm_req % (health_mon_type)
        else:
            hm = ""

        lbStr = lb_post % (lb_name + "%i" % i, lb_port, lb_proto, vip_type, nodesStr, hm)
        reqs.append(lbStr + "\n")
    return reqs


if __name__ == "__main__":
    prog = sys.argv[0]
    if len(sys.argv) < 1:
        usage(prog)
        sys.exit()
    ips = getIps("server_list.json")
    request_list = []
    configs = util.load_json("reqConfig.json")
    for config in configs:
        reqs = build_req(config, ips)
        for req in reqs:
            request_list.append(req)

    random.shuffle(request_list)

    util.save_json("requests.json", request_list)
Example #41
JOURNAL_CRALWED_FOLDER = os.path.join('link', 'journal')
CONFERENCE_CRALWED_FOLDER = os.path.join('link', 'conference')


merged_data = {}
merged_data['jour'] = {}
merged_data['conf'] = {}

cnt = 0
files = util.listdir(JOURNAL_CRALWED_FOLDER)
for file_name in files:
	data = util.load_json(os.path.join(JOURNAL_CRALWED_FOLDER, file_name))
	short = data['short']
	del data['short']
	merged_data['jour'][short] = data
	cnt += 1
	print(cnt, len(files))

cnt = 0
files = util.listdir(CONFERENCE_CRALWED_FOLDER)
for file_name in files:
	data = util.load_json(os.path.join(CONFERENCE_CRALWED_FOLDER, file_name))
	short = data['short']
	del data['short']
	merged_data['conf'][short] = data
	cnt += 1
	print(cnt, len(files))


util.save_json('merged.json', merged_data)