def extract_raw_refs_from_pdfs():
    metas = util.load_json(cfg.paths['papers-metadata'])
    refs = util.parallelize(extract_raw_refs_from_pdf, metas, N_THREADS)
    refs = {m['uid']: r for m, r in zip(metas, refs)}
    util.save_json(cfg.paths['raw-papers-refs'], refs)
    print('saved raw papers refs to "{}"'.format(cfg.paths['raw-papers-refs']))

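These snippets call `util.save_json` / `util.load_json` helpers whose definitions are not shown; the sketch below is an assumption about what such thin wrappers around the standard `json` module typically look like, inferred only from the call sites in this file, not from the actual `util` module.

import json


def save_json(path, data, **kwargs):
    # Hypothetical helper (not the project's actual util module):
    # serialize `data` to `path` as JSON; extra kwargs (e.g. indent)
    # are passed through to json.dump.
    with open(path, 'w') as f:
        json.dump(data, f, **kwargs)


def load_json(path):
    # Hypothetical helper: parse and return the JSON content of `path`.
    with open(path) as f:
        return json.load(f)
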
def process():
    cands = load_results()
    model = AttentionModel()
    mc_ctx = MultiChoiceQuestionManger(subset='val')
    results = []
    t = time()
    for i, res_i in enumerate(cands):
        if i % 100 == 0:
            avg_time = (time() - t) / 100.
            print('%d/%d (%0.2f sec/sample)' % (i, len(cands), avg_time))
            t = time()
        image_id = res_i['image_id']
        aug_id = res_i['aug_id']
        question = res_i['target']
        # question_id = int(aug_id / 1000)
        question_id = res_i['question_id']
        gt_answer = mc_ctx.get_gt_answer(question_id)
        pred_ans, sc = model.get_score(image_id, question)
        is_valid = compare_answer(pred_ans, gt_answer)
        # import pdb
        # pdb.set_trace()
        if not is_valid:
            continue
        t_i = {
            'image_id': int(image_id),
            'aug_id': aug_id,
            'question_id': question_id,
            'question': question,
            'score': float(sc)
        }
        results.append(t_i)
    save_json('result/bs_vis_scores_mlb2-att.json', results)

def get_journals():
    pos, cnt = 1, 0
    util.mkdir(JOURNAL_FOLDER)
    while True:
        html = util.get_page(JOURNAL_URL + str(pos))
        links = util.find_journals(html)
        once_cnt = 0
        for link in links:
            if link[0] == '' or '?' in link[0]:
                continue
            data = {}
            data['type'] = 'journal'
            data['short'] = link[0]
            data['name'] = link[1]
            data['url'] = 'http://dblp.uni-trier.de/db/journals/' + data['short']
            util.save_json(os.path.join(JOURNAL_FOLDER, util.hex_hash(data['short'])), data)
            cnt += 1
            once_cnt += 1
        if once_cnt == 0:
            break
        pos += 100
    print('Journal', cnt)

def get_conferences():
    files = util.listdir(CONFERENCE_FOLDER)
    util.mkdir(CONFERENCE_CRALWED_FOLDER)
    cnt = 0
    conf = util.load_json('conf_name.json')
    for file_name in files:
        save_path = os.path.join(CONFERENCE_CRALWED_FOLDER, file_name)
        if util.exists(save_path):
            continue
        data = util.load_json(os.path.join(CONFERENCE_FOLDER, file_name))
        if data['short'] not in conf.keys():
            continue
        html = util.get_page(data['url'])
        subs = get_subs(data['short'], html)
        data['name'] = conf[data['short']]
        data['sub'] = {}
        for sub in subs:
            if sub not in conf.keys():
                continue
            html = util.get_page('http://dblp.uni-trier.de/db/conf/' + sub)
            data['sub'][sub] = {}
            data['sub'][sub]['pub'] = get_publications(html)
            data['sub'][sub]['name'] = conf[sub]
        cnt += 1
        print(cnt, len(files), data['short'])
        util.save_json(save_path, data)

def process(model_type='mlb'):
    cands = load_results()
    if model_type == 'mlb':
        model = AttentionModel()
    else:
        model = VanillaModel()
    mc_ctx = MultiChoiceQuestionManger(subset='val')
    results = {}
    t = time()
    for i, res_key in enumerate(cands):
        if i % 100 == 0:
            avg_time = (time() - t) / 100.
            print('%d/%d (%0.2f sec/sample)' % (i, len(cands), avg_time))
            t = time()
        res_i = cands[res_key]
        image_id = res_i['image_id']
        question = res_i['target']
        question_id = res_i['question_id']
        gt_answer = mc_ctx.get_gt_answer(question_id)
        pred_ans, scores = model.get_score(image_id, question)
        sc = float(scores)
        is_valid = compare_answer(pred_ans, gt_answer)
        # if not is_valid:
        #     continue
        results[res_key] = {
            'pred_answer': pred_ans,
            'pred_score': sc,
            'gt_answer': gt_answer,
            'is_valid': is_valid
        }
    save_json('result/%s_scores_final_v2.json' % model_type, results)

def save(self, folder_name="autosave"):
    base_path = os.path.join(self.save_path, folder_name)
    print('save %s to %s' % (self.scene_handler.scene.name, base_path))
    scn = self.scene_handler.scene
    util.mkdir_if_absent(base_path)
    util.save_json(self.dict_repr(), os.path.join(base_path, 'game'))
    util.save_json(scn.dict_repr(), os.path.join(base_path, scn.name))

def process():
    def _parse_image_id(image):
        return int(image.split('.')[0].split('_')[-1])

    model = AttentionModel()
    ans2top_ans = AnswerTokenToTopAnswer()
    task_data_dir = '/usr/data/fl302/code/utils/bs_data_maker'
    task_data_file = os.path.join(task_data_dir, 'task_data_for_verif.json')
    task_data = load_json(task_data_file)
    is_valid = []
    num = len(task_data)
    for i, info in enumerate(task_data):
        print('%d/%d' % (i, num))
        image = info['image']
        image_id = _parse_image_id(image)
        question = info['target']
        answer = info['answer']
        scores = model.inference(image_id, question)
        scores[:, -1] = -10.
        # pdb.set_trace()
        top_ans_id = ans2top_ans.direct_query(answer)
        if top_ans_id == 2000:
            raise Exception('Warning: answer oov')
        scores = scores.flatten()
        pred_top_ans_id = scores.argmax()
        is_valid.append(int(pred_top_ans_id == top_ans_id))
    n_valid = sum(is_valid)
    print('valid: %d/%d' % (n_valid, num))
    save_json(os.path.join(task_data_dir, 'task_data_verif_state.json'), is_valid)

def score_replay_buffer():
    d = load_json('vqa_replay_buffer/low/vqa_replay.json')
    vqa_data = VQAData()
    # create model
    sess, model = create_model()
    memory = d['memory']
    new_memory = {}
    for i, quest_key in enumerate(memory.keys()):
        pathes = memory[quest_key]
        if len(pathes) == 0:
            continue
        if i % 100 == 0:
            print('Processed %d/%d items' % (i, len(memory)))
        new_memory[quest_key] = {}
        # if it has valid questions
        quest_id = int(quest_key)
        pathes = memory[quest_key].keys()
        quest, quest_len = process_questions(pathes)
        image, top_ans = vqa_data.get_data(quest_id)
        vqa_inputs = [image, quest, quest_len, top_ans]
        scores = sess.run(model.prob, feed_dict=model.fill_feed_dict(vqa_inputs))
        confs = scores[:, top_ans]
        ref = memory[quest_key]
        for path_key, new_conf in zip(pathes, confs):
            new_memory[quest_key][path_key] = (ref[path_key], float(new_conf))
    # save
    save_json('vqa_replay_buffer/vqa_replay_low_rescore.json', {'memory': new_memory})

def process_worker(subset, id, proc_range, to_sentence):
    # params
    k = 50
    res_file = 'result/quest_vaq_nn_%s_worker%d.json' % (subset, id)
    # load distances
    val_qids, nn_ids = load_image_nn(subset=subset)
    # slice
    val_qids = val_qids[proc_range]
    nn_ids = nn_ids[proc_range]
    # create nn model
    nn_model = QuestionPool()
    # run
    num = len(val_qids)
    results = []
    for i, (v_qid, v_nn) in enumerate(zip(val_qids, nn_ids)):
        # run nn search
        t = time()
        tr_qid, tr_path = nn_model.get_candidates(v_nn[:k])
        sent = to_sentence.index_to_question(tr_path)
        print(sent)
        print('P%d: Processing %d/%d, time %0.2f sec.' % (id, i, num, time() - t))
        res_i = {'question_id': int(v_qid), 'question': sent}
        results.append(res_i)
    save_json(res_file, results)
    print('P%d: Done' % id)
    return

def process():
    cands = load_results()
    model = N2MNWrapper()
    mc_ctx = MultiChoiceQuestionManger(subset='val')
    results = []
    t = time()
    for i, res_i in enumerate(cands):
        if i % 100 == 0:
            avg_time = (time() - t) / 100.
            print('%d/%d (%0.2f sec/sample)' % (i, len(cands), avg_time))
            t = time()
        image_id = res_i['image_id']
        aug_id = res_i['question_id']
        question = res_i['question']
        question_id = int(aug_id / 1000)
        gt_answer = mc_ctx.get_gt_answer(question_id)
        pred_answers, scores = model.inference(image_id, [question])
        sc = scores[0]
        pred_ans = pred_answers[0]
        is_valid = compare_answer(pred_ans, gt_answer)
        # import pdb
        # pdb.set_trace()
        if not is_valid:
            continue
        t_i = {
            'image_id': int(image_id),
            'question_id': aug_id,
            'question': question,
            'score': float(sc)
        }
        results.append(t_i)
    save_json('result/vae_ia_van_n2mn_flt_full.json', results)

def precompute(num_p, offset=0, primes_path="primes.txt", store_dir=store_dir):
    Path(store_dir).mkdir(exist_ok=True)
    primes = load_primes(primes_path)
    for p in tqdm(primes[offset:num_p], desc="Precomputing"):
        rc_dict = precompute_p(p)
        util.save_json(p, rc_dict, store_dir)

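Note that this project's `util.save_json` is called as `(key, data, directory)` rather than `(path, data)`; the following is only a guess at a compatible helper, inferred from this single call site.

import json
import os


def save_json(p, rc_dict, store_dir):
    # Hypothetical helper matching util.save_json(p, rc_dict, store_dir):
    # store the residue dictionary for prime p under <store_dir>/<p>.json.
    with open(os.path.join(store_dir, '{}.json'.format(p)), 'w') as f:
        json.dump(rc_dict, f)
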
def merge_result(res1, res2):
    results = []
    unk_counts = []
    batch_vqa_scores = []
    for quest_id in res1:
        items1 = res1[quest_id]
        items2 = res2[quest_id]
        res_i, scores_i, n_i = _merge_item(items1, items2)
        results += res_i
        batch_vqa_scores.append(scores_i.mean())
        unk_counts.append(n_i)
    # save results
    res_file = 'result/bs_gen_%s.json' % 'vae_ia_merge'
    score_file = 'result/bs_vqa_scores_%s.mat' % 'vae_ia_merge'
    save_json(res_file, results)
    batch_vqa_scores = np.array(batch_vqa_scores, dtype=np.float32)
    mean_vqa_score = batch_vqa_scores.mean()
    mean_unk_count = np.mean(unk_counts)
    savemat(score_file, {'scores': batch_vqa_scores, 'mean_score': mean_vqa_score})
    print('BS mean VQA score: %0.3f' % mean_vqa_score)
    print('BS mean #questions: %0.3f' % mean_unk_count)
    return res_file, mean_vqa_score, mean_unk_count

def process():
    # load data
    blacklist = make_blacklist()
    save_json('data/kptest_blacklist.json', blacklist)
    qrpe = load_qrpe_data(blacklist)
    vtfp = load_vtfp_data(blacklist)
    # import pdb
    # pdb.set_trace()
    meta = qrpe + vtfp
    # process data
    images, image_ids, questions = [], [], []
    encoder = SentenceEncoder()
    for item in meta:
        image_id = item['image_id']
        image = item['image']
        tokens = encoder.encode_sentence(item['question'])
        images.append(image)
        image_ids.append(image_id)
        questions.append(tokens)
    # put to array
    from post_process_variation_questions import put_to_array
    arr, arr_len = put_to_array(questions)
    save_json('data/QRD_irrelevant_meta.json', {'images': images, 'image_ids': image_ids})
    image_ids = np.array(image_ids, dtype=np.int32)
    save_hdf5('data/QRD_irrelevant_data.data',
              {'image_ids': image_ids, 'quest': arr, 'quest_len': arr_len})

def process_test():
    from util import save_hdf5, save_json
    # load data
    meta = load_bsir_dataset()
    # process data
    labels, images, image_ids, questions = [], [], [], []
    encoder = SentenceEncoder()
    for item in meta:
        image_id = item['image_id']
        image = item['image']
        tokens = encoder.encode_sentence(item['question'])
        images.append(image)
        image_ids.append(image_id)
        questions.append(tokens)
        labels.append(item['label'])
    # put to array
    from post_process_variation_questions import put_to_array
    arr, arr_len = put_to_array(questions)
    save_json('data/QRD_irrelevant_meta_test.json', {'images': images, 'image_ids': image_ids})
    image_ids = np.array(image_ids, dtype=np.int32)
    labels = np.array(labels, dtype=np.float32)
    # import pdb
    # pdb.set_trace()
    save_hdf5('data/QRD_irrelevant_data_test.data',
              {'image_ids': image_ids, 'quest': arr, 'quest_len': arr_len, 'labels': labels})

def _encode_w2v(images, encoder, subset):
    quest_coding = []
    cands_coding = []
    labels = []
    quest_ids = []
    cands_meta = []
    for i, info in enumerate(images):
        if not i % 1000:
            tf.logging.info("%s: processed %d of %d items." % (subset.upper(), i, len(images)))
        quest_id = info.question_id
        q_w2v = encoder.encode(info.question)
        ca_w2v, label = _encode_answer_candidates(info, encoder)
        # pdb.set_trace()
        quest_coding.append(q_w2v)
        cands_coding.append(ca_w2v)
        labels.append(label)
        quest_ids.append(quest_id)
        _m = {'quest_id': quest_id, 'cands': info.choices}
        cands_meta.append(_m)
    # ready to pack data
    quest_coding = np.concatenate(quest_coding, axis=0).astype(np.float32)
    cands_coding = np.concatenate(cands_coding, axis=0).astype(np.float32)
    labels = np.array(labels, dtype=np.int32)
    quest_ids = np.array(quest_ids, dtype=np.int32)
    save_hdf5('data3/vqa_mc_w2v_coding_%s.data' % subset,
              {'quest_w2v': quest_coding, 'cands_w2v': cands_coding,
               'labels': labels, 'quest_ids': quest_ids})
    save_json('data3/vqa_mc_cands_%s.meta' % subset, cands_meta)

def search_pub_and_save(uid, query, base_dir):
    data = {
        'uid': uid,
        'query': query,
        'success': True,
        'message': '',
        'result': {},
    }
    path = get_search_result_path(uid, query, base_dir)
    if os.path.isfile(path):
        data = util.load_json(path)
        if data['success'] and data['result']:
            print('skipping query "{}" with results'.format(query))
            return
    print('executing query "{}"'.format(query))
    try:
        result = search_pub(query)
        data['result'] = result
    except Exception as e:
        error_msg = str(e)
        print('ERROR for query "{}": "{}"'.format(query, error_msg))
        data['success'] = False
        data['message'] = error_msg
    util.save_json(path, data)
    print('saved query "{}" to {}'.format(query, path))
    # back off more aggressively when queries come back empty
    global sleep_time
    if not data['result']:
        sleep_time *= 1.1
    print('sleeping for {:.2f} seconds'.format(sleep_time))
    time.sleep(sleep_time)
    return data

def parse_raw_refs():
    refs = util.load_json(cfg.paths['raw-papers-refs'])
    data = util.parallelize(_parse_raw_refs, list(refs.values()), N_THREADS)
    refs = {k: v for k, v in zip(refs.keys(), data)}
    util.save_json(cfg.paths['papers-refs'], refs)
    print('saved papers refs to "{}"'.format(cfg.paths['papers-refs']))

def process():
    cands = load_results()
    model = N2MNWrapper()
    mc_ctx = MultiChoiceQuestionManger(subset='val')
    results = {}
    t = time()
    for i, res_key in enumerate(cands):
        if i % 100 == 0:
            avg_time = (time() - t) / 100.
            print('%d/%d (%0.2f sec/sample)' % (i, len(cands), avg_time))
            t = time()
        res_i = cands[res_key]
        image_id = res_i['image_id']
        question = res_i['target']
        question_id = res_i['question_id']
        gt_answer = mc_ctx.get_gt_answer(question_id)
        pred_answers, scores = model.inference(image_id, [question])
        sc = float(scores[0])
        pred_ans = pred_answers[0]
        is_valid = compare_answer(pred_ans, gt_answer)
        # if not is_valid:
        #     continue
        results[res_key] = {
            'pred_answer': pred_ans,
            'pred_score': sc,
            'gt_answer': gt_answer,
            'is_valid': is_valid
        }
    save_json('result/n2mn_scores_final_v2.json', results)

def ivqa_decoding_beam_search(checkpoint_path=None, subset='kpval'):
    model_config = ModelConfig()
    res_file = 'result/quest_vaq_greedy_%s_%s.json' % (FLAGS.model_type.upper(), subset)
    # Get model
    model_fn = get_model_creation_fn(FLAGS.model_type)
    create_fn = create_reader(FLAGS.model_type, phase='test')
    # Create the vocabulary.
    to_sentence = SentenceGenerator(trainset='trainval')
    # get data reader
    reader = create_fn(batch_size=100, subset=subset, version=FLAGS.test_version)
    if checkpoint_path is None:
        ckpt_dir = FLAGS.checkpoint_dir % (FLAGS.version, FLAGS.model_type)
        # ckpt_dir = '/import/vision-ephemeral/fl302/models/v2_kpvaq_VAQ-RL/'
        ckpt = tf.train.get_checkpoint_state(ckpt_dir)
        checkpoint_path = ckpt.model_checkpoint_path
    # Build model
    g = tf.Graph()
    with g.as_default():
        # Build the model.
        model = model_fn(model_config, 'beam')
        model.build()
        # Restore from checkpoint
        restorer = Restorer(g)
        sess = tf.Session()
        restorer.restore(sess, checkpoint_path)
    num_batches = reader.num_batches
    print('Running beam search inference...')
    results = []
    for i in range(num_batches):
        outputs = reader.get_test_batch()
        # inference
        quest_ids, image_ids = outputs[-2:]
        scores, pathes = model.greedy_inference(outputs[:-2], sess)
        scores, pathes = post_process_prediction(scores, pathes)
        question = to_sentence.index_to_question(pathes[0])
        print('%d/%d: %s' % (i, num_batches, question))
        for quest_id, image_id, path in zip(quest_ids, image_ids, pathes):
            sentence = to_sentence.index_to_question(path)
            res_i = {'image_id': int(image_id),
                     'question_id': int(quest_id),
                     'question': sentence}
            results.append(res_i)
    save_json(res_file, results)
    return res_file

def save_data(self):
    save_payload = {
        "trades": self.swaps,
        "locks": self.locks,
        "history": self.history,
        "addresses": self.addresses
    }
    save_json(self.get_path(), save_payload)

def save(self):
    if self.num_call % self.save_interval == 0:
        print('Saving VQA replay buffers')
        t = time()
        sv_file = os.path.join(self.sv_dir, 'vqa_replay.json')
        save_json(sv_file, {'num_call': self.num_call, 'memory': self.memory})
        print('File %s saved to disk, total time: %0.2fs' % (sv_file, time() - t))

def process(method, inf_type='rand'):
    if inf_type == 'rand':
        res_file = 'result/tmp_bs_RL2_final_%s.json' % method
    else:
        res_file = 'result/tmp_bs_RL2_final_%s_BEAM.json' % method
    if os.path.exists(res_file):
        print('File %s already exists, skipping' % res_file)
        return
    # cands = load_results()
    model = _TYPE2Model[method]()
    mc_ctx = MultiChoiceQuestionManger(subset='val')
    task_data = load_lm_outputs(method, inf_type)
    belief_sets = {}
    t = time()
    num = len(task_data)
    for i, ans_key in enumerate(task_data.keys()):
        # time it
        avg_time = (time() - t)
        print('%d/%d (%0.2f sec/sample)' % (i, num, avg_time))
        t = time()
        # extract basic info
        cands = task_data[ans_key]
        quest_id = cands[0]['question_id']
        # gt_answer = mc_ctx.get_gt_answer(quest_id)
        image_id = mc_ctx.get_image_id(quest_id)
        image = mc_ctx.get_image_file(quest_id)
        # process
        gt_question = mc_ctx.get_question(quest_id)
        i_scores, i_questions = [], []
        for item in cands:
            target = item['question']
            pred_ans, vqa_score = model.get_score(image_id, target)
            # sanity check on the answer
            is_valid = compare_answer(pred_ans, ans_key)
            if not is_valid:
                continue
            i_questions.append(target)
            i_scores.append([float(vqa_score), item['score']])
        print('%d/%d' % (len(i_questions), len(cands)))
        bs_i = {
            'image': image,
            'image_id': image_id,
            'question': gt_question,
            'answer': ans_key,
            'belief_sets': i_questions,
            'belief_strength': i_scores
        }
        belief_sets[ans_key] = bs_i
    save_json(res_file, belief_sets)

def vaq_decoding_greedy(checkpoint_path=None, subset='kpval'):
    model_config = ModelConfig()
    res_file = 'result/quest_vaq_greedy_%s.json' % FLAGS.model_type.upper()
    # Get model
    model_fn = get_model_creation_fn(FLAGS.model_type)
    create_fn = create_reader(FLAGS.model_type, phase='test')
    # Create the vocabulary.
    to_sentence = SentenceGenerator(trainset='trainval')
    # build data reader
    reader = create_fn(batch_size=32, subset=subset)
    if checkpoint_path is None:
        ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir % FLAGS.model_type)
        checkpoint_path = ckpt.model_checkpoint_path
    g = tf.Graph()
    with g.as_default():
        # Build the model.
        model = model_fn(model_config, 'greedy')
        model.build()
        saver = tf.train.Saver()
        sess = tf.Session()
        tf.logging.info('Restore from model %s' % os.path.basename(checkpoint_path))
        saver.restore(sess, checkpoint_path)
    num_batches = reader.num_batches
    print('Running greedy inference...')
    results = []
    for i in range(num_batches):
        outputs = reader.get_test_batch()
        # inference
        quest_ids, image_ids = outputs[-2:]
        scores, pathes = model.greedy_inference(outputs[:-2], sess)
        scores, pathes = post_process_prediction(scores, pathes)
        question = to_sentence.index_to_question(pathes[0])
        print('%d/%d: %s' % (i, num_batches, question))
        for quest_id, image_id, path in zip(quest_ids, image_ids, pathes):
            sentence = to_sentence.index_to_question(path)
            res_i = {'image_id': int(image_id),
                     'question_id': int(quest_id),
                     'question': sentence}
            results.append(res_i)
    save_json(res_file, results)
    return res_file

def make_dummpy_result_file(pop_q):
    from util import load_json, save_json
    results = load_json('result/quest_vaq_greedy_VAQ-SAT_kptest.json')
    for item in results:
        item['image_id'] = int(item['image_id'])
        item['question_id'] = int(item['question_id'])
        item['question'] = pop_q
    sv_file = 'result/tmp_pop_q_kptest.json'
    save_json(sv_file, results)
    return sv_file

def save_framesummary(self, frame):
    data = {}
    data['id'] = frame.id
    data['name'] = frame.name
    data['topic_ids'] = frame.topic_ids
    data['bins'] = [(id, bin_name_from_id(id)) for id in frame.bin_ids]
    data['counts'] = frame.counts
    data['heatmapGrid'] = frame.heatmap.grid
    data['heatmapWeights'] = self.get_frame_heatmap_weights(frame)
    file_path = join(self.target_dir, 'framesummary', str(frame.id))
    save_json(file_path, data, indent=0)

def save_topicframe(self, frame, topic_frame):
    data = {}
    data['topicId'] = topic_frame.topic_id
    data['frameId'] = topic_frame.frame_id
    data['tokenList'] = topic_frame.token_list
    data['tokenWeights'] = self.get_token_weights(frame, topic_frame)
    data['heatmapWeights'] = self.get_topicframe_heatmap_weights(frame, topic_frame)
    data['counts'] = topic_frame.counts
    file_path = join(self.target_dir, 'topicframe',
                     str(topic_frame.frame_id), str(topic_frame.topic_id))
    save_json(file_path, data, indent=0)

def get_authors():
    files = util.listdir(AUTHOR_FOLDER)
    util.mkdir(AUTHOR_CRALWED_FOLDER)
    for file_name in files:
        save_path = os.path.join(AUTHOR_CRALWED_FOLDER, file_name)
        if util.exists(save_path):
            continue
        data = util.load_json(os.path.join(AUTHOR_FOLDER, file_name))
        html = util.get_page(data['url'])
        full_name = get_full_name(html)
        data['name'] = full_name
        print(data['short'], full_name)
        data['links'] = get_links(data['short'], html)
        util.save_json(save_path, data)

def download_missing_paper_pdfs():
    if not os.path.isdir(cfg.paths['pdfs-dir']):
        os.makedirs(cfg.paths['pdfs-dir'])
    metas = util.load_json(cfg.paths['papers-metadata'])
    metas = util.parallelize(download_paper_pdf_if_needed, metas, N_THREADS)
    util.save_json(cfg.paths['papers-metadata'], metas)
    print('\n----')
    print('saved updated papers metadata to "{}"'.format(cfg.paths['papers-metadata']))
    n = sum(int(os.path.isfile(m['pdf-path'])) for m in metas)
    print('{}/{} ({:.3f}%) items with pdfs'.format(n, len(metas), 100 * n / len(metas)))

def build_question_set():
    sv_file = 'data/kprestval_pos_tags.json'
    st = StanfordPOSTagger('english-bidirectional-distsim.tagger')
    meta = load_and_process_metadata('val')
    images = split_data_by_seed(meta, 'kprestval')
    num = len(images)
    pos_tags_dict = {}
    for i, info in enumerate(images):
        question_id = info.question_id
        question = info.question.lower()
        _pos_tags = st.tag(word_tokenize(question))
        pos_tags_dict[question_id] = _pos_tags
        print('\nPOS TAGGER: %d/%d' % (i, num))
        print(_pos_tags)
    save_json(sv_file, {'pos_tags': pos_tags_dict})

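`StanfordPOSTagger` here is presumably NLTK's wrapper around the Stanford tagger; below is a minimal setup sketch under that assumption, with placeholder paths (the Stanford jar and model must be downloaded separately and are not part of NLTK itself).

# Assumed setup for the tagger used above; paths are placeholders.
from nltk.tag import StanfordPOSTagger
from nltk.tokenize import word_tokenize

st = StanfordPOSTagger('english-bidirectional-distsim.tagger',
                       path_to_jar='/path/to/stanford-postagger.jar')
print(st.tag(word_tokenize('what color is the cat')))
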
def convert_vqa_annotations():
    path_src = '../../data/VQA/Annotations/mscoco_val2014_annotations.json'
    d = load_json(path_src)
    anno = {}
    for info in d['annotations']:
        quest_id = int(info['question_id'])
        tmp = {}
        for _a in info['answers']:
            a = _a['answer']
            if a in tmp:
                tmp[a] += 1.0
            else:
                tmp[a] = 1.0
        anno[quest_id] = tmp
    save_json(_ANNO_FILE, anno)

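The inner tally loop is a plain vote count per answer string; an equivalent sketch using `collections.Counter` (it reuses `load_json`, `save_json`, and `_ANNO_FILE` from the snippet above, and produces int counts where the original stores floats):

from collections import Counter


def convert_vqa_annotations_counter():
    # Behaviorally equivalent rewrite of the loop above, except counts
    # come out as ints; cast to float if downstream code expects floats.
    d = load_json('../../data/VQA/Annotations/mscoco_val2014_annotations.json')
    anno = {int(info['question_id']):
                dict(Counter(_a['answer'] for _a in info['answers']))
            for info in d['annotations']}
    save_json(_ANNO_FILE, anno)
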
def get_conferences():
    files = util.listdir(CONFERENCE_FOLDER)
    util.mkdir(CONFERENCE_CRALWED_FOLDER)
    cnt = 0
    conf = util.load_json('conf_name.json')
    for file_name in files:
        save_path = os.path.join(CONFERENCE_CRALWED_FOLDER, file_name)
        if util.exists(save_path):
            continue
        data = util.load_json(os.path.join(CONFERENCE_FOLDER, file_name))
        if data['short'] not in conf.keys():
            continue
        html = util.get_page(data['url'])
        subs = get_subs(data['short'], html)
        data['name'] = conf[data['short']]
        data['sub'] = {}
        if len(subs) == 0:
            data['sub']['#'] = get_publications(html)
        util.save_json(save_path, data)
        cnt += 1

def get_journals():
    files = util.listdir(JOURNAL_FOLDER)
    util.mkdir(JOURNAL_CRALWED_FOLDER)
    cnt = 0
    jour = util.load_json('jour_name.json')
    for file_name in files:
        save_path = os.path.join(JOURNAL_CRALWED_FOLDER, file_name)
        if util.exists(save_path):
            continue
        data = util.load_json(os.path.join(JOURNAL_FOLDER, file_name))
        if data['short'] not in jour.keys():
            continue
        html = util.get_page(data['url'])
        subs = get_subs(data['short'], html)
        data['name'] = jour[data['short']]
        data['sub'] = {}
        if len(subs) == 0:
            data['sub']['#'] = get_publications(html)
        util.save_json(save_path, data)
        cnt += 1
        print(cnt, len(files), data['short'])

def get_conferences():
    pos, cnt = 1, 0
    util.mkdir(CONFERENCE_FOLDER)
    while True:
        html = util.get_page(CONFERENCE_URL + str(pos))
        links = util.find_conferences(html)
        once_cnt = 0
        for link in links:
            if link[0] == '' or '?' in link[0]:
                continue
            data = {}
            data['type'] = 'conference'
            data['short'] = link[0]
            data['name'] = link[1]
            data['url'] = 'http://dblp.uni-trier.de/db/conf/' + data['short']
            util.save_json(os.path.join(CONFERENCE_FOLDER, util.hex_hash(data['short'])), data)
            cnt += 1
            once_cnt += 1
        if once_cnt == 0:
            break
        pos += 100
    print('Conference', cnt)

def get_authors():
    pos, cnt = 545504, 0
    util.mkdir(AUTHOR_FOLDER)
    while True:
        html = util.get_page(AUTHOR_URL + str(pos))
        links = util.find_authors(html)
        once_cnt = 0
        for link in links:
            if link[0] == '' or '?' in link[0]:
                continue
            data = {}
            data['type'] = 'author'
            data['short'] = link[0]
            data['name'] = link[1]
            data['url'] = 'http://dblp.uni-trier.de/pers/hd/a/' + data['short']
            util.save_json(os.path.join(AUTHOR_FOLDER, util.hex_hash(data['short'])), data)
            cnt += 1
            once_cnt += 1
        if once_cnt == 0:
            break
        pos += 300
    print('Author', pos, cnt)

def get_journals():
    files = util.listdir(JOURNAL_FOLDER)
    util.mkdir(JOURNAL_CRALWED_FOLDER)
    cnt = 0
    jour = util.load_json('jour_name.json')
    for file_name in files:
        save_path = os.path.join(JOURNAL_CRALWED_FOLDER, file_name)
        if util.exists(save_path):
            continue
        data = util.load_json(os.path.join(JOURNAL_FOLDER, file_name))
        if data['short'] not in jour.keys():
            continue
        html = util.get_page(data['url'])
        subs = get_subs(data['short'], html)
        data['name'] = jour[data['short']]
        data['sub'] = {}
        for sub in subs:
            html = util.get_page('http://dblp.uni-trier.de/db/journals/' + sub)
            data['sub'][sub] = {}
            data['sub'][sub]['pub'] = get_publications(html)
            data['sub'][sub]['name'] = jour[sub]
        cnt += 1
        print(cnt, len(files), data['short'])
        util.save_json(save_path, data)

        # (tail of build_req(config, ips); the beginning of the function is not shown)
        ip_choices.remove(ip)
        nodeStr = lb_node % (ip, node_port)
        nodesStr += "%s\n " % nodeStr
        if health_mon == "true":
            hm = hm_req % (health_mon_type)
        else:
            hm = ""
        lbStr = lb_post % (lb_name + "%i" % i, lb_port, lb_proto, vip_type, nodesStr, hm)
        reqs.append(lbStr + "\n")
    return reqs


if __name__ == "__main__":
    prog = sys.argv[0]
    # note: len(sys.argv) is always >= 1, so this guard never fires
    if len(sys.argv) < 1:
        usage(prog)
        sys.exit()
    ips = getIps("server_list.json")
    request_list = []
    configs = util.load_json("reqConfig.json")
    for config in configs:
        reqs = build_req(config, ips)
        for req in reqs:
            request_list.append(req)
    random.shuffle(request_list)
    util.save_json("requests.json", request_list)

JOURNAL_CRALWED_FOLDER = os.path.join('link', 'journal')
CONFERENCE_CRALWED_FOLDER = os.path.join('link', 'conference')

merged_data = {}
merged_data['jour'] = {}
merged_data['conf'] = {}

cnt = 0
files = util.listdir(JOURNAL_CRALWED_FOLDER)
for file_name in files:
    data = util.load_json(os.path.join(JOURNAL_CRALWED_FOLDER, file_name))
    short = data['short']
    del data['short']
    merged_data['jour'][short] = data
    cnt += 1
    print(cnt, len(files))

cnt = 0
files = util.listdir(CONFERENCE_CRALWED_FOLDER)
for file_name in files:
    data = util.load_json(os.path.join(CONFERENCE_CRALWED_FOLDER, file_name))
    short = data['short']
    del data['short']
    merged_data['conf'][short] = data
    cnt += 1
    print(cnt, len(files))

util.save_json('merged.json', merged_data)