def parse_blog(path):
    # Skip blogs that have already been crawled.
    if is_file_exist(path + '/linked_papers.json'):
        return
    # Nothing to do if the URL list is missing.
    if not is_file_exist(path + '/urls.json'):
        return
    with open(path + '/urls.json') as f:
        urls = json.load(f)
    output = []
    dir_path = path + '/linked_papers.json'
    # Crawl every URL and collect the paper links it contains.
    for url in urls:
        output.append(crawl_paper_links(url))
    write_to_json_file(dir_path, output)
def count_urls(file):
    if is_file_exist(file):
        with open(file) as f:
            urls = json.load(f)
        global TOTAL_COUNT, UNIQUE_COUNT, MAP
        TOTAL_COUNT += len(urls)
        for url in urls:
            # MAP is used as a hash set; the stored value 1 is meaningless.
            # Only count a URL as unique the first time it is seen.
            if url not in MAP:
                MAP[url] = 1
                UNIQUE_COUNT += 1
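# count_urls relies on module-level names (TOTAL_COUNT, UNIQUE_COUNT, MAP) and an
# is_file_exist helper that are not shown in this excerpt. The sketch below is a
# guess at how that state might be initialised and driven; the crawled_blogs
# directory layout and the is_file_exist implementation are assumptions, not the
# repo's actual code.
import glob
import json
import os

# Assumed module-level state (not shown in this excerpt).
TOTAL_COUNT = 0
UNIQUE_COUNT = 0
MAP = {}  # used as a set: url -> placeholder value

def is_file_exist(path):
    # Assumed helper; the real implementation may differ.
    return os.path.isfile(path)

if __name__ == '__main__':
    # Hypothetical driver: count every urls.json under a crawl output directory.
    for url_file in glob.glob('crawled_blogs/*/urls.json'):
        count_urls(url_file)
    print(f'total={TOTAL_COUNT}, unique={UNIQUE_COUNT}')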
def main(argv):
    # Command line arguments.
    opts, args = getopt.getopt(argv, "hi:s:", ["ifile=", "start="])
    # Input file path.
    inputfile = None
    # Default value for the start index.
    start = 0
    for opt, arg in opts:
        if opt == '-h':
            print('insertion.py -i <input_file> -s <start_idx>')
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-s", "--start"):
            try:
                start = int(arg)
            except ValueError:
                print("Parameter start should be an int!")
                sys.exit()
    if inputfile is not None and utils.is_file_exist(inputfile):
        # Initialise the worker class.
        w = thread.init_worker(concurrent=config.concurrent, timeout=config.timeout)
        with open(inputfile, 'r', buffering=config.read_buffer) as f:
            data = []
            idx, c = 0, 0
            for l in f:
                if idx >= start:
                    try:
                        dpoint = json.loads(l)
                    except json.JSONDecodeError:
                        # Skip lines that are not valid JSON.
                        continue
                    c += 1
                    data.append(dpoint)
                    # Flush a full partition to the worker.
                    if c == config.partition:
                        print(f"insert_{idx - config.partition}-{idx}")
                        thread.run_worker(
                            w, f"insert_{idx - config.partition}-{idx}", data, parse)
                        # Clear the buffer for the next partition.
                        c = 0
                        data = []
                idx += 1
            # Flush any remaining records.
            if len(data):
                print(f"insert_{idx - len(data)}-{idx}")
                thread.run_worker(w, f"insert_{idx - len(data)}-{idx}", data, parse)
    else:
        print("[WARNING] File does not exist or was not specified!")
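# main() parses its own argv slice, so the module presumably ends with a standard
# entry point. A minimal sketch, assuming the file is the insertion.py named in the
# -h help text; the papers.jsonl file name in the example invocation is
# illustrative, not a file from the repo.
import sys

if __name__ == '__main__':
    # Pass everything after the script name to main().
    main(sys.argv[1:])

# Example invocation (illustrative file name):
#   python insertion.py -i papers.jsonl -s 1000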
def __init__(self, data_dir, dataset, train=True, cv_pretrained=True, transform=None,
             size=(224, 224), top_k=0, multi_label=False, tokenizer=None, text_max=14):
    self.dataset = dataset
    self.mode = 'train' if train else 'val'
    self.cv_pretrained = cv_pretrained
    self.transform = transform
    self.multi_label = multi_label
    self.text_max = text_max
    self.data_file = os.path.join(data_dir, dataset, f'data_dict_{top_k}_{multi_label}.pkl')
    self.question_file = os.path.join(
        data_dir, dataset, f'questions_{self.mode}_{top_k}_{multi_label}.h5')
    if self.cv_pretrained:
        self.image_dir = os.path.join(
            data_dir, dataset, f'images_{self.mode}_{str(size[0])}.h5')
        self.idx_dict_file = os.path.join(data_dir, dataset, 'idx_dict.pkl')
    else:
        if dataset == 'clevr' or dataset == 'sample':
            self.image_dir = os.path.join(data_dir, dataset, 'images', f'{self.mode}')
        elif dataset == 'vqa2':
            self.image_dir = os.path.join(data_dir, dataset, f'{self.mode}2014')
    if not is_file_exist(self.question_file):
        make_questions(data_dir, dataset, top_k, multi_label, tokenizer)
    if cv_pretrained:
        if not is_file_exist(self.image_dir):
            make_images(data_dir, dataset, size)
    self.load_data()
def count_linked_papers(file):
    if is_file_exist(file):
        with open(file) as f:
            try:
                data = json.load(f)
                global TOTAL_COUNT, UNIQUE_COUNT, MAP
                for each in data:
                    TOTAL_COUNT += len(each['papers'])
                    # Only count papers from a blog URL the first time it is seen.
                    if each['url'] not in MAP:
                        MAP[each['url']] = len(each['papers'])
                        UNIQUE_COUNT += len(each['papers'])
            except (json.JSONDecodeError, KeyError):
                print('except')
def __init__(self, data_dir, dataset, train=True, transform=None, size=(224, 224),
             object_size=14, cv_pretrained=True, top_k=0, multi_label=False,
             q_tokenizer='none', a_tokenizer='none', question_inverse=False,
             text_max=14, te_bert=False):
    self.dataset = dataset
    self.mode = 'train' if train else 'val'
    self.transform = transform
    self.cv_pretrained = cv_pretrained
    self.top_k = top_k
    self.multi_label = multi_label
    self.label = 'multi-label' if multi_label else 'uni-label'
    self.q_tokenizer = q_tokenizer
    self.a_tokenizer = a_tokenizer
    self.question_inverse = question_inverse
    self.text_max = text_max
    if not te_bert:
        self.qa_file = os.path.join(data_dir, dataset, f'qa_sets_{dataset}_{self.mode}.h5')
        if not is_file_exist(self.qa_file):
            make_questions(data_dir, dataset)
    else:
        self.qa_file = os.path.join(data_dir, dataset, f'qa_sets_{dataset}_{self.mode}_bert.h5')
        if not is_file_exist(self.qa_file):
            make_bert(data_dir, dataset)
    if cv_pretrained:
        self.image_dir = os.path.join(
            data_dir, dataset, f'images_{self.mode}_{str(size[0])}_{object_size}.h5')
        if not is_file_exist(self.image_dir):
            make_images(data_dir, dataset, size)
        # self.idx_dict_file = os.path.join(data_dir, dataset, 'idx_dict.pkl')
    else:
        if dataset == 'vqa2':
            self.image_dir = os.path.join(data_dir, dataset, f'{self.mode}2014')
        elif dataset == 'clevr' or dataset == 'clevr-humans':
            self.image_dir = os.path.join(
                data_dir, 'clevr', f'images_{self.mode}_{str(size[0])}_raw.h5')
            if not is_file_exist(self.image_dir):
                make_images(data_dir, dataset, size)
    self.load_data()
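# The transform/size arguments and the .h5 image caches suggest this __init__
# belongs to a PyTorch-style dataset. A minimal usage sketch, assuming the
# surrounding class subclasses torch.utils.data.Dataset; the VQADataset name, the
# data root, and the batch size are placeholders, not the repo's actual values.
from torch.utils.data import DataLoader

# Hypothetical construction; class name and paths are placeholders.
dataset = VQADataset('data', 'clevr', train=True, cv_pretrained=True,
                     top_k=0, multi_label=False)
loader = DataLoader(dataset, batch_size=64, shuffle=True, num_workers=4)

for batch in loader:
    # The structure of each batch depends on what load_data() / __getitem__ return.
    break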
def count_linked_arxiv_papers(file):
    if is_file_exist(file):
        with open(file) as f:
            try:
                data = json.load(f)
                global TOTAL_COUNT, UNIQUE_COUNT, MAP
                for each in data:
                    # Count only links that point to arXiv.
                    arxiv_count = 0
                    for paper in each['papers']:
                        if 'arxiv' in paper:
                            arxiv_count += 1
                    TOTAL_COUNT += arxiv_count
                    # Only count papers from a blog URL the first time it is seen.
                    if each['url'] not in MAP:
                        MAP[each['url']] = arxiv_count
                        UNIQUE_COUNT += arxiv_count
            except (json.JSONDecodeError, KeyError):
                print('except')
def __init__(self, writer, args, batch_record_idx=0):
    self.writer = writer
    self.args = args
    self.timestamp = args.timestamp
    self.idx_to_question_type = args.idx_to_question_type
    self.idx_to_word = args.idx_to_word
    self.answer_idx_to_word = args.answer_idx_to_word
    self.qt_size = args.qt_size
    self.multi_label = args.multi_label
    self.batch_record_idx = batch_record_idx
    self.csv_file = os.path.join(args.log_directory, args.project, f"{args.project}_log.csv")
    self.rolling_average = 5
    self.logs = defaultdict(lambda: deque(maxlen=self.rolling_average))
    self.epoch_idx = None
    self.mode = None
    self.batch_num = 0
    self.dataset_size = 0
    self.epoch_loss = 0
    self.epoch_correct = 0
    self.per_question = None
    self.per_question_type = None
    self.epoch_start_time = 0
    self.epoch_end_time = 0
    self.epoch_time = 0
    self.batch_loss = 0
    self.batch_correct = 0
    self.batch_start_time = 0
    self.batch_end_time = 0
    self.batch_time = 0
    self.per_question_log = dict()
    self.exclude = [
        'data_directory', 'log_directory', 'data_config', 'config', 'log',
        'word_to_idx', 'idx_to_word', 'answer_word_to_idx', 'answer_idx_to_word',
        'question_type_to_idx', 'idx_to_question_type', 'q_size', 'a_size', 'qt_size'
    ]
    self.header = self.make_header()
    if not is_file_exist(self.csv_file):
        self.make_csv()
    if not self.is_record_exist():
        self.make_record()