def __init__(self, start_ratio=0.0, end_ratio=0.9, sample_ratio=1.0):
    # load data from wmt_news
    zh_en_dict = load_json(filtered_union_zh_en_dict_path)
    en_zh_dict = load_json(filtered_union_en_zh_dict_path)

    data = []
    for zh, val in zh_en_dict.items():
        if not val or 'translation' not in val or not val['translation']:
            continue
        for en in val['translation']:
            data.append([zh, en])
            data.append([en, zh])

    for en, val in en_zh_dict.items():
        if not val or 'translation' not in val or not val['translation']:
            continue
        for zh in val['translation']:
            data.append([zh, en])
            data.append([en, zh])

    # TODO remove duplicate

    # reproduce the process that nmt would go through in order to get its train set;
    # shuffle the data
    random.seed(self.RANDOM_STATE)
    random.shuffle(data)

    # sample data
    data = self.sample_data(data, sample_ratio)

    # split dataset
    data = self.__split_data(data, start_ratio, end_ratio)
    self.__src_data, self.__tar_data = list(zip(*data))
def loader():
    if not Indexer._indexer:
        Indexer._indexer = load_binary(Indexer.index_path)
    if not Indexer._download:
        Indexer._download = load_json(Indexer.download_path)
    if not Indexer._analysis:
        Indexer._analysis = load_json(Indexer.analysis_path)
def update(self, t_weight=6, d_weight=3, c_weight=1, forced=False):
    indexer = load_binary(self.index_path)
    indexer_readable = load_json(self.index_readable_path)
    download = load_json(self.download_path)
    analysis = load_json(self.analysis_path)

    total_count = Counter()
    id_noun_count = {}

    for video_id, content in download.items():
        title = analysis[video_id]['nouns']['title'].split() * t_weight
        description = analysis[video_id]['nouns']['description'].split() * d_weight
        caption = analysis[video_id]['nouns']['caption'].split() * c_weight

        state = content.get('state')
        if state == 'update' or forced:
            self._delete_item(indexer, video_id)
            self._add_item(indexer, video_id, title + description + caption)
            download[video_id]['state'] = 'complete'
        elif state == 'new':
            self._add_item(indexer, video_id, title + description + caption)
            download[video_id]['state'] = 'complete'

    save_binary(indexer, self.index_path)
    save_json(indexer, self.index_readable_path)
    save_json(download, self.download_path)
    self._indexer = load_binary(self.index_path)
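# Note: the snippets in this section assume a handful of small I/O helpers
# (load_json, save_json, load_binary, save_binary, and later write_json /
# load_pkl) that are defined elsewhere. The sketch below is only a guess at
# what they could look like, with names and argument orders inferred from the
# call sites in this section; it is not the original utils module.
import json
import pickle


def load_json(file_path):
    # read a JSON file into a Python object
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def save_json(obj, file_path):
    # write a Python object to a JSON file (object first, path second,
    # matching calls such as save_json(analysis, analysis_path))
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(obj, f, ensure_ascii=False, indent=2)


def write_json(file_path, obj):
    # same as save_json but path-first, matching write_json(emb_json_path, docs)
    save_json(obj, file_path)


def load_binary(file_path):
    # read a pickled object (load_pkl is assumed to behave the same way)
    with open(file_path, 'rb') as f:
        return pickle.load(f)


def save_binary(obj, file_path):
    # pickle an object to disk
    with open(file_path, 'wb') as f:
        pickle.dump(obj, f)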
def update(forced=False):
    download_path = path['download']
    analysis_path = path['analysis']
    doc2vec_src_path = path['doc2vec']['src']
    doc2vec_model_path = path['doc2vec']['model']

    download = load_json(download_path)
    analysis = load_json(analysis_path)

    model = gensim.models.doc2vec.Doc2Vec
    model = model.load(doc2vec_model_path)
    corpus = list(CorpusGensim(doc2vec_src_path))

    for video_id, content in download.items():
        state = content.get('state', 'new')
        if state == 'new' or state == 'update' or forced:
            words = analysis[video_id]['nouns']['all'].split()
            dt = get_similarity(corpus, model, words, 0, 0.1) + words
            dr = get_similarity(corpus, model, words, 0.9, 1)  # + words
            keyword = keywords(words, dt, dr).get_keywords(10, 2)
            analysis[video_id]['keywords'] = ' '.join(keyword)

    save_json(analysis, analysis_path)
def test_no_overlap(self):
    train_meta = utils.load_json(config.TRAIN_METADATA_PATH)
    valid_meta = utils.load_json(config.VAL_METADATA_PATH)
    test_meta = utils.load_json(config.TEST_METADATA_PATH)

    train_videos = list(train_meta.keys())
    valid_videos = list(valid_meta.keys())
    test_videos = list(test_meta.keys())

    self.assertTrue(set(train_videos).isdisjoint(valid_videos))
    self.assertTrue(set(train_videos).isdisjoint(test_videos))
    self.assertTrue(set(valid_videos).isdisjoint(test_videos))
def main2():
    # ./module-table-analysis.py -f /home/qinshulei/projects/huawei/githubs/test_result_dict.json -t /home/qinshulei/projects/huawei/githubs/test-definitions
    # test_dir = '/home/qinshulei/projects/huawei/githubs/test-definitions'
    # result_file = '/home/qinshulei/projects/huawei/githubs/test_result_dict.json'
    # generate_module_dict(result_json_dict, test_dir)

    # get args
    parser = argparse.ArgumentParser(prog='PROG')
    parser.add_argument('-f', '--file', required=True,
                        help='The data file path to load.')
    parser.add_argument('-t', '--testDir', required=True,
                        help='specific test case dir')
    # TODO: save result to a file
    parser.add_argument('-o', '--output_file',
                        help='allow output the result to a file')
    config = vars(parser.parse_args())

    test_dir = config.get('testDir')
    # test_result_dict.json
    result_file = config.get('file')

    result_json_dict = utils.load_json(result_file)
    # job_result_dict = result_json_dict
    module_dict = generate_module_dict(result_json_dict, test_dir)
    print()
    print_scope_result(module_dict)
def traverse_dict_and_merge(_dict_dir, _merged_dict):
    for file_name in os.listdir(_dict_dir):
        file_path = os.path.join(_dict_dir, file_name)

        print(f'\nloading dictionary from {file_path} ...')
        tmp_dict = load_json(file_path)

        print(f'merging dict {file_name} ...')
        mode = 0 if '_v_all' not in file_name else 1
        # if mode == 1:
        #     continue

        length = len(tmp_dict)
        i = 0
        for key, val in tmp_dict.items():
            if i % 50 == 0:
                progress = float(i + 1) / length * 100.
                print('\rprogress: %.2f%% ' % progress, end='')
            _merged_dict = __merge_dict(_merged_dict, key, val, mode)
            i += 1

    return _merged_dict
def build_unpaired_dataset(db_path, batch_size, training=True):
    def preprocess(fake_noise, fake_label, root, real_images, training):
        fake_noise = tf.strings.join([root, fake_noise], '/')
        fake_label = tf.strings.join([root, fake_label], '/')
        # pick a random real image path for the unpaired sample
        real_image = tf.py_function(np.random.choice, [real_images], 'string')
        real_image = tf.strings.join([root, real_image], '/')
        fake_noise = read_image(fake_noise)
        fake_label = read_image(fake_label)
        real_image = read_image(real_image)
        if training:
            fake_noise, fake_label = augument_image(fake_noise, fake_label)
        return fake_noise, real_image, fake_label

    db = load_json(db_path)
    ds = tf.data.Dataset.from_tensor_slices((db['fake_input'], db['fake_label']))
    if training:
        ds = ds.shuffle(SHUFFLE_BUFFER_SIZE)
    ds = ds.map(functools.partial(preprocess,
                                  root=db['root'],
                                  real_images=db['real_image'],
                                  training=training),
                num_parallel_calls=NUM_PARALLEL_CALLS)
    ds = ds.batch(batch_size)
    ds = ds.prefetch(PREFETCH_BUFFER_SIZE)
    return ds
def __load_test(self):
    """ load test data """
    print('\nStart loading test data ...')

    if os.path.isfile(self.__test_emb_pkl_path):
        self.__test_X, self.__test_y, _, _ = load_pkl(self.__test_emb_pkl_path)
    else:
        print('loading test doc list ...')

        # load the doc_list
        emb_json_path = os.path.join(path.PATH_TMP_DIR, 'emb_test_data.json')
        if os.path.isfile(emb_json_path):
            docs = load_json(emb_json_path)
        else:
            path_list = self.__get_path_list('test')
            docs = self.__load_docs(path_list, emb_json_path)

        print('converting test docs to trainable test data format ...')

        # convert the doc list to trainable data format
        self.__test_X, self.__test_y = self.__convert(docs, self.__test_emb_pkl_path)

    print('Finish loading test data')
def __load_from_dict():
    # load data from files
    # zh_en_dict = load_json(filtered_pos_union_en_zh_dict_path)
    zh_en_dict = load_json(filtered_pos_union_zh_en_dict_path)

    zh_en_list = list(
        filter(lambda x: 'translation' in x[1] and x[1]['translation'],
               zh_en_dict.items()))
    zh_en_list = list(
        map(lambda x: [[x[0]] * len(x[1]['translation']), x[1]['translation']],
            zh_en_list))
    # data = reduce(lambda x, y: [x[0] + y[0], x[1] + y[1]], zh_en_list)

    zh_data = []
    en_data = []
    length = len(zh_en_list)
    for i, val in enumerate(zh_en_list):
        if i % 50 == 0:
            progress = float(i + 1) / length * 100.
            print('\rprogress: %.2f%% ' % progress, end='')
        zh_data += val[0]
        en_data += val[1]

    return list(zip(zh_data, en_data))
def get_board_type(directory, filename):
    strinfo = re.compile('.txt')
    json_name = strinfo.sub('.json', filename)
    test_info = utils.load_json(os.path.join(directory, json_name))
    if 'board' in test_info.keys():
        # for dummy-ssh board
        board_type = ''
        try:
            if re.search('ssh', test_info['board_instance']):
                board_type = test_info['board_instance'].split('_')[0]
            else:
                board_verify = test_info['board'].split(',')[0]
                for key in device_map.keys():
                    if device_map[key][0] == board_verify:
                        board_type = key
                        break
                else:
                    board_type = ''
        except KeyError:
            try:
                board_verify = test_info['board'].split(',')[0]
            except:
                board_verify = test_info['board']
            for key in device_map.keys():
                if device_map[key][0] == board_verify:
                    board_type = key
                    break
            else:
                board_type = ''
        return board_type
    return ''
def __init__(self, file_path):
    self.file_path = file_path
    self._data = utils.load_json(file_path)
    self._data["save_directory"] = os.path.normpath(self._data["save_directory"])
    self._data["users"] = list(dict.fromkeys(self._data["users"]))
    self._data["bookmarks"] = list(dict.fromkeys(self._data["bookmarks"]))
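# A hedged example of the configuration file the constructor above expects.
# Only the keys it reads ('save_directory', 'users', 'bookmarks') are grounded
# in the snippet; the values and the file name are made up for illustration.
#
# config.json:
#   {
#       "save_directory": "downloads/",
#       "users": ["12345", "67890", "12345"],
#       "bookmarks": ["111", "222"]
#   }
#
# The duplicated "12345" entry would be collapsed by list(dict.fromkeys(...)),
# which de-duplicates while preserving the original order.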
def __init__(self,
             start_ratio=0.0,
             end_ratio=0.98,
             _sample_rate=1.0,
             data_params={},
             tokenizer_pl=[],
             encoder_pl=[],
             _tokenizer_dir='cdlm',
             _dataset='cdlm'):
    # initialize variables
    self.__data_params = data_params
    self.__tokenizer_pl = tokenizer_pl
    self.__encoder_pl = encoder_pl
    self.__sample_rate = _sample_rate
    self.__tokenizer_path = os.path.join(
        create_dir(data_dir, 'tokenizer', _tokenizer_dir), 'tokenizer.pkl')
    self.__processed_dir_path = create_dir(data_dir, 'preprocessed', _dataset)

    # load data from files
    # zh_en_dict = load_json(filtered_pos_union_en_zh_dict_path)
    zh_en_dict = load_json(filtered_pos_union_zh_en_dict_path)
    zh_en_list = list(
        filter(lambda x: 'translation' in x[1] and x[1]['translation'],
               zh_en_dict.items()))
    zh_en_list = list(
        map(lambda x: [[x[0]] * len(x[1]['translation']), x[1]['translation']],
            zh_en_list))
    # data = reduce(lambda x, y: [x[0] + y[0], x[1] + y[1]], zh_en_list)

    zh_data = []
    en_data = []
    length = len(zh_en_list)
    for i, val in enumerate(zh_en_list):
        if i % 50 == 0:
            progress = float(i + 1) / length * 100.
            print('\rprogress: %.2f%% ' % progress, end='')
        zh_data += val[0]
        en_data += val[1]
    data = list(zip(zh_data, en_data))

    # shuffle the data
    random.seed(self.RANDOM_STATE)
    random.shuffle(data)

    # get tokenizer
    if os.path.isfile(self.__tokenizer_path):
        self.__tokenizer = load_pkl(self.__tokenizer_path)
    else:
        self.__tokenizer_src, self.__tokenizer_tar = list(zip(*data))
        self.get_tokenizer()

    # get the data set (train or validation or test)
    data = self.__split_data(data, start_ratio, end_ratio)
    self.gen_preprocessed_data(data, self.BATCH_SIZE_PER_FILE)
def __load_train(self):
    """ load train data """
    print('\nStart loading train data')

    if os.path.isfile(self.__emb_pkl_path):
        self.__train_X, self.__train_y, self.dict, self.voc_size = load_pkl(
            self.__emb_pkl_path)
    else:
        print('loading doc list ...')

        # load the doc_list
        emb_json_path = os.path.join(path.PATH_TMP_DIR, 'emb_data.json')
        if os.path.isfile(emb_json_path):
            docs = load_json(emb_json_path)
        else:
            path_list = self.__get_path_list()
            docs = self.__load_docs(path_list, emb_json_path)

        print('generating dictionary ...')

        # generate the dictionary which maps the bond_id to index
        self.dict, self.voc_size = self.__gen_dict(docs)

        print('converting docs to trainable data format ...')

        # convert the doc list to trainable data format
        self.__train_X, self.__train_y = self.__convert(docs, self.__emb_pkl_path)

    print('Finish loading train data')
def __gen_topics_mask(self):
    topics = utils.load_json(path.TOPIC_BONDS_JSON)
    topics = self.dict.doc2idx(topics)
    while -1 in topics:
        topics.remove(-1)
    self.__topic_mask = self.__2_sum_one_hot(topics)
    self.__topic_mask[self.__topic_mask > 0] = 1
def __load_dir(dir_path):
    """ Load all the data in "dir_path", and complement the data on the dates
    when no transaction happened

    :return data: (list) e.g.
        [  # includes transactions that happen over many days
            ['bond_a', 'bond_b', ...],  # transactions that happen in one day
            ['bond_a', 'bond_b', ...],
            ...
        ]
    """
    data = []

    # load the date list
    date_list = os.listdir(dir_path)
    date_list.sort()

    # generate a date dict so that we can check whether any transaction
    # happened on a given date
    date_dict = utils.list_2_dict(date_list)

    # find out the start and end date of all the transactions
    start_date = date_list[0][len('doc_'):-len('.json')]
    end_date = date_list[-1][len('doc_'):-len('.json')]

    # convert the dates to timestamps
    cur_timestamp = utils.date_2_timestamp(start_date)
    end_timestamp = utils.date_2_timestamp(end_date) + 86000

    # traverse all the dates between the start date and the end date,
    # but skip the holidays
    while cur_timestamp < end_timestamp:
        _date = utils.timestamp_2_date(cur_timestamp)
        file_name = f'doc_{_date}.json'

        # check if there is any transaction
        if file_name in date_dict:
            file_path = os.path.join(dir_path, file_name)

            # remove nan in doc
            tmp_doc = list(
                map(lambda x: x if isinstance(x, str) else '',
                    utils.load_json(file_path)))
            while '' in tmp_doc:
                tmp_doc.remove('')

            data.append(tmp_doc)

        # if it is a holiday, then skip it
        elif date.is_holiday(_date):
            pass

        # if no transaction happened on that date
        else:
            data.append([])

        # move to the next day
        cur_timestamp += 86400

    return data
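# The directory loader above also relies on utils date helpers that are not
# shown in this section (list_2_dict, date_2_timestamp, timestamp_2_date).
# A minimal sketch of plausible implementations, assuming dates are formatted
# as 'YYYY-MM-DD' (the real format is not visible from the snippet):
from datetime import datetime


def list_2_dict(items):
    # index a list so membership checks are O(1)
    return {v: True for v in items}


def date_2_timestamp(date_str):
    # 'YYYY-MM-DD' -> POSIX timestamp in seconds
    return int(datetime.strptime(date_str, '%Y-%m-%d').timestamp())


def timestamp_2_date(timestamp):
    # POSIX timestamp in seconds -> 'YYYY-MM-DD'
    return datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d')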
def get_board_instance(directory, filename):
    strinfo = re.compile('.txt')
    json_name = strinfo.sub('.json', filename)
    # with open(os.path.join(directory, json_name), "r") as lines:
    test_info = utils.load_json(os.path.join(directory, json_name))
    if 'board_instance' in test_info.keys():
        board_instance = test_info['board_instance']
        return board_instance
    return ''
def normalizing(forced=False):
    download_path = path['download']
    analysis_path = path['analysis']
    normed_path = path['norm']

    download = load_json(download_path)
    analysis = load_json(analysis_path)

    with open(normed_path, 'w', encoding='utf8') as f:
        for video_id, content in download.items():
            state = content.get('state', 'new')
            if state == 'new' or state == 'update' or forced:
                norm_title = normalize(content.get('title', '').lower(),
                                       english=True,
                                       number=True,
                                       punctuation=False,
                                       remains={'+', '#'})
                norm_description = normalize(content.get('description', '').lower(),
                                             english=True,
                                             number=True,
                                             punctuation=False,
                                             remains={'+', '#'})
                norm_caption = normalize(content.get('caption', '').lower(),
                                         english=True,
                                         number=True,
                                         punctuation=False,
                                         remains={'+', '#'})
                f.write('{}\n'.format(norm_title + norm_description + norm_caption))

                analysis[video_id] = {}
                norm = {}
                norm['title'] = norm_title
                norm['description'] = norm_description
                norm['caption'] = norm_caption
                norm['trackKind'] = content.get('trackKind', '')
                analysis[video_id]['norm'] = norm

    save_json(analysis, analysis_path)
def main(args):
    if args.subset == constants.TRAIN:
        root = config.TRAIN_SOUND_ROOT
        cls_dirs = True
    elif args.subset == constants.VALID:
        root = config.VALID_SOUND_ROOT
        cls_dirs = True
    elif args.subset == constants.TEST:
        root = config.TEST_SOUND_ROOT
        cls_dirs = False
    else:
        raise ValueError("Invalid subset.")

    convert_to_tfrecords(utils.load_json(args.meta_path),
                         utils.load_json(args.classes_path),
                         root,
                         args.save_path,
                         args.sampling_rate,
                         class_dirs=cls_dirs)
def parse_json(json):
    jobs = utils.load_json(json)
    url = utils.validate_input(jobs['username'], jobs['token'], jobs['server'])
    connection = utils.connect(url)
    duration = jobs['duration']

    # Remove unused data
    jobs.pop('duration')
    jobs.pop('username')
    jobs.pop('token')
    jobs.pop('server')

    return connection, jobs, duration
def load_branches(json_file):
    """This function loads branches' info from a JSON file and dumps it into a
    list of dictionaries

    :param json_file: File to be loaded
    :return: List of dictionaries
    """
    logger.info("Loading branches from file %s", json_file)
    repo_dict = load_json(json_file)
    logger.info("Branches loaded")
    return repo_dict
def update(forced=False):
    def get_nouns(text):
        nouns = komoran3.nouns(text)
        return [
            noun for noun in nouns if len(noun) > 1 and not noun.isnumeric()
        ]

    user_dict_path = path['user_dictionary']
    komoran3 = Komoran('./lib/komoran/komoran/models',
                       './lib/komoran/komoran/libs')
    komoran3.set_user_dictionary(user_dict_path)

    download_path = path['download']
    analysis_path = path['analysis']
    download = load_json(download_path)
    analysis = load_json(analysis_path)

    for video_id, content in download.items():
        state = content.get('state', 'new')
        if state == 'new' or state == 'update' or forced:
            norm = analysis[video_id]['norm']
            nouns = {}
            nouns['title'] = ' '.join(get_nouns(norm.get('title', '')))
            nouns['description'] = ' '.join(get_nouns(norm.get('description', '')))
            nouns['caption'] = ' '.join(get_nouns(norm.get('caption', '')))
            nouns['all'] = nouns['title'] + ' ' + nouns['description'] + ' ' + nouns['caption']
            analysis[video_id]['nouns'] = nouns

    save_json(analysis, analysis_path)
def get_board_type(directory, filename):
    strinfo = re.compile('.txt')
    json_name = strinfo.sub('.json', filename)
    test_info = utils.load_json(os.path.join(directory, json_name))
    if 'board' in test_info.keys():
        # for dummy-ssh board
        board_type = ''
        try:
            if re.search('ssh', test_info['board_instance']):
                board_type = test_info['board_instance'].split('_')[0]
            else:
                if ',' in test_info['board']:
                    board_verify = test_info['board'].split(',')[0]
                    for key in device_map.keys():
                        if device_map[key][0] == board_verify:
                            board_type = key
                            break
                    else:
                        board_type = ''
                else:
                    # for dummy_ssh_{board_type}
                    board_type = test_info['board'].split('_')[-1]
        except KeyError:
            if ',' in test_info['board']:
                try:
                    board_verify = test_info['board'].split(',')[0]
                except:
                    board_verify = test_info['board']
                for key in device_map.keys():
                    if device_map[key][0] == board_verify:
                        board_type = key
                        break
                else:
                    board_type = ''
            else:
                # for jobs whose board info is incomplete
                board_type = test_info['board'].split('_')[-1]
        return board_type
    return ''
def build_paired_dataset(db_path, batch_size, training=True):
    def preprocess(noise, label, root, training):
        noise = tf.strings.join([root, noise], '/')
        label = tf.strings.join([root, label], '/')
        noise = read_image(noise)
        label = read_image(label)
        if training:
            noise, label = augument_image(noise, label)
        return noise, label

    db = load_json(db_path)
    ds = tf.data.Dataset.from_tensor_slices((db['noise'], db['label']))
    if training:
        ds = ds.shuffle(SHUFFLE_BUFFER_SIZE)
    ds = ds.map(functools.partial(preprocess, root=db['root'], training=training),
                num_parallel_calls=NUM_PARALLEL_CALLS)
    ds = ds.batch(batch_size)
    return ds
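# Usage sketch for the two dataset builders above. The JSON index layout is
# inferred from the keys the code reads ('root', 'noise', 'label' for the
# paired case; 'fake_input', 'fake_label', 'real_image' for the unpaired
# case); the file names and paths below are hypothetical.
#
# paired_db.json:
#   {"root": "/data/denoise",
#    "noise": ["noisy/0001.png", "noisy/0002.png"],
#    "label": ["clean/0001.png", "clean/0002.png"]}
#
# train_ds = build_paired_dataset('paired_db.json', batch_size=8, training=True)
# for noise, label in train_ds.take(1):
#     print(noise.shape, label.shape)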
def main(args):
    metadata = utils.load_json(args.train_metadata)

    r_avg = utils.StreamingAverage()
    g_avg = utils.StreamingAverage()
    b_avg = utils.StreamingAverage()

    for video_id, cls in metadata.items():
        video_folder_path = os.path.join(config.TRAIN_FRAMES_ROOT,
                                         utils.class_name_to_dir_name(cls),
                                         video_id)
        frame_paths = [
            os.path.join(video_folder_path, frame_path)
            for frame_path in os.listdir(video_folder_path)
        ]
        for frame_path in frame_paths:
            frame = cv2.imread(frame_path)
            assert len(frame.shape) == 3
            assert frame.shape[-1] == 3

            # opencv loads images in BGR
            b = np.mean(frame[..., 0])
            g = np.mean(frame[..., 1])
            r = np.mean(frame[..., 2])

            r_avg.add(r)
            g_avg.add(g)
            b_avg.add(b)

    means = [r_avg.avg, g_avg.avg, b_avg.avg]
    np.save(args.save_path, means)
def gen_group_according_to(file_path):
    print('loading data ...')
    dict_dealer_index_2_group = utils.load_json(file_path)
    data, d_dealers, total_volume, total_transaction_count, bound_timestamp, d_new_bonds = utils.load_pkl(
        os.path.join(path.ROOT_DIR, 'runtime', 'tmp123.pkl'))
    utils.write_pkl(
        os.path.join(path.ROOT_DIR, 'runtime', 'tmp_d_dealers.pkl'), d_dealers)
    # d_dealers = utils.load_pkl(os.path.join(path.ROOT_DIR, 'runtime', 'tmp_d_dealers.pkl'))

    labels = set(list(map(lambda x: x[1], dict_dealer_index_2_group.items())))
    group_list = [{} for i in range(len(labels))]

    print('traversing data ...')
    length = len(d_dealers)
    cur = 0
    for dealer_index, trace_list in d_dealers.items():
        # show progress
        if cur % 5 == 0:
            progress = float(cur + 1) / length * 100.
            print('\rprogress: %.2f%% ' % progress, end='')
        cur += 1

        if dealer_index not in dict_dealer_index_2_group:
            continue
        group_index = dict_dealer_index_2_group[dealer_index]
        group_list[group_index][dealer_index] = trace_list

    print('\rprogress: 100.0% \nsaving data ...')
    plan_name = os.path.splitext(os.path.split(file_path)[1])[0] + '.json'
    group_path = os.path.join(path.DATA_ROOT_DIR, 'groups', plan_name)
    utils.write_json(group_path, group_list)
def __load_docs(path_list, emb_json_path):
    """ load all the data from the path list """
    docs = []
    length = len(path_list)

    # traverse the path list to load all the data
    for i, _path in enumerate(path_list):
        # show progress
        if i % 5 == 0:
            progress = float(i + 1) / length * 100.
            print('\rprogress: %.2f%% ' % progress, end='')

        # remove nan in doc
        tmp_doc = list(
            map(lambda x: x if isinstance(x, str) else '', load_json(_path)))
        while '' in tmp_doc:
            tmp_doc.remove('')

        docs.append(tmp_doc)

    # cache data for faster processing next time
    write_json(emb_json_path, docs)
    return docs
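# The in-place progress printing above (and in several other snippets in this
# section) repeats the same '\rprogress: ...' pattern. A small helper like the
# one below (not part of the original code) would capture that pattern in one
# place:
def print_progress(i, length, every=5):
    # overwrite the current console line with the completion percentage
    if i % every == 0:
        progress = float(i + 1) / length * 100.
        print('\rprogress: %.2f%% ' % progress, end='')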
def main(args):
    # load video classes
    classes = utils.load_json(config.CLASSES_PATH)

    # load lists of videos
    train_metadata = utils.load_json(config.TRAIN_METADATA_PATH)
    val_metadata = utils.load_json(config.VALID_METADATA_PATH)
    test_metadata = utils.load_json(config.TEST_METADATA_PATH)

    num_found = 0
    total = 0

    total_train_present = 0
    total_train_missing = 0

    total_val_present = 0
    total_val_missing = 0

    # load subset
    subset = None
    if args.subset:
        subset = utils.load_json(args.subset)

    # count train and validation videos
    for cls in classes:
        if subset is not None and cls not in subset:
            continue

        total += 1

        cls_train_path = os.path.join(config.TRAIN_ROOT, cls.replace(" ", "_"))
        cls_valid_path = os.path.join(config.VALID_ROOT, cls.replace(" ", "_"))

        train_found = False
        valid_found = False

        if os.path.isdir(cls_train_path):
            train_present, train_missing = count_present_and_missing(
                cls, cls_train_path, train_metadata)
            train_found = True
            total_train_present += train_present
            total_train_missing += train_missing

        if os.path.isdir(cls_valid_path):
            valid_present, valid_missing = count_present_and_missing(
                cls, cls_valid_path, val_metadata)
            valid_found = True
            total_val_present += valid_present
            total_val_missing += valid_missing

        if train_found or valid_found:
            num_found += 1

            if args.details:
                print("class {}".format(cls))
                if train_found:
                    print("train: {} / {}".format(
                        train_present, train_present + train_missing))
                if valid_found:
                    print("valid: {} / {}".format(
                        valid_present, valid_present + valid_missing))
                print()

    # count test videos
    test_present, test_missing = count_present_and_missing(
        None, config.TEST_ROOT, test_metadata)

    # print
    train_percent_found = 0
    if total_train_present > 0:
        train_percent_found = (total_train_present * 100) / (
            total_train_present + total_train_missing)

    valid_percent_found = 0
    if total_val_present > 0:
        valid_percent_found = (total_val_present * 100) / (
            total_val_present + total_val_missing)

    test_percent_found = 0
    if test_present > 0:
        test_percent_found = (test_present * 100) / (test_present + test_missing)

    print("class stats:")
    print("\t{:d} / {:d} classes found".format(num_found, total))
    print()
    print("video stats (only for found classes):")
    print("\t{:d} / {:d} ({:.2f}%) train videos found".format(
        total_train_present, total_train_present + total_train_missing,
        train_percent_found))
    print("\t{:d} / {:d} ({:.2f}%) valid videos found".format(
        total_val_present, total_val_present + total_val_missing,
        valid_percent_found))
    print("\t{:d} / {:d} ({:.2f}%) test videos found".format(
        test_present, test_present + test_missing, test_percent_found))
# gen_group_according_to(os.path.join(
#     path.ROOT_DIR,
#     'group',
#     # 'group_K-means_without_original_stat.json'
#     # 'group_Spectral_Clustering_without_original_stat_with_model_input_features.json'
#     # 'group_K-means_filter_lower_5.json'
#     'group_Spectral_Clustering_filter_lower_5_with_model_input_features.json'
# ))
#
# print('done')

_path = r'D:\Data\share_mine_laptop\community_detection\data\groups\group_Spectral_Clustering_filter_lower_5_with_model_input_features.json'
data = utils.load_json(_path)
for i, v in enumerate(data):
    print(i, len(v))
def caption_download():
    if 'credentials' not in session:
        return redirect('authorize')

    credentials = google.oauth2.credentials.Credentials(**session['credentials'])
    youtube = googleapiclient.discovery.build(API_SERVICE_NAME,
                                              API_VERSION,
                                              credentials=credentials)

    channel_id = request.args.get('channel_id')
    resume = request.args.get('resume', 0)
    resume = int(resume) if resume else 0
    max_results = 50

    try:
        # 1. extract page info
        search_list = youtube.search().list(part='id',
                                            channelId=channel_id,
                                            maxResults=1,
                                            type='video',
                                            fields='items,pageInfo').execute()
    except Exception as e:
        print(e)
        return jsonify({'error': str(e)})

    caption_count = 0
    total_results = search_list['pageInfo']['totalResults']
    sys.stdout.write('\rprogress... {:4}/{:4} '.format(caption_count, total_results))

    pages = total_results // max_results + 1
    next_page_token = ''
    caption_time_pattern = re.compile(
        r'\d:\d\d:\d\d.\d\d\d,\d:\d\d:\d\d.\d\d\d')

    for page in range(pages):
        # 2. extract video id, title, description
        search_list = youtube.search().list(
            part='snippet',
            channelId=channel_id,
            maxResults=max_results,
            pageToken=next_page_token,
            type='video',
            fields='items(etag,id,snippet(description,title)),nextPageToken'
        ).execute()
        next_page_token = search_list.get('nextPageToken', None)

        for item in search_list['items']:
            caption_count += 1
            if caption_count < resume:
                sys.stdout.write('\rprogress... {:4}/{:4} '.format(
                    caption_count, total_results))
                continue

            caption_dump = load_json(download_path)

            video_id = get_depth_dict(item, ('id', 'videoId'), None)
            if not video_id:
                print('error : There is no video_id.')
                sys.stdout.write('\rprogress... {:4}/{:4} '.format(
                    caption_count, total_results))
                continue

            title = get_depth_dict(item, ('snippet', 'title'), None)
            description = get_depth_dict(item, ('snippet', 'description'), None)

            # 3. extract caption id
            caption_list = youtube.captions().list(
                part='snippet',
                videoId=video_id,
                fields='items(etag,id,snippet(language,trackKind,lastUpdated))'
            ).execute()

            caption_kind = {}
            for caption_item in caption_list['items']:
                track_kind = caption_item['snippet']['trackKind']
                caption_kind[track_kind] = {
                    'id': caption_item['id'],
                    'lastUpdated': caption_item['snippet']['lastUpdated']
                }

            # 4. pick the preferred trackKind
            caption_id = None
            track_kind_preference = ['standard', 'ASR', 'forced']
            for preference in track_kind_preference:
                if caption_kind.get(preference, None):
                    caption_id = caption_kind[preference]['id']
                    new_last_updated = caption_kind[preference]['lastUpdated']
                    track_kind = preference
                    break

            if not caption_id:
                print('error : There is no caption. video_id : ', video_id)
                update_info = {
                    'title': title,
                    'description': description,
                    'error': 'There is no caption.'
                }
                if caption_dump.get(video_id, None):
                    update_info['state'] = 'update'
                else:
                    update_info['state'] = 'new'
                caption_dump[video_id] = update_info
                save_json(caption_dump, download_path)
                sys.stdout.write('\rprogress... {:4}/{:4} '.format(
                    caption_count, total_results))
                continue

            # 5. check caption lastUpdated
            old_last_updated = get_depth_dict(caption_dump,
                                              (video_id, 'lastUpdated'), None)
            if old_last_updated and new_last_updated == old_last_updated:
                sys.stdout.write('\rprogress... {:4}/{:4} '.format(
                    caption_count, total_results))
                continue

            # 6. download
            try:
                caption = youtube.captions().download(
                    id=caption_id, tfmt='sbv').execute().decode("utf-8")
            except Exception as e:
                print(e)
                print('video_id : ', video_id)
                update_info = {
                    'title': title,
                    'description': description,
                    'error': str(e)
                }
                if caption_dump.get(video_id, None):
                    update_info['state'] = 'update'
                else:
                    update_info['state'] = 'new'
                caption_dump[video_id] = update_info
                save_json(caption_dump, download_path)
                sys.stdout.write('\rprogress... {:4}/{:4} '.format(
                    caption_count, total_results))
            else:
                caption = caption_time_pattern.sub('', caption)
                update_info = {
                    'lastUpdated': new_last_updated,
                    'trackKind': track_kind,
                    'caption': caption,
                    'title': title,
                    'description': description
                }
                if caption_dump.get(video_id, None):
                    update_info['state'] = 'update'
                else:
                    update_info['state'] = 'new'
                caption_dump[video_id] = update_info
                save_json(caption_dump, download_path)
                sys.stdout.write('\rprogress... {:4}/{:4} '.format(
                    caption_count, total_results))

            time.sleep(0.5)

    return redirect(url_for('index', _external=True))
filtered_en_ro_dict_path = os.path.join(dictionary_dir, 'filtered_en_ro_merged.json')

delete_ro_keys = []
delete_en_keys = []


def __check_has_val(val):
    for k, l in val.items():
        if l:
            return True
    return False


print('\nloading ro_en_dict ...')
ro_en_dict = load_json(merged_ro_en_dict_path)
ro_en_dict = filter_duplicate(ro_en_dict)

print('filtering ro_en_dict ...')
for ro, val in ro_en_dict.items():
    if 'translation' not in val:
        continue

    translations = val['translation']
    translations = list(filter(lambda x: x in en_word_dict, translations))

    if not translations:
        del ro_en_dict[ro]['translation']
        if not __check_has_val(ro_en_dict[ro]):
def __init__(self, file_path):
    self.file_path = file_path
    self._data = utils.load_json(file_path)
    self._data['save_directory'] = os.path.normpath(self._data['save_directory'])
    self._data['users'] = list(dict.fromkeys(self._data['users']))