def build_and_gather_multiple_arrays(self, save_path):
    print("🌋 Extracting mentions features")
    parallel_process(self.docs, set_feats, n_jobs=self.n_jobs)
    print("🌋 Building and gathering arrays")
    arr = [{'doc': doc, 'i': i} for i, doc in enumerate(self.docs)]
    # Running get_feats sequentially instead of with parallel_process should reduce memory usage
    arrays_dicts = list()
    for arr_doc in arr:
        arrays_dicts.append(get_feats(arr_doc['doc'], arr_doc['i']))
    del arr
    # arrays_dicts = parallel_process(arr, get_feats, use_kwargs=True, n_jobs=self.n_jobs)
    gathering_dict = dict((feat, None) for feat in FEATURES_NAMES)
    n_mentions_list = []
    pairs_ant_index = 0
    pairs_start_index = 0
    for n, p, arrays_dict in tqdm(arrays_dicts):
        for f in FEATURES_NAMES:
            if gathering_dict[f] is None:
                gathering_dict[f] = arrays_dict[f]
            else:
                if f == FEATURES_NAMES[6]:
                    # Shift antecedent indices by the number of pairs gathered so far
                    array = [a + pairs_ant_index for a in arrays_dict[f]]
                elif f == FEATURES_NAMES[3]:
                    # Shift pair start indices by the running offset
                    array = [a + pairs_start_index for a in arrays_dict[f]]
                else:
                    array = arrays_dict[f]
                gathering_dict[f] += array
        pairs_ant_index += n
        pairs_start_index += p
        n_mentions_list.append(n)

    for feature in FEATURES_NAMES[:9]:
        print("Building numpy array for", feature, "length", len(gathering_dict[feature]))
        if feature != "mentions_spans":
            array = np.array(gathering_dict[feature])
            if array.ndim == 1:
                array = np.expand_dims(array, axis=1)
        else:
            array = np.stack(gathering_dict[feature])
        # check_numpy_array(feature, array, n_mentions_list)
        print("Saving numpy", feature, "size", array.shape)
        np.save(save_path + feature, array)
    for feature in FEATURES_NAMES[9:]:
        print("Saving pickle", feature, "size", len(gathering_dict[feature]))
        with open(save_path + feature + '.bin', "wb") as fp:
            pickle.dump(gathering_dict[feature], fp)
    del arrays_dicts, gathering_dict
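# Most of the snippets collected here call a parallel_process helper that takes the
# work items first and the worker function second, with n_jobs / use_kwargs / front_num
# keywords. The helper itself is not shown in this section, so the following is only a
# minimal sketch of what such a helper could look like (the ProcessPoolExecutor + tqdm
# pattern); the actual implementations used by these projects may differ in details
# such as error handling and result ordering.
from concurrent.futures import ProcessPoolExecutor, as_completed
from tqdm import tqdm


def parallel_process(array, function, n_jobs=16, use_kwargs=False, front_num=3):
    """Apply `function` to every element of `array` using a pool of processes.

    The first `front_num` items are run serially so errors surface early. When
    `use_kwargs` is True, each item is a dict of keyword arguments for `function`.
    """
    front = [function(**a) if use_kwargs else function(a) for a in array[:front_num]]
    if n_jobs == 1:
        return front + [function(**a) if use_kwargs else function(a)
                        for a in tqdm(array[front_num:])]
    with ProcessPoolExecutor(max_workers=n_jobs) as pool:
        if use_kwargs:
            futures = [pool.submit(function, **a) for a in array[front_num:]]
        else:
            futures = [pool.submit(function, a) for a in array[front_num:]]
        for _ in tqdm(as_completed(futures), total=len(futures)):
            pass  # progress bar only; results are collected in submission order below
    out = []
    for future in futures:
        try:
            out.append(future.result())
        except Exception as e:
            out.append(e)
    return front + out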
def download_csv(files, column, num_thread):
    # Download the train/validation videos listed in the given CSV files
    for file in files:
        content = pd.read_csv(os.path.join(CSV_ROOT_PATH, file))
        print('entry count:', len(content[column].to_list()))
        print(content.shape[0])
        vids = list(set(content[column].to_list()))
        print('video count:', len(vids))
        # Use the requested number of download workers
        parallel_process(vids, download_by_youtube_id, n_jobs=num_thread)
    # Check for videos that failed to download
    find_missing_csv(column=column, files=files)
def load_data(directory, label, target_directory):
    if not os.path.exists(directory):
        return []
    earthquake_files = [{
        'earthquake_file': x,
        'directory': directory,
        'target_directory': target_directory,
        'label': label
    } for x in os.listdir(directory) if '.SAC' in x]
    if len(earthquake_files) == 0:
        return []
    parallel_process(earthquake_files, process_file, use_kwargs=True, n_jobs=num_cpu)
def read_all_images(root, num_workers=4):
    classes, class_to_idx = find_classes(root)
    dataset = make_dataset(root, class_to_idx)
    if len(dataset) == 0:
        raise RuntimeError("Found 0 images in subfolders of: " + root + "\n" +
                           "Supported image extensions are: " + ",".join(IMG_EXTENSIONS))
    num_images = len(dataset)
    paths = [dataset[i_image][0] for i_image in range(num_images)]
    print("Reading {0} images with {1} workers".format(num_images, num_workers))
    if num_workers > 1:
        images = parallel_process(paths, read_image_for_pytorch, n_jobs=num_workers)
    else:
        images = []
        for p in tqdm(paths):
            images.append(read_image_for_pytorch(p))
    image_cache = {}
    for i, image in enumerate(images):
        path, target = dataset[i]
        image_cache[path] = image
    return image_cache
def extract_mentions_spans(doc, blacklist=True, debug=False):
    '''
    Extract potential mentions from a spacy parsed Doc
    '''
    if debug:
        print('===== doc ====:', doc)
    for c in doc:
        if debug:
            print("🚧 span search:", c, "head:", c.head, "tag:", c.tag_,
                  "pos:", c.pos_, "dep:", c.dep_)
    # Named entities
    mentions_spans = list(ent for ent in doc.ents if ent.label_ in ACCEPTED_ENTS)
    if debug:
        print("==-- ents:", list(((ent, ent.label_) for ent in mentions_spans)))
    for spans in parallel_process([{'doc': doc,
                                    'span': sent,
                                    'blacklist': blacklist} for sent in doc.sents],
                                  _extract_from_sent, use_kwargs=True, front_num=0):
        mentions_spans = mentions_spans + spans
    spans_set = set()
    cleaned_mentions_spans = []
    for spans in mentions_spans:
        if spans.end > spans.start and (spans.start, spans.end) not in spans_set:
            cleaned_mentions_spans.append(spans)
            spans_set.add((spans.start, spans.end))
    return cleaned_mentions_spans
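# Hypothetical usage sketch for extract_mentions_spans above. It assumes the function
# is available together with its module-level dependencies (ACCEPTED_ENTS,
# _extract_from_sent, parallel_process) and that an English spacy model is installed;
# the example sentence is illustrative only.
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u"My sister has a dog. She loves him.")
mention_spans = extract_mentions_spans(doc, blacklist=False)
print([(span.text, span.start, span.end) for span in mention_spans])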
def build_and_gather_multiple_arrays(self, save_path):
    print("🌋 Extracting mentions features")
    parallel_process(self.docs, set_feats, n_jobs=self.n_jobs)
    print("🌋 Building and gathering arrays")
    arr = [{'doc': doc, 'i': i} for i, doc in enumerate(self.docs)]
    arrays_dicts = parallel_process(arr, get_feats, use_kwargs=True, n_jobs=self.n_jobs)
    gathering_dict = dict((feat, None) for feat in FEATURES_NAMES)
    n_mentions_list = []
    pairs_ant_index = 0
    pairs_start_index = 0
    for n, p, arrays_dict in tqdm(arrays_dicts):
        for f in FEATURES_NAMES:
            if gathering_dict[f] is None:
                gathering_dict[f] = arrays_dict[f]
            else:
                if f == FEATURES_NAMES[6]:
                    array = [a + pairs_ant_index for a in arrays_dict[f]]
                elif f == FEATURES_NAMES[3]:
                    array = [a + pairs_start_index for a in arrays_dict[f]]
                else:
                    array = arrays_dict[f]
                gathering_dict[f] += array
        pairs_ant_index += n
        pairs_start_index += p
        n_mentions_list.append(n)

    for feature in FEATURES_NAMES[:9]:
        print("Building numpy array for", feature, "length", len(gathering_dict[feature]))
        if feature != "mentions_spans":
            array = np.array(gathering_dict[feature])
            if array.ndim == 1:
                array = np.expand_dims(array, axis=1)
        else:
            array = np.stack(gathering_dict[feature])
        # check_numpy_array(feature, array, n_mentions_list)
        print("Saving numpy", feature, "size", array.shape)
        np.save(save_path + feature, array)
    for feature in FEATURES_NAMES[9:]:
        print("Saving pickle", feature, "size", len(gathering_dict[feature]))
        with open(save_path + feature + '.bin', "wb") as fp:
            pickle.dump(gathering_dict[feature], fp)
def evaluate_flows_batch(srcnodes_list, destnodes_list, spm, n_threads=1):
    if n_threads == 1:
        flow_values_all = []
        for srcnodes, destnodes in tqdm(list(zip(srcnodes_list, destnodes_list))):
            flow_values_all.append(evaluate_flows_batch_p([srcnodes, destnodes, spm]))
    else:
        raise NotImplementedError("Parallel processing still doesn't work")
        # Unreachable for now: kept as the intended parallel code path.
        procdata = list(zip(list(srcnodes_list), list(destnodes_list),
                            [spm] * len(list(srcnodes_list))))
        flow_values_all = utils.parallel_process(evaluate_flows_batch_p, procdata)
    return np.asarray(flow_values_all).T
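# This snippet (and the flow-centrality script further down) calls
# utils.parallel_process(function, iterable) with the function first, unlike the
# items-first convention sketched near the top of this section. Below is a minimal
# multiprocessing.Pool-based sketch matching that calling convention; it is an
# assumption about the helper's behaviour, not the project's actual utils module,
# which may add chunking, progress reporting, or error handling.
import multiprocessing


def parallel_process(function, iterable, n_jobs=None):
    """Map `function` over `iterable` with a process pool and return a list of results."""
    with multiprocessing.Pool(processes=n_jobs) as pool:
        return pool.map(function, list(iterable))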
def build_key_file(self, data_path, key_file, debug=False):
    print("🌋 Building key file from corpus")
    print("Saving in", key_file)
    # Create a pool of processes. By default, one is created for each CPU in your machine.
    with io.open(key_file, "w", encoding='utf-8') as kf:
        if debug:
            print("Key file saved in", key_file)
        for dirpath, _, filenames in os.walk(data_path):
            print("In", dirpath)
            file_list = [os.path.join(dirpath, f) for f in filenames
                         if f.endswith(".v4_auto_conll") or f.endswith(".v4_gold_conll")]
            cleaned_file_list = []
            for f in file_list:
                fn = f.split('.')
                if fn[1] == "v4_auto_conll":
                    gold = fn[0] + "." + "v4_gold_conll"
                    if gold not in file_list:
                        cleaned_file_list.append(f)
                else:
                    cleaned_file_list.append(f)
            # self.load_file(file_list[0])
            doc_list = parallel_process(cleaned_file_list, read_file)
            for doc in doc_list:
                kf.write(doc)
from torchvision import transforms

ROOT = '/data0/EEV/data-frames/00DCWMfJIpc'
# Standard ImageNet preprocessing; note it is defined but never applied in this timing snippet.
ttf = transforms.Compose([
    transforms.Resize(299),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])


def a(i):
    # Load one frame and return it with its ID (filename without the extension)
    return Image.open(os.path.join(ROOT, i)), i[:-4]


st = time.time()
files = os.listdir(ROOT)
# print(files)
res = parallel_process(files, a, n_jobs=20)
res = sorted(res, key=lambda t: t[1])
load_time = time.time() - st

st = time.time()
# print([x[1] for x in res])
res = [x[1] for x in res]
print(res)
print('1288', 'transform', time.time() - st, 'load', load_time)
# 4.6070 1377 imgs 7.2s total
def parallel_comment(self, c_list=[]):
    # parallel_process(self.comment, self.comment_list)
    parallel_process(self.com_wrapper, c_list, cpu_cnt=2)
def build_and_gather_multiple_arrays(self, save_path, train_phase):
    print("🌋 Extracting mentions features")
    parallel_process(self.docs, set_feats, n_jobs=self.n_jobs)
    # for doc in self.docs:
    #     set_feats(doc)
    print("🌋 Building and gathering arrays")
    arr = [{'doc': doc, 'i': i} for i, doc in enumerate(self.docs)]
    # arrays_dicts = parallel_process(arr, get_feats, use_kwargs=True, n_jobs=self.n_jobs)
    arrays_dicts = list()
    for arr_doc in arr:
        arrays_dicts.append(get_feats(arr_doc['doc'], arr_doc['i']))
    del arr
    gathering_dict = dict((feat, None) for feat in FEATURES_NAMES)
    n_mentions_list = []
    pairs_ant_index = 0
    pairs_start_index = 0
    for n, p, arrays_dict in tqdm(arrays_dicts):
        for f in FEATURES_NAMES:
            if gathering_dict[f] is None:
                gathering_dict[f] = arrays_dict[f]
            else:
                if f == FEATURES_NAMES[6]:
                    array = [a + pairs_ant_index for a in arrays_dict[f]]
                elif f == FEATURES_NAMES[3]:
                    array = [a + pairs_start_index for a in arrays_dict[f]]
                else:
                    array = arrays_dict[f]
                gathering_dict[f] += array
        pairs_ant_index += n
        pairs_start_index += p
        n_mentions_list.append(n)

    for feature in FEATURES_NAMES[:10]:
        print("Building numpy array for", feature, "length", len(gathering_dict[feature]))
        if feature != "mentions_spans":
            # Story (memory) features need to be padded to a common length
            if feature == "mentions_stories" or feature == "pairs_stories":
                train_config = dict()
                max_story_len = 0
                if train_phase:
                    max_story_len = max([len(story) for story in gathering_dict[feature]])
                    max_story_len = min(200, max_story_len)  # cap the story length at 200
                    print("max story len (in train phase should be 200):", max_story_len)
                    if os.path.exists('train_config.pickle'):
                        file_handle_init = open('train_config.pickle', 'rb')
                        train_config = pickle.load(file_handle_init)
                        file_handle_init.close()
                    file_handle = open('train_config.pickle', 'wb')
                    train_config[feature] = max_story_len
                    pickle.dump(train_config, file_handle)
                    file_handle.close()
                else:
                    file_handle = open('train_config.pickle', 'rb')
                    train_config = pickle.load(file_handle)
                    max_story_len = train_config[feature]
                    print("max story len (should be 200):", max_story_len)
                    file_handle.close()
                # Stories consist of word indices, so pad short ones with 0
                gathering_array = []
                for story in gathering_dict[feature]:
                    if len(story) > 200:
                        final_story = story[-200:]
                    else:
                        number_to_append = max(0, max_story_len - len(story))
                        final_story = story + number_to_append * [0]
                    gathering_array.append(final_story)
                array = np.array(gathering_array)
                print(array.shape)
            else:
                array = np.array(gathering_dict[feature])
            if array.ndim == 1:
                print("expand_dims for feature:", feature)
                array = np.expand_dims(array, axis=1)
        else:
            array = np.stack(gathering_dict[feature])
        # check_numpy_array(feature, array, n_mentions_list)
        print("Saving numpy", feature, "size", array.shape)
        np.save(save_path + feature, array)
    for feature in FEATURES_NAMES[9:]:
        print("Saving pickle", feature, "size", len(gathering_dict[feature]))
        with open(save_path + feature + '.bin', "wb") as fp:
            pickle.dump(gathering_dict[feature], fp)
    del arrays_dicts, gathering_dict
base_directory = 'data/generated/musdb'
os.makedirs(base_directory, exist_ok=True)

num_training = 20000
num_validation = 2000
num_testing = 0
n_jobs = 12

splits = [('train', num_training), ('validation', num_validation), ('test', num_testing)]
directories = {
    'train': os.path.join('data/musdb', 'train'),
    'validation': os.path.join('data/musdb', 'validation'),
    'test': os.path.join('data/musdb', 'test'),
    'background': None
}

for split, num_split in splits:
    print("Generating %s" % split)
    target_directory = os.path.join(base_directory, split)
    os.makedirs(target_directory, exist_ok=True)
    mixes = [{"i": i,
              'scene_duration': 3.2,
              'max_sources': 4,
              'foreground_directory': directories[split],
              'background_directory': directories['background'],
              "target_directory": target_directory}
             for i in range(num_split)]
    parallel_process(mixes, create_mixture, n_jobs=n_jobs, use_kwargs=True)
def read_corpus(self, data_path, debug=False):
    print("🌋 Reading files")
    for dirpath, _, filenames in os.walk(data_path):
        print("In", dirpath, os.path.abspath(dirpath))
        file_list = [os.path.join(dirpath, f) for f in filenames
                     if f.endswith(".v4_auto_conll") or f.endswith(".v4_gold_conll")]
        cleaned_file_list = []
        for f in file_list:
            fn = f.split('.')
            if fn[1] == "v4_auto_conll":
                gold = fn[0] + "." + "v4_gold_conll"
                if gold not in file_list:
                    cleaned_file_list.append(f)
            else:
                cleaned_file_list.append(f)
        doc_list = parallel_process(cleaned_file_list, load_file)
        for docs in doc_list:  # executor.map(self.load_file, cleaned_file_list):
            for utts_text, utt_tokens, utts_corefs, utts_speakers, name, part in docs:
                print("Imported", name)
                if debug:
                    print("utts_text", utts_text)
                    print("utt_tokens", utt_tokens)
                    print("utts_corefs", utts_corefs)
                    print("utts_speakers", utts_speakers)
                    print("name, part", name, part)
                self.utts_text += utts_text
                self.utts_tokens += utt_tokens
                self.utts_corefs += utts_corefs
                self.utts_speakers += utts_speakers
                self.utts_doc_idx += [len(self.docs_names)] * len(utts_text)
                self.docs_names.append((name, part))
    print("utts_text size", len(self.utts_text))
    print("utts_tokens size", len(self.utts_tokens))
    print("utts_corefs size", len(self.utts_corefs))
    print("utts_speakers size", len(self.utts_speakers))
    print("utts_doc_idx size", len(self.utts_doc_idx))

    print("🌋 Building docs")
    for name, part in self.docs_names:
        self.docs.append(ConllDoc(name=name, part=part, nlp=None,
                                  blacklist=False, consider_speakers=True,
                                  embedding_extractor=self.embed_extractor,
                                  conll=CONLL_GENRES[name[:2]]))

    print("🌋 Loading spacy model")
    try:
        spacy.info('en_core_web_sm')
        model = 'en_core_web_sm'
    except IOError:
        print("No spacy 2 model detected, using spacy1 'en' model")
        spacy.info('en')
        model = 'en'
    nlp = spacy.load(model)

    print("🌋 Parsing utterances and filling docs")
    doc_iter = (s for s in self.utts_text)
    # For each utterance, pair the spacy parse with the CoNLL data:
    # spacy_tokens, conll_tokens, corefs, speaker, doc_id.
    # add_conll_utterance then extracts the mentions and initializes the Mention objects.
    for utt_tuple in tqdm(zip(nlp.pipe(doc_iter), self.utts_tokens, self.utts_corefs,
                              self.utts_speakers, self.utts_doc_idx)):
        spacy_tokens, conll_tokens, corefs, speaker, doc_id = utt_tuple
        # if debug: print(unicode_(self.docs_names[doc_id]), "-", spacy_tokens)
        doc = spacy_tokens
        # if debug:
        #     out_str = "utterance " + unicode_(doc) + " corefs " + unicode_(corefs) + \
        #               " speaker " + unicode_(speaker) + "doc_id" + unicode_(doc_id)
        #     print(out_str.encode('utf-8'))
        self.docs[doc_id].add_conll_utterance(
            doc, conll_tokens, corefs, speaker,
            use_gold_mentions=self.use_gold_mentions)
                                             relevant=relevant, k=i)
        mAP[i] = {}
        mAP[i]['precision'] = precision
        mAP[i]['recall'] = recall
    return mAP


print("Beginning to score documents using {}...".format(args.jar))
query_set = list(claim_to_article.keys())
if args.small:
    query_set = query_set[:100]
# result = Parallel(n_jobs=15)(delayed(score_claim)(c) for c in query_set)
result = utils.parallel_process(query_set, score_claim, n_jobs=15)

# print("Saving results to disk...")
# with open("result.pkl", "wb") as f:
#     pickle.dump(result, f)

mAP = utils.calculatemAP(result, k)
print(utils.query_customized_lucene("testing", k=5, jar_name=args.jar)[0])
print("Mean Average Precision:")
utils.displaymAP(mAP)

recalls = []
recalls.extend(mAP[1]['recall'])
recalls.extend(mAP[5]['recall'])
recalls.extend(mAP[10]['recall'])
recalls.extend(mAP[20]['recall'])
print("Avg recall: {}".format(np.mean(recalls)))
nodes_src_list = list(map(gm.gid2id, srcmodules))

print("Evaluating flows...")


def wrap(pair):
    nodes_dest, nodes_src = pair[0], pair[1]
    return flower.eval_flow_centrality(nodes_dest, nodes_src,
                                       spm=spm_data, progressbar=False)


# Materialize the (dest, src) pairs once so both code paths iterate the same data
pairs = list(zip(nodes_dest_list, nodes_src_list))
if args.n_cores > 1:
    flow_values_all = utils.parallel_process(wrap, pairs)
else:
    flow_values_all = list(map(wrap, pairs))

flow_values_all = np.asarray(flow_values_all)
flow_values = flow_values_all[:, :, 0].T
flow_values_mean = flow_values.mean(axis=1)
flow_values_std = flow_values.std(axis=1)

data = pd.DataFrame(list(zip(nodelist, flow_values_mean, flow_values_std)),
                    columns=['NodeID', 'Flow_mean', 'Flow_std'])
    diff = set(find_missing(vids))
    vids = list(set(vids) - diff)
    filename = os.path.splitext(file)[0]
    with open('vidlist_%s.txt' % (filename), 'w') as file:
        file.write('\n'.join(vids))


if __name__ == '__main__':
    args = parser.parse_args()
    if args.input_list:
        if os.path.exists(args.input_list):
            with open(args.input_list, 'r') as file:
                vids = file.readlines()
            print(len(vids))
            parallel_process(vids, download_by_youtube_id, n_jobs=args.num_thread)
    elif args.find_missing is not None:
        print('Find missing videos:', ['Train/val', 'Test'][args.find_missing])
        find_missing_csv(column=['YouTube ID', 'Video ID'][args.find_missing],
                         files=[['train.csv', 'val.csv'], ['test.csv']][args.find_missing])
    elif args.download_tests:
        print('Downloading test...')
        download_csv(files=['test.csv'], column='Video ID', num_thread=args.num_thread)
    elif args.gen_vidmap:
        print('Generate vid to index map')
        # gen_vidmap_csv(files=['train.csv', 'val.csv'])
        # vid start_idx length
        gen_length_vidmap_csv(column='Video ID', files=['test.csv'])
    elif args.gen_vidlist:
        print('Generate test vid list')
        gen_vid_list()
    else:
VIDEO_ROOT = '/data0/EEV/data'
FRAME_ROOT = '/data/EEV/data-audio'
# logging.basicConfig(filename='log/ea_{}.log'.format(int(time.time())), filemode='w', level=logging.DEBUG)


def extract(filename):
    """
    filename: an 11-character YouTube ID plus '.mp4', e.g. 'IHRncab3Cdg.mp4'
    """
    assert len(filename) == 15
    video_id = filename[:-4]
    full_path = os.path.join(FRAME_ROOT, video_id)
    # if os.path.exists(full_path + '.wav'):
    #     return
    # Extract a mono 16 kHz 16-bit PCM audio track from the video
    cmd = 'ffmpeg -i {}/{} -threads 1 -vn -acodec pcm_s16le -ac 1 -ar 16000 {}/{}.wav'.format(
        VIDEO_ROOT, filename, FRAME_ROOT, video_id)
    # print(cmd.split(' '))
    # Pass stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL to silence ffmpeg output
    f = subprocess.run(cmd.split(' '))


if __name__ == '__main__':
    args = parser.parse_args()
    filenames = os.listdir(VIDEO_ROOT)
    # extract('IHRncab3Cdg.mp4')
    parallel_process(filenames, extract, n_jobs=args.num_thread)