import glob
from multiprocessing import Pool
from os.path import join as pjoin


def format_to_bert(args):
    for corpus_type in ["train", "test"]:
        a_lst = []
        for json_f in glob.glob(pjoin(args.raw_path, "*" + corpus_type + "*.json")):
            real_name = json_f.split("/")[-1]
            a_lst.append(
                (
                    corpus_type,
                    json_f,
                    args,
                    pjoin(args.save_path, real_name.replace("json", "bert.pt")),
                )
            )
        print(a_lst)
        # _format_to_bert is the per-file worker defined elsewhere in the module.
        pool = Pool(args.n_cpus)
        for d in pool.imap(_format_to_bert, a_lst):
            pass
        pool.close()
        pool.join()
def format_to_bert(args):
    if args.dataset != '':
        datasets = [args.dataset]
    else:
        datasets = ['train', 'valid', 'test']
    for corpus_type in datasets:
        a_lst = []
        for json_f in glob.glob(pjoin(args.raw_path, '*' + corpus_type + '.*.json')):
            real_name = json_f.split('/')[-1]
            a_lst.append((json_f, args,
                          pjoin(args.save_path, real_name.replace('json', 'bert.pt'))))
        print(a_lst)
        pool = Pool(args.n_cpus)
        for d in pool.imap(_format_to_bert, a_lst):
            pass
        pool.close()
        pool.join()
def main():
    config = {
        "data_level": 0.5,
        "tolerance": 1.0,
        "seg_threshold": 0.5,
        "min_area": 10,
    }

    # --- Process args --- #
    args = get_args()

    # partial() binds everything except the per-file segmentation path,
    # so each imap item is a single filepath from args.seg_filepath.
    pool = Pool()
    list(tqdm(
        pool.imap(
            partial(run_one,
                    out_dirpath=args.out_dirpath,
                    config=config,
                    im_dirpath=args.im_dirpath,
                    out_ext=args.out_ext,
                    bbox=args.bbox),
            args.seg_filepath),
        desc="Simple poly.",
        total=len(args.seg_filepath)))
def format_to_bert_w_scores(args):
    os.makedirs(args.save_path, exist_ok=True)
    if args.dataset != '':
        datasets = [args.dataset]
    else:
        datasets = ['train', 'valid', 'test']
    for corpus_type in datasets:
        a_lst = []
        for json_f in glob.glob(pjoin(args.raw_path, '*' + corpus_type + '.*.json')):
            real_name = os.path.basename(json_f)
            a_lst.append((json_f, args,
                          pjoin(args.save_path,
                                real_name.replace('json', 'sent_score.pt'))))
        logger.info(a_lst)
        pool = Pool(args.n_cpus)
        for d in pool.imap(_format_to_bert_w_scores, a_lst):
            pass
        pool.close()
        pool.join()
def run_fit_two_poolable(outfile, cores, tasks, two_pulse_fit, pulse_params,
                         height_th, sigma0):
    """Append new pandas rows to an outfile."""
    p = Pool(cores)
    file_exists = os.path.isfile(outfile)
    # 'a+' preserves existing rows; the original 'w+' truncated the file,
    # which contradicted the docstring and made the file_exists check moot.
    with open(outfile, 'a+') as f:
        if not file_exists:
            # Create the header from the first task only.
            df = p.map(
                lambda task: fit_two_poolable(task, two_pulse_fit, pulse_params,
                                              height_th, sigma0),
                tasks[0:1])
            df[0].to_csv(f, header=True)
        # NOTE: stdlib multiprocessing cannot pickle lambdas; this pattern
        # needs a dill-based pool such as the `multiprocess` package
        # (or a module-level helper, see the sketch below).
        for df in p.imap(
                lambda task: fit_two_poolable(task, two_pulse_fit, pulse_params,
                                              height_th, sigma0),
                tasks):
            try:
                df.to_csv(f, header=False)
            except Exception:
                pass
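# The lambdas above only work with a dill-based pool; a minimal sketch of a
# picklable alternative for the stdlib, assuming `fit_two_poolable` is defined
# at module level. `_fit_one` and `run_fits` are hypothetical names.
from functools import partial
from multiprocessing import Pool


def _fit_one(task, two_pulse_fit, pulse_params, height_th, sigma0):
    # Module-level functions, unlike lambdas, pickle cleanly.
    return fit_two_poolable(task, two_pulse_fit, pulse_params, height_th, sigma0)


def run_fits(tasks, cores, two_pulse_fit, pulse_params, height_th, sigma0):
    # Bind every argument except the per-task one; a partial over a
    # module-level function is itself picklable.
    worker = partial(_fit_one, two_pulse_fit=two_pulse_fit,
                     pulse_params=pulse_params, height_th=height_th,
                     sigma0=sigma0)
    with Pool(cores) as p:
        return list(p.imap(worker, tasks))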
def format_to_bert(args):
    """ Transforms words to ids with BERT tokenizer. """
    # Create folders
    if not os.path.isdir(args.save_path):
        os.makedirs(args.save_path)

    if args.dataset != '':
        datasets = [args.dataset]
    else:
        datasets = ['train', 'valid', 'test']

    # Multiprocessing for _format_to_bert()
    for corpus_type in datasets:
        if not args.debug:
            a_lst = []
            for json_f in glob.glob(pjoin(args.raw_path, '*' + corpus_type + '.*.json')):
                real_name = json_f.split('/')[-1]
                a_lst.append((corpus_type, json_f, args,
                              pjoin(args.save_path, real_name.replace('json', 'bert.pt'))))
            print("Processing {} dataset...".format(corpus_type))
            pool = Pool(args.n_cpus)
            for d in pool.imap(_format_to_bert, a_lst):
                pass
            pool.close()
            pool.join()
        else:
            # NOTE: debug without multiprocessing
            print("Processing {} dataset...".format(corpus_type))
            for json_f in glob.glob(pjoin(args.raw_path, '*' + corpus_type + '.*.json')):
                real_name = json_f.split('/')[-1]
                _format_to_bert((corpus_type, json_f, args,
                                 pjoin(args.save_path, real_name.replace('json', 'bert.pt'))))
def format_to_bert(args):
    """
    Creates the dataset in BERT format --- the main worker is _format_to_bert,
    which creates gold summaries using greedy_selection.
    """
    if args.dataset != '':
        datasets = [args.dataset]
    else:
        datasets = ['train', 'valid', 'test']
    for corpus_type in datasets:
        a_lst = []
        for json_f in glob.glob(pjoin(args.raw_path, '*' + corpus_type + '.*.json')):
            real_name = json_f.split('/')[-1]
            a_lst.append((corpus_type, json_f, args,
                          pjoin(args.save_path, real_name.replace('json', 'bert.pt'))))
        print(a_lst)
        pool = Pool(args.n_cpus)
        for d in pool.imap(_format_to_bert, a_lst):
            pass
        pool.close()
        pool.join()
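# Several of the snippets above share one shape: glob the input files, build
# one task tuple per file, then drive `pool.imap` purely for its side effects.
# A self-contained sketch of that pattern; `convert_one`, `convert_all`, and
# the `.txt`/`.out` suffixes are illustrative, not taken from any source above.
import glob
from multiprocessing import Pool
from os.path import basename, join


def convert_one(task):
    # Hypothetical worker: read src, write a transformed copy to dst.
    src, dst = task
    with open(src) as fin, open(dst, 'w') as fout:
        fout.write(fin.read().upper())
    return dst


def convert_all(raw_path, save_path, n_cpus=4):
    tasks = [(src, join(save_path, basename(src).replace('.txt', '.out')))
             for src in glob.glob(join(raw_path, '*.txt'))]
    with Pool(n_cpus) as pool:
        # imap yields lazily and in submission order; iterating drives the work.
        for _ in pool.imap(convert_one, tasks):
            pass
# Under the spawn start method (the default on Windows and macOS), call
# convert_all() from an `if __name__ == '__main__':` guard.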
def format_xlnet(args):
    # `!=`, not `is not`: identity comparison against a string literal is
    # unreliable (and a SyntaxWarning on recent Pythons).
    if args.dataset != '':
        data_type = [args.dataset]
    else:
        data_type = ['train', 'valid', 'test']
    for corpus_type in data_type:
        a_lst = []
        for json_f in glob.glob(join(args.json_path, '*' + corpus_type + '.*.json')):
            real_name = os.path.basename(json_f)
            print(real_name)
            a_lst.append((json_f, args,
                          join(args.save_path, real_name.replace('json', 'xlnet.pt'))))
        print(a_lst)
        pool = Pool(args.n_cpus)
        for _ in pool.imap(_format_xlnet, a_lst):
            pass
        pool.close()
        pool.join()
def proc(i, json, day, pd, time):
    url = ("https://coinmarketcap.com/currencies/" + json[i]["slug"]
           + "/historical-data/?start=20130428&end=" + day)
    try:
        r = pd.read_html(url)[0]
    except Exception:
        # Back off once and retry if the first fetch fails.
        time.sleep(10)
        r = pd.read_html(url)[0]
    r["Name"] = json[i]["name"]
    r["Symbol"] = json[i]["symbol"]
    return r


def calculate(args):
    # Unpack a (function, argument-list) task tuple.
    return args[0](*args[1])


num_tasks = len(json)
pool = Pool(processes=cpu_count())
results = pool.imap(calculate,
                    [(proc, [i, json, day, pd, time]) for i in range(num_tasks)])
la = []
for i, r in enumerate(results):
    la.append(r)
    sys.stderr.write('\rdone {0:%}'.format((i + 1) / num_tasks))
da = pd.concat(la)
da.replace("-", 0, inplace=True)
da.to_csv("Put your address here", index=False, encoding="utf-8")
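# The (function, argument-list) tuples plus the `calculate` dispatcher are a
# hand-rolled star-apply. A sketch of the same binding via functools.partial,
# assuming `proc`, `json`, `day`, and `num_tasks` are module-level as above:
from functools import partial
from multiprocessing import Pool, cpu_count

worker = partial(proc, json=json, day=day, pd=pd, time=time)

with Pool(processes=cpu_count()) as pool:
    # Each item sent to a worker is now just an int index.
    frames = list(pool.imap(worker, range(num_tasks)))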
from multiprocess import Pool


def say(s):
    print(s)


pool = Pool(1)
# A string is an iterable of characters, so this prints
# 'h', 'e', 'l', 'l', 'o' one per task.
for d in pool.imap(say, 'hello'):
    pass
pool.close()
pool.join()
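# `multiprocess` is a fork of the stdlib `multiprocessing` that serializes
# with dill, so lambdas and interactively defined functions can be shipped to
# workers. The stdlib equivalent needs a module-level function and, under the
# spawn start method, a __main__ guard; `shout` here is just illustrative.
from multiprocessing import Pool


def shout(s):
    print(s.upper())


if __name__ == '__main__':
    with Pool(1) as pool:
        # Iterate over words rather than a bare string, which would be
        # split into single characters.
        for _ in pool.imap(shout, ['hello', 'world']):
            pass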
def test():
    print('cpuCount() = %d\n' % cpuCount())

    #
    # Create pool
    #

    PROCESSES = 4
    print('Creating pool with %d processes\n' % PROCESSES)
    pool = Pool(PROCESSES)

    #
    # Tests
    #

    TASKS = [(mul, (i, 7)) for i in range(10)] + \
            [(plus, (i, 8)) for i in range(10)]

    results = [pool.apply_async(calculate, t) for t in TASKS]
    imap_it = pool.imap(calculatestar, TASKS)
    imap_unordered_it = pool.imap_unordered(calculatestar, TASKS)

    print('Ordered results using pool.apply_async():')
    for r in results:
        print('\t', r.get())
    print()

    print('Ordered results using pool.imap():')
    for x in imap_it:
        print('\t', x)
    print()

    print('Unordered results using pool.imap_unordered():')
    for x in imap_unordered_it:
        print('\t', x)
    print()

    print('Ordered results using pool.map() --- will block till complete:')
    for x in pool.map(calculatestar, TASKS):
        print('\t', x)
    print()

    #
    # Simple benchmarks
    #

    N = 100000
    print('def pow3(x): return x**3')

    t = time.time()
    A = list(map(pow3, range(N)))
    print('\tmap(pow3, range(%d)):\n\t\t%s seconds' %
          (N, time.time() - t))

    t = time.time()
    B = pool.map(pow3, range(N))
    print('\tpool.map(pow3, range(%d)):\n\t\t%s seconds' %
          (N, time.time() - t))

    t = time.time()
    C = list(pool.imap(pow3, range(N), chunksize=N // 8))
    print('\tlist(pool.imap(pow3, range(%d), chunksize=%d)):\n\t\t%s'
          ' seconds' % (N, N // 8, time.time() - t))

    assert A == B == C, (len(A), len(B), len(C))
    print()

    L = [None] * 1000000
    print('def noop(x): pass')
    print('L = [None] * 1000000')

    t = time.time()
    A = list(map(noop, L))
    print('\tmap(noop, L):\n\t\t%s seconds' %
          (time.time() - t))

    t = time.time()
    B = pool.map(noop, L)
    print('\tpool.map(noop, L):\n\t\t%s seconds' %
          (time.time() - t))

    t = time.time()
    C = list(pool.imap(noop, L, chunksize=len(L) // 8))
    print('\tlist(pool.imap(noop, L, chunksize=%d)):\n\t\t%s seconds' %
          (len(L) // 8, time.time() - t))

    assert A == B == C, (len(A), len(B), len(C))
    print()

    del A, B, C, L

    #
    # Test error handling
    #

    print('Testing error handling:')

    try:
        print(pool.apply(f, (5,)))
    except ZeroDivisionError:
        print('\tGot ZeroDivisionError as expected from pool.apply()')
    else:
        raise AssertionError('expected ZeroDivisionError')

    try:
        print(pool.map(f, range(10)))
    except ZeroDivisionError:
        print('\tGot ZeroDivisionError as expected from pool.map()')
    else:
        raise AssertionError('expected ZeroDivisionError')

    try:
        print(list(pool.imap(f, range(10))))
    except ZeroDivisionError:
        print('\tGot ZeroDivisionError as expected from list(pool.imap())')
    else:
        raise AssertionError('expected ZeroDivisionError')

    it = pool.imap(f, range(10))
    for i in range(10):
        try:
            x = it.next()
        except ZeroDivisionError:
            if i == 5:
                pass
        except StopIteration:
            break
        else:
            if i == 5:
                raise AssertionError('expected ZeroDivisionError')

    assert i == 9
    print('\tGot ZeroDivisionError as expected from IMapIterator.next()')
    print()

    #
    # Testing timeouts
    #

    print('Testing ApplyResult.get() with timeout:', end='')
    res = pool.apply_async(calculate, TASKS[0])
    while 1:
        sys.stdout.flush()
        try:
            sys.stdout.write('\n\t%s' % res.get(0.02))
            break
        except TimeoutError:
            sys.stdout.write('.')
    print()
    print()

    print('Testing IMapIterator.next() with timeout:', end='')
    it = pool.imap(calculatestar, TASKS)
    while 1:
        sys.stdout.flush()
        try:
            sys.stdout.write('\n\t%s' % it.next(0.02))
        except StopIteration:
            break
        except TimeoutError:
            sys.stdout.write('.')
    print()
    print()

    #
    # Testing callback
    #

    print('Testing callback:')

    A = []
    B = [56, 0, 1, 8, 27, 64, 125, 216, 343, 512, 729]

    r = pool.apply_async(mul, (7, 8), callback=A.append)
    r.wait()

    r = pool.map_async(pow3, range(10), callback=A.extend)
    r.wait()

    if A == B:
        print('\tcallbacks succeeded\n')
    else:
        print('\t*** callbacks failed\n\t\t%s != %s\n' % (A, B))

    #
    # Check there are no outstanding tasks
    #

    assert not pool._cache, 'cache = %r' % pool._cache

    #
    # Check close() methods
    #

    print('Testing close():')

    for worker in pool._pool:
        assert worker.is_alive()

    result = pool.apply_async(time.sleep, [0.5])
    pool.close()
    pool.join()

    assert result.get() is None

    for worker in pool._pool:
        assert not worker.is_alive()

    print('\tclose() succeeded\n')

    #
    # Check terminate() method
    #

    print('Testing terminate():')

    pool = Pool(2)
    ignore = pool.apply(pow3, [2])
    results = [pool.apply_async(time.sleep, [10]) for i in range(10)]
    pool.terminate()
    pool.join()

    for worker in pool._pool:
        assert not worker.is_alive()

    print('\tterminate() succeeded\n')

    #
    # Check garbage collection
    #

    print('Testing garbage collection:')

    pool = Pool(2)
    processes = pool._pool
    ignore = pool.apply(pow3, [2])
    results = [pool.apply_async(time.sleep, [10]) for i in range(10)]

    del results, pool
    time.sleep(0.2)

    for worker in processes:
        assert not worker.is_alive()

    print('\tgarbage collection succeeded\n')
tr = pytextrank.TextRank()
nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)


# Read corpus and extract key phrases.
def worker(text):
    doc = nlp(text)
    phrases = [p.text for p in doc._.phrases]
    num_words = len(doc)
    return phrases, num_words


pool = Pool(options.nproc)  # renamed from `p`, which the phrase loop below shadowed
total_words = 0
vocab = collections.Counter()
for phrases, num_words in tqdm(pool.imap(worker, corpus)):
    # Note: this count includes punctuation as well as words.
    total_words += num_words
    # Examine the top-ranked phrases in the document.
    seen = 0
    for i, phrase in enumerate(phrases):
        if len(phrase.split()) == 1:
            continue
        # print("{:.3f} {}".format(p.rank, p.text))
        vocab[phrase] += 1
        seen += 1
        if options.maxphrases_per_doc > 0 and seen == options.maxphrases_per_doc:
            break

for k in sorted(vocab.keys()):
    print(k, vocab[k])  # assumed completion; the source snippet ends mid-loop
# (excerpt begins mid-condition in the source)
                    == 'abstractive')):
                with open(os.path.join(args.output_path, split_name,
                                       str(file_id) + '.txt'), 'w') as out_file:
                    # Write article
                    art = data[contents[0]].numpy().decode('utf-8')
                    out_file.write(art)
                    # Write summary
                    summ = data[contents[1]].numpy().decode('utf-8')
                    out_file.write('\n@highlight\n')
                    out_file.write(summ)

# Sentence split for dataset
if args.mode == 'ssplit':
    for split_name, data_num in zip(split_names, data_nums):
        files = os.listdir(os.path.join(args.output_path, split_name))
        file_list = [os.path.join(args.output_path, split_name, f) for f in files]
        parallel_func = partial(_reseperate_sentence, args.ssplit_target)
        pool = Pool(args.cpu_num)
        for d in tqdm(pool.imap(parallel_func, file_list),
                      total=data_num, desc=split_name, unit=' file'):
            pass
        pool.close()
        pool.join()
def format_to_bert(args):
    test_kws = pd.read_csv('csv_files/train_papers_sect8.csv')
    kws = {
        'intro': [kw.strip() for kw in test_kws['intro'].dropna()],
        'related': [kw.strip() for kw in test_kws['related work'].dropna()],
        'exp': [kw.strip() for kw in test_kws['experiments'].dropna()],
        'res': [kw.strip() for kw in test_kws['results'].dropna()],
        'conclusion': [kw.strip() for kw in test_kws['conclusion'].dropna()],
    }

    if args.dataset != '':
        datasets = [args.dataset]
    else:
        datasets = ['test']

    if len(args.sent_numbers_file) > 0:
        sent_numbers = pickle.load(open(args.sent_numbers_file, "rb"))
    else:
        sent_numbers = None

    # ARXIVIZATION
    bart = args.bart
    check_path_existence(args.save_path)
    for corpus_type in datasets:
        a_lst = []
        c = 0
        for json_f in glob.glob(pjoin(args.raw_path, corpus_type + '.*.json')):
            real_name = json_f.split('/')[-1]
            c += 1
            a_lst.append((corpus_type, json_f, args,
                          pjoin(args.save_path, real_name.replace('json', 'bert.pt')),
                          kws, bart, sent_numbers, 1))
        print("Number of files: " + str(c))

        ##########################
        ###### <DEBUGGING> #######
        ##########################

        # for a in a_lst:
        #     _format_to_bert(a)

        # single
        # json_f = args.raw_path + '/train.6.json'
        # _format_to_bert(('val', str(json_f), args,
        #                  pjoin(args.save_path, str(json_f).replace('json', 'bert.pt')),
        #                  kws, bart, sent_numbers, 25))

        ##########################
        ###### </DEBUGGING> ######
        ##########################

        pool = Pool(args.n_cpus)
        print('Processing {} set with {} json files...'.format(corpus_type, len(a_lst)))
        all_papers_count = 0
        all_paper_ids = {}
        for d in tqdm(pool.imap(_format_to_bert, a_lst), total=len(a_lst), desc=''):
            all_paper_ids[d[0]] = d[1]
            all_papers_count += d[2]
        pool.close()
        pool.join()
def runInParallel(*proc):
    # NOTE: starting then immediately joining each process runs them one
    # after another, not in parallel; see the corrected sketch below.
    for p in proc:
        p.start()
        p.join()
    return


# thread1.start()
# thread1.join()
# thread2.start()
# thread1.join()

# NOTE: Process objects are generally not picklable, so handing them to
# pool.imap fails with stdlib multiprocessing.
pool = Pool()
parallel_run = pool.imap(runInParallel, [p1, p2])
pool.close()
pool.join()
print("done with parallel")
print("data loaded")


# def runInParallel(*funcs):
#     proc = []
#     for fn in funcs:
#         p = Process(target=fn[0], args=fn[1])
#         p.start()
#         proc.append(p)
#     for p in proc:
#         p.join()

# dill.dump_session("sample_pregen_dat.out")
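# A minimal sketch of what the commented-out version above was reaching for:
# start every process before joining any of them, so they actually overlap.
# The `work` target is a placeholder for the real p1/p2 targets.
from multiprocessing import Process


def work(n):
    print('working on', n)


def run_in_parallel(*procs):
    for p in procs:
        p.start()   # start everything first...
    for p in procs:
        p.join()    # ...then wait for all of it.


if __name__ == '__main__':
    run_in_parallel(Process(target=work, args=(1,)),
                    Process(target=work, args=(2,)))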