def certScanner(self):
    p = Pool(nodes=512)
    cprint("[+] Keywords : " + " ".join(str(x) for x in self.keywordList), 'green')
    # self.allipAddrList = self.shuffleList()
    self.allipAddrList = [x for x in self.shuffleList() if self.region in x]
    for self.tryipClass in self.allipAddrList:
        self.ipExtractResult = self.ipExtract(self.tryipClass.split("@")[0])
        _max = len(self.ipExtractResult)
        cprint("[+] Scanning IP Addr Class : " + self.tryipClass +
               "\t-- Number of scan target is :" + str(len(self.ipExtractResult)), 'green')
        with tqdm(total=_max) as pbar:
            pbar.set_description("[+] Progressing : %s " % self.tryipClass)
            for i, domain in tqdm(enumerate(p.imap(self.certChecker, self.ipExtractResult))):
                pbar.update()
                if domain is not None:
                    self.resList.append(domain)
            pbar.close()
        p.terminate()  # Like p.close()
        p.restart()    # Like p.join()
        if self.resList:
            self.printRes()
        else:
            cprint("[!] No keywords found on this IP class \n", 'red')
            time.sleep(1)
        self.ipExtractResult = []
        self.resList = []
def train_multiprocessed(self, iInputDS, iTime, iSignal, iHorizon):
    pool = Pool(self.mOptions.mNbCores)
    self.defineTransformations(iInputDS, iTime, iSignal)
    # print([transform1.mFormula for transform1 in self.mTransformList]);
    args = []
    for transform1 in self.mTransformList:
        arg = cTraining_Arg(transform1.get_name(""))
        arg.mSigDec = cSignalDecompositionOneTransform()
        arg.mSigDec.mOptions = self.mOptions
        arg.mSigDec.mExogenousData = self.mExogenousData
        arg.mInputDS = iInputDS
        arg.mTime = iTime
        arg.mSignal = iSignal
        arg.mHorizon = iHorizon
        arg.mTransformation = transform1
        arg.mOptions = self.mOptions
        arg.mExogenousData = self.mExogenousData
        arg.mResult = None
        args.append(arg)

    for res in pool.imap(run_transform_thread, args):
        # print("FINISHED_TRAINING", res.mName);
        self.mSigDecByTransform[res.mTransformation.get_name("")] = res.mSigDec
def run_pool(cmd_list, threads=None):
    if threads:
        pool = Pool(threads)
    else:
        pool = Pool()
    list(tqdm.tqdm(pool.imap(run, cmd_list), total=len(cmd_list)))
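# A minimal usage sketch for the run_pool helper above. The shell-command worker
# `run` and the command list below are hypothetical stand-ins (the original snippet
# does not show them); only run_pool itself comes from the code above.
import subprocess

import tqdm
from pathos.multiprocessing import ProcessingPool as Pool


def run(cmd):
    # hypothetical worker: execute one shell command, discarding its output
    subprocess.run(cmd, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)


cmds = ["echo job-{}".format(i) for i in range(8)]  # hypothetical workload
run_pool(cmds, threads=4)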
def get_installed_packages():
    """ get a list of all packages currently installed in the active environment """
    packages = []
    pool = Pool(4)
    # for dist in track(
    #     list(Distribution.discover()), description="[cyan]Grabbing dependency info"
    # ):
    #     packages.append(Package.from_dist(dist))
    dists = list(Distribution.discover())
    dists_num = len(dists)
    log.info("[bold]Found a total of {} distributions".format(dists_num),
             extra={"markup": True})
    for package_enum in enumerate(pool.imap(Package.from_dist, dists), start=1):
        package = package_enum[1]
        log.info("{0}/{1}: processed [bold cyan]{2} {3}[/bold cyan]".format(
            package_enum[0], dists_num, package.name, package.version),
            extra={"markup": True})
        packages.append(package)
    return packages
def multi_Non_Tweep_friends(self, handle):
    min_position, links = self.get_tweets(handle)
    print("Scraping last 100 days of activity")
    while (True):
        min_position1, links1 = self.get_tweets(handle, min_position)
        links = links + links1
        if (min_position1 == None):
            break
        min_position = min_position1

    people_list = []
    link = [x for x in links if handle in x]
    link = self.duplicates(link)

    p = Pool(10)  # Pool tells how many at a time
    with Pool(10) as p:
        records = list(tqdm(p.imap(self.get_people, link), total=len(link)))
    p.terminate()
    p.join()
    p.close()

    people_list = [item for sublist in records for item in sublist]
    people_list = self.duplicates(people_list)
    people_list = [x for x in people_list if x != handle]
    return (people_list)
def q5_plot_chromatic_num_bounds_by_prob(n, prange, pstep, k=None,
                                         clique_finder=greedy_find_clique_number, multi=False):
    """Plots a graph of number of colours against edge probability,
    for each of the various lower/upper bounds of chromatic number

    multi: True/False/int
        multiprocessing - yes/no/num processes (default 4 if true)
    """
    probs = np.arange(prange[0], prange[1], pstep)
    graphs = [[get_random_graph(n, p, k) for _ in range(10)] for p in probs]
    mean_bounds = []
    pool = Pool(multi if type(multi) is int else 4)
    # graph_generator = pool.imap(multiprocessing_chrom_bounds_func, graphs) if multi else map(f, graphs)
    f = lambda graphs_list: list(map(get_chromatic_number_bounds, graphs_list))
    graph_generator = pool.imap(f, graphs) if multi else map(f, graphs)
    for bounds in tqdm.tqdm(graph_generator, total=len(graphs)):
        mean_bounds.append(np.mean(bounds, axis=0))
    pool.close()
    pool.join()

    mean_bounds = np.array(mean_bounds)
    plt.figure()
    for i, label in zip(range(mean_bounds.shape[1]),
                        ['lb_comp', 'lb_clique', 'ub_clique', 'ub_greedy_rand', 'ub_greedy_msd']):
        plt.plot(probs, mean_bounds[:, i], label=label)
    plt.legend()
    return probs, mean_bounds
def goo():
    pool = Pool(4)

    # def f(x):
    #     return foo(100 + x)

    stuff = list(tqdm.tqdm(pool.imap(foo, range(20)), total=20))
    print(stuff)
    print('aaa')
    pool.close()
    pool.join()
    print('bbb')
def computePerfsInParallel(self, args):
    lModels = {};
    # print([arg.mName for arg in args]);
    # print([arg.mModel.mOutName for arg in args]);
    pool = Pool(self.mOptions.mNbCores)
    # results = [compute_perf_func(arg) for arg in args];
    for res in pool.imap(compute_perf_func, args):
        # print("FINISHED_PERF_FOR_MODEL", res.mName);
        lModels[res.mName] = res.mModel;
    # pool.close()
    # pool.join()
    return lModels;
def combine_scores():
    """Combine the scores from all patients and dump into all_dict.txt."""
    all_dicts = {}
    duration_dict = {}
    all_dict_q = multiprocessing.Manager().Queue()
    duration_dict_q = multiprocessing.Manager().Queue()
    dirs = [
        y for y in os.listdir(patient_dir)
        if os.path.isdir(os.path.join(patient_dir, y))
    ]
    bar = progressbar.ProgressBar(redirect_stdout=True, max_value=len(dirs))
    f = functools.partial(scores_and_duration_dict, all_dict_q, duration_dict_q)
    p = Pool()
    for i, _ in enumerate(p.imap(f, dirs, chunksize=50), 1):
        bar.update(i)
    p.close()
    p.join()
    while not all_dict_q.empty():
        patient_dict = all_dict_q.get()
        dur_dict = duration_dict_q.get()
        for i in patient_dict:
            print(i)
            if i not in all_dicts:
                all_dicts[i] = patient_dict[i]
            else:
                all_dicts[i].update(patient_dict[i])
        for i in dur_dict:
            print(i)
            if i not in duration_dict:
                duration_dict[i] = dur_dict[i]
            else:
                duration_dict[i].update(dur_dict[i])
    print('done combining scores, dumping...')
    json.dump(all_dicts, open(os.path.join(patient_dir, 'all_dict.txt'), 'w'))
    json.dump(duration_dict,
              open(os.path.join(patient_dir, 'duration_dict.txt'), 'w'))
        # Ignore black images
        if slice.max() == 0 and slice.min() == 0:
            continue
        file_name = os.path.join(spectogram_dir_path, "{}{}.png".format(index, i))
        scipy.misc.imsave(file_name, np.squeeze(slice))
    return f


if __name__ == '__main__':
    dataset_dir = '/media/work/audio/musiclid/youtube_spoken/'
    output_path_raw = os.path.join(dataset_dir, 'raw')
    output_path_spectograms = os.path.join(dataset_dir, 'spectograms')
    pool = Pool()
    for language in os.listdir(output_path_raw):
        for source_name in os.listdir(os.path.join(output_path_raw, language)):
            files = glob.glob(os.path.join(output_path_raw, language, source_name, "*.mp3"))
            spectogram_dir_path = os.path.join(output_path_spectograms, language, source_name)
            if not os.path.exists(spectogram_dir_path):
                os.makedirs(spectogram_dir_path)
            data = [(f, spectogram_dir_path, i) for i, f in enumerate(files)]
            for f in tqdm(pool.imap(segment_file, data),
                          'spectograms for {}/{}'.format(language, source_name),
                          total=len(files)):
                pass
class analyze(setup.setup): def __init__(self,args,logging_level=logging.INFO): super(analyze, self ).__init__(args,logging_level) # set up processing pool and run all analyses specified in args def run(self): if self.args.jumpdists: n_bins=100. bin_width = 1/n_bins bins = np.arange(0,1+bin_width,1/n_bins) if self.args.file: user,vals = self.artist_jump_distributions(self.args.file,bins=bins,self_jumps=False) with open(self.args.resultdir+user,'w') as fout: fout.write(','.join(vals.astype(str))+'\n') else: raise('not implemented!') self.pool = Pool(self.args.n) self.rootLogger.info("Pool started") self.rootLogger.info("Starting jump distance analysis") func_partial = partial(self.artist_jump_distributions,bins=bins,self_jumps=False) with open(self.args.resultdir+'jumpdists','w') as fout: for user,vals in self.pool.imap(func_partial,self.listen_files): fout.write(user+'\t'+','.join(vals.astype(str))+'\n') self.pool.close() self.rootLogger.info("Pool closed") if self.args.blockdists: #self.rootLogger.info("Starting block distance analysis") self.mean_block_distances(self.args.file) if self.args.diversity_dists: bins = np.arange(0,1.01,.01) self.diversity_distributions(self.args.file,bins=bins) if self.args.clustering: self.clustering(self.args.file) if self.args.values: self.patch_values(self.args.file) if self.args.exp: self.explore_exploit(self.args.file) if self.args.patch_len_dists: self.patch_len_dists(self.args.file) # calculate distribution (using histogram with specified bins) # of sequential artist-to-artist distances def artist_jump_distributions(self,fi,bins,self_jumps=False): user = fi.split('/')[-1][:-4] df = pd.read_pickle(fi) if self_jumps: vals = np.histogram(df['dist'].dropna(),bins=bins)[0] else: vals = np.histogram(df['dist'][df['dist']>0],bins=bins)[0] self.rootLogger.info('artist jump distances done for user {} ({})'.format(user,fi)) return user,vals # calculate distribution (using histogram with specified bins) # of patch diversity for each user # awk 'FNR==1' * > diversity_dists_zeros # awk 'FNR==2' * > diversity_dists_nozeros def diversity_distributions(self,fi,bins): if 'patches' not in fi: raise('WRONG DATATYPE') user = fi.split('/')[-1].split('_')[0] df = pd.read_pickle(fi).dropna(subset=['diversity']) zeros = np.histogram(df[df['n']>=5]['diversity'],bins=bins)[0] nozeros = np.histogram(df[(df['n']>=5)&(df['diversity']>0)]['diversity'],bins=bins)[0] zeros = zeros/float(zeros.sum()) nozeros = nozeros/float(nozeros.sum()) with open(self.args.resultdir+user,'w') as fout: fout.write(user+'\t'+'zeros'+'\t'+','.join(zeros.astype(str))+'\n') fout.write(user+'\t'+'nozeros'+'\t'+','.join(nozeros.astype(str))+'\n') self.rootLogger.info('diversity distributions done for user {} ({})'.format(user,fi)) def mean_block_distances(self,fi,n=100): def cos_nan(arr1,arr2): if np.any(np.isnan(arr1)) or np.any(np.isnan(arr2)): return np.nan else: return cosine(arr1,arr2) user = fi.split('/')[-1].split('_')[0] df = pd.read_pickle(fi) blocks = df[df['n']>=5].dropna() result = [] for i in xrange(len(blocks)-n): first = blocks['centroid'].iloc[i] result.append(np.array(blocks['centroid'][i+1:i+n+1].apply(lambda val: cos_nan(val,first)))) result = np.nanmean(np.vstack(result),0) with open(self.args.resultdir+user,'w') as fout: fout.write('\t'.join([user,'patch',','.join(result.astype(str))])+'\n') self.rootLogger.info('Block distances for user {} processed successfully ({})'.format(user,fi)) # now shuffled # idx = np.array(blocks.index) # np.random.shuffle(idx) # blocks = blocks.reindex(idx) 
# result_random = [] # for i in xrange(len(blocks)-n): # first = blocks['centroid'].iloc[i] # result_random.append(np.array(blocks['centroid'][i+1:i+n+1].apply(lambda val: cos_nan(val,first)))) # result_random = np.nanmean(np.vstack(result_random),0) # with open(self.args.resultdir+user,'w') as fout: # fout.write('\t'.join([user,'patch',','.join(result.astype(str))])+'\n') # fout.write('\t'.join([user,'patch_random',','.join(result_random.astype(str))])+'\n') # self.rootLogger.info('Block distances for user {} processed successfully ({})'.format(user,fi)) def clustering(self,fi): df = pd.read_pickle(fi) user = fi.split('/')[-1].split('_')[0] mask = (df['centroid'].apply(lambda arr: ~np.any(np.isnan(arr))).values)&(df['n']>=5)&(df['diversity']<=0.2) clust_data = df[mask].reset_index() arr = np.vstack(clust_data['centroid']) Z = linkage(arr, 'complete') clusters = fcluster(Z,t=0.2,criterion='distance') assignments = np.repeat(np.nan,len(df)) assignments[np.where(mask)] = clusters df['patch_clust'] = assignments df.to_pickle('{}{}.pkl'.format(self.args.resultdir,user)) self.rootLogger.info('Patch clusters for user {} processed successfully ({})'.format(user,fi)) def patch_len_dists(self,fi): df = pd.read_pickle(fi) user = fi.split('/')[-1][:-4] explore = df[np.isnan(df['patch_clust'])] result_explore = explore['n'].value_counts() df['explore'] = np.isnan(df['patch_clust']).astype(int) df['explore-idx'] = df['explore'].cumsum() result_exploit = df.groupby('explore-idx').apply(lambda df: df.dropna()['n'].sum()).value_counts() result_explore = result_explore.reindex(xrange(1,max(result_explore.index)+1),fill_value=0.).values result_exploit = result_exploit.reindex(xrange(1,max(result_exploit.index)+1),fill_value=0.).values result_explore = sparse.csr_matrix(result_explore) result_exploit = sparse.csr_matrix(result_exploit) with open(self.args.resultdir+user,'w') as fout: fout.write(user+'\t'+'explore'+'\t'+':'.join([','.join(a.astype(str)) for a in result_explore.data,result_explore.indices,result_explore.indptr])+'\n') fout.write(user+'\t'+'exploit'+'\t'+':'.join([','.join(a.astype(str)) for a in result_exploit.data,result_exploit.indices,result_exploit.indptr])+'\n') self.rootLogger.info('User {} processed successfully ({})'.format(user,fi)) def explore_exploit(self,fi): user = fi.split('/')[-1][:-4] df_patches_raw = pd.read_pickle(fi) # add time in next bout df_patches_raw['next_n'] = df_patches_raw['n'].shift(-1) # add patch values # listensPerPatch = df_patches_raw.groupby('patch_clust')['n'].sum() # overall_prop = listensPerPatch/float(df_patches_raw['n'].sum()) # overall_prop.name = 'final_value' # df_patches_raw = df_patches_raw.join(overall_prop,on='patch_clust') """ # time in next exploit patch as function of exploration time result = df_patches_raw[np.isnan(df_patches_raw['patch_clust'])].groupby('n')['next_n'].mean() fout.write(user+'\t'+'next-exploit-vs-explore'+'\t'+','.join(["{}:{}".format(a,b) for a,b in result.iteritems()])+'\n') """ # total time exploiting as a function of time exploring df_patches_raw['explore'] = np.isnan(df_patches_raw['patch_clust']).astype(int) df_patches_raw['explore-idx'] = df_patches_raw['explore'].cumsum() # combine all exploit listens #grp_explore = df_patches_raw.groupby('explore-idx').apply(lambda df: pd.DataFrame({'n':[df['n'].iloc[0]],'n-exploit':[df['n'][1:].sum()]})) # only last exploit bout grp_explore = df_patches_raw.groupby('explore-idx').apply(lambda df: pd.DataFrame({'n':[df['n'].iloc[0]],'n-exploit':[df['n'].iloc[-1]]})) #result = 
grp_explore.groupby('n')['n-exploit'].mean() #fout.write(user+'\t'+'total-exploit-vs-explore'+'\t'+','.join(["{}:{}".format(a,b) for a,b in result.iteritems()])+'\n') """ # exploration time as a function of exploitation time grp_exploit = grp_explore.copy() grp_exploit['n-explore'] = grp_exploit['n'].shift(-1) result = grp_exploit.groupby('n-exploit')['n-explore'].mean() fout.write(user+'\t'+'explore-vs-exploit'+'\t'+','.join(["{}:{}".format(a,b) for a,b in result.iteritems()])+'\n') """ # prob exploit given explore time - already done # explore_only = df_patches_raw[np.isnan(df_patches_raw['patch_clust'])] # result = explore_only['n'][:-1].value_counts() # arr = result.reindex(xrange(1,max(result.index)+1),fill_value=0.).values # final_result = arr/(np.cumsum(arr[::-1])[::-1]) # final_result = sparse.csr_matrix(final_result) # with open(self.args.resultdir+user+'_exploit','w') as fout: # fout.write(user+'\t'+':'.join([','.join(a.astype(str)) for a in final_result.data,final_result.indices,final_result.indptr])+'\n') # prob explore given exploit time result = grp_explore['n-exploit'][grp_explore['n-exploit']>0].value_counts() arr = result.reindex(xrange(1,max(result.index)+1),fill_value=0.).values final_result = arr/np.cumsum(arr[::-1])[::-1] final_result = sparse.csr_matrix(final_result) with open(self.args.resultdir+user+'_explore','w') as fout: fout.write(user+'\t'+':'.join([','.join(a.astype(str)) for a in final_result.data,final_result.indices,final_result.indptr])+'\n') #fout.write(user+'\t'+'prob-explore-given-exploit'+'\t'+','.join(["{}:{}".format(a,b) for a,b in result.iteritems()])+'\n') """ # patch value as a function of exploration time df_patches_raw['final_value_next'] = df_patches_raw['final_value'].shift(-1) result = df_patches_raw[np.isnan(df_patches_raw['patch_clust'])].groupby('n')['final_value_next'].mean() fout.write(user+'\t'+'exploit-value-vs-explore'+'\t'+','.join(["{}:{}".format(a,b) for a,b in result.iteritems()])+'\n') """ self.rootLogger.info('User {} processed successfully ({})'.format(user,fi))
if __name__ == '__main__':
    output_path_raw = '/media/work/audio/musiclid/youtube_spoken/raw'
    output_path_segmented = '/media/work/audio/musiclid/youtube_spoken/segmented'
    pool = Pool()
    for language in os.listdir(output_path_raw):
        for source_name in os.listdir(os.path.join(output_path_raw, language)):
            files = glob.glob(
                os.path.join(output_path_raw, language, source_name, "*.mp3"))
            segment_dir_path = os.path.join(output_path_segmented, language, source_name)
            if not os.path.exists(segment_dir_path):
                os.makedirs(segment_dir_path)
            data = [(f, segment_dir_path) for f in files]
            data = []
            for f in tqdm(pool.imap(segment_file, data),
                          'segmenting files in {}/{}'.format(language, source_name),
                          total=len(files)):
                pass
            file_counter[language] += len(
                glob.glob(os.path.join(segment_dir_path, "*.wav")))
    print file_counter
def main(argv): parser = ArgumentParser(description="A baseball pitch simulator.") parser.add_argument("config_file", action="store", help="Configuration file." ) parser.add_argument("-l", "--list-pitchers", action="store_true", help="List the name of all built-in pitcher available for use." ) parser.add_argument("-p", "--show-performance", action="store_true", help="Generate plot(s) showing model performance after training." ) args = parser.parse_args(argv) if args.list_pitchers: pitchers = [ p for p in dir(Pitchers) if isinstance( getattr(Pitchers,p), Pitchers.Pitcher ) ] print("Available pitchers:") for p in pitchers: print("\t",p) sys.exit(0) # load config config_file = pathlib.Path(args.config_file).resolve() with config_file.open() as f: conf = yaml.safe_load(f) pitcher = None pitcher_name = conf.get("pitcher",dict()).get("name",None) if pitcher_name is not None: pitcher = copy.deepcopy(getattr(Pitchers,pitcher_name)) # setup data files training_conf = conf.get("training",dict()) training_data_file_template = training_conf.get("training data file","{name}-training-data-{num}-{id}.yaml") input_model_file_template = training_conf.get("input model file","{name}-model-{id}.yaml") output_model_file_template = training_conf.get("output model file","{name}-model-{id}-new.yaml") num_training_trials = training_conf.get("number trials",1000) num_training_epochs = training_conf.get("number epochs",100) context = dict() context["name"] = pitcher_name context["id"] = pitcher.id() context["num"] = num_training_trials training_data_file = pathlib.Path(training_data_file_template.format(**context)).resolve() input_model_file = pathlib.Path(input_model_file_template.format(**context)).resolve() output_model_file = pathlib.Path(output_model_file_template.format(**context)).resolve() sim = Simulation() sim.configure(conf.get('simulation',dict())) if input_model_file.is_file(): pitcher.aim_model.load(str(input_model_file)) else: print(f"WARNING: '{str(input_model_file)}' does not exist.") losses = pitcher.train(sim,num_training_epochs,num_training_trials,training_data_file) if output_model_file.is_file(): print(f"WARNING: '{str(output_model_file)}' already exists. It will be OVERWRITTEN.") pitcher.aim_model.save(str(output_model_file)) print("Summary:") print(f"\tinitial training loss: {losses[0]}") print(f"\t final training loss: {losses[-1]}") if args.show_performance: print("Evaluating pitcher") configs = list() aim_locations = list() for x in numpy.arange( -15, 15+1,5 ): for z in numpy.arange( 12,5*12+1, 12 ): aim_x = Q_(x,'inch') aim_z = Q_(z,'inch') config = pitcher.configure_throw( 1, Q_(100,'percent'), aim_z, aim_x) aim_locations.append( [aim_x,aim_z] ) configs.append(config) def compute_location(config): trajectory = sim.run(config, terminate_function=lambda x: x[-1][2] < 0, record_all=False) act_x = Q_(trajectory[0][1],'m') act_z = Q_(trajectory[0][3],'m') return [act_x,act_z] pool = Pool() locations = list(tqdm.tqdm(pool.imap( compute_location, configs), total=len(configs))) txs = [ r[0].to("in").magnitude for r in aim_locations ] tzs = [ r[1].to("in").magnitude for r in aim_locations ] axs = [ r[0].to("in").magnitude for r in locations ] azs = [ r[1].to("in").magnitude for r in locations ] fig = go.Figure(data=[go.Scatter(x=txs,y=tzs,mode='markers'),go.Scatter(x=axs,y=azs,mode='markers')]) fig.show()
def newspaperarchive_scraper(search_terms, start_date, end_date, filepath): #Define functions #Define date generator def perdelta(start, end, delta): curr = start while curr < end: yield curr curr += delta #Define URL grabber def newspaperarchive_url(search_terms, date): day = "&pd={0}".format(date.day) month = "&pm={0}".format(date.month) year = "&py={0}".format(date.year) search_terms = "&pep={0}".format(search_terms.replace(" ", "-")) url = "http://access.newspaperarchive.com/tags/?pci=7&ndt=ex" + day + month + year + search_terms + "&pr=100" return url def test_matches(html): test = BeautifulSoup(html, 'html.parser') test_result = test.find('h2', text="0 Results for search") return test_result def extract_articles(page): #Grab articles articles = page.find_all('div', class_="searchResultBlock searchResultBlockWithThumb") return articles def extract_data(article, day): line = {} line['archive'] = "newspaperarchive" try: line["publication_title"] = article.h4.a.get_text().strip().encode('utf8') except: line["publication_title"] = "" line["href"] = article.a['href'] try: line['publication_id'] = re.search("(?<=http://access\.newspaperarchive\.com/)([^/]+/[^/]+/[^/]+/[^/]+)", line['href']).group(0) except: line['publication_id'] = "" line["search_date"] = day try: line['page'] = re.search("(?<=/page-)(\d\d?)", line['href']).group(0) except: line['page'] = "" line['search_terms'] = search_terms return line def scrape(search_terms, day): sleep(1) print day #Visit URL and parse url = newspaperarchive_url(search_terms, day) wait = 0 while True: try: start = requests.get(url, timeout=(1,180)).text break except: print "... trying again ..." + str(day) sleep(1.5**wait) wait += 1 #Are there any hits? if test_matches(start) == None: lines = [] nextLink = [] page = start page_number = 2 while nextLink != None: soup = BeautifulSoup(page, 'html.parser') articles = extract_articles(soup) #extract article data for article in articles: lines.append(extract_data(article, day)) #Get nextLink try: nextLink = soup.find('a', text=page_number)['href'] wait = 0 while True: try: page = requests.get(nextLink, timeout=(1,180)).text break except: print "... trying again ..." + str(day) sleep(1.5**wait) wait += 1 page_number += 1 except TypeError: nextLink = None return lines else: return None #Complete scraper #Parallel processing if __name__ == "__main__": #Create file name timeperiod = str(start_date) + "to" + str(end_date - timedelta(days=1)) filename = "newspaperarchive-" + timeperiod + ".csv" pool = Pool(10) date_list = [] for date in perdelta(start_date, end_date, timedelta(days=1)): date_list.append(date) search_terms_list = [search_terms] * len(date_list) result_iter = pool.imap(scrape, search_terms_list, date_list) #Create CSV fields = ["archive", "publication_title", "publication_id", "search_date", "page", "href", "search_terms"] with open("/".join((filepath,filename)), "w") as w: writer = csv.DictWriter(w, fieldnames=fields) writer.writeheader() for result in result_iter: if result != None: writer.writerows(result)
def init_data(self, data_name, n_chunk=1024): print(f'Initializing {data_name} data...') def transform_triple_to_hrt(triple_idx): """ Transforms triple-idx (as a whole) to h/r/t format """ if triple_idx == -1: # for response_triple return NAF_TRIPLE triple = self.idx2triple[triple_idx] h, r, t = triple.split(', ') return [self.word2idx[h], self.rel2idx[r], self.word2idx[t]] def process_file(root, inp): start_i, filename = inp n_sample = line_count(filename) post = np.zeros((n_sample, self.args.max_sentence_len), dtype=np.int32) post_length = np.zeros( (n_sample), dtype=np.int32) # valid length (without pad) response = np.zeros((n_sample, self.args.max_sentence_len), dtype=np.int32) response_length = np.zeros((n_sample), dtype=np.int32) # post_triple = np.zeros((n_sample, self.args.max_sentence_len), dtype=np.int32) triple = np.zeros((n_sample, self.args.max_sentence_len, self.args.max_triple_len, 3), dtype=np.int32) entity = np.zeros((n_sample, self.args.max_sentence_len, self.args.max_triple_len), dtype=np.int32) response_triple = np.zeros( (n_sample, self.args.max_sentence_len, 3), dtype=np.int32) max_post_len, max_response_len, max_triple_len = 0, 0, 0 with jsonlines.open(filename) as df: for i, line in enumerate(df): pl, rl = len(line['post']) + 2, len(line['response']) + 2 post_length[i] = pl response_length[i] = rl max_post_len = max(pl, max_post_len) max_response_len = max(rl, max_response_len) max_triple_len = max([len(l) for l in line['all_triples']] + [max_triple_len]) all_triples = [ line['all_triples'][i - 1] if i > 0 else [-1] for i in line['post_triples'] ] post[i, :pl] = [SOS_IDX] + [ self.get_word_idx(p) for p in line['post'] ] + [EOS_IDX] response[i, :rl] = [SOS_IDX] + [ self.get_word_idx(r) for r in line['response'] ] + [EOS_IDX] # post_triple[i, 1:pl-1] = np.array(line['post_triples']) # [0, 0, 1, 0, 2...] 
response_triple[i, :rl] = [NAF_TRIPLE] + [ transform_triple_to_hrt(rt) for rt in line['response_triples'] ] + [NAF_TRIPLE] # put NAF_TRIPLE/entity at index 0 triple[i] = pad_2d( [[NAF_TRIPLE]] + [[transform_triple_to_hrt(t) for t in triples] for triples in all_triples] + [[NAF_TRIPLE]], length=(self.args.max_sentence_len, self.args.max_triple_len, 3)) entity[i] = pad_2d( [[NAF_IDX]] + [[self.entidx2wordidx[e] for e in entities] for entities in line['all_entities']] + [[NAF_IDX]], length=(self.args.max_sentence_len, self.args.max_triple_len)) # dump to zarr root['post'][start_i:start_i + n_sample] = post root['post_length'][start_i:start_i + n_sample] = post_length root['response'][start_i:start_i + n_sample] = response root['response_length'][start_i:start_i + n_sample] = response_length # root['post_triple'][start_i : start_i+n_sample] = post_triple root['triple'][start_i:start_i + n_sample] = triple root['entity'][start_i:start_i + n_sample] = entity root['response_triple'][start_i:start_i + n_sample] = response_triple return max_post_len, max_response_len, max_triple_len toread = [ f'{self.data_path}/{data_name}set_pieces/{piece}' for piece in os.listdir(f'{self.data_path}/{data_name}set_pieces') ] n_lines = sum([line_count(piece) for piece in toread]) init_n_lines = math.ceil( n_lines / n_chunk) * n_chunk # 마지막 조각 사이즈가 지정된 청크 사이즈보다 작아져서 나는 에러 방지 root = zarr.open(f'{self.data_path}/{data_name}set_new.zarr', mode='w') post = root.zeros('post', shape=(init_n_lines, self.args.max_sentence_len), chunks=(n_chunk, None), dtype='i4') post_length = root.zeros('post_length', shape=(init_n_lines, ), chunks=(n_chunk, ), dtype='i4') # valid length (without pad) response = root.zeros('response', shape=(init_n_lines, self.args.max_sentence_len), chunks=(n_chunk, None), dtype='i4') response_length = root.zeros('response_length', shape=(init_n_lines, ), chunks=(n_chunk, ), dtype='i4') post_triple = root.zeros('post_triple', shape=(init_n_lines, self.args.max_sentence_len), chunks=(n_chunk, None), dtype='i4') triple = root.zeros('triple', shape=(init_n_lines, self.args.max_sentence_len, self.args.max_triple_len, 3), chunks=(n_chunk, None, None, None), dtype='i4') entity = root.zeros('entity', shape=(init_n_lines, self.args.max_sentence_len, self.args.max_triple_len), chunks=(n_chunk, None, None), dtype='i4') response_triple = root.zeros('response_triple', shape=(init_n_lines, self.args.max_sentence_len, 3), chunks=(n_chunk, None, None), dtype='i4') pool = Pool(min(len(toread), mp.cpu_count())) func = functools.partial(process_file, root) iterinp = [(i * self.args.data_piece_size, filename) for i, filename in enumerate(toread)] max_post_lens, max_response_lens, max_triple_lens = zip( *tqdm(pool.imap(func, iterinp), total=len(iterinp))) max_post_len, max_response_len, max_triple_len = max( max_post_lens), max(max_response_lens), max(max_triple_lens) # trim remaining space post.resize(n_lines, max_post_len) post_length.resize(n_lines) response.resize(n_lines, max_response_len) response_length.resize(n_lines) post_triple.resize(n_lines, max_post_len) triple.resize(n_lines, max_post_len, max_triple_len, 3) entity.resize(n_lines, max_post_len, max_triple_len) response_triple.resize(n_lines, max_response_len, 3) print( f'Dumped {data_name} at: {self.data_path}/{data_name}set_new.zarr')
def RunCutplan(self): # initialisations cbL = 3 timer = True id = [] # t = time.time() if id == []: start_id = sum(self.completed) id = list(range(start_id, self.Cutplans.shape[0])) numproc = cpu_count() - 2 p = Pool(processes=numproc) # cpSched = self.Cutplans.iloc[id] total = len(id)*1000 for cID in id: c = self.Cutplans.iloc[cID] # find desc to be used to open the correct log data file desc = c.Description[2:4]+"-"+str(int(c.Description[5:7])-1) # get data from log data file LD = pd.read_csv(self.logPath+desc+'.csv') # initialise recovery recovery = Recovery(c) iterLog = [] iterC = [] for i in range(LD.shape[0]): log = LD.iloc[i] iterLog.append(log) iterC.append(c) # ============================================================================= # completed = [] # for lID in range(len(iterLog)): # log = iterLog[lID] # coords = GetLogCoords(log, c) # completed.append(coords) # Timer(id, cID, lID, time.time()-t) # ============================================================================= # if id.index(cID) > 0: # p.restart() data = [] data = p.imap(GetLogCoords, iterLog, iterC) completed = [] i = 0 while len(completed) < LD.shape[0]: try: res = next(iter(data)) completed.append(res) if self.abort: self.abort = False return if timer: count = id.index(cID)*1000 + len(completed) self.l_progress.emit(count/total) # Timer(id, cID, len(completed)-1, time.time()-t) except BaseException: break self.AveR[cID] = Recovery(c) self.BoardBreakdown[cID] = BoardBreakdown(c) minW = [1000000, 1000000] minH = [1000000, 1000000] minWID = [0, 0] minHID = [0, 0] numOF = 0 for lID in range(LD.shape[0]): coords = completed[lID] newW, newH = CalcUseable(coords) if newW < minW[0]: minW[0] = newW minWID[0] = lID if newH < minH[0]: minH[0] = newH minHID[0] = lID newW1, newH1 = CalcUseable(coords, cbL) if newW1 < minW[1]: minW[1] = newW1 minWID[1] = lID if newH1 < minH[1]: minH[1] = newH1 minHID[1] = lID # OF = recovery.RunRecoveryRand(coords, offS=2.725) # numOF += not OF OF = recovery.RunRecovery(coords) numOF += not OF self.AveR[cID].AddRecovery(recovery) self.BoardBreakdown[cID].AddRecovery(recovery) self.LogVol[cID, lID] = CalcBoardVol( LD.iloc[lID], c, recovery ) # recovery.RunRecovery(coords) # self.LogVol[cID, lID] = CalcBoardVol( # LD.iloc[lID], c, recovery # ) self.AveR[cID].AverageRecovery(LD.shape[0]) self.BoardBreakdown[cID].AverageRecovery(LD.shape[0]) self.MinHLog[cID][0] = minHID[0] self.MinWLog[cID][0] = minWID[0] self.MinW[cID][0] = minW[0] self.MinH[cID][0] = minH[0] self.MinHLog[cID][1] = minHID[1] self.MinWLog[cID][1] = minWID[1] self.MinW[cID][1] = minW[1] self.MinH[cID][1] = minH[1] self.OpenFacePerc[cID] = numOF/LD.shape[0] time.sleep(0.1) self.completed[cID] = True self.cp_progress.emit(cID) self.EstVol = np.nanmean(self.LogVol, 1) * np.array( self.Cutplans.LogCount) self.finished.emit() return self.AveR
from pathos.multiprocessing import ProcessingPool
from tqdm import tqdm


def F(X, lamda=10, weight=0.05):
    print(X, lamda, weight)
    # return a value computed from the inputs (the original returned an unbound name)
    return X * lamda * weight


pool = ProcessingPool()
zip_lamda = [i for i in range(10)]
x = [i + 10 for i in range(10)]

# imap is lazy, so the progress bar advances as each result arrives
with tqdm(total=len(x)) as t:
    for i, res in enumerate(pool.imap(F, x, zip_lamda)):
        t.update()
pool.close()
pool.join()
def main(): parser = argparse.ArgumentParser() parser.add_argument( 'source_path', help="Path to the video or audio file to subtitle", nargs='?') parser.add_argument( '-C', '--concurrency', help="Number of concurrent API requests to make", type=int, default=10) parser.add_argument( '-o', '--output', help= "Output path for subtitles (by default, subtitles are saved in \ the same directory and name as the source path)" ) parser.add_argument('-F', '--format', help="Destination subtitle format", default="srt") parser.add_argument('-S', '--src-language', help="Language spoken in source file", default="en") parser.add_argument('-D', '--dst-language', help="Desired language for the subtitles", default="en") parser.add_argument( '-K', '--api-key', help= "The Google Translate API key to be used. (Required for subtitle translation)" ) parser.add_argument('--list-formats', help="List all available subtitle formats", action='store_true') parser.add_argument( '--list-languages', help="List all available source/destination languages", action='store_true') if (os.name == "posix"): print os.system("uname -a") else: print "unknown OS" args = parser.parse_args() # print "arguments",args args.source_path = str(self.filename) print args.source_path, "SOURCE PATH" # print "CONCURRENCY >>>", args.concurrency # print args path = args.source_path[:-3] srt_path = path + "srt" print srt_path audio_filename, audio_rate = extract_audio(args.source_path) regions = find_speech_regions(audio_filename) pool = ProcessingPool(args.concurrency) converter = FLACConverter(source_path=audio_filename) transcripts = [] if regions: try: widgets = [ "Converting speech regions to FLAC files: ", Percentage(), ' ', Bar(), ' ', ETA() ] pbar = ProgressBar(widgets=widgets, maxval=len(regions)).start() extracted_regions = [] for i, extracted_region in enumerate( pool.imap(converter, regions)): extracted_regions.append(extracted_region) pbar.update(i) self.progress1.setValue(i) pbar.finish() except KeyboardInterrupt: pbar.finish() pool.terminate() pool.join() print "Cancelling transcription" return 1 os.remove(audio_filename) return 0
            print "... trying again ..."
            sleep(1.5**wait)
            wait += 1
    if count > 0:
        t = r['titleData']
        return {day : Set([x['value'] for x in t])}
    else:
        return {day : Set()}


start_date = date(1880,1,1)
end_date = date(1941,1,1)
date_list = [str(date) for date in perdelta(start_date, end_date, timedelta(days=1))]

pool = Pool(10)
result_iter = pool.imap(scrape_day, date_list)

title_sets = {}
for result in result_iter:
    title_sets.update(result)

###################################
#Make dictionary of daily matches #
###################################

def scrape_paper(title_id):
    title_url = "_".join((title_stub_url, title_id))
    wait = 0
    while True:
        try:
            title_get = session.get(title_url, cookies=session.cookies, allow_redirects=True).text
            break
        except:
        line['archive'] = "americas_historical_newspapers"
        line['publication_title'] = title.input.text
        line['publication_id'] = title.input['value']
        city = title.find('td', class_="ci").text
        state = title.find('td', class_="st").text
        line['location'] = ", ".join((city, state))
        line['lastUpdated'] = search_datetime
        papers_data.append(line)

#Scrape publication dates in parallel
pub_ids = [paper['publication_id'] for paper in papers_data]

print "Scraping papers..."
pool = Pool(10)
result_iter = pool.imap(scrape_paper, pub_ids, [date_list_str]*len(pub_ids))

title_sets = {}
for result in result_iter:
    title_sets.update(result)

#Create file#
filename = "americas_historical_newspapers-allpubs.csv"
filepath = directory
fields = ['archive', 'publication_title', 'publication_id', 'location', 'lastUpdated'] + date_list_str

#Create output
print "Creating data rows..."
out = []
for paper in papers_data:
    title_id = paper['publication_id']
    date_match = {k : int(k in title_sets[title_id]) for k in date_list_str}
def train(self,simulation, epochs=100, num_throws = 1000, training_file = None, learning_rate=1e-4): ''' Train the pitcher's aim model for a given simulation. ''' # generate training data. # simulation inputs: # - pitch type # - effort # - verticle deflection # - horizontal deflection # simulation outputs: # - verticle location # - horizontal location if self.aim_model is None: raise Exception("Error: Pitcher's aim model has not been initialized. Cannot train.") N = num_throws if training_file is None: training_file = f'pitcher-training-data-{num_throws}-{self.id()}.pl' if isinstance(training_file,str): training_file = pathlib.Path(training_file).resolve() if training_file.is_file(): print(f"Traning data file found ({str(training_file)}). Loading training data from file.") data = torch.load(str(training_file)) sim_inputs = data['i'] sim_outputs = data['o'] else: print(f"No training data file found ({str(training_file)}). Created training dataset now") pint.set_application_registry(ureg) pool = Pool() sim_inputs = dict() sim_inputs['type'] = torch.empty([N],dtype=int) sim_inputs['effort'] = torch.empty([N],dtype=ScalarType) sim_inputs['verticle_deflection'] = torch.empty([N],dtype=ScalarType) sim_inputs['horizontal_deflection'] = torch.empty([N],dtype=ScalarType) sim_outputs = dict() sim_outputs['verticle_location'] = torch.empty([N],dtype=ScalarType) sim_outputs['horizontal_location'] = torch.empty([N],dtype=ScalarType) configs = list() print("Generating training configurations.") for i in tqdm.tqdm(range(N)): type = numpy.random.choice(list(self.characteristics['pitches'].keys())) effort = numpy.random.uniform(75,105) verticle_deflection = numpy.random.normal(loc=0,scale=5) horizontal_deflection = numpy.random.normal(loc=0,scale=5) sim_inputs['type'][i] = int(type) sim_inputs['effort'][i] = float(effort) sim_inputs['verticle_deflection'][i] = float(verticle_deflection) sim_inputs['horizontal_deflection'][i] = float(horizontal_deflection) configs.append(self.configure_throw_from_deflection( type, Q_(effort,'percent'), Q_(verticle_deflection,'degree'), Q_(horizontal_deflection,'degree'))) def terminate(record): if record[-1][2] < 0: return True if record[-1][0] > 2: return True return False def run_config(config): return simulation.run( config, terminate, record_all=False ) # estimate the runtime # print("Estimating runtime to generate training data...") # i = 0 # def run(): # nonlocal i # run_config(configs[i%N]) # i = i+1 # runtime = timeit.timeit(run,number=10)/10 # print(f" will take approximately {runtime*N/pool.nodes} s to run {N} simulations @ {runtime} s / run on {pool.nodes} CPUs.") runs = list(tqdm.tqdm(pool.imap( run_config, configs), total=N)) for i in range(N): sim_outputs['horizontal_location'][i] = runs[i][0][1] sim_outputs['verticle_location'][i] = runs[i][0][3] torch.save( {'i':sim_inputs,'o':sim_outputs}, str(training_file) ) model_inputs = torch.empty( [N,self.aim_model.in_features],dtype=ScalarType ) model_outputs = torch.empty( [N,self.aim_model.out_features],dtype=ScalarType ) for i in range(N): model_inputs[i,:] = self.aim_model.make_feature_vector( pitch_type=sim_inputs['type'][i], effort=sim_inputs['effort'][i], verticle_location=sim_outputs['verticle_location'][i], horizontal_location=sim_outputs['horizontal_location'][i] ) model_outputs[i,:] = self.aim_model.make_output_vector( verticle_deflection=sim_inputs['verticle_deflection'][i], horizontal_deflection=sim_inputs['horizontal_deflection'][i] ) # optimizer = torch.optim.Adam( self.aim_model.parameters(), lr=1e-2 
    # )
    optimizer = torch.optim.SGD(
        self.aim_model.parameters(),
        lr=learning_rate
    )
    loss_func = torch.nn.MSELoss()

    losses = list()
    print(f"Training model:")
    for i in tqdm.tqdm(range(epochs)):
        optimizer.zero_grad()
        pred = self.aim_model(model_inputs)
        loss = loss_func(pred, model_outputs)
        losses.append(float(loss))
        loss.backward()
        optimizer.step()

    return losses
#!/usr/bin/env python
#
# Author: Mike McKerns (mmckerns @caltech and @uqfoundation)
# Copyright (c) 1997-2014 California Institute of Technology.
# License: 3-clause BSD. The full license text is available at:
#  - http://trac.mystic.cacr.caltech.edu/project/pathos/browser/pathos/LICENSE

# instantiate and configure the worker pool
from pathos.multiprocessing import ProcessingPool
pool = ProcessingPool(nodes=4)

_result = map(pow, [1,2,3,4], [5,6,7,8])

# do a blocking map on the chosen function
result = pool.map(pow, [1,2,3,4], [5,6,7,8])
assert result == _result

# do a non-blocking map, then extract the result from the iterator
result_iter = pool.imap(pow, [1,2,3,4], [5,6,7,8])
result = list(result_iter)
assert result == _result

# do an asynchronous map, then get the results
result_queue = pool.amap(pow, [1,2,3,4], [5,6,7,8])
result = result_queue.get()
assert result == _result
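# Most snippets in this collection pair the non-blocking imap with a progress bar.
# A minimal sketch of that pattern, assuming a made-up slow_square worker (not part
# of the pathos example above):
import time

import tqdm
from pathos.multiprocessing import ProcessingPool


def slow_square(x):
    # stand-in for an expensive per-item task
    time.sleep(0.1)
    return x * x


pool = ProcessingPool(nodes=4)
# imap yields results lazily, so tqdm can tick as each item finishes
results = list(tqdm.tqdm(pool.imap(slow_square, range(20)), total=20))
pool.close()
pool.join()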
def newspapers_com_scraper(search_terms, start_date, end_date, filepath): #Set starting values #Set URLs signin_url = "https://www.newspapers.com/signon.php" search_url = "http://www.newspapers.com/search/aj_getresults" search_content_url = "http://www.newspapers.com/search/aj_getsearchrecord" #Scraper Functions #Define date generator def perdelta(start, end, delta): curr = start while curr < end: yield curr curr += delta #Make search query def make_search_query(search_terms, search_date, count): query_terms = {"terms":[{"type":"keyword","values":{"value":search_terms}},{"type":"date","values":{"name":"year_month_day","value":str(search_date),"showMissing":"true"}}, {"type":"field", "values":{"name":"place","value":"United States of America"}}]} query_form = {"query_terms":dumps(query_terms), "start":0, "count":count, "engine":"solr", "sort":"score desc"} return query_form #Create record dictionary for content search def make_record_dict(records): out = [] for record in records: temp = {} temp['records'] = [record['records'][0]] temp['rollup'] = record['id'] out.append(temp) return out def get_content(records): groups = 1 while True: try: records_list = [] for group in range(groups): records_list.append(records[group::groups]) articles = [] for records_group in records_list: record_dict = make_record_dict(records_group) content_query_form = {'records':dumps(record_dict), 'highlight_terms':search_terms.replace(" ", "|"), 'nonKeywordView' : 'false'} wait = 0 while True: try: content = session.post(search_content_url, data = content_query_form, cookies=session.cookies, allow_redirects=True, headers={'referer' : 'http://www.newspapers.com/search/'}, timeout=(1,60)).text break except: print "... trying again ..." sleep(1.5**wait) wait += 1 articles = articles + json.loads(content)['records'] break except ValueError: groups += 1 return articles #Get article attributes def get_from_object(obj, *keys): try: value = obj for k in keys: if isinstance(value, dict): value = value.get(k) elif isinstance(value, list) and len(value)>1: value = (item for item in value if item['name'] == k).next()['value'] elif isinstance(value, list) and len(value)==1: value = value[k] return value except: return '' #Extract article data def get_article_data(record, search_date): line = {} line['archive'] = 'newspapers_com' line['publication_id'] = get_from_object(record, 'rec', 'cover', 'publicationId') line['publication_title'] = get_from_object(record, 'rec', 'pubMetadata', 'publication-title') line['search_date'] = search_date line['page'] = get_from_object(record, 'rec', 'cover', 'title') line['href'] = "http://www.newspapers.com/image/" + str(record['rec']['cover']['id']) + "/?terms=" + record['terms'] line['search_terms'] = search_terms return line #Scrape function def scrape(search_terms, day): sleep(1) print day #Create search query query_form = make_search_query(search_terms, day, 1000) #POST search query wait = 0 while True: try: matches = session.post(search_url, data = query_form, cookies=session.cookies, allow_redirects=True, headers={'referer' : 'http://www.newspapers.com/search/'}, timeout=(1,60)).text break except: print "... trying again ..." 
sleep(1.5**wait) wait += 1 #Create search content query results = json.loads(matches) if results['recCount'] > 0: #records = make_record_dict(results['records']) #print "Made " #Get articles articles = get_content(results['records']) lines = [] for article in articles: lines.append(get_article_data(article, day)) return lines else: return None #Complete Scraper date_list = [str(date) for date in perdelta(start_date, end_date, timedelta(days=1))] #Start session session = requests.session() #Log in signin = session.get(signin_url) doc = lxml.html.fromstring(signin.text) signin_form = doc.forms[0] signin_form.fields['username'] = "******" signin_form.fields['password'] = "******" session.post(signin_url, data=signin_form.form_values(), allow_redirects=True) #Create CSV #Create file name timeperiod = str(start_date) + "to" + str(end_date - timedelta(days=1)) filename = "newspapers_com-" + timeperiod + ".csv" fields = ["archive", "publication_title", "publication_id", "search_date", "page", "href", "search_terms"] pool = Pool(10) results_iter = pool.imap(scrape, [search_terms]*len(date_list), date_list) with open("/".join((filepath,filename)), "w") as w: writer = csv.DictWriter(w, fieldnames=fields) writer.writeheader() #Loop over days for results in results_iter: if results != None: writer.writerows(results)
    state = re.search("^[^(--)]+(?=--)", str(location_raw)).group(0)
    line['location'] = ", ".join((city, state))
    line['lastUpdated'] = search_datetime

    #Get paper publication dates
    paper_date_set = Set([x['date_issued'] for x in paper_data['issues']])
    date_match = {k : int(k in paper_date_set) for k in date_list}
    line.update(date_match)
    return line

#Scrape publication data
print "Getting publication data..."
pool = Pool(10)
result_iter = pool.imap(scrape_paper, paper_stubs, [date_list]*len(paper_stubs))

lines = []
for result in result_iter:
    lines.append(result)

#Prepare for write
filename = "chronicling_america-allpubs.csv"
filepath = directory
fields = ['archive', 'publication_title', 'publication_id', 'location', 'lastUpdated'] + date_list

print "Creating data rows..."
out = []
for line in lines:
    line['publication_title'] = line['publication_title'].encode('utf8')
    line['location'] = line['location'].encode('utf8')
    out.append([line[k] for k in fields])
def main(): """Run preprocessing process.""" parser = argparse.ArgumentParser( description= "Preprocess audio and then extract features (See detail in tensorflow_tts/bin/preprocess.py)." ) parser.add_argument("--rootdir", default=None, type=str, required=True, help="root path.") parser.add_argument("--outdir", default=None, type=str, required=True, help="output dir.") parser.add_argument("--config", type=str, required=True, help="yaml format configuration file.") parser.add_argument("--n_cpus", type=int, default=4, required=False, help="yaml format configuration file.") parser.add_argument("--test_size", type=float, default=0.05, required=False, help="yaml format configuration file.") parser.add_argument( "--verbose", type=int, default=1, help="logging level. higher is more logging. (default=1)") args = parser.parse_args() # set logger if args.verbose > 1: logging.basicConfig( level=logging.DEBUG, format= "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s") elif args.verbose > 0: logging.basicConfig( level=logging.INFO, format= "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s") else: logging.basicConfig( level=logging.WARN, format= "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s") logging.warning('Skip DEBUG/INFO messages') # load config with open(args.config) as f: config = yaml.load(f, Loader=yaml.Loader) config.update(vars(args)) processor = LJSpeechProcessor(root_path=args.rootdir, cleaner_names="english_cleaners") # check directly existence if not os.path.exists(args.outdir): os.makedirs(args.outdir, exist_ok=True) os.makedirs(os.path.join(args.outdir, 'valid'), exist_ok=True) os.makedirs(os.path.join(args.outdir, 'valid', 'raw-feats'), exist_ok=True) os.makedirs(os.path.join(args.outdir, 'valid', 'wavs'), exist_ok=True) os.makedirs(os.path.join(args.outdir, 'valid', 'ids'), exist_ok=True) os.makedirs(os.path.join(args.outdir, 'valid', 'raw-f0'), exist_ok=True) os.makedirs(os.path.join(args.outdir, 'valid', 'raw-energies'), exist_ok=True) os.makedirs(os.path.join(args.outdir, 'train'), exist_ok=True) os.makedirs(os.path.join(args.outdir, 'train', 'raw-feats'), exist_ok=True) os.makedirs(os.path.join(args.outdir, 'train', 'wavs'), exist_ok=True) os.makedirs(os.path.join(args.outdir, 'train', 'ids'), exist_ok=True) os.makedirs(os.path.join(args.outdir, 'train', 'raw-f0'), exist_ok=True) os.makedirs(os.path.join(args.outdir, 'train', 'raw-energies'), exist_ok=True) # train test split idx_train, idx_valid = train_test_split(range(len(processor.items)), shuffle=True, test_size=args.test_size, random_state=42) # train/valid utt_ids train_utt_ids = [] valid_utt_ids = [] for idx in range(len(processor.items)): utt_ids = processor.get_one_sample(idx)["utt_id"] if idx in idx_train: train_utt_ids.append(utt_ids) elif idx in idx_valid: valid_utt_ids.append(utt_ids) # save train and valid utt_ids to track later. np.save(os.path.join(args.outdir, "train_utt_ids.npy"), train_utt_ids) np.save(os.path.join(args.outdir, "valid_utt_ids.npy"), valid_utt_ids) # process each data def save_to_file(idx): sample = processor.get_one_sample(idx) # get info from sample. audio = sample["audio"] text_ids = sample["text_ids"] utt_id = sample["utt_id"] rate = sample["rate"] # check assert len(audio.shape) == 1, \ f"{utt_id} seems to be multi-channel signal." assert np.abs(audio).max() <= 1.0, \ f"{utt_id} seems to be different from 16 bit PCM." assert rate == config["sampling_rate"], \ f"{utt_id} seems to have a different sampling rate." 
# trim silence if config["trim_silence"]: audio, _ = librosa.effects.trim( audio, top_db=config["trim_threshold_in_db"], frame_length=config["trim_frame_size"], hop_length=config["trim_hop_size"]) if "sampling_rate_for_feats" not in config: x = audio sampling_rate = config["sampling_rate"] hop_size = config["hop_size"] else: x = librosa.resample(audio, rate, config["sampling_rate_for_feats"]) sampling_rate = config["sampling_rate_for_feats"] assert config["hop_size"] * config["sampling_rate_for_feats"] % rate == 0, \ "hop_size must be int value. please check sampling_rate_for_feats is correct." hop_size = config["hop_size"] * config[ "sampling_rate_for_feats"] // rate # extract feature mel, x_stft = logmelfilterbank(x, sampling_rate=sampling_rate, hop_size=hop_size, fft_size=config["fft_size"], win_length=config["win_length"], window=config["window"], num_mels=config["num_mels"], fmin=config["fmin"], fmax=config["fmax"]) # make sure the audio length and feature length audio = np.pad(audio, (0, config["fft_size"]), mode='edge') audio = audio[:len(mel) * config["hop_size"]] # extract raw pitch f0, _ = pw.dio(x.astype(np.double), fs=config["sampling_rate"], f0_ceil=config["fmax"], frame_period=1000 * config["hop_size"] / config["sampling_rate"]) if len(f0) >= len(mel): f0 = f0[:len(mel)] else: f0 = np.pad(f0, ((0, len(mel) - len(f0)))) # extract energy S = librosa.magphase(x_stft)[0] energy = np.sqrt(np.sum(S**2, axis=0)) assert len(mel) * config["hop_size"] == len(audio) assert len(mel) == len(f0) == len(energy) # apply global gain if config["global_gain_scale"] > 0.0: audio *= config["global_gain_scale"] if np.abs(audio).max() >= 1.0: logging.warn(f"{utt_id} causes clipping. " f"it is better to re-consider global gain scale.") # save if config["format"] == "npy": if idx in idx_train: subdir = 'train' elif idx in idx_valid: subdir = 'valid' np.save(os.path.join(args.outdir, subdir, "wavs", f"{utt_id}-wave.npy"), audio.astype(np.float32), allow_pickle=False) np.save(os.path.join(args.outdir, subdir, "raw-feats", f"{utt_id}-raw-feats.npy"), mel.astype(np.float32), allow_pickle=False) np.save(os.path.join(args.outdir, subdir, "ids", f"{utt_id}-ids.npy"), text_ids.astype(np.int32), allow_pickle=False) np.save(os.path.join(args.outdir, subdir, "raw-f0", f"{utt_id}-raw-f0.npy"), f0.astype(np.float32), allow_pickle=False) np.save(os.path.join(args.outdir, subdir, "raw-energies", f"{utt_id}-raw-energy.npy"), energy.astype(np.float32), allow_pickle=False) else: raise ValueError("support only npy format.") # apply multi-processing Pool p = Pool(nodes=args.n_cpus) work = tqdm(range(len(processor.items)), desc="[Preprocessing]") list(p.imap(save_to_file, work))
    try:
        get_states = requests.get(nation_url, timeout=(1,60)).text
        break
    except:
        sleep(1.5**wait)
        wait += 1

parsed = BeautifulSoup(get_states, 'html.parser')
state_urls = [a['href'] for a in parsed.find('div', class_='newLocUSListArea').find_all('a')]

################
#Get town links#
################
print "Getting town URLs..."
pool = Pool(10)
result_iter = pool.imap(get_town_urls, state_urls)

town_urls = []
for result in result_iter:
    town_urls += result

#Clean up town URLs
town_urls = [re.sub("st\.-","st-",url) for url in town_urls]

#################
#Get paper links#
#################
print "Getting paper URLs..."
result_iter = pool.imap(get_paper_urls, town_urls)

paper_urls = []
def RunCutplan(self, timer=True): # initialisations id = [] t = time() start_id = 0 if id == []: start_id = sum(self.completed) id = list(range(start_id, self.Cutplans.shape[0])) numproc = cpu_count() - 2 p = Pool(processes=numproc) # cpSched = self.Cutplans.iloc[id] # NumLogs = min( # [self.Cutplans.LogCount[id]*1.5, [1000]*len(id)], axis=0) iterLog = [] iterC = [] lens = [] descs = [] rs = [] for cID in id: c = self.Cutplans.iloc[cID] # find desc to be used to open the correct log data file desc = c.Description[2:4]+"-"+str(int(c.Description[5:7])-1) descs.append(desc) # get data from log data file FullLD = read_csv(self.logPath+desc+'.csv') # total = NumLogs[id.index(cID)] # lMax = int(min([total, FullLD.shape[0]])) # lID = randint(0, FullLD.shape[0]-lMax) # LD = # FullLD.iloc[lID:lID+lMax].reset_index().drop('index', axis=1) lens.append(FullLD.shape[0]) # Set up lists for multiprocessing for i in range(FullLD.shape[0]): log = FullLD.iloc[i] iterLog.append(log) iterC.append(c) descs.append(desc) rs.append(Recovery(c)) # ============================================================================= # completed = [] # for lID in range(len(iterLog)): # log = iterLog[lID] # coords = GetLogCoords(log, c) # completed.append(coords) # Timer(id, cID, lID, time()-t) # ============================================================================= # if id.index(cID) > 0: # p.restart() data = [] data = p.imap(GetLogCoords, iterLog, iterC) completed = [] i = 0 j = 0 LogRecoveries = self.CreateRecoveriesDF(lens[i]) while j < len(iterLog): # try: j += 1 res = next(iter(data)) completed.append(res) if self.abort: self.abort = False return if timer: cur = j - sum(lens[0:i]) rs[j-1].RunRecovery(res) self.LogTransfer(iterLog[j-1], LogRecoveries, cur-1) self.RecoveryTransfer( rs[j-1], LogRecoveries, cur-1, descs[j-1]) Timer((i+1, len(id)), j, len(iterLog), time()-t) self.l_progress.emit(cur/lens[i]) if cur == lens[i]: self.completed[cID] = True self.cp_progress.emit(i) self.Recoveries[i] = DataFrame(LogRecoveries) i += 1 if i < len(lens): LogRecoveries = self.CreateRecoveriesDF(lens[i]) # except BaseException: # print("fail") # break # self.finishedSim.emit() # start = 0 # for l in lens: # LogRecoveries = self.CreateRecoveriesDF(l) # for i in range(l): # self.LogTransfer(iterLog[i], LogRecoveries, i) # self.RecoveryTransfer( # rs[start+i], LogRecoveries, lID, descs[start+i]) # self.Recoveries[cID] = DataFrame(LogRecoveries) # start += l self.finished.emit()
def main(self):
    # intended as a method on the GUI class that provides self.filename,
    # self.progress1 and self.progress2
    parser = argparse.ArgumentParser()
    parser.add_argument('source_path',
                        help="Path to the video or audio file to subtitle",
                        nargs='?')
    parser.add_argument('-C', '--concurrency',
                        help="Number of concurrent API requests to make",
                        type=int, default=10)
    parser.add_argument('-o', '--output',
                        help="Output path for subtitles (by default, subtitles are "
                             "saved in the same directory and name as the source path)")
    parser.add_argument('-F', '--format',
                        help="Destination subtitle format",
                        default="srt")
    parser.add_argument('-S', '--src-language',
                        help="Language spoken in source file",
                        default="en")
    parser.add_argument('-D', '--dst-language',
                        help="Desired language for the subtitles",
                        default="en")
    parser.add_argument('-K', '--api-key',
                        help="The Google Translate API key to be used. "
                             "(Required for subtitle translation)")
    parser.add_argument('--list-formats',
                        help="List all available subtitle formats",
                        action='store_true')
    parser.add_argument('--list-languages',
                        help="List all available source/destination languages",
                        action='store_true')

    args = parser.parse_args()
    print(args)

    if os.name == "posix":
        args.source_path = str(self.filename)
    else:
        args.source_path = str(self.filename).replace("/", "\\")
        pas = args.source_path.replace("/", "\\")
        args.source_path = pas
        print(" Printing pas >>>", pas)
    print(args)

    path = args.source_path[:-3]
    srt_path = path + "srt"

    if args.list_formats:
        print("List of formats:")
        for subtitle_format in FORMATTERS.keys():
            print("{format}".format(format=subtitle_format))
        return 0

    if args.list_languages:
        print("List of all languages:")
        for code, language in sorted(LANGUAGE_CODES.items()):
            print("{code}\t{language}".format(code=code, language=language))
        return 0

    if args.format not in FORMATTERS.keys():
        print("Subtitle format not supported. "
              "Run with --list-formats to see all supported formats.")
        return 1

    if args.src_language not in LANGUAGE_CODES.keys():
        print("Source language not supported. "
              "Run with --list-languages to see all supported languages.")
        return 1

    if args.dst_language not in LANGUAGE_CODES.keys():
        print("Destination language not supported. "
              "Run with --list-languages to see all supported languages.")
        return 1

    if not args.source_path:
        print("Error: You need to specify a source path.")
        return 1

    audio_filename, audio_rate = extract_audio(args.source_path)
    regions = find_speech_regions(audio_filename)

    pool = ProcessingPool(args.concurrency)
    converter = FLACConverter(source_path=audio_filename)
    recognizer = SpeechRecognizer(language=args.src_language,
                                  rate=audio_rate,
                                  api_key=GOOGLE_SPEECH_API_KEY)

    transcripts = []
    if regions:
        try:
            widgets = ["Converting speech regions to FLAC files: ",
                       Percentage(), ' ', Bar(), ' ', ETA()]
            pbar = ProgressBar(widgets=widgets, maxval=len(regions)).start()
            extracted_regions = []
            for i, extracted_region in enumerate(pool.imap(converter, regions)):
                extracted_regions.append(extracted_region)
                pbar.update(i)
                self.progress1.setValue(i)
            pbar.finish()

            widgets = ["Performing speech recognition: ",
                       Percentage(), ' ', Bar(), ' ', ETA()]
            pbar = ProgressBar(widgets=widgets, maxval=len(regions)).start()
            for i, transcript in enumerate(pool.imap(recognizer, extracted_regions)):
                transcripts.append(transcript)
                pbar.update(i)
                self.progress2.setValue(i)
            pbar.finish()
            QMessageBox.about(self, "Subtitles created", "Created at " + srt_path)

            if not is_same_language(args.src_language, args.dst_language):
                if args.api_key:
                    google_translate_api_key = args.api_key
                    translator = Translator(args.dst_language,
                                            google_translate_api_key,
                                            dst=args.dst_language,
                                            src=args.src_language)
                    prompt = "Translating from {0} to {1}: ".format(
                        args.src_language, args.dst_language)
                    widgets = [prompt, Percentage(), ' ', Bar(), ' ', ETA()]
                    pbar = ProgressBar(widgets=widgets, maxval=len(regions)).start()
                    translated_transcripts = []
                    for i, transcript in enumerate(pool.imap(translator, transcripts)):
                        translated_transcripts.append(transcript)
                        pbar.update(i)
                        self.progress2.setValue(i)
                    pbar.finish()
                    transcripts = translated_transcripts
                else:
                    print("Error: Subtitle translation requires a specified "
                          "Google Translate API key. See --help for further information.")
                    return 1
        except KeyboardInterrupt:
            pbar.finish()
            pool.terminate()
            pool.join()
            print("Cancelling transcription")
            return 1

    timed_subtitles = [(r, t) for r, t in zip(regions, transcripts) if t]
    formatter = FORMATTERS.get(args.format)
    formatted_subtitles = formatter(timed_subtitles)

    dest = args.output
    if not dest:
        base, ext = os.path.splitext(args.source_path)
        dest = "{base}.{format}".format(base=base, format=args.format)

    with open(dest, 'wb') as f:
        f.write(formatted_subtitles.encode("utf-8"))
    print("Subtitles file created at {}".format(dest))

    os.remove(audio_filename)
    return 0
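# A minimal sketch of the ordered imap-plus-progress pattern used in main()
# above: enumerate over pool.imap so results arrive in the same order as the
# inputs, update a progress bar per item, and collect the outputs so they can
# later be zipped back with the inputs. It assumes the pathos ProcessingPool
# and progressbar packages used by the surrounding script; fake_convert and
# the regions list are hypothetical stand-ins for the FLACConverter callable
# and the detected speech regions.
from pathos.multiprocessing import ProcessingPool
from progressbar import ProgressBar, Percentage, Bar, ETA


def fake_convert(region):
    # placeholder for converter(region) / recognizer(region)
    start, end = region
    return "chunk-{}-{}".format(start, end)


if __name__ == '__main__':
    regions = [(0.0, 1.5), (1.5, 3.0), (3.0, 4.2)]
    pool = ProcessingPool(nodes=2)
    widgets = ["Converting speech regions: ", Percentage(), ' ', Bar(), ' ', ETA()]
    pbar = ProgressBar(widgets=widgets, maxval=len(regions)).start()
    outputs = []
    for i, out in enumerate(pool.imap(fake_convert, regions)):
        outputs.append(out)
        pbar.update(i)
    pbar.finish()
    # order is preserved, so inputs and outputs can be paired afterwards
    print(list(zip(regions, outputs)))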
class analyze(setup.setup):

    def __init__(self, args, logging_level=logging.INFO):
        super(analyze, self).__init__(args, logging_level)

    # set up processing pool and run all analyses specified in args
    def run(self):
        if self.args.jumpdists:
            n_bins = 100.
            bin_width = 1 / n_bins
            bins = np.arange(0, 1 + bin_width, 1 / n_bins)

            if self.args.file:
                user, vals = self.artist_jump_distributions(
                    self.args.file, bins=bins, self_jumps=False)
                with open(self.args.resultdir + user, 'w') as fout:
                    fout.write(','.join(vals.astype(str)) + '\n')
            else:
                raise NotImplementedError('not implemented!')
                # pool-based path over all listen files
                # (unreachable while the raise above is in place)
                self.pool = Pool(self.args.n)
                self.rootLogger.info("Pool started")
                self.rootLogger.info("Starting jump distance analysis")
                func_partial = partial(self.artist_jump_distributions,
                                       bins=bins, self_jumps=False)
                with open(self.args.resultdir + 'jumpdists', 'w') as fout:
                    for user, vals in self.pool.imap(func_partial, self.listen_files):
                        fout.write(user + '\t' + ','.join(vals.astype(str)) + '\n')
                self.pool.close()
                self.rootLogger.info("Pool closed")

        if self.args.blockdists:
            # self.rootLogger.info("Starting block distance analysis")
            self.mean_block_distances(self.args.file)

        if self.args.diversity_dists:
            bins = np.arange(0, 1.01, .01)
            self.diversity_distributions(self.args.file, bins=bins)

        if self.args.clustering:
            self.clustering(self.args.file)

        if self.args.values:
            self.patch_values(self.args.file)

        if self.args.exp:
            self.explore_exploit(self.args.file)

        if self.args.patch_len_dists:
            self.patch_len_dists(self.args.file)

    # calculate distribution (using histogram with specified bins)
    # of sequential artist-to-artist distances
    def artist_jump_distributions(self, fi, bins, self_jumps=False):
        user = fi.split('/')[-1][:-4]
        df = pd.read_pickle(fi)
        if self_jumps:
            vals = np.histogram(df['dist'].dropna(), bins=bins)[0]
        else:
            vals = np.histogram(df['dist'][df['dist'] > 0], bins=bins)[0]
        self.rootLogger.info(
            'artist jump distances done for user {} ({})'.format(user, fi))
        return user, vals

    # calculate distribution (using histogram with specified bins)
    # of patch diversity for each user
    # awk 'FNR==1' * > diversity_dists_zeros
    # awk 'FNR==2' * > diversity_dists_nozeros
    def diversity_distributions(self, fi, bins):
        if 'patches' not in fi:
            raise ValueError('WRONG DATATYPE')
        user = fi.split('/')[-1].split('_')[0]
        df = pd.read_pickle(fi).dropna(subset=['diversity'])
        zeros = np.histogram(df[df['n'] >= 5]['diversity'], bins=bins)[0]
        nozeros = np.histogram(
            df[(df['n'] >= 5) & (df['diversity'] > 0)]['diversity'], bins=bins)[0]

        zeros = zeros / float(zeros.sum())
        nozeros = nozeros / float(nozeros.sum())

        with open(self.args.resultdir + user, 'w') as fout:
            fout.write(user + '\t' + 'zeros' + '\t' +
                       ','.join(zeros.astype(str)) + '\n')
            fout.write(user + '\t' + 'nozeros' + '\t' +
                       ','.join(nozeros.astype(str)) + '\n')

        self.rootLogger.info(
            'diversity distributions done for user {} ({})'.format(user, fi))

    def mean_block_distances(self, fi, n=100):

        def cos_nan(arr1, arr2):
            if np.any(np.isnan(arr1)) or np.any(np.isnan(arr2)):
                return np.nan
            return cosine(arr1, arr2)

        user = fi.split('/')[-1].split('_')[0]
        df = pd.read_pickle(fi)
        blocks = df[df['n'] >= 5].dropna()

        result = []
        for i in range(len(blocks) - n):
            first = blocks['centroid'].iloc[i]
            result.append(
                np.array(blocks['centroid'][i + 1:i + n + 1].apply(
                    lambda val: cos_nan(val, first))))
        result = np.nanmean(np.vstack(result), 0)

        with open(self.args.resultdir + user, 'w') as fout:
            fout.write('\t'.join([user, 'patch',
                                  ','.join(result.astype(str))]) + '\n')
        self.rootLogger.info(
            'Block distances for user {} processed successfully ({})'.format(user, fi))

        # now shuffled
        # idx = np.array(blocks.index)
        # np.random.shuffle(idx)
        # blocks = blocks.reindex(idx)
        # result_random = []
        # for i in range(len(blocks)-n):
        #     first = blocks['centroid'].iloc[i]
        #     result_random.append(np.array(blocks['centroid'][i+1:i+n+1].apply(
        #         lambda val: cos_nan(val, first))))
        # result_random = np.nanmean(np.vstack(result_random), 0)
        # with open(self.args.resultdir+user, 'w') as fout:
        #     fout.write('\t'.join([user, 'patch', ','.join(result.astype(str))])+'\n')
        #     fout.write('\t'.join([user, 'patch_random',
        #                           ','.join(result_random.astype(str))])+'\n')
        # self.rootLogger.info('Block distances for user {} processed successfully ({})'.format(user, fi))

    def clustering(self, fi):
        df = pd.read_pickle(fi)
        user = fi.split('/')[-1].split('_')[0]
        mask = (df['centroid'].apply(lambda arr: ~np.any(np.isnan(arr))).values) & \
               (df['n'] >= 5) & (df['diversity'] <= 0.2)
        clust_data = df[mask].reset_index()
        arr = np.vstack(clust_data['centroid'])
        Z = linkage(arr, 'complete')
        clusters = fcluster(Z, t=0.2, criterion='distance')
        assignments = np.repeat(np.nan, len(df))
        assignments[np.where(mask)] = clusters
        df['patch_clust'] = assignments
        df.to_pickle('{}{}.pkl'.format(self.args.resultdir, user))
        self.rootLogger.info(
            'Patch clusters for user {} processed successfully ({})'.format(user, fi))

    def patch_len_dists(self, fi):
        df = pd.read_pickle(fi)
        user = fi.split('/')[-1][:-4]

        explore = df[np.isnan(df['patch_clust'])]
        result_explore = explore['n'].value_counts()

        df['explore'] = np.isnan(df['patch_clust']).astype(int)
        df['explore-idx'] = df['explore'].cumsum()
        result_exploit = df.groupby('explore-idx').apply(
            lambda df: df.dropna()['n'].sum()).value_counts()

        result_explore = result_explore.reindex(
            range(1, max(result_explore.index) + 1), fill_value=0.).values
        result_exploit = result_exploit.reindex(
            range(1, max(result_exploit.index) + 1), fill_value=0.).values

        result_explore = sparse.csr_matrix(result_explore)
        result_exploit = sparse.csr_matrix(result_exploit)

        with open(self.args.resultdir + user, 'w') as fout:
            fout.write(user + '\t' + 'explore' + '\t' + ':'.join(
                [','.join(a.astype(str)) for a in
                 (result_explore.data, result_explore.indices, result_explore.indptr)]) + '\n')
            fout.write(user + '\t' + 'exploit' + '\t' + ':'.join(
                [','.join(a.astype(str)) for a in
                 (result_exploit.data, result_exploit.indices, result_exploit.indptr)]) + '\n')

        self.rootLogger.info(
            'User {} processed successfully ({})'.format(user, fi))

    def explore_exploit(self, fi):
        user = fi.split('/')[-1][:-4]
        df_patches_raw = pd.read_pickle(fi)

        # add time in next bout
        df_patches_raw['next_n'] = df_patches_raw['n'].shift(-1)

        # add patch values
        # listensPerPatch = df_patches_raw.groupby('patch_clust')['n'].sum()
        # overall_prop = listensPerPatch/float(df_patches_raw['n'].sum())
        # overall_prop.name = 'final_value'
        # df_patches_raw = df_patches_raw.join(overall_prop, on='patch_clust')

        """
        # time in next exploit patch as function of exploration time
        result = df_patches_raw[np.isnan(df_patches_raw['patch_clust'])].groupby('n')['next_n'].mean()
        fout.write(user+'\t'+'next-exploit-vs-explore'+'\t'+','.join(["{}:{}".format(a,b) for a,b in result.iteritems()])+'\n')
        """

        # total time exploiting as a function of time exploring
        df_patches_raw['explore'] = np.isnan(
            df_patches_raw['patch_clust']).astype(int)
        df_patches_raw['explore-idx'] = df_patches_raw['explore'].cumsum()

        # combine all exploit listens
        # grp_explore = df_patches_raw.groupby('explore-idx').apply(
        #     lambda df: pd.DataFrame({'n': [df['n'].iloc[0]],
        #                              'n-exploit': [df['n'][1:].sum()]}))

        # only last exploit bout
        grp_explore = df_patches_raw.groupby('explore-idx').apply(
            lambda df: pd.DataFrame({
                'n': [df['n'].iloc[0]],
                'n-exploit': [df['n'].iloc[-1]]
            }))

        # result = grp_explore.groupby('n')['n-exploit'].mean()
        # fout.write(user+'\t'+'total-exploit-vs-explore'+'\t'+','.join(["{}:{}".format(a,b) for a,b in result.iteritems()])+'\n')

        """
        # exploration time as a function of exploitation time
        grp_exploit = grp_explore.copy()
        grp_exploit['n-explore'] = grp_exploit['n'].shift(-1)
        result = grp_exploit.groupby('n-exploit')['n-explore'].mean()
        fout.write(user+'\t'+'explore-vs-exploit'+'\t'+','.join(["{}:{}".format(a,b) for a,b in result.iteritems()])+'\n')
        """

        # prob exploit given explore time - already done
        # explore_only = df_patches_raw[np.isnan(df_patches_raw['patch_clust'])]
        # result = explore_only['n'][:-1].value_counts()
        # arr = result.reindex(range(1, max(result.index)+1), fill_value=0.).values
        # final_result = arr/(np.cumsum(arr[::-1])[::-1])
        # final_result = sparse.csr_matrix(final_result)
        # with open(self.args.resultdir+user+'_exploit', 'w') as fout:
        #     fout.write(user+'\t'+':'.join([','.join(a.astype(str)) for a in
        #                (final_result.data, final_result.indices, final_result.indptr)])+'\n')

        # prob explore given exploit time
        result = grp_explore['n-exploit'][grp_explore['n-exploit'] > 0].value_counts()
        arr = result.reindex(range(1, max(result.index) + 1), fill_value=0.).values
        final_result = arr / np.cumsum(arr[::-1])[::-1]
        final_result = sparse.csr_matrix(final_result)

        with open(self.args.resultdir + user + '_explore', 'w') as fout:
            fout.write(user + '\t' + ':'.join(
                [','.join(a.astype(str)) for a in
                 (final_result.data, final_result.indices, final_result.indptr)]) + '\n')

        # fout.write(user+'\t'+'prob-explore-given-exploit'+'\t'+','.join(["{}:{}".format(a,b) for a,b in result.iteritems()])+'\n')

        """
        # patch value as a function of exploration time
        df_patches_raw['final_value_next'] = df_patches_raw['final_value'].shift(-1)
        result = df_patches_raw[np.isnan(df_patches_raw['patch_clust'])].groupby('n')['final_value_next'].mean()
        fout.write(user+'\t'+'exploit-value-vs-explore'+'\t'+','.join(["{}:{}".format(a,b) for a,b in result.iteritems()])+'\n')
        """

        self.rootLogger.info(
            'User {} processed successfully ({})'.format(user, fi))
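# A minimal sketch of the patch-clustering step in analyze.clustering above:
# complete-linkage hierarchical clustering of patch centroids, cut at distance
# 0.2, with cluster labels written back only where the mask held and NaN
# elsewhere. The toy DataFrame and its centroids are hypothetical stand-ins
# for the per-user patch data loaded from pickle.
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import linkage, fcluster

df = pd.DataFrame({
    'centroid': [np.array([1.0, 0.0]), np.array([0.9, 0.1]),
                 np.array([0.0, 1.0]), np.array([np.nan, np.nan])],
    'n': [6, 7, 9, 12],
    'diversity': [0.05, 0.1, 0.15, 0.1],
})
# keep rows with a valid centroid, enough listens and low diversity
mask = (df['centroid'].apply(lambda arr: ~np.any(np.isnan(arr))).values) \
    & (df['n'] >= 5) & (df['diversity'] <= 0.2)
arr = np.vstack(df[mask]['centroid'])
Z = linkage(arr, 'complete')                       # hierarchical clustering
clusters = fcluster(Z, t=0.2, criterion='distance')
assignments = np.repeat(np.nan, len(df))
assignments[np.where(mask)] = clusters             # NaN marks unclustered rows
df['patch_clust'] = assignments
print(df[['n', 'patch_clust']])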
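# A minimal sketch of the sparse-vector serialisation used by patch_len_dists
# and explore_exploit above: a dense count vector is stored as a CSR matrix
# and written out as colon-separated data/indices/indptr strings, which keeps
# long, mostly-zero histograms compact in the per-user result files. The
# encode_csr/decode_csr round-trip below is an illustrative reconstruction,
# not part of the original class.
import numpy as np
from scipy import sparse


def encode_csr(arr):
    # serialise a dense 1-D vector as "data:indices:indptr"
    m = sparse.csr_matrix(arr)
    return ':'.join(','.join(a.astype(str)) for a in (m.data, m.indices, m.indptr))


def decode_csr(text, length):
    # rebuild the dense vector from the serialised CSR pieces
    data, indices, indptr = (np.array(part.split(','), dtype=float)
                             for part in text.split(':'))
    m = sparse.csr_matrix((data, indices.astype(int), indptr.astype(int)),
                          shape=(1, length))
    return m.toarray().ravel()


if __name__ == '__main__':
    counts = np.array([0., 3., 0., 0., 7., 0.])
    line = encode_csr(counts)
    print(line)                           # e.g. "3.0,7.0:1,4:0,2"
    print(decode_csr(line, len(counts)))  # back to the dense vector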