def make_val_set(df, split_percentage=0.2, drop_percentage=0.):
    """Split a product table into per-image training and validation tables.

    Products are grouped by the value at position 4 of each
    ``df.itertuples()`` row (used as the category id). Within every group a
    random ``drop_percentage`` of the products is discarded and a random
    ``split_percentage`` of the remainder is assigned to validation.

    Args:
        df: DataFrame indexed by product id, with a "num_imgs" column.
        split_percentage: fraction of each group's kept products used for
            validation.
        drop_percentage: fraction of each group's products to discard.

    Returns:
        ``(train_df, val_df)``, each with one row per image and the columns
        ``product_id``, ``category_idx`` and ``img_idx``.
    """
    # Find the product_ids for each category.
    category_dict = defaultdict(list)
    for ir in tqdm(df.itertuples()):
        category_dict[ir[4]].append(ir[0])

    train_list = []
    val_list = []
    with tqdm(total=len(df)) as pbar:
        for category_id, product_ids in category_dict.items():
            category_idx = cat2idx[category_id]

            # Randomly remove products to make the dataset smaller.
            keep_size = int(len(product_ids) * (1. - drop_percentage))
            if keep_size < len(product_ids):
                product_ids = np.random.choice(product_ids, keep_size, replace=False)

            # Randomly choose the products that become part of the validation
            # set. A set gives O(1) membership tests instead of scanning an
            # array once per image.
            val_size = int(len(product_ids) * split_percentage)
            if val_size > 0:
                val_ids = set(np.random.choice(product_ids, val_size, replace=False))
            else:
                val_ids = set()

            # Create a new row for each image.
            for product_id in product_ids:
                row = [product_id, category_idx]
                # Membership does not depend on the image, so decide once.
                target = val_list if product_id in val_ids else train_list
                for img_idx in range(df.loc[product_id, "num_imgs"]):
                    target.append(row + [img_idx])
                pbar.update()

    columns = ["product_id", "category_idx", "img_idx"]
    train_df = pd.DataFrame(train_list, columns=columns)
    val_df = pd.DataFrame(val_list, columns=columns)
    return train_df, val_df
def maxent_motifs(N, L, desired_ic, num_motifs, tolerance=10**-10, A=4, beta=None,
                  countses=None, entropies=None, log_cols=None, verbose=False):
    """Sample ``num_motifs`` motifs of ``N`` sites and ``L`` columns.

    Any of ``countses``, ``entropies``, ``log_cols`` and ``beta`` that the
    caller does not supply is computed here. When ``beta`` is missing,
    ``desired_ic`` is first increased by ``L * (A-1)/(2*log(2)*N)`` and beta
    is obtained from ``find_beta_for_mean_motif_ic``.

    Returns a list with one entry per motif; each entry is the result of
    mapping ``"".join`` over the transposed sampled columns.
    """
    # Fill in the lookup tables that were not passed in.
    if countses is None:
        logger("countses", verbose)
        countses = enumerate_counts(N, A, verbose=verbose)
    if entropies is None:
        logger("entropies", verbose)
        entropies = np.array(map(entropy_from_counts, tqdm(countses)))
    if log_cols is None:
        count_source = tqdm(countses) if verbose else countses
        logger("log_cols", verbose)
        log_cols = np.array([log_counts_to_cols(c, A=A) for c in count_source])
    if beta is None:
        per_column_correction = (A-1)/(2*log(2)*N)
        desired_ic += L * per_column_correction
        beta = find_beta_for_mean_motif_ic(N, L, desired_ic,
                                           tolerance=tolerance, verbose=verbose,
                                           A=A, countses=countses,
                                           entropies=entropies, log_cols=log_cols)
    logger("beta: %s" % beta, verbose)
    logger("computing count ps from beta", verbose)
    ps = count_ps_from_beta(N, beta, A=A, verbose=verbose,
                            log_cols=log_cols, entropies=entropies)
    count_sampler = inverse_cdf_sampler(countses, ps)

    def draw_motif():
        # Draw all L count vectors first, then realise each as a column.
        drawn_counts = [count_sampler() for _ in range(L)]
        columns = [sample_col_from_count(cnt, A=A) for cnt in drawn_counts]
        return map("".join, transpose(columns))

    repeat = trange if verbose else xrange
    if verbose:
        print("sampling")
    return [draw_motif() for _ in repeat(num_motifs)]
def experiment4(L=10):
    """Grid-search whether linear or apw models fare better under different regimes.

    For every (sigma, mu, Ne) combination on a 5x5x5 grid, a model is sampled,
    a Metropolis-Hastings chain is run with ``mh`` and the mean of the
    captured occupancies after the first 25000 states is recorded.

    Returns:
        ``(apws, linears)``: two flat lists ordered sigma-major, then mu, then
        Ne. (Previously the results were computed and then discarded.)
    """
    def apw_occ(code, mu, site):
        ep = score(code, site)
        return 1/(1+exp(ep-mu))

    def linear_occ(pssm, mu, site):
        ep = score_seq(pssm, site)
        return 1/(1+exp(ep-mu))

    def apw_fit(sigma, mu, Ne):
        code = sample_code(L, sigma)

        def apw_phat(site):
            ep = score(code, site)
            return 1/(1+exp(ep-mu))**(Ne-1)

        chain = mh(apw_phat,
                   proposal=mutate_site,
                   x0=random_site(L),
                   capture_state=lambda s: apw_occ(code, mu, s))[25000:]
        return mean(chain)

    def linear_fit(sigma, mu, Ne):
        pssm = sample_matrix(L, sigma)

        def linear_phat(site):
            ep = score_seq(pssm, site)
            return 1/(1+exp(ep-mu))**(Ne-1)

        chain = mh(linear_phat,
                   proposal=mutate_site,
                   x0=random_site(L),
                   capture_state=lambda s: linear_occ(pssm, mu, s))[25000:]
        return mean(chain)

    sigmas = np.linspace(0, 5, 5)
    mus = np.linspace(-10, 10, 5)
    Nes = np.linspace(0, 5, 5)
    apws = [apw_fit(sigma, mu, Ne)
            for sigma in tqdm(sigmas) for mu in mus for Ne in Nes]
    linears = [linear_fit(sigma, mu, Ne)
               for sigma in tqdm(sigmas) for mu in mus for Ne in Nes]
    return apws, linears
def find_beta_for_mean_col_ic(n, desired_ic_per_col,tolerance=10**-10,verbose=False, A=4, countses=None, entropies=None, log_cols=None): """find beta such that entropy*exp(-beta*entropy)/Z = des_ent""" if countses is None: if verbose: print "enumerating countses" countses = enumerate_counts(n, A, verbose=verbose) if entropies is None: if verbose: print "enumerating entropies" entropies = np.array(map(entropy_from_counts, tqdm(countses))) else: entropies = np.array(map(entropy_from_counts, countses)) #cols = np.array(map(countses_to_cols, countses)) if log_cols is None: if verbose: print "enumerating cols" #cols = np.exp(np.array(map(log_counts_to_cols, countses))) iterator = tqdm(countses) if verbose else countses log_cols = np.array([log_counts_to_cols(counts, A=A) for counts in iterator]) def f2(beta): log_phats = np_log_normalize(log_cols + -beta*entropies) expected_entropy = np.exp(log_phats).dot(entropies) return log2(A) - expected_entropy - desired_ic_per_col lb = -1 while f2(lb) > 0: lb *= 2 if lb < -1000: print "Warning, failed to find lower bound on beta" raise Exception("Couldn't find beta'") ub = 1000 while f2(ub) < 0: ub *= 2 print "raising upper bound to:",ub return secant_interval(f2,lb,ub,verbose=verbose,tolerance=tolerance)
def sigma_Ne_contour_plot(filename=None):
    """Draw a 2x2 grid of contour plots over a (sigma, Ne) grid.

    Panels, in order: expected occupancy, mean motif IC, mean motif Gini and
    mean total motif MI, the last three averaged over 100 sampled motifs per
    grid point. The figure is passed to ``maybesave(filename)``.
    """
    sigmas = np.linspace(0, 5, 20)
    Nes = np.linspace(1, 20, 20)
    L = 10
    n = 50
    copies = 10*n
    trials = 100

    # motifss[Ne index][sigma index] is a list of `trials` sampled motifs.
    motifss = [[[sample_motif(sigma, Ne, L, copies, n) for _ in range(trials)]
                for sigma in sigmas]
               for Ne in tqdm(Nes)]
    occ_M = [[expected_occupancy(sigma, Ne, L, copies) for sigma in sigmas]
             for Ne in tqdm(Nes)]
    print("ic_M")
    ic_M = mmap(lambda ms: mean(map(motif_ic, ms)), motifss)
    print("gini_M")
    gini_M = mmap(lambda ms: mean(map(motif_gini, ms)), motifss)
    print("mi_M")
    mi_M = mmap(lambda ms: mean(map(total_motif_mi, ms)), tqdm(motifss))

    for panel, surface in enumerate((occ_M, ic_M, gini_M, mi_M), 1):
        plt.subplot(2, 2, panel)
        plt.contourf(sigmas, Nes, surface, cmap='jet')
        plt.colorbar()
    maybesave(filename)
def make_api_call(crest_url_list):
    """Fetch every URL in ``crest_url_list`` and collect the decoded JSON.

    Requests are issued through a ``FuturesSession`` with 10 workers; the
    inner fetch is wrapped in ``RateLimited(150)``. Intermediate and final
    results are appended to ``futures``, ``res`` and ``sell_orders_list``,
    which are not defined in this function (presumably module-level lists).

    Returns:
        ``sell_orders_list``, holding one ``SellOrder(parsed_json, region)``
        per URL.
    """
    @RateLimited(150)
    def get_data(session, url):
        try:
            return session.get(url.full_url)
        except requests.ConnectionError:
            print("Connection Aborted - BadStatusLine")

    session = FuturesSession(max_workers=10)

    # Stage 1: queue every request.
    for target in tqdm(crest_url_list, desc="Downloading", leave=False):
        pending = get_data(session, target)
        futures.append(SellOrder(pending, target.region))

    # Stage 2: wait for each response.
    for queued in tqdm(futures, desc="Completing Requests", leave=False):
        response = queued.data.result()
        res.append(SellOrder(response, queued.region))

    # Stage 3: decode the response bodies.
    for done in res:
        payload = json.loads(done.data.content)
        sell_orders_list.append(SellOrder(payload, done.region))
    return sell_orders_list
def validate_sample_motif_neglect_fg2(iterations=50000):
    """Compare fg_neglect sampling to MCMC on the E. coli LexA motif.

    Draws ``iterations`` motifs with ``sample_motif_neglect_fg`` and runs a
    ``sella_hirsch_mh`` chain of the same length, then scatter-plots log
    fitness and IC against the Hamming distance to the ringer motif (sampled
    motifs in the default colour, chain states in green).
    """
    bio_motif = Escherichia_coli.LexA
    n = len(bio_motif)
    L = len(bio_motif[0])
    matrix = [[-ep for ep in row] for row in make_pssm(bio_motif)]
    ringer = ringer_motif(matrix, n)
    Ne = 2.375

    def summarize(motifs):
        # (distance to ringer, log fitness, IC) for every motif in `motifs`.
        rhos = [motif_hamming_distance(ringer, m) for m in tqdm(motifs)]
        log_fs = [log_fitness(matrix, m, G) for m in tqdm(motifs)]
        ics = map(motif_ic, motifs)
        return rhos, log_fs, ics

    random_motifs = [sample_motif_neglect_fg(matrix, n, Ne) for _ in trange(iterations)]
    random_rhos, random_log_fs, random_ics = summarize(random_motifs)
    _, chain = sella_hirsch_mh(matrix=matrix, init="ringer", Ne=Ne, n=n,
                               iterations=iterations)
    chain_rhos, chain_log_fs, chain_ics = summarize(chain)

    plt.subplot(1, 2, 1)
    plt.scatter(random_rhos, random_log_fs)
    plt.scatter(chain_rhos, chain_log_fs, color='g')
    plt.xlabel("rho")
    plt.ylabel("log fitness")

    plt.subplot(1, 2, 2)
    plt.scatter(random_rhos, random_ics)
    plt.scatter(chain_rhos, chain_ics, color='g')
    plt.xlabel("rho")
    plt.ylabel("IC")
def memeExt(id_list, file_name, dir_path=None):
    """Extract the text of selected motifs from a MEME-format file.

    A motif starts at any line containing "motif" (case-insensitive) and runs
    up to, but excluding, the line just before the next such line. The last
    motif in the file runs to the end of the file (this case previously
    raised IndexError).

    Args:
        id_list: strings to search for (case-insensitive) in the motif header
            lines; the first matching motif is used for each.
        file_name: file name inside ``dir_path``.
        dir_path: directory prefix, including the trailing separator.
            Defaults to the original hard-coded archive location.

    Returns:
        List of motif text blocks, one per id that matched a motif.
    """
    if dir_path is None:
        dir_path = 'D:\\KimSS-NAS\\LFG\\Works\\2016.04 Yeast HD LD Rho0\\(Archive) FIMO analysis\\'
    path = dir_path + file_name

    # 1. Read file
    with open(path, 'r') as meme_f:
        meme_fc = meme_f.readlines()
    print('1. Read file done: %d lines' % (len(meme_fc)))

    # 2. Search motif information and their index
    motifs_id = [i for i, s in enumerate(meme_fc) if 'motif' in s.lower()]
    motif_headers = [meme_fc[i].lower() for i in motifs_id]
    print('2. Search motifs done: %d motifs\n' % (len(motifs_id)))

    # 3. Search id_list in motif list
    my_motifs_id = []
    for sch in tqdm(id_list):
        needle = sch.lower()
        my_motifs_id.append([i for i, header in enumerate(motif_headers)
                             if needle in header])
    print('3. Extract my motifs done: %d motifs\n'% len(my_motifs_id))

    # 4. Slice out the text of the first hit for every id
    result = []
    for my in tqdm(my_motifs_id):
        if not my:
            continue
        first = my[0]
        start = motifs_id[first]
        if first + 1 < len(motifs_id):
            end = motifs_id[first + 1] - 1
        else:
            # Last motif in the file: there is no following header.
            end = len(meme_fc)
        result.append(''.join(meme_fc[start:end]))
    print('4. Summary my motifs done.%d motifs\n'% len(result))
    return result
def test(test_indexes, BG_img, params):
    """Build positive and negative test features for the given image indexes.

    Positive examples are the margin-expanded bounding boxes of every image.
    Negative examples are random boxes from randomly chosen images;
    ``pos_count * neg_weight`` draws are attempted and a draw is skipped when
    ``overlaps`` reports a hit (anything other than -1).

    ``BG_img`` is not used by the active code (the motion-image variant is
    disabled; ``extract`` receives ``None`` in its place).

    Returns:
        ``(test_features, test_labels, test_feature_count)`` with labels 1
        for positives and -1 for negatives.
    """
    folder = params["folder"]
    marginX = params["marginX"]
    marginY = params["marginY"]
    neg_weight = params["neg_weight"]
    method = params["method"]
    feature = params["feature"]

    test_features = []
    test_labels = []
    test_feature_count = 0

    def margin_boxes(image, index):
        # Bounding boxes of image `index`, widened by the configured margins.
        height, width = image.shape
        boxes = add_bbox_margin(read_bboxes(folder, index), marginX, marginY, height, width)
        return boxes, height, width

    print("Extracting positive test features...")
    # Read positive test examples
    for index in tqdm(test_indexes):
        img = img_read(folder, index)
        bboxes, _, _ = margin_boxes(img, index)
        for box in bboxes:
            img_cut = img[box[0]:box[1], box[2]:box[3]]
            test_feature_count += 1
            test_features.append(extract(img_cut, None, method, feature))
            test_labels.append(1)
    pos_test_feature_count = test_feature_count
    print("Positive test features are extracted.")

    print("Extracting negative test features...")
    # Read negative test examples
    for _ in tqdm(range(pos_test_feature_count*neg_weight)):
        index = sample(test_indexes, 1)[0]
        img = img_read(folder, index)
        bboxes, height, width = margin_boxes(img, index)
        neg_bb = rand_bbox(bboxes, height, width)
        if overlaps(neg_bb, bboxes) != -1:
            continue
        img_cut = img[neg_bb[0]:neg_bb[1], neg_bb[2]:neg_bb[3]]
        test_feature_count += 1
        test_features.append(extract(img_cut, None, method, feature))
        test_labels.append(-1)
    print("Negative test features are extracted.")
    return test_features, test_labels, test_feature_count
def on_off_experiment2(num_motifs=100,filename="gini-vs-mi-correlation-in-on-off-spoofs.pdf"):
    """Compare MI vs Gini on on-off spoofs of the biological motifs.

    For every motif in ``tfdf`` a set of spoofs is generated; the Pearson
    correlation between the spoofs' Gini and MI values is computed, and the
    (correlation, p-value) pairs are scatter-plotted with the FDR threshold
    drawn as a dashed line.

    Args:
        num_motifs: number of spoofs generated per biological motif.
        filename: passed to ``maybesave`` for the finished figure.
    """
    bio_motifs = [getattr(tfdf, tf) for tf in tfdf.tfs]
    spoofses = [spoof_on_off_motif(motif, num_motifs=num_motifs, trials=1)
                for motif in bio_motifs]
    spoof_ginises = mmap(motif_gini, tqdm(spoofses))
    spoof_mises = mmap(total_motif_mi, tqdm(spoofses))

    cors, ps = [], []
    # Fixed: the loop previously iterated over the undefined names
    # `ginises` / `mises`.
    for ginis, mis in zip(spoof_ginises, spoof_mises):
        cor, p = pearsonr(ginis, mis)
        cors.append(cor)
        ps.append(p)
    q = fdr(ps)

    # Fixed: `filename` is not a scatter() argument; saving is done by
    # maybesave() below.
    plt.scatter(cors, ps)
    plt.plot([-1,1],[q,q],linestyle='--',label="FDR-Adjusted Significance Level")
    plt.semilogy()
    plt.legend()
    plt.xlabel("Pearson Correlation Coefficient")
    plt.ylabel("P value")
    plt.xlim([-1,1])
    plt.ylim([10**-4,1+1])
    plt.title("Gini-MI Correlation Coefficient vs. P-value for On-Off Simulations from Prokaryotic Motifs")
    maybesave(filename)
def simple_eval(opts):
    """Run the simple evaluation and report mean and median.

    Does nothing when ``check_conf`` rejects the configuration. Otherwise
    runs ``opts.repeats`` simulations of ``opts.simple``, extracts their
    values, prints mean and median, optionally draws a graph
    (``opts.graph``) and terminates the process with exit status 0.
    """
    if not check_conf([opts.simple], opts):
        return

    print("Running simple evaluation.\tArguments: {0}\tRepeats: {1}.".format(1, opts.repeats))
    raw_results = [get_simulation([opts.simple]) for _ in tqdm(range(opts.repeats))]

    print("Extracting values..")
    values = [extract(raw, opts) for raw in tqdm(raw_results)]

    mean = get_mean(values)
    print("Mean: {0}".format(mean))
    median = get_median(values)
    print("Median: {0}".format(median))

    if opts.graph:
        print("Generating graph..")
        graph([values], mean, median)
    sys.exit(0)
def parseAffiliationsToCSV(aff_filename):
    """Write the distinct affiliations found in ``aff_filename`` to ``AFF_OUT``.

    Every input line starting with ``#a`` is treated as a ``;``-separated
    list of affiliations. Each distinct affiliation is written as one CSV
    row ``[index, affiliation]`` (all fields quoted), with indexes counting
    up from 0 in set-iteration order.
    """
    aff_set = Set()
    with open(aff_filename, 'r') as aff_file:
        for rline in tqdm(aff_file):
            line = rline.decode("utf-8").rstrip()
            if not line.startswith("#a"):
                # "#index", "#n", "#t" and blank lines carry no affiliations.
                continue
            # Drop exactly the two-character "#a" tag. lstrip("#a") strips a
            # *set* of characters and also ate leading 'a'/'#' characters of
            # the first affiliation.
            for a in line[2:].split(";"):
                aff_set.add(a)

    # Open the output only after the input was read successfully, and make
    # sure it is closed even if writing fails.
    with open(AFF_OUT, "w") as dblp_aff:
        aff_wr = csv.writer(dblp_aff, quoting=csv.QUOTE_ALL)
        for aff_index, aff in enumerate(tqdm(aff_set)):
            aff_wr.writerow([aff_index, aff])
def experiment3(trials=10):
    """Scatter-plot per-model site-score spread against a chain fitness summary.

    ``trials`` apw codes and linear PSSMs are sampled; for each, the standard
    deviation of its scores over 10000 random sites is plotted against a
    summary of the occupancies captured along an ``mh`` chain.
    """
    mu = -10
    Ne = 5
    L = 10
    sigma = 1
    codes = [sample_code(L, sigma) for _ in range(trials)]
    pssms = [sample_matrix(L, sigma) for _ in range(trials)]
    sites = [random_site(L) for _ in xrange(10000)]
    apw_site_sigmas = [sd([score(code, site) for site in sites]) for code in codes]
    linear_site_sigmas = [sd([score_seq(pssm, site) for site in sites]) for pssm in pssms]

    def occupancy(ep):
        return 1/(1+exp(ep-mu))

    def apw_phat(code, site):
        return occupancy(score(code, site))**(Ne-1)

    def apw_occ(code, site):
        return occupancy(score(code, site))

    def linear_phat(pssm, site):
        return occupancy(score_seq(pssm, site))**(Ne-1)

    def linear_occ(pssm, site):
        return occupancy(score_seq(pssm, site))

    def chain_summary(phat, occ, model):
        chain = mh(lambda s: phat(model, s),
                   proposal=mutate_site,
                   x0=random_site(L),
                   capture_state=lambda s: occ(model, s))
        # NOTE(review): exp() is applied to a mean of log10 values; a
        # geometric mean would pair exp with log (or 10** with log10).
        # Left unchanged here - confirm the intent.
        return exp(mean(map(log10, chain[1:])))

    apw_mean_fits = [chain_summary(apw_phat, apw_occ, code) for code in tqdm(codes)]
    linear_mean_fits = [chain_summary(linear_phat, linear_occ, pssm) for pssm in tqdm(pssms)]

    plt.scatter(apw_site_sigmas, apw_mean_fits, label='apw')
    plt.scatter(linear_site_sigmas, linear_mean_fits, color='g', label='linear')
    plt.semilogy()
    plt.legend(loc='lower right')
def get_image_feat(feat_type, image_folder, orig_split, indices, real_split):
    """Compute or export image features for the given scene indices.

    Behaviour depends on ``feat_type``:

    * contains ``'fc7'``: writes the image names to ``temp_<orig_split>.txt``
      and returns the result of ``demo.compute_fromfile`` on that list.
    * contains ``'hdf5'``: loads every image into a ``(len(indices), 3, 224,
      224)`` array, stores it in an HDF5 file named after ``real_split`` and
      returns ``True`` on success, ``False`` on any error.
    * otherwise: loads per-scene metafeatures and returns a dict mapping
      each index to its feature vector.
    """
    feats = defaultdict(int)
    prefix = 'abstract_v002_%s2015_' % (orig_split)

    if 'fc7' in feat_type:
        folder = os.path.join(image_folder, 'scene_img', 'img_%s2015' % (orig_split)) + '/'
        print("Preparing the VGG 19 Net")
        net = demo.build_convnet()

        print("Extracting Features")
        list_path = 'temp_{}.txt'.format(orig_split)
        # The with-block closes the file; no explicit close() is needed.
        with open(list_path, 'w') as image_file:
            for item in tqdm(indices):
                image_file.write(imname(prefix, item) + '\n')
        feats = demo.compute_fromfile(net, list_path, base_path=folder)

    elif 'hdf5' in feat_type:
        try:
            folder = os.path.join(image_folder, 'scene_img', 'img_%s2015' % (orig_split)) + '/'
            images = np.zeros((len(indices), 3, 224, 224))
            # TODO: Low Priority, make general
            for index, item in tqdm(enumerate(indices), total=len(indices)):
                images[index] = demo.load_abstract_image(folder + imname(prefix, item))
            with h5py.File('/ssd_local/rama/datasets/abstract-hdf5/{}.h5'.format(real_split), 'w') as outfile:
                outfile['images'] = images
            return True
        except Exception:
            # Best-effort export: report failure to the caller, but no longer
            # swallow KeyboardInterrupt/SystemExit as the bare except did.
            print("problem")
            return False

    else:
        # create the abstract feature instance
        # NOTE: pickle executes arbitrary code on load; only use trusted dumps.
        with open('extract_features/af_dump.p', 'rb') as dump_file:
            AF = pickle.load(dump_file)

        # TODO: Figure out a better place to initialize all this
        out_dir = '/srv/share/vqa/release_data/abstract_v002/scene_json/features_v2/'
        keep_or_remove = 'keep'
        get_names = False
        tags = feat_type

        # path to metafeature directory
        metafeat_dir = af.dir_path(os.path.join(out_dir, 'metafeatures'))
        for item in tqdm(indices):
            metafeat_fn = '{}_instances-{}.cpickle'.format(item, AF.instance_ordering)
            cur_metafeat_fn = os.path.join(metafeat_dir, metafeat_fn)
            with open(cur_metafeat_fn, 'rb') as fp:
                cur_metafeats = pickle.load(fp)
            cur_feats, _ = AF.scene_metafeatures_to_features(
                cur_metafeats, tags, keep_or_remove, get_names)
            feats[item] = cur_feats

    return feats
def experiment1():
    """Does downsampling preserve percentile statistics?

    Takes ``prok_motifs[11]``, draws 100 tenfold downsamples of it, builds
    maxent spoofs for the full motif (1000) and for each downsample (100),
    and computes the percentile of each motif's MI within its own spoofs.

    Returns:
        ``(true_percentile, down_percentiles)``: the percentile for the full
        motif and the list of percentiles for the downsamples. (Previously
        the results were computed and then discarded.)
    """
    motif = prok_motifs[11]
    downsamples = [sample(int(len(motif)/10), motif, replace=False) for _ in range(100)]
    maxent_spoofs = spoof_maxent_motifs(motif, 1000, verbose=True)
    down_spoofs = [spoof_maxent_motifs(dm, 100) for dm in tqdm(downsamples)]

    true_mi = motif_mi(motif)
    spoof_mis = map(motif_mi, tqdm(maxent_spoofs))
    down_mis = map(motif_mi, downsamples)
    down_spoof_mis = [map(motif_mi, spoofs) for spoofs in tqdm(down_spoofs)]

    true_percentile = percentile(true_mi, spoof_mis)
    down_percentiles = [percentile(down_mi, ds_mis)
                        for (down_mi, ds_mis) in zip(down_mis, down_spoof_mis)]
    return true_percentile, down_percentiles
def _download_per_integration(listing, org, repo, branch, headers, excludes,
                              remote_name, local_template, to_path):
    """Download ``<integration>/<remote_name>`` for every integration directory in *listing*."""
    for obj in tqdm(listing):
        name = obj.get('name', '')
        # Integration directories: not hidden, no file extension, not excluded.
        if name.startswith('.') or splitext(name)[1] or name in excludes:
            continue
        remote_path = '{}/{}'.format(name, remote_name)
        response_file = requests.get(
            'https://raw.githubusercontent.com/{0}/{1}/{2}/{3}'.format(org, repo, branch, remote_path),
            headers=headers)
        if response_file.status_code == requests.codes.ok:
            with open(local_template.format(to_path, name), mode='wb+') as f:
                f.write(response_file.content)


def download_github_files(token, org, repo, branch, to_path, is_dogweb=False):
    """
    Using the github api downloads manifest files to a temporary location for processing
    :param token: string of github token
    :param org: string of organization
    :param repo: string of git repository
    :param branch: string of branchname
    :param to_path: where to extract
    :param is_dogweb: if dogweb repo we need to get nested data
    """
    directory = 'integration' if is_dogweb else ''
    url = 'https://api.github.com/repos/{0}/{1}/contents/{3}?ref={2}'.format(org, repo, branch, directory)
    headers = {'Authorization': 'token {}'.format(token)} if token else {}
    excludes = ['LICENSE', 'Rakefile', 'Gemfile']
    print('Downloading files from {}/{}..'.format(repo, branch))
    response = requests.get(url, headers=headers)

    if response.status_code != requests.codes.ok:
        print('There was an error ({}) listing {}/{} contents..'.format(response.status_code, repo, branch))
        exit(1)

    # manifest.json and README.md are downloaded for the integrations core
    # repo only; nothing is fetched for dogweb.
    if not is_dogweb:
        listing = response.json()
        # Two passes, in the original order: all manifests, then all readmes.
        for remote_name, local_template in (('manifest.json', '{}{}_manifest.json'),
                                            ('README.md', '{}{}_readme.md')):
            _download_per_integration(listing, org, repo, branch, headers, excludes,
                                      remote_name, local_template, to_path)
def visualize_stationary_sum(matrix,n,Ne,T,samples_per_bin=100):
    """Plot per-distance terms of a weighted average of ``T`` and print the average.

    For every mutation count ``k`` in ``range(n*L)``, ``samples_per_bin``
    motifs are produced by mutating the ringer motif ``k`` times. Each bin
    contributes ``mean(exp(nu*log_fitness + log_rho_weight) * T(motif))``;
    the printed value is the sum of those terms divided by the same sum
    without the ``T`` factor.
    """
    L = len(matrix)
    nu = Ne - 1
    ringer = ringer_motif(matrix, n)
    motifss = [[mutate_motif_k_times(ringer, k) for _ in range(samples_per_bin)]
               for k in trange(n*L)]
    log_fss = mmap(lambda motif: log_fitness(matrix, motif, G), tqdm(motifss))
    Tss = mmap(T, tqdm(motifss))
    log_ws = [log_rho_weight(rho, n, L) for rho in range(n*L)]

    bins = zip(log_ws, log_fss, Tss)
    # Numerator terms, one per mutation-count bin.
    terms = [mean(exp(nu*log_f + log_w)*t_val for log_f, t_val in zip(log_fs, Ts))
             for log_w, log_fs, Ts in bins]
    # Normaliser: the same weights without the T factor.
    Z = sum([mean(exp(nu*log_f + log_w) for log_f, _ in zip(log_fs, Ts))
             for log_w, log_fs, Ts in bins])
    print(sum(terms)/Z)
    plt.plot(range(n*L), terms)
def get_data(self, data_type, color_images=True, mat_label_images=True, obj_label_images=True, calibrations=False, depth=False):
    """Load the requested kinds of data for every file of the given data types.

    For each entry ``t`` of ``data_type`` the file names listed under
    ``self.config[t + '_images']`` are collected. For every enabled flag one
    list (aligned with that file list) is loaded and appended to the result,
    in the order: colour images, material labels, object labels,
    calibrations, depth.

    Returns:
        The single loaded list when exactly one kind was requested,
        otherwise the list of lists.

    Raises:
        Exception: if the config has no ``<t>_images`` entry.
    """
    file_list = []
    for t in data_type:
        list_type = t + '_images'
        if list_type not in self.config:
            raise Exception('The config does not contain a list for the entry: \'{0}_images\' \nConfig file located at: {1}'.format(t, self.config_filename))
        file_list += self.config[list_type]

    def load_all(kind, loader):
        # Folder and extension are looked up per file, as in the unrolled
        # loops this replaces.
        return [loader(os.path.join(getattr(self, kind + '_folder'),
                                    fn + getattr(self, kind + '_extension')))
                for fn in tqdm(file_list)]

    return_list = []
    if color_images:
        return_list.append(load_all('image', lambda path: self.load_color(path)))
    if mat_label_images:
        return_list.append(load_all('mat_label', lambda path: self.load_labels(path, 'mat')))
    if obj_label_images:
        return_list.append(load_all('obj_label', lambda path: self.load_labels(path, 'obj')))
    if calibrations:
        return_list.append(load_all('calibration', lambda path: self.load_calibration(path)))
    if depth:
        return_list.append(load_all('depth', lambda path: self.load_depth(path)))

    if len(return_list) == 1:
        return return_list[0]
    return return_list
def analyze_collection(prok_motifs, euk_motifs):
    """Run ``analyze_motif`` over both collections and pickle the results.

    Results are written to ``prok_correlated_pairses.pkl`` and
    ``euk_correlated_pairses.pkl`` in the current directory.
    """
    prok_correlated_pairses = map(analyze_motif, tqdm(prok_motifs, desc='motifs'))
    # Fixed: dump() takes (obj, file); the arguments were swapped. Pickle
    # files are opened in binary mode.
    with open("prok_correlated_pairses.pkl", 'wb') as f:
        cPickle.dump(prok_correlated_pairses, f)
    euk_correlated_pairses = map(analyze_motif, tqdm(euk_motifs, desc='motifs'))
    with open("euk_correlated_pairses.pkl", 'wb') as f:
        cPickle.dump(euk_correlated_pairses, f)

    # NOTE(review): the summary arrays below are computed but not used or
    # returned within this function - confirm whether they are still needed.
    prok_corrs = np.array(map(len, prok_correlated_pairses))
    euk_corrs = np.array(map(len, euk_correlated_pairses))
    prok_depths = np.array([len(motif) for motif in prok_motifs])
    euk_depths = np.array([len(motif) for motif in euk_motifs])
    prok_lens = np.array([len(motif[0]) for motif in prok_motifs])
    euk_lens = np.array([len(motif[0]) for motif in euk_motifs])
    prok_lc2s = np.array([choose(L, 2) for L in prok_lens])
    euk_lc2s = np.array([choose(L, 2) for L in euk_lens])
def download_from_repo(self, org, repo, branch, globs):
    """
    Takes github info and file globs and downloads files from github using multiple processes
    :param org: github organization or person
    :param repo: github repo name
    :param branch: the branch name
    :param globs: list of strings in glob format of what to extract
    :return:
    """
    with GitHub(self.options.token) as gh:
        listing = gh.list(org, repo, branch, globs)
        dest = "{0}{1}{2}".format(self.extract_dir, repo, sep)
        with Pool(processes=self.pool_size) as pool, requests.Session() as s:
            fetch = partial(
                gh.raw,
                request_session=s,
                org=org,
                repo=repo,
                branch=branch,
                dest_dir=dest,
            )
            # Drain the iterator so every download finishes before the pool
            # and session are closed; the per-file results are not used.
            for _ in tqdm(pool.imap_unordered(fetch, listing)):
                pass
def process_glove(args, vocab_list, save_path, size=4e5, random_init=True):
    """
    Build a GloVe embedding matrix restricted to ``vocab_list`` and save it.

    Nothing is done when ``save_path + ".npz"`` already exists. Each GloVe
    word is matched against the vocabulary as-is, capitalized and
    upper-cased; every match copies the vector into that vocabulary row.

    :param args: object providing ``glove_dir`` and ``glove_dim``
    :param vocab_list: [vocab]
    :param save_path: output path passed to ``np.savez_compressed``
    :param size: expected number of GloVe lines (progress bar total only)
    :param random_init: initialise unmatched rows randomly instead of zeros
    :return:
    """
    if not gfile.Exists(save_path + ".npz"):
        glove_path = os.path.join(args.glove_dir, "glove.6B.{}d.txt".format(args.glove_dim))
        if random_init:
            glove = np.random.randn(len(vocab_list), args.glove_dim)
        else:
            glove = np.zeros((len(vocab_list), args.glove_dim))

        # word -> first index, matching list.index(); replaces two linear
        # scans of the vocabulary per candidate spelling per GloVe line.
        word_to_idx = {}
        for idx, token in enumerate(vocab_list):
            word_to_idx.setdefault(token, idx)

        found = 0
        with open(glove_path, 'r') as fh:
            for line in tqdm(fh, total=size):
                array = line.strip().split(" ")
                word = array[0]
                vector = None
                # Duplicated spellings (e.g. word == word.upper()) are
                # deliberately kept so `found` counts as before.
                for candidate in (word, word.capitalize(), word.upper()):
                    idx = word_to_idx.get(candidate)
                    if idx is None:
                        continue
                    if vector is None:
                        # Parse the floats only for words that are needed.
                        vector = list(map(float, array[1:]))
                    glove[idx, :] = vector
                    found += 1

        print("{}/{} of word vocab have corresponding vectors in {}".format(found, len(vocab_list), glove_path))
        np.savez_compressed(save_path, glove=glove)
        print("saved trimmed glove matrix at: {}".format(save_path))
def estremo_lite_vs_maxent_motifs():
    """Compute the expected IC under ``sella_hirsch_predictions`` for a range of N.

    Uses ``n = 20`` sites of length ``L = 10`` and 100 values of N between
    10 and 10000.

    Returns:
        ``(Ns, expected_ics)``: the N grid and, for each N, the dot product
        of the per-state mean ICs with the predicted state probabilities.
    """
    n = 20
    L = 10
    Ns = np.linspace(10, 10000, 100)
    pss = [sella_hirsch_predictions(n=n, L=L, G=1000, N=N) for N in tqdm(Ns)]
    # Fixed: this used `enumerate_eps(N, L)`, which only resolved through the
    # comprehension variable leaking out of the line above (Python 2) and so
    # used the last value of N; the state space is assumed to be indexed by
    # the number of sites n - TODO confirm against enumerate_eps.
    ics = np.array([mean_ic_from_eps(eps, n, L) for eps in enumerate_eps(n, L)])
    expected_ics = [ics.dot(ps) for ps in pss]
    return Ns, expected_ics
def run(self, progress=True, verbose=False):
    """Compute all steps of the simulation.

    Be careful: if tmax is not set, this function will result in an
    infinite loop.

    Parameters
    ----------
    progress : bool
        Show a tqdm progress bar while iterating.
    verbose : bool
        Log each step at INFO level instead of DEBUG.

    Returns
    -------
    (t, fields): last time and result fields, or None (after a warning)
    when the simulation yields no further step.
    """
    # Fixed: int(None) raised TypeError when tmax was not set.
    total_iter = int(self.tmax // self.user_dt) if self.tmax else None
    log = logging.info if verbose else logging.debug

    last_step = None
    if progress:
        initial = self.i if total_iter is None else min(self.i, total_iter)
        with tqdm(initial=initial, total=total_iter) as pbar:
            for t, fields in self:
                pbar.update(1)
                log("%s running: t: %g" % (self.id, t))
                last_step = (t, fields)
    else:
        for t, fields in self:
            log("%s running: t: %g" % (self.id, t))
            last_step = (t, fields)

    if last_step is None:
        # Previously the progress branch fell through into a second loop
        # and warned twice.
        warnings.warn("Simulation already ended")
        return None
    return last_step
def ic_log_pvalue(N, L, des_ic, verbose=False, trials=100, method="ub"): print des_ic correction_per_col = 3/(2*log(2)*N) K = L * correction_per_col # correction per motif ic_for_beta = des_ic + K tolerance = 10**-10 beta = find_beta_for_mean_motif_ic(N,L,ic_for_beta,tolerance=tolerance,verbose=verbose) # correct val of beta countses = enumerate_counts(N) entropies = np.array(map(entropy_from_counts, countses)) iterator = tqdm(countses) if verbose else countses log_cols = np.array(map(log_counts_to_cols, iterator)) log_Zq = log_sum(log_cols + -beta*entropies)*L log_Zp = N*L*log(4) #log_prefactor = log_Zq - log_Zp + beta*2*L log_prefactor = log_Zq - log_Zp + beta*(2*L-K) if method == "UB": log_expectation_ub = (-beta*(des_ic)) log_pval_ub = log_prefactor + log_expectation_ub return log_pval_ub - log(2) elif method == "analytic": mu, sigma = calc_params(N, L, beta) log_expectation = log(compute_expectation_spec(beta, mu, sigma)) log_pval = log_prefactor + log_expectation return log_pval else: ms = maxent_motifs(N, L, des_ic, trials, beta=beta) ics = map(motif_ic, ms) print "des_ic, mean ics:", des_ic, mean(ics) log_expectation = log_sum([-beta*ic for ic in ics if ic > des_ic]) - log(trials) # Xxx loss of precision log_pval = log_prefactor + log_expectation return log_pval
def plot_DO(self, dos, target='stream', name='T0076', ymax=30000):
    '''
    Plot target by do, one subplot per year (2014, 2015, 2016).

    target has to be either stream/mail. ``dos`` may be a single name or a
    list of names. ``ymax`` is accepted for compatibility but is not used.
    '''
    if isinstance(dos, str):
        dos = [dos]
    plt.figure(figsize=(20, 30))
    colors = sns.color_palette('deep', len(dos))
    for j, year in tqdm(enumerate(['2014', '2015', '2016']), total=3):
        ax = plt.subplot(3, 1, j + 1)
        for i, do in enumerate(dos):
            tmp = self.get_DO(do, target=target, name=name)
            try:
                tmp[year].cnt.plot(ax=ax, label=do, lw=2, color=colors[i], marker='o')
            except Exception:
                # Best effort: a series that cannot be plotted for this year
                # is skipped. Narrowed from a bare except so Ctrl-C and
                # SystemExit still propagate.
                continue
        ax.legend(fontsize=18)
        ax.tick_params(axis='both', which='major', labelsize=30)
def update_integration_pre_build(from_path=None, to_path=None):
    """
    All modifications that may happen to a integration content are here
    :param from_path: the input path where we scrap data from
    :param to_path: the output path to integration md files
    """
    # exists(None) raises TypeError, so a missing from_path (the default)
    # is routed to the same error path as a non-existent directory.
    if not from_path or not exists(from_path):
        print('Path does not exist: {}'.format(from_path))
        exit(1)

    pattern = '**/*_manifest.json'
    for file_name in tqdm(sorted(glob.glob('{}{}'.format(from_path, pattern), recursive=True))):
        key_name = basename(file_name.replace('_manifest.json', ''))
        # Scraping all sections that we can find
        data_array = readme_get_section(from_path, key_name)
        # Gathering the manifest short description and adding the right token
        data_array.append([DESC_TOKEN, manifest_get_data(from_path, key_name, DESC_ATTRIBUTE)])
        # Inlining the data in the doc file
        file_update_content(to_path, key_name, data_array)
def sync(self):
    """sync up doi and materials collections (needed after doicoll reset)

    For every material that already has a DOI, the bibtex entry is
    regenerated via ``save_bibtex_item`` and the item is rebuilt. Processing
    aborts (returning None) after more than two bibtex failures. Without a
    progress bar only the first 5 documents are processed.
    """
    existing_mp_ids = self.ad.matcoll.find(
        {'doi': {'$exists': True}}, {'_id': 0, 'task_id': 1}
    ).distinct('task_id')
    if not existing_mp_ids:
        logger.info('no materials with DOIs exist')
        return

    num_bibtex_errors = 0
    docs = self.ad.doicoll.find(
        {'_id': {'$in': existing_mp_ids}}, {'doi': 1, 'bibtex': 1}
    ).limit(0 if self.show_pbar else 5)
    ndocs = docs.count()
    pbar = tqdm(total=ndocs) if self.show_pbar else None
    try:
        for doc in docs:
            if num_bibtex_errors > 2:
                logger.error('abort bibtex generation (too many request errors)')
                return None
            doc['bibtex'] = self.save_bibtex_item(doc)
            if not doc['bibtex']:
                num_bibtex_errors += 1
                continue
            self.build_item(doc)
            if pbar is not None:
                pbar.update()
    finally:
        # Close the bar on every exit path, including the abort above.
        if pbar is not None:
            pbar.close()
    logger.info('{} materials synced'.format(ndocs))
def build_directory(dirnm, force=False):
    """Build (or load) the variable -> summary-table mapping.

    When ``directory.json`` is missing or ``force`` is true, every file in
    ``dirnm`` is scanned: after the first line containing ``INPUT``, the
    first token of each non-trivial line is recorded as a variable name,
    mapped to ``[type, state, sequence]`` slices of the file name. The
    result is cached in ``directory.json`` in the current directory.

    Returns:
        The mapping as loaded back from ``directory.json``.
    """
    if force or not os.path.isfile('directory.json'):
        directory = {}
        for fname in tqdm(os.listdir(dirnm)):
            typ = fname[0:1]
            st_nm = fname[1:3]
            seq_nm = fname[4:8]
            # os.path.join works whether or not dirnm ends with a separator.
            with open(os.path.join(dirnm, fname), 'rb') as f:
                read_flg = False
                for ln in f:
                    stripped = ln.strip()
                    if read_flg and stripped not in ('', ';', 'RUN;'):
                        # First whitespace-delimited token; replaces
                        # re.split('\s*', ...), a non-raw pattern that can
                        # match the empty string.
                        varnm = stripped.split()[0]
                        directory.setdefault(varnm, []).append([typ, st_nm, seq_nm])
                    if 'INPUT' in ln:
                        read_flg = True
        # Bookkeeping columns are not variables; tolerate their absence.
        for key in ['FILEID', 'FILETYPE', 'STUSAB', 'CHARITER', 'SEQUENCE', 'LOGRECNO']:
            directory.pop(key, None)
        with open('directory.json', 'wb') as f:
            json.dump(directory, f)
    with open('directory.json', 'rb') as f:
        directory = json.load(f)
    return directory
def build(self, mpids=None):
    """build DOIs into matcoll

    Selects the ids that have both ``doi`` and ``bibtex`` in doicoll
    (optionally restricted to ``mpids``) but neither ``doi`` nor
    ``doi_bibtex`` in matcoll, and calls ``build_item`` for each of them in
    ascending ``bibtexed_on`` order.
    """
    query = {'doi': {'$exists': True}, 'bibtex': {'$exists': True}}
    if mpids is not None:
        query['_id'] = {'$in': mpids}
    valid_mp_ids = self.ad.doicoll.find(query).distinct('_id')

    if not valid_mp_ids:
        logger.info('no valid DOIs available for build')
        return

    # Materials that exist but do not carry DOI information yet.
    missing_filter = {
        'task_id': {'$in': valid_mp_ids},
        'doi': {'$exists': False},
        'doi_bibtex': {'$exists': False},
    }
    missing_mp_ids = self.ad.matcoll.find(
        missing_filter, {'_id': 0, 'task_id': 1}
    ).distinct('task_id')

    items = self.ad.doicoll.find(
        {'_id': {'$in': missing_mp_ids}}, {'doi': 1, 'bibtex': 1}
    ).sort('bibtexed_on', pymongo.ASCENDING)

    if self.show_pbar:
        pbar = tqdm(total=items.count())
    for item in items:
        self.build_item(item)
        if self.show_pbar:
            pbar.update()
    if self.show_pbar:
        pbar.close()
    logger.info('all available DOIs built into matcoll')
def read_bson(bson_path, num_records, with_categories):
    """Index a BSON file: one row per document with its image count and byte range.

    The file is walked document by document using the 4-byte little-endian
    length prefix of each document.

    Args:
        bson_path: path of the BSON file.
        num_records: expected number of documents (progress bar total only).
        with_categories: also record each document's ``category_id``.

    Returns:
        DataFrame indexed by ``product_id`` (sorted) with the columns
        ``num_imgs``, ``offset``, ``length`` and, optionally,
        ``category_id``.
    """
    rows = {}
    with open(bson_path, "rb") as f, tqdm(total=num_records) as pbar:
        offset = 0
        # A zero-length read of the length prefix marks the end of the file.
        for length_prefix in iter(lambda: f.read(4), b""):
            (length,) = struct.unpack("<i", length_prefix)

            # The prefix is part of the document: rewind and read it whole.
            f.seek(offset)
            item_data = f.read(length)
            assert len(item_data) == length

            item = bson.BSON.decode(item_data)
            row = [len(item["imgs"]), offset, length]
            if with_categories:
                row += [item["category_id"]]
            rows[item["_id"]] = row

            offset += length
            f.seek(offset)
            pbar.update()

    columns = ["num_imgs", "offset", "length"]
    if with_categories:
        columns += ["category_id"]

    df = pd.DataFrame.from_dict(rows, orient="index")
    df.index.name = "product_id"
    df.columns = columns
    df.sort_index(inplace=True)
    return df
# Load the test-set data.
m = 10000
X = extract_data('./data/t10k-images-idx3-ubyte.gz', m, 28)
y_dash = extract_labels('./data/t10k-labels-idx1-ubyte.gz', m).reshape(m, 1)

# Normalize the data: subtract the (integer-truncated) mean, then divide by
# the (integer-truncated) standard deviation.
X -= int(np.mean(X))
X /= int(np.std(X))

# Stack images and labels, then split them again into network-shaped
# inputs and a flat label vector.
test_data = np.hstack((X, y_dash))
X = test_data[:, 0:-1]
X = X.reshape(len(test_data), 1, 28, 28)
y = test_data[:, -1]

t = tqdm(range(len(X)), leave=True)

# Tallies for computing precision and recall: overall correct count plus
# per-digit totals and per-digit correct predictions.
corr = 0
digit_count = [0] * 10
digit_correct = [0] * 10

for i in t:
    x = X[i]
    pred, prob = predict(x, f1, f2, w3, w4, b1, b2, b3, b4)
    digit_count[int(y[i])] += 1
    if pred == y[i]:
        corr += 1
        digit_correct[pred] += 1
    # Show the running accuracy in the progress bar.
    t.set_description("Acc:%0.2f%%" % (float(corr / (i + 1)) * 100))
default='bolorspeech', help='dataset name') args = parser.parse_args() if args.dataset == 'mbspeech': from datasets.mb_speech import MBSpeech dataset = MBSpeech() elif args.dataset == 'librispeech': from datasets.libri_speech import LibriSpeech dataset = ConcatDataset([ LibriSpeech(name='train-clean-100'), LibriSpeech(name='train-clean-360'), LibriSpeech(name='train-other-500'), LibriSpeech(name='dev-clean', ) ]) else: from datasets.bolor_speech import BolorSpeech dataset = ConcatDataset([ BolorSpeech(name='train'), BolorSpeech(name='test'), BolorSpeech(name='demo'), BolorSpeech(name='annotation') ]) transform = Compose([LoadAudio(), ComputeMagSpectrogram()]) for data in tqdm(dataset): fname = data['fname'] data = transform(data) mel_spectrogram = data['input'] np.save(fname.replace('.wav', '.npy'), mel_spectrogram)
def download(url): with open('./sougou/{name}.scel'.format(name=j), 'wb') as f: f.write(requests.get(url).content) f.close() if __name__ == '__main__': first_urls = get_first(key_word) second_urls = get_second(first_urls) third_urls = get_third(second_urls) print "开始下载......" j = 0 counts = len(third_urls) #tqdm装饰在任意一个迭代器上,能显示当前迭代的进度 for url in tqdm(third_urls): time.sleep(0.01) download(url) j += 1 #print "下载{name}.scel完成,当前进度为{status} %".format(name = j,status = str(float(j)/counts *100)) print "下载完毕,开始导入词典.....\n" commands.getoutput( "./extract-sougou-dict.py sougou/*.scel -o sougou-dict.txt -mmseg") print "\t1. 生成词库成功,合并新词典....." if not os.path.exists("/usr/local/mmseg3/etc/unigram.txt"): print "\t请检查文件 /usr/local/mmseg3/etc/unigram.txt 是否存在!" sys.exit() os.system("cp /usr/local/mmseg3/etc/unigram.txt ./") commands.getoutput( "./merge-mmseg-dict.py -a unigram.txt -b sougou-dict.txt -o merged.txt"
#s_to_file = open(api+'.api', "w", encoding="utf-8") #s_to_file.write(str(todos)) #s_to_file.close() except: pass # читаем excel-файл wb = openpyxl.load_workbook("100bighh/firms.xlsx") # печатаем список листов sheets = wb.sheetnames for sheet in sheets: print(sheet) # получаем активный лист sheet = wb.active rows = sheet.max_row print(rows) for i in tqdm(range(2, rows + 1)): cell = sheet.cell(row=i, column=1) firm_no = str(cell.value) cell = sheet.cell(row=i, column=2) firm_name = str(cell.value) cell = sheet.cell(row=i, column=3) firm_id = str(cell.value) print(firm_no, firm_name, firm_id) apitofile(firm_id)
def main():
    '''
    Correct the splice sites of the reads in a BED file.

    Annotated junctions (from the GTF and, optionally, an extra junctions
    file) and the reads are written to one temporary BED file per chromosome.
    runCMD is then executed for every chromosome in a worker pool and the
    per-chromosome results are concatenated into
    "<output_fname>_all_inconsistent.bed" and "<output_fname>_all_corrected.bed".
    '''

    # Command Line Stuff...
    myCommandLine = CommandLine()
    bed = myCommandLine.args['input_bed']
    gtf = myCommandLine.args['gtf']
    otherJuncs = myCommandLine.args['junctionsBed']
    wiggle = myCommandLine.args['wiggleWindow']
    threads = myCommandLine.args['threads']
    outFile = myCommandLine.args['output_fname']
    cleanup = myCommandLine.args['keepTemp']
    resolveStrand = myCommandLine.args['correctStrand']

    # There are a few functions that evaluate what verbose is defined as.
    # Instead of passing it around, just global it.
    global verbose
    verbose = myCommandLine.args['quiet']
    global progress
    progress = myCommandLine.args['progress']

    # Convert gtf to bed and split by chromosome.
    juncs, chromosomes = gtfToSSBed(gtf)

    # Do the same for the other juncs file.
    if otherJuncs is not None:
        juncs, chromosomes = addOtherJuncs(juncs, otherJuncs, chromosomes)

    # Step 3: one "<chrom>_known_juncs.bed" file per chromosome.
    annotations = dict()
    juncItems = juncs.items()
    if progress:
        juncItems = tqdm(
            juncs.items(),
            desc="Step 3/5: Preparing annotated junctions to use for correction",
            total=len(juncs),
            dynamic_ncols=True,
            position=1)
    for chrom, data in juncItems:
        annotations[chrom] = "%s_known_juncs.bed" % chrom
        with open("%s_known_juncs.bed" % chrom, "w") as out:
            for k, v in data.items():
                c1, c2, strand = k
                print(chrom, c1, c2, v, ".", strand, sep="\t", file=out)

    # Step 4: split the reads into one "<chrom>_temp_reads.bed" per chromosome.
    skippedChroms = set()
    readDict = dict()
    with open(bed) as lines:
        outDict = dict()
        readLines = lines
        if progress:
            readLines = tqdm(lines,
                             desc="Step 4/5: Preparing reads for correction",
                             dynamic_ncols=True,
                             position=1)
        for line in readLines:
            cols = line.rstrip().split()
            if not cols:
                # Blank line: nothing to assign to a chromosome.
                continue
            chrom = cols[0]
            if chrom not in chromosomes:
                # Report every unknown reference sequence only once.
                if chrom not in skippedChroms:
                    skippedChroms.add(chrom)
                    if verbose:
                        tqdm.write(
                            "Reference sequence not found in annotations, skipping: %s"
                            % (chrom),
                            file=sys.stderr)
                continue

            if chrom not in outDict:
                readDict[chrom] = "%s_temp_reads.bed" % chrom
                outDict[chrom] = open("%s_temp_reads.bed" % chrom, 'w')
            print(line.rstrip(), file=outDict[chrom])

    # Close the per-chromosome read files and build one job per chromosome.
    cmds = list()
    for chrom in readDict:
        outDict[chrom].close()
        cmds.append(
            (chrom, annotations[chrom], readDict[chrom], resolveStrand))

    # Step 5: run the correction; the pool is always closed and joined so no
    # worker processes are left behind.
    p = Pool(threads)
    try:
        results = p.imap(runCMD, cmds)
        if progress:
            results = tqdm(results,
                           total=len(cmds),
                           desc="Step 5/5: Correcting Splice Sites",
                           dynamic_ncols=True,
                           position=1)
        for _ in results:
            pass
    finally:
        p.close()
        p.join()

    # Concatenate the per-chromosome outputs (10 MiB copy buffer).
    with open("%s_all_inconsistent.bed" % outFile, 'wb') as inconsistent:
        for chrom in readDict:
            with open("%s_inconsistent.bed" % chrom, 'rb') as fd:
                shutil.copyfileobj(fd, inconsistent, 1024 * 1024 * 10)
            if cleanup:
                os.remove(annotations[chrom])
                os.remove(readDict[chrom])
                os.remove("%s_inconsistent.bed" % chrom)

    with open("%s_all_corrected.bed" % outFile, 'wb') as corrected:
        for chrom in readDict:
            with open("%s_corrected.bed" % chrom, 'rb') as fd:
                shutil.copyfileobj(fd, corrected, 1024 * 1024 * 10)
            if cleanup:
                os.remove("%s_corrected.bed" % chrom)

    print("\n")
default='v1.0-simplified-train.jsonl.gz')  # NOTE(review): tail of a call opened above this chunk
parser.add_argument('--devfile', default='v1.0-simplified-dev.jsonl.gz')
parser.add_argument('--passagefile', default='all_passages.jsonl')
parser.add_argument('--queries_trainfile', default='train_queries.json')
parser.add_argument('--answers_trainfile', default='train_answers.json')
parser.add_argument('--queries_devfile', default='dev_queries.json')
parser.add_argument('--answers_devfile', default='dev_answers.json')
parser.add_argument('--qrelsfile', default='all_qrels.txt')
args = parser.parse_args()

traindata = []
# The training file is gzip-compressed with one JSON object per line; the
# hard-coded total only sizes the progress bar.
with gzip.open(args.trainfile, 'rb') as fp:
    for i, line in enumerate(
            tqdm(fp, total=307373, desc='Reading trainset')):
        item = json.loads(line.strip())
        eid = item['example_id']
        # The document text and the candidate list are removed from the item.
        doc = item.pop('document_text')
        item.pop('long_answer_candidates')
        pids = []
        paras = []
        for ans in item.pop('annotations'):
            lans = ans['long_answer']
            pid = lans['candidate_index']
            st = lans['start_token']
            et = lans['end_token']
            # Keep every annotated long-answer candidate only once.
            if pid not in pids:
                pids.append(pid)
                # The answer span is a slice of the space-split document.
                para = doc.split(" ")[st:et]
                paras.append(para)
def predict(self, X, y, refit=False, n_iter=None, alpha=None, L=None):
    """Impute the missing values (NaN) of X.

    The discretised matrix ``self.X_di`` is first completed iteratively with
    interval estimates. Every imputed interval is then turned into a real
    value sampled with ``self.f_sample`` around either the class-conditional
    mean or the overall mean of that interval.

    Args:
        X: Data with NaN marking missing entries (ndarray or DataFrame).
        y: Targets aligned with X (ndarray, or Series when X is a DataFrame).
        refit: Forwarded to ``self.update_x_y`` when X differs from the data
            already stored on the instance.
        n_iter: Number of imputation sweeps; defaults to ``self.n_iter``.
        alpha: Threshold on ``mu1 / mu0`` above which the class-conditional
            statistics are used; defaults to ``self.alpha``.
        L: Multiplier applied to the standard deviation passed to
            ``self.f_sample``; defaults to ``self.L``.

    Returns:
        Tuple ``(X_imput, y)``; ``X_imput`` is a DataFrame when
        ``self.pdflag`` is set, otherwise an ndarray.
    """
    if isinstance(X, pd.DataFrame):
        # Remember the pandas metadata so the result can be re-wrapped.
        self.pdflag = True
        self.pd_index = X.index.values
        self.pd_columns = X.columns
        self.pd_yname = y.name
        X = X.values
        y = y.values

    if alpha is None:
        alpha = self.alpha
    if L is None:
        L = self.L
    if n_iter is None:
        # Previously this fell back to self.L and the value was never used.
        n_iter = self.n_iter

    X_imput = X.copy()
    init_idx = 0
    if not np.allclose(self.X, X, equal_nan=True):
        init_idx, _ = self.update_x_y(X, y, refit=refit)

    # Boolean mask of the entries that have to be imputed.
    self.nan_matrix = np.isnan(self.X)

    X_di = self.X_di
    for n in range(n_iter):
        print('ITERATION # ', n)
        update_count = 0
        for i in tqdm(range(init_idx, X_di.shape[0])):
            # Visit the columns in random order to enhance imputation variance.
            ncolumns = np.random.permutation(X_di.shape[1])
            for col in ncolumns.tolist():
                if self.nan_matrix[i, col]:
                    idx_y = self.target_dict[self.y[i]]
                    xi = X_di[i]
                    sgraph = self.make_subgraph(xi, col)
                    n_interval = self.estimate_interval(col, sgraph, idx_y)
                    imput_val = self.value_by_node((col, n_interval))
                    self.X_di[i, col] = imput_val
                    # Refresh the weights every ``update_step`` imputations.
                    if update_count > self.update_step:
                        self.update_weights_all()
                        update_count = 0
                    update_count = update_count + 1

    for i in tqdm(range(init_idx, X_di.shape[0])):
        for col in range(X_di.shape[1]):
            if not self.nan_matrix[i, col]:
                continue
            yyi = self.y[i]
            val = self.X_di[i, col]
            node = self.find_node_by_value(col, val)
            xj, yi = self.real_interval(node)

            # Statistics of all observed values in the interval ...
            notnan = np.invert(np.isnan(xj))
            mu0 = xj[notnan].mean()
            std0 = xj[notnan].std()

            # ... and of those sharing the class of row i.
            c_idx = np.where(yi == yyi)[0]
            xjc = xj[c_idx]
            notnan2 = np.invert(np.isnan(xjc))
            mu1 = xjc[notnan2].mean()
            std1 = xjc[notnan2].std()

            # Seed per cell so results are reproducible for a fixed state.
            if self.random_state is not None:
                np.random.seed(self.random_state + i + col)
            if mu1 / mu0 > alpha:
                imput = self.f_sample(mu1, L * std1)
            else:
                imput = self.f_sample(mu0, L * std0)
            X_imput[i - init_idx, col] = imput

    if self.pdflag:
        X_imput = pd.DataFrame(X_imput,
                               columns=self.pd_columns,
                               index=self.pd_index)
    return X_imput, y
def main():
    """Generate adversarial face images against an ensemble of face models.

    Nine pretrained feature extractors are loaded. For every image below
    ``picpath`` the reference embeddings are computed, the image is perturbed
    with random noise and then pushed away from the reference embeddings for
    ``steps`` iterations using a momentum-accumulated, smoothed gradient sign
    update. The result is written as a JPEG into ``sample_dir``.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    InceptionResnet_model_1 = InceptionResnetV1(
        pretrained='vggface2').eval().to(device)
    print('load InceptionResnet-vggface2.pt successfully')

    InceptionResnet_model_2 = InceptionResnetV1(
        pretrained='casia-webface').eval().to(device)
    print('load InceptionResnet-casia-webface.pt successfully')

    IR_50_model_1 = IR_50([112, 112])
    IR_50_model_1.load_state_dict(
        torch.load(
            '/notebooks/Workspace/tmp/pycharm_project_314/TianChi/Face_recognition/irse/model/backbone_ir50_asia.pth'
        ))
    IR_50_model_1.eval().to(device)
    print('load IR_50 successfully')

    IR_152_model_1 = IR_152([112, 112])
    IR_152_model_1.load_state_dict(
        torch.load(
            '/notebooks/Workspace/tmp/pycharm_project_314/TianChi/Face_recognition/irse/model/Backbone_IR_152_Epoch_112_Batch_2547328_Time_2019-07-13-02-59_checkpoint.pth'
        ))
    IR_152_model_1.eval().to(device)
    print('load IR_152 successfully')

    import insightface
    Insightface_iresnet34 = insightface.iresnet34(pretrained=True)
    Insightface_iresnet34.eval().to(device)
    print('load Insightface_iresnet34 successfully')

    Insightface_iresnet50 = insightface.iresnet50(pretrained=True)
    Insightface_iresnet50.eval().to(device)
    print('load Insightface_iresnet50 successfully')

    Insightface_iresnet100 = insightface.iresnet100(pretrained=True)
    Insightface_iresnet100.eval().to(device)
    print('load Insightface_iresnet100 successfully')

    # vgg16
    from Face_recognition.vgg16.vgg16 import CenterLossModel, loadCheckpoint
    vgg16_checkpoint = loadCheckpoint(
        '/notebooks/Workspace/tmp/pycharm_project_314/TianChi/Face_recognition/vgg16/model'
    )
    VGG16 = CenterLossModel(embedding_size=512,
                            num_classes=712,
                            checkpoint=vgg16_checkpoint).eval().to(device)
    print('load VGG16 successfully')

    # resnet34
    from Face_recognition.resnet34_triplet.resnet34 import Resnet34Triplet
    checkpoint = torch.load(
        '/notebooks/Workspace/tmp/pycharm_project_314/TianChi/Face_recognition/resnet34_triplet/model/model_resnet34_triplet.pt'
    )
    Resnet34 = Resnet34Triplet(
        embedding_dimension=checkpoint['embedding_dimension']).to(device)
    Resnet34.load_state_dict(checkpoint['model_state_dict'])
    # Inference mode, like every other model of the ensemble; without it the
    # embeddings of this network would depend on training-mode layers.
    Resnet34.eval()
    print('load Resnet34 successfully')

    # Callables mapping an image tensor to an embedding, in the order in
    # which their losses are summed.
    feature_extractors = [
        InceptionResnet_model_1,
        InceptionResnet_model_2,
        IR_50_model_1,
        IR_152_model_1,
        Insightface_iresnet34,
        Insightface_iresnet50,
        Insightface_iresnet100,
        VGG16.forward_GetFeature,
        Resnet34,
    ]

    criterion = nn.MSELoss()

    # collect all images to attack
    paths = []
    picpath = '/notebooks/Workspace/tmp/pycharm_project_314/TianChi/images'
    for root, dirs, files in os.walk(picpath):
        for f in files:
            paths.append(os.path.join(root, f))
    random.shuffle(paths)

    # attack parameters
    eps = 1
    steps = 30
    momentum = 1.0

    sample_dir = '/notebooks/Workspace/tmp/pycharm_project_314/TianChi/main_5_output-8-29/'
    if not os.path.exists(sample_dir):
        os.makedirs(sample_dir)

    # Gradient post-processing modules do not depend on the step, so they are
    # built once instead of on every iteration.
    kernel = gkern(3, 2).astype(np.float32)
    gaussian_blur1 = GaussianBlur(kernel).to(device)
    addition = TVLoss()

    for path in tqdm(paths):
        start = time.time()
        print('processing ' + path + ' ===============>')
        image = Image.open(path)

        # in_variable starts as the original image and is updated with the
        # gradient below.
        in_tensor = img2tensor(np.array(image))
        in_variable = in_tensor.detach().to(device)

        # Reference embeddings of the unmodified image.
        origin_features = [
            extract(in_variable) for extract in feature_extractors
        ]

        # 1. untarget attack -> random noise
        # 2. target attack -> x = alpha * target + (1 - alpha) * x
        perturbation = torch.Tensor(3, 112, 112).uniform_(-0.1,
                                                          0.1).to(device)
        in_variable = in_variable + perturbation
        in_variable.data.clamp_(-1.0, 1.0)
        in_variable.requires_grad = True

        g_noise = 0.0  # accumulated gradient
        for i in range(steps):
            out_features = [
                extract(in_variable) for extract in feature_extractors
            ]
            loss = sum(
                criterion(origin, out)
                for origin, out in zip(origin_features, out_features))

            # compute gradients
            loss.backward(retain_graph=True)

            # L1-normalised gradient with momentum accumulation.
            g_noise = momentum * g_noise + (in_variable.grad /
                                            in_variable.grad.data.norm(1))
            g_noise = g_noise / g_noise.data.norm(1)

            # Alternate between Gaussian smoothing and the TV module.
            if i % 2 == 0:
                g_noise = gaussian_blur1(g_noise)
                g_noise = torch.clamp(g_noise, -0.1, 0.1)
            else:
                g_noise = addition(g_noise)

            in_variable.data = in_variable.data + (
                (eps / 255.) * torch.sign(g_noise))
            in_variable.grad.data.zero_()

        # deprocess image: (3, 112, 112) in the network's input range ->
        # (112, 112, 3) uint8 with reversed channel order for cv2.imwrite.
        adv = in_variable.data.cpu().numpy()[0]
        adv = adv * 128.0 + 127.0
        adv = adv.swapaxes(0, 1).swapaxes(1, 2)
        adv = adv[..., ::-1]
        adv = np.clip(adv, 0, 255).astype(np.uint8)

        # splitext keeps the full stem for names with several dots and also
        # works for files without an extension.
        stem = os.path.splitext(os.path.basename(path))[0]
        advimg = sample_dir + stem + '.jpg'

        cv2.imwrite(advimg, adv)
        print("save path is " + advimg)
        print('cost time is %.2f s ' % (time.time() - start))
def _read_persistence_pairs(txt_path):
    """Load a persistence-pair text file, padding tiny diagrams.

    Files holding fewer than four numbers are reshaped to a single pair that
    is repeated 100 times.
    """
    pairs = np.loadtxt(txt_path)
    if pairs.size < 4:
        pairs = np.reshape(pairs, [1, 2])
        pairs = np.repeat(pairs, 100, axis=0)
    return pairs


def _pd_file_name(dataset, f_name, view):
    """Return the per-view diagram file name ('_pd' for cherry, else '-pd')."""
    separator = '_pd' if dataset == 'cherry' else '-pd'
    return f_name + separator + str(view) + '.txt'


def data_loader_for_combined_model(file_list, dataset, config, isVenation):
    """Load images plus shape, texture and (optionally) venation features.

    Args:
        file_list: Iterable of image paths; the parent directory name is the
            class label, optionally prefixed with "yd".
        dataset: Dataset name; 'soybean' adds a growth-period directory level
            and 'cherry' switches the diagram file-name separator.
        config: Dict with the data paths, view counts, 'image_size',
            'max_val' and 'regx_str'.
        isVenation: Whether venation features are loaded and returned.

    Returns:
        ``(img_x, shape_x, texture_x, y)``, or
        ``(img_x, shape_x, texture_x, vein_x, y)`` when isVenation is true.
    """
    img_x, shape_x, texture_x, vein_x, y = [], [], [], [], []

    maxVal = config['max_val']
    name_pattern = re.compile(config['regx_str'])

    for path in tqdm(file_list):
        path = path.strip()
        parts = path.split('/')
        f_name = name_pattern.findall(parts[-1])[0]

        # Class directory, without the optional "yd" prefix.
        d = parts[-2][2:] if parts[-2].startswith("yd") else parts[-2]

        if dataset == 'soybean':
            period = parts[-3]
            shape_parent_path = os.path.join(config['shape_data_path'], d,
                                             period)
            texture_parent_path = os.path.join(config['texture_data_path'], d,
                                               period)
            vein_parent_path = os.path.join(config['vein_data_path'], d,
                                            period)
        else:
            shape_parent_path = os.path.join(config['shape_data_path'], d)
            texture_parent_path = os.path.join(config['texture_data_path'], d)
            if isVenation:
                vein_parent_path = os.path.join(config['vein_data_path'], d)

        # Shape: every view stacks three channels along the depth axis.
        shape_multiview_x = []
        for view in range(config['shape_views']):
            channel_vectors = []
            for channel in range(3):
                pairs = _read_persistence_pairs(
                    os.path.join(
                        shape_parent_path, f_name + '_' +
                        str(view_combination[view][channel]) + '.txt'))
                selected = get_persistence(pairs, shape_point_num)
                channel_vectors.append(pht['shape'](pairs[selected]))
            feature = np.dstack(channel_vectors)
            flag = np.sum(np.isinf(feature).astype(int))
            if flag > 0:
                print("Inf Error: {}".format(f_name))
            shape_multiview_x.append(feature)

        # Texture: one vector per view.
        texture_multiview_x = []
        for view in range(config['texture_views']):
            pairs = _read_persistence_pairs(
                os.path.join(texture_parent_path,
                             _pd_file_name(dataset, f_name, view)))
            selected = get_persistence(pairs, texture_and_vein_point_num)
            texture_multiview_x.append(pht['texture'](pairs[selected]))

        # Venation: like texture, but scaled by config['max_val'].
        if isVenation:
            vein_multiview_x = []
            for view in range(config['vein_views']):
                pairs = _read_persistence_pairs(
                    os.path.join(vein_parent_path,
                                 _pd_file_name(dataset, f_name, view)))
                selected = get_persistence(pairs, texture_and_vein_point_num)
                vein_multiview_x.append(
                    pht['venation'](pairs[selected]) / maxVal)

        img = skio.imread(os.path.join(path))
        img = resize(img,
                     [config['image_size'][0], config['image_size'][1], 3])

        shape_x.append(shape_multiview_x)
        texture_x.append(texture_multiview_x)
        img_x.append(img)
        y.append(int(d))
        if isVenation:
            vein_x.append(vein_multiview_x)

    if isVenation:
        return img_x, shape_x, texture_x, vein_x, y
    return img_x, shape_x, texture_x, y
from tqdm import *

# Minimal manual progress bar: the total is taken from ``n`` so the bar and
# the loop cannot get out of sync, and the ``with`` block closes the bar even
# if the loop raises.
n = 10
with tqdm(total=n) as bar:
    bar.set_description("You're in a progress bar...")
    for i in range(n):
        bar.update()
def train(load_model, learning_rate, num_epochs):
    """
    Trains the autoencoder.

    The image folder is split 3/5 - 1/5 - 1/5 into training, validation and
    test subsets by index. After every epoch the last training loss and the
    last validation loss are appended to a visdom plot and the model is
    saved; after training the model is evaluated once on the test subset.
    Validation and test run without gradient updates, so the held-out data
    never influences the weights.

    Keyword arguments:
    load_model -- load previous model
    learning_rate -- learning rate of the algorithm
    num_epochs -- training epochs
    """
    if not os.path.exists('../autoencoder_images'):
        os.mkdir('../autoencoder_images')

    batch_size = 128
    code_size = 1024
    linear_input = 2304
    linear_output = 5760

    vis = visdom.Visdom()

    img_transform = transforms.Compose([
        transforms.Resize((60, 108), Image.ANTIALIAS),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    dataset = datasets.ImageFolder(root='../training_set/',
                                   transform=img_transform)
    dataset_length = len(dataset)

    # Integer division keeps the three index ranges disjoint; with float
    # sizes the truncated boundaries could overlap.
    fifth = dataset_length // 5

    #Training
    n_training_samples = fifth * 3
    train_sampler = SubsetRandomSampler(
        np.arange(n_training_samples, dtype=np.int64))

    #Validation
    n_val_samples = fifth
    val_sampler = SubsetRandomSampler(
        np.arange(n_training_samples,
                  n_training_samples + n_val_samples,
                  dtype=np.int64))
    print(n_training_samples, n_training_samples + n_val_samples)

    #Test
    n_test_samples = fifth
    test_sampler = SubsetRandomSampler(
        np.arange(n_training_samples + n_val_samples,
                  n_training_samples + n_val_samples + n_test_samples,
                  dtype=np.int64))
    print(n_training_samples + n_val_samples,
          n_training_samples + n_val_samples + n_test_samples)

    train_loader = torch.utils.data.DataLoader(dataset,
                                               batch_size=batch_size,
                                               sampler=train_sampler,
                                               num_workers=2,
                                               drop_last=True)
    validation_loader = torch.utils.data.DataLoader(dataset,
                                                    batch_size=batch_size,
                                                    sampler=val_sampler,
                                                    num_workers=2,
                                                    drop_last=True)
    test_loader = torch.utils.data.DataLoader(dataset,
                                              batch_size=batch_size,
                                              sampler=test_sampler,
                                              num_workers=2,
                                              drop_last=True)

    train_loss_vector = []
    val_loss_vector = []
    epoch_vector = []

    if load_model:
        model = torch.load('../autoencoder.pth')
    else:
        model = autoencoder(linear_input, linear_output, code_size).cuda()

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        print('Training')
        model.train()
        for data in tqdm(train_loader):
            img, _ = data
            img = Variable(img).cuda()
            # ===================forward=====================
            output, _ = model(img)
            loss = criterion(output, img)
            # ===================backward====================
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # ===================log========================
        print('epoch [{}/{}], loss:{:.4f}'.format(epoch + 1, num_epochs,
                                                  loss.item()))
        pic = to_img(output.cpu().data)
        save_image(pic, '../autoencoder_images/image_{}.png'.format(epoch))
        train_loss_vector.append(loss.item())

        print('Validation')
        # Evaluation only: no backward pass and no optimizer step.
        model.eval()
        with torch.no_grad():
            for data in tqdm(validation_loader):
                img, _ = data
                img = Variable(img).cuda()
                output, _ = model(img)
                val_loss = criterion(output, img)
        print('Validation loss:{:.4f}'.format(val_loss.item()))
        val_loss_vector.append(val_loss.item())

        epoch_vector.append(epoch)

        validation_trace = dict(x=epoch_vector,
                                y=val_loss_vector,
                                mode="markers+lines",
                                type='custom',
                                marker={
                                    'color': 'red',
                                    'symbol': 104,
                                    'size': "10"
                                })
        # Not named ``train`` so the function is not shadowed.
        train_trace = dict(x=epoch_vector,
                           y=train_loss_vector,
                           mode="markers+lines",
                           type='custom',
                           marker={
                               'color': 'blue',
                               'symbol': 104,
                               'size': "10"
                           })
        layout = dict(title="Loss function",
                      xaxis={'title': 'epochs'},
                      yaxis={'title': 'loss'})
        vis._send({
            'data': [validation_trace, train_trace],
            'layout': layout,
            'win': 'aelosswin'
        })

        torch.save(model, '../autoencoder.pth')

    print('Testing')
    model.eval()
    with torch.no_grad():
        for data in tqdm(test_loader):
            img, _ = data
            img = Variable(img).cuda()
            output, _ = model(img)
            test_loss = criterion(output, img)
    # ===================log========================
    print('Testing loss:{:.4f}'.format(test_loss.item()))
    pic = to_img(output.cpu().data)
    save_image(pic, '../autoencoder_images/testing.png')

    torch.save(model, '../autoencoder.pth')
# Whitespace-separated stop words.
stopwords = open(stopwordFile, 'r').read().split()
# The first CSV row is skipped; each remaining row becomes one key/value pair.
queries = dict([row for row in csv.reader(open(queryFile, 'r'))][1:])
titles = json.load(open(titleJson, "r"))

# Strip every entry and drop the ones that are empty after stripping.
trim = lambda f: [t.strip() for t in f if t.strip()]

# token[i] is the text that belongs to the key tokey[i].
token = trim(open(tokenFile).read().split('\n'))#[:5000]#[:301]
tokey = trim(open(tokeyFile).read().split('\n'))#[:5000]#[:301]

# append title to doc
print(""" appending title to document... """)
# Number of times the segmented title is appended to its document.
title_weight = 2
for i, key in enumerate(tqdm(tokey)):
    title = retain_chinese(titles.get(key, '')).strip()
    if title and title != "Non":
        title_token = ' {}'.format(' '.join([w for w in cut_method(title) if w not in stopwords])) * title_weight
        token[i] += title_token
        #print('+= ' + title_token)

# NOTE(review): this consistency check runs after the loop above has already
# indexed token[i], and it exits with status 0 -- confirm both are intended.
if len(token) != len(tokey):
    print('token len sould eq to tokey len')
    exit(0)

bm25 = BM25Transformer()
vectorizer = TfidfVectorizer()
print("""
def get_time_evolution(N, wmin, wmax, total_time):
    """Simulate a driven lattice of oscillators while sweeping the frequency.

    The oscillator at index ``[-2, -2]`` of an ``(N//2 + 3)``-wide square
    grid is driven with ``sin(w * t)`` while ``w`` rises linearly from
    ``wmin`` to ``wmax`` over ``total_time``. The grid is advanced with
    Heun's method (Euler predictor, trapezoidal corrector).

    Args:
        N: Number of oscillators per side; must be odd.
        wmin: Driving frequency at the start of the sweep.
        wmax: Driving frequency at the end of the sweep.
        total_time: Duration of the sweep.

    Returns:
        Tuple ``(frequencies, shifts)`` of equally long lists: the driving
        frequency and the scaled mean squared displacement of the interior
        of the grid after every step.

    Raises:
        ValueError: If N is even.
    """
    # N must be odd in order to locate the central oscillator.
    if N % 2 == 0:
        raise ValueError('N must be odd')
    N = int(N / 2) + 3

    centre = -2
    dt = 0.01  # Somewhat optimal value
    u = 0.01
    dw = (wmax - wmin) / total_time * dt
    num_steps = round(total_time / dt)

    def coupling(field):
        """Sum of the four rolled neighbours minus four times the site."""
        return (np.roll(field, 1, 1) + np.roll(field, -1, 1) +
                np.roll(field, 1, 0) + np.roll(field, -1, 0) - 4 * field)

    def clean_boundaries(field):
        """Overwrite the outermost rows and columns in place."""
        field[0] = field[1]
        field[-1] = field[:, -3]
        field[:, 0] = field[:, 1]
        field[:, -1] = field[-3]

    t = 0
    w = wmin
    frequencies, shifts = [], []
    z = np.zeros((N, N))
    v = np.zeros((N, N))

    for _ in tqdm(range(num_steps)):
        # Advance the sweep and pin the driven oscillator.
        w += dw
        t += dt
        z[centre, centre] = np.sin(w * t)

        # Predictor: plain Euler estimate of the next state.
        a = coupling(z) - u * v
        v_euler = v + a * dt
        z_euler = z + v * dt
        clean_boundaries(z_euler)
        z_euler[centre, centre] = np.sin(w * (t + dt))

        # Corrector: average the slopes at both ends of the step.
        a_euler = coupling(z_euler) - u * v_euler
        v, z = (v + 0.5 * (a + a_euler) * dt,
                z + 0.5 * (v + v_euler) * dt)
        clean_boundaries(z)

        # Record the zero plane displacement and the current frequency.
        shifts.append(np.mean(z[1:-1, 1:-1]**2) / (N - 2)**2)
        frequencies.append(w)

    return frequencies, shifts
ws = wb.worksheets[0]
row = ws.max_row
# max_row == 1 is treated as "no data yet": write the header into row 1.
# NOTE(review): a sheet that already holds only a header also reports
# max_row == 1; the header is then simply rewritten.
if row == 1:
    ws.cell(row=row, column=1).value = 'user_id'
    ws.cell(row=row, column=2).value = 'user_name'
    ws.cell(row=row, column=3).value = 'portrait'
    ws.cell(row=row, column=4).value = 'post_id'
    ws.cell(row=row, column=5).value = 'content_text'
    ws.cell(row=row, column=6).value = 'lou_num'
    ws.cell(row=row, column=7).value = 'post_time'
    ws.cell(row=row, column=8).value = 'url'
    ws.cell(row=row, column=9).value = 'crawl_time'
# Move to the first free row below the existing content.
row += 1
need_update = 0
with tqdm(total=len(tiezi_list)) as pbar:
    for url in tiezi_list:
        # tiezi = Tiezi(url, is_check_tiezi, all_tiezi_address)
        tiezi = Tiezi(url)
        contents = tiezi.get_content()
        if len(contents) == 0:
            print("the length of contents is 0!")
        for content in contents:
            # Skip posts whose id is in either id collection and posts
            # without any text.
            if content['post_id'] in all_post_id_ori:
                continue
            if content['post_id'] in all_post_id_new:
                continue
            if len(content['content_text']) == 0:
                continue
            ws.cell(row=row, column=1).value = content['user_id']
            ws.cell(row=row, column=2).value = content['user_name']
def train(self,
          imsize,
          batch_size,
          input_ch,
          epoch_nbr,
          net_weights_init,
          dir_images,
          saver_directory,
          images_root,
          label_nbr,
          learning_rate,
          variable_scope="s"):
    """Train the segmentation network on the images of ``dir_images``.

    Samples are discovered through the ``.npz`` files in
    ``<dir_images>/labels/``; the matching ``.png`` images are read from
    ``<dir_images>/<images_root>``. The mean loss of every epoch is appended
    to ``<saver_directory>/loss.txt`` and the model is checkpointed every ten
    epochs and once more at the end.

    Args:
        imsize: Side length of the (square) network input.
        batch_size: Number of images per batch; the last batch of every
            epoch is always dropped.
        input_ch: Number of input channels.
        epoch_nbr: Number of epochs.
        net_weights_init: Weights passed to ``net.load`` when the model
            function returns a network to initialise.
        dir_images: Dataset root directory.
        saver_directory: Output directory; it is deleted and recreated.
        images_root: Sub-directory of ``dir_images`` holding the images.
        label_nbr: Number of classes.
        learning_rate: Learning rate of the Adam optimiser.
        variable_scope: Name of the variable scope of the model; only
            variables whose name contains it are saved.
    """
    with tf.Graph().as_default():
        # create placeholders
        images = tf.placeholder(tf.float32,
                                [None, imsize, imsize, input_ch],
                                name="images")
        labels = tf.placeholder(tf.int32, [None, imsize, imsize],
                                name="labels")

        with tf.variable_scope(variable_scope):
            # create model
            deconv_net, net = self.model_function(images, label_nbr)
            predictions = deconv_net[-1]

        # create saver
        saver = tf.train.Saver(
            [v for v in tf.global_variables() if variable_scope in v.name])

        # error
        reshaped_labels = tf.reshape(labels, [-1])
        reshaped_predictions = tf.reshape(predictions, [-1, label_nbr])
        loss = tf.contrib.losses.sparse_softmax_cross_entropy(
            reshaped_predictions, reshaped_labels)

        # optimizer
        optimizer = tf.train.AdamOptimizer(learning_rate)
        train_step = optimizer.minimize(loss)

        # create session; it is closed explicitly, even when training fails
        sess = tf.Session()
        try:
            sess.run(tf.global_variables_initializer())

            # load net weights if needed
            if net is not None:
                net.load(net_weights_init, sess)

            # create the list of images in the folder
            directory = os.path.join(dir_images, images_root)
            directory_labels = os.path.join(dir_images, "labels/")
            files = []
            for label_file in os.listdir(directory_labels):
                if label_file.endswith(".npz"):
                    # drop the extension, keep any other dots of the name
                    files.append(".".join(label_file.split(".")[:-1]))

            # load one image to get the array shape of a sample
            im_shape = scipy.misc.imread(
                os.path.join(directory, files[0] + ".png")).shape

            # create directory
            if os.path.exists(saver_directory):
                shutil.rmtree(saver_directory)
            os.makedirs(saver_directory)

            # open file for loss
            with open(os.path.join(saver_directory, "loss.txt"), 'w') as f:
                # iterate
                for epoch in range(epoch_nbr):
                    print("epoch " + str(epoch))

                    total_loss = 0

                    # create batches
                    shuffle(files)
                    batches = [
                        files[i:i + batch_size]
                        for i in range(0, len(files), batch_size)
                    ]
                    # remove last batch (potentially not the same size)
                    batches = batches[:-1]

                    # Buffers reused for every batch; all entries are
                    # overwritten before each use.
                    batch_ = np.zeros((batch_size, im_shape[0], im_shape[1],
                                       im_shape[2]),
                                      dtype=float)
                    labels_ = np.zeros(
                        (batch_size, im_shape[0], im_shape[1]), dtype=int)

                    for batch_files in tqdm(batches):
                        for im_id, name in enumerate(batch_files):
                            batch_[im_id] = scipy.misc.imread(
                                os.path.join(directory, name + ".png"))
                            labels_[im_id] = np.load(
                                os.path.join(directory_labels,
                                             name + ".npz"))["arr_0"]
                        batch_ /= 255

                        fd = {images: batch_, labels: labels_}
                        [l, tr_] = sess.run([loss, train_step], fd)
                        total_loss += l

                    # With too few files no batch is left; report NaN instead
                    # of dividing by zero.
                    sample_count = len(batches) * batch_size
                    mean_loss = (total_loss / sample_count
                                 if sample_count else float("nan"))
                    print(mean_loss)

                    f.write(str(mean_loss) + " \n")
                    f.flush()

                    if ((epoch + 1) % 10 == 0):
                        # save the model
                        saver.save(sess,
                                   os.path.join(saver_directory,
                                                "model.ckpt"))

                # save the model
                saver.save(sess, os.path.join(saver_directory, "model.ckpt"))
        finally:
            # close session
            sess.close()
def load_patch_data(data_dir,
                    preprocess_dir,
                    patch_size,
                    labels_known=True,
                    num_patches=100):
    '''
    Collects image patches, and their labels when they are known.

    With labels_known=True every entry of data_dir is treated as one class;
    classes are numbered in alphabetical order and the mapping is written to
    "class_encodings.txt" two levels above data_dir unless that file already
    exists. With labels_known=False the volumes are first passed through
    orient() and robust_fov() and only patches and filenames are returned.

    Params:
        - data_dir: string, path to all training class directories
        - preprocess_dir: string, path to destination for robustfov files
        - patch_size: 3-element tuple of integers, size of the patches
        - labels_known: boolean, True for training or validation data,
                        False for data that is to be classified
        - num_patches: integer, forwarded to get_patches for every volume
    Returns:
        - labels_known=True:  data, labels, all_filenames
          (float16 arrays; data carries an extra trailing axis of length 1,
          labels are one-hot encoded)
        - labels_known=False: data, all_filenames
    '''
    data = []
    labels = []
    all_filenames = []

    #################### CLASSIFICATION OF UNKNOWN DATA ####################
    if not labels_known:
        print("*** CALLING 3DRESAMPLE ***")
        orient_dir = orient(data_dir, preprocess_dir)

        print("*** CALLING ROBUSTFOV ***")
        robustfov_dir = robust_fov(orient_dir, preprocess_dir)

        # Plain files only, in alphabetical order.
        filenames = sorted(
            entry for entry in os.listdir(robustfov_dir)
            if not os.path.isdir(os.path.join(robustfov_dir, entry)))

        for f in filenames:
            volume = nib.load(os.path.join(robustfov_dir, f)).get_data()
            for patch in tqdm(get_patches(volume, patch_size, num_patches)):
                data.append(patch)
                all_filenames.append(f)

        print("A total of {} patches collected.".format(len(data)))
        return np.array(data), all_filenames

    #################### TRAINING OR VALIDATION ####################
    # One class per entry of data_dir, numbered alphabetically.
    class_directories = sorted(
        os.path.join(data_dir, entry) for entry in os.listdir(data_dir))
    num_classes = len(class_directories)

    # Write the mapping as space-separated "CLASS_NAME integer_category"
    # lines, unless the file is already there.
    class_encodings_file = os.path.join(data_dir, "..", "..",
                                        "class_encodings.txt")
    if not os.path.exists(class_encodings_file):
        with open(class_encodings_file, 'w') as encodings:
            for class_id, class_dir in enumerate(class_directories):
                encodings.write(
                    os.path.basename(class_dir) + " " + str(class_id) + '\n')

    print("*** GATHERING PATCHES ***")
    for class_id, class_dir in enumerate(class_directories):
        for f in tqdm(sorted(os.listdir(class_dir))):
            volume = nib.load(os.path.join(class_dir, f)).get_data()
            for patch in get_patches(volume, patch_size, num_patches):
                data.append(patch)
                labels.append(
                    to_categorical(class_id, num_classes=num_classes))
                all_filenames.append(f)

    print("A total of {} patches collected.".format(len(data)))

    data = np.array(data, dtype=np.float16)
    # Append a trailing axis of length 1 to the patch array.
    data = np.reshape(data, (data.shape + (1, )))
    labels = np.array(labels, dtype=np.float16)
    return data, labels, all_filenames
def run(self):
    """Run the model for one data assimilation window.

    For the first window (``self.start_day == 0``) a snapshot is loaded from
    ``self.snapshot_file``, updated with ``self.params``, uploaded to a
    ``Simulator`` and stepped ``self.run_length`` times while a ``Summary``
    collects the people statuses after every step.

    Returns:
        Nothing on the first-window path (it ends without a ``return``).
        The other path returns ``{"simulator": snapshot}``.

    NOTE(review): restarting from previous particles (``start_day != 0``) is
    not implemented, and ``snapshot`` is only assigned on the first-window
    path, so the ``else`` branch looks like it raises ``NameError``.
    Confirm the intended return value for both paths.
    """
    # If this is the first data assimilation window, we can just run the model as normal
    if self.start_day == 0:
        assert self.current_particle_pop_df is None  # Shouldn't have any previously-created particles
        # load snapshot
        snapshot = Snapshot.load_full_snapshot(path=self.snapshot_file)
        # set params
        snapshot.update_params(self.params)
        # Can set the random seed to make it deterministic (None means np will choose one randomly)
        snapshot.seed_prngs(seed=None)

        # Create a simulator and upload the snapshot data to the OpenCL device
        simulator = Simulator(snapshot, opencl_dir=self.opencl_dir, gpu=self.use_gpu)
        simulator.upload_all(snapshot.buffers)

        if not self.quiet:
            print("Running simulation")

        params = Params.fromarray(
            snapshot.buffers.params
        )  # XX Why extract Params? Can't just use PARAMS?

        summary = Summary(
            snapshot,
            store_detailed_counts=self.store_detailed_counts,
            max_time=self.run_length  # Total length of the simulation
        )

        # Show a progress bar unless running in quiet mode.  Both variants
        # must cover run_length steps: iterating over range(self.quiet)
        # would be range(False), i.e. zero steps.
        timestep_iterator = range(self.run_length) if self.quiet \
            else tqdm(range(self.run_length), desc="Running simulation")

        iter_count = 0  # Count the total number of iterations
        # Run for iterations days
        for _ in timestep_iterator:
            # Update parameters based on lockdown
            params.set_lockdown_multiplier(snapshot.lockdown_multipliers, iter_count)
            simulator.upload("params", params.asarray())

            # Step the simulator
            simulator.step()
            iter_count += 1

            # Update the statuses
            simulator.download("people_statuses", snapshot.buffers.people_statuses)
            summary.update(iter_count, snapshot.buffers.people_statuses)

        if not self.quiet:
            for i in range(self.run_length):
                print(f"\nDay {i}")
                summary.print_counts(i)

        if not self.quiet:
            print("\nFinished")

        # Download the snapshot from OpenCL to host memory.  The call's
        # return value is not used (an earlier note says it is None).
        simulator.download_all(snapshot.buffers)
    else:
        # Otherwise we need to restart previous models stored in the current_particle_pop_df
        # XXXX CAN GET OLD MODEL STATES, WITH ALL DISEASE STATUSES, FROM THE DF. TWO ISSUES
        # 1. But need to work out how to draw these appropriately; can't assume they are each as good as
        # each other. THIS SHOULD BE OK, surely there's a way to go from the final particles and weights
        # to the DF of state vectors. Particle ID? Just try it out.
        # 2. Also: what to do about stochasticity. For a given (global) parameter combination, we will
        # get quite different results depending on the mode state. - I DON'T THINK THIS IS A PROBLEM.
        # ABC Commonly used with stochastic models. E.g. https://eprints.lancs.ac.uk/id/eprint/80439/1/mainR1.pdf
        #
        # raise Exception("Not implemented yet")

        # Return the current state of the model in a dictionary describing what it is
        #return {"simulator": simulator}
        return {"simulator": snapshot}
def analyze_track(model, waveform, sample_rate):
    """Classify a waveform window by window and emit subtitle records and cuts.

    A window of ``sample_rate / args.window_size_divide`` samples is moved
    over the track in steps of ``window_size / args.window_slide_divide``
    samples.  Each window is turned into a spectrogram and classified by
    ``model``.  When the predicted class changes (or the last window is
    reached) a subtitle record for the segment that just ended is written,
    per-class statistics are updated, and ``generate_cut`` /
    ``generate_tdata`` is called for that segment.

    Params:
        model: callable applied to a batched spectrogram; element 0 of its
            result is passed to argmax / softmax.
        waveform: forwarded to ``get_spectrogram``.
        sample_rate: samples per second, used to convert sample offsets to
            seconds.

    Reads the module-level ``args``, ``video_path``, ``audio_len`` and
    ``labels``; rebinds the globals ``pbar`` and ``stats``.
    """
    global cuts, mergelist, pbar, audio_len, labels, stats

    # state vars for analysis loop
    lastc = 1  # last seen class
    lasts = 0  # last visited second
    lastts = "00:00:00.000"  # last cut was at this timestamp
    count = 0  # number of subtitle records
    lastwf = 0  # last frame of last analyzed. for 0s --> 1s at 16000Hz would be 16000
    stats = [[0, 0] for _ in range(len(labels))]  # per class: [segment count, total seconds]

    # NOTE(review): the handle is only closed by the sub.close() at the end;
    # an exception inside the loop would leave the .srt file open.
    if args.srt:
        sub = open(video_path[:-4] + ".srt", 'w',
                   encoding='utf-8')  # subtitle track name
    else:
        sub = io.StringIO(
        )  # RAM file if no subtitle file needs to be generated

    window_size = int(sample_rate / args.window_size_divide)  # 1s by default
    window_slide = int(window_size / args.window_slide_divide)
    # slide the window of size window_size by window_slide per iteration.
    # overlap may occur.

    print("analyzing track...")

    # Largest multiple of window_slide that is <= audio_len.
    # NOTE(review): when audio_len is an exact multiple of window_slide,
    # range() below never yields last_i, so the "end of track" condition
    # would not fire -- confirm.
    last_i = window_slide * int(audio_len / window_slide)
    pbar = tqdm(total=last_i)
    for i in range(0, audio_len, window_slide):
        pbar.update(n=window_slide)

        spectrogram = get_spectrogram(waveform, i, window_size)
        spectrogram = tf.expand_dims(spectrogram, axis=0)  # add a leading axis before calling the model
        prediction = model(spectrogram)
        cls = int(tf.math.argmax(prediction[0]))
        conf = float(tf.nn.softmax(prediction[0])[cls])  # softmax score of the predicted class

        # generate cut when we know the end of it (or the track is at its end)
        if cls != lastc or i == last_i:
            s = i / sample_rate  # segment end, in seconds
            if i == last_i:
                # extend the final segment to the very end of the track
                s += (audio_len - i) / sample_rate
            # Format as a zero-padded timestamp truncated to milliseconds.
            ts = "0" + str(datetime.timedelta(seconds=s))[:11]
            if len(ts) <= 8:
                # timedelta prints no fractional part for whole seconds
                ts += ".000"

            # if the window slide is overlapping the previous analyzed window
            # and prediction has changed, don't generate a new cut until we are over it
            # ...unless an "emh" is detected! [don't truncate last detected ehm]
            if labels[cls] != "emh" and i < lastwf and i < last_i:
                continue

            # generate subtitles
            # NOTE(review): the record is labelled with the previous class
            # (lastc) but conf belongs to the current window's class --
            # confirm this is intended.
            record = str(count) + "\n" + lastts.replace('.',',') + " --> " + \
                ts.replace('.',',') + "\n" + labels[lastc] + \
                "\n[" + str(conf * 100)[:4] + "]" +"\n\n"
            count += 1
            sub.write(record)

            stats[lastc][0] += 1
            stats[lastc][1] += s - lasts
            lasts = s

            # generate cut
            if labels[lastc] == "speech":
                generate_cut(lastts, ts, count)
            elif args.generate_training_data:
                generate_tdata(lastts, ts, count, labels[lastc])

            lastts = ts
            lastc = cls

        # slide the right hand side of the window detection.
        # This allows to cut segments > than window size
        lastwf = i + window_size

        if not args.spectrogram:
            continue

        # Optional live preview of the spectrogram with the predicted label.
        img = spectrogram.numpy().T
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        cv2.putText(img, labels[cls], (5, 120), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
        cv2.imshow("spectrogram", img)
        cv2.waitKey(1) & 0xFF
        time.sleep(0.2)

    sub.close()
def preprocess_images(imageIds, dst, threeBandImagesDir):
    """Run ``stack`` once per image id, writing ``<imageId>.tif`` into ``dst``.

    For every id the only band path handed to ``stack`` is
    ``threeBandImagesDir / "<imageId>.tif"``.  ``dst`` and
    ``threeBandImagesDir`` must support the ``/`` operator with a string.
    """
    for image_id in tqdm(imageIds, leave=False):
        tif_name = f"{image_id}.tif"
        source_tif = threeBandImagesDir / tif_name
        target_tif = dst / tif_name
        stack(band_paths=[source_tif], out_path=target_tif)
width=params.width, array_noise_level=params.array_noise_level, array_noise_seed=seeds_for_noise[CESnumber], mapping_perpair=params.mapping_perpair) ## Initialise map containers for each processor if pos_CES == 0: sky_out_tot = OutputSkyMap(projection=tod.projection, nside=tod.nside_out, obspix=tod.obspix, npixsky=tod.npixsky, pixel_size=tod.pixel_size) ## Scan input map to get TODs d = [] for det in tqdm(range(inst.focal_plane.nbolometer)): d.append(tod.map2tod(det)) ## Project TOD to maps tod.tod2map(np.array(d), sky_out_tot) MPI.COMM_WORLD.barrier() ## Coaddition over all processors. ## Note that all processors will then have the coadded data. ## If you want informations at the level of each CES (or group of), ## use instead: ## final_map = OutputSkyMap(nside=nside_out, obspix=tod.obspix) ## final_map.coadd_MPI(sky_out_tot, MPI=MPI) sky_out_tot.coadd_MPI(sky_out_tot, MPI=MPI)
def read_us_user_like_page(input_file_path):
    """Read "us_user_like_page" data into a python dictionary.

    First check the structure of the input file, then calculate the number
    of shared users between pages and store it in a dictionary.

    Args:
        input_file_path: A string, path of the input file us_user_like_page.

    Returns:
        page_page_dict: A dictionary of dictionaries that stores the number
        of shared users between two pages, keyed by page id on both levels.
        The entry of a page with itself counts the users who liked that page.
        ex.
        {
            pageid1: {pageid2: (shared users pageid1 & pageid2),
                      pageid3: (shared users pageid1 & pageid3), ...},
            pageid2: {pageid3: (shared users pageid2 & pageid3), ...},
            ...
        }

    Raises:
        incorrect_file_type: The file cannot be read, or its first data row
        lacks one of the expected columns.

    Example of correct structure:
        user_id,like_pages,like_times
        1000000736695525,21785951839,1
        1000001070029820,"44473416732,50978409031,630067593722141","2,1,2"
    """
    required_columns = ("user_id", "like_pages", "like_times")
    bad_file_message = "input should be an us_user_like_page data"

    # Structure check on the first data row only.  The file is opened in a
    # `with` block so the handle is always released.
    try:
        with open(input_file_path, "r") as inputfile:
            first_row = next(csv.DictReader(inputfile), None)
    except Exception:
        # Any read/parse failure is reported as a wrong file type, as before.
        raise incorrect_file_type(bad_file_message)
    if first_row is not None and any(
            column not in first_row for column in required_columns):
        raise incorrect_file_type(bad_file_message)

    page_page_dict = {}
    with open(input_file_path, "r") as inputfile:
        reader = csv.DictReader(inputfile)
        for row in tqdm(reader, total=get_num_lines(input_file_path)):
            pageid_list = row['like_pages'].split(',')
            for j, p in enumerate(pageid_list):
                shared_with_p = page_page_dict.setdefault(p, {})
                # A page shares every one of its users with itself.
                shared_with_p[p] = shared_with_p.get(p, 0) + 1
                # Pair p with every page listed after it, so that each
                # unordered pair is counted once per user, symmetrically.
                for p1 in pageid_list[j + 1:]:
                    shared_with_p1 = page_page_dict.setdefault(p1, {})
                    shared_with_p[p1] = shared_with_p.get(p1, 0) + 1
                    shared_with_p1[p] = shared_with_p1.get(p, 0) + 1
    return page_page_dict
import csv
from csv import writer

notify = Notify()
notify.register()

# URL templates; '{}' is filled in with str.format.
# NOTE(review): url_scores and url_contact_info contain a ')' right after
# '{}' which looks unintended -- confirm against the real site URLs.
url_names = 'https://www.maxpreps.com/rankings/football-fall-17/{}/state/texas.htm'
url_scores = 'https://www.maxpreps.com/high-schools/{})/football-fall-17/schedule.htm'
url_contact_info = 'https://www.maxpreps.com/high-schools/{})/home.htm'

#state_set = {'texas'}
# A comma was missing after 'north-dakota', which silently merged it with
# 'nebraska' into the single entry 'north-dakotanebraska'.
state_set = [
    'indiana', 'maine', 'minnesota', 'north-dakota', 'nebraska', 'nevada',
    'ohio', 'oregon', 'texas', 'virginia'
]

# Fetch 50 ranking pages (url_names formatted with 0..49) and collect the
# school-name header cell of every table row.
for x in tqdm(range(0, 50, 1)):
    names = url_names.format(x)
    r = requests.get(names)
    sopa = BeautifulSoup(r.text, 'html.parser')
    for item in sopa.find_all('tr'):
        try:
            school_name.append(
                item.find('th', attrs={
                    'class': 'school',
                    'scope': 'row'
                }))
        except Exception:
            # Keep one entry per row even when the lookup fails.
            school_name.append(np.nan)

new_list = []
for i in school_name:
    i = str(i)
# Parameters that stay trainable during warm-up: the embedding layer of the
# wrapped model plus the parameters of the loss (criterion).
unfreeze_model_param = list(
    model.module.model.embedding.parameters()) + list(
        criterion.parameters())

# At epoch 0, freeze every model parameter that is not in the list above.
if epoch == 0:
    for param in list(
            set(model.parameters()).difference(
                set(unfreeze_model_param))):
        param.requires_grad = False
# At epoch args.warm, make the same set of parameters trainable again.
if epoch == args.warm:
    for param in list(
            set(model.parameters()).difference(
                set(unfreeze_model_param))):
        param.requires_grad = True

pbar = tqdm(enumerate(dl_tr))

for batch_idx, (x, y) in pbar:
    # Forward pass and loss on the GPU copies of the squeezed batch.
    m = model(x.squeeze().cuda())
    loss = criterion(m, y.squeeze().cuda())

    opt.zero_grad()
    loss.backward()

    # Clip every gradient element to [-10, 10] before the optimizer step.
    torch.nn.utils.clip_grad_value_(model.parameters(), 10)
    if args.loss == 'Proxy_Anchor':
        # The Proxy_Anchor criterion's own parameters are clipped as well.
        torch.nn.utils.clip_grad_value_(criterion.parameters(), 10)

    losses_per_epoch.append(loss.data.cpu().numpy())
    opt.step()
from time import sleep
from tqdm import *
from multiprocessing import Pool, freeze_support, RLock

# NOTE(review): this rebinds the builtin name `str` at module level, so any
# later call such as str(x) in this module would fail -- consider renaming.
str = '123124555'
# Iterate over the characters with a tqdm progress bar, sleeping 0.1 s per
# character so the bar's progress is visible.
for i in tqdm(str):
    sleep(0.1)
def main(self):
    """Process every batch of ``self.data_loader`` once and return the metrics.

    In ``'train'`` mode each batch is passed to ``self.baseprocess`` and then
    ``self.total_loss`` is back-propagated and the optimizer(s) stepped; when
    ``self.cfg.renew_weight`` is set the action loss weights are blended with
    new weights derived from the per-class accuracy.  In ``'test'`` mode the
    batches are processed under ``torch.no_grad()`` without any update.

    Returns:
        dict: mode, elapsed time, epoch, average loss and the accuracy /
        confusion statistics read from the meters.  The train dict
        additionally carries ``'actions_loss_weights'``.

    Raises:
        AssertionError: if ``self.mode`` is neither ``'train'`` nor ``'test'``.
    """
    if self.mode == 'train':
        print("Training in epoch %s" % self.epoch)
        print(self.cfg.actions_weights)
        for batch_data in tqdm(self.data_loader):
            self.baseprocess(batch_data)

            # Optim
            self.optimizer.zero_grad()
            if self.cfg.center_loss_weight > 0:
                self.lossOpti.zero_grad()
            self.total_loss.backward()
            self.optimizer.step()
            # multiply by (1./alpha) in order to remove the effect of alpha on updating centers
            if self.cfg.center_loss_weight > 0:
                for param in self.centerlossModel.parameters():
                    param.grad.data *= (1. / self.cfg.center_loss_weight)
                # The optimizer lives on the instance (see zero_grad above);
                # the bare name `lossOpti` was undefined here.
                self.lossOpti.step()

        # renew the action loss weight by accuracy
        if self.cfg.renew_weight:
            # softmin over the per-class correct rates, then scaled by 9
            new_weight = torch.nn.functional.softmin(self.actions_meter.correct_rate_each, dim=0)
            new_weight = new_weight * 9.
            old_weight = torch.tensor(self.cfg.actions_weights)
            # linear blend of old and new weights, controlled by weight_renew_rate
            new_weight = old_weight * (1 - self.cfg.weight_renew_rate) + self.cfg.weight_renew_rate * new_weight
            self.cfg.actions_weights = new_weight.tolist()

        info = {
            'mode': self.mode,
            'time': self.epoch_timer.timeit(),
            'epoch': self.epoch,
            'loss': self.loss_meter.avg,
            'actions_acc': self.actions_meter.correct_rate,
            'actions_ave_acc': self.actions_meter.ave_rate,
            'actions_each_acc': self.actions_meter.correct_rate_each.numpy().round(3),
            'actions_each_num': self.actions_meter.all_num_each,
            'actions_loss_weights': self.actions_loss_weight.correct_rate_each.numpy().round(3),
            'actions_confusion': self.confuMatrix.class_acc.numpy().round(3),
            'oriens_acc': self.oriens_meter.correct_rate,
            'oriens_ave_acc': self.oriens_meter.ave_rate,
            'oriens_each_acc': self.oriens_meter.correct_rate_each.numpy().round(3),
            'oriens_each_num': self.oriens_meter.all_num_each,
            'oriens_confusion': self.confuMatrix2.class_acc.numpy().round(3)
        }
    elif self.mode == 'test':
        print("Testing in test dataset")
        with torch.no_grad():
            for batch_data in tqdm(self.data_loader):
                self.baseprocess(batch_data)
        info = {
            'mode': self.mode,
            'time': self.epoch_timer.timeit(),
            'epoch': self.epoch,
            'loss': self.loss_meter.avg,
            'actions_acc': self.actions_meter.correct_rate,
            'actions_ave_acc': self.actions_meter.ave_rate,
            'actions_each_acc': self.actions_meter.correct_rate_each.numpy().round(3),
            'actions_each_num': self.actions_meter.all_num_each,
            'actions_confusion': self.confuMatrix.class_acc.numpy().round(3),
            'oriens_acc': self.oriens_meter.correct_rate,
            'oriens_ave_acc': self.oriens_meter.ave_rate,
            'oriens_each_acc': self.oriens_meter.correct_rate_each.numpy().round(3),
            'oriens_each_num': self.oriens_meter.all_num_each,
            'oriens_confusion': self.confuMatrix2.class_acc.numpy().round(3)
        }
    else:
        # Raised explicitly (same exception type as the former `assert False`)
        # so the check also fires when Python runs with -O.
        raise AssertionError("mode name incorrect")

    return info
def __init__(self,
             root=None,
             train=False,
             transform=None,
             mist_transform=None,
             loader=default_loader,
             seqlen=5,
             debug=False,
             dist_filter=None,
             off_3d=True,
             off_pc_render=True,
             overwrite_fofn=False,
             semantic_transform=np.array,
             env=None):
    """Index the scenes under ``root`` into nearest-neighbour view sequences.

    A scene is a sub-directory of ``root`` that contains ``camera_poses.csv``
    and a ``pano`` directory.  For every pose whose
    ``pano/points/point_<uuid>.json`` file exists, the ``seqlen`` closest
    poses of the same scene (by Euclidean distance of their xyz, the pose
    itself first) are stored in ``self.select``.  The index is cached in
    ``<root>_fofn<0|1>.pkl`` and reloaded from there on later runs.

    Params:
        root: dataset directory; defaults to ``dataset`` next to the
            ``assets`` package.
        train: if True only the first 90% of the sorted scenes are indexed.
        transform: stored as both ``transform`` and ``target_transform``.
        mist_transform: stored as ``depth_trans``.
        loader: stored as ``self.loader``.
        seqlen: number of poses per sequence; scenes with fewer usable
            poses are skipped.
        debug: if True only the first 35 scenes are indexed.
        dist_filter: if given, a pose is only kept when its nearest other
            pose is closer than this distance.
        off_3d: stored as ``self.off_3d``.
        off_pc_render: if False the ``render`` shared library is loaded.
        overwrite_fofn: rebuild the index even if the cache file exists.
        semantic_transform: stored as ``semantic_trans``.
        env: object whose ``config["ui_components"]`` decides whether
            semantics are required; ``None`` means they are not.
    """
    print('Processing the data:')
    if not root:
        self.root = os.path.join(
            os.path.dirname(os.path.abspath(assets.__file__)), "dataset")
    else:
        self.root = root
    self.train = train
    self.env = env
    self.loader = loader
    self.seqlen = seqlen
    self.transform = transform
    self.target_transform = transform
    self.depth_trans = mist_transform
    self.semantic_trans = semantic_transform
    # env defaults to None; without an environment there is no UI config,
    # so semantics are treated as not required instead of raising.
    self._require_semantics = (env is not None and
                               "SEMANTICS" in env.config["ui_components"])
    self.off_3d = off_3d
    self.select = []
    self.fofn = self.root + '_fofn' + str(int(train)) + '.pkl'
    self.off_pc_render = off_pc_render
    self.dll = None
    if not self.off_pc_render:
        self.dll = np.ctypeslib.load_library('render', '.')

    if overwrite_fofn or not os.path.isfile(self.fofn):
        self.scenes = sorted(
            d for d in os.listdir(self.root)
            if os.path.isdir(os.path.join(self.root, d))
            and os.path.isfile(os.path.join(self.root, d, 'camera_poses.csv'))
            and os.path.isdir(os.path.join(self.root, d, 'pano')))

        num_scenes = len(self.scenes)
        num_train = int(num_scenes * 0.9)

        print("Total %d scenes %d train %d test" %
              (num_scenes, num_train, num_scenes - num_train))
        # NOTE(review): with train=False all scenes are kept, not only the
        # remaining 10% -- confirm this is the intended test split.
        if train:
            self.scenes = self.scenes[:num_train]

        self.meta = {}
        last = 35 if debug else len(self.scenes)

        for scene in self.scenes[:last]:
            posefile = os.path.join(self.root, scene, 'camera_poses.csv')
            points_dir = os.path.join(self.root, scene, 'pano', 'points')
            with open(posefile) as f:
                for line in f:
                    fields = line.strip().split(',')
                    uuid = fields[0]
                    xyz = list(map(float, fields[1:4]))
                    quat = list(map(float, fields[4:8]))
                    # The scene entry is created as soon as one pose line
                    # is read, even if none of its point files exists.
                    scene_meta = self.meta.setdefault(scene, {})
                    if os.path.isfile(
                            os.path.join(points_dir,
                                         'point_' + uuid + '.json')):
                        scene_meta[uuid] = (uuid, xyz, quat)

        print("Indexing")

        for scene, meta in tqdm(list(self.meta.items())):
            if len(meta) < self.seqlen:
                continue
            for uuid, v in meta.items():
                origin = np.array(v[1])
                # Every pose of the scene with its distance to this pose,
                # closest first (index 0 is the pose itself, distance 0).
                dist_list = sorted(
                    ((uuid2, np.linalg.norm(np.array(v2[1]) - origin))
                     for uuid2, v2 in meta.items()),
                    key=lambda x: x[-1])

                # Without a filter every pose is kept; with one, only poses
                # whose nearest neighbour is closer than dist_filter.
                if dist_filter is None or dist_list[1][-1] < dist_filter:
                    self.select.append(
                        [[scene, neighbour, dist]
                         for neighbour, dist in dist_list[:self.seqlen]])

        with open(self.fofn, 'wb') as fp:
            pickle.dump([
                self.scenes, self.meta, self.select, num_scenes, num_train
            ], fp)

    else:
        # The cache is a pickle written by the branch above; only load
        # index files from a trusted location.
        with open(self.fofn, 'rb') as fp:
            self.scenes, self.meta, self.select, num_scenes, num_train = pickle.load(
                fp)
            print("Total %d scenes %d train %d test" %
                  (num_scenes, num_train, num_scenes - num_train))
def searem(X, string=''):
    """Return the column names of ``X`` that do not contain ``string``.

    The columns are walked in order under a tqdm progress bar.  With the
    default ``string=''`` the result is empty, because the empty string is
    contained in every name.
    """
    column_names = list(X.columns)
    return [name for name in tqdm(column_names) if string not in name]
def CNN(epoch=100, batch_size=128, save_period=10, load_period=100, optimizer="sgd", learning_rate=0.01, dataset="MNIST", ctx=mx.gpu(0), method=1):
    """Train a small convolutional network with Gluon and report test accuracy.

    Steps: define the network, initialise (or load) its parameters, loop over
    the inputs, forward them through the network, compute the loss against
    the labels, back-propagate and update the parameters.

    Params:
        epoch: number of training epochs.
        batch_size: passed to the data loader and to ``trainer.step``.
        save_period: weights are saved every ``save_period`` epochs.
        load_period: epoch number of the weight file to load, if it exists.
        optimizer: optimizer name given to ``gluon.Trainer``.
        learning_rate: learning rate given to ``gluon.Trainer``.
        dataset: "MNIST", "CIFAR10" or "FashionMNIST".
        ctx: MXNet context the data and parameters are placed on.
        method: 1 -> HybridBlockNetwork, 2 -> BlockNetwork, anything else ->
            a HybridSequential network built inline.

    Returns:
        "optimization completed", or "The dataset does not exist." when
        ``dataset`` is not one of the supported names.
    """
    # data selection
    if dataset == "MNIST":
        train_data, test_data = MNIST(batch_size)
    elif dataset == "CIFAR10":
        train_data, test_data = CIFAR10(batch_size)
    elif dataset == "FashionMNIST":
        train_data, test_data = FashionMNIST(batch_size)
    else:
        return "The dataset does not exist."

    # Weight files are named weights/<dataset>-<epoch>.params
    weights_template = "weights/" + dataset + "-{}.params"
    path = weights_template.format(load_period)

    # Convolution Neural Network
    # formula : output_size=((input-weights+2*Padding)/Stride)+1
    # data size
    # MNIST,FashionMNIST = (batch size , 1 , 28 , 28)
    # CIFAR = (batch size , 3 , 32 , 32)
    #
    # note: only HybridBlocks, e.g. HybridSequential, can be compiled by
    # hybridize(). Calling hybridize on a normal Block is still allowed; its
    # HybridBlock children will be compiled instead.
    if method == 1:
        # method 1 : HybridBlock
        net = HybridBlockNetwork()
    elif method == 2:
        # method 2 : Block
        net = BlockNetwork()
    else:
        # method 3 : using Sequential() - stacks 'Block's sequentially
        net = gluon.nn.HybridSequential()
        with net.name_scope():
            # MNIST : (batch size , 60 , 26 , 26) , CIFAR10 : (batch size , 60 , 30 , 30)
            net.add(
                gluon.nn.Conv2D(channels=60,
                                kernel_size=(3, 3),
                                strides=(1, 1),
                                use_bias=True,
                                activation="relu"))
            # MNIST : (batch size , 60 , 13 , 13) , CIFAR10 : (batch size , 60 , 15 , 15)
            net.add(gluon.nn.MaxPool2D(pool_size=(2, 2), strides=(2, 2)))
            # MNIST : (batch size , 30 , 8 , 8) , CIFAR10 : (batch size , 30 , 10 , 10)
            net.add(
                gluon.nn.Conv2D(channels=30,
                                kernel_size=(6, 6),
                                strides=(1, 1),
                                use_bias=True,
                                activation="relu"))
            # MNIST : (batch size , 30 , 4 , 4) , CIFAR10 : (batch size , 30 , 5 , 5)
            net.add(gluon.nn.MaxPool2D(pool_size=(2, 2), strides=(2, 2)))
            net.add(
                gluon.nn.Dense(units=120,
                               activation="sigmoid",
                               use_bias=True,
                               flatten=True))
            net.add(gluon.nn.Dropout(0.2))
            net.add(
                gluon.nn.Dense(units=64, activation="sigmoid", use_bias=True))
            net.add(gluon.nn.Dropout(0.2))
            net.add(gluon.nn.Dense(10, use_bias=True))

    net.hybridize()  # hybridize!!!! for faster learning - only for hybrid

    # weights initialization
    if os.path.exists(path):
        print("loading weights")
        net.load_params(filename=path, ctx=ctx)  # weights load
    else:
        print("initializing weights")
        net.collect_params().initialize(mx.init.Normal(sigma=0.1), ctx=ctx)

    # optimizer
    trainer = gluon.Trainer(net.collect_params(), optimizer,
                            {"learning_rate": learning_rate})

    # learning
    for i in tqdm(range(1, epoch + 1)):
        for data, label in train_data:
            data = data.as_in_context(ctx)
            label = label.as_in_context(ctx)

            with autograd.record(train_mode=True):
                output = net(data)

                # loss definition: gluon losses are classes, so one is
                # instantiated first and then called on (output, label).
                softmax_ce = gluon.loss.SoftmaxCrossEntropyLoss(
                    axis=-1, sparse_label=True)
                loss = softmax_ce(output, label)
                cost = nd.mean(loss).asscalar()

            loss.backward()
            trainer.step(batch_size, ignore_stale_grad=True)

        print(" epoch : {} , last batch cost : {}".format(i, cost))

        # weight_save
        if i % save_period == 0:
            if not os.path.exists("weights"):
                os.makedirs("weights")

            print("saving weights")
            net.save_params(weights_template.format(i))

    test_accuracy = evaluate_accuracy(test_data, net, ctx)
    print("Test_acc : {}".format(test_accuracy))

    return "optimization completed"
pixel_size=params.pixel_size, width=params.width, array_noise_level=params.array_noise_level, array_noise_seed=seeds_for_noise[CESnumber], mapping_perpair=True) ## Initialise map containers for each processor if pos_CES == 0: sky_out_tot = OutputSkyMap(projection=tod.projection, nside=tod.nside_out, obspix=tod.obspix, npixsky=tod.npixsky, pixel_size=tod.pixel_size, demodulation=True) for pair in tqdm(tod.pair_list): ## Demodulated TS d_demod = np.array([tod.map2tod(det) for det in pair]) d_demod = tod.demodulate_timestreams(d_demod) tod.tod2map(d_demod, sky_out_tot) MPI.COMM_WORLD.barrier() ## Coaddition over all processors. ## Note that all processors will then have the coadded data. ## If you want informations at the level of each CES (or group of), ## use instead: ## final_map = OutputSkyMap(nside=nside_out, obspix=tod.obspix) ## final_map.coadd_MPI(sky_out_tot, MPI=MPI) sky_out_tot.coadd_MPI(sky_out_tot, MPI=MPI)
# To evaluate on the validation set: #model.evaluate_generator(val_gen, steps=num_val_images // batch_size, workers=8) #------------------------PREDICTIONS--------------------------------------------------# from keras import backend as K from keras.preprocessing.image import ImageDataGenerator submission_df = pd.read_csv(data_dir + 'sample_submission.csv') submission_df.head() test_datagen = ImageDataGenerator() data = bson.decode_file_iter(open(test_bson_path, 'rb')) with tqdm(total=num_test_products) as pbar: for c, d in enumerate(data): product_id = d['_id'] num_imgs = len(d['imgs']) batch_x = np.zeros((num_imgs, 180, 180, 3), dtype=K.floatx()) for i in range(num_imgs): bson_img = d['imgs'][i]['picture'] #Load and preprocess the image img = load_img(io.BytesIO(bson_img), target_size=(180, 180)) x = img_to_array(img) x = test_datagen.random_transform(x) x = test_datagen.standardize(x)