def image_summary(
    self,
    samples: Optional[List["IMCSample"]] = None,
    rois: Optional[List["ROI"]] = None,
):
    raise NotImplementedError
    from imc.utils import lacunarity, fractal_dimension

    rois = self._get_rois(samples, rois)
    roi_names = [r.name for r in rois]
    densities = pd.Series(
        {roi.name: roi.cells_per_area_unit() for roi in rois},
        name="cell density",
    )
    lacunarities = pd.Series(
        parmap.map(lacunarity, [roi.cell_mask_o for roi in rois], pm_pbar=True),
        index=roi_names,
        name="lacunarity",
    )
    fractal_dimensions = pd.Series(
        parmap.map(
            fractal_dimension,
            [roi.cell_mask_o for roi in rois],
            pm_pbar=True,
        ),
        index=roi_names,
        name="fractal_dimension",
    )
    morphos = pd.DataFrame(
        [densities * 1e4, lacunarities, fractal_dimensions]).T
def mapper(self):
    baseurl = 'https://www.rottentomatoes.com/critics/authors?letter='
    for i in range(len(alpha)):
        realurl = baseurl + quote_plus(alpha[i])
        url.append(realurl)
    parmap.map(self.crawler, url, pm_pbar=True, pm_processes=num_cores)
def test_map_progress(self):
    items = range(4)
    pfalse = parmap.map(_wait, items, pm_pbar=False)
    ptrue = parmap.map(_wait, items, pm_pbar=True)
    noparmap = list(map(_wait, items))
    self.assertEqual(pfalse, ptrue)
    self.assertEqual(pfalse, noparmap)
def test_map_kwargs(self):
    items = range(2)
    pfalse = parmap.map(_fun_with_keywords, items, pm_parallel=False, a=10)
    ptrue = parmap.map(_fun_with_keywords, items, pm_parallel=True, a=10)
    noparmap = [x + 10 + _DEFAULT_B for x in items]
    self.assertEqual(pfalse, ptrue)
    self.assertEqual(pfalse, noparmap)
def test_map(self):
    items = range(4)
    pfalse = parmap.map(_identity, items, parallel=False)
    ptrue = parmap.map(_identity, items, parallel=True)
    noparmap = list(map(_identity, items))
    self.assertEqual(pfalse, ptrue)
    self.assertEqual(pfalse, noparmap)
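# The tests above rely on module-level fixtures that are not shown here.
# What follows is only a minimal sketch of what they could look like, inferred
# from the assertions; the names exist in the test module, but the concrete
# sleep time and default value below are assumptions.
import time

TIME_PER_TEST = 0.4  # assumed per-task delay checked by the timing tests
_DEFAULT_B = 1       # assumed default keyword value used by _fun_with_keywords

def _identity(x):
    # returns its argument unchanged
    return x

def _wait(x):
    # sleeps a fixed amount, then behaves like _identity
    time.sleep(TIME_PER_TEST)
    return x

def _fun_with_keywords(x, a=0, b=_DEFAULT_B):
    # combines each item with its keyword arguments
    return x + a + b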
def get_labeled_patches(imgs, gts, n_segments=100, thres1=0.2, thres2=0.2):
    """
    Get all the patches from the set of images.
    :param imgs: images
    :param gts: masks
    :param n_segments: max number of patches per image
    :param thres1: label = 1 if the proportion of pixels masked as 1 in the patch is bigger than thres1
    :param thres2: label = 1 if (pixels masked as 1 in the patch / total number of pixels masked as 1 in the picture) > thres2
    :return: patches: list of patches, size [len(img), n_patches_per_image, 80, 80]
    :return: labels: list of labels per patch, size [len(img), n_patches_per_image]
    """
    n = len(imgs)
    SLIC_list = np.asarray([
        slic(imgs[i, :], n_segments, compactness=20, sigma=10)
        for i in range(len(imgs))
    ])

    # run box function to find all superpixel patch sizes
    boxes = parmap.map(box, SLIC_list)

    # populating x_train
    patches = parmap.map(xpatchify, zip(imgs, SLIC_list, boxes))

    # labels
    labels = parmap.map(patch_cat, zip(gts, SLIC_list), thres1, thres2)

    return patches, labels
def build_HDF5(size):
    """
    Gather the data in a single HDF5 file.
    """
    df_attr = parse_attibutes()
    list_col_labels = [
        c for c in df_attr.columns.values
        if c not in ["person", "imagenum", "image_path"]
    ]

    # Put train data in HDF5
    hdf5_file = os.path.join(data_dir, "lfw_%s_data.h5" % size)
    with h5py.File(hdf5_file, "w") as hfw:

        data_color = hfw.create_dataset("lfw_%s_color" % size,
                                        (0, 3, size, size),
                                        maxshape=(None, 3, size, size),
                                        dtype=np.uint8)

        data_sketch = hfw.create_dataset("lfw_%s_sketch" % size,
                                         (0, 1, size, size),
                                         maxshape=(None, 1, size, size),
                                         dtype=np.uint8)

        label = hfw.create_dataset("labels", data=df_attr[list_col_labels].values)
        label.attrs["label_names"] = list_col_labels

        arr_img = df_attr.image_path.values

        num_files = len(arr_img)
        chunk_size = 1000
        num_chunks = num_files // chunk_size
        arr_chunks = np.array_split(np.arange(num_files), num_chunks)

        for chunk_idx in tqdm(arr_chunks):

            list_img_path = arr_img[chunk_idx].tolist()
            output = parmap.map(format_image, list_img_path, size)
            output_sketch = parmap.map(format_sketch, list_img_path, size)

            arr_img_color = np.concatenate(output, axis=0)
            arr_img_sketch = np.concatenate(output_sketch, axis=0)

            # Resize HDF5 dataset
            data_color.resize(data_color.shape[0] + arr_img_color.shape[0], axis=0)
            data_color[-arr_img_color.shape[0]:] = arr_img_color.astype(np.uint8)

            data_sketch.resize(data_sketch.shape[0] + arr_img_sketch.shape[0], axis=0)
            data_sketch[-arr_img_sketch.shape[0]:] = arr_img_sketch.astype(np.uint8)
def mix_rir_and_sound_source(self, mode):
    """
    convolve speech and speech_rir (randomly selected)
    :param mode: tr/cv/tt
    :return: save multi-channel speech
    """
    # path set
    save_path = os.getcwd() + '/multi_channel_speech/' + mode
    rir_path = os.getcwd() + '/rir/' + mode
    if mode == 'cv':
        rir_path = os.getcwd() + '/rir/tr'
    spc_path = '/home/dail/PycharmProjects/DCCRN/data/tr/clean'

    # rir list and sound source list
    rir_list = glob(rir_path + '/*/*.npz')
    spc_list = glob(spc_path + '/*.wav')

    # generate random rir index
    spc_list.sort()

    _use_par = False
    if _use_par == True:
        if mode == 'tr':
            _ = parmap.map(self.convolve_and_save_rir_tr, spc_list, pm_pbar=True, pm_processes=28)
        if mode == 'cv':
            _ = parmap.map(self.convolve_and_save_rir_cv, spc_list, pm_pbar=True, pm_processes=28)
        if mode == 'tt':
            _ = parmap.map(self.convolve_and_save_rir_tt, spc_list, pm_pbar=True, pm_processes=28)
    else:
        for i, _spc in enumerate(tqdm(spc_list)):
            # read audio file
            # aud, fs = librosa.core.load(_spc, sr=None, mono=False)
            aud, fs = audioread(_spc)
            if len(aud.shape) != 1:
                aud = aud[:, 0]  # aud.shape[1]
            idx_s = np.random.randint(0, len(rir_list))
            npz = np.load(rir_list[idx_s], allow_pickle=True)

            # convolve
            rir = npz['rir']
            Y = ss.convolve(rir, aud[:, np.newaxis])
            audiowrite(
                save_path + '/' + rir_list[idx_s].split('/')[-2] + '_' +
                rir_list[idx_s].split('/')[-1].split('.n')[0] + '_' +
                _spc.split('/')[-1], Y, fs)
def handle_batch(batch, instances):
    print 'Downloading batch '+batch+'... ',
    dir = 'downloaded_'+batch
    dnss = map(lambda (x,y): (dir, x, y), instances)
    os.makedirs(dir)
    parmap.map(handle_instance, dnss)
    parmap.map(extract_instance, dnss)
    print 'Done'
def store_apps_info(self, app_ids: [str]) -> None:
    """Adds the specified apps to the data set by retrieving all the info needed
    and appending them to the list of apps (kept in _info_file).

    :param app_ids: array of app ids.
    """
    app_ids = set(app_ids)
    parmap.map(self.store_app_info, app_ids)
def iedbPredict(method, hlas, peptides, cpus=1, verbose=False):
    """Generate HLA:peptide binding affinity (log-IC50) predictions using
    the tools distributed by IEDB.

    Predictions are computed for all HLA:peptide combinations.

    Parameters
    ----------
    method : string
        Prediction method (e.g. netmhcpan, smm, ann)
        If RAND is specified then random predictions are returned.
    hlas : list
        List of HLA alleles in the format A_0201 or A*0201
    peptides : list of strings
        List of peptides, required to all be of the same length.
    cpus : int
        Number of cores to use in parallelizing the predictions.

    Returns
    -------
    df : pd.DataFrame
        Columns: method, hla, peptide, core, pred"""
    if verbose:
        """Create console handler and set level to debug"""
        logging.basicConfig(level=logging.INFO, format='%(levelname)s:%(asctime)s:%(message)s')
        logging.info('HLA prediction initialized for %d HLA allele(s) using method %s on %d CPU(s)', len(hlas), method, cpus)

    cols = ['method', 'hla', 'peptide', 'core', 'pred']

    if method == 'RAND':
        results = dict(method=[], hla=[], peptide=[], core=[], pred=[])
        for h, pep in itertools.product(hlas, peptides):
            results['method'].append('RAND')
            results['hla'].append(h)
            results['peptide'].append(pep)
            results['core'].append(pep)
            results['pred'].append(np.random.rand())
        resDf = pd.DataFrame(results, columns=cols)
    else:
        if cpus > 1:
            result = parmap.map(_predictOneHLA, hlas, method, peptides, verbose, pool=Pool(processes=cpus))
        else:
            result = parmap.map(_predictOneHLA, hlas, method, peptides, verbose, parallel=False)

        """Remove None's"""
        resDf = pd.concat([r for r in result if not r is None], axis=0)

        """Take the log of the prediction if necessary."""
        if resDf.pred.max() > 100:
            resDf['pred'] = np.log(resDf.pred)

    if verbose:
        logging.info('Completed %d predictions (expected %d)', resDf.shape[0], len(hlas) * len(peptides))
    return resDf
def delete_operation(project_id, mode):
    table_names = return_tables(project_id, schema_name)
    if mode == 'prefix':
        deletion_table_names = list(filter(lambda k: pattern in k, table_names))
        print(deletion_table_names)
        results = parmap.map(delete_table, deletion_table_names, project_id)
        # results = p.map(delete_table, deletion_table_names)
        print('tables_deleted')
    elif mode == 'array':
        results = parmap.map(delete_table, reqd_table_names, project_id)
def test_pgen_with_parmap():
    """
    Really simple example of using multiple cpus to speed up computation of pgens with olga.
    """
    import parmap
    from tcrdist.pgen import OlgaModel
    olga_beta = OlgaModel(chain_folder="human_T_beta", recomb_type="VDJ")
    parmap.map(olga_beta.compute_aa_cdr3_pgen, [
        'CASSYRVGTDTQYF', 'CATSTNRGGTPADTQYF', 'CASQGDSFNSPLHF', 'CASSPWTGSMALHF'
    ])
def load_chip_singleTask(input_dirs, tf):
    blacklist = make_blacklist()

    print('Loading and sorting BED file(s)')
    chip_bed_list, relaxed_bed_list = zip(*parmap.map(get_chip_bed, input_dirs, tf, blacklist.fn))

    # Later we want to gather negative windows from the genome that do not overlap
    # with a blacklisted or ChIP region
    print('Generating regions to exclude for negative windows')
    nonnegative_regions_bed_file_list = parmap.map(nonnegative_wrapper, relaxed_bed_list, blacklist.fn)
    nonnegative_regions_bed_list = [BedTool(i) for i in nonnegative_regions_bed_file_list]

    return chip_bed_list, nonnegative_regions_bed_list
def insta_image_crawler_main(tagAndUrls, imgDir, iteration):
    # 'tagAndUrls' is the set of 1:1 pairs of #Tags to search and their URLs.
    # 'imgDir' is the directory where downloaded images are saved.
    # 'iteration' is the number of times to repeat scrolling down.
    imgList = []  # list that will collect the image URLs fetched by scrolling down for the given #Tags
    while True:
        # For multiprocessing, the #Tags (and their URLs) are processed in slices of
        # as many items as the machine has CPU cores.
        if len(tagAndUrls) <= cores:
            # If the number of #Tags is at most the number of cores,
            # split the work into only as many parts as there are #Tags.
            indexedtagAndUrls = list(
                enumerate(np.array_split(tagAndUrls, len(tagAndUrls))))
            sleepTimeBias = len(tagAndUrls) * 0.5
            # Scroll-downs run in parallel, so compute the extra time to add to the base
            # wait for images to load on the new page after each scroll-down.
            for imgListPerCore in parmap.map(image_crawler, indexedtagAndUrls,
                                             iteration, sleepTimeBias):
                # Run multiprocessing with 'parmap.map' over the number of split jobs.
                for img in imgListPerCore:
                    imgList.append(img)  # add the image URLs returned by each split job to 'imgList'
            break  # Fewer remaining #Tags than cores means no #Tags are left, so stop looping once this block has run.
        else:
            # If more #Tags remain than the machine has CPU cores, proceed as follows.
            indexedtagAndUrls = list(
                enumerate(np.array_split(tagAndUrls[0:cores], cores)))
            del tagAndUrls[0:cores]
            # Take as many #Tags as there are CPU cores, then delete that slice
            # from the full list 'tagAndUrls'.
            sleepTimeBias = cores * 0.5
            # Scroll-downs run in parallel, so compute the extra time to add to the base
            # wait for images to load on the new page after each scroll-down.
            for imgListPerCore in parmap.map(image_crawler, indexedtagAndUrls,
                                             iteration, sleepTimeBias):
                # Run multiprocessing with 'parmap.map' over the number of split jobs.
                for img in imgListPerCore:
                    imgList.append(img)  # add the image URLs returned by each split job to 'imgList'

    indexedImgList = list(enumerate(np.array_split(list(set(imgList)), cores)))
    # Deduplicate the collected image URLs by simple URL value
    # (the image contents themselves are not compared).
    # After deduplication, split the whole list by the number of CPU cores for multiprocessing.

    fileNames = []  # list that will hold the names of the saved files
    cnt = 0  # counter for saved files, initialized to 0
    for fileNamesPerCore, cntPerCore in parmap.map(image_saver, indexedImgList, imgDir):
        # Pass the split URL lists, together with the target directory 'imgDir', to the
        # file-saving method via 'parmap.map', running one process per CPU core.
        # The file names and counts returned by each process are handled below.
        for fileName in fileNamesPerCore:
            fileNames.append(fileName)  # add the file names returned by each parallel process to the overall list
        cnt += cntPerCore  # add each process's number of saved files to the total count

    return fileNames, cnt  # return the combined file-name list and total count to the caller
def pairwise_filter_conv_parallel(self):
    # Cat: TODO: this may still crash memory in some cases; can split into additional bits
    units = np.array_split(np.unique(self.up_up_map), self.n_processors)
    if self.multi_processing:
        parmap.map(parallel_conv_filter,
                   list(zip(np.arange(len(units)), units)),
                   self.n_time,
                   self.up_up_map,
                   self.unit_overlap,
                   self.up_factor,
                   self.vis_chan,
                   self.approx_rank,
                   self.deconv_dir,
                   processes=self.n_processors,
                   pm_pbar=True)
    else:
        units = np.unique(self.up_up_map)
        for k in range(len(units)):
            print("unit : ", k)
            parallel_conv_filter([k, [units[k]]],
                                 self.n_time,
                                 self.up_up_map,
                                 self.unit_overlap,
                                 self.up_factor,
                                 self.vis_chan,
                                 self.approx_rank,
                                 self.deconv_dir)

    # load temp_temp saved files from disk due to memory overload otherwise
    temp_array = []
    for i in range(len(units)):
        fname = self.deconv_dir + '/temp_temp_chunk_' + str(i) + '.npy'
        temp_pairwise_conv = np.load(fname)
        temp_array.extend(temp_pairwise_conv)
        os.remove(fname)

    # initialize empty list and fill only correct locations
    print(" gathering temp_temp results...")
    pairwise_conv = []
    for i in range(self.n_unit):
        pairwise_conv.append(None)

    ctr = 0
    for unit2 in np.unique(self.up_up_map):
        pairwise_conv[unit2] = temp_array[ctr]
        ctr += 1

    pairwise_conv = np.array(pairwise_conv)
    print(pairwise_conv.shape)

    # save to disk, don't keep in memory
    np.save(self.deconv_dir + "/pairwise_conv.npy", pairwise_conv)
def build_HDF5(jpeg_dir):
    """
    Gather the data in a single HDF5 file.
    """
    # Put train data in HDF5
    hdf5_file = os.path.join(output_dir, NAME)
    with h5py.File(hdf5_file, "w") as hfw:

        list_img = glob.glob(os.path.join(jpeg_dir, "*.jpg"))
        list_img = np.array(list_img)

        data_color = hfw.create_dataset("data",
                                        (0, 3, SIZE, SIZE),
                                        maxshape=(None, 3, SIZE, SIZE),
                                        dtype=np.uint8)

        num_files = len(list_img)
        chunk_size = TO BE SPECIFIED  # set to num_files if dataset is small
        num_chunks = num_files // chunk_size
        arr_chunks = np.array_split(np.arange(num_files), num_chunks)

        for chunk_idx in tqdm(arr_chunks):

            list_img_path = list_img[chunk_idx].tolist()
            output = parmap.map(format_image, list_img_path, SIZE, parallel=True)

            arr_img_color = np.concatenate(output, axis=0)

            # Resize HDF5 dataset
            data_color.resize(data_color.shape[0] + arr_img_color.shape[0], axis=0)
            data_color[-arr_img_color.shape[0]:] = arr_img_color.astype(np.uint8)
def stock_price_ambiguity(max_workers):
    cur = connect(db_path).cursor()
    cur.execute('SELECT name FROM sqlite_master WHERE type="table"')
    stock_list = [row[0] for row in cur]
    cur.close()

    result = File_manager('analyzed', 'ambiguity')
    t = int(result.ver['ambiguity']) * 1000000
    try:
        result_df = pd.read_csv(result.path, index_col=0)
    except EmptyDataError:
        result_df = None

    with Pool(processes=int(max_workers), initializer=init) as p:
        stock_ambiguity = parmap.map(worker, stock_list, t, pm_pool=p, pm_pbar=True)

    stock_ambiguity = pd.concat(stock_ambiguity, axis=1)
    if result_df is not None:
        stock_ambiguity = pd.concat([result_df.iloc[:-1], stock_ambiguity])
    stock_ambiguity.sort_index(axis=0, inplace=True)
    result.update_version({'ambiguity': stock_ambiguity.index[-1]})
    stock_ambiguity.to_csv(result.path)
def calculate_score(model, model_weights_path, musdb_dir='musdb', n_workers=1,
                    n_fft=2048, hop_length=512, slice_duration=2):
    mus = musdb.DB(root_dir=musdb_dir)
    music_list = mus.load_mus_tracks(subsets='test')

    model_weights = torch.load(model_weights_path)
    model.load_state_dict(model_weights)
    # model.cuda()

    scores = parmap.map(calculate_SDR, music_list,
                        pm_processes=n_workers,
                        pm_pbar=True,
                        model=model,
                        n_fft=n_fft,
                        hop_length=hop_length,
                        slice_duration=slice_duration)
    print(scores)
    print(np.mean(scores))
    print(np.median(scores))
    torch.save(scores, 'scores')
def main():
    data = np.loadtxt("./exampleTargets/C3/K2C3cat.txt", usecols=(0,))

    cfg = pl.loadDefaultConfig()
    taskList = cfg["taskList"]
    # for i in range(len(taskList)):
    #     taskList[i] = "pl.%s" %(taskList[i])
    # cfg['taskList'] = taskList
    cfg["taskList"] = taskList[:10]
    print cfg["taskList"]

    count = multiprocessing.cpu_count() - 1
    p = pool.Pool(count)
    print count

    cfg["debug"] = False
    parallel = cfg.get("debug", False)
    parallel = False

    # Pool doesn't release threads even when it runs to completion.
    # Problem not related to exceptions being raised
    with contextlib.closing(pool.Pool(count)) as p:
        out = parmap.map(pl.runOne, data[1:3], cfg, parallel=parallel)
        p.join()
        p.close()

    return out
def disaster_message_preprocessor(max_workers):
    mode = 'w'
    input = File_manager('raw', 'disasterMessage')
    output = File_manager('preprocessed', 'disasterMessage')
    new_ver = input.ver.copy()
    if new_ver['disasterMessage'] == '0':
        return
    raw = read_csv(input.path)
    t = output.ver['disasterMessage']
    new_ver.update(File_manager('ref', 'userdic', format='txt').ver)
    new_ver.update(File_manager('ref', 'stopwords').ver)
    compare = output.compare_version(new_ver)
    header = True
    n = len(compare)
    if n:
        output.update_version(new_ver)
        if n == 1 and compare[0] == 'disasterMessage' and t != '0':
            mode = 'a'
            header = False
            raw = raw.iloc[t:]
    else:
        return
    df_split = array_split(raw, max_workers)
    df_list = parmap.map(tsk, df_split, pm_pbar=True,
                         pm_pool=Pool(max_workers, initializer=initializer))
    concat(df_list).to_csv(output.path, mode=mode, index=False, header=header)
def load_chip_multiTask(input_dir):
    tfs, chip_beds, merged_chip_bed = get_chip_beds(input_dir)

    print('Removing peaks outside of X chromosome and autosomes')
    chroms, chroms_sizes, genome_bed = get_genome_bed()
    merged_chip_bed = merged_chip_bed.intersect(genome_bed, u=True, sorted=True)

    print('Windowing genome')
    genome_windows = BedTool().window_maker(g=genome_sizes_file,
                                            w=genome_window_size,
                                            s=genome_window_step)

    print('Extracting windows that overlap at least one ChIP interval')
    positive_windows = genome_windows.intersect(
        merged_chip_bed, u=True,
        f=1.0*(genome_window_size/2+1)/genome_window_size,
        sorted=True)

    # Exclude all windows that overlap a blacklisted region
    blacklist = make_blacklist()
    print('Removing windows that overlap a blacklisted region')
    positive_windows = positive_windows.intersect(blacklist, wa=True, v=True, sorted=True)
    num_positive_windows = positive_windows.count()

    # Binary binding target matrix of all positive windows
    print('Number of positive windows:', num_positive_windows)
    print('Number of targets:', len(tfs))

    # Generate targets
    print('Generating target matrix of all positive windows')
    y_positive = parmap.map(intersect_count, chip_beds, positive_windows.fn)
    y_positive = np.array(y_positive, dtype=bool).T
    print('Positive matrix sparsity', (~y_positive).sum()*1.0/np.prod(y_positive.shape))

    merged_chip_slop_bed = merged_chip_bed.slop(g=genome_sizes_file, b=genome_window_size)
    # Later we want to gather negative windows from the genome that do not overlap
    # with a blacklisted or ChIP region
    nonnegative_regions_bed = merged_chip_slop_bed.cat(blacklist)
    return tfs, positive_windows, y_positive, nonnegative_regions_bed
def bootstrap_par(reads, smat_raw, B, test_c=0.01, nprocs=1):
    """ Similarity correction using a bootstrapping procedure for more robust
    corrections and error estimates. Bootstrapping is conducted in parallel.

    Args:
    reads -- [numpy.array (M,N)] array with mapping information; reads[m,n]==1, if read n mapped to species m.
    smat_raw -- mapping information for similarity matrix. species have same ordering as reads array
    B -- Number of bootstrap samples
    test_c -- For testing: treat species as not present, if estimated concentration is below test_c.
    nprocs -- Number of parallel bootstrap processes to perform.

    Return:
    [p_values, abundances, variances] -- list of floats
    """
    # M: Number of species, N: Number of reads
    M, N = reads.shape

    resList = parmap.map(_boot_iteration, range(B), reads, smat_raw, test_c, B, M, N,
                         processes=nprocs)

    # merging arrays (found, corr, fails)
    found = np.concatenate([x['found'] for x in resList])
    corr = np.concatenate([x['corr'] for x in resList])
    fails = np.concatenate([x['fails'] for x in resList])

    # calculations
    p_values = np.mean(fails, axis=0)
    abundances = np.mean(corr, axis=0)
    variances = np.var(corr, axis=0)

    return p_values, abundances, variances
def build_HDF5(jpeg_dir, size=64):
    """
    Gather the data in a single HDF5 file.
    """
    # Put train data in HDF5
    hdf5_file = os.path.join(data_dir, "CelebA_%s_data.h5" % size)
    with h5py.File(hdf5_file, "w") as hfw:

        list_img = glob.glob(os.path.join(jpeg_dir, "*.jpg"))
        list_img = np.array(list_img)

        data_color = hfw.create_dataset("data",
                                        (0, 3, size, size),
                                        maxshape=(None, 3, size, size),
                                        dtype=np.uint8)

        num_files = len(list_img)
        chunk_size = 2000
        num_chunks = num_files // chunk_size
        arr_chunks = np.array_split(np.arange(num_files), num_chunks)

        for chunk_idx in tqdm(arr_chunks):

            list_img_path = list_img[chunk_idx].tolist()
            output = parmap.map(format_image, list_img_path, size, parallel=True)

            arr_img_color = np.concatenate(output, axis=0)

            # Resize HDF5 dataset
            data_color.resize(data_color.shape[0] + arr_img_color.shape[0], axis=0)
            data_color[-arr_img_color.shape[0]:] = arr_img_color.astype(np.uint8)
def preproce_db_210301(self):
    # /media/jeonghwan/HDD2/Dataset/clean_label/clean_cut6
    aud_path = '/media/jeonghwan/HDD2/Dataset/clean_label/clean_cut6/'
    npz_list = glob(aud_path + '*.npz')
    plt.figure(1)

    # for i, npz_ in enumerate(tqdm(npz_list)):
    #     npz = np.load(npz_, allow_pickle=True)
    #     aud = npz['aud']
    #     label = npz['label']
    #
    #     # downsample
    #     aud_re = librosa.resample(aud, 48000, 16000)  # resampling
    #
    #     # label downsample
    #     label = label[::3]
    #     np.savez(self.save_path + npz_.split('/')[-1], aud=aud_re, label=label)
    #
    #     # For checking
    #     # plt.subplot(2, 1, 1)
    #     # plt.plot(aud_re)
    #     # plt.subplot(2, 1, 2)
    #     # plt.plot(label[::3])
    #     # plt.show()
    #     # exit()

    _ = parmap.map(self.resample_aud_label, npz_list, pm_pbar=True, pm_processes=24)
def on_receive(self, message):
    """
    :param message:
    :return:
    """
    try:
        W = message["W"]
        bounding_rect = message["bounding_rect"]
        colors = message["colors"]
        vector_field_is_visible = message["vector_field_is_visible"]
        W_colors = [(W[i], colors[i]) for i in xrange(len(vector_field_is_visible))
                    if vector_field_is_visible[i]]
        if not W_colors:
            return None
        images = parmap.map(vector_field_to_image, zip(W, colors), bounding_rect,
                            pool=self.pool)
        image = reduce(alpha_composite, images) if len(W) > 1 else images[0]
        return image
    except Exception as e:
        print "Exception"
        print e
        return None
def runAll(func, iterable, config):
    """Run func over every element of iterable in parallel.

    Not yet run or tested.

    Inputs:
    ----------
    func
        (A function) The top level function, e.g. runOne(), below

    iterable
        (list, array, etc.) A list of values to operate on.

    config
        (Clipboard) A configuration clipboard.
    """
    count = multiprocessing.cpu_count() - 1
    p = pool.Pool(count)

    parallel = config.get('debug', False)
    with contextlib.closing(pool.Pool(count)) as p:
        out = parmap.map(runOne, iterable, config, pool=p, parallel=parallel)
    return out
def main(args):
    modellist = args.modellist
    testlist = [line.strip() for line in open(args.testlist)]

    # paths and file names
    outputPath = args.outputPath
    keepPredict = args.keepPredict
    scale = args.scale
    kind = args.kind
    seg = int(args.seg)

    modelbox = []
    model_name_list = []
    for line in open(modellist):
        modelfile = line.strip()
        # print "Loading...", modelfile
        model = joblib.load(modelfile)
        modelbox.append(model)  # not sure whether the model type can be added to the list
    print "Loaded model", modellist

    if 'scale' in locals() and len(scale):
        (maxDict, minDict) = load_param(scale)

    # now do 'map' in parallel
    print 'Executing predict parmap:', len(testlist)
    if os.environ.get('PBS_NUM_PPN') is None:
        mapresult = [iterateFiles(file, outputPath, modelbox, keepPredict, kind, seg)
                     for file in testlist]
    else:
        np = int(os.environ.get('PBS_NUM_PPN'))
        print ' np=', np
        mapresult = parmap.map(iterateFiles, testlist, outputPath, modelbox,
                               keepPredict, kind, seg, processes=np)
    print 'Done!'
def gen_grid_interp(dim,
                    zbounds=[-2.5, -0.25],
                    bound_dict={
                        'alpha': [-5, -1.5],
                        'mu': [-1.5, 0.75],
                        'sigma': [0.25, 1.0]
                    }):
    ### Modified to accommodate input function
    span_list = {}
    for key in bound_dict.keys():
        print(key)
        span_list[key] = np.arange(*bound_dict[key], 0.1)
        print(span_list[key])

    MESH = np.meshgrid(*[span_list[key] for key in span_list.keys()], indexing='ij')

    pool_output = parmap.map(pool_function,
                             zip(*[GRID_ELE.flatten() for GRID_ELE in MESH]),
                             zbounds,
                             pm_processes=4)

    print([len(span_list[key]) for key in span_list.keys()])

    grid_interp = RegularGridInterpolator(
        [span_list[key] for key in span_list.keys()],
        np.array(pool_output).flatten().reshape(
            [len(span_list[key]) for key in span_list.keys()]))

    return grid_interp
def run_split_parallel(ptps, labels, CONFIG, ptp_cut=5):
    all_units = np.unique(labels)

    new_labels = np.ones(len(ptps), 'int32') * -1

    n_processors = CONFIG.resources.n_processors
    if CONFIG.resources.multi_processing:
        units_in = []
        for j in range(n_processors):
            units_in.append(all_units[slice(j, len(all_units), n_processors)])
        results = parmap.map(run_split,
                             units_in,
                             ptps,
                             labels,
                             CONFIG,
                             ptp_cut,
                             processes=n_processors)
        n_labels = 0
        for rr in results:
            for rr2 in rr:
                ii_ = rr2[:, 0]
                lab_ = rr2[:, 1]
                new_labels[ii_] = lab_ + n_labels
                n_labels += len(np.unique(lab_))
    else:
        results = run_split(all_units, ptps, labels, CONFIG, ptp_cut)
        n_labels = 0
        for rr in results:
            ii_ = rr[:, 0]
            lab_ = rr[:, 1]
            new_labels[ii_] = lab_ + n_labels
            n_labels += len(np.unique(lab_))

    return new_labels
def generate_heuristics(self, model, min_cardinality=1, max_cardinality=1):
    """
    Generates heuristics over given feature cardinality

    model: fit logistic regression or a decision tree
    max_cardinality: max number of features each heuristic operates over
    """
    # have to make a dictionary?? or feature combinations here? or list of arrays?
    feature_combinations_final = []
    heuristics_final = []
    feature_length = 0
    for cardinality in range(min_cardinality, max_cardinality + 1):
        feature_combinations = self.generate_feature_combinations(cardinality)

        ####### single-core
        # heuristics = []
        # for i, comb in enumerate(feature_combinations):
        #     heuristics.append(self.fit_function(comb, model))

        ######## with parmap
        heuristics = parmap.map(self.fit_and_return, feature_combinations, model, pm_pbar=True)

        feature_combinations_final.append(feature_combinations)
        heuristics_final.append(heuristics)

    return heuristics_final, feature_combinations_final
def parmap_batch_generator(data_total, endpoints_total, mins_dynamic, scales_dynamic, max_n_step):
    time_series_all = []
    time_series_endpoint_all = []

    for p in range(len(data_total)):
        print(p)
        path = data_total[p]
        path_endpoint = endpoints_total[p]
        data_frame = pd.read_hdf(path).fillna(0)
        data_frame_endpoint = pd.read_hdf(path_endpoint).fillna(0)
        assert not data_frame.isnull().values.any(), "No NaNs allowed"
        assert not data_frame_endpoint.isnull().values.any(), "No NaNs allowed"
        patients = data_frame.patientunitstayid.unique()

        temp = parmap.map(get_patient_n, patients, data_frame, data_frame_endpoint,
                          max_n_step, mins_dynamic, scales_dynamic)

        data = []
        labels = []
        for a in range(len(temp)):
            for b in range(len(temp[a][1])):
                labels.append(temp[a][1][b])
                data.append(temp[a][0][b])
        data = np.array(data)
        labels = np.array(labels)

        time_series_all.extend(data)
        time_series_endpoint_all.extend(labels)

    return time_series_all, time_series_endpoint_all
def get_evoked_map(mouse):
    lofiles, lofilenames = get_file_list(base_dir, mouse)
    print lofilenames
    lop = get_distance_var(lofiles)
    all_frames = get_video_frames(lofiles)

    print "Aligning all video frames..."
    all_frames = parmap.starmap(shift_frames, zip(all_frames, lop))
    all_frames = np.asarray(all_frames, dtype=np.float32)
    print np.shape(all_frames)

    new_all_frames = parmap.map(process_frames_evoked, all_frames)

    all_frames = np.reshape(all_frames,
                            (all_frames.shape[0]*all_frames.shape[1],
                             all_frames.shape[2],
                             all_frames.shape[3]))
    save_to_file("conc_RAW.raw", all_frames, np.float32)

    print "Creating array.."
    new_all_frames = np.asarray(new_all_frames, dtype=np.float32)
    print "Averaging together..."
    new_all_frames = np.mean(new_all_frames, axis=0)
    print np.shape(new_all_frames)

    save_to_file("evoked_trial_noBP_GSR.raw", new_all_frames, np.float32)
def build_HDF5(jpeg_dir, nb_channels, data_dir, size=256):
    """
    Gather the data in a single HDF5 file.
    """
    data_dir = os.path.join(data_dir, 'processed')

    # Put train data in HDF5
    file_name = os.path.basename(jpeg_dir.rstrip("/"))
    hdf5_file = os.path.join(data_dir, "%s_data.h5" % file_name)
    with h5py.File(hdf5_file, "w") as hfw:

        for dset_type in ["train", "test", "val"]:

            list_img = [img for img in Path(jpeg_dir).glob('%s/*.jpg' % dset_type)]
            list_img = [str(img) for img in list_img]
            list_img.extend(list(Path(jpeg_dir).glob('%s/*.png' % dset_type)))
            list_img = list(map(str, list_img))
            list_img = np.array(list_img)

            data_full = hfw.create_dataset("%s_data_full" % dset_type,
                                           (0, nb_channels, size, size),
                                           maxshape=(None, 3, size, size),
                                           dtype=np.uint8)

            data_sketch = hfw.create_dataset("%s_data_sketch" % dset_type,
                                             (0, nb_channels, size, size),
                                             maxshape=(None, 3, size, size),
                                             dtype=np.uint8)

            num_files = len(list_img)
            chunk_size = 100
            num_chunks = num_files // chunk_size
            arr_chunks = np.array_split(np.arange(num_files), num_chunks)

            for chunk_idx in tqdm(arr_chunks):

                list_img_path = list_img[chunk_idx].tolist()
                output = parmap.map(format_image, list_img_path, size, nb_channels,
                                    pm_parallel=False)

                arr_img_full = np.concatenate([o[0] for o in output], axis=0)
                arr_img_sketch = np.concatenate([o[1] for o in output], axis=0)

                # Resize HDF5 dataset
                data_full.resize(data_full.shape[0] + arr_img_full.shape[0], axis=0)
                data_sketch.resize(data_sketch.shape[0] + arr_img_sketch.shape[0], axis=0)

                data_full[-arr_img_full.shape[0]:] = arr_img_full.astype(np.uint8)
                data_sketch[-arr_img_sketch.shape[0]:] = arr_img_sketch.astype(np.uint8)
def prepare_dataset(data_path,
                    subset=None,
                    path_to_save='./numpy_data',
                    processed_csv_path='./processed_dataset.csv',
                    resample_rate=None,
                    n_fft=2048,
                    hop_length=512,
                    slice_duration=2,
                    n_workers=1):
    print('hop_length = ', hop_length)

    mus = musdb.DB(root_dir=data_path)
    music_list = mus.load_mus_tracks(subsets=subset)

    print('Starting preparing dataset...')

    if not os.path.exists(path_to_save):
        os.makedirs(path_to_save)

    processed_csv = pd.DataFrame(columns=['mix'] + list(music_list[0].targets.keys()))

    # p = multiprocessing.Pool(6)
    rows = parmap.map(process_audio, music_list, processed_csv,
                      pm_pbar=True,
                      pm_processes=n_workers,
                      path_to_save=path_to_save,
                      n_fft=n_fft,
                      resample_rate=resample_rate,
                      hop_length=hop_length,
                      slice_duration=slice_duration)

    for r in rows:
        for n in r:
            processed_csv.loc[len(processed_csv)] = n

    processed_csv.to_csv(processed_csv_path, index=False)
def GMMSel(coords, gmm, covar=None, sel_gmm=None, cutoff_nd=3., rng=np.random):
    # swiss cheese selection based on a GMM:
    # if within 1 sigma of any component: you're out!
    import multiprocessing, parmap
    n_chunks, chunksize = sel_gmm._mp_chunksize()
    inside = np.array(parmap.map(insideComponent, range(sel_gmm.K), sel_gmm, coords,
                                 covar, cutoff_nd, pm_chunksize=chunksize))
    return np.max(inside, axis=0)
def get_distance_var(lof, width, height, frame_oi):
    filtered_frames = []
    print('')
    print('Now in get_distance_var')
    print(lof)
    for f in lof:
        print('loop')
        print(f)
        frames = get_green_frames(f, width, height)
        print(type(frames))
        filtered_frames.append(filter2_test_j(frames[frame_oi, :, :]))

    print "Getting all the distances.."
    # Get all the distances using all videos as ref point, thus size of matrix is n^2
    list_of_ref = []
    for frame_ref in filtered_frames:
        list_of_positions = []
        res_trials = parmap.map(image_registration.chi2_shift, filtered_frames, frame_ref)
        # res_trials is array of trials * [dx, dy, edx, edy]
        for res in res_trials:
            list_of_positions.append(Position(res[0], res[1]))
        #for frame in filtered_frames:
        #    dx, dy, edx, edy = image_registration.chi2_shift(frame_ref, frame)
        #    list_of_positions.append(Position(dx, dy))
        list_of_ref.append(list_of_positions)

    print "Finding the min..."
    list_of_positions = find_min_ref(list_of_ref)
    return list_of_positions
def main():
    # Print M0 such that you are able to reconstruct the messages used at any node
    print("M0_1:")
    prints(M0_1)
    print("M0_2:")
    prints(M0_2)

    # Make room for 2^n nodes
    print("[!] Initializing hypercube")
    hc = [None for _ in range(2**DIM)]
    print("[+] Completed initializing hypercube")

    # Compute the MAC of each node in parallel and show a progress bar
    print("[!] Building hypercube...")
    hc = parmap.map(functools.partial(build_node, hc=hc), range(len(hc)),
                    pm_pbar=True, pm_processes=8, pm_chunksize=100)
    print("[+] Completed building hypercube")

    # Check for collisions in O(n)
    print("[!] Checking for full collisions...")
    D = defaultdict(list)
    for idx, mac in enumerate(hc):
        D[tuple(mac)].append(idx)
    D = {k: v for k, v in D.items() if len(v) > 1}
    if len(D.items()) == 0:
        print("[-] No collisions found")
    else:
        print("[+] Collisions found")
        for k, v in D.items():
            print("Nodes", v, "share the following MAC:")
            prints(k)
def test_map_without_parallel_timings(self):
    NUM_TASKS = 6
    items = range(NUM_TASKS)
    mytime = time.time()
    pfalse = parmap.map(_wait, items, pm_parallel=False)
    elapsed = time.time() - mytime
    self.assertTrue(elapsed >= TIME_PER_TEST*NUM_TASKS)
    self.assertEqual(pfalse, list(range(NUM_TASKS)))
def parallel(self, names, fileType='fasta', nprocs=1, **kwargs):
    """Running simulator using apply_async

    Args:
    names -- NameFile class with iter_names() method
    fileType -- sequence file format
    nprocs -- max number of parallel simulation calls
    kwargs -- passed to simulator

    Attribs added to each name instance in names:
    simReadsFile -- file name of simulated reads
    simReadsFileType -- file type (eg., 'fasta' or 'fastq')
    simReadsFileCount -- number of simulated reads

    Return:
    boolean on run success/fail
    """
    # making list of fasta files to provide to the simulator call
    fastaFiles = [name.get_fastaFile() for name in names.iter_names()]

    # setting kwargs
    new_simulator = partial(self, **kwargs)

    # calling simulator
    res = parmap.map(new_simulator, fastaFiles, processes=nprocs)

    # checking that simulated reads were created for all references; return 1 if no file
    for row in res:
        if row['simReadsFile'] is None or not os.path.isfile(row['simReadsFile']):
            return 1
        elif os.stat(row['simReadsFile']).st_size == 0:  # file size = 0
            return 1

    # converting reads to fasta if needed
    if fileType.lower() == 'fasta':
        for result in res:
            simFile = result['simReadsFile']
            fileType = result['simReadsFileType'].lower()
            if fileType != 'fasta':
                fastaFile = os.path.splitext(simFile)[0] + '.fna'
                SeqIO.convert(simFile, fileType, fastaFile, 'fasta')
                result['simReadsFile'] = fastaFile
                result['simReadsFileType'] = 'fasta'

    # setting attribs in name instances
    for i, name in enumerate(names.iter_names()):
        # read file
        simReadsFile = res[i]['simReadsFile']
        name.set_simReadsFile(simReadsFile)
        # file type
        fileType = res[i]['simReadsFileType'].lower()
        name.set_simReadsFileType(fileType)
        # number of simulated reads
        num_reads = len([True for i in SeqIO.parse(simReadsFile, fileType)])
        name.set_simReadsCount(num_reads)

    return 0
def parallel_map(*args, processes=1):
    """
    Wrapper function for 'parmap.map': parallelises the computations in 'map'
    form if required. If only one process is needed, computations are
    performed serially.
    """
    if processes == 1:
        return [args[0](element, *args[2:]) for element in args[1]]
    return parmap.map(*args, processes=processes)
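# A minimal usage sketch for the parallel_map wrapper above. The worker
# function and its extra argument are made up for illustration only.
import parmap

def scale(value, factor):
    # toy worker: multiply each element by a constant factor
    return value * factor

serial = parallel_map(scale, range(10), 3, processes=1)    # plain list comprehension
parallel = parallel_map(scale, range(10), 3, processes=4)  # delegates to parmap.map
assert serial == parallel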
def test_map_with_parallel_timings(self):
    NUM_TASKS = 6
    items = range(NUM_TASKS)
    mytime = time.time()
    ptrue = parmap.map(_wait, items, pm_processes=NUM_TASKS, pm_parallel=True)
    elapsed = time.time() - mytime
    self.assertTrue(elapsed >= TIME_PER_TEST)
    self.assertTrue(elapsed < TIME_PER_TEST*(NUM_TASKS-1))
    self.assertEqual(ptrue, list(range(NUM_TASKS)))
def hcluster(features, ed, cf, distance=CID):
    # cluster the rows of the "features" matrix
    distances = {}
    currentclustid = -1

    # clusters are initially just the individual rows
    clust = [cluster_node(array(features[i]), id=i) for i in range(len(features))]

    while len(clust) > 1:
        lowestpair = (0, 1)
        closest = distance(clust[0].id, clust[1].id, ed, cf)
        len1 = len(clust)

        # loop through every pair looking for the smallest distance
        '''for i in range(len(clust)):
            for j in range(i + 1, len(clust)):
                # distances is the cache of distance calculations
                if (clust[i].id, clust[j].id) not in distances:
                    distances[(clust[i].id, clust[j].id)] = distance(
                        clust[i].id, clust[j].id, ed, cf)

                d = distances[(clust[i].id, clust[j].id)]

                if d < closest:
                    closest = d
                    lowestpair = (i, j)'''

        args = [[clust, distances, 0, 0, len1/2, len1/2, 0, distance],
                [clust, distances, len1/2, len1/2, len1 - 1, len1 - 1, 0, distance],
                [clust, distances, len1 - 1, 0, len1/2, len1/2, 1, distance],
                [clust, distances, len1 - 1, 0, len1/2, len1/2, 2, distance]]

        final = parmap.map(findMin, args)
        print final

        minindex = argmin([distances[clust[lowestpair[0]].id, clust[lowestpair[1]].id]
                           for lowestpair in final])
        lowestpair = final[minindex]

        # calculate the average of the two clusters
        mergevec = [(clust[lowestpair[0]].vec[i] + clust[lowestpair[1]].vec[i]) / 2.0
                    for i in range(len(clust[0].vec))]

        # create the new cluster
        newcluster = cluster_node(array(mergevec),
                                  left=clust[lowestpair[0]],
                                  right=clust[lowestpair[1]],
                                  distance=closest,
                                  id=currentclustid)

        # cluster ids that weren't in the original set are negative
        currentclustid -= 1
        del clust[lowestpair[1]]
        del clust[lowestpair[0]]
        clust.append(newcluster)

    return clust[0]
def calc(series):
    len1 = len(series)
    ser = []
    for i in range(len1):
        for j in range(i+1, len1):
            ser.append([series[i], series[j]])
    distances = parmap.map(calculateED, ser)
    print "len of distances: " + str(len(distances))
    return distances
def generatePredictions(method, hlas, peptides, cpus=1, verbose=False):
    """Does not work because peptides is also an iterator...."""
    if verbose:
        """Create console handler and set level to debug"""
        logging.basicConfig(level=logging.INFO, format='%(levelname)s:%(asctime)s:%(message)s')
        logging.info('HLA prediction initialized for %d HLA allele(s) using method %s on %d CPU(s)', len(hlas), method, cpus)

    if cpus > 1:
        result = parmap.map(predictHLA_mhctools, hlas, method, peptides, verbose,
                            pool=Pool(processes=cpus))
    else:
        result = parmap.map(predictHLA_mhctools, hlas, method, peptides, verbose,
                            parallel=False)

    """Remove None's"""
    outDf = pd.concat([r for r in result if not r is None], axis=0)

    """Take the log of the prediction if necessary."""
    if outDf.affinity.max() > 100:
        outDf.loc[:, 'pred'] = np.log(outDf.affinity)

    if verbose:
        logging.info('Completed %d predictions (expected %d)', outDf.shape[0], len(hlas) * len(peptides))
    return outDf
def calc_loss_arrays(fc, sc, component_resp_df, parallel_proc):

    # print("\nCalculating system response to hazard transfer parameters...")
    component_resp_dict = component_resp_df.to_dict()
    sys_output_dict = {k: {o: 0 for o in fc.network.out_node_list}
                       for k in sc.hazard_intensity_str}
    ids_comp_vs_haz = {p: np.zeros((sc.num_samples, fc.num_elements))
                       for p in sc.hazard_intensity_str}

    calculated_output_array = np.zeros((sc.num_samples, sc.num_hazard_pts))
    economic_loss_array = np.zeros_like(calculated_output_array)
    output_array_given_recovery = np.zeros(
        (sc.num_samples, sc.num_hazard_pts, sc.num_time_steps)
    )

    if parallel_proc:
        print('\nInitiating computation of loss arrays...')
        print(Fore.YELLOW + 'using parallel processing\n' + Fore.RESET)
        parallel_return = parmap.map(
            multiprocess_enabling_loop, range(len(sc.hazard_intensity_str)),
            sc.hazard_intensity_str, sc.num_hazard_pts, fc, sc
        )

        for idxPGA, _PGA in enumerate(sc.hazard_intensity_str):
            ids_comp_vs_haz[_PGA] = parallel_return[idxPGA][0]
            sys_output_dict[_PGA] = parallel_return[idxPGA][1]
            component_resp_dict[_PGA] = parallel_return[idxPGA][2]
            calculated_output_array[:, idxPGA] = parallel_return[idxPGA][3]
            economic_loss_array[:, idxPGA] = parallel_return[idxPGA][4]
            output_array_given_recovery[:, idxPGA, :] = \
                parallel_return[idxPGA][5]
    else:
        print('\nInitiating computation of loss arrays...')
        print(Fore.RED + 'not using parallel processing\n' + Fore.RESET)
        for idxPGA, _PGA in enumerate(sc.hazard_intensity_str):
            ids_comp_vs_haz[_PGA], \
            sys_output_dict[_PGA], \
            component_resp_dict[_PGA], \
            calculated_output_array[:, idxPGA], \
            economic_loss_array[:, idxPGA], \
            output_array_given_recovery[:, idxPGA, :] = \
                multiprocess_enabling_loop(idxPGA=idxPGA,
                                           _PGA_dummy=_PGA,
                                           nPGA=sc.num_hazard_pts,
                                           fc=fc, sc=sc)

    return ids_comp_vs_haz, \
        sys_output_dict, \
        component_resp_dict, \
        calculated_output_array, \
        economic_loss_array, \
        output_array_given_recovery
def multiprocess(f, iterable, *args, **kwargs):
    """
    Map an iterable to a function. The default key function chunks the
    iterable into groups of 1000 items.

    :param f: function
    :param iterable: any iterable where each item is sent to f
    :param *args: arguments passed to mapped function
    :param **kwargs: additional arguments for parmap.map
    """
    chunksize = kwargs.pop('chunksize', 1000)
    key = kwargs.pop('key', lambda k, l=count(): next(l)//chunksize)
    for k, g in groupby(iterable, key=key):
        yield parmap.map(f, g, *args, **kwargs)
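# A minimal usage sketch for the multiprocess generator above. The worker and
# the input range are invented for illustration; each yielded item is the
# parmap.map result for one chunk of (up to) `chunksize` consecutive elements.
import parmap
from itertools import count, groupby

def square(x):
    # toy worker applied to every element of each chunk
    return x * x

for chunk_result in multiprocess(square, range(2500), chunksize=1000, pm_processes=4):
    print(len(chunk_result), chunk_result[:3])
# expected chunk lengths: 1000, 1000, 500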
def get_correlation_map(seed_x, seed_y, frames):
    seed_pixel = np.asarray(frames[:, seed_x, seed_y], dtype=np.float32)
    print np.shape(seed_pixel)

    # Reshape into time and space
    frames = np.reshape(frames, (frames.shape[0], width*height))
    print np.shape(frames)

    print 'Getting correlation... x=', seed_x, ", y=", seed_y
    correlation_map = parmap.map(corr, frames.T, seed_pixel)
    correlation_map = np.asarray(correlation_map, dtype=np.float32)
    correlation_map = np.reshape(correlation_map, (width, height))
    print np.shape(correlation_map)

    return correlation_map
def build_HDF5(jpeg_dir, nb_channels, size=256):
    """
    Gather the data in a single HDF5 file.
    """
    # Put train data in HDF5
    file_name = os.path.basename(jpeg_dir.rstrip("/"))
    hdf5_file = os.path.join(data_dir, "%s_data.h5" % file_name)
    with h5py.File(hdf5_file, "w") as hfw:

        for dset_type in ["train", "test", "val"]:

            list_img = list(Path(jpeg_dir).glob('%s/*.jpg' % dset_type))
            list_img.extend(list(Path(jpeg_dir).glob('%s/*.png' % dset_type)))
            list_img = list(map(str, list_img))
            list_img = np.array(list_img)

            data_full = hfw.create_dataset("%s_data_full" % dset_type,
                                           (0, nb_channels, size, size),
                                           maxshape=(None, 3, size, size),
                                           dtype=np.uint8)

            data_sketch = hfw.create_dataset("%s_data_sketch" % dset_type,
                                             (0, nb_channels, size, size),
                                             maxshape=(None, 3, size, size),
                                             dtype=np.uint8)

            num_files = len(list_img)
            chunk_size = 100
            num_chunks = num_files // chunk_size
            arr_chunks = np.array_split(np.arange(num_files), num_chunks)

            for chunk_idx in tqdm(arr_chunks):

                list_img_path = list_img[chunk_idx].tolist()
                output = parmap.map(format_image, list_img_path, size, nb_channels,
                                    parallel=False)

                arr_img_full = np.concatenate([o[0] for o in output], axis=0)
                arr_img_sketch = np.concatenate([o[1] for o in output], axis=0)

                # Resize HDF5 dataset
                data_full.resize(data_full.shape[0] + arr_img_full.shape[0], axis=0)
                data_sketch.resize(data_sketch.shape[0] + arr_img_sketch.shape[0], axis=0)

                data_full[-arr_img_full.shape[0]:] = arr_img_full.astype(np.uint8)
                data_sketch[-arr_img_sketch.shape[0]:] = arr_img_sketch.astype(np.uint8)
def get_reads_in_intervals(bam, intervals, strand_specific=False):
    """
    Counts reads in an iterable holding strings representing genomic intervals
    of the type chrom:start-end.
    """
    # count, create dataframe
    coverage = parmap.map(
        coverage_single,
        intervals.values(),
        bam,
        strand_specific=strand_specific,
        parallel=True)
    if not strand_specific:
        coverage = np.vstack(coverage)
    else:
        coverage = (np.vstack([x[0] for x in coverage]),
                    np.vstack([x[1] for x in coverage]))
    return coverage
def get_correlation_map(self, seed_x, seed_y, frames):
    seed_pixel = np.asarray(frames[:, seed_x, seed_y], dtype=np.float32)
    print np.shape(seed_pixel)

    # Reshape into time and space
    frames = np.reshape(frames, (frames.shape[0], width*height))
    print np.shape(frames)

    #correlation_map = []
    print 'Getting correlation...'
    #correlation_map = Parallel(n_jobs=4, backend="threading")(delayed(corr)(pixel, seed_pixel) for pixel in frames.T)
    #correlation_map = []
    #for i in range(frames.shape[-1]):
    #    correlation_map.append(pearsonr(frames[:, i], seed_pixel)[0])
    correlation_map = parmap.map(corr, frames.T, seed_pixel)
    correlation_map = np.asarray(correlation_map, dtype=np.float32)
    correlation_map = np.reshape(correlation_map, (width, height))
    print np.shape(correlation_map)

    return correlation_map
def return_answer(text):
    print 'searching...'
    global an1
    #arr = np.array(qvectors)
    #qvector = arr.sum(axis = 0)/len(arr)
    qvectors = calculate_qvectors(text)
    # print qvectors
    # from sklearn.metrics.pairwise import cosine_similarity
    # an2 = []
    an2 = parmap.map(calculate_sim, answervectors[:35000], qvectors, pool=Pool(12))
    # print an2
    # pool.close()
    # pool.join()
    try:
        qq = sorted(an2, key=lambda x: x[1], reverse=True)
        ans = []
        for t in qq[0:5]:
            ans.append(sents[t[0]])
    except IndexError:
        ans = 'error'
    return ans
def getPage(page):
    print 'Fetching watches list of user %s(page %d)...' % (uid, page)
    global s, header
    url = 'http://weibo.cn/%s/follow?page=%d' % (uid, page)
    max_num_per_sec(1)
    resp = s.get(url, headers=header)
    if resp.status_code != 200:
        raise Exception("%d - status code err" % resp.status_code)
    soup = BeautifulSoup.BeautifulSoup(resp.text)
    try:
        trs = [i.find('tr') for i in soup.findAll('table')]
    except:
        print soup
        debug_save(resp.content)
        print 'GET %s error, cookies: ' % url
        print s.cookies.get_dict()
        exit()
    res = parmap.map(parseTr, trs)
    pagelist_div = soup.find(id='pagelist')
    if pagelist_div is not None:
        totalpage = int(find_inner_text(pagelist_div.find('div').text, '/', u'\u9875'))
    else:
        totalpage = 1
    return res, totalpage
def build_HDF5(size=64):
    """
    Gather the data in a single HDF5 file.
    """
    # Read evaluation file, build it if it does not exist
    # In evaluation status, "0" represents training image, "1" represents
    # validation image, "2" represents testing image;
    d_partition = {}
    with open(os.path.join(raw_dir, "Eval/list_eval_partition.txt"), "r") as f:
        lines = f.readlines()
        for celeb in lines:
            celeb = celeb.rstrip().split()
            img = celeb[0]
            attrs = int(celeb[1])
            d_partition[img] = attrs
    with open(os.path.join(data_dir, "d_partition.pickle"), "wb") as fd:
        pickle.dump(d_partition, fd)

    # Put train data in HDF5
    hdf5_file = os.path.join(data_dir, "CelebA_%s_data.h5" % size)
    with h5py.File(hdf5_file, "w") as hfw:

        for dset_idx, dset_type in enumerate(["training", "validation", "test"]):

            list_img = []
            for img in d_partition.keys():
                if d_partition[img] == dset_idx:
                    list_img.append(os.path.join(raw_dir, "img_align_celeba", img))
            list_img = np.array(list_img)

            data_color = hfw.create_dataset("%s_color_data" % dset_type,
                                            (0, 3, size, size),
                                            maxshape=(None, 3, size, size),
                                            dtype=np.uint8)

            data_lab = hfw.create_dataset("%s_lab_data" % dset_type,
                                          (0, 3, size, size),
                                          maxshape=(None, 3, size, size),
                                          dtype=np.float64)

            data_black = hfw.create_dataset("%s_black_data" % dset_type,
                                            (0, 1, size, size),
                                            maxshape=(None, 1, size, size),
                                            dtype=np.uint8)

            num_files = len(list_img)
            chunk_size = 1000
            num_chunks = num_files // chunk_size
            arr_chunks = np.array_split(np.arange(num_files), num_chunks)

            for chunk_idx in tqdm(arr_chunks):

                list_img_path = list_img[chunk_idx].tolist()
                output = parmap.map(format_image, list_img_path, size, parallel=True)

                arr_img_color = np.vstack([o[0] for o in output if o[0].shape[0] > 0])
                arr_img_lab = np.vstack([o[1] for o in output if o[0].shape[0] > 0])
                arr_img_black = np.vstack([o[2] for o in output if o[0].shape[0] > 0])

                # Resize HDF5 dataset
                data_color.resize(data_color.shape[0] + arr_img_color.shape[0], axis=0)
                data_lab.resize(data_lab.shape[0] + arr_img_lab.shape[0], axis=0)
                data_black.resize(data_black.shape[0] + arr_img_black.shape[0], axis=0)

                data_color[-arr_img_color.shape[0]:] = arr_img_color.astype(np.uint8)
                data_lab[-arr_img_lab.shape[0]:] = arr_img_lab.astype(np.float64)
                data_black[-arr_img_black.shape[0]:] = arr_img_black.astype(np.uint8)
def align_frames(mouse, dir, freq):
    lofiles, lofilenames = get_file_list(dir+"Videos/", mouse)
    print lofilenames
    lop = get_distance_var(lofiles)
    all_frames = np.asarray(get_video_frames(lofiles), dtype=np.uint8)

    print "Aligning all video frames..."
    all_frames = parmap.starmap(shift_frames, zip(all_frames, lop))
    ## for i in range(len(lop)):
    ##     for frame in all_frames[i]:
    ##         frame = image_registration.fft_tools.shift2d(frame, lop[i].dx, lop[i].dy)
    print np.shape(all_frames)

    count = 0
    new_all_frames = parmap.map(process_frames, all_frames, freq, mouse, dir)
    '''
    for frames in all_frames:
        print np.shape(frames)
        save_to_file("Green/"+lofilenames[count][:-4]+"_aligned.raw", frames, np.float32)

        print "Calculating mean..."
        avg_pre_filt = calculate_avg(frames)

        print "Temporal filter..."
        frames = cheby_filter(frames)
        frames += avg_pre_filt
        save_to_file("Green/Cheby/"+lofilenames[count][:-4]+"_BPFilter_0.1-1Hz.raw", frames, np.float32)

        print "Calculating DF/F0..."
        frames = calculate_df_f0(frames)
        save_to_file("Green/DFF/"+lofilenames[count][:-4]+"_DFF.raw", frames, np.float32)

        print "Applying MASKED GSR..."
        #frames = gsr(frames)
        frames = masked_gsr(frames, save_dir+"202_mask.raw")
        save_to_file("Green/GSR/"+lofilenames[count][:-4]+"_GSR.raw", frames, np.float32)

        print "Getting SD map..."
        sd = standard_deviation(frames)
        save_to_file("Green/SD_maps/"+lofilenames[count][:-4]+"_SD.raw", frames, np.float32)

        new_all_frames.append(frames)
        count += 1
    '''

    print "Creating array..."
    new_all_frames = np.asarray(new_all_frames, dtype=np.float32)
    all_frames = np.asarray(all_frames, dtype=np.float32)

    print "Joining Files..."
    new_all_frames = np.reshape(new_all_frames,
                                (new_all_frames.shape[0]*new_all_frames.shape[1],
                                 new_all_frames.shape[2],
                                 new_all_frames.shape[3]))
    all_frames = np.reshape(all_frames,
                            (all_frames.shape[0]*all_frames.shape[1],
                             all_frames.shape[2],
                             all_frames.shape[3]))
    print "Shapes: "
    print np.shape(all_frames)
    print np.shape(new_all_frames)

    where_are_NaNs = np.isnan(new_all_frames)
    new_all_frames[where_are_NaNs] = 0

    save_to_file("FULL_conc.raw", new_all_frames, np.float32)
    save_to_file("conc_RAW.raw", all_frames, np.float32)

    sd = standard_deviation(new_all_frames)
    save_to_file("FULL_SD.raw", sd, np.float32)

    print "Displaying correlation map..."
    mapper = CorrelationMapDisplayer(new_all_frames)
    mapper.display('spectral', -0.3, 1.0)
def test_eval_ned_baseline(aph_testset_dataframe, aph_test_ann_files):
    """TODO."""
    ann_dir, ann_files = aph_test_ann_files
    testset_gold_df = aph_testset_dataframe

    logger.info(
        tabulate(
            testset_gold_df.head(20)[["type", "surface", "scope", "urn"]]
        )
    )

    kb_cfg_file = pkg_resources.resource_filename(
        'knowledge_base',
        'config/virtuoso_local.ini'
    )
    kb = KnowledgeBase(kb_cfg_file)

    """
    kb_data = {
        "author_names": kb.author_names
        , "author_abbreviations": kb.author_abbreviations
        , "work_titles": kb.work_titles
        , "work_abbreviations": kb.work_abbreviations
    }

    with codecs.open("citation_extractor/data/pickles/kb_data.pkl","wb") as pickle_file:
        pickle.dump(kb_data, pickle_file)
    """

    with codecs.open("citation_extractor/data/pickles/kb_data.pkl", "rb") as pickle_file:
        kb_data = pickle.load(pickle_file)

    cms = {}

    ##############################
    # Test 1: default parameters #
    ##############################

    cms["cm1"] = CitationMatcher(
        kb,
        fuzzy_matching_entities=False,
        fuzzy_matching_relations=False,
        **kb_data
    )

    ##############################
    # Test 2: best parameters    #
    ##############################

    cms["cm2"] = CitationMatcher(
        kb,
        fuzzy_matching_entities=True,
        fuzzy_matching_relations=True,
        min_distance_entities=4,
        max_distance_entities=7,
        distance_relations=4,
        **kb_data
    )

    """
    #####################################
    # Test 3: alternative parameters    #
    #####################################

    cms["cm3"] = CitationMatcher(kb
                    , fuzzy_matching_entities=True
                    , fuzzy_matching_relations=False
                    , min_distance_entities=4
                    , max_distance_entities=7)
    """

    comp_evaluation = []
    comp_accuracy_by_type = []

    # for each citation matcher disambiguate the records in the test set,
    # carry out the evaluation and store the results in two temporary lists
    # (to be transformed later on into two dataframes)
    for key in sorted(cms.keys()):
        cm = cms[key]
        testset_target_df = testset_gold_df.copy()

        # run the parallel processing of records
        results = parmap.map(
            _pprocess,
            ((n, row[0], row[1]) for n, row in enumerate(testset_target_df.iterrows())),
            cm
        )

        # collect the results and update the dataframe
        for instance_id, urn in results:
            testset_target_df.loc[instance_id]["urn_clean"] = urn

        # save pickle for later
        #testset_target_df.to_pickle("citation_extractor/data/pickles/test_target_dataframe_%s.pkl" % key)

        scores, accuracy_by_type, error_types, errors = evaluate_ned(
            testset_gold_df, ann_dir, testset_target_df, strict=True
        )

        # aggregate and format the evaluation measure already with percentages
        scores = {score_key: "%.2f%%" % (scores[score_key]*100) for score_key in scores}
        scores["id"] = key
        comp_evaluation.append(scores)

        # aggregate and format the accuracy by type already with percentages
        accuracy = {type_key: "%.2f%%" % (accuracy_by_type[type_key]*100)
                    for type_key in accuracy_by_type}
        accuracy["id"] = key
        comp_accuracy_by_type.append(accuracy)

    comp_evaluation_df = pd.DataFrame(comp_evaluation,
                                      index=[score["id"] for score in comp_evaluation])
    del comp_evaluation_df["id"]  # we don't need it twice (already in the index)

    comp_accuracy_by_type_df = pd.DataFrame(comp_accuracy_by_type,
                                            index=[accuracy["id"] for accuracy in comp_accuracy_by_type])
    del comp_accuracy_by_type_df["id"]  # we don't need it twice (already in the index)

    logger.info("\n" + tabulate(comp_evaluation_df, headers=comp_evaluation_df.columns))
    logger.info("\n" + tabulate(comp_accuracy_by_type_df, headers=comp_accuracy_by_type_df.columns))
    logger.info("\n" + "\n".join(["%s: %s" % (key, cms[key].settings) for key in cms]))