def test_concat_basic(nums):
    nums_py = list(map(lambda x: x + 1, nums))
    nums_py1 = list(map(lambda x: x ** 2, nums_py))
    nums_py2 = list(map(lambda x: -x, nums_py))
    nums_py = nums_py1 + nums_py2

    nums_pl = pr.map(lambda x: x + 1, nums)
    nums_pl1 = pr.map(lambda x: x ** 2, nums_pl)
    nums_pl2 = pr.map(lambda x: -x, nums_pl)
    nums_pl = pr.concat([nums_pl1, nums_pl2])

    assert sorted(nums_pl) == sorted(nums_py)
def test_map_square_event_end(nums):
    namespace = pr._get_namespace()
    namespace.x = 0
    namespace.done = False
    namespace.active_workers = -1

    def set_1():
        namespace.x = 1

    def set_2(stage_status):
        namespace.x = 2
        namespace.active_workers = stage_status.active_workers
        namespace.done = stage_status.done

    nums_pl = pr.map(lambda x: x**2, nums, workers=3, on_start=set_1, on_done=set_2)
    nums_pl = list(nums_pl)

    assert namespace.x == 2
    assert namespace.done == True
    assert namespace.active_workers == 0
def test_map_id(nums):
    nums_py = nums

    nums_pl = pr.map(lambda x: x, nums)
    nums_pl = list(nums_pl)

    assert nums_pl == nums_py
def handle_pairs(type, subject_labels, subject_data, subject_ids, other_ids,
        threshold, buckets_number, es, dry_run,
        workers_production, workers_score, workers_write,
        queue_production_score, queue_score_result, queue_write,
        index, doc):
    # do some initial setup
    vectorizer = DictVectorizer(sparse=True)
    tdidf_transformer = LocalTfidfTransformer(smooth_idf=False)
    data_vector = vectorizer.fit_transform(
        [subject_data[i] for i in subject_ids])
    data_vector = data_vector > 0
    data_vector = data_vector.astype(int)
    transformed_data = tdidf_transformer.fit_transform(data_vector)
    sums_vector = np.squeeze(np.asarray(
        transformed_data.sum(1)).ravel())  # sum by row

    # put vectors in buckets
    buckets = {}
    for i in range(buckets_number):
        buckets[i] = []
    vector_hashes = {}
    for i in range(len(subject_ids)):
        vector = transformed_data[i].toarray()[0]
        digested = digest_in_buckets(vector, buckets_number)
        for bucket in digested:
            buckets[bucket].append(i)
        vector_hashes[i] = digested

    idf = dict(zip(vectorizer.feature_names_, list(tdidf_transformer.idf_)))
    idf_ = 1 - tdidf_transformer.idf_

    # now everything is computed that can be baked into the function arguments
    produce_pairs_local_init_baked = functools.partial(
        produce_pairs_local_init,
        vector_hashes, buckets, threshold, sums_vector, data_vector)
    calculate_pairs_local_init_baked = functools.partial(
        calculate_pairs_local_init,
        type, subject_labels, subject_ids, other_ids, threshold, idf, idf_)

    # create stage for producing disease-to-disease
    pipeline_stage = pr.flat_map(produce_pairs, range(len(subject_ids)),
        workers=workers_production,
        maxsize=queue_production_score,
        on_start=produce_pairs_local_init_baked)

    # create stage to calculate disease-to-disease
    pipeline_stage = pr.map(calculate_pair, pipeline_stage,
        workers=workers_score,
        maxsize=queue_score_result,
        on_start=calculate_pairs_local_init_baked)

    # store in elasticsearch
    # this could be multi process, but just use a single one for now
    store_in_elasticsearch(pipeline_stage, es, dry_run, workers_write,
        queue_write, index, doc)
def test_map_id_pipe(nums):
    nums_pl = (
        nums
        | pr.map(lambda x: x)
        | list
    )

    assert nums_pl == nums
def test_map_square(nums):
    nums_py = map(lambda x: x**2, nums)
    nums_py = list(nums_py)

    nums_pl = pr.map(lambda x: x**2, nums)
    nums_pl = list(nums_pl)

    assert nums_pl == nums_py
def test_map_square_workers(nums):
    nums_py = map(lambda x: x ** 2, nums)
    nums_py = list(nums_py)

    nums_pl = pr.map(lambda x: x ** 2, nums, workers=2)
    nums_pl = list(nums_pl)

    assert sorted(nums_pl) == sorted(nums_py)
def test_concat_multiple(nums):
    nums_py = [x + 1 for x in nums]
    nums_py1 = nums_py + nums_py
    nums_py2 = nums_py1 + nums_py

    nums_pl = pr.map(lambda x: x + 1, nums)
    nums_pl1 = pr.concat([nums_pl, nums_pl])
    nums_pl2 = pr.concat([nums_pl1, nums_pl])

    assert sorted(nums_py1) == sorted(list(nums_pl1))
    assert sorted(nums_py2) == sorted(list(nums_pl2))
def test_from_to_iterable(nums):
    nums_pl = nums
    nums_pl = pr.from_iterable(nums_pl)
    nums_pl = cz.partition_all(10, nums_pl)
    nums_pl = pr.map(sum, nums_pl)
    nums_pl = list(nums_pl)

    nums_py = nums
    nums_py = cz.partition_all(10, nums_py)
    nums_py = map(sum, nums_py)
    nums_py = list(nums_py)

    assert nums_py == nums_pl
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('tsvfile', type=str)
    parser.add_argument('-o', '--out', type=pathlib.Path, required=True)
    parser.add_argument('-sep', default=' ', help='Separator for input tsvfile')
    parser.add_argument('-f', '--feat', type=str, choices=FEATURES.keys(), default='spec')
    parser.add_argument('-sr', default=16000, type=int)
    parser.add_argument('-c', default=4, type=int)
    parser.add_argument('-cmn', default=False, action='store_true')
    parser.add_argument('-cvn', default=False, action='store_true')
    args = parser.parse_args()

    df = pd.read_csv(args.tsvfile, sep=args.sep, usecols=[0], header=0,
                     names=[0])  # Just use first column
    args.out.parent.mkdir(parents=True, exist_ok=True)

    CMVN_SCALER = StandardScaler(with_mean=args.cmn, with_std=args.cvn)
    feature_fun = FEATURES[args.feat]

    def extract_feature(fname):
        y, sr = librosa.load(fname, sr=args.sr)
        y = feature_fun(y.astype(np.float32), sr)
        y = CMVN_SCALER.fit_transform(y)
        return fname, y

    all_files = df[0].unique()
    if args.out.is_file():
        print("File exists {}. Removing ..".format(args.out))
        args.out.unlink()  # Remove if exists

    with h5py.File(args.out, 'w') as hdf5_file, tqdm(total=len(all_files),
                                                     unit='file') as pbar:
        for fname, feat in pr.map(extract_feature, all_files,
                                  workers=args.c, maxsize=int(2 * args.c)):
            # Scale feature directly
            hdf5_file[fname] = feat
            pbar.set_postfix(name=pathlib.Path(fname).stem, shape=feat.shape)
            pbar.update()
def test_flat_map_square_workers(nums):
    def _generator(x):
        yield x
        yield x + 1
        yield x + 2

    nums_py = map(lambda x: x**2, nums)
    nums_py = cz.mapcat(_generator, nums_py)
    nums_py = list(nums_py)

    nums_pl = pr.map(lambda x: x**2, nums)
    nums_pl = pr.flat_map(_generator, nums_pl, workers=3)
    nums_pl = list(nums_pl)

    assert sorted(nums_pl) == sorted(nums_py)
def test_flat_map_square(nums):
    def _generator(x):
        yield x
        yield x + 1
        yield x + 2

    nums_py = map(lambda x: x**2, nums)
    nums_py = cz.mapcat(_generator, nums_py)
    nums_py = list(nums_py)

    nums_pl = pr.map(lambda x: x**2, nums)
    nums_pl = pr.flat_map(_generator, nums_pl)
    nums_pl = list(nums_pl)

    assert nums_pl == nums_py
def test_error_handling():
    error = None

    def raise_error(x):
        raise MyError()

    stage = pr.map(raise_error, range(10))

    try:
        list(stage)
    except MyError as e:
        error = e

    assert isinstance(error, MyError)
def test_map_square_event_start(nums):
    nums_py = map(lambda x: x**2, nums)
    nums_py = list(nums_py)

    namespace = pr._get_namespace()
    namespace.x = 0

    def set_1():
        namespace.x = 1

    nums_pl = pr.map(lambda x: x**2, nums, on_start=set_1)
    nums_pl = list(nums_pl)

    assert nums_pl == nums_py
    assert namespace.x == 1
def test_worker_info():
    nums = range(100)
    n_workers = 4

    def set_1(worker_info):
        return worker_info.index

    nums_pl = pr.map(
        lambda x, index: index,
        nums,
        on_start=set_1,
        workers=n_workers,
    )
    nums_pl = set(nums_pl)

    assert nums_pl.issubset(set(range(n_workers)))
def test_flat_map_square_filter_workers_pipe(nums):
    def _generator(x):
        yield x
        yield x + 1
        yield x + 2

    nums_py = map(lambda x: x**2, nums)
    nums_py = cz.mapcat(_generator, nums_py)
    nums_py = cz.filter(lambda x: x > 1, nums_py)
    nums_py = list(nums_py)

    nums_pl = (
        nums
        | pr.map(lambda x: x**2)
        | pr.flat_map(_generator, workers=3)
        | pr.filter(lambda x: x > 1)
        | list
    )

    assert sorted(nums_pl) == sorted(nums_py)
from pypeln import process as pr
import time


def do_print(x):
    time.sleep(1)
    print(x)


stage = pr.map(do_print, range(1000), workers=5)
pr.run(stage)
def extract_feature(fname):
    # the enclosing function definition and try block are implied by the
    # except/return below and by the pr.map(extract_feature, ...) call
    ext = Path(fname).suffix
    try:
        if ext == '.gz':
            with gzip.open(fname, 'rb') as gzipped_wav:
                y, sr = sf.read(io.BytesIO(gzipped_wav.read()), dtype='float32')
            # Multiple channels, reduce
            if y.ndim == 2:
                y = y.mean(1)
            y = librosa.resample(y, sr, ARGS.sr)
        elif ext in ('.wav', '.flac'):
            y, sr = sf.read(fname, dtype='float32')
            if y.ndim > 1:
                y = y.mean(1)
            y = librosa.resample(y, sr, ARGS.sr)
    except Exception as e:
        # Exception usually happens because some data has 6 channels, which librosa can't handle
        logging.error(e)
        logging.error(fname)
        raise

    lms_feature = np.log(librosa.feature.melspectrogram(y, **MEL_ARGS) + EPS).T
    return fname, lms_feature


with h5py.File(ARGS.output, 'w') as store:
    for fname, feat in tqdm(pr.map(extract_feature,
                                   DF[ARGS.col].unique(),
                                   workers=ARGS.c,
                                   maxsize=4),
                            total=len(DF[ARGS.col].unique())):
        basename = Path(fname).name
        store[basename] = feat
def object_detection(
        self,
        n_samples,
        n_objects,
        objects_pattern,
        backgrounds_pattern,
        output_dir,
        workers=1,
        rotation_angles=None,
        object_resize=None,
        background_resize=None,
        iou_threshold=0.0,
        object_channel_multiply=(0.5, 1.5),
        background_channel_multiply=(0.5, 1.5),
        object_channel_invert=False,
        background_channel_invert=False,
        background_rotate=False,
        object_scale=1.0,
        output_extension="png",
        segmentation=False,
):
    if segmentation:
        generator = dr.GenerateSegmentation()
        generator_name = "segmentation"
    else:
        generator = dr.GeneratePascalVoc()
        generator_name = "pascal_voq"

    # create transform
    transform = dr.Compose([
        dr.RandomChannelMultiply(
            objects_range=object_channel_multiply,
            background_range=background_channel_multiply,
        ) if object_channel_multiply or background_channel_multiply else dr.NoOp(),
        dr.RandomChannelInvert(
            objects=object_channel_invert,
            background=background_channel_invert,
        ) if object_channel_invert or background_channel_invert else dr.NoOp(),
        dr.Resize(
            objects=object_resize,
            background=background_resize,
        ) if object_resize or background_resize else dr.NoOp(),
        dr.RandomRotation90(background=background_rotate),
        dr.ObjectRandomPosition(),
        dr.ObjectRandomScale(scale=object_scale) if object_scale != 1.0 else dr.NoOp(),
        dr.ObjectRandomRotation(angles=rotation_angles) if rotation_angles else dr.NoOp(),
        dr.NonMaxSupression(iou_threshold=iou_threshold),
        generator,
    ])

    # get iterables
    all_object_filepaths = glob(objects_pattern)
    all_object_labels = [
        os.path.dirname(filepath).split(os.sep)[-1]
        for filepath in all_object_filepaths
    ]

    if segmentation:
        labels_map = sorted(set(all_object_labels))
        labels_map = dict((key, i) for i, key in enumerate(labels_map))
        all_object_labels = [labels_map[label] for label in all_object_labels]

    all_background_filepaths = glob(backgrounds_pattern)
    all_object_idx = np.arange(len(all_object_filepaths))

    # make output dir
    os.makedirs(output_dir, exist_ok=True)

    def create_sample(_i):
        if hasattr(n_objects, "__iter__"):
            n_objs = np.random.randint(low=n_objects[0], high=n_objects[1] + 1)
        else:
            n_objs = n_objects

        object_idxs = np.random.choice(all_object_idx, n_objs)
        object_images = [
            dr.utils.inv_chanels(
                cv2.imread(
                    all_object_filepaths[i],
                    cv2.IMREAD_UNCHANGED,
                )) for i in object_idxs
        ]
        object_labels = [all_object_labels[i] for i in object_idxs]

        background_idx = np.random.randint(len(all_background_filepaths))
        background_image = dr.utils.inv_chanels(
            cv2.imread(
                all_background_filepaths[background_idx],
                cv2.IMREAD_UNCHANGED,
            ))

        row = dr.create_scene(
            background=background_image,
            objects=list(zip(object_labels, object_images)),
        )
        row = transform(row)

        generator_obj = row[generator_name]
        generator_obj.save(output_dir, extension=output_extension)

    def on_start(worker_info):
        np.random.seed(worker_info.index)
        random.seed(worker_info.index + 100)

    stage = pr.map(create_sample, range(n_samples), workers=workers, on_start=on_start)
    stage = (x for x in tqdm(stage, total=n_samples))

    pr.run(stage)
def pypeln_wrapper(extractfeat, **params):
    # the enclosing wrapper and inner extract function are implied by the
    # `return extract` and the pr.map(pypeln_wrapper(...)) call site below
    def extract(row):
        row = row[1]  # pandas iterrows() yields (index, row) tuples
        y, sr = load_audio(row["file_name"], mono=not args.nomono)
        # feature = extractfeat(row["file_name"], sr, **params)
        if y.ndim > 1:
            feat = np.array([extractfeat(i, sr, **params) for i in y])
        else:
            feat = extractfeat(y, sr, **params)
        return row["audio_id"], feat
    return extract


wav_df = pd.read_csv(args.wav_csv, sep="\t")
feat_csv_data = []
with h5py.File(args.feat_h5, "w") as feat_store, tqdm(total=wav_df.shape[0]) as pbar:
    for audio_id, feat in pr.map(pypeln_wrapper(args.extractfeat, **argsdict),
                                 wav_df.iterrows(),
                                 workers=args.process_num,
                                 maxsize=4):
        # Transpose feat, nsamples to nsamples, feat
        feat = np.vstack(feat).transpose()
        feat_store[audio_id] = feat
        feat_csv_data.append({
            "audio_id": audio_id,
            "hdf5_path": str(Path(args.feat_h5).absolute())
        })
        pbar.update()

pd.DataFrame(feat_csv_data).to_csv(args.feat_csv, sep="\t", index=False)
def process_evidences_pipeline(filenames, first_n, es_client, redis_client,
        dry_run, output_folder, num_workers, num_writers, max_queued_events,
        eco_scores_uri, schema_uri, es_hosts,
        excluded_biotypes, datasources_to_datatypes):

    logger = logging.getLogger(__name__)

    if not filenames:
        logger.error('tried to run with no filenames at all')
        raise RuntimeError("Must specify at least one filename of evidence")

    # files that are not fetchable
    failed_filenames = list(itertools.ifilterfalse(IO.check_to_open, filenames))

    for uri in failed_filenames:
        logger.warning('failed to fetch uri %s', uri)

    # get the filenames that are properly fetchable
    # sort the list for consistent behaviour
    checked_filenames = sorted(set(filenames) - set(failed_filenames))

    logger.info('start evidence processing pipeline')

    # load lookup tables
    lookup_data = LookUpDataRetriever(es_client, redis_client, [],
        (LookUpDataType.TARGET, LookUpDataType.DISEASE, LookUpDataType.ECO)).lookup

    # create an iterable of lines from all file handles
    evs = IO.make_iter_lines(checked_filenames, first_n)

    # create functions with pre-baked arguments
    validation_on_start_baked = functools.partial(validation_on_start,
        lookup_data, eco_scores_uri, schema_uri,
        excluded_biotypes, datasources_to_datatypes)

    writer_global_init, writer_local_init, writer_main, writer_local_shutdown, \
        writer_global_shutdown = setup_writers(dry_run, es_hosts, output_folder)
    if writer_global_init:
        writer_global_init()

    # here is the pipeline definition
    pl_stage = pr.map(process_evidence, evs,
        workers=num_workers,
        maxsize=max_queued_events,
        on_start=validation_on_start_baked)
    pl_stage = pr.map(writer_main, pl_stage,
        workers=num_writers,
        maxsize=max_queued_events,
        on_start=writer_local_init,
        on_done=writer_local_shutdown)

    logger.info('run evidence processing pipeline')
    results = reduce_tuple_with_sum(pr.to_iterable(pl_stage))

    # perform any single-thread cleanup
    if writer_global_shutdown:
        writer_global_shutdown()

    logger.info("results (failed: %s, succeed: %s)", results[0], results[1])

    if failed_filenames:
        raise RuntimeError('unable to handle %s' % str(failed_filenames))

    if not results[1]:
        raise RuntimeError("No evidence was successful!")
def process_evidences_pipeline(
        filenames, first_n, es_hosts,
        es_index_valid, es_index_invalid,
        es_doc_valid, es_doc_invalid,
        es_mappings_valid, es_mappings_invalid,
        es_settings_valid, es_settings_invalid,
        es_index_gene, es_index_eco, es_index_efo,
        dry_run,
        workers_validation, queue_validation,
        workers_write, queue_write,
        eco_scores_uri, schema_uri,
        excluded_biotypes, datasources_to_datatypes):

    logger = logging.getLogger(__name__)

    # do not pass this es object to other processes, single process only!
    es = new_es_client(es_hosts)

    if not filenames:
        logger.error('tried to run with no filenames at all')
        raise RuntimeError("Must specify at least one filename of evidence")

    # files that are not fetchable
    failed_filenames = list(itertools.ifilterfalse(IO.check_to_open, filenames))

    for uri in failed_filenames:
        logger.warning('failed to fetch uri %s', uri)

    # get the filenames that are properly fetchable
    # sort the list for consistent behaviour
    checked_filenames = sorted(set(filenames) - set(failed_filenames))

    logger.info('start evidence processing pipeline')

    # create an iterable of lines from all file handles
    evs = IO.make_iter_lines(checked_filenames, first_n)

    # create functions with pre-baked arguments
    validation_on_start_baked = functools.partial(validation_on_start,
        eco_scores_uri, schema_uri, excluded_biotypes, datasources_to_datatypes,
        es_hosts, es_index_gene, es_index_eco, es_index_efo)

    # here is the pipeline definition
    pl_stage = pr.map(process_evidence, evs,
        workers=workers_validation,
        maxsize=queue_validation,
        on_start=validation_on_start_baked)

    logger.info('stages created, running scoring and writing')

    with URLZSource(es_mappings_valid).open() as mappings_file:
        mappings_valid = json.load(mappings_file)

    with URLZSource(es_mappings_invalid).open() as mappings_file:
        mappings_invalid = json.load(mappings_file)

    with URLZSource(es_settings_valid).open() as settings_file:
        settings_valid = json.load(settings_file)

    with URLZSource(es_settings_invalid).open() as settings_file:
        settings_invalid = json.load(settings_file)

    with ElasticsearchBulkIndexManager(es, es_index_invalid, settings_invalid, mappings_invalid):
        with ElasticsearchBulkIndexManager(es, es_index_valid, settings_valid, mappings_valid):
            # load into elasticsearch
            chunk_size = 1000  # TODO make configurable
            actions = elasticsearch_actions(pl_stage, es_index_valid, es_index_invalid,
                es_doc_valid, es_doc_invalid)
            failcount = 0

            if not dry_run:
                results = None
                if workers_write > 0:
                    logger.debug("Using parallel bulk writer for Elasticsearch")
                    # this can silently crash ?
                    results = elasticsearch.helpers.parallel_bulk(es, actions,
                        thread_count=workers_write,
                        queue_size=queue_write,
                        chunk_size=chunk_size)
                else:
                    logger.debug("Using streaming bulk writer for Elasticsearch")
                    results = elasticsearch.helpers.streaming_bulk(es, actions,
                        chunk_size=chunk_size)

                for success, details in results:
                    if not success:
                        failcount += 1

                if failcount:
                    raise RuntimeError("%s relations failed to index" % failcount)

    print('stages created, ran scoring and writing')
    logger.info('stages created, ran scoring and writing')

    if failed_filenames:
        raise RuntimeError('unable to handle %s' % str(failed_filenames))
def process_all(self, dry_run):
    # do not pass this es object to other processes, single process only!
    es = new_es_client(self.es_hosts)

    targets = self.get_targets(es)

    self.logger.info('setting up stages')

    # bake the arguments for the setup into function objects
    produce_evidence_local_init_baked = functools.partial(produce_evidence_local_init,
        self.es_hosts, self.es_index_val_right,
        self.scoring_weights, self.is_direct_do_not_propagate,
        self.datasources_to_datatypes)
    score_producer_local_init_baked = functools.partial(score_producer_local_init,
        self.datasources_to_datatypes, dry_run,
        self.es_hosts, self.es_index_gene, self.es_index_eco,
        self.es_index_hpa, self.es_index_efo)

    # pipeline stage for making the lists of the target/disease pairs and evidence
    pipeline_stage1 = pr.flat_map(produce_evidence, targets,
        workers=self.workers_production,
        maxsize=self.queue_produce,
        on_start=produce_evidence_local_init_baked)

    # pipeline stage for scoring the evidence sets
    # includes writing to elasticsearch
    pipeline_stage2 = pr.map(score_producer, pipeline_stage1,
        workers=self.workers_score,
        maxsize=self.queue_score,
        on_start=score_producer_local_init_baked)

    with URLZSource(self.es_mappings).open() as mappings_file:
        mappings = json.load(mappings_file)

    with URLZSource(self.es_settings).open() as settings_file:
        settings = json.load(settings_file)

    with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):
        # load into elasticsearch
        self.logger.info('stages created, running scoring and writing')
        client = es
        chunk_size = 1000  # TODO make configurable
        actions = self.elasticsearch_actions(pipeline_stage2, self.es_index, self.es_doc)
        failcount = 0

        if not dry_run:
            results = None
            if self.workers_write > 0:
                self.logger.debug("Using parallel bulk writer for Elasticsearch")
                results = elasticsearch.helpers.parallel_bulk(client, actions,
                    thread_count=self.workers_write,
                    queue_size=self.queue_write,
                    chunk_size=chunk_size)
            else:
                self.logger.debug("Using streaming bulk writer for Elasticsearch")
                results = elasticsearch.helpers.streaming_bulk(client, actions,
                    chunk_size=chunk_size)

            for success, details in results:
                if not success:
                    failcount += 1

            if failcount:
                raise RuntimeError("%s relations failed to index" % failcount)

    self.logger.info("DONE")
def extract_feature(fname):
    """extract_feature

    Extracts a log mel spectrogram feature from a filename; currently supports
    two filetypes:

    1. Wave
    2. FLAC

    :param fname: filepath to the file to extract
    """
    ext = Path(fname).suffix
    if ext in ('.wav', '.flac'):
        y, sr = sf.read(fname, dtype='float32')
        if y.ndim > 1:
            y = y.mean(1)
        y = librosa.resample(y, sr, ARGS.sr)
    lms_feature = np.log(librosa.feature.melspectrogram(y, **MEL_ARGS) + EPS).T
    return fname, lms_feature


with h5py.File(ARGS.output, 'w') as store:
    for fname, feat in tqdm(pr.map(extract_feature,
                                   [p for p in DF[ARGS.col].unique()
                                    if not p.split("/")[-1].startswith(".")],
                                   workers=ARGS.c,
                                   maxsize=4),
                            total=len(DF[ARGS.col].unique())):
        basename = Path(fname).name
        store[basename] = feat
###################
# from_to_iterable
###################


@hp.given(nums=st.lists(st.integers()))
@hp.settings(max_examples=MAX_EXAMPLES)
def test_from_to_iterable(nums):
    nums_pl = nums
    nums_pl = pr.from_iterable(nums_pl)
    nums_pl = cz.partition_all(10, nums_pl)
    nums_pl = pr.map(sum, nums_pl)
    nums_pl = list(nums_pl)

    nums_py = nums
    nums_py = cz.partition_all(10, nums_py)
    nums_py = map(sum, nums_py)
    nums_py = list(nums_py)

    assert nums_py == nums_pl


if __name__ == '__main__':
    error = None

    def raise_error(x):
        raise MyError()

    stage = pr.map(raise_error, range(10))

    list(stage)