Code Example #1
File: test_pr.py Project: MatthieuGonnet/pypeln
def test_concat_basic(nums):

    nums_py = list(map(lambda x: x + 1, nums))
    nums_py1 = list(map(lambda x: x ** 2, nums_py))
    nums_py2 = list(map(lambda x: -x, nums_py))
    nums_py = nums_py1 + nums_py2

    nums_pl = pr.map(lambda x: x + 1, nums)
    nums_pl1 = pr.map(lambda x: x ** 2, nums_pl)
    nums_pl2 = pr.map(lambda x: -x, nums_pl)
    nums_pl = pr.concat([nums_pl1, nums_pl2])

    assert sorted(nums_pl) == sorted(nums_py)
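
Note: these test snippets come from pypeln's test suite (test_pr.py) and omit the module-level scaffolding they depend on. The sketch below shows the setup the excerpts appear to assume; the import aliases (cz for cytoolz, hp/st for Hypothesis) are inferred from usage in the later examples (see Code Example #25), and the MAX_EXAMPLES value is a placeholder, not confirmed by the source.

# Presumed test-module scaffolding (sketch; names inferred from the snippets in this listing).
from pypeln import process as pr           # every example below uses the 'pr' alias
import cytoolz as cz                       # assumption: provides cz.partition_all / cz.mapcat / cz.filter
import hypothesis as hp                    # assumption: provides @hp.given / @hp.settings
from hypothesis import strategies as st

MAX_EXAMPLES = 15                          # assumption: example budget referenced by @hp.settings

# 'nums' is not a pytest fixture but a Hypothesis-generated argument, e.g.:
# @hp.given(nums=st.lists(st.integers()))
# @hp.settings(max_examples=MAX_EXAMPLES)
# def test_map_id(nums): ...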
Code Example #2
def test_map_square_event_end(nums):

    namespace = pr._get_namespace()
    namespace.x = 0
    namespace.done = False
    namespace.active_workers = -1

    def set_1():
        namespace.x = 1

    def set_2(stage_status):
        namespace.x = 2
        namespace.active_workers = stage_status.active_workers
        namespace.done = stage_status.done

    nums_pl = pr.map(lambda x: x**2,
                     nums,
                     workers=3,
                     on_start=set_1,
                     on_done=set_2)
    nums_pl = list(nums_pl)

    assert namespace.x == 2
    assert namespace.done == True
    assert namespace.active_workers == 0
Code Example #3
File: test_pr.py Project: MatthieuGonnet/pypeln
def test_map_id(nums):

    nums_py = nums

    nums_pl = pr.map(lambda x: x, nums)
    nums_pl = list(nums_pl)

    assert nums_pl == nums_py
Code Example #4
def handle_pairs(type, subject_labels, subject_data, subject_ids, other_ids,
                 threshold, buckets_number, es, dry_run, workers_production,
                 workers_score, workers_write, queue_production_score,
                 queue_score_result, queue_write, index, doc):

    #do some initial setup
    vectorizer = DictVectorizer(sparse=True)
    tdidf_transformer = LocalTfidfTransformer(smooth_idf=False, )
    data_vector = vectorizer.fit_transform(
        [subject_data[i] for i in subject_ids])
    data_vector = data_vector > 0
    data_vector = data_vector.astype(int)
    transformed_data = tdidf_transformer.fit_transform(data_vector)
    sums_vector = np.squeeze(np.asarray(
        transformed_data.sum(1)).ravel())  #sum by row
    '''put vectors in buckets'''
    buckets = {}
    for i in range(buckets_number):
        buckets[i] = []
    vector_hashes = {}
    for i in range(len(subject_ids)):
        vector = transformed_data[i].toarray()[0]
        digested = digest_in_buckets(vector, buckets_number)
        for bucket in digested:
            buckets[bucket].append(i)
        vector_hashes[i] = digested

    idf = dict(zip(vectorizer.feature_names_, list(tdidf_transformer.idf_)))
    idf_ = 1 - tdidf_transformer.idf_

    #now everything is computed that can be baked into the function arguments

    produce_pairs_local_init_baked = functools.partial(
        produce_pairs_local_init, vector_hashes, buckets, threshold,
        sums_vector, data_vector)

    calculate_pairs_local_init_baked = functools.partial(
        calculate_pairs_local_init, type, subject_labels, subject_ids,
        other_ids, threshold, idf, idf_)

    #create stage for producing disease-to-disease
    pipeline_stage = pr.flat_map(produce_pairs,
                                 range(len(subject_ids)),
                                 workers=workers_production,
                                 maxsize=queue_production_score,
                                 on_start=produce_pairs_local_init_baked)

    #create stage to calculate disease-to-disease
    pipeline_stage = pr.map(calculate_pair,
                            pipeline_stage,
                            workers=workers_score,
                            maxsize=queue_score_result,
                            on_start=calculate_pairs_local_init_baked)

    #store in elasticsearch
    #this could be multi process, but just use a single for now
    store_in_elasticsearch(pipeline_stage, es, dry_run, workers_write,
                           queue_write, index, doc)
Code Example #5
File: test_pr.py Project: MatthieuGonnet/pypeln
def test_map_id_pipe(nums):

    nums_pl = (
        nums
        | pr.map(lambda x: x)
        | list
    )

    assert nums_pl == nums
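
The pipe form above relies on pypeln's partial application: pr.map(f) called without an iterable returns a stage that composes with the | operator, so nums | pr.map(f) builds the same pipeline as pr.map(f, nums). A minimal sketch of that equivalence (assuming the usual from pypeln import process as pr import and the default single worker, where output order is preserved):

from pypeln import process as pr

# Both forms build the same single-worker pipeline.
explicit = list(pr.map(lambda x: x + 1, range(5)))
piped = range(5) | pr.map(lambda x: x + 1) | list

assert explicit == piped == [1, 2, 3, 4, 5]   # with workers > 1 the order may differ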
Code Example #6
def test_map_square(nums):

    nums_py = map(lambda x: x**2, nums)
    nums_py = list(nums_py)

    nums_pl = pr.map(lambda x: x**2, nums)
    nums_pl = list(nums_pl)

    assert nums_pl == nums_py
Code Example #7
File: test_pr.py Project: MatthieuGonnet/pypeln
def test_map_square_workers(nums):

    nums_py = map(lambda x: x ** 2, nums)
    nums_py = list(nums_py)

    nums_pl = pr.map(lambda x: x ** 2, nums, workers=2)
    nums_pl = list(nums_pl)

    assert sorted(nums_pl) == sorted(nums_py)
Code Example #8
File: test_pr.py Project: MatthieuGonnet/pypeln
def test_concat_multiple(nums):

    nums_py = [ x + 1 for x in nums ]
    nums_py1 = nums_py + nums_py
    nums_py2 = nums_py1 + nums_py

    nums_pl = pr.map(lambda x: x + 1, nums)
    nums_pl1 = pr.concat([nums_pl, nums_pl])
    nums_pl2 = pr.concat([nums_pl1, nums_pl])

    assert sorted(nums_py1) == sorted(list(nums_pl1))
    assert sorted(nums_py2) == sorted(list(nums_pl2))
Code Example #9
def test_from_to_iterable(nums):

    nums_pl = nums
    nums_pl = pr.from_iterable(nums_pl)
    nums_pl = cz.partition_all(10, nums_pl)
    nums_pl = pr.map(sum, nums_pl)
    nums_pl = list(nums_pl)

    nums_py = nums
    nums_py = cz.partition_all(10, nums_py)
    nums_py = map(sum, nums_py)
    nums_py = list(nums_py)

    assert nums_py == nums_pl
Code Example #10
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('tsvfile', type=str)
    parser.add_argument('-o', '--out', type=pathlib.Path, required=True)
    parser.add_argument('-sep',
                        default=' ',
                        help='Separator for input tsvfile')
    parser.add_argument('-f',
                        '--feat',
                        type=str,
                        choices=FEATURES.keys(),
                        default='spec')
    parser.add_argument('-sr', default=16000, type=int)
    parser.add_argument('-c', default=4, type=int)
    parser.add_argument('-cmn', default=False, action='store_true')
    parser.add_argument('-cvn', default=False, action='store_true')
    args = parser.parse_args()

    df = pd.read_csv(args.tsvfile,
                     sep=args.sep,
                     usecols=[0],
                     header=0,
                     names=[0])  #Just use first column
    args.out.parent.mkdir(parents=True, exist_ok=True)

    CMVN_SCALER = StandardScaler(with_mean=args.cmn, with_std=args.cvn)

    feature_fun = FEATURES[args.feat]

    def extract_feature(fname):
        y, sr = librosa.load(fname, sr=args.sr)
        y = feature_fun(y.astype(np.float32), sr)
        y = CMVN_SCALER.fit_transform(y)
        return fname, y

    all_files = df[0].unique()

    if args.out.is_file():
        print("File exists {}. Removing ..".format(args.out))
        args.out.unlink()  # Remove if exists
    with h5py.File(args.out, 'w') as hdf5_file, tqdm(total=len(all_files),
                                                     unit='file') as pbar:
        for fname, feat in pr.map(extract_feature,
                                  all_files,
                                  workers=args.c,
                                  maxsize=int(2 * args.c)):
            # Scale feature directly
            hdf5_file[fname] = feat
            pbar.set_postfix(name=pathlib.Path(fname).stem, shape=feat.shape)
            pbar.update()
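
In this script, workers controls how many worker processes run extract_feature in parallel, while maxsize bounds the queue of finished results so extraction cannot run arbitrarily far ahead of the single-threaded HDF5 writer. A generic sketch of that workers/maxsize pattern (slow_square and the numbers here are illustrative placeholders, not part of the original script):

from pypeln import process as pr
import time

def slow_square(x):
    time.sleep(0.01)          # stand-in for an expensive per-item computation
    return x * x

if __name__ == "__main__":
    # 4 worker processes; at most 8 finished results buffered ahead of the consumer.
    stage = pr.map(slow_square, range(100), workers=4, maxsize=8)
    total = 0
    for value in stage:       # iterating the stage drains the bounded queue
        total += value
    print(total)              # 328350, the sum of squares of 0..99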
Code Example #11
def test_flat_map_square_workers(nums):
    def _generator(x):
        yield x
        yield x + 1
        yield x + 2

    nums_py = map(lambda x: x**2, nums)
    nums_py = cz.mapcat(_generator, nums_py)
    nums_py = list(nums_py)

    nums_pl = pr.map(lambda x: x**2, nums)
    nums_pl = pr.flat_map(_generator, nums_pl, workers=3)
    nums_pl = list(nums_pl)

    assert sorted(nums_pl) == sorted(nums_py)
Code Example #12
def test_flat_map_square(nums):
    def _generator(x):
        yield x
        yield x + 1
        yield x + 2

    nums_py = map(lambda x: x**2, nums)
    nums_py = cz.mapcat(_generator, nums_py)
    nums_py = list(nums_py)

    nums_pl = pr.map(lambda x: x**2, nums)
    nums_pl = pr.flat_map(_generator, nums_pl)
    nums_pl = list(nums_pl)

    assert nums_pl == nums_py
Code Example #13
def test_error_handling():

    error = None

    def raise_error(x):
        raise MyError()

    stage = pr.map(raise_error, range(10))

    try:
        list(stage)

    except MyError as e:
        error = e

    assert isinstance(error, MyError)
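
MyError is a custom exception defined elsewhere in the test module; the point of the test is that an exception raised inside a worker process is re-raised in the consuming process when the stage is iterated. A self-contained sketch of the same pattern:

from pypeln import process as pr

class MyError(Exception):
    pass

def raise_error(x):
    raise MyError()

if __name__ == "__main__":
    try:
        list(pr.map(raise_error, range(10)))
    except MyError as e:
        print("worker exception propagated:", type(e).__name__)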
Code Example #14
def test_map_square_event_start(nums):

    nums_py = map(lambda x: x**2, nums)
    nums_py = list(nums_py)

    namespace = pr._get_namespace()
    namespace.x = 0

    def set_1():
        namespace.x = 1

    nums_pl = pr.map(lambda x: x**2, nums, on_start=set_1)
    nums_pl = list(nums_pl)

    assert nums_pl == nums_py
    assert namespace.x == 1
Code Example #15
def test_worker_info():

    nums = range(100)
    n_workers = 4

    def set_1(worker_info):
        return worker_info.index

    nums_pl = pr.map(
        lambda x, index: index,
        nums,
        on_start=set_1,
        workers=n_workers,
    )
    nums_pl = set(nums_pl)

    assert nums_pl.issubset(set(range(n_workers)))
Code Example #16
def test_flat_map_square_filter_workers_pipe(nums):
    def _generator(x):
        yield x
        yield x + 1
        yield x + 2

    nums_py = map(lambda x: x**2, nums)
    nums_py = cz.mapcat(_generator, nums_py)
    nums_py = cz.filter(lambda x: x > 1, nums_py)
    nums_py = list(nums_py)

    nums_pl = (nums
               | pr.map(lambda x: x**2)
               | pr.flat_map(_generator, workers=3)
               | pr.filter(lambda x: x > 1)
               | list)

    assert sorted(nums_pl) == sorted(nums_py)
Code Example #17
File: exit_on_ctr_c.py Project: new07/pypeln
from pypeln import process as pr
import time


def do_print(x):
    time.sleep(1)
    print(x)


stage = pr.map(do_print, range(1000), workers=5)

pr.run(stage)
Code Example #18
        if ext == '.gz':
            with gzip.open(fname, 'rb') as gzipped_wav:
                y, sr = sf.read(io.BytesIO(gzipped_wav.read()),
                                dtype='float32')
                # Multiple channels, reduce
                if y.ndim == 2:
                    y = y.mean(1)
                y = librosa.resample(y, sr, ARGS.sr)
        elif ext in ('.wav', '.flac'):
            y, sr = sf.read(fname, dtype='float32')
            if y.ndim > 1:
                y = y.mean(1)
            y = librosa.resample(y, sr, ARGS.sr)
    except Exception as e:
        # Exception usually happens because some data has 6 channels, which librosa can't handle
        logging.error(e)
        logging.error(fname)
        raise
    lms_feature = np.log(librosa.feature.melspectrogram(y, **MEL_ARGS) + EPS).T
    return fname, lms_feature


with h5py.File(ARGS.output, 'w') as store:
    for fname, feat in tqdm(pr.map(extract_feature,
                                   DF[ARGS.col].unique(),
                                   workers=ARGS.c,
                                   maxsize=4),
                            total=len(DF[ARGS.col].unique())):
        basename = Path(fname).name
        store[basename] = feat
Code Example #19
File: cli.py Project: divait/domain-randomization
    def object_detection(
        self,
        n_samples,
        n_objects,
        objects_pattern,
        backgrounds_pattern,
        output_dir,
        workers=1,
        rotation_angles=None,
        object_resize=None,
        background_resize=None,
        iou_threshold=0.0,
        object_channel_multiply=(0.5, 1.5),
        background_channel_multiply=(0.5, 1.5),
        object_channel_invert=False,
        background_channel_invert=False,
        background_rotate=False,
        object_scale=1.0,
        output_extension="png",
        segmentation=False,
    ):

        if segmentation:
            generator = dr.GenerateSegmentation()
            generator_name = "segmentation"
        else:
            generator = dr.GeneratePascalVoc()
            generator_name = "pascal_voq"

        # create transform
        transform = dr.Compose([
            dr.RandomChannelMultiply(
                objects_range=object_channel_multiply,
                background_range=background_channel_multiply,
            ) if object_channel_multiply or background_channel_multiply else
            dr.NoOp(),
            dr.RandomChannelInvert(
                objects=object_channel_invert,
                background=background_channel_invert,
            ) if object_channel_invert or background_channel_invert else
            dr.NoOp(),
            dr.Resize(
                objects=object_resize,
                background=background_resize,
            ) if object_resize or background_resize else dr.NoOp(),
            dr.RandomRotation90(background=background_rotate),
            dr.ObjectRandomPosition(),
            dr.ObjectRandomScale(
                scale=object_scale) if object_scale != 1.0 else dr.NoOp(),
            dr.ObjectRandomRotation(
                angles=rotation_angles) if rotation_angles else dr.NoOp(),
            dr.NonMaxSupression(iou_threshold=iou_threshold),
            generator,
        ])

        # get iterables
        all_object_filepaths = glob(objects_pattern)
        all_object_labels = [
            os.path.dirname(filepath).split(os.sep)[-1]
            for filepath in all_object_filepaths
        ]

        if segmentation:
            labels_map = sorted(set(all_object_labels))
            labels_map = dict((key, i) for i, key in enumerate(labels_map))
            all_object_labels = [
                labels_map[label] for label in all_object_labels
            ]

        all_background_filepaths = glob(backgrounds_pattern)
        all_object_idx = np.arange(len(all_object_filepaths))

        # make output dir
        os.makedirs(output_dir, exist_ok=True)

        def create_sample(_i):
            if hasattr(n_objects, "__iter__"):
                n_objs = np.random.randint(low=n_objects[0],
                                           high=n_objects[1] + 1)
            else:
                n_objs = n_objects

            object_idxs = np.random.choice(all_object_idx, n_objs)

            object_images = [
                dr.utils.inv_chanels(
                    cv2.imread(
                        all_object_filepaths[i],
                        cv2.IMREAD_UNCHANGED,
                    )) for i in object_idxs
            ]

            object_labels = [all_object_labels[i] for i in object_idxs]

            background_idx = np.random.randint(len(all_background_filepaths))
            background_image = dr.utils.inv_chanels(
                cv2.imread(
                    all_background_filepaths[background_idx],
                    cv2.IMREAD_UNCHANGED,
                ))

            row = dr.create_scene(
                background=background_image,
                objects=list(zip(object_labels, object_images)),
            )

            row = transform(row)

            generator_obj = row[generator_name]

            generator_obj.save(output_dir, extension=output_extension)

        def on_start(worker_info):
            np.random.seed(worker_info.index)
            random.seed(worker_info.index + 100)

        stage = pr.map(create_sample,
                       range(n_samples),
                       workers=workers,
                       on_start=on_start)
        stage = (x for x in tqdm(stage, total=n_samples))

        pr.run(stage)
Code Example #20
        row = row[1]
        y, sr = load_audio(row["file_name"], mono=not args.nomono)
        # feature = extractfeat(row["file_name"], sr, **params)
        if y.ndim > 1:
            feat = np.array([extractfeat(i, sr, **params) for i in y])
        else:
            feat = extractfeat(y, sr, **params)
        return row["audio_id"], feat

    return extract


wav_df = pd.read_csv(args.wav_csv, sep="\t")
feat_csv_data = []
with h5py.File(args.feat_h5,
               "w") as feat_store, tqdm(total=wav_df.shape[0]) as pbar:
    for audio_id, feat in pr.map(pypeln_wrapper(args.extractfeat, **argsdict),
                                 wav_df.iterrows(),
                                 workers=args.process_num,
                                 maxsize=4):
        # Transpose feat, nsamples to nsamples, feat
        feat = np.vstack(feat).transpose()
        feat_store[audio_id] = feat
        feat_csv_data.append({
            "audio_id": audio_id,
            "hdf5_path": str(Path(args.feat_h5).absolute())
        })
        pbar.update()

pd.DataFrame(feat_csv_data).to_csv(args.feat_csv, sep="\t", index=False)
Code Example #21
def process_evidences_pipeline(filenames, first_n, es_client, redis_client,
                               dry_run, output_folder, num_workers,
                               num_writers, max_queued_events, eco_scores_uri,
                               schema_uri, es_hosts, excluded_biotypes,
                               datasources_to_datatypes):
    logger = logging.getLogger(__name__)

    if not filenames:
        logger.error('tried to run with no filenames at all')
        raise RuntimeError("Must specify at least one filename of evidence")

    # files that are not fetchable
    failed_filenames = list(itertools.ifilterfalse(IO.check_to_open,
                                                   filenames))

    for uri in failed_filenames:
        logger.warning('failed to fetch uri %s', uri)

    # get the filenames that are properly fetchable
    #sort the list for consistent behaviour
    checked_filenames = sorted((set(filenames) - set(failed_filenames)))

    logger.info('start evidence processing pipeline')

    #load lookup tables
    lookup_data = LookUpDataRetriever(
        es_client, redis_client, [],
        (LookUpDataType.TARGET, LookUpDataType.DISEASE,
         LookUpDataType.ECO)).lookup

    #create an iterable of lines from all file handles
    evs = IO.make_iter_lines(checked_filenames, first_n)

    #create functions with pre-baked arguments
    validation_on_start_baked = functools.partial(validation_on_start,
                                                  lookup_data, eco_scores_uri,
                                                  schema_uri,
                                                  excluded_biotypes,
                                                  datasources_to_datatypes)

    writer_global_init, writer_local_init, writer_main, writer_local_shutdown, writer_global_shutdown = setup_writers(
        dry_run, es_hosts, output_folder)
    if writer_global_init:
        writer_global_init()

    #here is the pipeline definition
    pl_stage = pr.map(process_evidence,
                      evs,
                      workers=num_workers,
                      maxsize=max_queued_events,
                      on_start=validation_on_start_baked)

    pl_stage = pr.map(writer_main,
                      pl_stage,
                      workers=num_writers,
                      maxsize=max_queued_events,
                      on_start=writer_local_init,
                      on_done=writer_local_shutdown)

    logger.info('run evidence processing pipeline')
    results = reduce_tuple_with_sum(pr.to_iterable(pl_stage))

    #perform any single-thread cleanup
    if writer_global_shutdown:
        writer_global_shutdown()

    logger.info("results (failed: %s, succeed: %s)", results[0], results[1])
    if failed_filenames:
        raise RuntimeError('unable to handle %s', str(failed_filenames))

    if not results[1]:
        raise RuntimeError("No evidence was sucessful!")
Code Example #22
File: Evidences.py Project: jwills/data_pipeline
def process_evidences_pipeline(
        filenames, first_n, es_hosts, es_index_valid, es_index_invalid,
        es_doc_valid, es_doc_invalid, es_mappings_valid, es_mappings_invalid,
        es_settings_valid, es_settings_invalid, es_index_gene, es_index_eco,
        es_index_efo, dry_run, workers_validation, queue_validation,
        workers_write, queue_write, eco_scores_uri, schema_uri,
        excluded_biotypes, datasources_to_datatypes):

    logger = logging.getLogger(__name__)

    # do not pass this es object to other processes, single process only!
    es = new_es_client(es_hosts)

    if not filenames:
        logger.error('tried to run with no filenames at all')
        raise RuntimeError("Must specify at least one filename of evidence")

    # files that are not fetchable
    failed_filenames = list(itertools.ifilterfalse(IO.check_to_open,
                                                   filenames))

    for uri in failed_filenames:
        logger.warning('failed to fetch uri %s', uri)

    # get the filenames that are properly fetchable
    #sort the list for consistent behaviour
    checked_filenames = sorted((set(filenames) - set(failed_filenames)))

    logger.info('start evidence processing pipeline')

    #create an iterable of lines from all file handles
    evs = IO.make_iter_lines(checked_filenames, first_n)

    #create functions with pre-baked arguments
    validation_on_start_baked = functools.partial(validation_on_start,
                                                  eco_scores_uri, schema_uri,
                                                  excluded_biotypes,
                                                  datasources_to_datatypes,
                                                  es_hosts, es_index_gene,
                                                  es_index_eco, es_index_efo)

    #here is the pipeline definition
    pl_stage = pr.map(process_evidence,
                      evs,
                      workers=workers_validation,
                      maxsize=queue_validation,
                      on_start=validation_on_start_baked)

    logger.info('stages created, running scoring and writing')

    with URLZSource(es_mappings_valid).open() as mappings_file:
        mappings_valid = json.load(mappings_file)

    with URLZSource(es_mappings_invalid).open() as mappings_file:
        mappings_invalid = json.load(mappings_file)

    with URLZSource(es_settings_valid).open() as settings_file:
        settings_valid = json.load(settings_file)

    with URLZSource(es_settings_invalid).open() as settings_file:
        settings_invalid = json.load(settings_file)

    with ElasticsearchBulkIndexManager(es, es_index_invalid, settings_invalid,
                                       mappings_invalid):
        with ElasticsearchBulkIndexManager(es, es_index_valid, settings_valid,
                                           mappings_valid):
            #load into elasticsearch
            chunk_size = 1000  #TODO make configurable
            actions = elasticsearch_actions(pl_stage, es_index_valid,
                                            es_index_invalid, es_doc_valid,
                                            es_doc_invalid)
            failcount = 0

            if not dry_run:
                results = None
                if workers_write > 0:
                    logger.debug("Using parallel bulk writer for Elasticearch")
                    # this can silently crash ?
                    results = elasticsearch.helpers.parallel_bulk(
                        es,
                        actions,
                        thread_count=workers_write,
                        queue_size=queue_write,
                        chunk_size=chunk_size)
                else:
                    logger.debug(
                        "Using streaming bulk writer for Elasticearch")
                    results = elasticsearch.helpers.streaming_bulk(
                        es, actions, chunk_size=chunk_size)

                for success, details in results:
                    if not success:
                        failcount += 1

                if failcount:
                    raise RuntimeError("%s relations failed to index" %
                                       failcount)

            print('stages created, ran scoring and writing')
            logger.info('stages created, ran scoring and writing')

    if failed_filenames:
        raise RuntimeError('unable to handle %s', str(failed_filenames))
Code Example #23
File: Association.py Project: jwills/data_pipeline
    def process_all(self, dry_run):

        # do not pass this es object to other processes, single process only!
        es = new_es_client(self.es_hosts)

        targets = self.get_targets(es)

        self.logger.info('setting up stages')

        #bake the arguments for the setup into function objects
        produce_evidence_local_init_baked = functools.partial(produce_evidence_local_init, 
            self.es_hosts, self.es_index_val_right,
            self.scoring_weights, self.is_direct_do_not_propagate, 
            self.datasources_to_datatypes)
        score_producer_local_init_baked = functools.partial(score_producer_local_init,
            self.datasources_to_datatypes, dry_run, self.es_hosts,
            self.es_index_gene, self.es_index_eco, self.es_index_hpa, self.es_index_efo)
        
        #pipeline stage for making the lists of the target/disease pairs and evidence
        pipeline_stage1 = pr.flat_map(produce_evidence, targets, 
            workers=self.workers_production,
            maxsize=self.queue_produce,
            on_start=produce_evidence_local_init_baked)

        #pipeline stage for scoring the evidence sets
        #includes writing to elasticsearch
        pipeline_stage2 = pr.map(score_producer, pipeline_stage1, 
            workers=self.workers_score,
            maxsize=self.queue_score,
            on_start=score_producer_local_init_baked)

        with URLZSource(self.es_mappings).open() as mappings_file:
            mappings = json.load(mappings_file)

        with URLZSource(self.es_settings).open() as settings_file:
            settings = json.load(settings_file)
        with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):
            #load into elasticsearch
            self.logger.info('stages created, running scoring and writing')
            client = es
            chunk_size = 1000 #TODO make configurable
            actions = self.elasticsearch_actions(pipeline_stage2, 
                self.es_index, self.es_doc)
            failcount = 0

            if not dry_run:
                results = None
                if self.workers_write > 0:
                    self.logger.debug("Using parallel bulk writer for Elasticearch")
                    results = elasticsearch.helpers.parallel_bulk(client, actions,
                            thread_count=self.workers_write,
                            queue_size=self.queue_write, 
                            chunk_size=chunk_size)
                else:
                    self.logger.debug("Using streaming bulk writer for Elasticearch")
                    results = elasticsearch.helpers.streaming_bulk(client, actions,
                            chunk_size=chunk_size)
                for success, details in results:
                    if not success:
                        failcount += 1

                if failcount:
                    raise RuntimeError("%s relations failed to index" % failcount)

        self.logger.info("DONE")
Code Example #24
File: extract_features.py Project: russellizadi/CDur
def extract_feature(fname):
    """extract_feature
    Extracts a log mel spectrogram feature from a filename, currently supports two filetypes:

    1. Wave
    2. Gzipped wave

    :param fname: filepath to the file to extract
    """
    ext = Path(fname).suffix
    if ext in ('.wav', '.flac'):
        y, sr = sf.read(fname, dtype='float32')
        if y.ndim > 1:
            y = y.mean(1)
        y = librosa.resample(y, sr, ARGS.sr)

    lms_feature = np.log(librosa.feature.melspectrogram(y, **MEL_ARGS) + EPS).T
    return fname, lms_feature


with h5py.File(ARGS.output, 'w') as store:
    for fname, feat in tqdm(pr.map(extract_feature, [
            p for p in DF[ARGS.col].unique()
            if not p.split("/")[-1].startswith(".")
    ],
                                   workers=ARGS.c,
                                   maxsize=4),
                            total=len(DF[ARGS.col].unique())):
        basename = Path(fname).name
        store[basename] = feat
Code Example #25
###################
# from_to_iterable
###################
@hp.given(nums=st.lists(st.integers()))
@hp.settings(max_examples=MAX_EXAMPLES)
def test_from_to_iterable(nums):

    nums_pl = nums
    nums_pl = pr.from_iterable(nums_pl)
    nums_pl = cz.partition_all(10, nums_pl)
    nums_pl = pr.map(sum, nums_pl)
    nums_pl = list(nums_pl)

    nums_py = nums
    nums_py = cz.partition_all(10, nums_py)
    nums_py = map(sum, nums_py)
    nums_py = list(nums_py)

    assert nums_py == nums_pl


if __name__ == '__main__':
    error = None

    def raise_error(x):
        raise MyError()

    stage = pr.map(raise_error, range(10))

    list(stage)