Beispiel #1
0
def index_threaded(fp_out):
    """Snowflake datasets in worker processes and write each result as a JSON line.

    Datasets are pulled from ``pluplusch``; any dataset for which ``u.ignore``
    returns a true value is skipped. At most five ``meta.snowflake`` jobs are
    kept submitted to a pool of four worker processes at any time. Each
    finished job's return value is serialized with ``json.dumps`` and written
    to ``fp_out`` followed by a newline. Results are written as jobs finish,
    which is not necessarily the order in which the datasets were read.

    Args:
        fp_out: Writable text file-like object; only ``write`` is called on it.

    Raises:
        Any exception raised inside ``meta.snowflake`` is re-raised here by
        ``future.result()``.
    """
    # Local import so the block does not depend on the file's import header.
    from concurrent.futures import FIRST_COMPLETED, wait

    max_pending = 5  # upper bound on submitted-but-not-yet-written jobs

    datasets = pluplusch(catalogs=dl.catalogs, cache_dir=datadir, proxies=proxies())
    futures = {}
    exhausted = False  # set once the dataset iterator raises StopIteration
    with ProcessPoolExecutor(4) as e:
        while True:
            # Top the queue up until it is full or the source runs dry.
            while not exhausted and len(futures) < max_pending:
                try:
                    dataset = next(datasets)
                except StopIteration:
                    exhausted = True
                else:
                    if not u.ignore(dataset):
                        key = (dataset["catalog"], dataset["datasetid"])
                        futures[key] = e.submit(meta.snowflake, dataset)

            # The fill loop only stops with an empty queue when the source is
            # exhausted, so an empty dict here means all work is finished.
            if not futures:
                break

            # Block until at least one job finishes instead of busy-polling.
            wait(list(futures.values()), return_when=FIRST_COMPLETED)

            for key, future in list(futures.items()):
                if future.done():
                    fp_out.write(json.dumps(future.result()) + "\n")
                    del futures[key]
                    logger.debug("In line for snowflaking: %s", futures.keys())
Beispiel #2
0
def index(fp_out):
    """Snowflake every non-ignored dataset sequentially and write it as a JSON line.

    Iterates over the datasets produced by ``pluplusch``, skips those for
    which ``u.ignore`` returns a true value, calls ``meta.snowflake`` on each
    remaining dataset, and then writes ``json.dumps(dataset)`` plus a newline
    to ``fp_out``.

    Args:
        fp_out: Writable text file-like object; only ``write`` is called on it.
    """
    source = pluplusch(catalogs=dl.catalogs, cache_dir=datadir, proxies=proxies())
    for dataset in source:
        if u.ignore(dataset):
            continue
        # The return value is discarded; the dataset object itself is what
        # gets serialized (presumably snowflake mutates it in place — verify).
        meta.snowflake(dataset)
        line = json.dumps(dataset) + "\n"
        fp_out.write(line)
Beispiel #3
0
def filter_columns(catalogs=None):
    """Yield each dataset's download URL with its column names that match ``NAME``.

    Args:
        catalogs: Catalogs forwarded to ``pluplusch``. ``None`` (the default)
            is replaced by a fresh empty list on every call, so no list
            object is shared between calls.

    Yields:
        ``(download_url, colnames)`` tuples, where ``colnames`` is the list of
        entries from ``dataset['colnames']`` for which ``re.match(NAME, ...)``
        finds a match at the start of the string.
    """
    if catalogs is None:
        catalogs = []
    for dataset in pluplusch(catalogs=catalogs):
        colnames = [name for name in dataset['colnames'] if re.match(NAME, name)]
        yield dataset['download_url'], colnames