def index_threaded(fp_out):
    """Snowflake datasets in worker processes and write results as JSON lines.

    Datasets are pulled one at a time from ``pluplusch`` and, unless
    ``u.ignore`` rejects them, submitted to a pool of 4 worker processes
    running ``meta.snowflake``.  At most 5 submissions are kept in flight.
    Whenever a submission has finished, the value returned by
    ``meta.snowflake`` is serialised with ``json.dumps`` and written to
    ``fp_out`` followed by a newline.

    The function returns once the dataset iterator is exhausted and every
    submitted job has been written out.  An exception raised inside a worker
    propagates out of ``future.result()``.

    :param fp_out: object with a ``write(str)`` method that receives one
        JSON document per line.
    """
    datasets = pluplusch(catalogs=dl.catalogs, cache_dir=datadir, proxies=proxies())
    # Maps (catalog, datasetid) -> Future for jobs that are still in flight.
    futures = {}
    # Set once the dataset iterator has raised StopIteration, so that we stop
    # polling it and know when it is safe to leave the loop.
    exhausted = False
    with ProcessPoolExecutor(4) as e:
        while True:
            # Top up the queue with at most one new dataset per pass.
            if not exhausted and len(futures) < 5:
                try:
                    dataset = next(datasets)
                except StopIteration:
                    exhausted = True
                else:
                    if not u.ignore(dataset):
                        key = (dataset["catalog"], dataset["datasetid"])
                        futures[key] = e.submit(meta.snowflake, dataset)

            # Iterate over a snapshot because finished entries are deleted
            # from the dict while looping.
            for key, future in list(futures.items()):
                if future.done():
                    dataset = future.result()
                    fp_out.write(json.dumps(dataset) + "\n")
                    del futures[key]

            logger.debug("In line for snowflaking: %s" % futures.keys())

            # The previous test compared the dict against ``[]``, which is
            # never equal, so the loop could not terminate.  Checking only for
            # an empty dict would be wrong as well: the dict is also empty
            # whenever the most recent dataset was ignored.
            if exhausted and not futures:
                break
            # NOTE(review): this loop polls without sleeping or blocking, so
            # it spins while waiting for workers; consider
            # concurrent.futures.wait(..., return_when=FIRST_COMPLETED).
def index(fp_out):
    """Snowflake every non-ignored dataset serially and emit JSON lines.

    Each dataset produced by ``pluplusch`` that ``u.ignore`` does not reject
    is passed to ``meta.snowflake``; the dataset object itself (not the return
    value of ``meta.snowflake``) is then dumped as JSON and written to
    ``fp_out`` with a trailing newline.

    :param fp_out: object with a ``write(str)`` method.
    """
    source = pluplusch(catalogs=dl.catalogs, cache_dir=datadir, proxies=proxies())
    for dataset in source:
        if u.ignore(dataset):
            continue
        # The return value is discarded; presumably snowflake annotates the
        # dataset in place -- confirm against meta.snowflake.
        meta.snowflake(dataset)
        line = json.dumps(dataset) + "\n"
        fp_out.write(line)
def filter_columns(catalogs=None):
    """Yield each dataset's download URL with its NAME-matching column names.

    For every dataset produced by ``pluplusch(catalogs=...)`` this yields a
    ``(download_url, colnames)`` pair, where ``colnames`` is the list of
    entries in ``dataset['colnames']`` for which ``re.match(NAME, entry)``
    succeeds (``re.match`` anchors at the start of the string).

    :param catalogs: catalogs forwarded to ``pluplusch``.  ``None`` (the
        default) is replaced by a fresh empty list, matching the previous
        ``[]`` default without sharing one list object between calls.
    """
    if catalogs is None:
        catalogs = []
    # Build the matcher once instead of once per dataset.
    matches_name = partial(re.match, NAME)
    for dataset in pluplusch(catalogs=catalogs):
        colnames = [name for name in dataset['colnames'] if matches_name(name)]
        yield dataset['download_url'], colnames