def test_ignore():
    path = os.path.join('blizzard', 'test', 'fixture',
                        'lieux-de-tournage-de-films-long-metrage-paris.p')
    with open(path, 'rb') as fp:
        dataset = {
            'fields': [],
            'download': pickle.load(fp),
        }
    n.assert_true(ignore(dataset))
def index_threaded(fp_out):
    datasets = pluplusch(catalogs=dl.catalogs, cache_dir=datadir,
                         proxies=proxies())
    futures = {}
    exhausted = False
    with ProcessPoolExecutor(4) as e:
        while True:
            # Keep a small window of at most five pending snowflake jobs.
            if not exhausted and len(futures) < 5:
                try:
                    dataset = next(datasets)
                except StopIteration:
                    exhausted = True
                else:
                    if not u.ignore(dataset):
                        futures[(dataset["catalog"], dataset["datasetid"])] = \
                            e.submit(meta.snowflake, dataset)
            # Write out finished jobs; the snowflaked dataset comes back via
            # the future's result, since a worker process cannot mutate the
            # local dataset in place.
            for key, future in list(futures.items()):
                if future.done():
                    dataset = future.result()
                    fp_out.write(json.dumps(dataset) + "\n")
                    del futures[key]
            logger.debug("In line for snowflaking: %s" % futures.keys())
            # Stop once the catalog stream is exhausted and every submitted
            # job has been written. (The original check compared the futures
            # dict to an empty list, which never terminated the loop.)
            if exhausted and not futures:
                break
def index(fp_out):
    # Single-process variant: snowflake each dataset in turn and write it
    # out as newline-delimited JSON.
    for dataset in pluplusch(catalogs=dl.catalogs, cache_dir=datadir,
                             proxies=proxies()):
        if not u.ignore(dataset):
            meta.snowflake(dataset)
            fp_out.write(json.dumps(dataset) + "\n")
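
# Hypothetical usage sketch, not part of the original module: both indexers
# only assume fp_out is a writable text file receiving newline-delimited
# JSON, so a caller might look roughly like this. The output filename is an
# assumption for illustration.
if __name__ == '__main__':
    with open('datasets.jsonl', 'w') as fp_out:
        index(fp_out)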