def extract_sopr(options):
    """Extract cached SOPR zip archives into ORIG_DIR.

    Globs CACHE_DIR for sopr/*/*/*.zip, maps each archive to its
    mirrored destination under ORIG_DIR, extracts them concurrently via
    the stream library's ThreadPool, then logs every success and failure.

    :param options: dict of runtime options; only 'loglevel' is read here
                    (when truthy it overrides the module logger's level).
    """
    if not os.path.exists(ORIG_DIR):
        mkdir_p(ORIG_DIR)

    if options.get('loglevel', None):
        log.setLevel(options['loglevel'])

    cache_paths = glob(os.path.join(CACHE_DIR, 'sopr/*/*/*.zip'))
    log.debug("cache paths ({num}):".format(num=len(cache_paths))
              + "\n\t".join(cache_paths))

    extracted = cache_paths >> filter(lambda x: check_ext(x, ext='.zip')) \
                            >> map(lambda p: translate_dir(p,
                                                           from_dir=CACHE_DIR,
                                                           to_dir=ORIG_DIR)) \
                            >> ThreadPool(extract_all_zips)

    for path, destination_dir, num_files in extracted:
        log.info("successfully extracted " +
                 "{path} to {dest_dir} ({num} files)".format(
                     path=path, dest_dir=destination_dir, num=num_files))

    # BUG FIX: the format string said "{path}" but only url= and
    # exception= were supplied to .format(), so every extraction failure
    # raised KeyError here instead of being logged. Field renamed to {url}
    # to match the loop variable.
    for url, exception in extracted.failure:
        log.error("extracting from {url} failed: {exception}".format(
            url=url, exception=exception))
def confirm_download_schedule(schedule): """Reports the total number of bytes and total number of files to download. Also lists the inaccessible files (based on HEAD response). Then asks user to confirm downloading. """ def content_length(tpl): return tpl[2][1] def status_code(tpl): return tpl[2][0] def href(tpl): return tpl[0] def is_OK(tpl): return status_code(tpl) == 200 def not_OK(tpl): return status_code(tpl) != 200 increment = lambda x, _: x + 1 file_count = ( schedule >> stream.filter(is_OK) >> stream.reduce(increment, 0)) bytes_to_download = ( schedule >> stream.filter(is_OK) >> stream.map(content_length) >> sum) inaccessible_files = (schedule >> stream.filter(not_OK) >> list) if len(inaccessible_files) > 0: print print "Some files are inaccessible:" for (idx, sched) in enumerate(inaccessible_files): print "%d: %d %s" % (idx, status_code(sched), href(sched)) if bytes_to_download > 0: print print "Need to download %s in %d files." % ( pretty_bytes(bytes_to_download), file_count) print print "Are you sure you want to continue? [Y/n]" user_input = raw_input("> ") return (user_input.upper() in ("", "Y", "YES")) else: print print "Nothing to download." return False
def confirm_download_schedule(schedule): """Reports the total number of bytes and total number of files to download. Also lists the inaccessible files (based on HEAD response). Then asks user to confirm downloading. """ def content_length(tpl): return tpl[2][1] def status_code(tpl): return tpl[2][0] def href(tpl): return tpl[0] def is_OK(tpl): return status_code(tpl) == 200 def not_OK(tpl): return status_code(tpl) != 200 increment = lambda x, _: x + 1 file_count = schedule >> stream.filter(is_OK) >> stream.reduce(increment, 0) bytes_to_download = schedule >> stream.filter(is_OK) >> stream.map(content_length) >> sum inaccessible_files = schedule >> stream.filter(not_OK) >> list if len(inaccessible_files) > 0: print print "Some files are inaccessible:" for (idx, sched) in enumerate(inaccessible_files): print "%d: %d %s" % (idx, status_code(sched), href(sched)) if bytes_to_download > 0: print print "Need to download %s in %d files." % (pretty_bytes(bytes_to_download), file_count) print print "Are you sure you want to continue? [Y/n]" user_input = raw_input("> ") return user_input.upper() in ("", "Y", "YES") else: print print "Nothing to download." return False
def run( self ):
    """Wire the generator through the processing stages into a sink.

    NOTE(review): item[:1] is the stream library's first-item sink --
    presumably a TRANSIENT run keeps only the first result while any
    other run type reduces the whole stream with min; confirm.
    """
    # Sink choice depends on the configured run type.
    sink = item[:1] if self.type == "TRANSIENT" else min

    smoother = MovAvg(10)
    passthrough = map(myPrint1)  # final stage: print-only pass-through

    stages = [
        map(myPrint1),                           # print raw samples
        map(lambda sample: smoother(sample)),    # 10-point moving average
        filter(lambda sample: bigVals(sample, 40)),
        map(notify_ctrl),
        passthrough,
    ]

    # Chain generator -> every stage -> sink (same left-to-right order
    # as an explicit a >> b >> c expression).
    pipeline = self.myGen()
    for stage in stages:
        pipeline = pipeline >> stage
    pipeline >> sink
yield math.sqrt(i) #static computation in a time block ts = time() evens = instream[::2] odds = instream[1::2] evens = map(math.sqrt, evens) odds = map(math.sqrt, odds) even_ans = scan(ops.add, evens) odd_ans = reduce(ops.add, odds ) static_time = time() - ts #streaming computation # create our filters cong_2 = lambda x: x%2==0 evens = filter(cong_2) odds = filter(lambda x: not cong_2(x)) ts = time() # wire the split into the filters instream >> tee(evens) instream >> odds # wire up the map and fold (scan/accumulate) foldedevens = (evens >> stream.map(math.sqrt) >> fold(ops.add)) print(time() - ts) sqrtodds = odds >> (stream.Processor(my_sqrt)) print("established the sqrter %f" % (time() - ts)) foldedodd = sqrtodds >> stream.fold(ops.add) print("made odd folder: %f" % (time() - ts)) # force execution soans = foldedodd >> item[-1:]
return values def randomized(n): values = [] for _ in range(n): values.append(randint(-sys.maxint, sys.maxint)) return values for v in [10, 100, 1000] >> stream.map(alternating): dataset.append(v) for v in [10, 100, 1000] >> stream.map(randomized): dataset.append(v) func = stream.filter(lambda x: x & 1) resultset = dataset >> stream.map(lambda s: s >> func >> set) >> list ## Test scenario def threadpool(i): result = dataset[i] >> stream.ThreadPool(func, poolsize=2) >> set pprint(result) assert result == resultset[i] def processpool(i): result = dataset[i] >> stream.ProcessPool(func, poolsize=2) >> set pprint(result)
values.append(-i) return values def randomized(n): values = [] for _ in range(n): values.append(randint(-sys.maxint, sys.maxint)) return values for v in [10, 100, 1000] >> map(alternating): dataset.append(v) for v in [10, 100, 1000] >> map(randomized): dataset.append(v) func = filter(lambda x: x&1) resultset = dataset >> map(lambda s: s >> func >> set) >> list ## Test scenario def threadpool(i): result = dataset[i] >> ThreadPool(func, poolsize=2) >> set pprint(result) assert result == resultset[i] def processpool(i): result = dataset[i] >> ProcessPool(func, poolsize=2) >> set pprint(result) assert result == resultset[i]
# no argument given: exit if len(args) == 0: print('Nothing to do. Try -h or --help option.') sys.exit(0) # main action arguments if args[0] in ('-h', '--help'): # help usage() sys.exit(0) elif args[0] in ('-r', '--retrieve'): # -r / --retrieve: get tweets in json format to STDOUT args = args[1:] s = None if len(args) > 0 and args[0] == 'filter': s = stream.filter(track=args[1:], limit=RETRIEVAL_LIMIT) elif len(args) > 0 and args[0] == 'news': s = stream.filter(follow=list(english_sources_twitter.values()), limit=RETRIEVAL_LIMIT) elif len(args) > 1 and args[0] == 'idsfile': s = stream.fromIDsFile(args[1]) else: s = stream.filter() for tweet in s: print(json.dumps(tweet)) sys.exit(0) elif args[0] in ('-a', '--analyze'): # -a / --analyze: generate in output directory topics.json, summary.txt and stats.csv # exit if too few arguments if len(args) < 3:
def randomized(n):
    """Return n random ints uniformly drawn from [-sys.maxint, sys.maxint]."""
    drawn = []
    for _ in range(n):
        drawn.append(randint(-sys.maxint, sys.maxint))
    return drawn


# Populate the shared dataset with alternating-sign and randomized
# samples of sizes 10, 100 and 1000.
for values in [10, 100, 1000] >> stream.map(alternating):
    dataset.append(values)
for values in [10, 100, 1000] >> stream.map(randomized):
    dataset.append(values)

# Odd-value filter plus the sequentially computed expected results.
func = stream.filter(lambda x: x & 1)
resultset = dataset >> stream.map(lambda values: values >> func >> set) >> list

## Test scenario

def threadpool(i):
    """Check a 2-thread pool against the sequential answer for sample i."""
    result = dataset[i] >> stream.ThreadPool(func, poolsize=2) >> set
    pprint(result)
    assert result == resultset[i]

def processpool(i):
    """Run sample i through a 2-process pool and print the outcome."""
    result = dataset[i] >> stream.ProcessPool(func, poolsize=2) >> set
    pprint(result)