def parseCommandLine():
    """Parse the command-line options for this script.

    Options registered: -d/--db, -n/--noop, -v/--verbose, -m/--musiconly,
    -a/--appsonly and -q/--qps (int, stored as ``options.max_qps``,
    default 2).

    When a db is supplied it is handed to ``utils.init_db_config``.

    Returns:
        The ``(options, args)`` pair produced by ``OptionParser.parse_args``.
    """
    usage_text = "Usage: %prog [options] query"
    version_text = "%prog " + __version__
    cli = OptionParser(usage=usage_text, version=version_text)

    cli.add_option(
        "-d", "--db",
        dest="db", action="store", type="string", default=None,
        help="db to connect to",
    )

    # The boolean switches share one shape; they are registered in their
    # original order so the generated --help listing is unchanged.
    switches = (
        ("-n", "--noop", "run in noop mode without modifying anything"),
        ("-v", "--verbose", "enable verbose logging"),
        ("-m", "--musiconly", "only parse music feeds"),
        ("-a", "--appsonly", "only parse app feeds"),
    )
    for short_flag, long_flag, description in switches:
        cli.add_option(short_flag, long_flag, default=False,
                       action="store_true", help=description)

    cli.add_option(
        "-q", "--qps",
        dest="max_qps", action="store", type="int", default=2,
        help="max QPS sent to iTunes",
    )

    options, args = cli.parse_args()

    if options.db:
        utils.init_db_config(options.db)

    return (options, args)
def parseCommandLine():
    """Parse the command-line options for the entity dedupe script.

    Options registered: -d/--db, -n/--noop, -v/--verbose and -f/--force.
    At least two positional arguments are required; otherwise the help text
    is printed and the process exits with status 1.

    When a db is supplied it is handed to ``utils.init_db_config``.

    Returns:
        The ``(options, args)`` pair produced by ``OptionParser.parse_args``.
    """
    usage_text = ("Usage: %prog [options] "
                  "one_or_more_entity_ids_to_delete entity_id_to_keep")
    version_text = "%prog " + __version__
    cli = OptionParser(usage=usage_text, version=version_text)

    cli.add_option(
        "-d", "--db",
        dest="db", action="store", type="string", default=None,
        help="db to connect to for output",
    )

    # Plain on/off switches, kept in the original registration order.
    switches = (
        ("-n", "--noop",
         "run the dedupper in noop mode without modifying anything"),
        ("-v", "--verbose", "enable verbose logging"),
        ("-f", "--force", "force overriding of keys during deduping"),
    )
    for short_flag, long_flag, description in switches:
        cli.add_option(short_flag, long_flag, default=False,
                       action="store_true", help=description)

    options, args = cli.parse_args()

    # Fewer than two positional arguments is a usage error.
    if len(args) < 2:
        cli.print_help()
        sys.exit(1)

    if options.db:
        utils.init_db_config(options.db)

    return (options, args)
def parseCommandLine():
    """Parse options and derive the offset/limit window for this run.

    Options registered: -d/--db, -n/--noop, -r/--ratio, -o/--offset and
    -l/--limit.

    Side effects visible in this function:
      * the parsed options are published as ``Globals.options``;
      * ``options.verbose`` is forced to False and ``options.mount`` to True;
      * when a db is given it is handed to ``utils.init_db_config``;
      * the two popularity-per-genre DB objects are created, started and
        joined, then attached to ``options``;
      * ``options.count0`` / ``options.count1`` / ``options.count`` hold the
        row counts of their tables and the sum of both.

    If ``--ratio`` ("num/den") is given, offset and limit are recomputed from
    it and logged; otherwise a missing limit defaults to the total count.

    Returns:
        The options object only (positional args are discarded).
    """
    usage_text = "Usage: %prog [options] [sources]"
    version_text = "%prog " + __version__
    cli = OptionParser(usage=usage_text, version=version_text)

    cli.add_option(
        "-d", "--db",
        dest="db", action="store", type="string", default=None,
        help="db to connect to for output",
    )
    cli.add_option(
        "-n", "--noop",
        action="store_true", default=False,
        help="run the dedupper in noop mode without modifying anything",
    )
    cli.add_option(
        "-r", "--ratio",
        dest="ratio", action="store", type="string", default=None,
        help="where this crawler fits in to a distributed stack",
    )
    cli.add_option(
        "-o", "--offset",
        dest="offset", type="int", default=0,
        help="start index of entities to import",
    )
    cli.add_option(
        "-l", "--limit",
        type="int", default=None,
        help="limits the number of entities to import",
    )

    options, _unused_args = cli.parse_args()
    Globals.options = options

    options.verbose = False
    options.mount = True

    if options.db:
        utils.init_db_config(options.db)

    # Create both DB objects first, then start both before joining either,
    # exactly as the original sequence did.
    album_db = AppleEPFAlbumPopularityPerGenreRelationalDB()
    options.album_popularity_per_genre = album_db
    song_db = AppleEPFSongPopularityPerGenreRelationalDB()
    options.song_popularity_per_genre = song_db

    album_db.start()
    song_db.start()
    album_db.join()
    song_db.join()

    album_query = 'SELECT COUNT(*) FROM "%s"' % album_db.table
    options.count0 = album_db.execute(album_query).fetchone()[0]

    song_query = 'SELECT COUNT(*) FROM "%s"' % song_db.table
    options.count1 = song_db.execute(song_query).fetchone()[0]

    options.count = options.count0 + options.count1

    if options.ratio:
        # "num/den": this run covers slice `num` out of `den` slices.
        raw_num, raw_den = options.ratio.split('/')
        int_num, int_den = int(raw_num), int(raw_den)
        num, den = float(int_num), float(int_den)

        options.offset = int(math.floor((options.count * (num - 1)) / den))
        options.limit = int(math.ceil(options.count / den) + 1)

        utils.log("ratio %s) offset=%d, limit=%d" %
                  (options.ratio, options.offset, options.limit))
    elif options.limit is None:
        options.limit = options.count

    return options
def parseCommandLine():
    """Parse the command-line options for this script.

    Options registered: -d/--db, -n/--noop, -c/--check and
    -s/--sampleSetSize (an integer percentage).

    ``options.sampleSetRatio`` is derived from ``--sampleSetSize``: 1.0 when
    the option is absent, otherwise the percentage divided by 100.

    When a db is supplied it is handed to ``utils.init_db_config``.

    Returns:
        The ``(options, args)`` pair produced by ``OptionParser.parse_args``.
    """
    usage_text = "Usage: %prog [options] query"
    version_text = "%prog " + __version__
    cli = OptionParser(usage=usage_text, version=version_text)

    cli.add_option(
        "-d", "--db",
        dest="db", action="store", type="string", default=None,
        help="db to connect to",
    )
    cli.add_option(
        "-n", "--noop",
        action="store_true", default=False,
        help="noop mode (run read-only)",
    )
    cli.add_option(
        "-c", "--check",
        action="store", default=None,
        help="optionally filter checks based off of their name",
    )
    cli.add_option(
        "-s", "--sampleSetSize",
        action="store", type="int", default=None,
        help="sample size as a percentage (e.g., 5 for 5%)",
    )

    options, args = cli.parse_args()

    percentage = options.sampleSetSize
    options.sampleSetRatio = 1.0 if percentage is None else percentage / 100.0

    if options.db:
        utils.init_db_config(options.db)

    return (options, args)
def parseCommandLine():
    """Parse the command-line options for the search script.

    Options registered: -d/--db, -l/--limit, -o/--offset, -L/--Local,
    -a/--a (stored as ``options.location``), -q/--quick, -v/--verbose,
    -c/--category and -s/--subcategory. At least one positional argument is
    required; otherwise help is printed and the process exits with status 1.

    Post-processing performed here:
      * a given db is handed to ``utils.init_db_config``;
      * ``options.location`` ("lat,lng") is converted to a tuple of two
        floats; if that fails, an error and the help text are printed and
        the process exits with status 1;
      * when --verbose was passed, the module-level ``_verbose`` is set;
      * ``options.kinds`` and ``options.types`` are filled from
        ``_convertCategorySubcategory(category, subcategory)``.

    Returns:
        The ``(options, args)`` pair produced by ``OptionParser.parse_args``.
    """
    global _verbose

    usage_text = "Usage: %prog [options] query"
    version_text = "%prog " + __version__
    cli = OptionParser(usage=usage_text, version=version_text)

    cli.add_option(
        "-d", "--db",
        action="store", type="string", default=None,
        help="db to connect to (e.g., peach.db0; defaults to localhost)",
    )
    cli.add_option(
        "-l", "--limit",
        type="int", default=None,
        help="limits the number of results to return",
    )
    cli.add_option(
        "-o", "--offset",
        type="int", default=0,
        help="optional offset into results to support paging",
    )
    cli.add_option(
        "-L", "--Local",
        action="store_true", default=False,
        help="enable local nearby search",
    )
    cli.add_option(
        "-a", "--a",
        dest="location", action="store", type="string", default=None,
        help="location (lat/lng, e.g. '40.736,-73.989')",
    )
    cli.add_option(
        "-q", "--quick",
        action="store_true", default=False,
        help="disable third party API queries",
    )
    # Defaults to None (not False) so "flag not given" can be told apart.
    cli.add_option(
        "-v", "--verbose",
        action="store_true", default=None,
        help="turn verbosity on",
    )
    cli.add_option(
        "-c", "--category",
        dest="category", action="store", type="string", default=None,
        help="filters results by a given category",
    )
    cli.add_option(
        "-s", "--subcategory",
        dest="subcategory", action="store", type="string", default=None,
        help="filters results by a given subcategory",
    )

    options, args = cli.parse_args()

    if len(args) <= 0:
        cli.print_help()
        sys.exit(1)

    if options.db:
        utils.init_db_config(options.db)

    if options.location:
        try:
            lat_text, lng_text = options.location.split(',')
            options.location = (float(lat_text), float(lng_text))
        except Exception:
            print("invalid location given '%s'" % options.location)
            cli.print_help()
            sys.exit(1)

    if options.verbose is not None:
        _verbose = options.verbose

    converted = _convertCategorySubcategory(options.category,
                                            options.subcategory)
    options.kinds, options.types = converted

    return (options, args)
def parseCommandLine():
    """Parse the command-line options for this script.

    The only option registered is -d/--db; when given, its value is handed
    to ``utils.init_db_config``.

    Returns:
        The ``(options, args)`` pair produced by ``OptionParser.parse_args``.
    """
    usage_text = "Usage: %prog [options] query"
    version_text = "%prog " + __version__
    cli = OptionParser(usage=usage_text, version=version_text)

    cli.add_option(
        "-d", "--db",
        dest="db", action="store", type="string", default=None,
        help="db to connect to",
    )

    options, args = cli.parse_args()

    if options.db:
        utils.init_db_config(options.db)

    return (options, args)
def parseCommandLine():
    """Parse options and derive the offset/limit window for this run.

    Options registered: -d/--db, -n/--noop, -r/--ratio, -o/--offset and
    -l/--limit.

    Side effects visible in this function:
      * the parsed options are published as ``Globals.options``;
      * when a db is given it is handed to ``utils.init_db_config``;
      * ``options.count`` is set to ``utils.getNumLines`` of the file
        'autocomplete.txt' (opened relative to the current directory);
      * ``options.verbose`` is forced to False.

    If ``--ratio`` ("num/den") is given, offset and limit are recomputed from
    it and logged; otherwise a missing limit defaults to ``options.count``.

    Returns:
        The options object only (positional args are discarded).
    """
    usage = "Usage: %prog [options] [sources]"
    version = "%prog " + __version__
    parser = OptionParser(usage=usage, version=version)

    parser.add_option("-d", "--db", default=None, type="string",
                      action="store", dest="db",
                      help="db to connect to for output")
    parser.add_option("-n", "--noop", default=False, action="store_true",
                      help="run in noop mode without modifying anything")
    parser.add_option("-r", "--ratio", default=None, type="string",
                      action="store", dest="ratio",
                      help="where this crawler fits in to a distributed stack")
    parser.add_option("-o", "--offset", default=0, type="int", dest="offset",
                      help="start index of entities to import")
    parser.add_option("-l", "--limit", default=None, type="int",
                      help="limits the number of entities to import")

    (options, args) = parser.parse_args()
    Globals.options = options

    if options.db:
        utils.init_db_config(options.db)

    # Use a context manager so the handle is closed even if counting the
    # lines raises; the previous file()/close() pair leaked it in that case.
    with open('autocomplete.txt', 'r') as infile:
        options.count = utils.getNumLines(infile)

    if options.ratio:
        # "num/den": this run covers slice `num` out of `den` slices.
        num, den = options.ratio.split('/')
        num, den = int(num), int(den)
        num, den = float(num), float(den)

        options.offset = int(math.floor((options.count * (num - 1)) / den))
        options.limit = int(math.ceil(options.count / den) + 1)

        utils.log("ratio %s) offset=%d, limit=%d" %
                  (options.ratio, options.offset, options.limit))
    elif options.limit is None:
        options.limit = options.count

    options.verbose = False
    return options
def parseCommandLine():
    """Parse the command-line options for this script.

    The only option registered is -d/--db. The parsed options are published
    as ``Globals.options``, a given db is handed to
    ``utils.init_db_config``, and ``options.verbose`` is forced to False.

    Returns:
        The options object only (positional args are discarded).
    """
    usage_text = "Usage: %prog [options] [sources]"
    version_text = "%prog " + __version__
    cli = OptionParser(usage=usage_text, version=version_text)

    cli.add_option("-d", "--db", dest="db", action="store", type="string",
                   default=None, help="db to connect to for output")

    options, _unused_args = cli.parse_args()
    Globals.options = options

    if options.db:
        utils.init_db_config(options.db)

    options.verbose = False
    return options
def parseCommandLine():
    """Parse the command-line options for this script.

    Options registered: -d/--db and -n/--noop. The parsed options are
    published as ``Globals.options`` and a given db is handed to
    ``utils.init_db_config``.

    Returns:
        The options object only (positional args are discarded).
    """
    usage_text = "Usage: %prog [options] [sources]"
    version_text = "%prog " + __version__
    cli = OptionParser(usage=usage_text, version=version_text)

    cli.add_option(
        "-d", "--db",
        dest="db", action="store", type="string", default=None,
        help="db to connect to for output",
    )
    cli.add_option(
        "-n", "--noop",
        action="store_true", default=False,
        help="run the dedupper in noop mode without modifying anything",
    )

    options, _unused_args = cli.parse_args()
    Globals.options = options

    if options.db:
        utils.init_db_config(options.db)

    return options
def parseCommandLine():
    """Parse the command-line options for this script.

    Options registered: -l/--limit (int, default 0), -n/--noop and -d/--db.
    A given db is handed to ``utils.init_db_config``.

    Returns:
        The options object only (positional args are discarded).
    """
    usage_text = "Usage: %prog [options] command [args]"
    version_text = "%prog " + __version__
    cli = OptionParser(usage=usage_text, version=version_text)

    cli.add_option(
        "-l", "--limit",
        dest="limit", type="int", default=0,
        help="Limit number of records processed",
    )
    cli.add_option(
        "-n", "--noop",
        action="store_true", default=False,
        help="don't make any actual changes or notifications",
    )
    cli.add_option(
        "-d", "--db",
        dest="db", action="store", type="string", default=None,
        help="db to connect to for output",
    )

    options, _unused_args = cli.parse_args()

    if options.db:
        utils.init_db_config(options.db)

    return options
def export():
    """Command-line entry point: parse arguments and run ``export_config``.

    Arguments registered: -D/--drop, -d/--db, -o/--output_namespace
    (default "stamped.users"), -s/--state_namespace (default
    "local.elasticmongo") and -v/--version.

    A given db is handed to ``utils.init_db_config``. The collection named
    by ``--state_namespace`` is then looked up on the connection taken from
    a lite-mode ``MongoStampedAPI`` and passed to ``export_config`` together
    with the output namespace and the drop flag.
    """
    import argparse

    cli = argparse.ArgumentParser()

    cli.add_argument("-D", "--drop", default=False, action="store_true",
                     help="drop existing collections before performing any insertions")
    cli.add_argument("-d", "--db", type=str, default=None,
                     help="db to connect to")

    output_help = ("mongo db and collection namespace to store output to "
                   "in dot-notation (e.g., defaults to stamped.users)")
    cli.add_argument("-o", "--output_namespace", default="stamped.users",
                     type=str, help=output_help)

    state_help = ("mongo db and collection namespace to store elasticmongo "
                  "mapping and index metadata")
    cli.add_argument("-s", "--state_namespace", default="local.elasticmongo",
                     type=str, help=state_help)

    cli.add_argument("-v", "--version", action="version",
                     version="%(prog)s " + __version__)

    args = cli.parse_args()

    if args.db:
        utils.init_db_config(args.db)

    # Borrow the connection that the API's entity collection already holds.
    stamped_api = MongoStampedAPI(lite_mode=True)
    connection = stamped_api._entityDB._collection._connection

    state_collection = __get_collection(connection, args.state_namespace)
    export_config(state_collection, args.output_namespace, args.drop)
def parseCommandLine():
    """Parse the command-line options for the dedupe script.

    Options registered: -d/--db, -s/--seed, -n/--noop, -p/--place,
    -P/--nonplace and -v/--verbose. No positional arguments are accepted:
    if any are present, help is printed and the process exits with status 1.

    When neither --place nor --nonplace is given, both are switched on.
    A given db is handed to ``utils.init_db_config``.

    Returns:
        The ``(options, args)`` pair produced by ``OptionParser.parse_args``.
    """
    usage_text = "Usage: %prog [options]"
    version_text = "%prog " + __version__
    cli = OptionParser(usage=usage_text, version=version_text)

    cli.add_option(
        "-d", "--db",
        dest="db", action="store", type="string", default=None,
        help="db to connect to for output",
    )
    cli.add_option(
        "-s", "--seed",
        dest="seed", action="store", type="string", default=None,
        help="seed id to start with",
    )

    # Plain on/off switches, kept in the original registration order.
    switches = (
        ("-n", "--noop",
         "run the dedupper in noop mode without modifying anything"),
        ("-p", "--place", "dedupe only place entities"),
        ("-P", "--nonplace", "dedupe only non-place entities"),
        ("-v", "--verbose", "enable verbose logging"),
    )
    for short_flag, long_flag, description in switches:
        cli.add_option(short_flag, long_flag, default=False,
                       action="store_true", help=description)

    options, args = cli.parse_args()

    if args:
        cli.print_help()
        sys.exit(1)

    # No explicit selection means "dedupe everything".
    if not options.place and not options.nonplace:
        options.place = True
        options.nonplace = True

    if options.db:
        utils.init_db_config(options.db)

    return (options, args)
def parseCommandLine():
    """Parse the command-line options for this script.

    Options registered: -d/--db, -n/--noop, -v/--verbose and -l/--limit
    (int, default None). A given db is handed to ``utils.init_db_config``.

    Returns:
        The ``(options, args)`` pair produced by ``OptionParser.parse_args``.
    """
    usage_text = "Usage: %prog [options] query"
    version_text = "%prog " + __version__
    cli = OptionParser(usage=usage_text, version=version_text)

    cli.add_option(
        "-d", "--db",
        dest="db", action="store", type="string", default=None,
        help="db to connect to",
    )
    cli.add_option(
        "-n", "--noop",
        action="store_true", default=False,
        help="run in noop mode without modifying anything",
    )
    cli.add_option(
        "-v", "--verbose",
        action="store_true", default=False,
        help="enable verbose logging",
    )
    cli.add_option(
        "-l", "--limit",
        action="store", type="int", default=None,
        help="limit number to convert",
    )

    options, args = cli.parse_args()

    if options.db:
        utils.init_db_config(options.db)

    return (options, args)
def parseCommandLine():
    """Parse the crawler's command line and resolve sources and sink.

    Options registered: -a/--all, -o/--offset, -l/--limit, -r/--ratio,
    -s/--sink, -t/--test, -c/--count, -u/--update, -g/--geocode, -m/--mount
    and -d/--db. Positional arguments are source names.

    Processing performed here:
      * ``options.offset`` defaults to 0 when --offset is not given;
      * the parsed options are published as ``Globals.options``;
      * ``options.sources`` is built from the positional arguments via
        ``EntitySources.instantiateSource``, or from
        ``EntitySources.instantiateAll()`` when --all is set or no sources
        are named; an unrecognized source prints an error plus the help
        text and exits with status 1;
      * every source gets ``_globals`` attached as ``source._globals``;
      * with --count the summed ``getMaxNumEntities()`` of all sources is
        printed and the process exits with status 0;
      * with --ratio ("num/den") ``options.count`` is set to that sum and
        offset/limit are derived from it;
      * a given db is handed to ``utils.init_db_config``;
      * ``options.sink`` is replaced by a sink object: "test" gives a
        ``TestEntitySink``, "merge" a ``MergeEntitySink``, anything else a
        ``MongoStampedAPI`` built with ``options.db``.

    Returns:
        The options object only.
    """
    usage = "Usage: %prog [options] [sources]"
    version = "%prog " + __version__
    parser = OptionParser(usage=usage, version=version)

    parser.add_option("-a", "--all", action="store_true", dest="all",
                      default=False,
                      help="crawl all available sources (defaults to true if no sources are specified)")
    parser.add_option("-o", "--offset", default=None, type="int",
                      dest="offset",
                      help="start index of entities to import")
    parser.add_option("-l", "--limit", default=None, type="int",
                      help="limits the number of entities to import")
    parser.add_option("-r", "--ratio", default=None, type="string",
                      action="store", dest="ratio",
                      help="where this crawler fits in to a distributed stack")
    parser.add_option("-s", "--sink", default=None, type="string",
                      action="store", dest="sink",
                      help="where to output to (test or mongodb)")
    parser.add_option("-t", "--test", default=False, action="store_true",
                      dest="test",
                      help="run the crawler with limited input for testing purposes")
    parser.add_option("-c", "--count", default=False, action="store_true",
                      dest="count",
                      help="print overall entity count from all sources specified and return")
    parser.add_option("-u", "--update", default=False, action="store_true",
                      dest="update",
                      help="update the existing collection as opposed to dropping it and " +
                           "overwriting any previous contents (the default)")
    parser.add_option("-g", "--geocode", default=False, action="store_true",
                      dest="geocode",
                      help="Geocode places to ensure all places have a valid lat/lng associated with them.")
    parser.add_option("-m", "--mount", default=False, action="store_true",
                      dest="mount",
                      help="mount crawler data directory if necessary")
    parser.add_option("-d", "--db", default=None, type="string",
                      action="store", dest="db",
                      help="db to connect to for output")
    # NOTE: a "--distribute" option (callback=parseDistributedHosts) used to
    # be registered here and is currently disabled.

    (options, args) = parser.parse_args()

    # Only fall back to 0 when the user gave no --offset. Previously the
    # value was overwritten unconditionally, silently discarding --offset.
    if options.offset is None:
        options.offset = 0

    Globals.options = options

    if len(args) == 0:
        options.all = True

    if options.all:
        options.sources = EntitySources.instantiateAll()
    else:
        options.sources = []
        for arg in args:
            source = EntitySources.instantiateSource(arg)
            if source is None:
                print("Error: unrecognized source '%s'" % arg)
                parser.print_help()
                sys.exit(1)
            options.sources.append(source)

    for source in options.sources:
        source._globals = _globals

    if options.count or options.ratio:
        count = 0
        for source in options.sources:
            count += source.getMaxNumEntities()

        if options.count:
            # --count is a report-only mode: print the total and stop.
            print(count)
            sys.exit(0)

        # From here on options.count holds the total instead of the flag.
        options.count = count

        # "num/den": this run covers slice `num` out of `den` slices.
        num, den = options.ratio.split('/')
        num, den = int(num), int(den)
        num, den = float(num), float(den)

        options.offset = int(math.floor((count * (num - 1)) / den))
        options.limit = int(math.ceil(count / den) + 1)

    if options.db:
        utils.init_db_config(options.db)

    if options.sink == "test":
        options.sink = TestEntitySink()
    elif options.sink == "merge":
        options.sink = MergeEntitySink()
    else:
        # Imported here, as in the original, rather than at module level.
        from api.MongoStampedAPI import MongoStampedAPI
        options.sink = MongoStampedAPI(options.db)

    return options
def parseCommandLine():
    """Parse the command-line options for the autocomplete search script.

    Usage: %prog [options] query

    Options registered:
      -d/--db           db to connect to for output
      -l/--limit        limits the number of entities to import
      -L/--Local        local nearby search
      -a/--a            location as "lat,lng" (stored as options.location)
      -f/--full         use full search
      -p/--prefix       use faster prefix-based search
      -v/--verbose      turn verbosity on
      -S/--Stats        view ranking stats
      -c/--category     filters results by a given category
      -s/--subcategory  filters results by a given subcategory

    At least one positional argument is required; otherwise help is printed
    and the process exits with status 1. A given db is handed to
    ``utils.init_db_config``. ``options.location`` is converted to a tuple
    of two floats; a malformed location prints an error plus the help text
    and exits with status 1.

    Returns:
        The ``(options, args)`` pair produced by ``OptionParser.parse_args``.

    Raises:
        Exception: if --category is given but is not in ``categories``.
    """
    usage = "Usage: %prog [options] query"
    version = "%prog " + __version__
    parser = OptionParser(usage=usage, version=version)

    parser.add_option("-d", "--db", default=None, type="string",
                      action="store", dest="db",
                      help="db to connect to for output")
    parser.add_option("-l", "--limit", default=None, type="int",
                      help="limits the number of entities to import")
    parser.add_option("-L", "--Local", default=False, action="store_true",
                      help="local nearby search")
    parser.add_option("-a", "--a", default=None, type="string",
                      action="store", dest="location",
                      help="location (lat/lng, e.g. '40.7360067,-73.98884296')")
    parser.add_option("-f", "--full", default=False, action="store_true",
                      help="use full search")
    parser.add_option("-p", "--prefix", default=False, action="store_true",
                      help="use faster prefix-based search")
    parser.add_option("-v", "--verbose", default=False, action="store_true",
                      help="turn verbosity on")
    parser.add_option("-S", "--Stats", default=False, action="store_true",
                      help="view ranking stats")
    parser.add_option("-c", "--category", default=None, type="string",
                      action="store", dest="category",
                      help="filters results by a given category")
    parser.add_option("-s", "--subcategory", default=None, type="string",
                      action="store", dest="subcategory",
                      help="filters results by a given subcategory")

    (options, args) = parser.parse_args()

    if len(args) <= 0:
        parser.print_help()
        sys.exit(1)

    if options.db:
        utils.init_db_config(options.db)

    if options.location:
        # Validate explicitly instead of with `assert`: assertions are
        # stripped under -O, and a malformed value should be reported as a
        # usage error, not a traceback. A missing comma, extra commas or
        # non-numeric parts all surface as ValueError here.
        try:
            lat, lng = options.location.split(',')
            options.location = (float(lat), float(lng))
        except ValueError:
            print("invalid location given '%s'" % options.location)
            parser.print_help()
            sys.exit(1)

    if options.category and options.category not in categories:
        raise Exception("Unrecognized category: '%s'; must be one of: %s" %
                        (options.category, ", ".join(categories)))

    return (options, args)
def __init__(self, db=None, **kwargs):
    """Initialize the API object.

    Args:
        db: optional value; when truthy it is handed to
            ``utils.init_db_config`` after the base initializer has run.
        **kwargs: forwarded unchanged to ``StampedAPI.__init__``.
    """
    # Set before the base initializer runs, as in the original ordering.
    self.__statsSink = None

    StampedAPI.__init__(self, "MongoStampedAPI", **kwargs)

    if not db:
        return
    utils.init_db_config(db)
from gevent.pool import Pool from api.MongoStampedAPI import MongoStampedAPI if __name__ == '__main__': import argparse parser = argparse.ArgumentParser() parser.add_argument('image_urls', nargs='*', action="append") parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + __version__) parser.add_argument('-d', '--db', action='store') args = parser.parse_args() db = S3ImageDB() if args.db is not None: utils.init_db_config(args.db) if args.image_urls is not None: args.image_urls = args.image_urls[0] if args.image_urls is not None and len(args.image_urls) > 0: # example url: http://thetvdb.com/banners/_cache/posters/211751-2.jpg db.addEntityImages(args.image_urls) else: # perform a bulk conversion of all thetvdb.com entity images, moving each # image over to our own CDN (via S3 / Cloudfront) and updating the entity # reference accordingly. api = MongoStampedAPI() pool = Pool(32)