def _run(self):
    utils.log("[%s] initializing" % self)
    
    f, numLines, filename = self._open_file(countLines=False)
    table_format = epf.parse_table_format(f, filename)
    self.table_format = table_format
    f.close()
    
    numLines = self.execute('SELECT COUNT(*) FROM "%s"' % self.table).fetchone()[0]
    utils.log("[%s] parsing ~%d entities from '%s'" % (self, numLines, self.table))
    
    rows = self.execute('SELECT * FROM "%s"' % self.table)
    #self._globals['rows'] = rows; self._output.put(StopIteration); return
    
    count = 0
    for row in rows:
        row = self._format_result(row)
        self._parseRow(row)
        count += 1
        
        if numLines > 100 and (count % (numLines / 100)) == 0:
            utils.log("[%s] done parsing %s" % \
                (self, utils.getStatusStr(count, numLines)))
            time.sleep(0.1)
    
    f.close()
    self._output.put(StopIteration)
    utils.log("[%s] finished parsing %d entities (filtered %d)" % (self, count, self.numFiltered))
def _parseEntity(self, sheet, index, numEntities):
    if numEntities > 100 and ((index - 1) % (numEntities / 100)) == 0:
        utils.log("[%s] done parsing %s" % \
            (self.NAME, utils.getStatusStr(index - 1 - Globals.options.offset, numEntities)))
        time.sleep(0.1)
    
    row = sheet.row_values(index)
    
    entity = Entity()
    entity.subcategory = "restaurant"
    entity.title   = row[1]
    entity.address = row[3] + ', ' + \
                     row[4] + ', ' + \
                     row[5] + ' ' + \
                     row[6]
    
    entity.openTable = {
        'rid'              : int(row[8]), 
        'reserveURL'       : row[9], 
        'countryID'        : row[10], 
        'metroName'        : row[0], 
        'neighborhoodName' : row[2], 
    }
    
    # don't make external calls to OpenTable in test mode
    if not Globals.options.test:
        result = OpenTableParser.parseEntity(entity)
        if result is None:
            return
    
    if entity is not None:
        #print entity.title
        #from pprint import pprint
        #pprint(entity.getDataAsDict())
        
        self._output.put(entity)
def main():
    options  = parseCommandLine()
    sink     = AppleEntitySink(options)
    appleAPI = AppleAPI(country='us')
    
    all_artists = set()
    all_albums  = set()
    all_songs   = set()
    
    pool = Pool(16)
    
    """
    count = dbs['album_popularity_per_genre'].execute('SELECT COUNT(*) FROM "%s"' % \
        dbs['album_popularity_per_genre'].table).fetchone()[0]
    rows  = dbs['album_popularity_per_genre'].execute('SELECT * FROM "%s"' % \
        dbs['album_popularity_per_genre'].table)
    rows  = list(rows)
    
    utils.log("[%s] parsing %d rows" % ('albums', count))
    
    for i in xrange(len(rows)):
        row = rows[i]
        row = dbs['album_popularity_per_genre']._format_result(row)
        
        pool.spawn(parse_album, row, appleAPI, sink, pool, all_artists, all_albums, all_songs)
        
        if count <= 100 or ((i - 1) % (count / 100)) == 0:
            utils.log("[%s] done parsing %s" % ('albums', utils.getStatusStr(i, count)))
            break
    """
    
    count = dbs['song_popularity_per_genre'].execute('SELECT COUNT(*) FROM "%s"' % \
        dbs['song_popularity_per_genre'].table).fetchone()[0]
    rows  = dbs['song_popularity_per_genre'].execute('SELECT * FROM "%s"' % \
        dbs['song_popularity_per_genre'].table)
    rows  = list(rows)
    
    utils.log("[%s] parsing %d rows" % ('songs', count))
    
    for i in xrange(len(rows)):
        row = rows[i]
        row = dbs['song_popularity_per_genre']._format_result(row)
        
        pool.spawn(parse_song, row, appleAPI, sink, pool, all_artists, all_albums, all_songs)
        
        if count <= 100 or ((i - 1) % (count / 100)) == 0:
            utils.log("[%s] done parsing %s" % ('songs', utils.getStatusStr(i, count)))
            break
    
    pool.join()
    
    print "artists: %d" % len(all_artists)
    print "albums:  %d" % len(all_albums)
    print "songs:   %d" % len(all_songs)
def add_entries(entries, hint, output, scale_factor=1.0):
    count = entries.count()
    done  = 0
    
    utils.log("[%s] processing %d entity titles..." % (hint, count))
    
    for entry in entries:
        if "title" in entry:
            key = entry["title"].lower()
            
            # attempt to replace accented characters with their ascii equivalents
            key = unicodedata.normalize("NFKD", unicode(key)).encode("ascii", "ignore")
            key = re.sub("([^a-zA-Z0-9._ -])", "", key)
            key = key.strip()
            
            output.add(key, scale_factor)
        
        done += 1
        if count <= 100 or ((done - 1) % (count / 100)) == 0:
            utils.log("[%s] done processing %s" % (hint, utils.getStatusStr(done, count)))
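# A minimal, standalone sketch of the ASCII-folding normalization used in
# add_entries above (stdlib only; the sample titles below are hypothetical).
import re
import unicodedata

def normalize_title(title):
    key = title.lower()
    key = unicodedata.normalize("NFKD", unicode(key)).encode("ascii", "ignore")
    key = re.sub("([^a-zA-Z0-9._ -])", "", key)
    return key.strip()

for title in (u"Caf\xe9 Habana", u"Mot\xf6rhead!", u"Am\xe9lie"):
    print normalize_title(title)   # -> cafe habana, motorhead, amelie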
def _sample(self, iterable, func, print_progress=True, progress_delta=5, max_retries=0, retry_delay=0.05):
    progress_count = 100 / progress_delta
    ratio = self.options.sampleSetRatio
    count = 0
    index = 0
    
    try:
        count = len(iterable)
    except:
        try:
            count = iterable.count()
        except:
            count = utils.count(iterable)
    
    for obj in iterable:
        if print_progress and (count < progress_count or 0 == (index % (count / progress_count))):
            utils.log("%s : %s" % (self.__class__.__name__, utils.getStatusStr(index, count)))
        
        if random.random() < ratio:
            noop = self.options.noop
            retries = 0
            
            while True:
                try:
                    self.options.noop = (retries < max_retries) or noop
                    func(obj)
                    break
                except Exception, e:
                    utils.printException()
                    retries += 1
                    
                    if noop or retries > max_retries:
                        prefix = "ERROR" if noop else "UNRESOLVABLE ERROR"
                        utils.log("%s: %s" % (prefix, str(e)))
                        break
                    
                    # back off exponentially between retries
                    time.sleep(retry_delay)
                    retry_delay *= 2
                finally:
                    self.options.noop = noop
        
        # advance the progress counter once per sampled object
        index += 1
def _run(self):
    filename  = "amazon_feeds.txt"
    path      = os.path.join(os.path.dirname(os.path.abspath(__file__)), filename)
    feed_file = file(path, "r")
    
    feeds = map(lambda t: t[0:-1], feed_file.readlines())
    feed_file.close()
    
    num_feeds = len(feeds)
    utils.log("[%s] parsing %d feeds" % (self, num_feeds))
    
    pool = Pool(128)
    
    for i in xrange(num_feeds):
        url = feeds[i]
        pool.spawn(self._parse_feed, pool, url)
        
        if num_feeds > 100 and (i % (num_feeds / 100)) == 0:
            utils.log("[%s] done parsing %s" % (self, utils.getStatusStr(i, num_feeds)))
    
    pool.join()
    self._output.put(StopIteration)
    utils.log("[%s] finished parsing %d feeds" % (self, num_feeds))
def run(mongo_notification_handler, mongo_host='localhost', mongo_port=27017):
    assert isinstance(mongo_notification_handler, AMongoNotificationHandler)
    
    conn  = pymongo.Connection(mongo_host, mongo_port)
    db    = conn.local
    oplog = db.oplog.rs
    
    schemas = defaultdict(set)
    for o in db.fts.schemas.find():
        schemas[o['ns']] = schemas[o['ns']].union(o['fields'])
    
    progress_delta = 5
    progress_count = 100 / progress_delta
    
    state  = db.fts.find_one({'_id': 'state'})
    first  = True
    cursor = None
    count  = 0
    spec   = {}
    
    if state and 'ts' in state:
        first = oplog.find_one()
        
        if first['ts'].time > state['ts'].time and first['ts'].inc > state['ts'].inc:
            __init(conn, mongo_notification_handler, schemas)
        else:
            spec['ts'] = { '$gt': state['ts'] }
    else:
        __init(conn, mongo_notification_handler, schemas)
    
    # TODO: address async issue here..
    if 'ts' not in spec:
        try:
            # attempt to start pulling at the last occurrence of the target namespaces
            s = {"ns" : { "$in" : map(str, schemas.keys()) } }
            last = list(oplog.find(s).sort("$natural", -1).limit(1))[0]
            spec['ts'] = { '$gt': last['ts'] }
        except:
            # fallback to starting at the end of the oplog
            try:
                last = list(oplog.find().sort("$natural", -1).limit(1))[0]
                spec['ts'] = { '$gt': last['ts'] }
            except:
                # fallback to starting at the beginning of the oplog
                pass
    
    # poll the mongo oplog indefinitely
    while True:
        try:
            if not cursor or not cursor.alive:
                cursor = oplog.find(spec, tailable=True).sort("$natural", 1)
                count  = cursor.count()
            
            docs  = defaultdict(list)
            index = 0
            
            for op in cursor:
                pprint(op)
                ns = op['ns']
                
                if ns in schemas:
                    spec['ts'] = { '$gt': op['ts'] }
                    #pprint(op)
                    
                    if op['op'] == 'd':
                        id = __extract_id(op['o']['_id'])
                        mongo_notification_handler.delete(ns, id)
                    elif op['op'] in ['i', 'u']:
                        docs[ns].append(__extract_fields(op['o'], schemas[ns]))
                
                index += 1
                if first and (count < progress_count or 0 == (index % (count / progress_count))):
                    print "%s" % utils.getStatusStr(index, count)
            
            if docs:
                for ns, ns_docs in docs.iteritems():
                    mongo_notification_handler.add(ns, ns_docs)
            
            first = False
            db.fts.save({ '_id': 'state', 'ts': spec['ts']['$gt'] })
        except AutoReconnect as e:
            pass
        
        time.sleep(1)
def _run(self):
    utils.log("[%s] initializing" % self)
    
    f, numLines, filename = self._open_file(countLines=False)
    table_format = epf.parse_table_format(f, filename)
    self.table_format = table_format
    
    stale = False
    self._buffer = []
    self._buffer_threshold = 1024
    
    # determine whether or not the db table already exists and attempt to 
    # determine if it's up-to-date s.t. we won't recalculate it if it'd 
    # be unnecessary.
    try:
        row0 = self.execute('SELECT * FROM %s LIMIT 1' % (self.table, ), error_okay=True).fetchone()
        
        if row0 is None:
            stale = True
        elif len(row0) != len(dict(table_format.cols)):
            stale = True
    except Exception:
        self.conn.rollback()
        #utils.printException()
        stale = True
        pass
    
    #f.close(); self._output.put(StopIteration); return
    
    if not stale:
        # table is usable as-is
        utils.log("[%s] %s.%s doesn't need to be recomputed" % (self, self.dbpath, self.table))
    else:
        utils.log("[%s] opening '%s'" % (self, self._filename))
        
        numLines = max(0, utils.getNumLines(f) - 8)
        table_format = epf.parse_table_format(f, filename)
        self.table_format = table_format
        
        utils.log("[%s] parsing ~%d rows from '%s'" % (self, numLines, self._filename))
        
        # initialize table
        cols = []
        
        # currently disabling primary keys for most tables
        found_primary = False #(len(table_format.primary_keys) != 1)
        
        for col in table_format.cols:
            cols.append('')
        
        for col in table_format.cols:
            primary = ""
            
            if not found_primary and col == self.primary and not self._sqlite:
            #if not found_primary and col in table_format.primary_keys:
                # TODO: handle the common case of multiple primary keys, which sqlite3 does not support
                # TODO: defining the primary key here as opposed to after insertion is much slower!
                primary = " PRIMARY KEY"
                found_primary = True
            
            col2 = table_format.cols[col]
            col_type = col2['type']
            
            if not self._sqlite:
                # perform mapping between some MySQL types that Apple uses and 
                # their postgres equivalents
                if col_type == 'DATETIME':
                    col_type = 'VARCHAR(100)'
                elif col_type == 'LONGTEXT':
                    col_type = 'VARCHAR(4000)'
            
            text  = "%s %s%s" % (col, col_type, primary)
            index = col2['index']
            cols[index] = text
        
        args = string.joinfields(cols, ', ')
        
        self.execute("DROP TABLE %s" % (self.table, ), error_okay=True)
        self.execute("CREATE TABLE %s (%s)" % (self.table, args), verbose=True)
        
        if self._sqlite:
            placeholder = '?'
        else:
            placeholder = '%s'
        
        values_str = '(%s)' % string.joinfields((placeholder for col in table_format.cols), ', ')
        self._cmd  = 'INSERT INTO %s VALUES %s' % (self.table, values_str)
        
        count = 0
        for row in epf.parse_rows(f, table_format):
            self._parseRowOld(row, table_format)
            count += 1
            
            if numLines > 100 and (count % (numLines / 100)) == 0:
                num_rows = self.execute('SELECT COUNT(*) FROM %s' % (self.table, )).fetchone()[0]
                utils.log("[%s] done parsing %s -- %d rows" % \
                    (self, utils.getStatusStr(count, numLines), num_rows))
        
        self._try_flush_buffer(force=True)
        
        if self.index:
            self.execute("CREATE INDEX %s on %s (%s)" % (self.index, self.table, self.index), verbose=True)
        
        utils.log("[%s] finished parsing %d rows" % (self, count))
    
    f.close()
    self._output.put(StopIteration)
    if modified:
        api._entityDB.updateEntity(entity)

# TODO: handle new-style entity images

#query = {"_id" : bson.objectid.ObjectId("4ea8803cfe4a1d2a4200081f")}
query = {'image' : {'$regex' : r'^.*thetvdb.com.*$'}}
#query = {'subcategory' : 'tv'}

docs  = api._entityDB._collection.find(query)
count = docs.count()
index = 0

progress_delta = 5 # report progress every 5%
progress_count = 100 / progress_delta

utils.log("processing %d entities" % count)

for doc in docs:
    entity = api._entityDB._convertFromMongo(doc)
    pool.spawn(_process_entity, entity)
    
    # guard against a zero step (count / progress_count) for small result sets
    if count >= progress_count and 0 == (index % (count / progress_count)):
        utils.log("\n\nPROGRESS: %s\n\n" % (utils.getStatusStr(index, count)))
    
    index += 1

pool.join()
def main(): options = parseCommandLine() """ ret = utils.shell(r"grep 'class.*(AppleEPFRelationalDB' sources/dumps/AppleEPFRelationalDB.py | sed 's/class \([^(]*\)(.*/\1/g'") ret = map(lambda r: r.strip() + "()", ret[0].split('\n')) for r in ret: cls = eval(r) cls._run() #cls.start() #cls.join() cls.close() """ sink = AppleEntitySink(options) appleAPI = AppleAPI(country='us') pool = Pool(32) offset = 0 done = 0 all_artists = set() all_albums = set() all_songs = set() count = options.count0 rows = options.album_popularity_per_genre.execute('SELECT * FROM "%s"' % \ options.album_popularity_per_genre.table) rows = list(rows) # loop through all albums utils.log("[%s] parsing %d rows" % ('albums', count)) for i in xrange(len(rows)): if offset < options.offset: offset += 1; continue if options.limit is not None and done > options.limit: break done += 1 row = rows[i] row = options.album_popularity_per_genre._format_result(row) pool.spawn(parse_album, row, appleAPI, sink, pool, all_artists, all_albums, all_songs) if options.limit <= 100 or ((done - 1) % (options.limit / 100)) == 0: utils.log("[%s] done parsing %s" % ('albums', utils.getStatusStr(done, options.limit))) """ count = options.count1 rows = options.song_popularity_per_genre.execute('SELECT * FROM "%s"' % \ options.song_popularity_per_genre.table) rows = list(rows) # loop through all songs utils.log("[%s] parsing %d rows" % ('songs', count)) for i in xrange(len(rows)): if offset < options.offset: offset += 1; continue if options.limit is not None and done > options.limit: break done += 1 row = rows[i] row = options.song_popularity_per_genre._format_result(row) pool.spawn(parse_song, row, appleAPI, sink, pool, all_artists, all_albums, all_songs) if options.limit <= 100 or ((done - 1) % (options.limit / 100)) == 0: utils.log("[%s] done parsing %s" % ('songs', utils.getStatusStr(done, options.limit))) """ pool.join() print "artists: %d" % len(all_artists) print "albums: %d" % len(all_albums) print "songs: %d" % len(all_songs)
def main():
    options = parseCommandLine()
    
    stampedAPI = MongoStampedAPI()
    entityDB   = stampedAPI._entityDB
    placesDB   = stampedAPI._placesEntityDB
    
    autocompleteDB = S3AutocompleteDB()
    prefixes = set()
    
    wrapper = {
        'time_sum' : 0.0, 
        'time_num' : 0, 
    }
    
    def _add(orig_name, wrapper):
        try:
            if 0 == len(orig_name) or orig_name in prefixes:
                return
            
            name = encode_s3_name(orig_name)
            if 0 == len(name):
                return
            
            name = "search/v2/%s.json" % name
            print "searching %s" % orig_name.encode('ascii', 'replace')
            
            tries = 0
            while True:
                try:
                    t1 = time.time()
                    results = stampedAPI.searchEntities(query=orig_name, limit=10, prefix=True, full=False)
                    t2 = time.time()
                    
                    duration = (t2 - t1)
                    wrapper['time_sum'] += duration
                    wrapper['time_num'] += 1
                    break
                except:
                    tries += 1
                    if tries >= 3:
                        utils.printException()
                        time.sleep(1)
                        return
                    
                    time.sleep(1)
            
            """
            if len(results) <= 1:
                i = len(orig_name)
                
                while i > 0:
                    prefixes.add(orig_name[0:i])
                    i -= 1
                
                if 0 == len(results):
                    return False
            """
            
            autosuggest = []
            for item in results:
                item = HTTPEntityAutosuggest().importSchema(item[0], item[1]).dataExport()
                autosuggest.append(item)
            
            value = json.dumps(autosuggest, sort_keys=True)
            data  = {
                'name1' : orig_name.encode('ascii', 'replace'), 
                'name2' : name, 
                'num_r' : len(results), 
            }
            
            pprint(data)
            sys.stdout.flush()
            
            if not options.noop:
                retries = 0
                while True:
                    try:
                        autocompleteDB.add_key(name, value, content_type='application/json', apply_gzip=True)
                        break
                    except:
                        retries += 1
                        if retries > 5:
                            utils.printException()
                            return
                        
                        time.sleep(1)
        except:
            utils.printException()
            time.sleep(1)
            return
    
    infile = file('autocomplete.txt', 'r')
    pool   = Pool(4)
    done   = 0
    offset = 0
    
    for line in infile:
        if offset < options.offset:
            offset += 1
            continue
        if options.limit is not None and done > options.limit:
            break
        
        line = line[:-1]
        pool.spawn(_add, line, wrapper)
        
        done += 1
        if options.limit <= 100 or ((done - 1) % (options.limit / 100)) == 0:
            utils.log("done processing %s (avg search time %s ms)" % 
                      (utils.getStatusStr(done, options.limit), 
                       1000.0 * (wrapper['time_sum'] / (wrapper['time_num'] if wrapper['time_num'] > 0 else 1))))
    
    pool.join()
    infile.close()