Example #1
 def _run(self):
     utils.log("[%s] initializing" % self)
     f, numLines, filename = self._open_file(countLines=False)
     
     table_format = epf.parse_table_format(f, filename)
     self.table_format = table_format
     f.close()
     
     numLines = self.execute('SELECT COUNT(*) FROM "%s"' % self.table).fetchone()[0]
     utils.log("[%s] parsing ~%d entities from '%s'" % (self, numLines, self.table))
     
     rows  = self.execute('SELECT * FROM "%s"' % self.table)
     #self._globals['rows'] = rows; self._output.put(StopIteration); return
     count = 0
     
     for row in rows:
         row = self._format_result(row)
         self._parseRow(row)
         count += 1
         
         if numLines > 100 and (count % (numLines / 100)) == 0:
             utils.log("[%s] done parsing %s" % \
                 (self, utils.getStatusStr(count, numLines)))
             time.sleep(0.1)
     
     self._output.put(StopIteration)
     
     utils.log("[%s] finished parsing %d entities (filtered %d)" % (self, count, self.numFiltered))
Example #2
 def _parseEntity(self, sheet, index, numEntities):
     if numEntities > 100 and ((index - 1) % (numEntities / 100)) == 0:
         utils.log("[%s] done parsing %s" % \
             (self.NAME, utils.getStatusStr(index - 1 - Globals.options.offset, numEntities)))
         time.sleep(0.1)
     
     row = sheet.row_values(index)
     
     entity = Entity()
     entity.subcategory = "restaurant"
     entity.title = row[1]
     entity.address = row[3] + ', ' + \
                      row[4] + ', ' + \
                      row[5] + ' ' + \
                      row[6]
     
     entity.openTable = {
         'rid' : int(row[8]), 
         'reserveURL' : row[9], 
         'countryID' : row[10], 
         'metroName' : row[0], 
         'neighborhoodName' : row[2], 
     }
     
     # don't make external calls to opentable in test mode
     if not Globals.options.test:
         result = OpenTableParser.parseEntity(entity)
         if result is None:
             return
     
     # entity is always non-None at this point (the parse-failure case
     # returned above), so push it downstream
     #print entity.title
     #from pprint import pprint
     #pprint(entity.getDataAsDict())
     self._output.put(entity)
Example #3
def main():
    options = parseCommandLine()
    
    sink = AppleEntitySink(options)
    appleAPI = AppleAPI(country='us')
    
    all_artists = set()
    all_albums  = set()
    all_songs   = set()
    
    pool = Pool(16)
    
    """
    count = dbs['album_popularity_per_genre'].execute('SELECT COUNT(*) FROM "%s"' % \
                                                      dbs['album_popularity_per_genre'].table).fetchone()[0]
    
    rows  = dbs['album_popularity_per_genre'].execute('SELECT * FROM "%s"' % \
                                                      dbs['album_popularity_per_genre'].table)
    rows  = list(rows)
    
    utils.log("[%s] parsing %d rows" % ('albums', count))
    for i in xrange(len(rows)):
        row = rows[i]
        row = dbs['album_popularity_per_genre']._format_result(row)
        pool.spawn(parse_album, row, appleAPI, sink, pool, all_artists, all_albums, all_songs)
        
        if count <= 100 or ((i - 1) % (count / 100)) == 0:
            utils.log("[%s] done parsing %s" % ('albums', utils.getStatusStr(i, count)))
        break
    """
    
    count = dbs['song_popularity_per_genre'].execute('SELECT COUNT(*) FROM "%s"' % \
                                                     dbs['song_popularity_per_genre'].table).fetchone()[0]
    rows  = dbs['song_popularity_per_genre'].execute('SELECT * FROM "%s"' % \
                                                     dbs['song_popularity_per_genre'].table)
    rows  = list(rows)
    
    utils.log("[%s] parsing %d rows" % ('songs', count))
    for i in xrange(len(rows)):
        row = rows[i]
        row = dbs['song_popularity_per_genre']._format_result(row)
        pool.spawn(parse_song, row, appleAPI, sink, pool, all_artists, all_albums, all_songs)
        
        if count <= 100 or ((i - 1) % (count / 100)) == 0:
            utils.log("[%s] done parsing %s" % ('songs', utils.getStatusStr(i, count)))
        #break # debug leftover: uncomment to stop after the first row
    
    pool.join()
    
    print "artists: %d" % len(all_artists)
    print "albums:  %d" % len(all_albums)
    print "songs:   %d" % len(all_songs)
Example #4
def add_entries(entries, hint, output, scale_factor=1.0):
    count = entries.count()
    done = 0

    utils.log("[%s] processing %d entity titles..." % (hint, count))
    for entry in entries:
        if "title" in entry:
            key = entry["title"].lower()

            # attempt to replace accented characters with their ascii equivalents
            key = unicodedata.normalize("NFKD", unicode(key)).encode("ascii", "ignore")
            key = re.sub("([^a-zA-Z0-9._ -])", "", key)
            key = key.strip()

            output.add(key, scale_factor)

            done += 1
            if count <= 100 or ((done - 1) % (count / 100)) == 0:
                utils.log("[%s] done processing %s" % (hint, utils.getStatusStr(done, count)))
Example #5
    def _sample(self, iterable, func, print_progress=True, progress_delta=5, max_retries=0, retry_delay=0.05):
        progress_count = 100 / progress_delta
        ratio = self.options.sampleSetRatio
        count = 0
        index = 0

        try:
            count = len(iterable)
        except Exception:
            try:
                count = iterable.count()
            except Exception:
                count = utils.count(iterable)

        for obj in iterable:
            if print_progress and (count < progress_count or 0 == (index % (count / progress_count))):
                utils.log("%s : %s" % (self.__class__.__name__, utils.getStatusStr(index, count)))
            index += 1 # advance the progress counter

            if random.random() < ratio:
                noop = self.options.noop
                retries = 0

                while True:
                    try:
                        self.options.noop = (retries < max_retries) or noop
                        func(obj)
                        break
                    except Exception as e:
                        utils.printException()
                        retries += 1

                        if noop or retries > max_retries:
                            prefix = "ERROR" if noop else "UNRESOLVABLE ERROR"
                            utils.log("%s: %s" % (prefix, str(e)))
                            break

                        time.sleep(retry_delay)
                        retry_delay *= 2
                    finally:
                        self.options.noop = noop
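
Stripped of the noop bookkeeping, the retry logic above is a plain exponential-backoff loop; a standalone sketch (the retry_with_backoff name is ours):

import time

def retry_with_backoff(func, max_retries=3, delay=0.05):
    # call func(); on failure, sleep and retry with a doubled delay,
    # re-raising once the retry budget is exhausted
    retries = 0
    while True:
        try:
            return func()
        except Exception:
            retries += 1
            if retries > max_retries:
                raise
            time.sleep(delay)
            delay *= 2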
Example #6
    def _run(self):
        filename = "amazon_feeds.txt"
        path = os.path.join(os.path.dirname(os.path.abspath(__file__)), filename)
        with open(path, "r") as feed_file:
            # rstrip('\n') rather than slicing, so a final line without a
            # trailing newline isn't truncated
            feeds = [line.rstrip('\n') for line in feed_file]

        num_feeds = len(feeds)
        utils.log("[%s] parsing %d feeds" % (self, num_feeds))

        pool = Pool(128)

        for i in xrange(num_feeds):
            url = feeds[i]
            pool.spawn(self._parse_feed, pool, url)

            if num_feeds > 100 and (i % (num_feeds / 100)) == 0:
                utils.log("[%s] done parsing %s" % (self, utils.getStatusStr(i, num_feeds)))

        pool.join()
        self._output.put(StopIteration)
        utils.log("[%s] finished parsing %d feeds" % (self, num_feeds))
Example #7
def run(mongo_notification_handler, 
        mongo_host='localhost', 
        mongo_port=27017):
    assert isinstance(mongo_notification_handler, AMongoNotificationHandler)
    
    conn  = pymongo.Connection(mongo_host, mongo_port)
    db    = conn.local
    oplog = db.oplog.rs
    
    schemas = defaultdict(set)
    for o in db.fts.schemas.find():
        schemas[o['ns']].update(o['fields'])
    
    progress_delta = 5
    progress_count = 100 / progress_delta
    
    state  = db.fts.find_one({'_id': 'state'})
    first  = True
    cursor = None
    count  = 0
    spec   = {}
    
    if state and 'ts' in state:
        first_op = oplog.find_one()  # oldest op still in the capped oplog
        
        # BSON timestamps order by (time, inc); if the oldest retained op is
        # newer than our saved state, we've fallen off the end of the oplog
        # and must re-initialize from scratch
        if first_op['ts'].time > state['ts'].time or \
           (first_op['ts'].time == state['ts'].time and first_op['ts'].inc > state['ts'].inc):
            __init(conn, mongo_notification_handler, schemas)
        else:
            spec['ts'] = { '$gt': state['ts'] }
    else:
        __init(conn, mongo_notification_handler, schemas)
    
    # TODO: address async issue here..
    
    if 'ts' not in spec:
        try:
            # attempt to start pulling at the last occurrence of the target namespaces
            s = {"ns" : { "$in" : map(str, schemas.keys()) } }
            
            last = list(oplog.find(s).sort("$natural", -1).limit(1))[0]
            spec['ts'] = { '$gt': last['ts'] }
        except IndexError:
            # fallback to starting at the end of the oplog
            try:
                last = list(oplog.find().sort("$natural", -1).limit(1))[0]
                spec['ts'] = { '$gt': last['ts'] }
            except IndexError:
                # fallback to starting at the beginning of the oplog
                pass
    
    # poll the mongo oplog indefinitely
    while True:
        try:
            if not cursor or not cursor.alive:
                cursor = oplog.find(spec, tailable=True).sort("$natural", 1)
                count  = cursor.count()
            
            docs    = defaultdict(list)
            index   = 0
            
            for op in cursor:
                ns = op['ns']
                
                if ns in schemas:
                    spec['ts'] = { '$gt': op['ts'] }
                    #pprint(op)
                    
                    if op['op'] == 'd':
                        doc_id = __extract_id(op['o']['_id'])
                        
                        mongo_notification_handler.delete(ns, doc_id)
                    elif op['op'] in ['i', 'u']:
                        docs[ns].append(__extract_fields(op['o'], schemas[ns]))
                
                index += 1
                
                if first and (count < progress_count or 0 == (index % (count / progress_count))):
                    print "%s" % utils.getStatusStr(index, count)
            
            if docs:
                for ns, ns_docs in docs.iteritems():
                    mongo_notification_handler.add(ns, ns_docs)
            
            first = False
            
            # spec may lack 'ts' if the oplog was empty at startup
            if 'ts' in spec:
                db.fts.save({ '_id': 'state', 'ts': spec['ts']['$gt'] })
        except AutoReconnect:
            # transient connection loss; retry on the next poll
            pass
        
        time.sleep(1)
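
The rollover check near the top relies on BSON timestamps ordering by (time, inc); factored out, the comparison might read as follows (a sketch; ts_after is our name):

def ts_after(a, b):
    # True if oplog timestamp a is strictly later than b: compare the
    # seconds field first, then the per-second increment counter
    return a.time > b.time or (a.time == b.time and a.inc > b.inc)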
Example #8
 def _run(self):
     utils.log("[%s] initializing" % self)
     f, numLines, filename = self._open_file(countLines=False)
     
     table_format = epf.parse_table_format(f, filename)
     self.table_format = table_format
     
     stale = False
     self._buffer = []
     self._buffer_threshold = 1024
     
     # determine whether or not the db table already exists and attempt to 
     # determine if it's up-to-date s.t. we won't recalculate it if it'd 
     # be unnecessary.
     try:
         row0 = self.execute('SELECT * FROM %s LIMIT 1' % (self.table, ), error_okay=True).fetchone()
         
         if row0 is None:
             stale = True
         elif len(row0) != len(dict(table_format.cols)):
             stale = True
     except Exception:
         self.conn.rollback()
         #utils.printException()
         stale = True
     
     #f.close(); self._output.put(StopIteration); return
     
     if not stale:
         # table is usable as-is
         utils.log("[%s] %s.%s doesn't need to be recomputed" % (self, self.dbpath, self.table))
     else:
         utils.log("[%s] opening '%s'" % (self, self._filename))
         
         numLines = max(0, utils.getNumLines(f) - 8)
         table_format = epf.parse_table_format(f, filename)
         self.table_format = table_format
         
         utils.log("[%s] parsing ~%d rows from '%s'" % (self, numLines, self._filename))
         
         # initialize table
         # pre-size cols so each column's DDL fragment can be written at
         # its declared index below
         cols = [''] * len(table_format.cols)
         
         # currently disabling primary keys for most tables
         found_primary = False #(len(table_format.primary_keys) != 1)
         
         for col in table_format.cols:
             primary = ""
             if not found_primary and col == self.primary and not self._sqlite:
             #if not found_primary and col in table_format.primary_keys:
                 # TODO: handle the common case of multiple primary keys, which sqlite3 does not support
                 # TODO: defining the primary key here as opposed to after insertion is much slower!
                 primary = " PRIMARY KEY"
                 found_primary = True
             
             col2  = table_format.cols[col]
             col_type = col2['type']
             
             if not self._sqlite:
                 # perform mapping between some MySQL types that Apple uses and 
                 # their postgres equivalents
                 if col_type == 'DATETIME':
                     col_type = 'VARCHAR(100)'
                 elif col_type == 'LONGTEXT':
                     col_type = 'VARCHAR(4000)'
             
             text  = "%s %s%s" % (col, col_type, primary)
             index = col2['index']
             cols[index] = text
         
         args = ', '.join(cols)
         self.execute("DROP TABLE %s" % (self.table, ), error_okay=True)
         self.execute("CREATE TABLE %s (%s)" % (self.table, args), verbose=True)
         
         if self._sqlite:
             placeholder = '?'
         else:
             placeholder = '%s'
         
         values_str  = '(%s)' % ', '.join(placeholder for col in table_format.cols)
         self._cmd   = 'INSERT INTO %s VALUES %s' % (self.table, values_str)
         
         count = 0
         for row in epf.parse_rows(f, table_format):
             self._parseRowOld(row, table_format)
             count += 1
             
             if numLines > 100 and (count % (numLines / 100)) == 0:
                 num_rows = self.execute('SELECT COUNT(*) FROM %s' % (self.table, )).fetchone()[0]
                 
                 utils.log("[%s] done parsing %s -- %d rows" % \
                     (self, utils.getStatusStr(count, numLines), num_rows))
         
         self._try_flush_buffer(force=True)
         
         if self.index:
             self.execute("CREATE INDEX %s on %s (%s)" % (self.index, self.table, self.index), verbose=True)
         
         utils.log("[%s] finished parsing %d rows" % (self, count))
     
     f.close()
     self._output.put(StopIteration)
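
Example #8 switches placeholder syntax by backend: sqlite3 uses ? (the 'qmark' paramstyle) while psycopg2 uses %s. A minimal runnable sketch of the same parameterized INSERT against sqlite3 alone:

import sqlite3

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE t (a, b)')
# sqlite3's paramstyle is 'qmark'; values are bound, never interpolated
conn.execute('INSERT INTO t VALUES (?, ?)', (1, 'x'))
print conn.execute('SELECT COUNT(*) FROM t').fetchone()[0]   # -> 1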
Example #9
                
                # `modified` and `entity` come from the enclosing per-entity
                # update loop (this snippet begins mid-iteration)
                if modified:
                    api._entityDB.updateEntity(entity)
        
        # TODO: handle new-style entity images
        #query = {"_id" : bson.objectid.ObjectId("4ea8803cfe4a1d2a4200081f")}
        query = {'image' : {'$regex' : r'^.*thetvdb.com.*$'}}
        #query = {'subcategory' : 'tv'}
        
        docs  = api._entityDB._collection.find(query)
        count = docs.count()
        index = 0
        
        progress_delta = 5 # report progress every 5%
        progress_count = 100 / progress_delta
        
        utils.log("processing %d entities" % count)
        
        for doc in docs:
            entity = api._entityDB._convertFromMongo(doc)
            
            pool.spawn(_process_entity, entity)
            
            # guard with progress_count so the integer division below cannot
            # be by zero (count / progress_count == 0 when count < 20)
            if count >= progress_count and 0 == (index % (count / progress_count)):
                utils.log("\n\nPROGRESS: %s\n\n" % (utils.getStatusStr(index, count)))
            
            index += 1
        
        pool.join()

Example #10
def main():
    options = parseCommandLine()
    
    """
    ret = utils.shell(r"grep 'class.*(AppleEPFRelationalDB' sources/dumps/AppleEPFRelationalDB.py | sed 's/class \([^(]*\)(.*/\1/g'")
    ret = map(lambda r: r.strip() + "()", ret[0].split('\n'))
    
    for r in ret:
        cls = eval(r)
        
        cls._run()
        #cls.start()
        #cls.join()
        cls.close()
    """
    
    sink = AppleEntitySink(options)
    appleAPI = AppleAPI(country='us')
    
    pool   = Pool(32)
    offset = 0
    done   = 0
    
    all_artists = set()
    all_albums  = set()
    all_songs   = set()
    
    count = options.count0
    rows  = options.album_popularity_per_genre.execute('SELECT * FROM "%s"' % \
                                                       options.album_popularity_per_genre.table)
    rows  = list(rows)
    
    # loop through all albums
    utils.log("[%s] parsing %d rows" % ('albums', count))
    for i in xrange(len(rows)):
        if offset < options.offset:
            offset += 1
            continue
        # >= so exactly options.limit rows are processed
        if options.limit is not None and done >= options.limit:
            break
        done += 1
        
        row = rows[i]
        row = options.album_popularity_per_genre._format_result(row)
        pool.spawn(parse_album, row, appleAPI, sink, pool, all_artists, all_albums, all_songs)
        
        # options.limit may be None (no limit), which in Python 2 quietly
        # compares less than any int; make that case explicit
        if options.limit is None or options.limit <= 100 or ((done - 1) % (options.limit / 100)) == 0:
            utils.log("[%s] done parsing %s" % ('albums', utils.getStatusStr(done, options.limit)))
    
    """
    count = options.count1
    rows  = options.song_popularity_per_genre.execute('SELECT * FROM "%s"' % \
                                                      options.song_popularity_per_genre.table)
    rows  = list(rows)
    
    # loop through all songs
    utils.log("[%s] parsing %d rows" % ('songs', count))
    for i in xrange(len(rows)):
        if offset < options.offset: offset += 1; continue
        if options.limit is not None and done > options.limit: break
        done += 1
        
        row = rows[i]
        row = options.song_popularity_per_genre._format_result(row)
        pool.spawn(parse_song, row, appleAPI, sink, pool, all_artists, all_albums, all_songs)
        
        if options.limit <= 100 or ((done - 1) % (options.limit / 100)) == 0:
            utils.log("[%s] done parsing %s" % ('songs', utils.getStatusStr(done, options.limit)))
    """
    
    pool.join()
    
    print "artists: %d" % len(all_artists)
    print "albums:  %d" % len(all_albums)
    print "songs:   %d" % len(all_songs)
Example #11
def main():
    options  = parseCommandLine()
    
    stampedAPI = MongoStampedAPI()
    entityDB   = stampedAPI._entityDB
    placesDB   = stampedAPI._placesEntityDB
    
    autocompleteDB = S3AutocompleteDB()
    
    prefixes = set()
    wrapper  = {
        'time_sum' : 0.0, 
        'time_num' : 0, 
    }
    
    def _add(orig_name, wrapper):
        try:
            if 0 == len(orig_name) or orig_name in prefixes:
                return
            
            name = encode_s3_name(orig_name)
            if 0 == len(name):
                return
            
            name = "search/v2/%s.json" % name
            
            print "searching %s" % orig_name.encode('ascii', 'replace')
            tries = 0
            
            while True:
                try:
                    t1 = time.time()
                    results = stampedAPI.searchEntities(query=orig_name, limit=10, prefix=True, full=False)
                    t2 = time.time()
                    duration = (t2 - t1)
                    
                    wrapper['time_sum'] += duration
                    wrapper['time_num'] += 1
                    
                    break
                except:
                    tries += 1
                    
                    if tries >= 3:
                        utils.printException()
                        time.sleep(1)
                        return
                    
                    time.sleep(1)
            
            """
            if len(results) <= 1:
                i = len(orig_name)
                
                while i > 0:
                    prefixes.add(orig_name[0:i])
                    i -= 1
                
                if 0 == len(results):
                    return False
            """
            
            autosuggest = []
            for item in results:
                item = HTTPEntityAutosuggest().importSchema(item[0], item[1]).dataExport()
                autosuggest.append(item)
            
            value = json.dumps(autosuggest, sort_keys=True)
            
            data  = {
                'name1' : orig_name.encode('ascii', 'replace'), 
                'name2' : name, 
                'num_r' : len(results)
            }
            
            pprint(data)
            sys.stdout.flush()
            
            if not options.noop:
                retries = 0
                while True:
                    try:
                        autocompleteDB.add_key(name, value, content_type='application/json', apply_gzip=True)
                        break
                    except:
                        retries += 1
                        if retries > 5:
                            utils.printException()
                            return
                        
                        time.sleep(1)
        except:
            utils.printException()
            time.sleep(1)
            return
    
    infile = open('autocomplete.txt', 'r')
    pool   = Pool(4)
    done   = 0
    offset = 0
    
    for line in infile:
        if offset < options.offset:
            offset += 1
            continue
        # >= so exactly options.limit lines are processed
        if options.limit is not None and done >= options.limit:
            break
        
        line = line.rstrip('\n') # don't truncate a final line lacking a newline
        pool.spawn(_add, line, wrapper)
        
        done += 1
        # options.limit may be None (no limit); Python 2 happens to compare
        # None < int, but spell the case out explicitly
        if options.limit is None or options.limit <= 100 or ((done - 1) % (options.limit / 100)) == 0:
            avg_ms = 1000.0 * (wrapper['time_sum'] / max(wrapper['time_num'], 1))
            utils.log("done processing %s (avg search time %s ms)" % \
                (utils.getStatusStr(done, options.limit), avg_ms))
    
    pool.join()
    infile.close()
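
The Pool/spawn/join pattern used throughout these examples matches gevent's greenlet pool (assuming the Pool imported here is gevent.pool.Pool); in isolation:

from gevent.pool import Pool

def work(item):
    print item

pool = Pool(4)        # at most 4 greenlets run concurrently
for item in range(10):
    pool.spawn(work, item)
pool.join()           # block until every spawned greenlet finishes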