Beispiel #1
0
 def _open_file(self, countLines=True):
     filename = os.path.join(self._distro.apple_data_dir, self._filename)
     zipped = filename + ".gz"
     
     if not os.path.exists(filename) and not os.path.exists(zipped):
         if Globals.options.mount:
             raise Fail("ERROR: mount failed!")
         else:
             while (not os.path.exists(filename)) and (not os.path.exists(zipped)):
                 utils.log("waiting for mount to complete for file '%s'" % filename)
                 time.sleep(4)
     
     if os.path.exists(zipped):
         filename = zipped
     
     #utils.log("Opening Apple EPF file '%s'" % filename)
     if not os.path.exists(filename):
         utils.log("Apple EPF file '%s' does not exist!" % filename)
         raise Fail("Apple EPF file '%s' does not exist!" % filename)
     
     if filename.endswith(".gz"):
         f = gzip.open(filename, 'rb')
     else:
         f = open(filename, 'r+b')
     
     if countLines:
         numLines = max(0, utils.getNumLines(f) - 8)
     else:
         numLines = 0
     
     return f, numLines, filename
Beispiel #2
0
def parseCommandLine():
    """Parse the command line and return the resulting options object.
    
    Besides the declared flags, the returned options carry:
    
    * ``count``   -- number of lines in ``autocomplete.txt`` (read from the 
      current working directory).
    * ``offset`` / ``limit`` -- when ``--ratio NUM/DEN`` is given they are 
      derived from ``count`` and override any ``--offset`` / ``--limit``; 
      otherwise ``limit`` defaults to ``count`` when not specified.
    * ``verbose`` -- always ``False``.
    
    The options are also published as ``Globals.options``, and the db 
    config is initialized when ``--db`` is given.
    
    A malformed ``--ratio`` (not two integers separated by ``/``, or a zero 
    denominator) is reported through ``parser.error``, which prints the 
    usage message and exits.
    """
    usage   = "Usage: %prog [options] [sources]"
    version = "%prog " + __version__
    parser  = OptionParser(usage=usage, version=version)
    
    parser.add_option("-d", "--db", default=None, type="string", 
        action="store", dest="db", 
        help="db to connect to for output")
    
    parser.add_option("-n", "--noop", default=False, action="store_true", 
        help="run in noop mode without modifying anything")
    
    parser.add_option("-r", "--ratio", default=None, type="string", 
        action="store", dest="ratio", 
        help="where this crawler fits in to a distributed stack")
    
    parser.add_option("-o", "--offset", default=0, 
        type="int", dest="offset", 
        help="start index of entities to import")
    
    parser.add_option("-l", "--limit", default=None, type="int", 
        help="limits the number of entities to import")
    
    # positional arguments are accepted but not used
    options, _ = parser.parse_args()
    Globals.options = options
    
    if options.db:
        utils.init_db_config(options.db)
    
    # `with` releases the handle even if counting fails; `open` replaces the 
    # Python-2-only `file` builtin
    with open('autocomplete.txt', 'r') as infile:
        options.count = utils.getNumLines(infile)
    
    if options.ratio:
        try:
            num, den = [int(part) for part in options.ratio.split('/')]
        except ValueError:
            parser.error("invalid ratio '%s' (expected NUM/DEN)" % options.ratio)
        
        if den == 0:
            parser.error("invalid ratio '%s' (denominator must be non-zero)" % options.ratio)
        
        # float denominator forces true division regardless of Python version
        den = float(den)
        
        # slice number `num` (1-based) of `den` slices; the extra +1 on the 
        # limit makes neighbouring slices overlap rather than leave a gap
        options.offset = int(math.floor((options.count * (num - 1)) / den))
        options.limit  = int(math.ceil(options.count / den) + 1)
        
        utils.log("ratio %s) offset=%d, limit=%d" % (options.ratio, options.offset, options.limit))
    elif options.limit is None:
        options.limit = options.count
    
    options.verbose = False
    return options
 def _run(self):
     """Import this object's Apple EPF file into its database table.
     
     Opens the file via ``_open_file``, parses its table format and probes 
     the existing table with a ``SELECT ... LIMIT 1``.  The table counts as 
     stale when the probe raises, returns no row, or returns a row whose 
     column count differs from the parsed format.  A stale table is dropped, 
     recreated from the parsed format and repopulated row by row; a table 
     that is not stale is left untouched.
     
     The input file is always closed, even when parsing or a SQL statement 
     fails.  On successful completion ``StopIteration`` is put on 
     ``self._output``.
     """
     utils.log("[%s] initializing" % self)
     f, _, filename = self._open_file(countLines=False)
     
     try:
         table_format = epf.parse_table_format(f, filename)
         self.table_format = table_format
         
         stale = False
         self._buffer = []
         self._buffer_threshold = 1024
         
         # determine whether or not the db table already exists and attempt to 
         # determine if it's up-to-date s.t. we won't recalculate it if it'd 
         # be unnecessary.
         try:
             row0 = self.execute('SELECT * FROM %s LIMIT 1' % (self.table, ), error_okay=True).fetchone()
             
             if row0 is None:
                 stale = True
             elif len(row0) != len(dict(table_format.cols)):
                 stale = True
         except Exception:
             # e.g. the table does not exist yet; reset the connection and 
             # rebuild the table from scratch
             self.conn.rollback()
             stale = True
         
         if not stale:
             # table is usable as-is
             utils.log("[%s] %s.%s doesn't need to be recomputed" % (self, self.dbpath, self.table))
         else:
             utils.log("[%s] opening '%s'" % (self, self._filename))
             
             # NOTE(review): the 8 presumably accounts for EPF header/footer 
             # lines that are not data rows -- confirm against the format
             numLines = max(0, utils.getNumLines(f) - 8)
             
             # the format is parsed again after counting, as the original code 
             # did -- presumably to reposition the file; TODO confirm
             table_format = epf.parse_table_format(f, filename)
             self.table_format = table_format
             
             utils.log("[%s] parsing ~%d rows from '%s'" % (self, numLines, self._filename))
             
             # initialize table: one slot per column, filled in by each 
             # column's declared index below
             cols = ['' for _ in table_format.cols]
             
             # only a single primary key (self.primary) is ever declared, and 
             # never for sqlite
             found_primary = False
             
             for col in table_format.cols:
                 primary = ""
                 if not found_primary and col == self.primary and not self._sqlite:
                     # TODO: handle the common case of multiple primary keys, which sqlite3 does not support
                     # TODO: defining the primary key here as opposed to after insertion is much slower!
                     primary = " PRIMARY KEY"
                     found_primary = True
                 
                 col2     = table_format.cols[col]
                 col_type = col2['type']
                 
                 if not self._sqlite:
                     # perform mapping between some MySQL types that Apple uses and 
                     # their postgres equivalents
                     if col_type == 'DATETIME':
                         col_type = 'VARCHAR(100)'
                     elif col_type == 'LONGTEXT':
                         col_type = 'VARCHAR(4000)'
                 
                 cols[col2['index']] = "%s %s%s" % (col, col_type, primary)
             
             args = ', '.join(cols)
             self.execute("DROP TABLE %s" % (self.table, ), error_okay=True)
             self.execute("CREATE TABLE %s (%s)" % (self.table, args), verbose=True)
             
             # parameter marker differs between the sqlite driver and the other db driver
             if self._sqlite:
                 placeholder = '?'
             else:
                 placeholder = '%s'
             
             values_str  = '(%s)' % ', '.join(placeholder for _ in table_format.cols)
             self._cmd   = 'INSERT INTO %s VALUES %s' % (self.table, values_str)
             
             # report progress roughly once per percent; explicit floor division 
             # keeps the step an integer whether or not true division is active
             progress_step = numLines // 100 if numLines > 100 else 0
             
             count = 0
             for row in epf.parse_rows(f, table_format):
                 self._parseRowOld(row, table_format)
                 count += 1
                 
                 if progress_step and (count % progress_step) == 0:
                     num_rows = self.execute('SELECT COUNT(*) FROM %s' % (self.table, )).fetchone()[0]
                     
                     utils.log("[%s] done parsing %s -- %d rows" % \
                         (self, utils.getStatusStr(count, numLines), num_rows))
             
             # push out whatever is still pending in the buffer
             self._try_flush_buffer(force=True)
             
             if self.index:
                 self.execute("CREATE INDEX %s on %s (%s)" % (self.index, self.table, self.index), verbose=True)
             
             utils.log("[%s] finished parsing %d rows" % (self, count))
     finally:
         # release the input file even when parsing or a SQL statement fails
         f.close()
     
     self._output.put(StopIteration)