def _open_file(self, countLines=True):
    filename = os.path.join(self._distro.apple_data_dir, self._filename)
    zipped = filename + ".gz"
    
    if not os.path.exists(filename) and not os.path.exists(zipped):
        if Globals.options.mount:
            raise Fail("ERROR: mount failed!")
        else:
            # the data volume is mounted externally; poll until the file appears
            while (not os.path.exists(filename)) and (not os.path.exists(zipped)):
                utils.log("waiting for mount to complete for file '%s'" % filename)
                time.sleep(4)
    
    # prefer the gzipped version of the file if it exists
    if os.path.exists(zipped):
        filename = zipped
    
    if not os.path.exists(filename):
        utils.log("Apple EPF file '%s' does not exist!" % filename)
        raise Fail("Apple EPF file '%s' does not exist!" % filename)
    
    if filename.endswith(".gz"):
        f = gzip.open(filename, 'rb')
    else:
        f = open(filename, 'r+b')
    
    if countLines:
        # subtract the EPF header / footer metadata lines from the count
        numLines = max(0, utils.getNumLines(f) - 8)
    else:
        numLines = 0
    
    return f, numLines, filename
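# The helpers used above (utils.getNumLines, the "- 8" header adjustment) are
# defined elsewhere; the sketch below is an assumed, self-contained version of
# the same open-plain-or-gzipped pattern. Names here are illustrative, not the
# real module's API.
import gzip
import os

def open_maybe_gzipped(path):
    """Open `path`, falling back to `path + '.gz'` via gzip if needed."""
    zipped = path + ".gz"
    if not os.path.exists(path) and os.path.exists(zipped):
        path = zipped
    if path.endswith(".gz"):
        return gzip.open(path, 'rb')
    return open(path, 'rb')

def count_lines_and_rewind(f):
    """Count lines in an open file, then rewind so the caller can parse it."""
    num = sum(1 for _ in f)
    f.seek(0)
    return num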
def parseCommandLine():
    usage = "Usage: %prog [options] [sources]"
    version = "%prog " + __version__
    parser = OptionParser(usage=usage, version=version)
    
    parser.add_option("-d", "--db", default=None, type="string", action="store", dest="db", 
                      help="db to connect to for output")
    parser.add_option("-n", "--noop", default=False, action="store_true", 
                      help="run in noop mode without modifying anything")
    parser.add_option("-r", "--ratio", default=None, type="string", action="store", dest="ratio", 
                      help="where this crawler fits into a distributed stack (e.g. '2/4')")
    parser.add_option("-o", "--offset", default=0, type="int", dest="offset", 
                      help="start index of entities to import")
    parser.add_option("-l", "--limit", default=None, type="int", 
                      help="limits the number of entities to import")
    
    (options, args) = parser.parse_args()
    Globals.options = options
    
    if options.db:
        utils.init_db_config(options.db)
    
    infile = open('autocomplete.txt', 'r')
    options.count = utils.getNumLines(infile)
    infile.close()
    
    if options.ratio:
        # '--ratio num/den' means this is crawler `num` of `den` total crawlers; 
        # derive the slice of the input that belongs to this instance.
        num, den = map(float, options.ratio.split('/'))
        
        options.offset = int(math.floor((options.count * (num - 1)) / den))
        options.limit  = int(math.ceil(options.count / den) + 1)
        
        utils.log("ratio %s: offset=%d, limit=%d" % (options.ratio, options.offset, options.limit))
    elif options.limit is None:
        options.limit = options.count
    
    options.verbose = False
    return options
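# Worked example of the --ratio sharding math above (a hedged sketch; 
# `shard_bounds` is illustrative and not part of the real module). For crawler 
# `num` of `den` over `count` rows, the offsets partition the input, with the 
# "+ 1" on limit giving a one-row safety overlap between adjacent shards.
import math

def shard_bounds(count, num, den):
    num, den = float(num), float(den)
    offset = int(math.floor((count * (num - 1)) / den))
    limit = int(math.ceil(count / den) + 1)
    return offset, limit

# e.g. with count=1000 and ratio '2/4': shard_bounds(1000, 2, 4) == (250, 251),
# so the second of four crawlers starts at row 250 and imports 251 rows.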
def _run(self):
    utils.log("[%s] initializing" % self)
    
    f, numLines, filename = self._open_file(countLines=False)
    table_format = epf.parse_table_format(f, filename)
    self.table_format = table_format
    
    stale = False
    self._buffer = []
    self._buffer_threshold = 1024
    
    # determine whether or not the db table already exists and attempt to 
    # determine if it's up-to-date s.t. we won't recalculate it if it'd 
    # be unnecessary.
    try:
        row0 = self.execute('SELECT * FROM %s LIMIT 1' % (self.table, ), error_okay=True).fetchone()
        
        if row0 is None:
            stale = True
        elif len(row0) != len(dict(table_format.cols)):
            stale = True
    except Exception:
        self.conn.rollback()
        stale = True
    
    if not stale:
        # table is usable as-is
        utils.log("[%s] %s.%s doesn't need to be recomputed" % (self, self.dbpath, self.table))
    else:
        utils.log("[%s] opening '%s'" % (self, self._filename))
        
        numLines = max(0, utils.getNumLines(f) - 8)
        table_format = epf.parse_table_format(f, filename)
        self.table_format = table_format
        
        utils.log("[%s] parsing ~%d rows from '%s'" % (self, numLines, self._filename))
        
        # initialize table schema, one definition slot per column
        cols = [''] * len(table_format.cols)
        
        # currently disabling primary keys for most tables
        found_primary = False
        
        for col in table_format.cols:
            primary = ""
            
            # TODO: handle the common case of multiple primary keys, which sqlite3 does not support
            # TODO: defining the primary key here as opposed to after insertion is much slower!
            if not found_primary and col == self.primary and not self._sqlite:
                primary = " PRIMARY KEY"
                found_primary = True
            
            col2 = table_format.cols[col]
            col_type = col2['type']
            
            if not self._sqlite:
                # map the MySQL column types that Apple uses to their 
                # postgres equivalents
                if col_type == 'DATETIME':
                    col_type = 'VARCHAR(100)'
                elif col_type == 'LONGTEXT':
                    col_type = 'VARCHAR(4000)'
            
            # place each column definition at its proper position in the schema
            index = col2['index']
            cols[index] = "%s %s%s" % (col, col_type, primary)
        
        args = ', '.join(cols)
        
        self.execute("DROP TABLE %s" % (self.table, ), error_okay=True)
        self.execute("CREATE TABLE %s (%s)" % (self.table, args), verbose=True)
        
        # sqlite and postgres use different parameter placeholders
        placeholder = '?' if self._sqlite else '%s'
        values_str = '(%s)' % ', '.join(placeholder for _ in table_format.cols)
        self._cmd = 'INSERT INTO %s VALUES %s' % (self.table, values_str)
        
        count = 0
        for row in epf.parse_rows(f, table_format):
            self._parseRowOld(row, table_format)
            count += 1
            
            # log progress roughly every one percent of the input
            if numLines > 100 and (count % (numLines / 100)) == 0:
                num_rows = self.execute('SELECT COUNT(*) FROM %s' % (self.table, )).fetchone()[0]
                utils.log("[%s] done parsing %s -- %d rows" % 
                          (self, utils.getStatusStr(count, numLines), num_rows))
        
        self._try_flush_buffer(force=True)
        
        if self.index:
            self.execute("CREATE INDEX %s ON %s (%s)" % (self.index, self.table, self.index), verbose=True)
        
        utils.log("[%s] finished parsing %d rows" % (self, count))
    
    f.close()
    self._output.put(StopIteration)
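# _parseRowOld and _try_flush_buffer are referenced above but defined 
# elsewhere; this is an assumed sketch of the buffered-insert pattern they 
# imply (batch rows in self._buffer, then executemany once the threshold is 
# hit), not the project's actual implementation.
def _parseRowOld(self, row, table_format):
    # queue the parsed row for insertion and flush opportunistically
    self._buffer.append(row)
    self._try_flush_buffer()

def _try_flush_buffer(self, force=False):
    # flush in batches to amortize per-statement overhead
    if self._buffer and (force or len(self._buffer) >= self._buffer_threshold):
        cursor = self.conn.cursor()
        cursor.executemany(self._cmd, self._buffer)  # self._cmd built in _run
        self.conn.commit()
        self._buffer = []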