def _run(self):
    utils.log("[%s] initializing" % self)

    f, numLines, filename = self._open_file(countLines=False)
    table_format = epf.parse_table_format(f, filename)
    self.table_format = table_format
    f.close()

    # the backing table was populated by an earlier pass; count its rows
    # up front so progress can be reported as a percentage.
    numLines = self.execute('SELECT COUNT(*) FROM "%s"' % self.table).fetchone()[0]
    utils.log("[%s] parsing ~%d entities from '%s'" % (self, numLines, self.table))

    rows = self.execute('SELECT * FROM "%s"' % self.table)
    #self._globals['rows'] = rows; self._output.put(StopIteration); return

    count = 0
    for row in rows:
        row = self._format_result(row)
        self._parseRow(row)
        count += 1

        # log a status update roughly every one percent of the way through
        if numLines > 100 and (count % (numLines / 100)) == 0:
            utils.log("[%s] done parsing %s" %
                      (self, utils.getStatusStr(count, numLines)))
            time.sleep(0.1)

    # signal end-of-stream to the consumer of the output queue (the file
    # was already closed above, so the original second f.close() was dropped)
    self._output.put(StopIteration)
    utils.log("[%s] finished parsing %d entities (filtered %d)" %
              (self, count, self.numFiltered))
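# A minimal sketch of the consuming side of the queue protocol above,
# assuming self._output is a shared Queue.Queue drained by a downstream
# worker. The consume() helper and handle() callback are hypothetical
# illustrations, not part of this module:
#
#   def consume(output):
#       while True:
#           item = output.get()
#           if item is StopIteration:   # end-of-stream sentinel from _run
#               break
#           handle(item)                # hypothetical per-entity handler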
def _run(self):
    utils.log("[%s] initializing" % self)

    f, numLines, filename = self._open_file(countLines=False)
    table_format = epf.parse_table_format(f, filename)
    self.table_format = table_format

    stale = False
    self._buffer = []
    self._buffer_threshold = 1024

    # determine whether the db table already exists and, if so, whether
    # it's up-to-date, so that we won't recompute it unnecessarily.
    try:
        row0 = self.execute('SELECT * FROM %s LIMIT 1' % (self.table, ), error_okay=True).fetchone()

        if row0 is None:
            stale = True
        elif len(row0) != len(dict(table_format.cols)):
            stale = True
    except Exception:
        self.conn.rollback()
        #utils.printException()
        stale = True

    #f.close(); self._output.put(StopIteration); return

    if not stale:
        # table is usable as-is
        utils.log("[%s] %s.%s doesn't need to be recomputed" % (self, self.dbpath, self.table))
    else:
        utils.log("[%s] opening '%s'" % (self, self._filename))

        # rough row count, excluding the EPF header lines
        numLines = max(0, utils.getNumLines(f) - 8)

        table_format = epf.parse_table_format(f, filename)
        self.table_format = table_format

        utils.log("[%s] parsing ~%d rows from '%s'" % (self, numLines, self._filename))

        # initialize the table; pre-size the column-definition list, whose
        # entries are filled in by index below
        cols = [''] * len(table_format.cols)

        # currently disabling primary keys for most tables
        found_primary = False #(len(table_format.primary_keys) != 1)

        for col in table_format.cols:
            primary = ""

            if not found_primary and col == self.primary and not self._sqlite:
            #if not found_primary and col in table_format.primary_keys:
                # TODO: handle the common case of multiple primary keys, which sqlite3 does not support
                # TODO: defining the primary key here as opposed to after insertion is much slower!
                primary = " PRIMARY KEY"
                found_primary = True

            col2 = table_format.cols[col]
            col_type = col2['type']

            if not self._sqlite:
                # perform mapping between some MySQL types that Apple uses
                # and their postgres equivalents
                if col_type == 'DATETIME':
                    col_type = 'VARCHAR(100)'
                elif col_type == 'LONGTEXT':
                    col_type = 'VARCHAR(4000)'

            text = "%s %s%s" % (col, col_type, primary)
            index = col2['index']
            cols[index] = text

        args = string.joinfields(cols, ', ')

        self.execute("DROP TABLE %s" % (self.table, ), error_okay=True)
        self.execute("CREATE TABLE %s (%s)" % (self.table, args), verbose=True)

        # sqlite3 and psycopg2 use different paramstyles for placeholders
        if self._sqlite:
            placeholder = '?'
        else:
            placeholder = '%s'

        values_str = '(%s)' % string.joinfields((placeholder for col in table_format.cols), ', ')
        self._cmd = 'INSERT INTO %s VALUES %s' % (self.table, values_str)

        count = 0
        for row in epf.parse_rows(f, table_format):
            self._parseRowOld(row, table_format)
            count += 1

            # log a status update roughly every one percent of the way through
            if numLines > 100 and (count % (numLines / 100)) == 0:
                num_rows = self.execute('SELECT COUNT(*) FROM %s' % (self.table, )).fetchone()[0]
                utils.log("[%s] done parsing %s -- %d rows" %
                          (self, utils.getStatusStr(count, numLines), num_rows))

        # flush any rows still sitting in the insert buffer
        self._try_flush_buffer(force=True)

        if self.index:
            self.execute("CREATE INDEX %s on %s (%s)" % (self.index, self.table, self.index), verbose=True)

        utils.log("[%s] finished parsing %d rows" % (self, count))

    f.close()
    self._output.put(StopIteration)
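# A minimal sketch of the buffered-insert flush invoked above, assuming
# self._buffer accumulates row tuples and self._cmd is the INSERT statement
# built with the per-driver placeholder. This illustrates the batching
# design only; it is not the module's actual implementation:
#
#   def _try_flush_buffer(self, force=False):
#       if self._buffer and (force or len(self._buffer) >= self._buffer_threshold):
#           cursor = self.conn.cursor()
#           cursor.executemany(self._cmd, self._buffer)  # one batched round-trip
#           self.conn.commit()
#           self._buffer = []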