def row_iter(rows, reader):
    """Yield quoted rows, first from the cached *rows*, then from *reader*.

    Rows coming from *reader* are passed through
    IOTools.convertDictionary before quoting, mirroring how the cached
    rows were prepared.
    """
    for cached_row in rows:
        yield quoteRow(cached_row, take, map_column2type,
                       options.missing_values,
                       null=options.null,
                       string_value=options.string_value)
    for raw_record in reader:
        converted = IOTools.convertDictionary(raw_record, map=options.map)
        yield quoteRow(converted, take, map_column2type,
                       options.missing_values,
                       null=options.null,
                       string_value=options.string_value)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads CSV from stdin and re-emits it with the columns restricted to
    (and ordered by) the ``--sort`` field list; without ``--sort`` all
    input columns are kept in their original order.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$")

    parser.add_option(
        "-s", "--method=sort --sort-order", dest="sort", type="string",
        help="fields to take (in sorted order).")

    (options, args) = E.start(parser, add_csv_options=True)

    reader = csv.DictReader(E.stdin, dialect=options.csv_dialect)

    if options.sort:
        fields = options.sort.split(",")
    else:
        # BUG FIX: previously ``fields`` stayed None when --sort was not
        # given, so the header join below raised TypeError and DictWriter
        # had no fieldnames.  Fall back to the input's own column order
        # (accessing ``fieldnames`` reads the header line from stdin).
        fields = reader.fieldnames

    writer = csv.DictWriter(E.stdout,
                            fields,
                            dialect=options.csv_dialect,
                            lineterminator=options.csv_lineterminator,
                            extrasaction='ignore')

    E.stdout.write("\t".join(fields) + "\n")
    for row in reader:
        row = IOTools.convertDictionary(row)
        writer.writerow(row)

    E.stop()
def run(infile, options, report_step=10000):
    """Load delimited data from *infile* into a database table.

    The first ``options.guess_size`` rows are read to guess column types,
    the table is created via ``createTable``, and the data are then
    inserted using one of three strategies: a quick file-based import
    (``options.insert_quick``), a bulk ``executemany``
    (``options.insert_many``), or row-by-row inserts.  Finally indices
    are built and input/output row counts are logged.

    Progress is reported every *report_step* rows.
    """

    options.tablename = quoteTableName(
        options.tablename, backend=options.backend)

    # --map entries arrive as "field:type" strings; convert to a dict.
    if options.map:
        m = {}
        for x in options.map:
            f, t = x.split(":")
            m[f] = t
        options.map = m
    else:
        options.map = {}

    existing_tables = set()

    quick_import_separator = "\t"

    # NOTE(review): the postgres/mysql branches test
    # ``options.database_backend`` but the sqlite branch tests
    # ``options.backend`` (as does quoteTableName above) -- confirm both
    # attributes exist and agree, otherwise the sqlite branch may never
    # be taken.
    if options.database_backend == "postgres":
        import psycopg2
        # Dead code below the raise -- the postgres path is disabled.
        raise NotImplementedError("needs refactoring for commandline options")
        dbhandle = psycopg2.connect(options.psql_connection)
        error = psycopg2.Error
        options.null = "NULL"
        options.string_value = "'%s'"
        options.text = "TEXT"
        options.index = "TEXT"
        if options.insert_quick:
            raise ValueError("quick import not implemented.")

    elif options.database_backend == "mysql":
        import MySQLdb
        dbhandle = MySQLdb.connect(host=options.database_host,
                                   user=options.database_username,
                                   passwd=options.database_password,
                                   port=options.database_port,
                                   db=options.database_name)
        error = Exception
        options.null = "NULL"
        options.string_value = "%s"
        options.text = "TEXT"
        options.index = "VARCHAR(40)"
        if options.insert_quick:
            raise ValueError("quick import not implemented.")

    elif options.backend == "sqlite":
        import sqlite3
        dbhandle = sqlite3.connect(options.database_name)
        # Best effort: make the database group-writable; a failure is
        # only logged, not fatal.
        try:
            os.chmod(options.database_name, 0o664)
        except OSError as msg:
            E.warn("could not change permissions of database: %s" % msg)

        # Avoid the following error:
        # sqlite3.ProgrammingError: You must not use 8-bit bytestrings
        # unless you use a text_factory that can interpret 8-bit
        # bytestrings (like text_factory = str). It is highly
        # recommended that you instead just switch your application
        # to Unicode strings
        # Note: might be better to make csv2db unicode aware.
        dbhandle.text_factory = str

        error = sqlite3.OperationalError
        options.insert_many = True  # False
        options.null = None  # "NULL"
        options.text = "TEXT"
        options.index = "TEXT"
        options.string_value = "%s"  # "'%s'"

        # Collect the names of tables that already exist so createTable
        # can act accordingly (e.g. for --append).
        statement = "SELECT name FROM sqlite_master WHERE type='table'"
        cc = executewait(dbhandle, statement, error, options.retry)
        existing_tables = set([x[0] for x in cc])
        cc.close()

        # use , as separator
        quick_import_statement = \
            "sqlite3 %s '.import %%s %s'" % \
            (options.database_name, options.tablename)

        quick_import_separator = "|"

    if options.header is not None:
        options.header = [x.strip() for x in options.header.split(",")]

    if options.utf:
        reader = CSV.UnicodeDictReader(infile,
                                       dialect=options.dialect,
                                       fieldnames=options.header)
    else:
        reader = csv.DictReader(CSV.CommentStripper(infile),
                                dialect=options.dialect,
                                fieldnames=options.header)

    # When --header supplies new names, drop the file's own header row.
    if options.replace_header:
        try:
            next(reader)
        except StopIteration:
            pass

    E.info("reading %i columns to guess column types" % options.guess_size)

    # Cache up to options.guess_size converted rows for type guessing;
    # they are replayed later by row_iter so no data are lost.
    rows = []
    for row in reader:
        if None in row:
            raise ValueError(
                "undefined columns in input file at row: %s" % row)

        try:
            rows.append(IOTools.convertDictionary(row, map=options.map))
        except TypeError as msg:
            E.warn("incomplete line? Type error in conversion: "
                   "'%s' with data: %s" % (msg, str(row)))
        except ValueError as msg:
            E.warn("incomplete line? Type error in conversion: "
                   "'%s' with data: %s" % (msg, str(row)))

        if len(rows) >= options.guess_size:
            break

    E.info("read %i rows for type guessing" % len(rows))
    E.info("creating table")

    if len(rows) == 0:
        if options.allow_empty:
            if not reader.fieldnames:
                E.warn("no data - no table created")
            else:
                # create empty table and exit
                take, map_column2type, ignored = createTable(
                    dbhandle,
                    error,
                    options.tablename,
                    options,
                    retry=options.retry,
                    headers=reader.fieldnames,
                    ignore_empty=options.ignore_empty,
                    ignore_columns=options.ignore_columns,
                    rename_columns=options.rename_columns,
                    lowercase=options.lowercase,
                    ignore_duplicates=options.ignore_duplicates,
                    indices=options.indices,
                    first_column=options.first_column,
                    existing_tables=existing_tables,
                    append=options.append)
                E.info("empty table created")
            return
        else:
            raise ValueError("empty table")
    else:
        take, map_column2type, ignored = createTable(
            dbhandle,
            error,
            options.tablename,
            options,
            rows=rows,
            retry=options.retry,
            headers=reader.fieldnames,
            ignore_empty=options.ignore_empty,
            ignore_columns=options.ignore_columns,
            rename_columns=options.rename_columns,
            lowercase=options.lowercase,
            ignore_duplicates=options.ignore_duplicates,
            indices=options.indices,
            first_column=options.first_column,
            existing_tables=existing_tables,
            append=options.append)

    def row_iter(rows, reader):
        # Replay the cached guessing rows first, then stream the rest of
        # the input, quoting values per the guessed column types.
        for row in rows:
            yield quoteRow(row, take, map_column2type,
                           options.missing_values,
                           null=options.null,
                           string_value=options.string_value)
        for data in reader:
            yield quoteRow(IOTools.convertDictionary(data, map=options.map),
                           take, map_column2type,
                           options.missing_values,
                           null=options.null,
                           string_value=options.string_value)

    ninput = 0

    E.info("inserting data")

    if options.insert_quick:
        # Strategy 1: dump rows to a temp file and shell out to the
        # database's bulk import command.
        # NOTE(review): quick_import_statement is only assigned in the
        # sqlite branch above -- confirm insert_quick cannot be set for
        # other backends (both raise earlier if it is).
        E.info("using quick insert")

        outfile, filename = tempfile.mkstemp()

        E.debug("dumping data into %s" % filename)

        for d in row_iter(rows, reader):

            ninput += 1
            os.write(outfile, quick_import_separator.join(
                [str(d[x]) for x in take]) + "\n")

            if ninput % report_step == 0:
                E.info("iteration %i\n" % ninput)

        os.close(outfile)

        statement = quick_import_statement % filename
        E.debug(statement)

        # infinite loop possible
        while 1:

            retcode = E.run(statement, cwd=os.getcwd(), close_fds=True)

            if retcode != 0:
                E.warn("import error using statement: %s" % statement)

                if not options.retry:
                    raise ValueError(
                        "import error using statement: %s" % statement)

                time.sleep(5)
                continue

            break

        os.remove(filename)

        # there is no way to insert NULL values into sqlite. The only
        # solution is to update all colums.
        for column in take:
            executewait(
                dbhandle,
                "UPDATE %s SET %s = NULL WHERE %s = 'None'" % (
                    options.tablename, column, column),
                error, options.retry)

    elif options.insert_many:
        # Strategy 2: buffer all rows in memory and use executemany.
        data = []
        for d in row_iter(rows, reader):
            ninput += 1

            data.append([d[x] for x in take])

            if ninput % report_step == 0:
                E.info("iteration %i" % ninput)

        statement = "INSERT INTO %s VALUES (%s)" % (
            options.tablename, ",".join("?" * len(take)))

        E.info("inserting %i rows" % len(data))
        E.debug("multiple insert:\n# %s" % statement)

        # Retry only on "database locked" errors when --retry is set.
        while 1:
            try:
                dbhandle.executemany(statement, data)
            except error as msg:
                E.warn("import failed: msg=%s, statement=\n %s" %
                       (msg, statement))
                # TODO: check for database locked msg
                if not options.retry:
                    raise error(msg)
                if not re.search("locked", str(msg)):
                    raise error(msg)
                time.sleep(5)
                continue
            break

    else:
        # insert line by line (could not figure out how to do bulk loading with
        # subprocess and COPY FROM STDIN)
        # Builds a named-paramstyle statement: "(%(col1)s, %(col2)s, ...)".
        statement = "INSERT INTO %s VALUES (%%(%s)s)" % (
            options.tablename, ')s, %('.join(take))
        # output data used for guessing:
        for d in row_iter(rows, reader):

            ninput += 1
            E.debug("single insert:\n# %s" % (statement % d))
            cc = executewait(dbhandle, statement, error,
                             retry=options.retry,
                             args=d)
            cc.close()

            if ninput % report_step == 0:
                E.info("iteration %i" % ninput)

    E.info("building indices")
    nindex = 0
    for index in options.indices:

        nindex += 1
        # Index creation failures (e.g. duplicate index) are logged and
        # skipped, not fatal.
        try:
            statement = "CREATE INDEX %s_index%i ON %s (%s)" % (
                options.tablename, nindex, options.tablename, index)
            cc = executewait(dbhandle, statement, error, options.retry)
            cc.close()
            E.info("added index on column %s" % (index))
        except error as msg:
            E.info("adding index on column %s failed: %s" % (index, msg))

    statement = "SELECT COUNT(*) FROM %s" % (options.tablename)
    cc = executewait(dbhandle, statement, error, options.retry)
    result = cc.fetchone()
    cc.close()

    noutput = result[0]

    E.info("ninput=%i, noutput=%i, nskipped_columns=%i" %
           (ninput, noutput, len(ignored)))

    dbhandle.commit()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Converts each tab-separated input file given on the command line
    into one worksheet of an xlsx workbook written to ``--outfile``.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-o", "--outfile=", dest="output_filename",
                      type="string",
                      help="write to output filename.")

    parser.set_defaults(
        output_filename=None,
    )

    (options, args) = E.start(parser, add_csv_options=True)

    if not options.output_filename:
        raise ValueError("please specify an output filename.")

    # BUG FIX: openpyxl's streaming-writer keyword is ``write_only``;
    # ``optimized_write`` was a deprecated alias that has been removed
    # from current openpyxl releases and would raise TypeError.
    w = openpyxl.Workbook(write_only=True)

    for filename in args:

        lines = [x for x in IOTools.open_file(filename, "r").readlines()
                 if x[0] != "#"]

        if not lines:
            continue

        if options.loglevel >= 2:
            print("# read %i rows" % len(lines))
            sys.stdout.flush()

        headers = lines[0][:-1].split("\t")

        # BUG FIX: openpyxl workbooks have no ``add_sheet`` (that is the
        # xlwt API) -- the call raised AttributeError; ``create_sheet``
        # is the correct method.
        # NOTE(review): openpyxl truncates/rejects sheet titles longer
        # than 31 characters -- confirm input basenames fit.
        ws = w.create_sheet(title=os.path.basename(filename))

        # First worksheet row carries the column headers.
        ws.append(headers)

        reader = csv.DictReader(lines, dialect=options.csv_dialect)

        for row in reader:
            row = IOTools.convertDictionary(row)
            # Preserve header order; missing keys become empty cells.
            ws.append([row.get(h, "") for h in headers])

    w.save(options.output_filename)

    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Intended to intersect/join two CSV tables given as the two
    positional arguments.

    NOTE(review): this function is currently broken -- several names it
    uses are never defined (see inline notes below); it will raise
    NameError before producing output.  The notes mark each undefined
    reference for repair.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: csv_intersection.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-u", "--unique", dest="unique",
                      action="store_true",
                      help="output rows are uniq.")

    parser.set_defaults(
        remove=False,
        unique=False,
    )

    (options, args) = E.start(parser, add_csv_options=True)

    if len(args) != 2:
        raise ValueError("please specify two files to join")

    options.filename1, options.filename2 = args

    # NOTE(review): table1/table2 are read but never used below.
    table1 = CSV.readTable(IOTools.open_file(options.filename1, "r"))
    table2 = CSV.readTable(IOTools.open_file(options.filename2, "r"))

    if options.unique:
        outfile = UniqueBuffer(sys.stdout)
    else:
        outfile = options.stdout

    # build new field list
    # NOTE(review): options.join_fields1/join_fields2 are not defined by
    # this parser, and ``fields1`` is never assigned -- AttributeError /
    # NameError here.  ``new_fields`` is also never used afterwards.
    new_fields = []

    for x in options.join_fields1:
        new_fields.append(x)

    for x in fields1:
        if x not in options.join_fields1:
            new_fields.append(x)
        if x not in options.join_fields2:
            new_fields.append(x)

    # NOTE(review): ``fields`` is used here before it is assigned (it is
    # only set inside the ``if len(lines) > 0`` branch below).
    writer = csv.DictWriter(outfile,
                            fields,
                            dialect=options.csv_dialect,
                            lineterminator=options.csv_lineterminator,
                            extrasaction='ignore')

    # NOTE(review): ``lines`` and ``input_fields`` are never defined in
    # this function -- NameError.
    if len(lines) > 0:

        old_fields = lines[0][:-1].split("\t")

        if options.remove:
            fields = []
            for x in old_fields:
                if x not in input_fields:
                    fields.append(x)
        else:
            fields = input_fields

        reader = csv.DictReader(lines, dialect=options.csv_dialect)

        print("\t".join(fields))

        # NOTE(review): ``first_row`` is set but never used.
        first_row = True
        for row in reader:
            row = IOTools.convertDictionary(row)
            writer.writerow(row)

    E.stop()