Example #1
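Imports a delimited file into a database table: the first options.guess_size rows are buffered to guess column types, the table is created, and the data is inserted either through a quick sqlite3 .import of a temporary dump file, through executemany, or row by row; finally the requested indices are built and row counts are reported.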
def run(infile, options, report_step=10000):
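    """Import a delimited file into a database table.

    The first rows are buffered to guess column types, the target table
    is created, and all rows are then inserted using the backend and
    insert strategy selected in *options*.
    """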

    options.tablename = quoteTableName(options.tablename,
                                       backend=options.backend)

    if options.map:
        m = {}
        for x in options.map:
            f, t = x.split(":")
            m[f] = t
        options.map = m
    else:
        options.map = {}

    existing_tables = set()

    quick_import_separator = "\t"

    if options.database_backend == "postgres":
        import psycopg2
        raise NotImplementedError("needs refactoring for commandline options")
        dbhandle = psycopg2.connect(options.psql_connection)
        error = psycopg2.Error
        options.null = "NULL"
        options.string_value = "'%s'"
        options.text = "TEXT"
        options.index = "TEXT"
        if options.insert_quick:
            raise ValueError("quick import not implemented.")

    elif options.database_backend == "mysql":
        import MySQLdb
        dbhandle = MySQLdb.connect(host=options.database_host,
                                   user=options.database_username,
                                   passwd=options.database_password,
                                   port=options.database_port,
                                   db=options.database_name)
        error = Exception
        options.null = "NULL"
        options.string_value = "%s"
        options.text = "TEXT"
        options.index = "VARCHAR(40)"
        if options.insert_quick:
            raise ValueError("quick import not implemented.")

    elif options.backend == "sqlite":
        import sqlite3
        dbhandle = sqlite3.connect(options.database_name)
        try:
            os.chmod(options.database_name, 0o664)
        except OSError as msg:
            E.warn("could not change permissions of database: %s" % msg)

        # Avoid the following error:
        # sqlite3.ProgrammingError: You must not use 8-bit bytestrings
        # unless you use a text_factory that can interpret 8-bit
        # bytestrings (like text_factory = str). It is highly
        # recommended that you instead just switch your application
        # to Unicode strings
        # Note: might be better to make csv2db unicode aware.
        dbhandle.text_factory = str

        error = sqlite3.OperationalError
        options.insert_many = True  # False
        options.null = None  # "NULL"
        options.text = "TEXT"
        options.index = "TEXT"
        options.string_value = "%s"  # "'%s'"

        statement = "SELECT name FROM sqlite_master WHERE type='table'"
        cc = executewait(dbhandle, statement, error, options.retry)
        existing_tables = set([x[0] for x in cc])
        cc.close()

        # the temporary dump and sqlite3's .import both use "|" as the
        # column separator (sqlite3's default)
        quick_import_statement = \
            "sqlite3 %s '.import %%s %s'" % \
            (options.database_name, options.tablename)

        quick_import_separator = "|"

    if options.header is not None:
        options.header = [x.strip() for x in options.header.split(",")]

    if options.utf:
        reader = CSV.UnicodeDictReader(infile,
                                       dialect=options.dialect,
                                       fieldnames=options.header)
    else:
        reader = csv.DictReader(CSV.CommentStripper(infile),
                                dialect=options.dialect,
                                fieldnames=options.header)

    if options.replace_header:
        try:
            next(reader)
        except StopIteration:
            pass

    E.info("reading %i columns to guess column types" % options.guess_size)

    rows = []
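    # buffer up to options.guess_size converted rows; they are used for
    # type guessing and re-used for the actual insert below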
    for row in reader:
        if None in row:
            raise ValueError("undefined columns in input file at row: %s" %
                             row)

        try:
            rows.append(IOTools.convertDictionary(row, map=options.map))
        except TypeError as msg:
            E.warn("incomplete line? Type error in conversion: "
                   "'%s' with data: %s" % (msg, str(row)))
        except ValueError as msg:
            E.warn("incomplete line? Type error in conversion: "
                   "'%s' with data: %s" % (msg, str(row)))

        if len(rows) >= options.guess_size:
            break

    E.info("read %i rows for type guessing" % len(rows))
    E.info("creating table")

    if len(rows) == 0:
        if options.allow_empty:
            if not reader.fieldnames:
                E.warn("no data - no table created")
            else:
                # create empty table and exit
                take, map_column2type, ignored = createTable(
                    dbhandle,
                    error,
                    options.tablename,
                    options,
                    retry=options.retry,
                    headers=reader.fieldnames,
                    ignore_empty=options.ignore_empty,
                    ignore_columns=options.ignore_columns,
                    rename_columns=options.rename_columns,
                    lowercase=options.lowercase,
                    ignore_duplicates=options.ignore_duplicates,
                    indices=options.indices,
                    first_column=options.first_column,
                    existing_tables=existing_tables,
                    append=options.append)
                E.info("empty table created")
            return
        else:
            raise ValueError("empty table")
    else:
        take, map_column2type, ignored = createTable(
            dbhandle,
            error,
            options.tablename,
            options,
            rows=rows,
            retry=options.retry,
            headers=reader.fieldnames,
            ignore_empty=options.ignore_empty,
            ignore_columns=options.ignore_columns,
            rename_columns=options.rename_columns,
            lowercase=options.lowercase,
            ignore_duplicates=options.ignore_duplicates,
            indices=options.indices,
            first_column=options.first_column,
            existing_tables=existing_tables,
            append=options.append)

    def row_iter(rows, reader):
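        # yield the rows buffered for type guessing first, then the
        # remainder of the reader, quoting values for the SQL backend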
        for row in rows:
            yield quoteRow(row,
                           take,
                           map_column2type,
                           options.missing_values,
                           null=options.null,
                           string_value=options.string_value)
        for data in reader:
            yield quoteRow(IOTools.convertDictionary(data, map=options.map),
                           take,
                           map_column2type,
                           options.missing_values,
                           null=options.null,
                           string_value=options.string_value)

    ninput = 0

    E.info("inserting data")

    if options.insert_quick:
        E.info("using quick insert")

        outfile, filename = tempfile.mkstemp()

        E.debug("dumping data into %s" % filename)

        for d in row_iter(rows, reader):

            ninput += 1
            os.write(
                outfile,
                quick_import_separator.join([str(d[x]) for x in take]) + "\n")

            if ninput % report_step == 0:
                E.info("iteration %i\n" % ninput)

        os.close(outfile)

        statement = quick_import_statement % filename
        E.debug(statement)

        # retry the import until it succeeds; this can loop forever if
        # the error persists and options.retry is set
        while 1:

            retcode = E.run(statement, cwd=os.getcwd(), close_fds=True)

            if retcode != 0:
                E.warn("import error using statement: %s" % statement)

                if not options.retry:
                    raise ValueError("import error using statement: %s" %
                                     statement)

                time.sleep(5)
                continue

            break

        os.remove(filename)

        # sqlite's .import cannot represent NULL values. Work around this
        # by resetting all columns that contain the string 'None' to NULL.
        for column in take:
            executewait(
                dbhandle, "UPDATE %s SET %s = NULL WHERE %s = 'None'" %
                (options.tablename, column, column), error, options.retry)

    elif options.insert_many:
        data = []
        for d in row_iter(rows, reader):
            ninput += 1

            data.append([d[x] for x in take])

            if ninput % report_step == 0:
                E.info("iteration %i" % ninput)

        statement = "INSERT INTO %s VALUES (%s)" % (options.tablename,
                                                    ",".join("?" * len(take)))

        E.info("inserting %i rows" % len(data))
        E.debug("multiple insert:\n# %s" % statement)

        while 1:
            try:
                dbhandle.executemany(statement, data)
            except error as msg:
                E.warn("import failed: msg=%s, statement=\n  %s" %
                       (msg, statement))
                # TODO: check for database locked msg
                if not options.retry:
                    raise error(msg)
                if not re.search("locked", str(msg)):
                    raise error(msg)
                time.sleep(5)
                continue
            break

    else:
        # insert line by line (could not figure out how to do bulk loading with
        # subprocess and COPY FROM STDIN)
        statement = "INSERT INTO %s VALUES (%%(%s)s)" % (options.tablename,
                                                         ')s, %('.join(take))
        # output data used for guessing:
        for d in row_iter(rows, reader):

            ninput += 1
            E.debug("single insert:\n# %s" % (statement % d))
            cc = executewait(dbhandle,
                             statement,
                             error,
                             retry=options.retry,
                             args=d)
            cc.close()

            if ninput % report_step == 0:
                E.info("iteration %i" % ninput)

    E.info("building indices")
    nindex = 0
    for index in options.indices:

        nindex += 1
        try:
            statement = "CREATE INDEX %s_index%i ON %s (%s)" % (
                options.tablename, nindex, options.tablename, index)
            cc = executewait(dbhandle, statement, error, options.retry)
            cc.close()
            E.info("added index on column %s" % (index))
        except error as msg:
            E.info("adding index on column %s failed: %s" % (index, msg))

    statement = "SELECT COUNT(*) FROM %s" % (options.tablename)
    cc = executewait(dbhandle, statement, error, options.retry)
    result = cc.fetchone()
    cc.close()

    noutput = result[0]

    E.info("ninput=%i, noutput=%i, nskipped_columns=%i" %
           (ninput, noutput, len(ignored)))

    dbhandle.commit()
Example #2
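Cuts columns from a tab-separated table read from stdin: the requested fields (given literally, as %regex% patterns, or via --filename-fields) are selected from the header, or dropped with --remove, and the matching columns are written out; --unique suppresses duplicate output rows.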
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: csv_cut.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-r",
                      "--remove",
                      dest="remove",
                      action="store_true",
                      help="remove specified columns, keep all others.")

    parser.add_option("-u",
                      "--unique",
                      dest="unique",
                      action="store_true",
                      help="output rows are uniq.")

    parser.add_option(
        "-l", "--large",
        dest="large",
        action="store_true",
        help="large columns. Do not use native python CSV module "
        "[default=%default].")

    parser.add_option("-f",
                      "--filename-fields",
                      dest="filename_fields",
                      type="string",
                      help="filename with field information.")

    parser.set_defaults(
        remove=False,
        unique=False,
        large=False,
        filename_fields=None,
    )

    (options, args) = E.Start(parser, add_csv_options=True, quiet=True)

    input_fields = args

    if options.filename_fields:
        input_fields = [
            x[:-1].split("\t")[0]
            for x in IOTools.openFile(options.filename_fields, "r").readlines()
            if x[0] != "#"
        ]

    if options.unique:
        outfile = UniqueBuffer(options.stdout)
    else:
        outfile = options.stdout

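    # read up to the first non-comment line, which provides the column
    # header; exit if the input is empty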
    while 1:
        line = options.stdin.readline()

        if not line:
            E.Stop()
            sys.exit(0)

        if line[0] == "#":
            continue

        first_line = line
        break

    old_fields = first_line[:-1].split("\t")

    fields = []
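    # resolve the requested fields against the existing header; fields
    # given as %pattern% select all columns matching the regular expression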
    for f in input_fields:
        # do pattern search
        if f[0] == "%" and f[-1] == "%":
            pattern = re.compile(f[1:-1])
            for o in old_fields:
                if pattern.search(o) and o not in fields:
                    fields.append(o)
        else:
            if f in old_fields:
                fields.append(f)

    if options.remove:
        fields = set(fields)
        fields = [x for x in old_fields if x not in fields]

    if options.large:
        reader = CSV.DictReaderLarge(CSV.CommentStripper(options.stdin),
                                     fieldnames=old_fields,
                                     dialect=options.csv_dialect)
    else:
        reader = csv.DictReader(CSV.CommentStripper(options.stdin),
                                fieldnames=old_fields,
                                dialect=options.csv_dialect)

    writer = csv.DictWriter(outfile,
                            fields,
                            dialect=options.csv_dialect,
                            lineterminator=options.csv_lineterminator,
                            extrasaction='ignore')

    print("\t".join(fields))

    ninput, noutput, nerrors = 0, 0, 0

    while 1:
        ninput += 1
        try:
            row = six.next(reader)
        except _csv.Error as msg:
            options.stderr.write("# error while parsing: %s\n" % (msg))
            nerrors += 1
            continue
        except StopIteration:
            break
        if not row:
            break
        writer.writerow(row)
        noutput += 1

    E.info("ninput=%i, noutput=%i, nerrors=%i" % (ninput, noutput, nerrors))

    E.Stop()
Example #3
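Filters rows of a CSV table read from stdin: the positional arguments are joined into a Python expression over the row dictionary r, compiled into a predicate f, and only rows for which f(row) is true are written to stdout.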
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: csv_cut.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-r",
                      "--remove",
                      dest="remove",
                      action="store_true",
                      help="remove specified columns, keep all others.")

    parser.add_option("-u",
                      "--unique",
                      dest="unique",
                      action="store_true",
                      help="output rows are uniq.")

    parser.add_option(
        "-l", "--large",
        dest="large",
        action="store_true",
        help="large columns. Do not use native python CSV module "
        "[default=%default].")

    parser.add_option("-f",
                      "--filename-fields",
                      dest="filename_fields",
                      type="string",
                      help="filename with field information.")

    parser.set_defaults(
        remove=False,
        unique=False,
        large=False,
        filename_fields=None,
    )

    (options, args) = E.Start(parser, add_csv_options=True, quiet=True)

    statement = " ".join(args)

    if options.large:
        reader = CSV.DictReaderLarge(CSV.CommentStripper(sys.stdin),
                                     dialect=options.csv_dialect)
    else:
        reader = csv.DictReader(CSV.CommentStripper(sys.stdin),
                                dialect=options.csv_dialect)

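    # compile the command-line expression into a row predicate f(row)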
    exec("f = lambda r: %s" % statement, globals())
    counter = E.Counter()
    writer = csv.DictWriter(options.stdout,
                            reader.fieldnames,
                            dialect=options.csv_dialect,
                            lineterminator=options.csv_lineterminator)

    writer.writerow(dict((fn, fn) for fn in reader.fieldnames))

    while 1:
        counter.input += 1
        try:
            row = next(reader)
        except _csv.Error as msg:
            options.stderr.write("# error while parsing: %s\n" % (msg))
            counter.errors += 1
            continue
        except StopIteration:
            break

        if not row:
            break

        if f(row):
            writer.writerow(row)
            counter.output += 1
        else:
            counter.filtered += 1

    E.info("%s" % counter)

    E.Stop()