Example #1
0
 def row_iter(rows, reader):
     """Yield database-quoted rows: first the cached *rows*, then the
     remaining records streamed from *reader*."""

     def _quote(record):
         # apply the shared quoting parameters from the enclosing scope
         return quoteRow(record,
                         take,
                         map_column2type,
                         options.missing_values,
                         null=options.null,
                         string_value=options.string_value)

     for cached in rows:
         yield _quote(cached)
     for raw in reader:
         yield _quote(IOTools.convertDictionary(raw, map=options.map))
Example #2
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads a table from stdin and writes it back with the columns given
    by ``--sort-order`` (or all input columns, in input order, when the
    option is absent).
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$")

    # NOTE: the previous option string "--method=sort --sort-order"
    # contained a space and an '=' and therefore could never be matched
    # on the command line; only the usable long option is kept.
    parser.add_option(
        "-s", "--sort-order", dest="sort", type="string",
        help="fields to take (in sorted order).")

    (options, args) = E.start(parser, add_csv_options=True)

    reader = csv.DictReader(E.stdin, dialect=options.csv_dialect)

    if options.sort:
        fields = options.sort.split(",")
    else:
        # Fall back to the input's own header. The previous behaviour
        # (fields = None) crashed below in "\t".join(fields) and left
        # the DictWriter without fieldnames.
        fields = reader.fieldnames

    writer = csv.DictWriter(E.stdout,
                            fields,
                            dialect=options.csv_dialect,
                            lineterminator=options.csv_lineterminator,
                            extrasaction='ignore')

    # write the (possibly re-ordered) header line
    E.stdout.write("\t".join(fields) + "\n")

    for row in reader:
        row = IOTools.convertDictionary(row)
        writer.writerow(row)

    E.stop()
Example #3
0
def run(infile, options, report_step=10000):
    """Load tabular data from *infile* into a database table.

    Column types are guessed from the first ``options.guess_size`` rows,
    the table is (re)created via ``createTable`` and the data inserted
    using one of three strategies: quick bulk import (sqlite command
    line tool), batched ``executemany``, or row-by-row inserts.

    :param infile: open file-like object with the input table.
    :param options: parsed command-line options (connection settings,
        table name, insert mode, retry behaviour, ...).
    :param report_step: log a progress message every this many rows.
    """

    options.tablename = quoteTableName(options.tablename,
                                       backend=options.backend)

    # options.map arrives as a list of "column:type" strings; turn it
    # into a dictionary mapping column name -> type name.
    if options.map:
        m = {}
        for x in options.map:
            f, t = x.split(":")
            m[f] = t
        options.map = m
    else:
        options.map = {}

    existing_tables = set()

    quick_import_separator = "\t"

    # Backend-specific set-up: connection handle, error class to catch,
    # and the SQL fragments used when quoting values.
    if options.database_backend == "postgres":
        import psycopg2
        raise NotImplementedError("needs refactoring for commandline options")
        # NOTE(review): everything below this raise is unreachable.
        dbhandle = psycopg2.connect(options.psql_connection)
        error = psycopg2.Error
        options.null = "NULL"
        options.string_value = "'%s'"
        options.text = "TEXT"
        options.index = "TEXT"
        if options.insert_quick:
            raise ValueError("quick import not implemented.")

    elif options.database_backend == "mysql":
        import MySQLdb
        dbhandle = MySQLdb.connect(host=options.database_host,
                                   user=options.database_username,
                                   passwd=options.database_password,
                                   port=options.database_port,
                                   db=options.database_name)
        error = Exception
        options.null = "NULL"
        options.string_value = "%s"
        options.text = "TEXT"
        options.index = "VARCHAR(40)"
        if options.insert_quick:
            raise ValueError("quick import not implemented.")

    # NOTE(review): the two branches above test options.database_backend
    # while this one tests options.backend -- confirm both attributes
    # exist and agree, otherwise the sqlite branch may never be taken.
    elif options.backend == "sqlite":
        import sqlite3
        dbhandle = sqlite3.connect(options.database_name)
        try:
            # make the database group-writable; best effort only
            os.chmod(options.database_name, 0o664)
        except OSError as msg:
            E.warn("could not change permissions of database: %s" % msg)

        # Avoid the following error:
        # sqlite3.ProgrammingError: You must not use 8-bit bytestrings
        # unless you use a text_factory that can interpret 8-bit
        # bytestrings (like text_factory = str). It is highly
        # recommended that you instead just switch your application
        # to Unicode strings
        # Note: might be better to make csv2db unicode aware.
        dbhandle.text_factory = str

        error = sqlite3.OperationalError
        options.insert_many = True  # False
        options.null = None  # "NULL"
        options.text = "TEXT"
        options.index = "TEXT"
        options.string_value = "%s"  # "'%s'"

        # collect names of tables that already exist in the database
        statement = "SELECT name FROM sqlite_master WHERE type='table'"
        cc = executewait(dbhandle, statement, error, options.retry)
        existing_tables = set([x[0] for x in cc])
        cc.close()

        # use , as separator
        quick_import_statement = \
            "sqlite3 %s '.import %%s %s'" % \
            (options.database_name, options.tablename)

        quick_import_separator = "|"

    # a comma-separated --header option overrides the file's own header
    if options.header is not None:
        options.header = [x.strip() for x in options.header.split(",")]

    if options.utf:
        reader = CSV.UnicodeDictReader(infile,
                                       dialect=options.dialect,
                                       fieldnames=options.header)
    else:
        reader = csv.DictReader(CSV.CommentStripper(infile),
                                dialect=options.dialect,
                                fieldnames=options.header)

    # when replacing the header, skip the file's original header row
    if options.replace_header:
        try:
            next(reader)
        except StopIteration:
            pass

    E.info("reading %i columns to guess column types" % options.guess_size)

    # cache up to options.guess_size rows for type guessing; they are
    # replayed later by row_iter so no input is lost
    rows = []
    for row in reader:
        if None in row:
            raise ValueError("undefined columns in input file at row: %s" %
                             row)

        try:
            rows.append(IOTools.convertDictionary(row, map=options.map))
        except TypeError as msg:
            E.warn("incomplete line? Type error in conversion: "
                   "'%s' with data: %s" % (msg, str(row)))
        except ValueError as msg:
            E.warn("incomplete line? Type error in conversion: "
                   "'%s' with data: %s" % (msg, str(row)))

        if len(rows) >= options.guess_size:
            break

    E.info("read %i rows for type guessing" % len(rows))
    E.info("creating table")

    if len(rows) == 0:
        if options.allow_empty:
            if not reader.fieldnames:
                E.warn("no data - no table created")
            else:
                # create empty table and exit
                take, map_column2type, ignored = createTable(
                    dbhandle,
                    error,
                    options.tablename,
                    options,
                    retry=options.retry,
                    headers=reader.fieldnames,
                    ignore_empty=options.ignore_empty,
                    ignore_columns=options.ignore_columns,
                    rename_columns=options.rename_columns,
                    lowercase=options.lowercase,
                    ignore_duplicates=options.ignore_duplicates,
                    indices=options.indices,
                    first_column=options.first_column,
                    existing_tables=existing_tables,
                    append=options.append)
                E.info("empty table created")
            return
        else:
            raise ValueError("empty table")
    else:
        # create the table using the cached rows for type guessing
        take, map_column2type, ignored = createTable(
            dbhandle,
            error,
            options.tablename,
            options,
            rows=rows,
            retry=options.retry,
            headers=reader.fieldnames,
            ignore_empty=options.ignore_empty,
            ignore_columns=options.ignore_columns,
            rename_columns=options.rename_columns,
            lowercase=options.lowercase,
            ignore_duplicates=options.ignore_duplicates,
            indices=options.indices,
            first_column=options.first_column,
            existing_tables=existing_tables,
            append=options.append)

    def row_iter(rows, reader):
        # first replay the cached rows, then stream the remainder
        # straight from the reader, quoting each row for insertion
        for row in rows:
            yield quoteRow(row,
                           take,
                           map_column2type,
                           options.missing_values,
                           null=options.null,
                           string_value=options.string_value)
        for data in reader:
            yield quoteRow(IOTools.convertDictionary(data, map=options.map),
                           take,
                           map_column2type,
                           options.missing_values,
                           null=options.null,
                           string_value=options.string_value)

    ninput = 0

    E.info("inserting data")

    if options.insert_quick:
        # bulk import via the sqlite3 command line tool: dump the data
        # to a temporary file first, then ".import" it.
        # NOTE(review): quick_import_statement is only defined in the
        # sqlite branch above -- confirm insert_quick implies sqlite.
        E.info("using quick insert")

        outfile, filename = tempfile.mkstemp()

        E.debug("dumping data into %s" % filename)

        for d in row_iter(rows, reader):

            ninput += 1
            os.write(
                outfile,
                quick_import_separator.join([str(d[x]) for x in take]) + "\n")

            if ninput % report_step == 0:
                E.info("iteration %i\n" % ninput)

        os.close(outfile)

        statement = quick_import_statement % filename
        E.debug(statement)

        # infinite loop possible
        while 1:

            retcode = E.run(statement, cwd=os.getcwd(), close_fds=True)

            if retcode != 0:
                E.warn("import error using statement: %s" % statement)

                if not options.retry:
                    raise ValueError("import error using statement: %s" %
                                     statement)

                # wait before retrying
                time.sleep(5)
                continue

            break

        os.remove(filename)

        # there is no way to insert NULL values into sqlite. The only
        # solution is to update all colums.
        for column in take:
            executewait(
                dbhandle, "UPDATE %s SET %s = NULL WHERE %s = 'None'" %
                (options.tablename, column, column), error, options.retry)

    elif options.insert_many:
        # batched insert: collect all rows, then a single executemany
        data = []
        for d in row_iter(rows, reader):
            ninput += 1

            data.append([d[x] for x in take])

            if ninput % report_step == 0:
                E.info("iteration %i" % ninput)

        statement = "INSERT INTO %s VALUES (%s)" % (options.tablename,
                                                    ",".join("?" * len(take)))

        E.info("inserting %i rows" % len(data))
        E.debug("multiple insert:\n# %s" % statement)

        # retry while the database reports being locked
        while 1:
            try:
                dbhandle.executemany(statement, data)
            except error as msg:
                E.warn("import failed: msg=%s, statement=\n  %s" %
                       (msg, statement))
                # TODO: check for database locked msg
                if not options.retry:
                    raise error(msg)
                if not re.search("locked", str(msg)):
                    raise error(msg)
                time.sleep(5)
                continue
            break

    else:
        # insert line by line (could not figure out how to do bulk loading with
        # subprocess and COPY FROM STDIN)
        statement = "INSERT INTO %s VALUES (%%(%s)s)" % (options.tablename,
                                                         ')s, %('.join(take))
        # output data used for guessing:
        for d in row_iter(rows, reader):

            ninput += 1
            E.debug("single insert:\n# %s" % (statement % d))
            cc = executewait(dbhandle,
                             statement,
                             error,
                             retry=options.retry,
                             args=d)
            cc.close()

            if ninput % report_step == 0:
                E.info("iteration %i" % ninput)

    # create one index per requested column; failures (e.g. duplicate
    # index) are logged but not fatal
    E.info("building indices")
    nindex = 0
    for index in options.indices:

        nindex += 1
        try:
            statement = "CREATE INDEX %s_index%i ON %s (%s)" % (
                options.tablename, nindex, options.tablename, index)
            cc = executewait(dbhandle, statement, error, options.retry)
            cc.close()
            E.info("added index on column %s" % (index))
        except error as msg:
            E.info("adding index on column %s failed: %s" % (index, msg))

    # final row count for the summary log line
    statement = "SELECT COUNT(*) FROM %s" % (options.tablename)
    cc = executewait(dbhandle, statement, error, options.retry)
    result = cc.fetchone()
    cc.close()

    noutput = result[0]

    E.info("ninput=%i, noutput=%i, nskipped_columns=%i" %
           (ninput, noutput, len(ignored)))

    dbhandle.commit()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Converts each tab-separated input file given on the command line
    into one worksheet of a single xlsx workbook.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    # NOTE: the previous long option string "--outfile=" had a trailing
    # '=' and could never be matched on the command line.
    parser.add_option("-o",
                      "--outfile",
                      dest="output_filename",
                      type="string",
                      help="write to output filename.")

    parser.set_defaults(output_filename=None, )

    (options, args) = E.start(parser, add_csv_options=True)

    if not options.output_filename:
        raise ValueError("please specify an output filename.")

    # write_only replaces the removed ``optimized_write`` keyword; both
    # create a streaming workbook whose sheets only support append().
    w = openpyxl.Workbook(write_only=True)

    for filename in args:

        # drop comment lines
        lines = [
            x for x in IOTools.open_file(filename, "r").readlines()
            if x[0] != "#"
        ]

        if len(lines) == 0:
            continue

        if options.loglevel >= 2:
            print("# read %i rows" % len(lines))
            sys.stdout.flush()

        # first non-comment line is the header
        headers = lines[0][:-1].split("\t")

        # openpyxl names this create_sheet; add_sheet is the xlwt API
        # and raises AttributeError on an openpyxl Workbook.
        ws = w.create_sheet(title=os.path.basename(filename))

        ws.append(headers)

        reader = csv.DictReader(lines, dialect=options.csv_dialect)

        for row in reader:
            row = IOTools.convertDictionary(row)

            # emit columns in header order, empty string for missing
            data = [row.get(headers[x], "") for x in range(len(headers))]
            ws.append(data)

    w.save(options.output_filename)

    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    NOTE(review): this function appears truncated/garbled -- several
    names used below (fields1, fields, lines, input_fields) are never
    defined; recover the original csv_intersection.py before relying
    on it.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: csv_intersection.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-u", "--unique", dest="unique", action="store_true",
                      help="output rows are uniq.")

    parser.set_defaults(
        remove=False,
        unique=False,
    )

    (options, args) = E.start(parser, add_csv_options=True)

    if len(args) != 2:
        raise ValueError("please specify two files to join")

    options.filename1, options.filename2 = args

    table1 = CSV.readTable(IOTools.open_file(options.filename1, "r"))
    table2 = CSV.readTable(IOTools.open_file(options.filename2, "r"))

    # unique output: wrap stdout in a buffer that drops duplicate rows
    if options.unique:
        outfile = UniqueBuffer(sys.stdout)
    else:
        outfile = options.stdout

    # build new field list
    # NOTE(review): options.join_fields1/join_fields2 are not defined by
    # this parser -- presumably supplied by add_csv_options; confirm.
    new_fields = []

    for x in options.join_fields1:
        new_fields.append(x)

    # NOTE(review): 'fields1' is undefined in this function and will
    # raise NameError if this point is reached.
    for x in fields1:
        if x not in options.join_fields1:
            new_fields.append(x)
        if x not in options.join_fields2:
            new_fields.append(x)

        # NOTE(review): 'fields' is undefined here and the writer is
        # re-created on every loop iteration -- looks mis-indented.
        writer = csv.DictWriter(outfile,
                                fields,
                                dialect=options.csv_dialect,
                                lineterminator=options.csv_lineterminator,
                                extrasaction='ignore')

    # NOTE(review): 'lines' and 'input_fields' are undefined -- this
    # section appears to belong to a different script.
    if len(lines) > 0:

        old_fields = lines[0][:-1].split("\t")

        if options.remove:
            fields = []
            for x in old_fields:
                if x not in input_fields:
                    fields.append(x)
        else:
            fields = input_fields

        reader = csv.DictReader(lines,
                                dialect=options.csv_dialect)

        print("\t".join(fields))

        first_row = True
        for row in reader:
            row = IOTools.convertDictionary(row)
            writer.writerow(row)

    E.stop()