Esempio n. 1
0
def main():
    """Write Pfam domain coordinates found in records on stdin as a table.

    Records are obtained from ``record_iterator(sys.stdin)``. The first line
    of each record has to match ``rx_head`` (identifier, accession, length);
    records whose header does not match are reported through ``E.warn`` and
    skipped. Every following line that does not start with ``Pfam-B`` is
    parsed with ``rx_domain``. For each coordinate range on such a line one
    row ``accession, start, end, family`` is written to ``options.stdout``,
    with ``start`` decremented by one and the ``--prefix`` value prepended
    to the row.

    Options:
        --no-swissprot-version: keep only the part of the accession before
            the first ".".
        --no-pfam-version: keep only the part of the family before the
            first ".".
        --prefix: string written in front of every output row.
    """

    parser = optparse.OptionParser(version="%prog version: $Id$", usage=USAGE)

    parser.add_option("--no-swissprot-version",
                      dest="no_swissprot_version",
                      action="store_true",
                      help="remove swissprot version information [%default]")

    parser.add_option("--no-pfam-version",
                      dest="no_pfam_version",
                      action="store_true",
                      help="remove pfam version information [%default]")

    parser.add_option("--prefix",
                      dest="prefix",
                      type="string",
                      help="add prefix to id [%default]")

    parser.set_defaults(no_swissprot_version=False,
                        no_pfam_version=False,
                        prefix="")

    (options, args) = E.Start(parser)

    # Raw strings: same patterns as before, without relying on Python
    # passing unknown escapes such as "\S" through unchanged.
    rx_head = re.compile(r">(\S+)\s+\S+\| (\S+) (\d+) a.a.")
    rx_domain = re.compile(r"(\S+) .* (PF\d+.\d+) (.*)  (.*)")
    options.stdout.write("nid\tstart\tend\tfamily\n")

    ninput, noutput, ndomains, nskipped = 0, 0, 0, 0
    for record in record_iterator(sys.stdin):
        ninput += 1
        header = rx_head.match(record[0])
        if header is None:
            E.warn("parsing error in line `%s`" % record[0])
            nskipped += 1
            continue
        # Only the accession is used below; the names avoid shadowing the
        # builtins ``id`` and ``len``.
        seq_id, acc, seq_length = header.groups()

        if options.no_swissprot_version:
            acc = acc.split(".")[0]

        for line in record[1:]:
            # no Pfam-B
            if line.startswith("Pfam-B"):
                continue

            domain = rx_domain.match(line)
            if domain is None:
                # Treat malformed domain lines like malformed headers:
                # warn and carry on instead of aborting the whole run.
                E.warn("parsing error in line `%s`" % line)
                continue
            name, family, description, coordinates = domain.groups()

            # The option was previously accepted but silently ignored.
            if options.no_pfam_version:
                family = family.split(".")[0]

            # split() without argument ignores stray/trailing whitespace,
            # which would otherwise yield empty tokens that int() rejects.
            for c in coordinates.split():
                start, end = [int(x) for x in c.split("-")]
                start -= 1
                options.stdout.write(
                    options.prefix +
                    "\t".join(map(str, (acc, start, end, family))) + "\n")
                ndomains += 1
            noutput += 1
Esempio n. 2
0
def main():
    """Write Pfam domain coordinates found in records on stdin as a table.

    Records are obtained from ``record_iterator(sys.stdin)``. The first line
    of each record has to match ``rx_head`` (identifier, accession, length);
    records whose header does not match are reported through ``E.warn`` and
    skipped. Every following line that does not start with ``Pfam-B`` is
    parsed with ``rx_domain``. For each coordinate range on such a line one
    row ``accession, start, end, family`` is written to ``options.stdout``,
    with ``start`` decremented by one and the ``--prefix`` value prepended
    to the row.

    Options:
        --no-swissprot-version: keep only the part of the accession before
            the first ".".
        --no-pfam-version: keep only the part of the family before the
            first ".".
        --prefix: string written in front of every output row.
    """

    parser = optparse.OptionParser(version="%prog version: $Id$", usage=USAGE)

    parser.add_option("--no-swissprot-version", dest="no_swissprot_version",
                      action="store_true",
                      help="remove swissprot version information [%default]")

    parser.add_option("--no-pfam-version", dest="no_pfam_version",
                      action="store_true",
                      help="remove pfam version information [%default]")

    parser.add_option("--prefix", dest="prefix", type="string",
                      help="add prefix to id [%default]")

    parser.set_defaults(
        no_swissprot_version=False,
        no_pfam_version=False,
        prefix="",
    )

    (options, args) = E.Start(parser)

    # Raw strings: same patterns as before, without relying on Python
    # passing unknown escapes such as "\S" through unchanged.
    rx_head = re.compile(r">(\S+)\s+\S+\| (\S+) (\d+) a.a.")
    rx_domain = re.compile(r"(\S+) .* (PF\d+.\d+) (.*)  (.*)")
    options.stdout.write("nid\tstart\tend\tfamily\n")

    ninput, noutput, ndomains, nskipped = 0, 0, 0, 0
    for record in record_iterator(sys.stdin):
        ninput += 1
        header = rx_head.match(record[0])
        if header is None:
            E.warn("parsing error in line `%s`" % record[0])
            nskipped += 1
            continue
        # Only the accession is used below; the names avoid shadowing the
        # builtins ``id`` and ``len``.
        seq_id, acc, seq_length = header.groups()

        if options.no_swissprot_version:
            acc = acc.split(".")[0]

        for line in record[1:]:
            # no Pfam-B
            if line.startswith("Pfam-B"):
                continue

            domain = rx_domain.match(line)
            if domain is None:
                # Treat malformed domain lines like malformed headers:
                # warn and carry on instead of aborting the whole run.
                E.warn("parsing error in line `%s`" % line)
                continue
            name, family, description, coordinates = domain.groups()

            # The option was previously accepted but silently ignored.
            if options.no_pfam_version:
                family = family.split(".")[0]

            # split() without argument ignores stray/trailing whitespace,
            # which would otherwise yield empty tokens that int() rejects.
            for c in coordinates.split():
                start, end = [int(x) for x in c.split("-")]
                start -= 1
                row = "\t".join(map(str, (acc, start, end, family)))
                options.stdout.write(options.prefix + row + "\n")
                ndomains += 1
            noutput += 1
Esempio n. 3
0
def main():
    """Translate nids in selected columns of a tab-separated stream.

    The mapping is loaded with ``AddaIO.readMapPid2Nid`` from the file given
    by ``--nids``; with ``--invert`` it is rebuilt as ``int(value) ->
    str(key)``. Lines are read from ``options.stdin``. Lines starting with
    ``#`` and, unless ``--no-header`` is given, the first remaining line are
    copied to ``options.stdout`` unchanged. In every other line the fields
    selected with ``--column`` (1-based, default: first column) are replaced
    by their mapped value. With ``--is-domains`` a field is parsed with
    ``AddaIO.toTuple``, its first element is mapped and the result is
    re-assembled with ``AddaIO.toDomain``.

    Lines containing a field that is missing, cannot be parsed or cannot be
    mapped are reported through ``E.warn``, counted as skipped and not
    written. A summary of the counters is logged before ``E.Stop()``.
    """

    parser = optparse.OptionParser(version="%prog version: $Id$",
                                   usage=globals()["__doc__"])

    parser.add_option("-n", "--nids", dest="filename_nids", type="string",
                      help="filename with nids[default=%default].")

    parser.add_option("-c", "--column", dest="columns", type="int",
                      action="append",
                      help="columns with nids to translate (1-based) [default=%default].")

    parser.add_option("-d", "--is-domains", dest="is_domains",
                      action="store_true",
                      help="translate domain ids [default=%default].")

    parser.add_option("-i", "--invert", dest="invert", action="store_true",
                      help="invert mapping [default=%default].")

    parser.add_option("-e", "--no-header", dest="no_header",
                      action="store_true",
                      help="file has no header [default=%default].")

    # The key must match the option's ``dest`` (``no_header``); the former
    # ``noheader`` left the real option without a default.
    parser.set_defaults(
        filename_nids="adda.nids",
        columns=[],
        is_domains=False,
        invert=False,
        no_header=False,
    )

    (options, args) = E.Start(parser)

    # Close the mapping file once it has been read.
    with open(options.filename_nids, "r") as infile:
        map_nid2pid = AddaIO.readMapPid2Nid(infile)

    if options.invert:
        E.info("inverting mapping")
        map_nid2pid = dict(
            (int(value), str(key)) for key, value in map_nid2pid.items())

    if not options.columns:
        options.columns = [1]
    # Convert the 1-based column numbers into list indices.
    columns = [x - 1 for x in options.columns]

    toTuple, toDomain = AddaIO.toTuple, AddaIO.toDomain
    first = not options.no_header
    is_domains = options.is_domains
    ninput, noutput, nskipped = 0, 0, 0
    for line in options.stdin:
        if line.startswith("#"):
            options.stdout.write(line)
            continue

        if first:
            options.stdout.write(line)
            first = False
            continue

        ninput += 1

        # rstrip instead of line[:-1]: a final line without newline must
        # not lose its last character.
        content = line.rstrip("\n")
        data = content.split("\t")
        for x in columns:
            # Fetch the field once; a missing column is reported here so
            # that the warnings below never index past the end of the row.
            try:
                value = data[x]
            except IndexError:
                E.warn("column %i missing in line `%s`" % (x + 1, content))
                nskipped += 1
                break

            if is_domains:
                try:
                    d = toTuple(value)
                except ValueError:
                    E.warn("could not parse domain `%s`" % value)
                    nskipped += 1
                    break

                try:
                    data[x] = toDomain((str(map_nid2pid[d[0]]), d[1], d[2]))
                except (IndexError, KeyError):
                    E.warn("could not map domain `%s`" % value)
                    nskipped += 1
                    break
            else:
                # KeyError: nid not in the mapping; ValueError: field is
                # not an integer. Both skip the line like the domain case.
                try:
                    data[x] = str(map_nid2pid[int(value)])
                except (IndexError, KeyError, ValueError):
                    E.warn("could not map nid `%s`" % value)
                    nskipped += 1
                    break
        else:
            # Only reached when every selected column was translated.
            options.stdout.write("%s\n" % "\t".join(data))
            noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))
    E.Stop()
Esempio n. 4
0
def main():
    """Set up a csv-to-database import and read the first rows from stdin.

    Parses the command line through ``E.Start`` (including the psql and
    mysql option groups), quotes the table name with ``quoteTableName``,
    turns the ``--map`` entries (``column:type``) into a dictionary and
    opens a connection for the backend chosen with ``--backend`` (``pg``,
    ``sqlite`` or ``mysql``). Backend specific settings (``options.null``,
    ``options.string_value``, ``options.insert_many``, ``error``,
    ``index_mangle``) are prepared alongside the connection. Finally up to
    ``options.guess_size`` rows are read from stdin via ``CSV.DictReader``
    and converted with ``CSV.ConvertDictionary``; rows raising ``TypeError``
    during conversion are reported through ``E.warn`` and dropped.

    Raises:
        ValueError: if ``--quick`` is requested for the ``pg`` backend or a
            ``--map`` entry is not of the form ``column:type``.
    """

    parser = optparse.OptionParser(version="%prog version: $Id$", usage=USAGE)

    parser.add_option("--dialect", dest="dialect", type="string",
                      help="csv dialect to use [default=%default].")

    parser.add_option("-m", "--map", dest="map", type="string", action="append",
                      help="explicit mapping function for columns The format is column:type (e.g.: length:int) [default=%default].")

    parser.add_option("-t", "--table", dest="tablename", type="string",
                      help="table name for all backends [default=%default].")

    parser.add_option("-d", "--database", dest="database", type="string",
                      help="database name for sqlite3 [default=%default].")

    parser.add_option("-l", "--lowercase", dest="lowercase", action="store_true",
                      help="force lower case column names [default=%default].")

    parser.add_option("-u", "--ignore-duplicates", dest="ignore_duplicates", action="store_true",
                      help="ignore columns with duplicate names [default=%default].")

    parser.add_option("-s", "--ignore-same", dest="ignore_same", action="store_true",
                      help="ignore columns with identical values [default=%default].")

    parser.add_option("-e", "--ignore-empty", dest="ignore_empty", action="store_true",
                      help="ignore columns which are all empty [default=%default].")

    parser.add_option("-q", "--quick", dest="insert_quick", action="store_true",
                      help="try quick file based import - needs to be supported by the backend [default=%default].")

    parser.add_option("-b", "--backend", dest="backend", type="choice",
                      choices=("pg", "sqlite", "mysql"),
                      help="database backend to choose [default=%default].")

    parser.add_option("-i", "--index", dest="indices", type="string", action="append",
                      help="create an index for the named column [default=%default].")

    parser.add_option("-a", "--allow-empty", dest="allow_empty", action="store_true",
                      help="allow empty table [default=%default].")

    parser.add_option("--force-single", dest="force_single", action="store_true",
                      help="force upload line by line [default=%default].")

    parser.set_defaults(
        map=[],
        dialect="excel-tab",
        database="csvdb",
        lowercase=False,
        tablename="csv",
        from_file=False,
        ignore_duplicates=False,
        # ``--ignore-same`` stores into ``ignore_same``; it used to have no
        # default. ``ignore_identical`` is kept for code that reads it.
        ignore_same=False,
        ignore_identical=False,
        ignore_empty=False,
        insert_many=False,
        force_single=False,
        guess_size=1000,
        report_step=10000,
        backend="pg",
        indices=[],
        missing_values=("na", "NA", ),
        insert_quick=False,
        allow_empty=False,
    )

    (options, args) = E.Start(parser,
                              add_psql_options=True,
                              add_mysql_options=True)

    options.tablename = quoteTableName(options.tablename,
                                       backend=options.backend)

    # Convert the list of "column:type" strings into {column: type}.
    column_types = {}
    for entry in options.map:
        try:
            column, typename = entry.split(":")
        except ValueError:
            raise ValueError(
                "invalid --map entry `%s`, expected column:type" % entry)
        column_types[column] = typename
    options.map = column_types

    index_mangle = str
    if options.backend == "pg":
        # Reject unsupported requests before a connection is opened.
        if options.insert_quick:
            raise ValueError("quick import not implemented.")
        import pgdb
        dbhandle = pgdb.connect(options.psql_connection)
        error = pgdb.DatabaseError
        options.null = "NULL"
        options.string_value = "'%s'"

    elif options.backend == "sqlite":
        import sqlite3
        dbhandle = sqlite3.connect(options.database)
        error = sqlite3.OperationalError
        options.insert_many = not options.force_single
        # sqlite uses None and unquoted values here, unlike the other
        # backends, which use "NULL" and "'%s'".
        options.null = None
        options.string_value = "%s"

    elif options.backend == "mysql":
        import MySQLdb
        import _mysql
        error = (_mysql.OperationalError, _mysql.ProgrammingError)
        # Connect via TCP port if one was given, otherwise via unix socket.
        if options.port:
            dbhandle = MySQLdb.connect(host=options.host,
                                       user=options.user,
                                       passwd=options.password,
                                       db=options.database,
                                       port=options.port)
        else:
            dbhandle = MySQLdb.connect(host=options.host,
                                       user=options.user,
                                       passwd=options.password,
                                       db=options.database,
                                       unix_socket=options.socket)

        # "not options.force_single" fails with an error on this backend.
        options.insert_many = False
        options.null = "NULL"
        options.string_value = "'%s'"

        def index_mangle(name):
            # Replace every "." in the name with "_".
            return name.replace(".", "_")

    reader = CSV.DictReader(sys.stdin, dialect=options.dialect)

    # Collect at most ``guess_size`` successfully converted rows.
    rows = []
    for row in reader:

        try:
            rows.append(CSV.ConvertDictionary(row, map=options.map))
        except TypeError as msg:
            E.warn("incomplete line? Type error in conversion: '%s' with data: %s" % (msg, str(row)))

        if len(rows) >= options.guess_size:
            break
Esempio n. 5
0
def main():
    """Set up a csv-to-database import and read the first rows from stdin.

    Parses the command line through ``E.Start`` (including the psql and
    mysql option groups), quotes the table name with ``quoteTableName``,
    turns the ``--map`` entries (``column:type``) into a dictionary and
    opens a connection for the backend chosen with ``--backend`` (``pg``,
    ``sqlite`` or ``mysql``). Backend specific settings (``options.null``,
    ``options.string_value``, ``options.insert_many``, ``error``,
    ``index_mangle``) are prepared alongside the connection. Finally up to
    ``options.guess_size`` rows are read from stdin via ``CSV.DictReader``
    and converted with ``CSV.ConvertDictionary``; rows raising ``TypeError``
    during conversion are reported through ``E.warn`` and dropped.

    Raises:
        ValueError: if ``--quick`` is requested for the ``pg`` backend or a
            ``--map`` entry is not of the form ``column:type``.
    """

    parser = optparse.OptionParser(version="%prog version: $Id$", usage=USAGE)

    parser.add_option("--dialect",
                      dest="dialect",
                      type="string",
                      help="csv dialect to use [default=%default].")

    parser.add_option(
        "-m",
        "--map",
        dest="map",
        type="string",
        action="append",
        help=
        "explicit mapping function for columns The format is column:type (e.g.: length:int) [default=%default]."
    )

    parser.add_option("-t",
                      "--table",
                      dest="tablename",
                      type="string",
                      help="table name for all backends [default=%default].")

    parser.add_option("-d",
                      "--database",
                      dest="database",
                      type="string",
                      help="database name for sqlite3 [default=%default].")

    parser.add_option("-l",
                      "--lowercase",
                      dest="lowercase",
                      action="store_true",
                      help="force lower case column names [default=%default].")

    parser.add_option(
        "-u",
        "--ignore-duplicates",
        dest="ignore_duplicates",
        action="store_true",
        help="ignore columns with duplicate names [default=%default].")

    parser.add_option(
        "-s",
        "--ignore-same",
        dest="ignore_same",
        action="store_true",
        help="ignore columns with identical values [default=%default].")

    parser.add_option(
        "-e",
        "--ignore-empty",
        dest="ignore_empty",
        action="store_true",
        help="ignore columns which are all empty [default=%default].")

    parser.add_option(
        "-q",
        "--quick",
        dest="insert_quick",
        action="store_true",
        help=
        "try quick file based import - needs to be supported by the backend [default=%default]."
    )

    parser.add_option("-b",
                      "--backend",
                      dest="backend",
                      type="choice",
                      choices=("pg", "sqlite", "mysql"),
                      help="database backend to choose [default=%default].")

    parser.add_option(
        "-i",
        "--index",
        dest="indices",
        type="string",
        action="append",
        help="create an index for the named column [default=%default].")

    parser.add_option("-a",
                      "--allow-empty",
                      dest="allow_empty",
                      action="store_true",
                      help="allow empty table [default=%default].")

    parser.add_option("--force-single",
                      dest="force_single",
                      action="store_true",
                      help="force upload line by line [default=%default].")

    parser.set_defaults(
        map=[],
        dialect="excel-tab",
        database="csvdb",
        lowercase=False,
        tablename="csv",
        from_file=False,
        ignore_duplicates=False,
        # ``--ignore-same`` stores into ``ignore_same``; it used to have no
        # default. ``ignore_identical`` is kept for code that reads it.
        ignore_same=False,
        ignore_identical=False,
        ignore_empty=False,
        insert_many=False,
        force_single=False,
        guess_size=1000,
        report_step=10000,
        backend="pg",
        indices=[],
        missing_values=(
            "na",
            "NA",
        ),
        insert_quick=False,
        allow_empty=False,
    )

    (options, args) = E.Start(parser,
                              add_psql_options=True,
                              add_mysql_options=True)

    options.tablename = quoteTableName(options.tablename,
                                       backend=options.backend)

    # Convert the list of "column:type" strings into {column: type}.
    column_types = {}
    for entry in options.map:
        try:
            column, typename = entry.split(":")
        except ValueError:
            raise ValueError(
                "invalid --map entry `%s`, expected column:type" % entry)
        column_types[column] = typename
    options.map = column_types

    index_mangle = str
    if options.backend == "pg":
        # Reject unsupported requests before a connection is opened.
        if options.insert_quick:
            raise ValueError("quick import not implemented.")
        import pgdb
        dbhandle = pgdb.connect(options.psql_connection)
        error = pgdb.DatabaseError
        options.null = "NULL"
        options.string_value = "'%s'"

    elif options.backend == "sqlite":
        import sqlite3
        dbhandle = sqlite3.connect(options.database)
        error = sqlite3.OperationalError
        options.insert_many = not options.force_single
        # sqlite uses None and unquoted values here, unlike the other
        # backends, which use "NULL" and "'%s'".
        options.null = None
        options.string_value = "%s"

    elif options.backend == "mysql":
        import MySQLdb
        import _mysql
        error = (_mysql.OperationalError, _mysql.ProgrammingError)
        # Connect via TCP port if one was given, otherwise via unix socket.
        if options.port:
            dbhandle = MySQLdb.connect(host=options.host,
                                       user=options.user,
                                       passwd=options.password,
                                       db=options.database,
                                       port=options.port)
        else:
            dbhandle = MySQLdb.connect(host=options.host,
                                       user=options.user,
                                       passwd=options.password,
                                       db=options.database,
                                       unix_socket=options.socket)

        # "not options.force_single" fails with an error on this backend.
        options.insert_many = False
        options.null = "NULL"
        options.string_value = "'%s'"

        def index_mangle(name):
            # Replace every "." in the name with "_".
            return name.replace(".", "_")

    reader = CSV.DictReader(sys.stdin, dialect=options.dialect)

    # Collect at most ``guess_size`` successfully converted rows.
    rows = []
    for row in reader:

        try:
            rows.append(CSV.ConvertDictionary(row, map=options.map))
        except TypeError as msg:
            E.warn(
                "incomplete line? Type error in conversion: '%s' with data: %s"
                % (msg, str(row)))

        if len(rows) >= options.guess_size:
            break