def main():
    """Convert domain records read from stdin into a tab-separated table.

    Records are obtained from ``record_iterator(sys.stdin)``.  The first
    entry of a record is matched against a header pattern to obtain the
    accession; every further entry is matched against a domain pattern to
    obtain the family and its coordinates.  For each ``start-end``
    coordinate one row ``nid, start, end, family`` is written to
    ``options.stdout``, with ``start`` decremented by one (presumably
    converting to 0-based coordinates -- confirm with downstream users).

    Options:
        --no-swissprot-version  keep only the accession part before the
                                first ``.``.
        --no-pfam-version       keep only the family part before the
                                first ``.``.
        --prefix                string prepended to every output row, i.e.
                                to the identifier column.
    """
    parser = optparse.OptionParser(version="%prog version: $Id$",
                                   usage=USAGE)

    parser.add_option("--no-swissprot-version", dest="no_swissprot_version",
                      action="store_true",
                      help="remove swissprot version information [%default]")

    parser.add_option("--no-pfam-version", dest="no_pfam_version",
                      action="store_true",
                      help="remove pfam version information [%default]")

    parser.add_option("--prefix", dest="prefix", type="string",
                      help="add prefix to id [%default]")

    parser.set_defaults(no_swissprot_version=False,
                        no_pfam_version=False,
                        prefix="")

    (options, args) = E.Start(parser)

    # Raw strings keep the backslash escapes intact for the regex engine.
    rx_head = re.compile(r">(\S+)\s+\S+\| (\S+) (\d+) a.a.")
    # NOTE(review): the trailing "(.*) (.*)" is greedy, so the last group
    # only holds the text after the final space of the line -- confirm
    # whether lines with several space-separated coordinates are expected.
    rx_domain = re.compile(r"(\S+) .* (PF\d+.\d+) (.*) (.*)")

    options.stdout.write("nid\tstart\tend\tfamily\n")

    ninput, noutput, ndomains, nskipped = 0, 0, 0, 0

    for record in record_iterator(sys.stdin):
        ninput += 1

        header = rx_head.match(record[0])
        if header is None:
            E.warn("parsing error in line `%s`" % record[0])
            nskipped += 1
            continue

        # Groups are (identifier, accession, length); only the accession
        # is used for the output.
        acc = header.group(2)
        if options.no_swissprot_version:
            acc = acc.split(".")[0]

        for line in record[1:]:
            # no Pfam-B
            if line.startswith("Pfam-B"):
                continue

            domain = rx_domain.match(line)
            if domain is None:
                # Treat an unparsable domain line like an unparsable
                # header: report it and carry on with the next line.
                E.warn("parsing error in line `%s`" % line)
                continue

            # Groups are (name, family, description, coordinates).
            family, coordinates = domain.group(2, 4)
            if options.no_pfam_version:
                family = family.split(".")[0]

            for c in coordinates.split(" "):
                start, end = [int(x) for x in c.split("-")]
                start -= 1
                options.stdout.write(
                    options.prefix +
                    "\t".join(map(str, (acc, start, end, family))) + "\n")
                ndomains += 1

        noutput += 1
def main():
    """Convert domain records read from stdin into a tab-separated table.

    Records are obtained from ``record_iterator(sys.stdin)``.  The first
    entry of a record is matched against a header pattern to obtain the
    accession; every further entry is matched against a domain pattern to
    obtain the family and its coordinates.  For each ``start-end``
    coordinate one row ``nid, start, end, family`` is written to
    ``options.stdout``, with ``start`` decremented by one (presumably
    converting to 0-based coordinates -- confirm with downstream users).

    Options:
        --no-swissprot-version  keep only the accession part before the
                                first ``.``.
        --no-pfam-version       keep only the family part before the
                                first ``.``.
        --prefix                string prepended to every output row, i.e.
                                to the identifier column.
    """
    parser = optparse.OptionParser(version="%prog version: $Id$",
                                   usage=USAGE)

    parser.add_option("--no-swissprot-version", dest="no_swissprot_version",
                      action="store_true",
                      help="remove swissprot version information [%default]")

    parser.add_option("--no-pfam-version", dest="no_pfam_version",
                      action="store_true",
                      help="remove pfam version information [%default]")

    parser.add_option("--prefix", dest="prefix", type="string",
                      help="add prefix to id [%default]")

    parser.set_defaults(no_swissprot_version=False,
                        no_pfam_version=False,
                        prefix="")

    (options, args) = E.Start(parser)

    # Raw strings keep the backslash escapes intact for the regex engine.
    rx_head = re.compile(r">(\S+)\s+\S+\| (\S+) (\d+) a.a.")
    # NOTE(review): the trailing "(.*) (.*)" is greedy, so the last group
    # only holds the text after the final space of the line -- confirm
    # whether lines with several space-separated coordinates are expected.
    rx_domain = re.compile(r"(\S+) .* (PF\d+.\d+) (.*) (.*)")

    options.stdout.write("nid\tstart\tend\tfamily\n")

    ninput, noutput, ndomains, nskipped = 0, 0, 0, 0

    for record in record_iterator(sys.stdin):
        ninput += 1

        header = rx_head.match(record[0])
        if header is None:
            E.warn("parsing error in line `%s`" % record[0])
            nskipped += 1
            continue

        # Groups are (identifier, accession, length); only the accession
        # is used for the output.
        acc = header.group(2)
        if options.no_swissprot_version:
            acc = acc.split(".")[0]

        for line in record[1:]:
            # no Pfam-B
            if line.startswith("Pfam-B"):
                continue

            domain = rx_domain.match(line)
            if domain is None:
                # Treat an unparsable domain line like an unparsable
                # header: report it and carry on with the next line.
                E.warn("parsing error in line `%s`" % line)
                continue

            # Groups are (name, family, description, coordinates).
            family, coordinates = domain.group(2, 4)
            if options.no_pfam_version:
                family = family.split(".")[0]

            for c in coordinates.split(" "):
                start, end = [int(x) for x in c.split("-")]
                start -= 1
                options.stdout.write(
                    options.prefix +
                    "\t".join(map(str, (acc, start, end, family))) + "\n")
                ndomains += 1

        noutput += 1
def main():
    """Translate identifiers in selected columns of a tab-separated stream.

    A mapping is loaded from ``options.filename_nids`` through
    ``AddaIO.readMapPid2Nid`` (optionally inverted with ``--invert``).
    Lines from ``options.stdin`` are then processed as follows:

    * lines starting with ``#`` are copied to ``options.stdout`` unchanged;
    * unless ``--no-header`` is given, the first remaining line is copied
      unchanged as a header;
    * for every other line, each selected column (1-based on the command
      line, first column if none is given) is replaced by its mapped
      value.  With ``--is-domains`` the field is parsed with
      ``AddaIO.toTuple``, its first element is mapped, and the result is
      re-assembled with ``AddaIO.toDomain``.

    A line in which any selected column is missing, cannot be parsed or
    cannot be mapped is reported with a warning, counted as skipped and
    not written.  Input, output and skipped counts are logged at the end.
    """
    parser = optparse.OptionParser(version="%prog version: $Id$",
                                   usage=globals()["__doc__"])

    parser.add_option("-n", "--nids", dest="filename_nids", type="string",
                      help="filename with nids[default=%default].")

    parser.add_option("-c", "--column", dest="columns", type="int",
                      action="append",
                      help="columns with nids to translate (1-based) [default=%default].")

    parser.add_option("-d", "--is-domains", dest="is_domains",
                      action="store_true",
                      help="translate domain ids [default=%default].")

    parser.add_option("-i", "--invert", dest="invert", action="store_true",
                      help="invert mapping [default=%default].")

    parser.add_option("-e", "--no-header", dest="no_header",
                      action="store_true",
                      help="file has no header [default=%default].")

    # The default key has to match the option's ``dest`` (``no_header``).
    parser.set_defaults(
        filename_nids="adda.nids",
        columns=[],
        is_domains=False,
        invert=False,
        no_header=False,
    )

    (options, args) = E.Start(parser)

    # Close the mapping file as soon as it has been read.
    with open(options.filename_nids, "r") as infile:
        map_nid2pid = AddaIO.readMapPid2Nid(infile)

    if options.invert:
        E.info("inverting mapping")
        map_nid2pid = dict((int(value), str(key))
                           for key, value in map_nid2pid.items())

    if not options.columns:
        options.columns = [1]
    # Command-line columns are 1-based; list indices are 0-based.
    columns = [x - 1 for x in options.columns]

    toTuple, toDomain = AddaIO.toTuple, AddaIO.toDomain
    first = not options.no_header
    is_domains = options.is_domains
    ninput, noutput, nskipped = 0, 0, 0

    for line in options.stdin:
        if line.startswith("#"):
            options.stdout.write(line)
            continue

        if first:
            options.stdout.write(line)
            first = False
            continue

        ninput += 1

        # Only strip the line terminator so that a final line without a
        # trailing newline keeps its last character.
        data = line.rstrip("\n").split("\t")

        for x in columns:
            # Fetch the field once so that the warnings below never have
            # to index a column that does not exist.
            try:
                field = data[x]
            except IndexError:
                E.warn("could not find column %i in line `%s`" %
                       (x + 1, line.rstrip("\n")))
                nskipped += 1
                break

            if is_domains:
                try:
                    d = toTuple(field)
                except ValueError:
                    E.warn("could not parse domain `%s`" % field)
                    nskipped += 1
                    break

                try:
                    data[x] = toDomain((str(map_nid2pid[d[0]]), d[1], d[2]))
                except (IndexError, KeyError):
                    E.warn("could not map domain `%s`" % field)
                    nskipped += 1
                    break
            else:
                try:
                    data[x] = str(map_nid2pid[int(field)])
                except (IndexError, KeyError, ValueError):
                    # KeyError: nid not in the mapping;
                    # ValueError: field is not an integer.
                    E.warn("could not map nid `%s`" % field)
                    nskipped += 1
                    break
        else:
            # Only reached when every selected column was translated.
            options.stdout.write("%s\n" % "\t".join(data))
            noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))
    E.Stop()
def main():
    """Set up a database connection and read the first rows of a CSV stream.

    The visible part of this function

    1. parses the command line (CSV dialect, column type map, table and
       database names, backend choice and several import flags);
    2. quotes the table name for the chosen backend via ``quoteTableName``;
    3. turns the ``column:type`` strings given with ``--map`` into a dict;
    4. opens a connection for the selected backend (``pg``, ``sqlite`` or
       ``mysql``) and records backend specific settings on ``options``
       (``null``, ``string_value``, ``insert_many``);
    5. reads rows from stdin with ``CSV.DictReader`` and converts them with
       ``CSV.ConvertDictionary`` until ``options.guess_size`` rows have
       been collected.  Rows raising ``TypeError`` during conversion are
       reported with a warning and dropped.

    Raises:
        ValueError: if ``--quick`` is requested for the ``pg`` backend, or
            if a ``--map`` entry does not split into exactly two parts on
            ``:``.

    NOTE(review): ``dbhandle``, ``error``, ``index_mangle`` and ``rows`` are
    not consumed in the part of the function shown here -- presumably they
    are used by code that follows this excerpt.
    """
    parser = optparse.OptionParser(version="%prog version: $Id$",
                                   usage=USAGE)

    parser.add_option("--dialect", dest="dialect", type="string",
                      help="csv dialect to use [default=%default].")

    parser.add_option("-m", "--map", dest="map", type="string",
                      action="append",
                      help="explicit mapping function for columns The format is column:type (e.g.: length:int) [default=%default].")

    parser.add_option("-t", "--table", dest="tablename", type="string",
                      help="table name for all backends [default=%default].")

    parser.add_option("-d", "--database", dest="database", type="string",
                      help="database name for sqlite3 [default=%default].")

    parser.add_option("-l", "--lowercase", dest="lowercase",
                      action="store_true",
                      help="force lower case column names [default=%default].")

    parser.add_option("-u", "--ignore-duplicates", dest="ignore_duplicates",
                      action="store_true",
                      help="ignore columns with duplicate names [default=%default].")

    parser.add_option("-s", "--ignore-same", dest="ignore_same",
                      action="store_true",
                      help="ignore columns with identical values [default=%default].")

    parser.add_option("-e", "--ignore-empty", dest="ignore_empty",
                      action="store_true",
                      help="ignore columns which are all empty [default=%default].")

    parser.add_option("-q", "--quick", dest="insert_quick",
                      action="store_true",
                      help="try quick file based import - needs to be supported by the backend [default=%default].")

    parser.add_option("-b", "--backend", dest="backend", type="choice",
                      choices=("pg", "sqlite", "mysql"),
                      help="database backend to choose [default=%default].")

    parser.add_option("-i", "--index", dest="indices", type="string",
                      action="append",
                      help="create an index for the named column [default=%default].")

    parser.add_option("-a", "--allow-empty", dest="allow_empty",
                      action="store_true",
                      help="allow empty table [default=%default].")

    parser.add_option("--force-single", dest="force_single",
                      action="store_true",
                      help="force upload line by line [default=%default].")

    parser.set_defaults(
        map=[],
        dialect="excel-tab",
        database="csvdb",
        lowercase=False,
        tablename="csv",
        from_file=False,
        ignore_duplicates=False,
        # ``ignore_same`` is the dest of -s/--ignore-same;
        # ``ignore_identical`` is kept for code that may still read it.
        ignore_same=False,
        ignore_identical=False,
        ignore_empty=False,
        insert_many=False,
        force_single=False,
        guess_size=1000,
        report_step=10000,
        backend="pg",
        indices=[],
        missing_values=("na", "NA", ),
        insert_quick=False,
        allow_empty=False,
    )

    (options, args) = E.Start(parser,
                              add_psql_options=True,
                              add_mysql_options=True)

    options.tablename = quoteTableName(options.tablename,
                                       backend=options.backend)

    # Convert the list of "column:type" strings into {column: type}.
    column_types = {}
    for x in options.map or []:
        f, t = x.split(":")
        column_types[f] = t
    options.map = column_types

    index_mangle = str
    if options.backend == "pg":
        import pgdb
        dbhandle = pgdb.connect(options.psql_connection)
        error = pgdb.DatabaseError
        options.null = "NULL"
        options.string_value = "'%s'"
        if options.insert_quick:
            raise ValueError("quick import not implemented.")

    elif options.backend == "sqlite":
        import sqlite3
        dbhandle = sqlite3.connect(options.database)
        error = sqlite3.OperationalError
        options.insert_many = not options.force_single
        options.null = None  # "NULL"
        options.string_value = "%s"  # "'%s'"

    elif options.backend == "mysql":
        import MySQLdb
        import _mysql
        error = (_mysql.OperationalError, _mysql.ProgrammingError)
        # Connect through the TCP port when one is given, otherwise
        # through the unix socket.
        connect_args = dict(host=options.host,
                            user=options.user,
                            passwd=options.password,
                            db=options.database)
        if options.port:
            connect_args["port"] = options.port
        else:
            connect_args["unix_socket"] = options.socket
        dbhandle = MySQLdb.connect(**connect_args)
        options.insert_many = False  # not options.force_single, fails with error
        options.null = "NULL"
        options.string_value = "'%s'"
        # Replace "." by "_" in names passed through index_mangle.
        index_mangle = lambda x: re.sub("[.]", "_", x)

    reader = CSV.DictReader(sys.stdin, dialect=options.dialect)

    rows = []
    for row in reader:
        try:
            rows.append(CSV.ConvertDictionary(row, map=options.map))
        except TypeError as msg:
            E.warn("incomplete line? Type error in conversion: '%s' with data: %s" %
                   (msg, str(row)))

        # Stop once enough rows have been collected.
        if len(rows) >= options.guess_size:
            break
def main():
    """Set up a database connection and read the first rows of a CSV stream.

    The visible part of this function

    1. parses the command line (CSV dialect, column type map, table and
       database names, backend choice and several import flags);
    2. quotes the table name for the chosen backend via ``quoteTableName``;
    3. turns the ``column:type`` strings given with ``--map`` into a dict;
    4. opens a connection for the selected backend (``pg``, ``sqlite`` or
       ``mysql``) and records backend specific settings on ``options``
       (``null``, ``string_value``, ``insert_many``);
    5. reads rows from stdin with ``CSV.DictReader`` and converts them with
       ``CSV.ConvertDictionary`` until ``options.guess_size`` rows have
       been collected.  Rows raising ``TypeError`` during conversion are
       reported with a warning and dropped.

    Raises:
        ValueError: if ``--quick`` is requested for the ``pg`` backend, or
            if a ``--map`` entry does not split into exactly two parts on
            ``:``.

    NOTE(review): ``dbhandle``, ``error``, ``index_mangle`` and ``rows`` are
    not consumed in the part of the function shown here -- presumably they
    are used by code that follows this excerpt.
    """
    parser = optparse.OptionParser(version="%prog version: $Id$",
                                   usage=USAGE)

    parser.add_option("--dialect", dest="dialect", type="string",
                      help="csv dialect to use [default=%default].")

    parser.add_option("-m", "--map", dest="map", type="string",
                      action="append",
                      help="explicit mapping function for columns The format is column:type (e.g.: length:int) [default=%default].")

    parser.add_option("-t", "--table", dest="tablename", type="string",
                      help="table name for all backends [default=%default].")

    parser.add_option("-d", "--database", dest="database", type="string",
                      help="database name for sqlite3 [default=%default].")

    parser.add_option("-l", "--lowercase", dest="lowercase",
                      action="store_true",
                      help="force lower case column names [default=%default].")

    parser.add_option("-u", "--ignore-duplicates", dest="ignore_duplicates",
                      action="store_true",
                      help="ignore columns with duplicate names [default=%default].")

    parser.add_option("-s", "--ignore-same", dest="ignore_same",
                      action="store_true",
                      help="ignore columns with identical values [default=%default].")

    parser.add_option("-e", "--ignore-empty", dest="ignore_empty",
                      action="store_true",
                      help="ignore columns which are all empty [default=%default].")

    parser.add_option("-q", "--quick", dest="insert_quick",
                      action="store_true",
                      help="try quick file based import - needs to be supported by the backend [default=%default].")

    parser.add_option("-b", "--backend", dest="backend", type="choice",
                      choices=("pg", "sqlite", "mysql"),
                      help="database backend to choose [default=%default].")

    parser.add_option("-i", "--index", dest="indices", type="string",
                      action="append",
                      help="create an index for the named column [default=%default].")

    parser.add_option("-a", "--allow-empty", dest="allow_empty",
                      action="store_true",
                      help="allow empty table [default=%default].")

    parser.add_option("--force-single", dest="force_single",
                      action="store_true",
                      help="force upload line by line [default=%default].")

    parser.set_defaults(
        map=[],
        dialect="excel-tab",
        database="csvdb",
        lowercase=False,
        tablename="csv",
        from_file=False,
        ignore_duplicates=False,
        # ``ignore_same`` is the dest of -s/--ignore-same;
        # ``ignore_identical`` is kept for code that may still read it.
        ignore_same=False,
        ignore_identical=False,
        ignore_empty=False,
        insert_many=False,
        force_single=False,
        guess_size=1000,
        report_step=10000,
        backend="pg",
        indices=[],
        missing_values=("na", "NA", ),
        insert_quick=False,
        allow_empty=False,
    )

    (options, args) = E.Start(parser,
                              add_psql_options=True,
                              add_mysql_options=True)

    options.tablename = quoteTableName(options.tablename,
                                       backend=options.backend)

    # Convert the list of "column:type" strings into {column: type}.
    column_types = {}
    for x in options.map or []:
        f, t = x.split(":")
        column_types[f] = t
    options.map = column_types

    index_mangle = str
    if options.backend == "pg":
        import pgdb
        dbhandle = pgdb.connect(options.psql_connection)
        error = pgdb.DatabaseError
        options.null = "NULL"
        options.string_value = "'%s'"
        if options.insert_quick:
            raise ValueError("quick import not implemented.")

    elif options.backend == "sqlite":
        import sqlite3
        dbhandle = sqlite3.connect(options.database)
        error = sqlite3.OperationalError
        options.insert_many = not options.force_single
        options.null = None  # "NULL"
        options.string_value = "%s"  # "'%s'"

    elif options.backend == "mysql":
        import MySQLdb
        import _mysql
        error = (_mysql.OperationalError, _mysql.ProgrammingError)
        # Connect through the TCP port when one is given, otherwise
        # through the unix socket.
        connect_args = dict(host=options.host,
                            user=options.user,
                            passwd=options.password,
                            db=options.database)
        if options.port:
            connect_args["port"] = options.port
        else:
            connect_args["unix_socket"] = options.socket
        dbhandle = MySQLdb.connect(**connect_args)
        options.insert_many = False  # not options.force_single, fails with error
        options.null = "NULL"
        options.string_value = "'%s'"
        # Replace "." by "_" in names passed through index_mangle.
        index_mangle = lambda x: re.sub("[.]", "_", x)

    reader = CSV.DictReader(sys.stdin, dialect=options.dialect)

    rows = []
    for row in reader:
        try:
            rows.append(CSV.ConvertDictionary(row, map=options.map))
        except TypeError as msg:
            E.warn("incomplete line? Type error in conversion: '%s' with data: %s" %
                   (msg, str(row)))

        # Stop once enough rows have been collected.
        if len(rows) >= options.guess_size:
            break