def get_gtf(cur, species):
    """Print a flattened GTF CDS record for every coding sequence of a species.

    NOTE(review): an identical ``get_gtf`` is defined again immediately after
    this one in the file and shadows it; one of the two should be removed.

    Args:
        cur: DB-API style cursor exposing ``execute(query, params)`` and
            ``fetchall()``.
        species: Value matched against the ``species`` column of the
            ``CodingSequences`` table.

    Each row is expected to be ``(seqID, start_pos, end_pos, strand, geneID)``.
    The record is passed through ``flatten_GTF`` (defined elsewhere in this
    file) and the result is printed to stdout.
    """
    # The parameters must be a real sequence: "(species)" is just a
    # parenthesised string, which drivers that iterate over the parameters
    # would split into one parameter per character.
    cur.execute(
        "SELECT seqID, start_pos, end_pos, strand, geneID "
        "FROM CodingSequences WHERE species = %s",
        (species,))
    for row in cur.fetchall():
        gtf = {
            'seqname': row[0],
            'source': 'transdecoder',
            'feature': 'CDS',
            'start': row[1],
            'end': row[2],
            'strand': row[3],
            'frame': '.',
            'gene_id': row[4],
            # A single transcript per gene is assumed, hence the fixed ".1".
            'transcript_id': row[4] + '.1',
        }
        # Single-argument print(...) behaves identically on Python 2 and 3.
        print(flatten_GTF(gtf))
def get_gtf(cur, species):
    """Print one flattened GTF ``CDS`` line per CodingSequences row of a species.

    NOTE(review): this definition repeats (and therefore shadows) an identical
    ``get_gtf`` defined just before it in the file; the duplicate should be
    dropped.

    Args:
        cur: DB-API style cursor; only ``execute`` and ``fetchall`` are used.
        species: Species name used to filter the ``CodingSequences`` table.

    Rows are read as ``(seqID, start_pos, end_pos, strand, geneID)``, turned
    into a GTF attribute dict, handed to ``flatten_GTF`` (defined elsewhere in
    this file) and printed to stdout.
    """
    query = ("SELECT seqID, start_pos, end_pos, strand, geneID "
             "FROM CodingSequences WHERE species = %s")
    # Pass a one-element tuple; a bare "(species)" is only a string and is
    # not a valid parameter sequence for drivers that iterate over it.
    cur.execute(query, (species,))
    for seq_id, start_pos, end_pos, strand, gene_id in cur.fetchall():
        record = {
            'seqname': seq_id,
            'source': 'transdecoder',
            'feature': 'CDS',
            'start': start_pos,
            'end': end_pos,
            'strand': strand,
            'frame': '.',
            'gene_id': gene_id,
            # One transcript per gene is assumed, hence the constant suffix.
            'transcript_id': gene_id + '.1',
        }
        # Parenthesised single-argument print works on Python 2 and 3 alike.
        print(flatten_GTF(record))
def _parse_cli_options(argv):
    """Parse the command line; return ``(species, gtf_filename, seqfilename)``.

    Prints the usage string and exits with status 2 when the options are
    invalid or when no option at all is given; ``-h`` prints the usage string
    and exits normally. Options that are not supplied come back as ``''``.
    """
    usage = 'Blast2GTF.py -n <species_name> -g <gtf_outfile> -s <seq_file>'
    try:
        opts, _args = getopt.getopt(argv, "hn:g:s:", ["name=", "gtf=", "seqfile="])
        if not opts:
            # Running without any option is treated like a usage error.
            raise getopt.GetoptError('no opts')
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)

    species = ''
    gtf_filename = ''
    seqfilename = ''
    for opt, arg in opts:
        if opt == "-h":
            print(usage)
            sys.exit()
        elif opt in ("-n", "--name"):
            species = arg
        elif opt in ("-s", "--seqfile"):
            seqfilename = arg
        elif opt in ("-g", "--gtf"):
            gtf_filename = arg
    return species, gtf_filename, seqfilename


def _collect_blast_hits(cur, seqid):
    """Read the BLAST rows of one query and merge them per subject accession.

    Returns ``(hit_list, hit_order)``: ``hit_list`` maps subject accession to
    a GTF feature dict, ``hit_order`` lists the accessions in the order they
    were first seen. Exits the program if a row carries an unknown strand.
    """
    hit_list = {}
    hit_order = []
    cur.execute("SELECT * FROM BLAST WHERE qseqid=%s", (seqid,))
    # The full 19-column unpack is kept on purpose so that a change in the
    # BLAST table layout fails loudly instead of silently shifting columns.
    for (_row_id, qseqid, _qlen, sacc, _slen, _pident, _length, _mismatch,
         _gapopen, qstart, qend, _qframe, _sstart, _send, _sframe, _evalue,
         bitscore, strand, _hitnum) in cur.fetchall():
        feature = {
            'source': '1kp',
            'feature': 'blast_hit',
            'frame': '.',
            'seqname': qseqid,
            'score': float(bitscore),
            'start': int(qstart),
            'end': int(qend),
        }
        # Translate the stored strand flag into GTF notation.
        if strand == '1':
            feature['strand'] = '+'
        elif strand == '0':
            feature['strand'] = '-'
        else:
            sys.exit("Strand %s not recognized" % strand)

        # Several hits against the same subject are combined by taking the
        # widest span; all other fields stay those of the first hit seen.
        if sacc in hit_list:
            merged = hit_list[sacc]
            if merged['start'] > feature['start']:
                merged['start'] = feature['start']
            if merged['end'] < feature['end']:
                merged['end'] = feature['end']
        else:
            hit_order.append(sacc)
            hit_list[sacc] = feature
    return hit_list, hit_order


def _unique_gene_name(name, name_list):
    """Return ``name`` made unique against ``name_list`` and register it.

    On a clash the last ``_``-separated field is replaced by 1, 2, 3, ...
    until the name is unused.
    """
    suffix = 1
    while name in name_list:
        name = "_".join(name.split("_")[0:-1] + [str(suffix)])
        suffix += 1
    name_list[name] = 1
    return name


def _refine_cds_to_orf(feature, seq, seq_record):
    """Replace the hit coordinates of ``feature`` by those from get_orf_coords.

    Updates ``start``, ``end`` and ``frame`` in place and adds a ``note`` when
    ``orf_integrity`` returns something truthy for the resulting sequence.
    ``get_orf_coords`` and ``orf_integrity`` are defined elsewhere in this
    file; presumably they extend the hit to the enclosing ORF and report
    problems with it -- TODO confirm against their definitions.

    NOTE(review): ``frame`` is computed as ``x % 3 + 1`` (values 1-3), whereas
    the GTF frame column conventionally holds 0-2 -- confirm this is intended.
    """
    seq_len = len(seq_record)
    if feature['strand'] == '+':
        (feature['start'], feature['end']) = get_orf_coords(
            seq_record, feature['start'], feature['end'])
        feature['frame'] = feature['start'] % 3 + 1
        orf = Seq(seq[feature['start'] - 1:feature['end']])
    else:
        # Work on the reverse complement, so mirror the coordinates first.
        # The merged coordinates of *this* feature are used here; the
        # variables of the BLAST-row loop would only describe the last row
        # read for the query.
        (orf_start, orf_end) = get_orf_coords(
            seq_record.reverse_complement(),
            seq_len - feature['end'] + 1,
            seq_len - feature['start'] + 1)
        # Mirror the ORF back onto forward-strand coordinates.
        (feature['start'], feature['end']) = (
            seq_len - orf_end + 1, seq_len - orf_start + 1)
        feature['frame'] = (seq_len - feature['end'] + 1) % 3 + 1
        orf = Seq(seq[feature['start'] - 1:feature['end']]).reverse_complement()

    note = orf_integrity(orf)
    if note:
        feature['note'] = note


def main(argv):
    """Export BLAST-derived CDS features and/or sequences for one species.

    Args:
        argv: Command-line arguments without the program name. Recognised
            options are ``-n/--name`` (species), ``-g/--gtf`` (GTF output
            file) and ``-s/--seqfile`` (sequence output file); ``-h`` prints
            the usage string.

    For every sequence of the species found in the database, the BLAST hits
    are merged per subject; each hit that does not overlap an earlier one is
    written to the GTF file as a CDS feature. If a sequence file was
    requested, each sequence is additionally written as a two-line FASTA
    record.

    Exits with status 2 on bad or missing options, and with an error message
    when a BLAST row carries an unknown strand value.
    """
    species, gtf_filename, seqfilename = _parse_cli_options(argv)

    con = mdb.connect('localhost', 'root', '', 'Selaginella')
    with con:
        # Names handed out so far, used to keep gene ids unique.
        name_list = {}

        # Dictionary of cluster membership (it would be far better to put
        # this info into the db, but this is working).
        cluster_info = path.join(path.expanduser("~"), "Bioinformatics",
                                 "Selaginella", "RefSeq", "SeqClusters.p")
        # NOTE: unpickling can execute arbitrary code; this assumes the file
        # is a trusted, locally generated artifact.
        with open(cluster_info, "rb") as cluster_file:
            seq_groups = pickle.load(cluster_file)

        outfile = None
        seqfile = None
        gtf_writer = None
        try:
            if gtf_filename:
                outfile = open(gtf_filename, 'wb')
                gtf_writer = csv.writer(outfile, delimiter='\t', quotechar='',
                                        quoting=csv.QUOTE_NONE)
            if seqfilename:
                seqfile = open(seqfilename, 'wb')

            cur = con.cursor()
            # The parameters must be a sequence, hence the one-element tuple.
            cur.execute(
                "SELECT a.seqid, b.sequence FROM Species a, Ortholog_groups b "
                "WHERE a.seqid = b.seqid AND a.species= %s",
                (species,))
            for (seqid, seq) in cur.fetchall():
                try:
                    seq_record = SeqRecord(Seq(seq))
                except TypeError:
                    warnings.warn("Can't create seq object from %s for %s" % (seq, seqid))
                    continue

                if gtf_filename:
                    hit_list, hit_order = _collect_blast_hits(cur, seqid)

                    # Scan through hits in order and write CDS features for
                    # the ones that overlap no earlier hit.
                    for index, sacc in enumerate(hit_order):
                        feature = hit_list[sacc]
                        overlap = 0
                        for earlier in hit_order[:index]:
                            overlap = max(overlap,
                                          hit_overlap(feature, hit_list[earlier]))
                        if overlap:
                            continue

                        if sacc in seq_groups:
                            query_name = feature['seqname']
                            name = "%s_c%s_0" % (
                                query_name[0:query_name.find('comp')],
                                seq_groups[sacc].split("_")[1])
                        else:
                            name = sacc + "_0"

                        feature['gene_id'] = _unique_gene_name(name, name_list)
                        feature['transcript_id'] = feature['gene_id'] + '.1'
                        feature['feature'] = 'CDS'
                        feature['score'] = '.'
                        _refine_cds_to_orf(feature, seq, seq_record)
                        gtf_writer.writerow(flatten_GTF(feature))

                if seqfilename:
                    seqfile.write(">%s\n%s\n" % (seqid, seq))
        finally:
            # Close (and thereby flush) the outputs even when the run aborts.
            for handle in (outfile, seqfile):
                if handle is not None:
                    handle.close()