예제 #1
0
def get_gtf(cur, species):
  """Print one GTF record for every coding sequence stored for a species.

  Selects seqID, start_pos, end_pos, strand and geneID from the
  CodingSequences table for the given species, builds a feature dictionary
  for each row (source 'transdecoder', feature 'CDS', frame '.') and prints
  whatever flatten_GTF() returns for it.

  Args:
    cur: database cursor exposing execute() and fetchall().
    species: value bound to the ``species`` placeholder of the query.
  """
  # The parameters must be a sequence. "(species)" is only a parenthesised
  # string, which drivers that iterate over the arguments would split into
  # single characters, so a one-element tuple is passed.
  cur.execute(
      "SELECT seqID, start_pos, end_pos, strand, geneID FROM CodingSequences WHERE species = %s",
      (species,))
  for seq_id, start_pos, end_pos, strand, gene_id in cur.fetchall():
    gtf = {
        'seqname': seq_id,
        'source': 'transdecoder',
        'feature': 'CDS',
        'start': start_pos,
        'end': end_pos,
        'strand': strand,
        'frame': '.',
        'gene_id': gene_id,
        # One transcript per gene: the transcript id is the gene id + '.1'.
        'transcript_id': gene_id + '.1',
    }
    # Single-argument parenthesised form prints the same text under the
    # Python 2 print statement and the print() function.
    print(flatten_GTF(gtf))
예제 #2
0
def get_gtf(cur, species):
    """Print a GTF line for each coding sequence recorded for ``species``.

    Runs a SELECT on the CodingSequences table (seqID, start_pos, end_pos,
    strand, geneID), turns every returned row into a 'CDS' feature with
    source 'transdecoder' and frame '.', and prints the value returned by
    flatten_GTF() for that feature.

    Args:
        cur: database cursor exposing execute() and fetchall().
        species: value substituted for the query's ``%s`` placeholder.
    """
    query = ("SELECT seqID, start_pos, end_pos, strand, geneID "
             "FROM CodingSequences WHERE species = %s")
    # A one-element tuple, not "(species)": the latter is just the string
    # itself and is not a valid parameter sequence for drivers that iterate
    # over the arguments.
    cur.execute(query, (species,))
    for row in cur.fetchall():
        gene_id = row[4]
        gtf = {
            'seqname': row[0],
            'source': 'transdecoder',
            'feature': 'CDS',
            'start': row[1],
            'end': row[2],
            'strand': row[3],
            'frame': '.',
            'gene_id': gene_id,
            # The transcript id is derived from the gene id by adding '.1'.
            'transcript_id': gene_id + '.1',
        }
        # print(...) with one argument is equivalent to the Python 2 print
        # statement and also valid as a function call.
        print(flatten_GTF(gtf))
예제 #3
0
파일: Blast2GTF.py 프로젝트: hobrien/Python
def main(argv):
  """Write BLAST-derived CDS features (GTF) and/or sequences for one species.

  Options (see ``usage``):
    -n / --name     species used to select sequences from the database
    -g / --gtf      GTF file to write (optional)
    -s / --seqfile  sequence file to write (optional)
    -h              print the usage string and exit

  For every sequence of the species, the BLAST hits of that sequence are
  read, hits against the same subject are merged into one interval, and each
  merged hit that does not overlap an earlier one is written as a CDS row
  whose coordinates come from get_orf_coords(). When a sequence file is
  requested, each sequence is also written as a ">id" line followed by the
  sequence.

  Args:
    argv: command line arguments without the program name.

  Exits with status 2 after printing the usage string when option parsing
  fails or no option is given, and exits through sys.exit() with a message
  when a BLAST row carries a strand other than '1' or '0'.
  """
  gtf_filename = ''
  species = ''
  seqfilename = ''
  usage = 'Blast2GTF.py -n <species_name> -g <gtf_outfile> -s <seq_file>'
  try:
    opts, args = getopt.getopt(argv, "hn:g:s:", ["name=", "gtf=", "seqfile="])
    if not opts:
      raise getopt.GetoptError('no opts')
  except getopt.GetoptError:
    print(usage)
    sys.exit(2)
  for opt, arg in opts:
    if opt == "-h":
      print(usage)
      sys.exit()
    elif opt in ("-n", "--name"):
      species = arg
    elif opt in ("-s", "--seqfile"):
      seqfilename = arg
    elif opt in ("-g", "--gtf"):
      gtf_filename = arg

  con = mdb.connect('localhost', 'root', '', 'Selaginella')
  outfile = None
  seqfile = None
  try:
    with con:
      # Names already handed out, so that no gene_id is written twice.
      used_names = set()

      # Read the dictionary of cluster membership (it would be far better to
      # put this info into the db, but this is working).
      # NOTE(review): pickle.load executes arbitrary code from the file; this
      # is only acceptable while SeqClusters.p is a trusted local file.
      cluster_info = path.join(path.expanduser("~"), "Bioinformatics", "Selaginella", "RefSeq", "SeqClusters.p")
      with open(cluster_info, "rb") as cluster_file:
        seq_groups = pickle.load(cluster_file)

      if gtf_filename:
        outfile = open(gtf_filename, 'wb')
        gtf_writer = csv.writer(outfile, delimiter='\t', quotechar='', quoting=csv.QUOTE_NONE)

      if seqfilename:
        seqfile = open(seqfilename, 'wb')

      cur = con.cursor()
      # Parameters are passed as a one-element tuple; "(species)" would be a
      # plain string rather than a parameter sequence.
      cur.execute("SELECT a.seqid, b.sequence FROM Species a, Ortholog_groups b WHERE a.seqid = b.seqid AND a.species= %s", (species,))
      rows = cur.fetchall()
      for (seqid, seq) in rows:
        try:
          seq_record = SeqRecord(Seq(seq))
        except TypeError:
          warnings.warn("Can't create seq object from %s for %s" % (seq, seqid))
          continue
        if gtf_filename:       # Write blast info to GTF file
          hit_list = {}        # subject accession -> merged feature dict
          hit_order = []       # subject accessions in order of first appearance
          cur.execute("SELECT * FROM BLAST WHERE qseqid=%s", (seqid,))
          for (_row_id, qseqid, qlen, sacc, slen, pident, length, mismatch, gapopen, qstart, qend, qframe, sstart, send, sframe, evalue, bitscore, strand, hitnum) in cur.fetchall():
            feature = {
              'source': '1kp',
              'feature': 'blast_hit',
              'frame': '.',
              'seqname': qseqid,
              'score': float(bitscore),
              'start': int(qstart),
              'end': int(qend),
            }
            # Translate the stored strand flag into the GTF strand symbol.
            if strand == '1':
              feature['strand'] = '+'
            elif strand == '0':
              feature['strand'] = '-'
            else:
              sys.exit("Strand %s not recognized" % strand)

            # In cases where there are multiple hits to the same subject,
            # combine them by maximizing coordinates. The strand and score
            # of the first hit are kept.
            if sacc in hit_list:
              merged = hit_list[sacc]
              if merged['start'] > feature['start']:
                merged['start'] = feature['start']
              if merged['end'] < feature['end']:
                merged['end'] = feature['end']
            else:
              hit_order.append(sacc)
              hit_list[sacc] = feature

          # Scan through hits in order and write CDS features for the ones
          # that do not overlap any earlier hit.
          for i, sacc in enumerate(hit_order):
            feature = hit_list[sacc]
            overlap = 0
            for earlier in hit_order[:i]:
              overlap = max(overlap, hit_overlap(feature, hit_list[earlier]))
            if overlap:
              continue

            # The query name is read from the feature itself rather than
            # from a variable left over from the row loop above.
            query_name = feature['seqname']
            if sacc in seq_groups:
              name = "%s_c%s_0" % (query_name[0:query_name.find('comp')], seq_groups[sacc].split("_")[1])
            else:
              name = sacc + "_0"
            # Replace the numeric suffix with 1, 2, ... until the name is unused.
            name_num = 1
            while name in used_names:
              name = "_".join(name.split("_")[0:-1] + [str(name_num)])
              name_num = name_num + 1
            used_names.add(name)

            feature['gene_id'] = name
            feature['transcript_id'] = feature['gene_id'] + '.1'
            feature['feature'] = 'CDS'
            feature['score'] = '.'
            if feature['strand'] == '+':
              (feature['start'], feature['end']) = get_orf_coords(seq_record, feature['start'], feature['end'])
              feature['frame'] = feature['start'] % 3 + 1
              note = orf_integrity(Seq(seq[feature['start']-1:feature['end']]))
              if note:
                feature['note'] = note
            else:
              seq_len = len(seq_record)
              # Mirror the merged hit coordinates onto the reverse complement.
              # These must be this feature's own start/end: the qstart/qend
              # loop variables only hold the values of the last BLAST row.
              (orf_start, orf_end) = get_orf_coords(seq_record.reverse_complement(), seq_len - feature['end'] + 1, seq_len - feature['start'] + 1)
              # Map the ORF found on the reverse complement back to
              # coordinates on the original sequence.
              (feature['start'], feature['end']) = (seq_len - orf_end + 1, seq_len - orf_start + 1)
              feature['frame'] = (seq_len - feature['end'] + 1) % 3 + 1
              orf_rev = Seq(seq[feature['start']-1:feature['end']])
              note = orf_integrity(orf_rev.reverse_complement())
              if note:
                feature['note'] = note
            gtf_writer.writerow(flatten_GTF(feature))

        if seqfilename:
          seqfile.write(">%s\n%s\n" % (seqid, seq))
  finally:
    # Close the output files on every exit path, including sys.exit().
    for handle in (outfile, seqfile):
      if handle is not None:
        handle.close()