Beispiel #1
0
def cns_opp_strand(cnss, qgene, sgene):
    """Remove crossing CNSs from hits treated as lying on the opposite strand.

    Every hit is passed through ``change_orient`` and the first two subject
    gene coordinates are negated before ``remove_crossing_cnss`` is run.
    The subject coordinates of the surviving hits are negated again before
    they are returned.

    Args:
        cnss: iterable of hits, each handed to ``change_orient``.
        qgene: query gene coordinates, forwarded to ``remove_crossing_cnss``.
        sgene: subject gene coordinates (a list; indices 0 and 1 are
            negated on a copy).  The caller's list is left untouched.

    Returns:
        List of 5-tuples ``(c[0], c[1], -c[2], -c[3], c[-2])`` built from
        the tuples returned by ``remove_crossing_cnss``.
    """
    flipped_cnss = [change_orient(cns) for cns in cnss]

    # Work on a private copy: callers (e.g. a loop over CNS groups) keep
    # using their own ``sgene`` afterwards, so flipping it in place would
    # silently corrupt every later call.
    flipped_sgene = list(sgene)
    flipped_sgene[0] *= -1
    flipped_sgene[1] *= -1

    kept = remove_crossing_cnss(flipped_cnss, qgene, flipped_sgene)
    # Undo the sign flip on the subject coordinates of the survivors.
    return [(c[0], c[1], -c[2], -c[3], c[-2]) for c in kept]
Beispiel #2
0
def remove_crossing_hits(exon_hits,qfeat,sfeat):
    """Remove overlapping and crossing hits using ``remove_crossing_cnss``.

    When the two features are on different strands, the subject
    coordinates of the hits (indices 2 and 3) and of the subject gene are
    negated before filtering, and the surviving hits are flipped back to
    their original sign before being returned.

    Args:
        exon_hits: iterable of hit sequences; indices 2 and 3 hold the
            subject coordinates.
        qfeat: mapping with 'start', 'end' and 'strand' for the query.
        sfeat: mapping with 'start', 'end' and 'strand' for the subject.

    Returns:
        List of 5-tuples ``(c[0], c[1], c[2], c[3], score)`` where
        ``score`` is the second-to-last field of each tuple returned by
        ``remove_crossing_cnss``.
    """
    qgene = [qfeat['start'], qfeat['end']]
    sgene = [sfeat['start'], sfeat['end']]
    opposite_strand = qfeat['strand'] != sfeat['strand']

    exon_hits = list(exon_hits)
    if opposite_strand:
        # Mirror the subject axis so the crossing test sees both
        # sequences running in the same direction.
        exon_hits = [tuple(hit[:2]) + (-hit[2], -hit[3]) + tuple(hit[4:])
                     for hit in exon_hits]
        sgene = [-sgene[0], -sgene[1]]

    non_crossing_hits = [(c[0], c[1], c[2], c[3], c[-2])
                         for c in remove_crossing_cnss(exon_hits, qgene, sgene)]
    if opposite_strand:
        # The original used ``==`` here, so the flip back to positive
        # coordinates was computed and then thrown away.
        non_crossing_hits = [(c[0], c[1], -c[2], -c[3], c[-1])
                             for c in non_crossing_hits]
    return non_crossing_hits
Beispiel #3
0
def protein_parse(hit,gene,gene_bed, hit_bed):
    """Compare a hit with a gene via blastx and report alignment coverage.

    FASTA files for the hit and the gene are written with
    ``protein_fasta``, ``blastx`` is run on them through the shell, the
    alignments are filtered with ``remove_crossing_cnss`` and the total
    aligned length on each side is related to the CDS length.

    Args:
        hit: mapping with 'start', 'end' and 'accn' keys.
        gene: mapping with 'start', 'end' and 'accn' keys.
        gene_bed: object providing ``row_cds_sequence(accn)`` for the gene.
        hit_bed: object providing ``row_cds_sequence(accn)`` for the hit.

    Returns:
        ``("masked", "masked", "masked")`` when the gene CDS contains an
        'X'; otherwise ``(hit_len, gene_len, frame_shift)`` where
        ``hit_len`` is aligned query length / hit CDS length, ``gene_len``
        is aligned subject length / (gene CDS length // 3) and
        ``frame_shift`` is currently always ``False``.
    """
    data_dir = '/Users/gturco/code/freeling_lab/pseudo/data/rice_v6_setaria64/'
    hit_fasta = "{0}q.fasta".format(data_dir)
    gene_fasta = "{0}s.fasta".format(data_dir)

    if re.search('X', gene_bed.row_cds_sequence(gene['accn'])):
        return "masked", "masked","masked"

    protein_fasta(hit_bed,hit,False,hit_fasta)
    protein_fasta(gene_bed,gene,True,gene_fasta)
    cmd = "/Users/gturco/ncbi-blast-2.2.25+/bin/blastx -gapopen 11 -gapextend 1  -word_size 3 -evalue 0.001 -outfmt '7 gaps qframe std' -query {0} -subject {1} | grep -v '#' | grep -v 'WARNING' | grep -v 'ERROR'".format(hit_fasta,gene_fasta)
    res = commands.getoutput(cmd)
    print >>sys.stderr, res

    # Per-frame bookkeeping of the alignments.  It is filled in below but
    # not read again inside this function.
    frame_dict = dict((frame, {"alignment": [], "qstart": [], "gaps": []})
                      for frame in ('1', '2', '3', '-1', '-2', '-3'))
    qhit = [hit['start'], hit['end']]
    sgene = [gene['start'], gene['end']]

    # A set, matching parse_blast: the original created a list and then
    # called ``.update()`` on it, which raised AttributeError on the first
    # alignment row.
    locs_seen = set()
    for line in res.split("\n"):
        if not line: continue
        if "WARNING:" in line: continue
        if "ERROR" in line: continue
        fields = line.split("\t")
        # Output columns are: gaps, qframe, then the 12 standard columns,
        # so fields 8-11 are the coordinates and 12+ the scores.
        locs = [int(value) for value in fields[8:12]]
        locs.extend(float(value) for value in fields[12:])

        frame = fields[1]
        frame_dict[frame]["alignment"].append(fields[5])
        frame_dict[frame]["qstart"].append(min(locs[0], locs[1]))
        frame_dict[frame]["gaps"].append(fields[0])

        locs_seen.add(tuple(locs))

    non_crossing = [(c[0], c[1], c[2], c[3], c[4])
                    for c in remove_crossing_cnss(list(locs_seen), qhit, sgene)]

    # NOTE(review): the original also assigned False when more than one
    # alignment survived, so no frame shift is ever reported.  Possibly
    # True was intended there -- confirm before changing.
    frame_shift = False

    total_hit_len = sum(abs(q_start - q_end)
                        for q_start, q_end, s_start, s_end, evalu in non_crossing)
    total_gene_len = sum(abs(s_start - s_end)
                         for q_start, q_end, s_start, s_end, evalu in non_crossing)
    print >>sys.stderr,non_crossing

    ref_hit_len = len(hit_bed.row_cds_sequence(hit['accn']))
    ref_gene_len = len(gene_bed.row_cds_sequence(gene['accn']))
    print >>sys.stderr,"hit_total {0} \n gene_len {1}".format(total_hit_len,ref_hit_len)

    hit_len = total_hit_len/float(ref_hit_len)
    # Subject coordinates are compared against a third of the CDS length
    # (integer division, as in the original Python 2 code).
    gene_len = total_gene_len/float(ref_gene_len // 3)
    return hit_len, gene_len, frame_shift
Beispiel #4
0
def parse_blast(blast_str, orient, qfeat, sfeat, qbed, sbed, qpad, spad, unmasked_fasta):
    """Select the BLAST hits between two genes that are kept as CNSs.

    Each line of ``blast_str`` is filtered on its last column (score), on
    its position relative to the centres of both genes, and on overlap
    with the gene space of the pair and of nearby features.  The hits that
    survive are binned by the regions returned by ``find_inversions``;
    inside each bin they are split by strand and cleaned with
    ``remove_crossing_cnss`` / ``cns_opp_strand``.

    Args:
        blast_str: tab-delimited BLAST output, one hit per line; columns
            6-9 are read as integer coordinates and the rest as floats.
        orient: 1 or -1; also used as the slope of the expected diagonal.
        qfeat, sfeat: mappings with 'start' and 'end' for query / subject.
        qbed, sbed: forwarded to ``get_feats_nearby``.
        qpad: padding added on both sides of the query gene.
        spad: forwarded to ``find_inversions``.
        unmasked_fasta: forwarded to ``find_inversions``.

    Returns:
        List of 5-tuples ``(c[0], c[1], c[2], c[3], c[-1])``, i.e. the four
        coordinates and the last field of each kept hit.
    """
    slope = orient

    qgene = [qfeat['start'], qfeat['end']]
    # With slope == -1 the slice reverses the pair to [end, start].
    sgene = [sfeat['start'], sfeat['end']][::slope]
    center = sum(qgene)/2., sum(sgene)/2.

    # Line through both gene centres, sampled along the padded query gene.
    intercept = center[1] - slope * center[0]
    x = np.linspace(qgene[0] - qpad, qgene[1] + qpad, 50)
    y = slope * x + intercept

    feats_nearby = get_feats_nearby(qgene,sgene,qfeat,sfeat,x,y,qbed,sbed)
    qgene_space_poly,qgene_poly,sgene_space_poly,sgene_poly = get_genespace(qfeat,sfeat,qgene,sgene)
    intronic_removed = 0

    # Gene centres do not change per hit, so compute them once.
    qcenter = sum(qgene)/2
    scenter = sum(sgene)/2 * orient

    cnss = set()
    for line in blast_str.split("\n"):
        if "WARNING:" in line: continue
        if "ERROR" in line: continue
        if line == '': continue
        fields = line.split("\t")
        # 29.5 finds a 15/15 match (33.4 would require a 17/17 match).
        if float(fields[-1]) < 29.5: continue
        locs = [int(value) for value in fields[6:10]]
        locs.extend(float(value) for value in fields[10:])

        xx = locs[:2]
        yy = locs[2:4]

        #######################################################
        # MAIZE BOWTIE : JUST 5 PRIME 3 PRIME
        #######################################################
        # Drop hits that sit on opposite sides of the two gene centres.
        qcns_center = sum(xx)/2
        scns_center = sum(yy)/2 * orient
        if scns_center > scenter and qcns_center < qcenter: continue
        # The original compared qcns_center with itself here, so this
        # mirrored case was never filtered.
        if qcns_center > qcenter and scns_center < scenter: continue

        # to be saved. a hit must either be in an intron in both
        # genes, or in neither.

        ##########################################################
        # DEAL WITH INTRONIC cnss in the gene of interest.
        ##########################################################
        xls = LineString([(0, locs[0]), (0, locs[1])])
        yls = LineString([(0, locs[2]), (0, locs[3])])

        locs = tuple(locs) # make it hashable.
        if qgene_poly.intersects(xls) and sgene_poly.intersects(yls):
            cnss.add(locs)
            continue
        # has to be both or neither.
        if qgene_space_poly.intersects(xls) or sgene_space_poly.intersects(yls):
            intronic_removed += 1
            continue

        ###############################################################
        # for all other genes, if it's in an intron, we dont keep it.
        ###############################################################
        overlaps_other_gene = False
        for sub, (start, stop) in (('q', locs[:2]), ('s', locs[2:4])):
            feats = feats_nearby[sub]
            if feats is None: continue
            # the current hsp is overlapping another gene. we dont want that...
            if feats.contains(Point(0, start)) or feats.contains(Point(0, stop)):
                overlaps_other_gene = True
                break
        if overlaps_other_gene: continue

        cnss.add(locs)

    # cant cross with < 2 cnss.
    # get rid of the eval, bitscore stuff.
    if len(cnss) < 2: return [(c[0], c[1], c[2], c[3],c[-1]) for c in cnss]
    cnss = list(cnss)

    ####################################################################
    # split cns into groups based on inversion, seq marks in maize
    ####################################################################
    # key = group (start, end), value = cnss whose subject start falls in
    # [start, end).  A bounds comparison replaces ``in range(...)``, which
    # builds a full list for every test under Python 2.
    cns_groups = {}
    for group in find_inversions(unmasked_fasta, sfeat, spad):
        for cns in cnss:
            if group[0] <= cns[2] < group[1]:
                cns_groups.setdefault(group, []).append(cns)

    # need to flip to negative so the overlapping stuff still works.
    # Done once, on a copy: the original negated ``sgene`` inside the loop
    # below, so its sign toggled from one group to the next.
    oriented_sgene = list(sgene)
    if orient == -1:
        oriented_sgene[0] *= -1
        oriented_sgene[1] *= -1

    cns_by_group = []
    for key, values in cns_groups.items():
        # split the group by strand relative to the expected orientation.
        opp_strand = []
        same_strand = []
        for cns in values:
            if slope == 1 and cns[2] > cns[3]:
                opp_strand.append(cns)
            elif slope == -1 and cns[2] < cns[3]:
                opp_strand.append(cns)
            else:
                same_strand.append(cns)

        if orient == -1:
            same_strand = [change_orient(cns) for cns in same_strand]
            opp_strand = [change_orient(cns) for cns in opp_strand]

        cnss_same_strand = [(c[0], c[1], c[2], c[3],c[-1])
                            for c in remove_crossing_cnss(same_strand, qgene, oriented_sgene)]

        # if the cns fall in same group as gene we know its same strand
        # as gene and dont need to run the rest.
        if key[0] <= abs(oriented_sgene[1]) < key[1]:
            cns_by_group.extend(cnss_same_strand)
            continue

        # alternative for cns on opp strand.  A copy is passed because
        # cns_opp_strand negates the subject gene coordinates it receives.
        cnss_opp_strand = cns_opp_strand(opp_strand, qgene, list(oriented_sgene))
        if len(cnss_same_strand) < len(cnss_opp_strand):
            cns_by_group.extend(cnss_opp_strand)
        else:
            # on a tie, use the non reverse complement set.
            cns_by_group.extend(cnss_same_strand)

    if orient == -1:
        # flip the subject coordinates back to positive values.
        cns_by_group = [(c[0], c[1], -c[2], -c[3],c[-1]) for c in cns_by_group]

    return cns_by_group