コード例 #1
0
ファイル: assign.py プロジェクト: yuzhenpeng/find_cns
def main(cnsfile, qbed_file, sbed_file, pairsfile, pairs_fmt, qdsid, sdsid,qpad,spad):
    """Assign every CNS to a query/subject feature pair and report the result.

    For each (cns, qfeat, sfeat) triple yielded by ``assign`` this writes one
    comma-separated row to stdout (after a ``#`` header listing the column
    names) and passes the same row to ``write_gff`` together with the query
    and subject GFF handles.

    The GFF paths are derived from the bed paths by replacing
    ".nolocaldups" with "_cns.gff".  When query and subject bed are the same
    file, a single GFF handle is shared by both sides.

    Parameters:
        cnsfile: path handed to ``get_cns_dict``.
        qbed_file, sbed_file: query / subject bed paths.
        pairsfile, pairs_fmt: pairs file and its format, handed to
            ``make_pair_maps``.
        qdsid, sdsid, qpad, spad: forwarded unchanged to ``cns_link``.

    Raises:
        AssertionError: if a derived GFF path equals its bed path (the bed
            file would otherwise be overwritten).
    """
    qcns_file = qbed_file.replace(".nolocaldups", "_cns.gff")
    assert qcns_file != qbed_file
    qcns_gff = open(qcns_file, 'w')
    # Default to sharing the query handle; replaced below when the subject
    # bed is a different file.
    scns_gff = qcns_gff
    try:
        print >>qcns_gff, "##gff-version 3"
        if sbed_file != qbed_file:
            scns_file = sbed_file.replace(".nolocaldups", "_cns.gff")
            assert scns_file != sbed_file
            scns_gff = open(scns_file, 'w')
            print >>scns_gff, "##gff-version 3"

        qbed = Bed(qbed_file); qbed.fill_dict()
        sbed = Bed(sbed_file); sbed.fill_dict()

        cnsdict, evaldict = get_cns_dict(cnsfile)
        qpair_map, spair_map = make_pair_maps(pairsfile, pairs_fmt, qbed, sbed)
        out = sys.stdout

        fmt = "%(cns_id)s,%(qaccn)s,%(qchr)s,%(qstart)i,%(qstop)i,%(qstrand)s," + \
                           "%(saccn)s,%(schr)s,%(sstart)i,%(sstop)i,%(sstrand)s,%(eval)s,%(link)s"

        # Stripping the %-format markup leaves just the column names.
        print >>out, "#" + fmt.replace("%(","").replace(")s","").replace(")i","")
        for cns, qfeat, sfeat in assign(cnsdict,qbed, sbed, qpair_map, spair_map):
            d = cns_fmt_dict(cns, qfeat, sfeat, evaldict)
            d['cns_id'] = cns_id(d)
            # Report subject coordinates in ascending order.
            if d['sstop'] < d['sstart']:
                d['sstop'], d['sstart'] = d['sstart'], d['sstop']
            d['link'] = cns_link(d, qdsid, sdsid,qpad,spad)
            print >>out, fmt % d
            write_gff(d, qcns_gff, scns_gff)
    finally:
        # Previously the handles were never closed, so the GFF output relied
        # on interpreter teardown to be flushed.
        if scns_gff is not qcns_gff:
            scns_gff.close()
        qcns_gff.close()
コード例 #2
0
ファイル: merge_test.py プロジェクト: yuzhenpeng/find_cns
 def setUp(self):
     """Load the two bed fixtures and group the parsed missed genes."""
     matches_path = "data/rice_t_sorghum_v1/missed_sorghum_v1_from_rice_b.matches.txt"
     self.old_bed = Bed("data/rice_t_sorghum_v1/sorghum_v1.bed")
     self.missed_bed = Bed(
         "data/rice_t_sorghum_v1/missed_sorghum_v1_from_rice_b.bed")
     self.matches = matches_path
     self.missed_genes = parse_missed_genes(matches_path)
     # group_genes_in_bed returns a pair: the grouped genes and a dict.
     grouped = group_genes_in_bed(self.missed_genes, self.old_bed,
                                  self.missed_bed)
     self.missed_genes_grouped, self.missed_genes_dict = grouped
コード例 #3
0
def main(cns_path, fmt, query_bed_path, subject_bed_path):
    """Classify each CNS record and resolve the provisional prox/dist types.

    First pass: the four coordinate fields of every record are converted to
    int, the record is handed to ``cns_type`` together with LineString
    representations of the gene space, both genes and both CNS intervals,
    and ``create_utr_list`` is called for the query and subject feature.

    Second pass: records whose type is "5-prox_dist" or "3-prox_dist" are
    relabelled as proximal when the strand-appropriate distance is <= 1000
    and as distal otherwise.  Records whose strand is neither "+" nor "-"
    keep their provisional type.

    Returns the (mutated) collection produced by ``cns_to_dic``.
    """
    cns_dic = cns_to_dic(cns_path, fmt)
    query_bed = Bed(query_bed_path)
    subject_bed = Bed(subject_bed_path)
    utr_dict = {}

    for cns in cns_dic:
        for field in ('qstop', 'qstart', 'sstop', 'sstart'):
            cns[field] = int(cns[field])

        qfeat = query_bed.accn(cns['qaccn'])
        sfeat = subject_bed.accn(cns['saccn'])
        qlocs = qfeat['locs']
        qgene_space_start = min(qlocs)[0]
        qgene_space_end = max(qlocs)[1]

        # Every interval is modelled as a vertical segment at x == 0.
        qgene_space_poly = LineString([(0.0, qgene_space_start),
                                       (0.0, qgene_space_end)])
        qgene_poly = LineString([(0.0, qfeat['start']), (0.0, qfeat['end'])])
        sgene_poly = LineString([(0.0, sfeat['start']), (0.0, sfeat['end'])])
        qcns = LineString([(0, cns['qstart']), (0, cns['qstop'])])
        scns = LineString([(0, cns['sstart']), (0, cns['sstop'])])

        cns_type(cns, qgene_space_poly, qgene_poly, sgene_poly, scns, qcns,
                 qgene_space_start, qfeat)
        create_utr_list(utr_dict, qfeat, cns, "q")
        create_utr_list(utr_dict, sfeat, cns, "s")

    for cns in cns_dic:
        provisional = cns['type']
        if provisional == "5-prox_dist":
            five_prime = True
        elif provisional == "3-prox_dist":
            five_prime = False
        else:
            continue

        positions = utr_dict[cns['qaccn']]
        qgene_start = min(positions)
        qgene_stop = max(positions)

        if five_prime:
            plus_gap = abs(qgene_start - cns["qstop"])
            minus_gap = abs(qgene_stop - cns["qstart"])
            near_label, far_label = "5-proximal", "5-distal"
        else:
            plus_gap = abs(cns["qstart"] - qgene_stop)
            minus_gap = abs(cns["qstop"] - qgene_start)
            near_label, far_label = "3-proximal", "3-distal"

        strand = cns["qstrand"]
        if strand == "+":
            gap = plus_gap
        elif strand == "-":
            gap = minus_gap
        else:
            # Unknown strand: leave the provisional label untouched.
            continue

        if gap <= 1000:
            cns["type"] = near_label
        else:
            cns["type"] = far_label
    return cns_dic
コード例 #4
0
ファイル: test_assign.py プロジェクト: yuzhenpeng/find_cns
 def setUp(self):
     """Load the rice/sorghum beds, the CNS dict and the pair maps."""
     self.cns_filename = "data/rice_v6_sorghum_v1/rice_v6_sorghum_v1.cns.txt"
     self.pairsfile = "data/rice_v6_sorghum_v1/rice_v6_sorghum_v1.pairs.txt"

     self.qbed = query = Bed("data/rice_v6_sorghum_v1/rice_v6.bed")
     query.fill_dict()
     self.sbed = subject = Bed("data/rice_v6_sorghum_v1/sorghum_v1.bed")
     subject.fill_dict()

     cns_and_evalues = get_cns_dict(self.cns_filename)
     self.cns_dict, self.evalue_dict = cns_and_evalues
     pair_maps = make_pair_maps(self.pairsfile, "pair", query, subject)
     self.qpair_map, self.spair_map = pair_maps
コード例 #5
0
    def setUp(self):
        """Load the blast text, the unmasked fasta and both bed fixtures.

        The blast file's lines are joined with ' , ' into ``self.blast_str``.
        All paths are absolute and specific to the original author's machine.
        """
        # Use a context manager so the handle is closed as soon as the lines
        # are read (the original left it open for the life of the test).
        with open(
                '/Users/gturco/code/freeling_lab/find_cns_gturco/pipeline/tests/blast_3.txt'
        ) as handle:
            fh = handle.readlines()
        self.blast_str = ' , '.join(fh)
        self.unmasked_fasta = Fasta('/Users/gturco/find_cns/maize_v2_UM.fasta')

        self.qbed = Bed('/Users/gturco/rice_maize/rice_v6.bed')
        self.qbed.fill_dict()
        self.sbed = Bed('/Users/gturco/maize/maize_v2.bed',
                        '/Users/gturco/maize/maize_v2.fasta')
        self.sbed.fill_dict()
        self.sfeat = self.sbed.accn('GRMZM2G086714')
        self.qfeat = self.qbed.accn('Os09g27050')
コード例 #6
0
ファイル: assign_region.py プロジェクト: yuzhenpeng/find_cns
def main(cnsfile, qbed_file, sbed_file, pairsfile, pck, qorg, sorg, padding):
    """Print one CSV row per CNS region assignment to stdout.

    A ``#`` header naming the columns is printed first.  Each row comes from
    ``cns_fmt_dict`` and gets a 'link' entry built by ``assign_url``.
    """
    qbed = Bed(qbed_file)
    qbed.fill_dict()
    sbed = Bed(sbed_file)
    sbed.fill_dict()
    cnsdict = get_cns_dict(cnsfile)
    qpair_map = make_pair_maps(pairsfile, 'pair', qbed, sbed)
    out = sys.stdout

    subject_columns = "%(saccn)s,%(saccnL)s,%(saccnR)s,%(schr)s,%(sstart)i,%(sstop)i,"
    query_columns = "%(qaccn)s,%(qchr)s,%(qstart)i,%(qstop)i,%(link)s"
    fmt = subject_columns + query_columns

    # Remove the %-format markup so only the column names remain.
    header = fmt
    for markup in ("%(", ")s", ")i"):
        header = header.replace(markup, "")
    print >>out, "#" + header

    for cns, saccn, saccn_l, saccn_r, qfeat in assign(cnsdict, qbed, qpair_map):
        row = cns_fmt_dict(cns, qfeat, saccn, saccn_l, saccn_r)
        row['link'] = assign_url(cns.sstart, cns.schr, cns.qstart, cns.qchr,
                                 qfeat, pck, sbed, qbed, sorg, qorg, padding)
        print >>out, fmt % row
コード例 #7
0
def main_gene(feature_file, query_list_pos, query_list_neg):
    """Count indexed sites in the exons and 300-wide flanks of every feature.

    Parameters:
        feature_file: bed path; each feature must provide 'locs', 'strand'
            and 'seqid'.
        query_list_pos, query_list_neg: mappings from seqid to an object
            with ``find(start, end)``, one per strand.

    Returns:
        (cds, three_all, five_all): three parallel lists with, per feature,
        the summed per-position hit count over all 'locs' intervals, the hit
        count in the 3' flank, and the hit count in the 5' flank.

    Changes from the previous version:
      * The flank look-ups now use the same strand-specific index as the
        exon count.  Previously minus-strand flanks were looked up in
        ``query_list_pos`` although the exons of the same gene used
        ``query_list_neg``.
      * A strand other than "+" is handled as minus for the flanks as well,
        matching the exon count.  Previously such a feature re-appended the
        previous feature's flank hits (or raised NameError on the first
        feature).
    """
    cds = []
    three_all = []
    five_all = []
    for feature in Bed(feature_file):
        on_plus = feature["strand"] == "+"
        index = query_list_pos if on_plus else query_list_neg
        sites = index[feature['seqid']]
        locs = feature['locs']

        # One point query per position, both interval ends inclusive.
        cds.append(sum(
            len(sites.find(i, i))
            for e in locs
            for i in range(e[0], e[1] + 1)))

        first_start = int(locs[0][0])
        last_end = int(locs[-1][1])
        low_flank = sites.find(first_start - 300, first_start)
        high_flank = sites.find(last_end, last_end + 300)
        if on_plus:
            five_prime, three_prime = low_flank, high_flank
        else:
            # On the minus strand the low-coordinate flank is the 3' side.
            three_prime, five_prime = low_flank, high_flank
        three_all.append(len(three_prime))
        five_all.append(len(five_prime))
    return cds, three_all, five_all
コード例 #8
0
ファイル: assign_qfeat.py プロジェクト: yuzhenpeng/find_cns
def main(cnsfile, qbed_file, sbed_file, qorg, sorg, padding):
    """Print one CSV row per CNS assignment to stdout.

    A ``#`` header naming the columns is printed first.  For every
    (cns, qfeat, sfeat) triple from ``assign`` the row dict from
    ``cns_fmt_dict`` is normalised so that sstart <= sstop, extended with a
    'link' built by ``assign_url`` and printed.

    The loop body previously mixed tabs and spaces, which only parses when a
    tab is expanded to eight columns (Python 2 without ``-tt``); it is now
    indented with spaces only.
    """
    qbed = Bed(qbed_file); qbed.fill_dict()
    sbed = Bed(sbed_file); sbed.fill_dict()
    cnsdict = get_cns_dict(cnsfile)
    out = sys.stdout

    fmt = "%(qaccn)s,%(qchr)s,%(qstart)i,%(qstop)i,%(qstrand)s," + \
                       "%(saccn)s,%(schr)s,%(sstart)i,%(sstop)i,%(sstrand)s,%(link)s"

    # Stripping the %-format markup leaves just the column names.
    print >>out, "#" + fmt.replace("%(","").replace(")s","").replace(")i","")
    for cns, qfeat, sfeat in assign(cnsdict, qbed, sbed):
        d = cns_fmt_dict(cns, qfeat, sfeat)
        # Report subject coordinates in ascending order.
        if d['sstop'] < d['sstart']:
            d['sstop'], d['sstart'] = d['sstart'], d['sstop']
        d['link'] = assign_url(cns.sstart, cns.schr, cns.qstart, cns.qchr, sorg, qorg, padding)
        print >>out, fmt % d
コード例 #9
0
ファイル: cns_location.py プロジェクト: yuzhenpeng/find_cns
def utr_present(cns_pck, query_bed_path, UTR):
    """Mark genes without a 3' or 5' UTR as 'ND' in MUSA_GENE_LIST_copy.

    For every CNS record in the pickle, the query feature is looked up and
    its strand-aware gene boundary is compared with the outermost 'locs'
    coordinates.  When they coincide, the row whose Rice_MSU6_genes equals
    the record's qaccn is updated and the statement is printed.

    Parameters:
        cns_pck: path of a pickle holding an iterable of dicts with a
            'qaccn' key.
        query_bed_path: bed path for the query organism.
        UTR: 3 or 5, selecting the 3_UTR or 5_UTR column.  Any other value
            results in no updates.

    NOTE(review): no ``commit()`` is issued here, as in the original; with a
    transactional storage engine and autocommit off the updates would be
    discarded when the connection closes - confirm the intended behaviour.
    """
    db = MySQLdb.connect(host="127.0.0.1", user="******", db="rice_gene_table")
    try:
        cursor = db.cursor()
        try:
            # NOTE(review): pickle.load can execute arbitrary code; only use
            # this with pickle files from a trusted source.
            with open(cns_pck, "rb") as cns_handle:
                cns_pickle = pickle.load(cns_handle)
            query_bed = Bed(query_bed_path)
            for cns in cns_pickle:
                qfeat = query_bed.accn(cns['qaccn'])
                if qfeat['strand'] == "+":
                    end = qfeat['end']
                    start = qfeat["start"]
                else:
                    end = qfeat['start']
                    start = qfeat["end"]

                if UTR == 3:
                    column, boundary = "3_UTR", end
                elif UTR == 5:
                    column, boundary = "5_UTR", start
                else:
                    continue

                locs = qfeat['locs']
                if boundary == min(locs)[0] or boundary == max(locs)[1]:
                    # The column name comes from the fixed choice above; the
                    # accession is passed as a bound parameter rather than
                    # being formatted into the SQL text.
                    stmt = ("update MUSA_GENE_LIST_copy set "
                            "MUSA_GENE_LIST_copy.{0} = 'ND' where "
                            "MUSA_GENE_LIST_copy.Rice_MSU6_genes = %s"
                            ).format(column)
                    # Log the same text the original printed.
                    print(stmt % ("'{0}'".format(cns['qaccn']),))
                    cursor.execute(stmt, (cns['qaccn'],))
        finally:
            cursor.close()
    finally:
        db.close()
コード例 #10
0
def print_bed(flist, old_path):
    """Write the unique rows of ``flist`` to "<stem>.with_new<ext>".

    Each item is indexed as (seqid, start, end, accn, <unused>, strand,
    locs); its locs are converted to a tuple so the whole item can be used
    for duplicate detection.  Repeated items are written only once.

    Returns a ``Bed`` opened on the new file.
    """
    stem, suffix = op.splitext(old_path)
    path = "%s.with_new%s" % (stem, suffix)

    print >> sys.stderr, "writing to: %s.with_new%s" % (stem, suffix)
    out = open(path, 'wb')
    already_written = set()

    for item in flist:
        # Freeze the locs so the record becomes hashable.
        fields = list(item)
        fields[6] = tuple(fields[6])
        record = tuple(fields)
        if record in already_written:
            continue
        already_written.add(record)

        row = dict(accn=record[3],
                   start=record[1],
                   end=record[2],
                   seqid=record[0],
                   locs=record[6],
                   score='.',
                   strand=record[5],
                   rgb='.',
                   thickstart='.',
                   thickend=".")
        print >> out, Bed.row_string(row)
    out.close()
    return Bed(path)
コード例 #11
0
def freq(feature_file,window_size,interval,meth_data):
    """Slide a window over every feature and pass dense windows to ``kw``.

    Window starts are taken every ``interval`` positions between the
    feature's start and end (inclusive).  The look-up in ``meth_data`` is
    clipped to the feature end, but ``kw`` receives the unclipped window
    end.  Windows with fewer than 15 matches are ignored.
    """
    for feature in Bed(feature_file):
        last = int(feature["end"])
        region = range(int(feature["start"]), last + 1)
        for window_start in region[::interval]:
            window_end = window_start + window_size
            # Never search past the last position of the feature.
            search_end = min(window_end, last)
            matches = meth_data[feature['seqid']].find(window_start, search_end)
            if len(matches) >= 15:
                kw(matches, feature['seqid'], window_start, window_end)
コード例 #12
0
ファイル: merge.py プロジェクト: yuzhenpeng/find_cns
def main(missed, fh_match, org_bed):
    """Merge all hits to the same gene, then update the entire bed file.

    ``merge_same_hits`` runs first.  ``merge`` is then called with the
    original bed, a ``Bed`` opened on "<dir>/missed_from_<name>" and the
    output path "<dir>/all_<name>", where <dir> and <name> are the directory
    and file-name parts of ``org_bed.path``.
    """
    merge_same_hits(missed, fh_match, org_bed)

    # Split at the last "/" only; everything before it is the directory.
    directory, _sep, bed_name = org_bed.path.rpartition('/')
    missed2 = '{0}/missed_from_{1}'.format(directory, bed_name)
    merge_fh = "{0}/all_{1}".format(directory, bed_name)
    print(missed2)
    merge(org_bed, Bed(missed2), merge_fh)
コード例 #13
0
def merge_flat(new_name, aflat, bflat):
    """Write the union of two flat/bed collections and return it as a Bed.

    Rows are identified by (seqid, accn); when both inputs contain the same
    key, the first occurrence (``aflat`` before ``bflat``) wins.  The merged
    rows are sorted by (seqid, start) and written to ``new_name``.

    Parameters:
        new_name: output path.
        aflat, bflat: iterables of rows supporting ['seqid'], ['accn'] and
            ['start'].

    Returns:
        ``Bed(new_name)``.
    """
    seen = set()
    both = []
    for flat in (aflat, bflat):
        for row in flat:
            key = row['seqid'], row['accn']
            if key in seen:
                continue
            seen.add(key)
            both.append(row)
    # Sort once after collecting.  The original re-sorted the whole list
    # after every append; because list.sort is stable the final order is the
    # same, without the quadratic cost.
    both.sort(key=lambda a: (a['seqid'], a['start']))

    # The context manager closes the file even if row_string raises.
    with open(new_name, "w") as fh:
        for b in both:
            print >> fh, Bed.row_string(b)
    return Bed(new_name)
コード例 #14
0
def main(cns_file, bedpath, fastapath):
    """Call ``write_to_pos_fasta`` for every gene of the CNS gene space.

    Genes for which the bed lookup raises KeyError are skipped.  For every
    other gene the search space is the gene extended by 12000 on both sides
    (clamped at 0 on the left).  The gene's 'locs' list is sorted in place
    before it is passed on.
    """
    genespace = get_genespace(cns_file)
    bed = Bed(bedpath)
    fasta = Fasta(fastapath)
    category_names = [
        '3_utr', '5_utr', 'intronic', '5_prox', '5_distal', '3_prox',
        '3_distal'
    ]
    out_handles = open_files(category_names)

    for gene in genespace.keys():
        try:
            accn = bed.accn(gene)
        except KeyError:
            continue

        window_start = max(0, accn['start'] - 12000)
        window_end = accn['end'] + 12000
        # Single-interval list, so it needs no sorting.
        cnsspace = [(window_start, window_end)]

        locs = accn['locs']
        locs.sort()
        write_to_pos_fasta(bed, accn, locs, cnsspace, out_handles, fasta)
コード例 #15
0
def loadintointersect(bed_file):
    """Index the start position of every bed feature, split by strand.

    Returns two dicts (plus strand, minus strand) mapping seqid to an
    ``Intersecter``.  Each feature adds one single-position interval at
    ``start - 1`` whose name is the feature's strand.  Features whose strand
    is neither "+" nor "-" are ignored.
    """
    query_list_pos = {}
    query_list_neg = {}
    for feature in Bed(bed_file):
        strand = feature["strand"]
        if strand == "+":
            by_seqid = query_list_pos
        elif strand == "-":
            by_seqid = query_list_neg
        else:
            continue

        seqid = feature['seqid']
        if seqid not in by_seqid:
            by_seqid[seqid] = Intersecter()

        # The original note says the bed start is off by one, hence the -1.
        position = int(feature['start'] - 1)
        by_seqid[seqid].add_interval(Feature(position, position, name=strand))
    return query_list_pos, query_list_neg
コード例 #16
0
def main(feature_bed, query_list_pos, query_list_neg, fasta_file, mtype, rand):
    """Collect site frequencies around and inside every stranded feature.

    For each feature on "+" or "-" the 2000-wide windows before the first
    'locs' start and after the last 'locs' end are passed to ``get_matchs``
    (as TSS/TTS regions, swapped on the minus strand), and the gene body is
    passed to ``get_genebody``.  Features with any other strand are skipped.

    Returns:
        (All_sites, r, cgene): a defaultdict mapping accn to a list of
        (region, freq) tuples, a dict mapping accn to the ``rebin`` value and
        a dict mapping accn to the ``gene_body`` value.

    ``mtype`` is accepted but not used here.

    NOTE(review): the TE call passes 1000 on the plus strand and 1100 on the
    minus strand; both values are kept exactly as they were - confirm
    whether the asymmetry is intended.
    """
    features = Bed(feature_bed)
    fasta = Fasta(fasta_file)
    All_sites = defaultdict(list)
    r = {}
    cgene = {}
    for feature in features:
        strand = feature["strand"]
        rc = strand == "-"
        if strand == "+":
            strand_sites, te_offset = query_list_pos, 1000
        elif rc:
            strand_sites, te_offset = query_list_neg, 1100
        else:
            continue

        first_start = int(feature['locs'][0][0])
        last_end = int(feature['locs'][-1][1])
        low_window = range(first_start - 2000, first_start)
        high_window = range(last_end, last_end + 2000)
        if rc:
            # Minus strand: transcription starts at the high-coordinate end.
            TSS_region, TTS_region = high_window, low_window
        else:
            TSS_region, TTS_region = low_window, high_window

        chrom_key = "chromosome_" + feature["seqid"]
        TSS_sites = get_matchs(strand_sites, feature['seqid'], TSS_region,
                               fasta[chrom_key], -2000, rc)
        TE_sites = get_matchs(strand_sites, feature['seqid'], TTS_region,
                              fasta[chrom_key], te_offset, rc)
        gene_body, rebin = get_genebody(strand_sites, feature,
                                        fasta[chrom_key], rc, rand)
        r[feature["accn"]] = rebin
        cgene[feature["accn"]] = gene_body

        # Append item by item so an accn without any site never gets an
        # (empty) entry in the defaultdict.
        for region, freq in TSS_sites:
            All_sites[feature["accn"]].append((region, freq))
        for region, freq in TE_sites:
            All_sites[feature["accn"]].append((region, freq))

    return All_sites, r, cgene
コード例 #17
0
        if strand == '-':
            my_seq = fasta
            fasta = str(Seq(my_seq).reverse_complement())
        if len(fasta) == 0:
            #print start,stop,accn['accn']
            continue
        seq_w = "{0}\n".format(fasta)
        new_fasta.write(w)
        new_fasta.write(seq_w)


####### tair ##########
#x = random_noncoding('/Users/gt/Desktop/tmp.csv',Bed('/Users/gt/thaliana_v8.with_new_cns_mask.bed'),"/Users/gt/thaliana_v8.fasta","/Users/gt/thaliana_v8_control_SB.fasta")
# Driver call executed at import time: runs random_noncoding on the TAIR 8
# inputs and keeps the result in ``x``.  All paths are absolute and specific
# to the original author's machine.
x = random_noncoding(
    '/Users/gt/Desktop/tmp.csv',
    Bed('/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/tair_8_golden/thaliana_v8.with_new_cns_mask.bed'
        ),
    "/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/thaliana_v8.fasta",
    "/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/tair_8_mine/thaliana_v8_control_SB.fasta"
)

######### rice,sorg,set #####
##### took out strand info used N to mask bed also ########

#x =
#random_noncoding('/Users/gt/Desktop/paper/G-box-seq/rice_rice/tmp.csv',Bed('/Users/gt/Desktop/paper/G-box-seq/rice.with_new_cns_mask.bed'),"/Users/gt/Desktop/paper/G-box-seq/rice_rice/rice_j.fasta","/Users/gt/Desktop/paper/G-box-seq/rice_rice/rice_rice_control_fasta")
#x = random_noncoding('/Users/gt/Desktop/tmp.csv',Bed('/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/tair_8_mine/thaliana_v8.with_new_cns_mask.bed'),"/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/thaliana_v8.fasta","/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/tair_8_mine/thaliana_v8_control_SB.fasta")
#x = random_noncoding('/Users/gt/Desktop/tmp.csv',Bed('/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/tair_8_mine/thaliana_v8.with_new_cns_mask.bed'),"/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/thaliana_v8.fasta","/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/tair_8_mine/thaliana_v8_control_SB.fasta")
#x = random_noncoding('/Users/gt/Desktop/tmp.csv',Bed('/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/tair_8_mine/thaliana_v8.with_new_cns_mask.bed'),"/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/thaliana_v8.fasta","/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/tair_8_mine/thaliana_v8_control_SB.fasta")
#x = random_noncoding('/Users/gt/Desktop/tmp.csv',Bed('/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/tair_8_mine/thaliana_v8.with_new_cns_mask.bed'),"/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/thaliana_v8.fasta","/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/tair_8_mine/thaliana_v8_control_SB.fasta")
#x = random_noncoding('/Users/gt/Desktop/tmp.csv',Bed('/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/tair_8_mine/thaliana_v8.with_new_cns_mask.bed'),"/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/thaliana_v8.fasta","/Users/gt/Desktop/freelinglab/genomes/tair 8/tair_8/tair_8_mine/thaliana_v8_control_SB.fasta")
#
コード例 #18
0
        seq = f[seqid][start:end]
        if "X" in seq:
            print accn, seqid, start, end
        if len(seq) < 15 and len(seq) > 0:
            print "OH NO!!!!!!"
        w = ">cns{0}\n".format(n)
        seq_w = "{0}\n".format(seq)
        new_fasta.write(w)
        new_fasta.write(seq_w)


# Driver code executed at import time: build the size dict, call
# random_noncoding with it and the masked bed, print how many items came
# back, then hand them to get_seq together with the fasta and output paths.
# All paths are absolute and specific to the original author's machine.
dict_size = gene_size_dict('/Users/gturco/Desktop/rice_sorg_size.tsv')
#dict_size = gene_size_dict("test_file")
x = random_noncoding(
    dict_size,
    Bed('/Users/gturco/data/paper3/rice_b_sorghum_v1.nolocaldups.with_new_cns_mask.bed'
        ))
print len(x)
#####print x
get_seq(x, "/Users/gturco/data/paper3/rice_b.fasta",
        "/Users/gturco/test.fasta")
##
##### seq for cns
#handle = open("/Users/gturco/data/paper3/rice_b_sorghum_v1.cns.assigned_real.csv")
#fh = handle.read()
#cns_list = []
#for line in fh.split("\n")[:-1]:
#    if line[0] == "#": continue
#    cns_id,accn,seqid,start,end,strand = line.split(",")[:6]
#    cns_list.append((seqid,int(start),int(end)))
#
#len(cns_list)
コード例 #19
0
            print "OH NO!!!!!!"
        w = ">cns{0}\n".format(n)
        seq_w = "{0}\n".format(seq)
        new_fasta.write(w)
        new_fasta.write(seq_w)


######## rice_set ##########
#dict_size = gene_size_dict('/Users/gt/tmp.tsv')
#x = random_noncoding(dict_size,Bed('/Users/gt/data/paper4/rice_j_setaria_n/rice_j_set.nolocaldups.with_new_cns_mask.bed'))
#get_seq(x,"/Users/gt/data/paper4/rice_j.fasta","/Users/gt/data/paper4/rice_j_setaria_n/testing.fasta")
####### rice_sorg #########
# Driver code executed at import time (rice vs. sorghum run): build the size
# dict, call random_noncoding with it and the masked bed, then hand the
# result to get_seq together with the fasta and output paths.
# All paths are absolute and specific to the original author's machine.
dict_size = gene_size_dict('/Users/gt/tmp.tsv')
x = random_noncoding(
    dict_size,
    Bed('/Users/gt/data/paper4/rice_j_sorghum_n/rice_j_sorg.nolocaldups.with_new_cns_mask.bed'
        ))
get_seq(x, "/Users/gt/data/paper4/rice_j.fasta",
        "/Users/gt/data/paper4/rice_j_sorghum_n/testing.fasta")

##### seq for cns
#handle = open("/Users/gturco/data/paper3/rice_b_sorghum_v1.cns.assigned_real.csv")
#fh = handle.read()
#cns_list = []
#for line in fh.split("\n")[:-1]:
#    if line[0] == "#": continue
#    cns_id,accn,seqid,start,end,strand = line.split(",")[:6]
#    cns_list.append((seqid,int(start),int(end)))
#
#len(cns_list)
#get_seq(cns_list,"/Users/gturco/data/paper3/rice_b.fasta","/Users/gturco/test_cns.fasta")
##
コード例 #20
0
def main(qbed_path, sbed_path, cnsfile, dist, orthology_path):
    """
    Remove cnss that have been called proteins/rnas from the cns list and
    add them to the bed files as new genes.

    Also does the preliminary assignment of the cnss that remain to the
    new genes that _were_ cnss; the proper assignment is then handled in
    assign.py.

    Side effects:
      * for each bed path, a file named by replacing ".bed" with "_cns.gff"
        is (re)created containing only the GFF3 header line;
      * the remaining cnss are written to "<cnsfile stem>.real<ext>";
      * new bed files are written through print_bed.

    Parameters:
        qbed_path, sbed_path: query / subject bed paths.
        cnsfile: raw cns file, parsed with CNS.parse_raw_line.
        dist: search distance; get_new is called with dist + 1000.
        orthology_path: handed to read_orthos_to_trees.

    Returns:
        (new query bed path, new subject bed path, qnew_pairs)

    Raises:
        AssertionError: if a derived gff path equals its bed path.
    """
    qcns_file = qbed_path.replace(".bed", "_cns.gff")
    assert qcns_file != qbed_path
    # Only the header is written here, so each gff handle is closed right
    # away instead of being left open for the rest of the run.
    with open(qcns_file, 'w') as qcns_gff:
        print >> qcns_gff, "##gff-version 3"
    if sbed_path != qbed_path:
        scns_file = sbed_path.replace(".bed", "_cns.gff")
        assert scns_file != sbed_path
        with open(scns_file, 'w') as scns_gff:
            print >> scns_gff, "##gff-version 3"

    qrawbed = RawBed(qbed_path)
    srawbed = RawBed(sbed_path)

    ortho_trees = read_orthos_to_trees(orthology_path, qrawbed, srawbed)

    qbed = Bed(qbed_path)
    qbed.fill_dict()
    sbed = Bed(sbed_path)
    sbed.fill_dict()

    def assign_new_names(prs, protein_or_rna):
        """Rename each (cns, info) pair in place; return them per seqid pair.

        The cns dict gets '<seqid>_<start>_<end>_cns_<protein_or_rna>' as
        its new qaccn/saccn and inherits the strand of the gene it was
        originally attached to.
        """
        n = {}
        for seqid_pair, li in prs.iteritems():
            renamed = n.setdefault(seqid_pair, [])
            for gnew, info in li:
                new_qname = "%(qseqid)s_%(qstart)i_%(qend)i_cns" % gnew
                new_sname = "%(sseqid)s_%(sstart)i_%(send)i_cns" % gnew
                # and give them both an id so we know they were a pair.
                new_qname += "_%s" % (protein_or_rna)
                new_sname += "_%s" % (protein_or_rna)
                try:
                    qstrand = qbed.d[gnew['qaccn']]['strand']
                    sstrand = sbed.d[gnew['saccn']]['strand']
                except Exception:
                    # Show the offending record, then let the error through.
                    print >> sys.stderr, gnew
                    raise
                gnew['qaccn'] = new_qname
                gnew['saccn'] = new_sname
                gnew['qstrand'] = qstrand
                gnew['sstrand'] = sstrand
                renamed.append((gnew, info))
        return n

    name, ext = op.splitext(cnsfile)
    real_cns_fh = open("%s.real%s" % (name, ext), "w")
    try:
        print >> sys.stderr, "writing to:", real_cns_fh.name
        outdir = op.dirname(cnsfile)
        print >> real_cns_fh, "#qseqid,qaccn,sseqid,saccn,qstart,qend,sstart,send,eval"

        crna = read_cns_to_rna(outdir)
        cpro = read_cns_to_protein_exons(outdir)

        proteins = collections.defaultdict(list)
        rnas = collections.defaultdict(list)
        real_cns_items = []
        for cnsi in CNS.parse_raw_line(cnsfile):
            cns_id = cnsi.cns_id
            cns = cnsi.to_dict()
            key = (cns['qseqid'], cns['sseqid'])
            if cns_id in cpro:
                proteins[key].append((cns, cpro[cns_id]))
            elif cns_id in crna:
                rnas[key].append((cns, crna[cns_id]))
            else:
                real_cns_items.append((cns_id, cns))
        # The trees are built before the records are renamed below; this
        # order is kept from the original.
        p_trees = fill_tree(proteins)
        r_trees = fill_tree(rnas)

        nproteins = assign_new_names(proteins, "protein")
        nrnas = assign_new_names(rnas, "rna")

        cns_seen = set()
        # go through the remaining cnss, print and assign them to the new
        # genes (previously cnss) within dist.
        for cns_id, cns in real_cns_items:
            print >> real_cns_fh, cns_to_str(cns)
            key = (cns['qseqid'], cns['sseqid'])

            # Proteins first, then rnas.
            for trees, renamed in ((p_trees, nproteins), (r_trees, nrnas)):
                for new_gene, info in get_new(cns, trees, key, renamed,
                                              dist + 1000):
                    cns['qaccn'] = new_gene['qaccn']
                    cns['saccn'] = new_gene['saccn']
                    cns_str = cns_to_str(cns)
                    if cns_str in cns_seen:
                        continue
                    cns_seen.add(cns_str)
                    print >> real_cns_fh, cns_str
    finally:
        # Guarantees the ".real" cns file is flushed and closed, also when
        # one of the steps above fails.
        real_cns_fh.close()

    qbed_list, qnew_pairs = merge_bed(qbed, nproteins, nrnas, ortho_trees, 'q')
    print >> sys.stderr, len(qnew_pairs)
    # dont need to do the orthos 2x so send in empty dict.
    sbed_list, snew_pairs_unused = merge_bed(sbed, nproteins, nrnas, {}, 's')

    # if it's the same org, the new genes show up in both lists, so both
    # are sent in together; print_bed handles the repeats.
    if qbed.path == sbed.path:
        qbed_new = sbed_new = print_bed(qbed_list + sbed_list, qbed.path)
    else:
        qbed_new = print_bed(qbed_list, qbed.path)
        sbed_new = print_bed(sbed_list, sbed.path)

    return qbed_new.path, sbed_new.path, qnew_pairs
コード例 #21
0
def main(old_bed, new_bed1, new_bed2, merge_path):
    """Merge the genes that two new bed files add to ``old_bed``.

    Genes new in ``new_bed1`` are combined with every overlapping gene that
    is new in ``new_bed2`` (via ``update_locs``) and written to
    ``merge_path``.  Genes new in ``new_bed2`` that overlapped nothing are
    written as they are, and ``write_old_bed`` is called last with the same
    handle.

    Fixes:
      * the no-overlap branch wrote the undefined name ``gene`` (NameError);
        it now writes ``new_gene``;
      * the output handle is closed when the function finishes or fails.
    """
    merge_fh = open(merge_path, 'w')
    try:
        new_genes_1 = new_genes(old_bed, new_bed1)
        new_genes_2 = new_genes(old_bed, new_bed2)
        print("%d %d" % (len(new_genes_1), len(new_genes_2)))
        all_overlap = []
        for new_gene in new_genes_1:
            # does it overlap with any of the other new genes?
            overlapping_genes = overlapping(new_gene['start'], new_gene['end'],
                                            new_gene['strand'],
                                            new_gene['seqid'], new_genes_2)
            # NOTE(review): this only matches if ``overlapping`` returns the
            # number 0 for "no overlap"; an empty list falls through to the
            # merge path below.  Comparison kept as in the original.
            if overlapping_genes == 0:
                write_bed(new_gene, merge_fh)
                continue
            # remember every overlapping accn
            all_overlap.extend(overlapping_genes)
            for overlapping_gene in overlapping_genes:
                new_gene = update_locs(new_gene, overlapping_gene)
            # NOTE(review): the return value of merge_feats was never used;
            # the call is kept in case it modifies new_gene in place.
            merge_feats(new_gene)
            write_bed(new_gene, merge_fh)
        for new_gene2 in new_genes_2:
            if new_gene2 not in all_overlap:
                write_bed(new_gene2, merge_fh)
        write_old_bed(new_genes_1, new_genes_2, old_bed, merge_fh)
    finally:
        merge_fh.close()


# Driver call executed at import time: merges the genes the setaria and
# sorghum runs add to rice_v6.bed and writes the result to the file "test".
main(Bed("data/rice_v6_setaria64/rice_v6.bed"),
     Bed("data/rice_v6_setaria64/rice_v6.all2.bed"),
     Bed("data/rice_v6_sorghum_v1/rice_v6.all2.bed"), "test")
コード例 #22
0
            print >> fcnss, "%s,%s,%s,[%s,%s],%s,%s" % (qname, qfeat['seqid'], sname, sfeat['qleft_gene'], sfeat['qright_gene'], sfeat['seqid'],
                             ",".join(map(lambda l: ",".join(map(str,l)), cnss)))

    return None

if __name__ == "__main__":
    # Command-line entry point: parse the options, load both bed files with
    # their fasta files and run main().
    import optparse
    parser = optparse.OptionParser("usage: %prog [options] ")
    parser.add_option("-F", dest="mask", help="blast mask simple sequence [default: F]", default="F")
    parser.add_option("-n", dest="ncpu", help="parallelize to this many cores", type='int', default=8)
    parser.add_option("-q", dest="qfasta", help="path to genomic query fasta")
    parser.add_option("--qbed", dest="qbed", help="query bed file")
    parser.add_option("-s", dest="sfasta", help="path to genomic subject fasta")
    parser.add_option("--sbed", dest="sbed", help="subject bed file")
    parser.add_option("-p", dest="pairs", help="the pairs file. output from dagchainer")
    choices = ("dag", "cluster", "pair", 'qa', 'raw', 'pck')
    parser.add_option("--pair_fmt", dest="pair_fmt", default='raw',
                          help="format of the pairs, one of: %s" % str(choices),
                          choices=choices)
    (options, _) = parser.parse_args()


    if not (options.qfasta and options.sfasta and options.sbed and options.qbed):
        sys.exit(parser.print_help())

    # The previous `assert options.mask in 'FT'` was a substring test, so it
    # also accepted "" and "FT", and it disappears under `python -O`.
    # Validate explicitly, before the (expensive) bed loading.
    if options.mask not in ("F", "T"):
        parser.error("-F must be 'F' or 'T', got %r" % (options.mask,))

    qbed = Bed(options.qbed, options.qfasta); qbed.fill_dict()
    sbed = Bed(options.sbed, options.sfasta); sbed.fill_dict()

    main(qbed, sbed, options.pairs, options.pair_fmt, options.mask, options.ncpu)
コード例 #23
0
    write_new_bed(new_genes_final, old_bed, missed_genes, out_file)
    sort_file = "sort -n -k 1 -k 2 {0} -o {0}".format(out_file)
    commands.getstatusoutput(sort_file)


if __name__ == "__main__":
    # Command-line entry point: read the four paths, open the two bed files
    # and run main().
    import optparse
    parser = optparse.OptionParser("usage: %prog [options] ")

    # (flag, destination attribute, help text), registered in this order.
    option_table = (
        ("--missed_bed", "new_bed",
         "missed ORGA from ORGB bed file from coanno "),
        ("--missed_matches", "missed_genes",
         "missed ORGA from ORGB matches.txt file from coanno"),
        ("--old_bed", "old_bed",
         "orginal bed file for ORG"),
        ("--out", "out_fh",
         "out_file: where the new merged bed should go"),
    )
    for flag, dest_name, help_text in option_table:
        parser.add_option(flag, dest=dest_name, help=help_text)
    options, _ = parser.parse_args()

    new_bed = Bed(options.new_bed)
    old_bed = Bed(options.old_bed)

    main(options.missed_genes, old_bed, new_bed, options.out_fh)

#merge_same_hits(Bed('data/athaliana_lyrata2/missed_lyrata_from_athaliana.bed'),'data/athaliana_lyrata2/missed_lyrata_from_athaliana.matches.txt',Bed('data/athaliana_lyrata2/lyrata.bed'))
#merge(Bed('data/athaliana_lyrata2/lyrata.bed'),Bed('data/athaliana_lyrata2/missed_from_lyrata.bed'),'data/athaliana_lyrata2/lyrata.all.bed')
コード例 #24
0
ファイル: cleanup.py プロジェクト: yuzhenpeng/find_cns
 def __init__(self, filename, bed):
     """Keep the file name and a loaded, dict-filled Bed on the instance.

     filename -- stored unchanged as ``self.filename``.
     bed      -- handed to ``Bed(...)`` (presumably a bed file path --
                 confirm against callers); ``fill_dict()`` is called on
                 the resulting object, which is stored as ``self.bed``.
     """
     loaded_bed = Bed(bed)
     loaded_bed.fill_dict()
     self.filename = filename
     self.bed = loaded_bed
コード例 #25
0
ファイル: localdup_maize.py プロジェクト: yuzhenpeng/find_cns
                      type='string',
                      help="path to query localdup_file")
    parser.add_option("--sdups",
                      dest="sdups",
                      type='string',
                      help="path to subject localdup_file")
    parser.add_option("--cns_file",
                      dest="cns_file",
                      type='string',
                      help="path to cns file cns.txt")
    parser.add_option("--UMfasta",
                      dest="unmasked_fasta",
                      help="path to unmasked fasta file file")
    (options, _) = parser.parse_args()

    qbed = Bed(options.qbed, options.qfasta)
    qbed.fill_dict()
    sbed = Bed(options.sbed, options.sfasta)
    sbed.fill_dict()
    unmasked_fasta = Fasta(options.unmasked_fasta)
    assert options.mask in 'FT'

    qnolocaldups_path = qbed.path.split(".")[0] + ".nolocaldups.bed"
    snolocaldups_path = sbed.path.split(".")[0] + ".nolocaldups.bed"
    #pairs_to_qa("{0}.local".format(options.pairs),'pair',"{0}.nolocaldups.local".format(qbed.path.split(".")[0]),"{0}.nolocaldups.local".format(sbed.path.split(".")[0]),"{0}.raw.filtered.local".format(options.pairs.split(".")[0]))

    import logging
    LOG_FILENAME = path.dirname(options.qfasta) + "dup_rdups.log"
    logging.basicConfig(filename=LOG_FILENAME, level=logging.INFO)

    main(options.cns_file, options.qdups, options.sdups, options.pairs,
コード例 #26
0
        #     print interval_list[0].find(0,100000000)
        #     print interval_list[0].find(3577840,3577841)
        #     print three_prom
        #     print gene_body
        #     print five_prom
        #three_prom = [i for i in three_prom if i.name == gene['strand']]
        #five_prom = [i for i in five_prom if i.name == gene['strand']]
        #gene_body = [i for i in gene_body if i.name == gene['strand']]

        if len(three_prom) > 0:
            l = "{0}\t3_prom\t{1}\t{2}\t{3}\n".format(
                gene_name, three_prom_p, three_prom_p * len(three_prom),
                sum(int(sig.name) for sig in three_prom))
            out.write(l)
        if len(five_prom) > 0:
            l = "{0}\t5_prom\t{1}\t{2}\t{3}\n".format(
                gene_name, five_prom_p, five_prom_p * len(five_prom),
                sum(int(sig.name) for sig in five_prom))
            out.write(l)
        if len(gene_body) > 0:
            l = "{0}\tgene_body\t{1}\t{2}\t{3}\n".format(
                gene_name, gene_body_p, gene_body_p * len(gene_body),
                sum(int(sig.name) for sig in gene_body))
            out.write(l)


# Module-level driver: these statements sit at column 0 with no
# `if __name__ == "__main__"` guard visible, so they run on import.
# Inputs are hard-coded, relative file names.
genelist = Bed("sorg.bed")
interval_list = insert_queries("DMR_NONVAS_CG_HYPO")

# NOTE(review): 3000 is presumably the promoter window size and
# "DMR_nonvas.genes" the output file name -- confirm against the
# find_intersections signature.
find_intersections(3000, interval_list, genelist, "DMR_nonvas.genes")
コード例 #27
0
    parser.add_option("--paralogy",  dest="paralogy",  help="paralogy file")
    parser.add_option("--orthology",  dest="orthology",  help="orthology file")

    opts, _ = parser.parse_args()

    if not (opts.qflat_all and opts.sflat_all and opts.datasheet):
        print "A"
        sys.exit(parser.print_help())
    if not (opts.qdsgid and opts.qorg and opts.sorg):
        print "B"
        sys.exit(parser.print_help())
    if not (opts. qdups and opts.sdups and opts.paralogy and opts.orthology):
        print "C"
        sys.exit(parser.print_help())

    qflat_new = Bed(opts.qflat_new)
    sflat_new = qflat_new if opts.qflat_new == opts.sflat_new else Bed(opts.sflat_new)

    qflat_all = Bed(opts.qflat_all)
    sflat_all = qflat_all if opts.qflat_all == opts.sflat_all else Bed(opts.sflat_all)

    qfpath = "%s.all%s" % op.splitext(qflat_new.path)
    sfpath = "%s.all%s" % op.splitext(sflat_new.path)

    qflat = merge_flat(qfpath, qflat_all, qflat_new)
    sflat = merge_flat(sfpath, sflat_all, sflat_new)

    
    qdups = parse_dups(opts.qdups, qflat)
    sdups = parse_dups(opts.sdups, sflat)
    qlocaldups = parse_dups(opts.qlocaldups,qflat)
コード例 #28
0
        #print line.split(",")[:5]
        cns_id, accn, seqid, start, end, strand = line.split(",")[:6]
        w = '{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t.\t.\t.\t1\t{6}\t0\n'.format(
            seqid, start, end, cns_id, (int(end) - int(start)), strand,
            (int(end) - int(start) + 1))
        cns_bed.write(w)


#cns_to_bed("/Users/gt/data/paper4/rice_j_setaria_n/rice_j_setaria_n.cns.assigned_real.csv","/Users/gt/data/paper4/rice_j_setaria_n/rice_j_setaria_n_cns.bed")
#merge_flat("/Users/gt/data/paper4/rice_j_setaria_n/rice_j_set.nolocaldups.with_new_cns.bed",Bed("/Users/gt/data/paper4/rice_j_setaria_n/rice_j.nolocaldups.with_new.all.local"),Bed("/Users/gt/data/paper4/rice_j_setaria_n/rice_j_setaria_n_cns.bed"))
# Module-level driver with hard-coded absolute paths; no
# `if __name__ == "__main__"` guard is visible, so both calls run on import.
# Order matters: the second call builds a Bed from the "_cns.bed" path that
# is passed as the second argument of the first call.
cns_to_bed(
    "/Users/gt/data/paper4/rice_j_sorghum_n/rice_j_sorghum_n.cns.assigned_real.csv",
    "/Users/gt/data/paper4/rice_j_sorghum_n/rice_j_sorghum_n_cns.bed")
# NOTE(review): the first argument is presumably the merged output path and
# the two Bed objects the inputs to merge -- confirm against merge_flat.
merge_flat(
    "/Users/gt/data/paper4/rice_j_sorghum_n/rice_j_sorg.nolocaldups.with_new_cns.bed",
    Bed("/Users/gt/data/paper4/rice_j_sorghum_n/rice_j.nolocaldups.with_new.all.local"
        ),
    Bed("/Users/gt/data/paper4/rice_j_sorghum_n/rice_j_sorghum_n_cns.bed"))


def mask_to_bed(fasta_file, mask_bed_name):
    """Format one bed-style line per masked run (one or more ``X``) in a fasta.

    fasta_file    -- passed to ``Fasta(...)``; each sequence is sliced in
                     full and scanned with the regex ``X+``.
    mask_bed_name -- path opened for writing in binary mode (``"wb"``).

    NOTE(review): in the part of the function visible here the formatted
    line ``w`` is never written to ``mask_bed`` and the handle is never
    closed; the excerpt looks cut off -- confirm against the full source.
    """
    mask_bed = open(mask_bed_name, "wb")
    f = Fasta(fasta_file)
    # The counter is incremented before it is used, so the first name
    # produced is "mask_id 2", and numbering continues across sequences.
    mask_id = 1
    for seqid in f.keys():
        seq = f[seqid][:]
        for m in re.finditer("X+", seq):
            mask_id = mask_id + 1
            # Tab-separated fields: seqid, match start, match end, name,
            # (end - start), "+", three "." placeholders, 1,
            # (end - start + 1), 0.
            w = '{0}\t{1}\t{2}\t{3}\t{4}\t+\t.\t.\t.\t1\t{5}\t0\n'.format(
                seqid, m.start(), m.end(), "mask_id {0}".format(mask_id),
                (m.end() - m.start()), (m.end() - m.start() + 1))
コード例 #29
0
ファイル: merge.py プロジェクト: yuzhenpeng/find_cns
    org_bed_path = org_bed.path
    path = org_bed_path.split('/')
    dirc = '/'.join(path[:-1])
    org = path[-1]
    missed2 = '{0}/missed_from_{1}'.format(dirc, org)
    merge_fh = "{0}/all_{1}".format(dirc, org)
    print missed2
    merge(org_bed, Bed(missed2), merge_fh)


if __name__ == "__main__":
    # Command-line entry point: parse the three paths, load the two bed
    # files and hand everything to main().
    import optparse
    parser = optparse.OptionParser("usage: %prog [options] ")
    parser.add_option("--missed",
                      dest="missed",
                      help="missed ORGA from ORGB bed file from coanno ")
    parser.add_option(
        "--match",
        dest="fh_match",
        help="missed ORGA from ORGB matches.txt file from coanno")
    parser.add_option("--org", dest="org_bed", help="orginal bed file for ORG")
    (options, _) = parser.parse_args()

    # All three options are used below. Fail early with a usage message
    # instead of letting a None value reach Bed() or main().
    _required = (("--missed", options.missed),
                 ("--match", options.fh_match),
                 ("--org", options.org_bed))
    _missing = [flag for flag, value in _required if not value]
    if _missing:
        # parser.error() prints the usage plus this message and exits.
        parser.error("missing required option(s): %s" % ", ".join(_missing))

    missed_bed = Bed(options.missed)
    org_bed = Bed(options.org_bed)

    main(missed_bed, options.fh_match, org_bed)

#merge_same_hits(Bed('data/athaliana_lyrata2/missed_lyrata_from_athaliana.bed'),'data/athaliana_lyrata2/missed_lyrata_from_athaliana.matches.txt',Bed('data/athaliana_lyrata2/lyrata.bed'))
#merge(Bed('data/athaliana_lyrata2/lyrata.bed'),Bed('data/athaliana_lyrata2/missed_from_lyrata.bed'),'data/athaliana_lyrata2/lyrata.all.bed')
コード例 #30
0
 def test_main(self):
     """Run ``main`` on the fixture beds and print whatever it returns.

     Nothing is asserted in the lines visible here; the result is only
     printed.
     """
     # Build the query and subject Bed objects from the paths stored on the
     # test case and populate their lookup dicts before calling main().
     qbed = Bed(self.qbed, self.qfasta); qbed.fill_dict()
     sbed = Bed(self.sbed, self.sfasta); sbed.fill_dict()
     # NOTE(review): the two 12000 values are presumably the query/subject
     # padding, "T" the mask flag and 2 the CPU count -- confirm against
     # the signature of main().
     x = main(qbed, sbed, self.pairs, 12000,12000, "pair", self.blast_path, "T",2)
     print x