parser.add_option('-o','--output',dest='output',default=None,
                  help='file to write fasta records to [default: stdout]')

# Script entry point.  The single positional argument is handed to
# get_org_settings(); the returned mapping supplies the refGene annotation
# path and the genome (nib) directory used below.
if __name__ == '__main__' :

    opts, args = parser.parse_args(sys.argv[1:])

    # parser.error() is used to report a wrong number of positional arguments
    if len(args) != 1 :
        parser.error('Exactly one argument is required')

    org_settings = get_org_settings(args[0])

    refgene_fn = org_settings['refgene_anno_path']
    refgene_f = RefGeneFile(refgene_fn)

    nib_db = NibDB(nib_dirs=[org_settings['genome_dir']])

    # Optional filter: one identifier per line of the file named by
    # opts.gene_list, with surrounding whitespace stripped.
    # NOTE(review): the file handle is never closed explicitly, and the
    # membership test in the loop below is a linear scan of this list.
    gene_list = None
    if opts.gene_list :
        gene_list = [x.strip() for x in open(opts.gene_list).readlines()]

    # Record field that gene_list entries are compared against.
    # NOTE(review): the field stays 'bin' unless gene_type is 'refgene' AND
    # 'refgene' is not the first entry of gene_type_choices - confirm that
    # 'bin' is really the intended default key.
    id_index = 'bin'
    if opts.gene_type != gene_type_choices[0] :
        if opts.gene_type  == 'refgene' :
            id_index = 'name'

    seq_recs = []
    gene_map = defaultdict(list)
    for rec in refgene_f :
        if gene_list and rec[id_index] not in gene_list : continue # skip this one
        # Window of opts.upstream bases before txStart to opts.downstream bases
        # after it, clamped to [0, number of bases recorded for the chromosome].
        # NOTE(review): txStart is used for every record in the visible code;
        # the record's strand is not consulted here - confirm this is intended.
        st, end = max(0,int(rec['txStart'])-opts.upstream), min(int(rec['txStart'])+opts.downstream,nib_db.db_info[rec['chrom']]['nbases'])
from chipsequtil import get_org_settings, BEDFile
from chipsequtil.nib import NibDB
from pprint import pprint

# Look up the directory configured for the mm9 genome.
mm9_settings = get_org_settings('mm9')
genome_dir = mm9_settings['genome_dir']

# Open a nib database over that directory and ask it for the FASTA headers
# and sequences corresponding to the BED file.
db = NibDB(nib_dirs=[genome_dir])
bed_fasta = db.get_fasta_from_bed('shuffled_peaks.bed')
fasta_headers, seqs = bed_fasta

# Pretty-print the first ten sequences.
pprint(seqs[:10])
# Example #3
# 0
def seq_msp(fafile,seqfile,genome='mm9',convert=True,bedFrag=False):
    """Write 40-base end sequences of ccgg-bounded fragments to a FASTA file.

    The function scans ``fafile`` for every occurrence of the lowercase
    string ``ccgg`` (presumably MspI recognition sites - confirm), keeps each
    pair of consecutive sites that lie more than 40 and fewer than 250 bases
    apart, requests a 40-base window at both ends of every kept fragment from
    a NibDB, and writes the resulting sequences with ``Fasta.write``.

    Parameters
    ----------
    fafile : str
        Path of the FASTA file to scan.  All sequence lines are treated as
        one continuous sequence and only the last ``>`` header seen is used
        as the chromosome name, so the file is expected to hold a single
        record.  Matching is case-sensitive (lowercase ``ccgg`` only).
    seqfile : str
        Output path passed to ``Fasta.write``.  When ``bedFrag`` is true the
        fragment table is written to ``seqfile.replace('.fa', '_frag.bed')``.
    genome : str
        ``'hg18'`` selects a hard-coded human nib directory; any other value
        uses the ``'mm9'`` organism settings.
        NOTE(review): values other than 'hg18' are not looked up themselves -
        confirm whether ``get_org_settings(genome)`` was intended.
    convert : bool
        When true, every lowercase ``c`` in the fetched sequences is replaced
        by ``t`` before writing (presumably an in-silico bisulfite
        conversion - confirm).
    bedFrag : bool
        When true, also save one tab-separated row
        ``chrom, x+1, y+3`` per kept fragment via ``np.savetxt``.

    Raises
    ------
    ValueError
        If fragments were found but the file contained no ``>`` header line,
        so no chromosome name is available for the loci.
    """
    site='ccgg'
    n_line='N'*50   # common all-N line; cannot contain a site, so skip the scan

    # `hang` carries the last 3 bases of the previous line so that a site
    # spanning a line break is still seen; `start` is the offset of hang[0]
    # in the concatenated sequence, hence the initial -3.
    start=-3
    hang='NNN'
    ch=None
    match=[]

    #find ccgg positions using Fasta file
    with open(fafile) as fa:
        for line in fa:
            l=line.strip('\n')
            if not l:
                # blank line: contributes no bases (previously an IndexError)
                continue
            if l[0]=='>':
                ch=l[1:]
                continue
            if l!=n_line:
                seq=hang+l
                # str.find visits every start index up to len(seq)-4, so a
                # site ending exactly at the end of a line is found too (the
                # former range(len(seq)-4) window loop skipped that last
                # 4-mer).  A hit always includes at least one base of `l`,
                # so no site is reported twice across lines.
                i=seq.find(site)
                while i!=-1:
                    match.append(start+i)
                    i=seq.find(site,i+1)
            hang=(hang+l)[-3:]
            start+=len(l)

    print(len(match))

    #keep consecutive site pairs more than 40 and fewer than 250 bases apart
    FRAG=[(x,y) for x,y in zip(match[:-1],match[1:]) if 40<y-x<250]

    print(len(FRAG))

    if FRAG and ch is None:
        raise ValueError('no FASTA header line found in %s' % fafile)

    #build two 40-base loci per fragment, keyed as chr:position(strand)
    seq_dict={}
    ids,loci=[],[]
    BF=[]
    for x,y in FRAG:
        if bedFrag: BF.append([ch,str(x+1),str(y+3)])

        #window at the x end, labelled '+' and keyed by its start coordinate
        ids.append(ch+':'+str(x+1)+'+')
        loci.append((ch,x+1,x+41,'+'))

        #window at the y end, labelled '-' and keyed by its stop coordinate
        ids.append(ch+':'+str(y+3)+'-')
        loci.append((ch,y-37,y+3,'-'))

    if bedFrag: np.savetxt(seqfile.replace('.fa','_frag.bed'),BF,fmt='%s',delimiter='\t')
    if genome=='hg18':  DB=NibDB(nib_dirs='/nfs/genomes/human_gp_mar_06/')
    else:  DB=NibDB(nib_dirs=chipsequtil.get_org_settings('mm9')['genome_dir'])

    # the headers returned by NibDB are not used; the keys built above are
    _,seqs=DB.get_fasta_batch(loci)
    for seq_id,seq in zip(ids,seqs):
        # '+' and '-' entries are stored exactly as returned by NibDB; no
        # additional reversal is applied here for either strand
        if convert: seq_dict[seq_id]=seq.replace('c','t')
        else: seq_dict[seq_id]=seq
    Fasta.write(seq_dict,seqfile)
# Example #4
# 0
    # for pvalue vs motif score: the peaks are split into pval_num_bins bins
    pval_num_bins = 20
    pval_bin_size = all_peaks[:]['pvalue'].size/pval_num_bins
    # Fraction of each bin to sample: 100/bin_size capped at 1.0 (aims for
    # about 100 sequences, or the whole bin when it is smaller), but never
    # less than 0.1, i.e. at least 10% of the bin.
    # NOTE(review): if pval_bin_size evaluates to 0 (e.g. integer division
    # with fewer peaks than bins) the division below raises ZeroDivisionError.
    sample_percent = max(min(1.,100./pval_bin_size),0.1)
    pval_bin_memo = {}

    # Optionally restrict the analysis to the first opts.top_n peaks.
    # NOTE(review): peak_pvals is re-indexed by peak_pval_inds only in the
    # top_n branch; in the else branch it is left as it was - confirm.
    if opts.top_n is not None :
        peaks = all_peaks[0:opts.top_n]
        peak_pvals = peak_pvals[peak_pval_inds][0:opts.top_n]
    else :
        peaks = all_peaks

    # extract fasta sequences for these peaks
    nibDb = NibDB(nib_dirs=get_org_settings(org)['genome_dir'])

    """
    # get the peak sequences
    sys.stderr.write('Getting peak sequences\n')
    fasta_batch = []
    for i in range(peaks.size) :
        fasta_batch.append((str(peaks[i]['chr']),int(peaks[i]['start']),int(peaks[i]['end']),'+'))
    fg_fasta_headers, fg_fasta = nibDb.get_fasta_batch(fasta_batch)

    # need a dict for background sampling
    # headers have genome_dir and .nib in them, strip that out
    sys.stderr.write('Converting nib output to dict\n')
    fg_fasta_headers = list(fg_fasta_headers)
    fg_fasta_dict = {}
    for h,s in zip(fg_fasta_headers,fg_fasta) :