コード例 #1
0
if __name__ == '__main__' :

    # parse command line arguments
    opts, args = parser.parse_args(sys.argv[1:])

    # organism and experiment file are mandatory, the control file is an
    # optional third positional argument
    if len(args) < 2 :
        parser.error('Must provide two non-option arguments')

    # filenames and paths
    organism, experiment_fn = args[0:2]
    control_fn = args[2] if len(args) > 2 else None

    org_settings = get_org_settings(organism)
    refseq_fn = org_settings['annotation_path']

    # hidden working directory named after the experiment file and --exp-name
    exp_fpath,exp_fname,exp_fbase,exp_fext = get_file_parts(experiment_fn)
    exp_wrk_dir = os.path.abspath('.exp_%s_%s'%(exp_fbase,opts.exp_name))

    # the control gets its own working directory, only when one was supplied
    if control_fn :
        cnt_fpath,cnt_fname,cnt_fbase,cnt_fext = get_file_parts(control_fn)
        cnt_wrk_dir = os.path.abspath('.cnt_%s_%s'%(cnt_fbase,opts.exp_name))

    # the pipeline
    pipeline = Pypeline()

    steps = []

    # split up files
コード例 #2
0
        # draw the peak FDR histogram, write it to fdr_hist_fn, then clear
        # the figure so the next plot starts empty
        mp.subplots_adjust(**subplots_sizes)
        mp.hist(peak_stats['fdr'],label=hist_labels[0],bins=20,log=True)
        mp.title('%s\npeak fdr distribution'%macs_f.file_info['name'])
        mp.xlabel('fdr')
        mp.ylabel('# peaks')
        mp.legend()
        mp.savefig(fdr_hist_fn)
        mp.clf()

        # file name, local path and public url of the chromosome
        # distribution image for this peak file
        chr_dist_name = macs_f.file_info['name']+'_chr_dist.png'
        chr_dist_fn = os.path.join(infosite_img_path,chr_dist_name)
        chr_dist_url = json_d['stage url']+'/'+infosite_dir_name+'/images/'+chr_dist_name
        peak_json[peak_fn]['chr distribution url'] = chr_dist_url

        # chromosome names come from the organism's chrom sizes file when the
        # organism is known, otherwise from the chromosomes seen in the peaks
        chromos = []
        if 'org' in json_d :
            chr_sizes_fn = get_org_settings(json_d['org'])['ucsc_chrom_sizes']
            # close the sizes file as soon as the names have been read
            with open(chr_sizes_fn) as chr_sizes_f :
                chromos = [r[0] for r in reader(chr_sizes_f,delimiter='\t')]
        else :
            chromos = list(set(pos_chr_dist.keys()).union(neg_chr_dist.keys()))

        # a real list is needed here: it is indexed and sorted in place below
        standard_chromos = [c for c in chromos
                            if re.search('^chr[0-9MXY]+$',c) is not None]

        # hack chrM, chrX and chrY so they sort right
        if 'chrM' in standard_chromos :
            standard_chromos[standard_chromos.index('chrM')] = 'chr100'
        if 'chrX' in standard_chromos :
            standard_chromos[standard_chromos.index('chrX')] = 'chr101'
        if 'chrY' in standard_chromos :
            standard_chromos[standard_chromos.index('chrY')] = 'chr102'

        # numeric sort on whatever follows the 'chr' prefix
        standard_chromos.sort(key=lambda x: int(x.replace('chr','')))
コード例 #3
0
ファイル: sampling.py プロジェクト: hjanime/OmicsIntegrator
def rejection_sample_bg(fg_dict,
                        organism,
                        bins=100,
                        num_samples=None,
                        verbose=False,
                        bg_match_epsilon=1e-3):
    '''Generate background sequences according to the size, distance from genes,
    and GC content distributions of the supplied foreground sequences.

    *fg_dict* is a dictionary of <header>:<sequence> items, where the first
    part of the header must contain:

    >chrX:<start>-<end>

    *organism* is a string that will be used to call the
    *chipsequtil.get_org_settings* function and uses the 'genome_dir' and
    'annotation_path' keys.  *bins* is the number of bins to use for
    representing the GC content distribution.  *num_samples* falls back to
    len(fg_dict) when it is None or 0.  *verbose* turns on stderr messages
    for proposals whose sequence could not be fetched.  *bg_match_epsilon*
    is presumably the tolerance used when matching the background to the
    foreground distributions (TODO confirm).  Function returns a dictionary
    of <header>:<sequence> items of generated background sequences.'''

    # look the organism settings up once and reuse them for both paths
    org_settings = get_org_settings(organism)
    nib_db = NibDB(nib_dirs=[org_settings['genome_dir']])
    tss_fn = org_settings['annotation_path']

    # chromosome -> list of (txStart, txEnd) tuples
    tss = defaultdict(list)
    for rec in RefGeneFile(tss_fn):
        tss[rec['chrom']].append((
            int(rec['txStart']),
            int(rec['txEnd']),
        ))

    # for each peak find the chromosome, distance to nearest
    # gene, size of peaks in bases, and GC content
    num_samples = len(fg_dict) if not num_samples else num_samples
    dists, sizes = [], []

    for header, seq in fg_dict.items():

        # chromosome first field in fasta headers from bed2seq.bedtoseq
        header_fields = header.split(':')
        chrom = header_fields[0]

        # sequences on *random* chromosomes and on chrM are left out
        chrom_lower = chrom.lower()
        if re.search('random', chrom_lower) or chrom_lower == 'chrm':
            continue

        # start first int in second field of bed2seq.bedtoseq header
        start = int(header_fields[1].split('-')[0])
        midpoint = start + len(seq) / 2

        # signed distance from the peak midpoint to every txStart on this
        # chromosome; the one with the smallest magnitude is kept
        dists_to_genes = [(s[0] - midpoint) for s in tss[chrom]]
        if dists_to_genes:
            dists.append(min(dists_to_genes, key=abs))
        else:
            # no annotated gene on this chromosome
            err_str = 'Warning: no genes were found for sequence with header' \
                         ' %s, not using to calculate distributions.\n'%header
            sys.stderr.write(err_str)

        # calculate # bases
        # NOTE(review): the size is recorded even when no gene was found
        # above -- confirm that is intended given the warning text
        sizes.append(len(seq))

    # GC content distribution for the foreground sequences
    gc_dist = get_gc_content_distribution(fg_dict.values(), bins=bins)

    # max_gc is the largest value in the GC content distribution
    max_gc = max(gc_dist)

    # gene_starts is a list of all genes in (chromosome,gene start) tuples
    gene_starts = [(tss_chrom, span[0])
                   for tss_chrom, spans in tss.items()
                   for span in spans]

    # encapsulated function for proposing sequences
    def propose_sequence(dists, gene_starts, sizes, nib_db):
        # sample a random distance from the list of distances
        d = random.choice(dists)

        # pick a random gene
        chrom, coord = random.choice(gene_starts)

        # propose a starting point for the bg sequence
        midpoint = coord - d + random.randint(-100, 100)

        # propose a size for the bg sequence
        size = random.choice(sizes)
        start = int(midpoint - int(size / 2))
        stop = int(midpoint + int(size / 2))

        #sys.stderr.write("%s:coord=%d size=%d midpoint=%d d=%d\n"%(chrom,coord,size,midpoint,d))

        # randomly choose strand (drawn before the coordinate check so the
        # random number stream is consumed the same way for every proposal)
        strand = '+' if random.random() > 0.5 else '-'

        # seq stays None when the proposal is unusable: negative coordinates
        # are rejected without querying the nib files, and lookup errors are
        # reported (when verbose) and otherwise ignored
        seq = None
        if start >= 0 and stop >= 0:
            try:
                nib_title, seq = nib_db.get_fasta(chrom, start, stop, strand)
            except IOError:
                if verbose:
                    sys.stderr.write('IOError in NibDB, skipping: %s,%d-%d,%s\n' %
                                     (chrom, start, stop, strand))
            except NibException as e:
                if verbose: sys.stderr.write('NibDB.get_fasta error, %s\n' % e)
コード例 #4
0
                     ';'.join(['%s=%s'%(k,str(v)) for k,v in rec.items()])
        fasta.append((header,seq))

    return fasta


if __name__ == '__main__' :

    # parse command line arguments
    opts, args = parser.parse_args(sys.argv[1:])

    # first argument is the organism, every further argument is a peak file
    if len(args) < 2 :
        parser.error('Must provide at least two non-option arguments')

    # instantiate the NibDB from the organism's 'genome_dir' setting
    organism = args[0]
    nib_dir = get_org_settings(organism)['genome_dir']
    nib_db = NibDB(nib_dirs=[nib_dir])

    # determine specified format
    peak_fmt = opts.peak_format

    # all remaining positional arguments are peak file names
    peak_fns = args[1:]

    # determine if there is an output file; fall back to stdout otherwise
    if opts.output :
        out_f = open(opts.output,'w')
    else :
        out_f = sys.stdout

    # fasta records collected across all peak files
    fasta_recs = []
    for peak_fn in peak_fns :
コード例 #5
0
                  help='file containing a list of gene identifiers to extract, one per line [default: %default]')
# identifier kinds accepted by --gene-type; the first entry is the default
gene_type_choices = ['symbol','refgene']

# -t/--gene-type: how the entries of the gene list should be interpreted
parser.add_option(
    '-t',
    '--gene-type',
    dest='gene_type',
    type='choice',
    choices=gene_type_choices,
    default=gene_type_choices[0],
    help='type of gene identifier in gene list, choose from %s [default: %%default]'%gene_type_choices,
)

# -o/--output: destination for the fasta records
parser.add_option(
    '-o',
    '--output',
    dest='output',
    default=None,
    help='file to write fasta records to [default: stdout]',
)

if __name__ == '__main__' :

    opts, args = parser.parse_args(sys.argv[1:])

    # the single positional argument names the organism
    if len(args) != 1 :
        parser.error('Exactly one argument is required')

    org_settings = get_org_settings(args[0])

    # gene annotation for the organism
    refgene_fn = org_settings['refgene_anno_path']
    refgene_f = RefGeneFile(refgene_fn)

    nib_db = NibDB(nib_dirs=[org_settings['genome_dir']])

    # optional list of gene identifiers, one per line; the file is closed as
    # soon as it has been read and blank lines are ignored
    gene_list = None
    if opts.gene_list :
        with open(opts.gene_list) as gene_list_f :
            gene_list = [x.strip() for x in gene_list_f if x.strip()]

    # field used to look up gene identifiers
    # NOTE(review): 'bin' is what remains in effect for the default 'symbol'
    # gene type -- confirm that this is the intended field
    id_index = 'bin'
    if opts.gene_type != gene_type_choices[0] :
        if opts.gene_type  == 'refgene' :
            id_index = 'name'
コード例 #6
0
ファイル: sampling.py プロジェクト: aabaker99/OmicsIntegrator
def rejection_sample_bg(fg_dict,organism,bins=100,num_samples=None,verbose=False,
                        bg_match_epsilon=1e-3) :
    '''Generate background sequences according to the size, distance from genes,
    and GC content distributions of the supplied foreground sequences.

    *fg_dict* is a dictionary of <header>:<sequence> items, where the first
    part of the header must contain:

    >chrX:<start>-<end>

    *organism* is a string that will be used to call the
    *chipsequtil.get_org_settings* function and uses the 'genome_dir' and
    'annotation_path' keys.  *bins* is the number of bins to use for
    representing the GC content distribution.  *num_samples* falls back to
    len(fg_dict) when it is None or 0.  *verbose* turns on stderr messages
    for proposals whose sequence could not be fetched.  *bg_match_epsilon*
    is presumably the tolerance used when matching the background to the
    foreground distributions (TODO confirm).  Function returns a dictionary
    of <header>:<sequence> items of generated background sequences.'''

    # look the organism settings up once and reuse them for both paths
    org_settings = get_org_settings(organism)
    nib_db = NibDB(nib_dirs=[org_settings['genome_dir']])
    tss_fn = org_settings['annotation_path']

    # chromosome -> list of (txStart, txEnd) tuples
    tss = defaultdict(list)
    for rec in RefGeneFile(tss_fn) :
        tss[rec['chrom']].append((int(rec['txStart']),int(rec['txEnd']),))

    # for each peak find the chromosome, distance to nearest
    # gene, size of peaks in bases, and GC content
    num_samples = len(fg_dict) if not num_samples else num_samples
    dists,sizes=[],[]

    for header,seq in fg_dict.items() :

        # chromosome first field in fasta headers from bed2seq.bedtoseq
        header_fields = header.split(':')
        chrom = header_fields[0]

        # sequences on *random* chromosomes and on chrM are left out
        chrom_lower = chrom.lower()
        if re.search('random',chrom_lower) or chrom_lower == 'chrm' :
            continue

        # start first int in second field of bed2seq.bedtoseq header
        start = int(header_fields[1].split('-')[0])
        midpoint = start + len(seq)/2

        # signed distance from the peak midpoint to every txStart on this
        # chromosome; the one with the smallest magnitude is kept
        dists_to_genes = [(s[0]-midpoint) for s in tss[chrom]]
        if dists_to_genes :
            dists.append(min(dists_to_genes,key=abs))
        else :
            # no annotated gene on this chromosome
            err_str = 'Warning: no genes were found for sequence with header' \
                         ' %s, not using to calculate distributions.\n'%header
            sys.stderr.write(err_str)

        # calculate # bases
        # NOTE(review): the size is recorded even when no gene was found
        # above -- confirm that is intended given the warning text
        sizes.append(len(seq))

    # GC content distribution for the foreground sequences
    gc_dist = get_gc_content_distribution(fg_dict.values(),bins=bins)

    # max_gc is the largest value in the GC content distribution
    max_gc = max(gc_dist)

    # gene_starts is a list of all genes in (chromosome,gene start) tuples
    gene_starts = [(tss_chrom,span[0])
                   for tss_chrom,spans in tss.items()
                   for span in spans]

    # encapsulated function for proposing sequences
    def propose_sequence(dists, gene_starts, sizes, nib_db) :
        # sample a random distance from the list of distances
        d = random.choice(dists)

        # pick a random gene
        chrom, coord = random.choice(gene_starts)

        # propose a starting point for the bg sequence
        midpoint = coord-d+random.randint(-100,100)

        # propose a size for the bg sequence
        size = random.choice(sizes)
        start = int(midpoint-int(size/2))
        stop = int(midpoint+int(size/2))

        #sys.stderr.write("%s:coord=%d size=%d midpoint=%d d=%d\n"%(chrom,coord,size,midpoint,d))

        # randomly choose strand (drawn before the coordinate check so the
        # random number stream is consumed the same way for every proposal)
        strand = '+' if random.random() > 0.5 else '-'

        # seq stays None when the proposal is unusable: negative coordinates
        # are rejected without querying the nib files, and lookup errors are
        # reported (when verbose) and otherwise ignored
        seq = None
        if start >= 0 and stop >= 0 :
            try :
                nib_title, seq = nib_db.get_fasta(chrom,start,stop,strand)
            except IOError :
                if verbose : sys.stderr.write('IOError in NibDB, skipping: %s,%d-%d,%s\n'%(chrom,start,stop,strand))
            except NibException as e :
                if verbose : sys.stderr.write('NibDB.get_fasta error, %s\n'%e)
コード例 #7
0
ファイル: find_msp_sites.py プロジェクト: cwng/RRBS
def _find_ccgg_sites(fafile):
    '''Return (ch, match) for the fasta file *fafile*.

    *ch* is the text after '>' on the last header line seen (None when the
    file has no header line).  *match* lists the 0-based offset of every
    lowercase 'ccgg' occurrence, counted over the concatenated sequence
    lines.  Offsets are not reset at header lines, so the result is only
    meaningful for a file holding a single record.'''
    # every line is scanned with the last three bases of the previous line
    # prepended so that sites spanning a line break are found; *start* is
    # therefore kept three behind the offset of the current line
    start = -3
    hang = 'NNN'
    ch = None
    match = []

    with open(fafile) as fa:
        for line in fa:
            l = line.rstrip('\r\n')
            if l.startswith('>'):
                ch = l[1:]
                continue
            seq = hang + l
            # str.find walks every occurrence, including one that ends on
            # the very last base of the line
            i = seq.find('ccgg')
            while i != -1:
                match.append(start + i)
                i = seq.find('ccgg', i + 1)
            hang = seq[-3:]
            start += len(l)

    return ch, match


def seq_msp(fafile,seqfile,genome='mm9',convert=True,bedFrag=False):
    '''Write the 40bp end sequences of CCGG-to-CCGG fragments to *seqfile*.

    Lowercase 'ccgg' sites (presumably MspI recognition sites, going by the
    function name) are located in the fasta file *fafile*.  Each pair of
    adjacent sites lying more than 40 and fewer than 250 bases apart forms a
    fragment.  For every fragment two 40-base loci are fetched through
    NibDB: one on the '+' strand starting at the first site and one on the
    '-' strand ending at the second site.  The sequences are written with
    Fasta.write under keys of the form <chr>:<position><strand>.

    *genome* selects the nib directory: 'hg18' uses a fixed path, any other
    value is looked up with chipsequtil.get_org_settings.  When *convert* is
    true every lowercase 'c' in the fetched sequences is replaced by 't'.
    When *bedFrag* is true the fragments are also saved as a tab-delimited
    file named after *seqfile* with '.fa' replaced by '_frag.bed'.

    Prints the number of sites and the number of fragments.  Raises
    ValueError when fragments were found but *fafile* has no '>' header.'''
    #find CCGG positions using Fasta file
    ch, match = _find_ccgg_sites(fafile)
    print(len(match))

    #keep adjacent site pairs whose spacing is strictly between 40 and 250
    FRAG = [(x, y) for x, y in zip(match[:-1], match[1:]) if 40 < y - x < 250]
    print(len(FRAG))

    if FRAG and ch is None:
        # without a header there is no chromosome name to build loci from
        raise ValueError("no '>' header line found in %s" % fafile)

    #nibDB the cut sites 40bp 5'-3' and
    #save each as a pair of Fasta items with keys chr:position(strand)
    ids, loci = [], []
    BF = []
    for x, y in FRAG:
        if bedFrag:
            BF.append([ch, str(x + 1), str(y + 3)])

        #for x: 40 bases on the '+' strand, keyed by their start
        ids.append(ch + ':' + str(x + 1) + '+')
        loci.append((ch, x + 1, x + 41, '+'))

        #for y: 40 bases on the '-' strand, keyed by their stop
        ids.append(ch + ':' + str(y + 3) + '-')
        loci.append((ch, y - 37, y + 3, '-'))

    if bedFrag:
        np.savetxt(seqfile.replace('.fa', '_frag.bed'), BF, fmt='%s', delimiter='\t')

    if genome == 'hg18':
        DB = NibDB(nib_dirs='/nfs/genomes/human_gp_mar_06/')
    else:
        # honour the requested genome instead of always loading mm9
        DB = NibDB(nib_dirs=chipsequtil.get_org_settings(genome)['genome_dir'])

    _, seqs = DB.get_fasta_batch(loci)

    # both strands are stored exactly as returned by NibDB, apart from the
    # optional c -> t conversion
    seq_dict = {}
    for key, seq in zip(ids, seqs):
        seq_dict[key] = seq.replace('c', 't') if convert else seq
    Fasta.write(seq_dict, seqfile)
コード例 #8
0
from chipsequtil import get_org_settings, BEDFile
from chipsequtil.nib import NibDB
from pprint import pprint

# locate the nib files of the mm9 genome through the organism settings
mm9_settings = get_org_settings('mm9')
genome_dir = mm9_settings['genome_dir']

# fetch the sequence of every region listed in the bed file
db = NibDB(nib_dirs=[genome_dir])
bed_fasta = db.get_fasta_from_bed('shuffled_peaks.bed')
fasta_headers, seqs = bed_fasta

# show the first ten sequences
first_seqs = seqs[:10]
pprint(first_seqs)
コード例 #9
0
    num_peak_bases = 0
    for header, seq in fg.items() :
        num_peak_bases += len(seq)


if __name__ == '__main__' :

    # parse command line arguments
    opts, args = parser.parse_args(sys.argv[1:])

    # positional arguments: sample type, organism, foreground fasta file
    if len(args) < 3 :
        parser.error('Must provide three non-option arguments')

    sample_type, organism, fg_fn = args[:3]

    settings_dict = get_org_settings(organism)

    # load the foreground sequences and sample a background for them
    fg = Fasta.load(fg_fn)
    bg = rejection_sampling(fg,settings_dict)


###############################################################
# start Chris' code from rej_samp_bg_rand2.py
    the_genes={} # NOTE(review): originally described as a list of distances to nearest TSS, but this is a dict -- confirm

    # for each peak find the chromosome, distance to nearest
    # gene, size of peaks in bases, and GC content
    the_chrs,dists,sizes,gcs=[],[],[],[]

    # number of bases in the fg sequences
    size=0
コード例 #10
0
ファイル: motif_scan.py プロジェクト: dvanderk/chipsequtil
    # for pvalue vs motif score
    pval_num_bins = 20
    pval_bin_size = all_peaks[:]['pvalue'].size/pval_num_bins
    # try to take at least 100 sequences, at most 10% of bin size
    sample_percent = max(min(1.,100./pval_bin_size),0.1)
    pval_bin_memo = {}

    if opts.top_n is not None :
        peaks = all_peaks[0:opts.top_n]
        peak_pvals = peak_pvals[peak_pval_inds][0:opts.top_n]
    else :
        peaks = all_peaks

    # extract fasta sequences for these peaks
    nibDb = NibDB(nib_dirs=get_org_settings(org)['genome_dir'])

    """
    # get the peak sequences
    sys.stderr.write('Getting peak sequences\n')
    fasta_batch = []
    for i in range(peaks.size) :
        fasta_batch.append((str(peaks[i]['chr']),int(peaks[i]['start']),int(peaks[i]['end']),'+'))
    fg_fasta_headers, fg_fasta = nibDb.get_fasta_batch(fasta_batch)

    # need a dict for background sampling
    # headers have genome_dir and .nib in them, strip that out
    sys.stderr.write('Converting nib output to dict\n')
    fg_fasta_headers = list(fg_fasta_headers)
    fg_fasta_dict = {}
    for h,s in zip(fg_fasta_headers,fg_fasta) :