parser = argparse.ArgumentParser() parser.add_argument('rpkmfile') parser.add_argument('nondiatable') parser.add_argument('allelehits') parser.add_argument('genelist') parser.add_argument('-o', '--pdfout', default='ERCCsum_vs_biallelic_nonDia_variableminrpkm_v11.pdf') parser.add_argument('--RNAamountfactor', default=1.0, type=float) parser.add_argument('--lineparams', nargs=2, type=float) parser.add_argument('--xfor20rpkm', default=21, type=float) o = parser.parse_args() ERCCvol_ul = 0.1/40000 ERCC_moleculenumber = calc_ERCC_moleculenumber('ERCC.txt', ERCCvol_ul) * o.RNAamountfactor genelist_first = set(dr_tools.loadlist(o.genelist)) expra = dr_tools.loadexpr(o.allelehits, True) expr = dr_tools.loadexpr(o.rpkmfile, False) spikes_i = [i for i, ID in enumerate(expr['IDs']) if 'ERCC' in ID] genes_i = [i for i, ID in enumerate(expr['IDs']) if 'ERCC' not in ID] # pass 1: get cells per source cells_per_source = defaultdict(list) for p, sample, sample_i, cellsource in table_loader(): cells_per_source[cellsource].append(sample) # middle step: get gene lists per cell source at RPKM cutoff genelist_sources = defaultdict(dict) for source, samples_source in cells_per_source.items(): #samples = set.union(*map(set, cells_per_source.values())) #new in v10 from v9 samples = samples_source for ti, sym in enumerate(expr['symbols']):
from itertools import chain

# Script fragment: parse options, load an expression table, choose the samples
# to use and rank genes by a per-gene statistic. The script continues past the
# end of this chunk.
if '__main__' == __name__:
    parser = argparse.ArgumentParser()
    parser.add_argument('infile')
    parser.add_argument('outfile')
    parser.add_argument('-m', '--maxgenes', type=int)
    parser.add_argument('-S', '--maxgeneselection', choices=['max', 'mean', 'random'], default='max')
    parser.add_argument('-t', '--transform', choices=['none', 'log10+0.3'], default='none')
    parser.add_argument('-c', '--centering', choices=['none', 'mean'], default='none')
    parser.add_argument('-s', '--samplelist')
    parser.add_argument('-e', '--excludesample', nargs='+')
    o = parser.parse_args()

    # load input
    expr = dr_tools.loadexpr(o.infile)

    # select samples
    # An explicit sample list file takes precedence over the samples found in
    # the expression table; --excludesample then removes names from either.
    if o.samplelist is not None:
        samples = dr_tools.loadlist(o.samplelist)
    else:
        samples = expr.samples
    if o.excludesample:
        samples = [s for s in samples if s not in o.excludesample]

    # select genes
    genes_i = range(len(expr['symbols']))
    if o.maxgenes is not None:
        # Statistic used to rank each gene across the selected samples.
        # 'random' ignores the values and draws a random sort key instead.
        select_fn = {'max':max, 'mean':numpy.mean, 'random': (lambda v: random.random())}[o.maxgeneselection]
        # (statistic, gene index) pairs, highest statistic first.
        sort_list = [(select_fn([expr[s][i] for s in samples]), i) for i in genes_i]
        sort_list.sort(reverse=True)
import argparse, dr_tools

# Unfinished script: loads a total-RPKM table and an allele-hit table and
# prepares a per-gene counter over the samples present in both.
if '__main__' == __name__:
    parser = argparse.ArgumentParser()
    # Positional arguments are always required. Passing required=True makes
    # argparse raise "TypeError: 'required' is an invalid argument for
    # positionals" before any parsing happens, so the keyword is dropped.
    parser.add_argument('rpkms')
    parser.add_argument('allelehits')
    parser.add_argument('--minrpkm', type=float, default=20)
    o = parser.parse_args()

    exprt = dr_tools.loadexpr(o.rpkms, False)
    expra = dr_tools.loadexpr(o.allelehits, True)

    # Every second allele column, stripped of its last '_'-separated suffix,
    # intersected with the sample names of the total-RPKM table.
    samples = set(exprt.samples) & set(s.rsplit('_',1)[0] for s in expra.samples[::2])
    count_per_gene = dict()

    # Both tables must list the same genes in the same order.
    assert expra['symbols'] == exprt['symbols']

    # TODO: the per-gene loop was never written. The original stub read
    # `for ti, sym in # not done`, which is a syntax error, so it is kept
    # here as a comment until the loop is implemented.
# NOTE(review): the next statement is the last line of a function whose `def`
# lies above this chunk (presumably `ratio`, which is called below); it is kept
# verbatim at function-body indentation.
    return num_c57only/num_both

# Script fragment: parse options, load allele hits and gene filters, draw
# random sample subsets and compute a per-gene value from `ratio`.
if '__main__' == __name__:
    parser = argparse.ArgumentParser()
    parser.add_argument('-a', '--allelehits', required=True)
    parser.add_argument('-gi', '--allowedgenes')
    parser.add_argument('-ge', '--disallowedgenes', nargs='+', default=[])
    parser.add_argument('-R', '--random_dots', type=int, default=1)
    parser.add_argument('-s', '--samplelist', required=True, nargs='+')
    parser.add_argument('-n', default=4, type=int)
    parser.add_argument('-o', '--figure', default='poolN.pdf')
    parser.add_argument('-S', '--subtract_allelerand', action='store_true')
    parser.add_argument('-r', '--allelerand_skew', action='store_true')
    o = parser.parse_args()

    expra = dr_tools.loadexpr(o.allelehits, True)

    # None means "no allow-list given"; the deny-list is the union of all
    # --disallowedgenes files (empty when the option is omitted).
    if o.allowedgenes:
        allowed_genes = set(dr_tools.loadlist(o.allowedgenes))
    else:
        allowed_genes = None
    disallowed_genes = set()
    for filename in o.disallowedgenes:
        disallowed_genes.update(set(dr_tools.loadlist(filename)))

    # Fixed seed so the random subsets below are reproducible between runs.
    random.seed(0)
    # For each sample list file: o.random_dots independent draws of o.n samples.
    samples_n = dict((samplelist, [random.sample(dr_tools.loadlist(samplelist, ignore='#'), o.n) for di in range(o.random_dots)]) for samplelist in o.samplelist)
    # Base sample names taken from every second allele column.
    samples_all = [sa.split('_c57only')[0] for sa in expra.samples[::2]]
    # Gene index -> ratio(...) computed over all samples.
    allelerand_skew = dict((gi, ratio(expra, gi, samples_all)) for gi in range(len(expra['symbols'])))
# Script fragment (the enclosing scope starts above this chunk): parse options,
# load the allele-count and total-RPKM tables, resolve gene include/exclude
# lists and read the sample column order from the input file header.
opts = argparse.ArgumentParser()
opts.add_argument('inf')
opts.add_argument('rpkmf_total')
opts.add_argument('min_rpkm', type=float)
opts.add_argument('--filter', nargs='+')
opts.add_argument('-f', '--figf', default='plot_monoallelic_by_cell_minrpkm.pdf')
opts.add_argument('-gi', '--genelistf_include', nargs='+')
opts.add_argument('-ge', '--genelistf_exclude', nargs='+')
opts.add_argument('--castfather', action='store_true')
opts.add_argument('--infercross', action='store_true')
opts.add_argument('--alg2', action='store_true')
o = opts.parse_args()

expr = dr_tools.loadexpr([o.inf], counts=True)
exprt = dr_tools.loadexpr([o.rpkmf_total], counts=False)

# Gene index sets from the list files, or None when the option is absent.
# gene_i_by_listf is defined outside this chunk.
allowed_gene_i = gene_i_by_listf(o.genelistf_include, expr) if o.genelistf_include else None
excluded_gene_i = gene_i_by_listf(o.genelistf_exclude, expr) if o.genelistf_exclude else None

def rpkm(Ai, sample):
    # Look up `sample`'s value in the total table for the gene at row Ai of
    # the allele table; the two tables are matched through the gene ID.
    Ti = exprt.ID_to_index[expr['IDs'][Ai]]
    return exprt[sample][Ti]

# Take the sample names from the first '#samples' header row, which keeps the
# column order exactly as written in the file.
for p in dr_tools.splitlines(o.inf):
    if p[0] == '#samples':
        samples = p[1:]
        break
raise # sort the columns sample_order = [name for num_out,name in sorted((num(name), name) for name in sample_values)] # add in removal of e.g. midblast_2-19,midblast_2-20,midblast_2-22 if o.samplenames: with open(o.samplenames, 'r') as infh: requested_samples = set(line.split()[0] for line in infh) sample_order = [name for name in sample_order if name in requested_samples] if requested_samples - set(sample_order): print 'Missing:\n' + '\n'.join(list(requested_samples - set(sample_order))) # change ID column if o.rpkmf_getID: expr = dr_tools.loadexpr(o.rpkmf_getID) symbol_to_IDs = dict(zip(expr['symbols'],expr['IDs'])) #IDs = [symbol_to_IDs.get(sym, prevID) for prevID, sym in zip(IDs, symbols)] IDs = [symbol_to_IDs.get(sym, 'NA') for prevID, sym in zip(IDs, symbols)] if o.rpkmf_genes: symbols_set = dict((s,i) for i,s in enumerate(symbols)) new_sample_values = dict() for name in sample_order: new_sample_values[name] = [] for i, symbol in enumerate(expr['symbols']): if symbol in symbols_set: new_sample_values[name].append(sample_values[name][symbols_set[symbol]]) else: new_sample_values[name].append('0 0') sample_values = new_sample_values symbols = expr['symbols']
import argparse, dr_tools, os if '__main__' == __name__: parser = argparse.ArgumentParser() parser.add_argument('rpkmf_alleles', nargs='?') parser.add_argument('rpkmf_total') o = parser.parse_args() exprt = dr_tools.loadexpr(o.rpkmf_total, counts=False) counts = dr_tools.loadexpr(o.rpkmf_total, counts=True) if o.rpkmf_alleles: expra = dr_tools.loadexpr(o.rpkmf_alleles, counts=True) AiD = dict((ti, expra.ID_to_index[ID]) for ti, ID in enumerate(exprt['IDs']) if ID in expra.ID_to_index) for s in exprt.samples: if s+'_castonly' not in expra.samples: continue with open(s + '_expression.txt', 'w') as outfh: print >>outfh, dr_tools.join('#Gene_symbol', 'Refseq_IDs', 'RPKM', 'reads', 'CAST_hits', 'C57_hits') for ti in range(len(exprt['IDs'])): if ti in AiD: ai = AiD[ti] cast = int(expra[s+'_castonly'][ai]) c57 = int(expra[s+'_c57only'][ai]) else: cast = 0 c57 = 0 rpkm = exprt[s][ti] reads = int(round(counts[s][ti]))
# Script fragment (the enclosing scope starts above this chunk): build the
# per-population marker pattern from the parsed table, then collect the mean
# expression and row index of every gene that maps to a marker.
parser.add_argument('-o', '--sample_list_prefix')
o = parser.parse_args()

header, markers, marker_order = parse_table(o.tableS4)
gene_to_marker = dict(dr_tools.splitlines(o.to_cytof_markers))
# Keep only the markers that some gene maps to.
marker_order = [m for m in marker_order if m in gene_to_marker.values()]

# One pattern (list of marker values in marker_order) per population column.
# random.shuffle() shuffles in place and returns None, so the pattern list is
# built first and shuffled afterwards; the previous code stored the return
# value of shuffle() and therefore mapped every population to None.
# NOTE(review): the shuffle runs before random.seed(0) below, as in the
# original statement order, so shuffled patterns are not reproducible.
pop_cytof_pattern = dict()
for popi, pop in enumerate(header):
    pattern = [markers[m][popi] for m in marker_order]
    if o.shuffle_patterns:
        random.shuffle(pattern)
    pop_cytof_pattern[pop] = pattern

exprt = dr_tools.loadexpr(o.rpkmfile)
random.seed(0)

# marker -> (mean expression over all samples, gene row index)
midexpr_symi_all_D = dict()
for symi, sym in enumerate(exprt['symbols']):
    # Every gene in the expression file must map to a known marker.
    if sym not in gene_to_marker:
        raise Exception(dr_tools.join(sym, 'sym'))
    if gene_to_marker[sym] not in markers:
        raise Exception(dr_tools.join(gene_to_marker[sym], 'cytof'))
    midexpr_symi_all_D[gene_to_marker[sym]] = (numpy.mean([exprt[s][symi] for s in exprt.samples]), symi)

midexpr_symi_all = [midexpr_symi_all_D[m] for m in marker_order]
sym_order = [midexpr_symi_all_D[m][1] for m in marker_order]
pop_counts = dict((pop, 0) for pop in pop_cytof_pattern)
pop_samples = defaultdict(list)
import argparse, dr_tools, os if '__main__' == __name__: parser = argparse.ArgumentParser() parser.add_argument('rpkmf_alleles', nargs='?') parser.add_argument('rpkmf_total') o = parser.parse_args() exprt = dr_tools.loadexpr(o.rpkmf_total, counts=False) counts = dr_tools.loadexpr(o.rpkmf_total, counts=True) if o.rpkmf_alleles: expra = dr_tools.loadexpr(o.rpkmf_alleles, counts=True) AiD = dict((ti, expra.ID_to_index[ID]) for ti, ID in enumerate(exprt['IDs']) if ID in expra.ID_to_index) for s in exprt.samples: if s + '_castonly' not in expra.samples: continue with open(s + '_expression.txt', 'w') as outfh: print >> outfh, dr_tools.join('#Gene_symbol', 'Refseq_IDs', 'RPKM', 'reads', 'CAST_hits', 'C57_hits') for ti in range(len(exprt['IDs'])): if ti in AiD: ai = AiD[ti] cast = int(expra[s + '_castonly'][ai]) c57 = int(expra[s + '_c57only'][ai]) else: cast = 0
] # add in removal of e.g. midblast_2-19,midblast_2-20,midblast_2-22 if o.samplenames: with open(o.samplenames, 'r') as infh: requested_samples = set(line.split()[0] for line in infh) sample_order = [ name for name in sample_order if name in requested_samples ] if requested_samples - set(sample_order): print 'Missing:\n' + '\n'.join( list(requested_samples - set(sample_order))) # change ID column if o.rpkmf_getID: expr = dr_tools.loadexpr(o.rpkmf_getID) symbol_to_IDs = dict(zip(expr['symbols'], expr['IDs'])) #IDs = [symbol_to_IDs.get(sym, prevID) for prevID, sym in zip(IDs, symbols)] IDs = [ symbol_to_IDs.get(sym, 'NA') for prevID, sym in zip(IDs, symbols) ] if o.rpkmf_genes: symbols_set = dict((s, i) for i, s in enumerate(symbols)) new_sample_values = dict() for name in sample_order: new_sample_values[name] = [] for i, symbol in enumerate(expr['symbols']): if symbol in symbols_set: new_sample_values[name].append( sample_values[name][symbols_set[symbol]]) else:
else: conc_attomolul += float(p[Mix1_i]) return conc_attomolul * before_dilution_vol_ul * 602214.12927 if '__main__' == __name__: parser = argparse.ArgumentParser() parser.add_argument('rpkmfile') parser.add_argument('nondiatable') parser.add_argument('allelehits') o = parser.parse_args() ERCCvol_ul = 4e-7 ERCC_moleculenumber = calc_ERCC_moleculenumber('ERCC.txt', ERCCvol_ul) expr = dr_tools.loadexpr(o.rpkmfile, False) spikes_i = [i for i, ID in enumerate(expr['IDs']) if 'ERCC' in ID] genes_i = [i for i, ID in enumerate(expr['IDs']) if 'ERCC' not in ID] xarr = defaultdict(list) for p in dr_tools.splitlines(o.nondiatable): if p[0] == '#sample': index_cellsource = p.index('cell.type') else: sample = p[0] if sample == 'BQx46_indD_EmbryoMEF_BxC': continue # degraded sample try: ERCC_rpkmsum = sum(expr[sample][spike] for spike in spikes_i) if ERCC_rpkmsum < 100: continue
# Script fragment (the parser is built above this chunk): choose which result
# column to plot, load allele hits, optionally restrict to samples that also
# have RPKM data, and derive the list of pair labels from the sample names.
o = parser.parse_args()

# Position of the requested variable.
# NOTE(review): presumably an index into a per-pair result tuple built later - confirm.
plotted_variable_index = { 'mono%': 3, 'z': 1, 'num_genes': 0, 'error': 2, 'info_genes': 4 }[o.plotted_variable]

# suffixes of sample names in expression file
S2 = '_c57only'
S1 = '_castonly'

# load
expra = dr_tools.loadexpr(o.allelehits_file, True)
# Base sample names: every second allele column with the S2 suffix split off.
samples = [s.split(S2)[0] for s in expra.samples[::2]]
if o.minrpkm:
    # An RPKM cutoff needs the total-expression table; drop samples without one.
    exprt = dr_tools.loadexpr(o.rpkm_file, False)
    samples = [s for s in samples if s in exprt.samples]

# `global` at module level has no effect; done_c is a module-level dict.
# NOTE(review): presumably used as a cache by functions defined elsewhere - confirm.
global done_c
done_c = dict()

# pairs end in the same capital letter
# skip the _wronglane samples and the non-split cells
extract_short_name = extract_short_name2 if o.fibroblastnames else extract_short_name1
# Unique short names with digits removed; the empty string is discarded.
pair_letters = list(set(remove_digits(extract_short_name(name)) for name in samples) - set(['']))
from __future__ import division import argparse, dr_tools, numpy, pylab, random if '__main__' == __name__: parser = argparse.ArgumentParser() parser.add_argument('-r', '--rpkms', required=True) parser.add_argument('-m', '--minrpkm', default=20, type=float) parser.add_argument('-M', '--maxrpkm', type=float) parser.add_argument('-gi', '--allowedgenes') parser.add_argument('-ge', '--disallowedgenes', nargs='+') o = parser.parse_args() exprt = dr_tools.loadexpr(o.rpkms) allowedgenes = set(dr_tools.loadlist(o.allowedgenes)) if o.allowedgenes else None if o.disallowedgenes: disallowedgenes = set() for filename in o.disallowedgenes: disallowedgenes.update(set(dr_tools.loadlist(filename))) else: disallowedgenes = None samples = exprt.samples for ti, sym in enumerate(exprt['symbols']): meanexpr = numpy.mean([exprt[s][ti] for s in samples]) if meanexpr < o.minrpkm: continue if o.maxrpkm is not None and meanexpr >= o.maxrpkm: continue if disallowedgenes and sym in disallowedgenes: continue if allowedgenes and sym not in allowedgenes: continue print sym
if '__main__' == __name__: opts = argparse.ArgumentParser() opts.add_argument('rpkmf_alleles') opts.add_argument('--filter', nargs='+') opts.add_argument('-M', '--method', default='monoallelic', choices=['m', 'monoallelic', 'monoallelic_norm', 'monoallelic_norm2', 'c57overlap', 'castoverlap', 'c57overlap_assym', 'castoverlap_assym', 'numsamemono', 'numsameC57', 'numsameCAST', 'numsamemono_norm', 'numsameC57_norm', 'numsameCAST_norm', 'spearman', 'pearson', 'braycurtis', 'canberra', 'chebyshev', 'cityblock', 'correlation', 'cosine', 'dice', 'euclidean', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule', 'numsamemono100', 'numsamemono100_norm']) opts.add_argument('-L', '--linkage', default='complete', choices=['single', 'average', 'complete', 'linkage', 'weighted', 'centroid', 'median', 'ward']) opts.add_argument('-s', '--bootstrap', type=int) opts.add_argument('-S', '--states', default='3state', choices=['3state', 'fraction', 'diff', 'monoallelic']) opts.add_argument('--fig', default='which_allele_tree.pdf') opts.add_argument('-r', '--rpkmf_total') opts.add_argument('-t', '--threshold_rpkm', help='requires --rpkmf_total', type=float) opts.add_argument('-R', '--randomize', action='store_true') o = opts.parse_args() # load expression data expr_alleles = dr_tools.loadexpr([o.rpkmf_alleles], counts=True) samples_alleles = sorted([e for e in expr_alleles if e not in ('IDs', 'symbols')]) if o.rpkmf_total is not None: expr_total = dr_tools.loadexpr([o.rpkmf_total], counts=False) exprt_samples = set(expr_total.samples) character_matrix = [] # 2D, values from state() samplenames = [] for s1, s2 in zip(samples_alleles[::2], samples_alleles[1::2]): if o.filter is not None and not any(part in s1.rsplit('_',1)[0] for part in o.filter): continue samplename = s1.rsplit('_',1)[0] # check that sample labels are consistent if samplename != s2.rsplit('_',1)[0] 
and samplename in expr_total: continue
parser.add_argument('-o', '--figure', default='pair_overlap6.pdf') parser.add_argument('-v', '--plotted_variable', default='mono%', choices=['mono%', 'z', 'num_genes', 'error', 'info_genes']) parser.add_argument('--ylim', default=[0,1], type=float, nargs=2) parser.add_argument('-s', '--shiftpairs', type=int, nargs='?', const=1) parser.add_argument('-F', '--fibroblastnames', action='store_true') parser.add_argument('-S', '--separatelines', action='store_true') o = parser.parse_args() plotted_variable_index = {'mono%':3, 'z': 1, 'num_genes': 0, 'error': 2, 'info_genes':4}[o.plotted_variable] # suffixes of sample names in expression file S2 = '_c57only' S1 = '_castonly' # load expra = dr_tools.loadexpr(o.allelehits_file, True) samples = [s.split(S2)[0] for s in expra.samples[::2]] if o.minrpkm: exprt = dr_tools.loadexpr(o.rpkm_file, False) samples = [s for s in samples if s in exprt.samples] global done_c done_c = dict() # pairs end in the same capital letter # skip the _wronglane samples and the non-split cells extract_short_name = extract_short_name2 if o.fibroblastnames else extract_short_name1 pair_letters = list(set(remove_digits(extract_short_name(name)) for name in samples) - set([''])) if o.pairletters: pair_letters = [l for l in pair_letters if l.strip('-_') in o.pairletters] print pair_letters
def w(gi, Y_k, Y_r, N_k, N_r):
    """Return the weight term for gene ``gi`` between a sample and the reference.

    Computes ``(N_k - Y_k[gi])/N_k/Y_k[gi] + (N_r - Y_r[gi])/N_r/Y_r[gi]``.
    NOTE(review): presumably the variance weight of a TMM-style normalization - confirm.

    Args:
        gi: gene index into ``Y_k`` and ``Y_r``.
        Y_k: per-gene values of the sample.
        Y_r: per-gene values of the reference.
        N_k: total of the sample.
        N_r: total of the reference.

    Raises:
        ZeroDivisionError: if ``Y_k[gi]``, ``Y_r[gi]``, ``N_k`` or ``N_r`` is
            zero (plain Python numbers); callers must filter such genes first.
    """
    return (N_k - Y_k[gi])/N_k/Y_k[gi] + (N_r - Y_r[gi])/N_r/Y_r[gi]


def A(gi, Y_k, Y_r, N_k, N_r):
    """Return ``0.5*log2(Y_k[gi]/N_k * Y_r[gi]/N_r)`` for gene ``gi``.

    Returns the sentinel ``-10000`` when the value is undefined. The original
    guarded only ``Y_k[gi] > 0``, so a zero reference value still reached
    ``log2(0)``; both operands are now required to be positive.

    Args:
        gi: gene index into ``Y_k`` and ``Y_r``.
        Y_k: per-gene values of the sample.
        Y_r: per-gene values of the reference.
        N_k: total of the sample.
        N_r: total of the reference.
    """
    if Y_k[gi] > 0 and Y_r[gi] > 0:
        return 0.5*log2(Y_k[gi]/N_k * Y_r[gi]/N_r)
    return -10000


# Script fragment: load an expression table, build the reference profile and,
# per sample, the sorted A and M distributions over genes that are non-zero in
# both the sample and the reference. The script continues past this chunk.
if '__main__' == __name__:
    parser = argparse.ArgumentParser()
    parser.add_argument('infile')
    parser.add_argument('outfile')
    parser.add_argument('--ref_samples', nargs='+', metavar='samplename')
    parser.add_argument('--copy_counts', action='store_true', help='does not work with stdin as input')
    parser.add_argument('--run_on_counts', action='store_true')
    o = parser.parse_args()

    expr_in = dr_tools.loadexpr(o.infile, counts=o.run_on_counts)

    # Reference profile: per-gene mean over the reference samples
    # (all samples unless --ref_samples is given).
    ref_samples = expr_in.samples if o.ref_samples is None else o.ref_samples
    Y_r = [numpy.mean([expr_in[s][gi] for s in ref_samples]) for gi in range(len(expr_in['symbols']))]
    N_r = sum(Y_r)

    expr_out = dr_tools.Parsed_rpkms([], False)
    normalization_factors = []
    for s in expr_in.samples:
        Y_k = expr_in[s]
        N_k = sum(Y_k)
        # Only genes detected in both the sample and the reference are usable.
        nonzero = [gi for gi in range(len(expr_in['symbols'])) if Y_k[gi] > 0 and Y_r[gi] > 0]
        # M is defined outside this chunk.
        A_distr = sorted((A(gi, Y_k, Y_r, N_k, N_r), gi) for gi in nonzero)
        M_distr = sorted((M(gi, Y_k, Y_r, N_k, N_r), gi) for gi in nonzero)
from __future__ import division import argparse, dr_tools, numpy, pylab, random if '__main__' == __name__: parser = argparse.ArgumentParser() parser.add_argument('-r', '--rpkms', required=True) parser.add_argument('-m', '--minrpkm', default=20, type=float) parser.add_argument('-M', '--maxrpkm', type=float) parser.add_argument('-gi', '--allowedgenes') parser.add_argument('-ge', '--disallowedgenes', nargs='+') o = parser.parse_args() exprt = dr_tools.loadexpr(o.rpkms) allowedgenes = set(dr_tools.loadlist( o.allowedgenes)) if o.allowedgenes else None if o.disallowedgenes: disallowedgenes = set() for filename in o.disallowedgenes: disallowedgenes.update(set(dr_tools.loadlist(filename))) else: disallowedgenes = None samples = exprt.samples for ti, sym in enumerate(exprt['symbols']): meanexpr = numpy.mean([exprt[s][ti] for s in samples]) if meanexpr < o.minrpkm: continue if o.maxrpkm is not None and meanexpr >= o.maxrpkm: continue if disallowedgenes and sym in disallowedgenes: continue if allowedgenes and sym not in allowedgenes: continue print sym
choices=['max', 'mean', 'random'], default='max') parser.add_argument('-t', '--transform', choices=['none', 'log10+0.3'], default='none') parser.add_argument('-c', '--centering', choices=['none', 'mean'], default='none') parser.add_argument('-s', '--samplelist') parser.add_argument('-e', '--excludesample', nargs='+') o = parser.parse_args() # load input expr = dr_tools.loadexpr(o.infile) # select samples if o.samplelist is not None: samples = dr_tools.loadlist(o.samplelist) else: samples = expr.samples if o.excludesample: samples = [s for s in samples if s not in o.excludesample] # select genes genes_i = range(len(expr['symbols'])) if o.maxgenes is not None: select_fn = { 'max': max, 'mean': numpy.mean,
from __future__ import division
import argparse, pylab, dr_tools
from scipy import stats

# Script fragment: for each sample row of a measurement table, sum the ERCC
# spike-in values and collect (width*length)**(dim/2) as the x value. The
# script continues past the end of this chunk.
if '__main__' == __name__:
    parser = argparse.ArgumentParser()
    parser.add_argument('rpkmfile')
    parser.add_argument('diatable')
    parser.add_argument('--dim', type=float, default=3)
    o = parser.parse_args()

    expr = dr_tools.loadexpr(o.rpkmfile, True)
    # Rows whose ID contains 'ERCC' are treated as spike-ins.
    spikes_i = [i for i, ID in enumerate(expr['IDs']) if 'ERCC' in ID]

    xarr = []
    yarr = []
    for p in dr_tools.splitlines(o.diatable):
        if p[0] == '#sample':
            # Header row: remember the columns holding length and width.
            index_dia = [p.index('cytoplasm.length'), p.index('cytoplasm.width')]
        else:
            sample = p[0]
            ERCC_readsum = sum(expr[sample][spike] for spike in spikes_i)
            sample_i = expr.samples.index(sample)
            mRNA_readsum = expr.normalizationreads[sample_i]
            try:
                width = float(p[index_dia[1]])
                length = float(p[index_dia[0]])
            except ValueError:
                # Non-numeric measurement: skip this sample.
                continue
            # With the default dim=3 this is (width*length)**1.5.
            xarr.append((width*length)**(o.dim/2))
parser.add_argument('-o', '--figure', default='pool_n.pdf') parser.add_argument('--nonrandom_n1', action='store_true') o = parser.parse_args() if o.random_seed is not None: random.seed(o.random_seed) allowedgenes = set(dr_tools.loadlist( o.allowedgenes)) if o.allowedgenes else None if o.disallowedgenes: disallowedgenes = set() for filename in o.disallowedgenes: disallowedgenes.update(set(dr_tools.loadlist(filename))) else: disallowedgenes = None expra = dr_tools.loadexpr(o.allelehits, True) c57fraction = 0.5 for clonal_group in dr_tools.loadlist(o.clonal_groups): if not o.clonal_group in clonal_group: continue samples = [s.rsplit('_', 1)[0] for s in expra.samples[::2]] samples = [ s for s in samples if any( s.startswith(clonal_group_start) or s.startswith('pool.' + clonal_group_start) for clonal_group_start in clonal_group.split('\t')) ] xarr_n = [] yarr_mono = [] xarr_ctrl_n = []
# Script fragment: load allele counts, select the sample columns to use and
# start reading a genePred annotation to place genes on chromosomes. The
# script continues past the end of this chunk.
if '__main__' == __name__:
    opts = argparse.ArgumentParser()
    opts.add_argument('rpkmf_alleles')
    # NOTE(review): the default is an absolute path on one specific machine.
    opts.add_argument(
        '--genePred',
        default=
        '/mnt/crick/danielr/twocellstage/mouse/annotation/mm9_refGene_31Jul2011_norandom.txt'
    )
    opts.add_argument('--filter', nargs='+')
    opts.add_argument('-o', '--figurefile', default='monoallelic_by_chr.pdf')
    args = opts.parse_args()

    # load expression data
    expr_alleles = dr_tools.loadexpr([args.rpkmf_alleles], counts=True)
    # All sample columns (everything except the 'IDs' and 'symbols' entries),
    # optionally restricted to names containing any --filter substring.
    samples_alleles = sorted(
        e for e in expr_alleles
        if e not in ('IDs', 'symbols') and (args.filter is None or any(
            part in e for part in args.filter)))

    # sort the genes by position
    # only include transcripts which are the first ID in the entry of the rpkm file
    allowed_IDs = set(IDs.split('+')[0] for IDs in expr_alleles['IDs'])
    genes_per_chr = dict()
    ID_to_gene = dict()
    for p in dr_tools.splitlines(args.genePred):
        # Column 1 is read as the transcript ID and column 2 as the chromosome.
        ID = p[1]
        if ID in allowed_IDs:
            chromosome = p[2]
            # Skip entries whose chromosome name contains 'random'.
            if 'random' in chromosome:
                continue
danielr@rna ~/casthybrid/one_chr_reads $ python allele_independence.py -i ~/casthybrid/snp_positions/allelecounts_from_pileup/v17S15_genomic_refseq_autosomes.txt --stages blast to allele_independence_blastocyst.pdf ''' if '__main__' == __name__: opts = argparse.ArgumentParser() opts.add_argument('-i', '--inf', nargs='+', required=True) opts.add_argument('--stages', nargs='+', help='when there is not a genomewide maternal bias') opts.add_argument('--exclude', nargs='+', help='remove from stages', default=[]) opts.add_argument('--sim', action='store_true') opts.add_argument('-o', '--figure', default='allele_independence.pdf') opts.add_argument('--plotstyle', default=['mean_graph'], choices=['mean_graph', 'boxplot', 'mean_sem', 'violin', 'std', 'sayN', 'sayY'], nargs='+') opts.add_argument('--minN', default=0, type=int) o = opts.parse_args() expra = dr_tools.loadexpr(o.inf, counts=True) sample_pairs = pairs(expra, o.stages, o.exclude) bins = [Bin(num_cells, len(sample_pairs), o.sim) for num_cells in range(len(sample_pairs)+1)] for gene_i in range(len(expra['symbols'])): num_mono = sum((expra[s_pat][gene_i]>0)^(expra[s_mat][gene_i]>0) for s_pat,s_mat in sample_pairs) num_bi = sum((expra[s_pat][gene_i]>0)and(expra[s_mat][gene_i]>0) for s_pat,s_mat in sample_pairs) num_silent = sum((expra[s_pat][gene_i]==0)and(expra[s_mat][gene_i]==0) for s_pat,s_mat in sample_pairs) bins[num_silent].add(num_bi) if o.sim: while any(len(b.exp_frac_bi) < 10000 for b in bins): r = random.random()**2 sim_states = [(random.random() < r, random.random() < r) for p in sample_pairs]
allowedgenes = set() for genelistf in genelistf_arr: allowedgenes |= set(dr_tools.loadlist(genelistf)) return set(i for i,sym in enumerate(expr['symbols']) if sym in allowedgenes) if '__main__' == __name__: opts = argparse.ArgumentParser() opts.add_argument('inf') opts.add_argument('--filter', nargs='+') opts.add_argument('-f', '--figf', default='plot_monoallelic_by_cell.pdf') opts.add_argument('-gi', '--genelistf_include', nargs='+') opts.add_argument('-ge', '--genelistf_exclude', nargs='+') o = opts.parse_args() expr = dr_tools.loadexpr([o.inf], counts=True) #samples = sorted([e for e in expr if e not in ('IDs', 'symbols')]) allowed_gene_i = gene_i_by_listf(o.genelistf_include, expr) if o.genelistf_include else None excluded_gene_i = gene_i_by_listf(o.genelistf_exclude, expr) if o.genelistf_exclude else None for p in dr_tools.splitlines(o.inf): if p[0] == '#samples': samples = p[1:]; break fractions = [] # maternal only + paternal only mfractions = [] # maternal only fractions_all3 = [] # maternal+parternal+biallelic labels = [] for s1, s2 in zip(samples[::2], samples[1::2]):
#if not 'Mix 1' in p[Mix1_i]: raise Exception else: conc_attomolul += float(p[Mix1_i]) return conc_attomolul * before_dilution_vol_ul * 602214.12927 if '__main__' == __name__: parser = argparse.ArgumentParser() parser.add_argument('rpkmfile') parser.add_argument('nondiatable') parser.add_argument('allelehits') o = parser.parse_args() ERCCvol_ul = 4e-7 ERCC_moleculenumber = calc_ERCC_moleculenumber('ERCC.txt', ERCCvol_ul) expr = dr_tools.loadexpr(o.rpkmfile, False) spikes_i = [i for i, ID in enumerate(expr['IDs']) if 'ERCC' in ID] genes_i = [i for i, ID in enumerate(expr['IDs']) if 'ERCC' not in ID] xarr = defaultdict(list) for p in dr_tools.splitlines(o.nondiatable): if p[0] == '#sample': index_cellsource = p.index('cell.type') else: sample = p[0] if sample == 'BQx46_indD_EmbryoMEF_BxC': continue # degraded sample try: ERCC_rpkmsum = sum(expr[sample][spike] for spike in spikes_i) if ERCC_rpkmsum < 100: continue sample_i = expr.samples.index(sample)
pairs = list(itertools.combinations(samples, 2)) if len(pairs) > nmax: return random.sample(pairs, nmax) else: return pairs if '__main__' == __name__: parser = argparse.ArgumentParser() parser.add_argument('-r', '--rpkmfile', nargs='+', required=True) parser.add_argument('-s', '--samplelist', nargs='+', required=True, action='append') parser.add_argument('-o', '--figure', default='correlation.pdf') parser.add_argument('-n', '--names', action='append') parser.add_argument('-m', '--maxpergroup', type=int, default=300000000) o = parser.parse_args() expr = dr_tools.loadexpr(o.rpkmfile) boxplot_values = [] labels = [] for samplelistgroup, name in itertools.izip_longest(o.samplelist, o.names): if samplelistgroup is None: raise Exception if name is None: label = '' else: label = name + '\n' rho_values = [] samples_used = set() possible_pairs = 0 for samplelistfile in samplelistgroup: samples = set(dr_tools.loadlist(samplelistfile)) rho_values.extend([stats.spearmanr(expr[s1], expr[s2])[0] for s1, s2 in maxpairs(samples, o.maxpergroup)]) samples_used.update(samples) possible_pairs += len(samples) * (len(samples)-1) // 2 boxplot_values.append(rho_values)
from __future__ import division
import argparse, dr_tools, numpy
from collections import defaultdict

# Script fragment: index annotation IDs by chromosome, then walk a
# "sample chromosome" list and derive the two allele column names per sample.
# The script continues past the end of this chunk.
if '__main__' == __name__:
    parser = argparse.ArgumentParser()
    parser.add_argument('sample_and_chromosome_list')
    parser.add_argument('output_file')
    # NOTE(review): both options are optional on the command line but are used
    # unconditionally below, so omitting either passes None to dr_tools.
    parser.add_argument('-A', '--annotationfile')
    parser.add_argument('-a', '--allelehits')
    o = parser.parse_args()

    # The same file is loaded twice, with the second argument False and True.
    exprr = dr_tools.loadexpr(o.allelehits, False)
    expra = dr_tools.loadexpr(o.allelehits, True)

    # chromosome -> set of transcript IDs, read from columns 2 and 1 of the
    # annotation file (column 12 is read as the symbol).
    chrom_to_IDs = defaultdict(set)
    for p in dr_tools.splitlines(o.annotationfile):
        chrom = p[2]
        sym = p[12]
        ID = p[1]
        chrom_to_IDs[chrom].add(ID)

    samples_set = set(expra.samples)
    with open(o.sample_and_chromosome_list) as infh:
        for line in infh:
            # Each line: sample name, then chromosome (whitespace separated).
            p = line.split()
            chrom = p[1]
            s_c57 = p[0]+'_c57only'
            s_cast = p[0]+'_castonly'
            # NOTE(review): this tests the bare sample name, while the columns
            # looked up are the suffixed names above. If expra.samples holds
            # only suffixed names this skips every line - confirm.
            if p[0] not in samples_set:
                continue
def MAfraction(expra, sample):
    """Return the fraction of detected genes that are monoallelic in ``sample``.

    A gene counts as biallelic when both allele values are non-zero and as
    monoallelic when exactly one is; genes with neither are ignored.

    Args:
        expra: mapping with a ``'symbols'`` entry and per-allele value lists
            under ``sample + '_c57only'`` and ``sample + '_castonly'``.
        sample: base sample name (without the allele suffix).

    Returns:
        ``count_mono / (count_bi + count_mono)`` as a float.

    Raises:
        ZeroDivisionError: if no gene is detected on either allele.
    """
    count_bi, count_mono = 0, 0
    for sym, c57, cast in zip(expra['symbols'], expra[sample+'_c57only'], expra[sample+'_castonly']):
        if c57 and cast:
            count_bi += 1
        elif c57 or cast:
            count_mono += 1
    # Both counters are ints. Without `from __future__ import division` Python 2
    # truncates the ratio to 0 or 1, so force true division explicitly.
    return float(count_mono)/(count_bi + count_mono)


# Script fragment: read per-sample width and length from a measurement table.
# The script continues past the end of this chunk.
if '__main__' == __name__:
    parser = argparse.ArgumentParser()
    parser.add_argument('allelehits')
    parser.add_argument('diatable')
    parser.add_argument('--dim', type=float, default=3)
    o = parser.parse_args()

    expra = dr_tools.loadexpr(o.allelehits, True)
    xarr = []
    yarr = []
    for p in dr_tools.splitlines(o.diatable):
        if p[0] == '#sample':
            # Header row: remember the columns holding length and width.
            index_dia = [p.index('cytoplasm.length'), p.index('cytoplasm.width')]
        else:
            sample = p[0]
            try:
                width = float(p[index_dia[1]])
                length = float(p[index_dia[0]])
            except ValueError:
                # Non-numeric measurement: skip this sample.
                continue
# Script fragment (the parser and earlier options are defined above this
# chunk): finish option parsing, load the allele counts and optional total
# RPKMs, then start iterating over allele column pairs.
opts.add_argument('-s', '--bootstrap', type=int)
opts.add_argument('-S', '--states', default='3state', choices=['3state', 'fraction', 'diff', 'monoallelic'])
opts.add_argument('--fig', default='which_allele_tree.pdf')
opts.add_argument('-r', '--rpkmf_total')
opts.add_argument('-t', '--threshold_rpkm', help='requires --rpkmf_total', type=float)
opts.add_argument('-R', '--randomize', action='store_true')
o = opts.parse_args()

# load expression data
expr_alleles = dr_tools.loadexpr([o.rpkmf_alleles], counts=True)
# Sorted so the two allele columns of one sample end up next to each other,
# which the pairwise zip below relies on.
samples_alleles = sorted([e for e in expr_alleles if e not in ('IDs', 'symbols')])
if o.rpkmf_total is not None:
    expr_total = dr_tools.loadexpr([o.rpkmf_total], counts=False)
    exprt_samples = set(expr_total.samples)

character_matrix = [] # 2D, values from state()
samplenames = []
for s1, s2 in zip(samples_alleles[::2], samples_alleles[1::2]):
    # Keep only samples whose base name contains one of the --filter substrings.
    if o.filter is not None and not any(part in s1.rsplit('_', 1)[0] for part in o.filter):
        continue
    samplename = s1.rsplit('_', 1)[0]
    # check that sample labels are consistent
# Script fragment (the parser and earlier options are defined above this
# chunk): finish option parsing, load allele counts, fix the sample column
# order and load the optional gene allow/deny lists.
opts.add_argument('-f', '--figurefile', default='monoallelic_at_chr.png')
opts.add_argument('-w', '--maxwhite', type=int)
opts.add_argument('--allowallwhite', action='store_true')
opts.add_argument('--allowedgenes')
opts.add_argument('--disallowedgenes')
opts.add_argument('--verticalborder', action='store_true')
opts.add_argument('--stageline', action='store_true')
opts.add_argument('--embryoline', action='store_true')
opts.add_argument('--embryonotch', action='store_true')
opts.add_argument('--mincoord', type=int)
opts.add_argument('--maxcoord', type=int)
opts.add_argument('--saygenes', action='store_true')
args = opts.parse_args()

# load expression data
expr_alleles = dr_tools.loadexpr([args.rpkmf_alleles], counts=True)
# NOTE: this sorted selection is overwritten a few lines below by the
# file-order selection, so only the second assignment takes effect.
samples_alleles = sorted(e for e in expr_alleles if e not in ('IDs', 'symbols') and (args.filter is None or any(part in e for part in args.filter)))
# Read the sample names from the first '#samples' header row to keep the
# column order as written in the file.
for p in dr_tools.splitlines(args.rpkmf_alleles):
    if p[0] == '#samples':
        samples = p[1:]; break
samples_alleles = [e for e in samples if (args.filter is None or any(part in e for part in args.filter))]

# sort the genes by position
# only include transcripts which are the first ID in the entry of the rpkm file
if args.allowedgenes is None and args.disallowedgenes is None:
    allowed_IDs = set(IDs.split('+')[0] for IDs in expr_alleles['IDs'])
else:
    # With gene lists given, allowed_IDs is not set in this visible part; only
    # the symbol sets named on the command line are loaded here.
    if args.allowedgenes:
        allowed_set = set(dr_tools.loadlist(args.allowedgenes))
    if args.disallowedgenes:
        disallowed_set = set(dr_tools.loadlist(args.disallowedgenes))
# Script fragment: load one or two allele-count tables and start walking the
# paired allele columns gene by gene. The script continues past the end of
# this chunk.
if '__main__' == __name__:
    opts = argparse.ArgumentParser()
    opts.add_argument('-i1', '--inf1', required=True) # e.g. ooref15...
    opts.add_argument('-F1', default=0.02, type=float)
    opts.add_argument('-i2', '--inf2') # e.g. ooref13...
    opts.add_argument('-F2', type=float, default=0)
    opts.add_argument('-o', '--outf', default='/dev/stdout')
    opts.add_argument('--addminreads', type=int, default=0)
    opts.add_argument('--round', choices=['0.5up', 'ceil', 'floor'], default='ceil')
    opts.add_argument('--minreadsboth', type=int, default=0)
    args = opts.parse_args()

    expr1 = dr_tools.loadexpr([args.inf1], counts=True)
    if args.inf2 is not None:
        expr2 = dr_tools.loadexpr([args.inf2], counts=True)

    # Only the first row of the file is read: its fields after the first are
    # taken as the sample column names, in file order.
    for i, p in enumerate(dr_tools.splitlines(args.inf1)):
        samples = p[1:]
        break

    gene_counts_out = defaultdict(list)
    # Columns are expected in adjacent pairs sharing the same base name
    # (everything before the last '_').
    for s1, s2 in zip(samples[::2], samples[1::2]):
        if s1.rsplit('_', 1)[0] != s2.rsplit('_', 1)[0]:
            raise Exception
        for gene_i, symbol in enumerate(expr1['symbols']):
            # remove a fraction F of the paternal chromosome's expression from the maternal chromosome's, and vice versa
            # Genes with fewer than --minreadsboth reads over both alleles are zeroed.
            if expr1[s1][gene_i] + expr1[s2][gene_i] < args.minreadsboth:
                s1e = 0
# NOTE(review): the next two statements are the tail of a function whose `def`
# lies above this chunk; they are kept verbatim at function-body indentation.
# They pick the rounding function by name and return expr_s1 minus the rounded
# value of F*expr_s2+addminreads, floored at zero.
    roundfunc = {'ceil':math.ceil, '0.5up':round, 'floor':math.floor}[rounding]
    return max(0, expr_s1 - roundfunc(F*expr_s2+addminreads))

# Script fragment: load one or two allele-count tables and start walking the
# paired allele columns gene by gene. The script continues past the end of
# this chunk.
if '__main__' == __name__:
    opts = argparse.ArgumentParser()
    opts.add_argument('-i1', '--inf1', required=True) # e.g. ooref15...
    opts.add_argument('-F1', default=0.02, type=float)
    opts.add_argument('-i2', '--inf2') # e.g. ooref13...
    opts.add_argument('-F2', type=float, default=0)
    opts.add_argument('-o', '--outf', default='/dev/stdout')
    opts.add_argument('--addminreads', type=int, default=0)
    opts.add_argument('--round', choices=['0.5up', 'ceil', 'floor'], default='ceil')
    opts.add_argument('--minreadsboth', type=int, default=0)
    args = opts.parse_args()

    expr1 = dr_tools.loadexpr([args.inf1], counts=True)
    if args.inf2 is not None:
        expr2 = dr_tools.loadexpr([args.inf2], counts=True)

    # Only the first row of the file is read: its fields after the first are
    # taken as the sample column names, in file order.
    for i, p in enumerate(dr_tools.splitlines(args.inf1)):
        samples = p[1:]
        break

    gene_counts_out = defaultdict(list)
    # Columns are expected in adjacent pairs sharing the same base name
    # (everything before the last '_').
    for s1, s2 in zip(samples[::2], samples[1::2]):
        if s1.rsplit('_',1)[0] != s2.rsplit('_',1)[0]:
            raise Exception
        for gene_i, symbol in enumerate(expr1['symbols']):
            # remove a fraction F of the paternal chromosome's expression from the maternal chromosome's, and vice versa
            # Genes with fewer than --minreadsboth reads over both alleles are zeroed.
            if expr1[s1][gene_i] + expr1[s2][gene_i] < args.minreadsboth:
                s1e = 0
parser.add_argument('-n2', '--end_n', default=15, type=int) parser.add_argument('-o', '--figure', default='pool_n.pdf') parser.add_argument('--nonrandom_n1', action='store_true') o = parser.parse_args() if o.random_seed is not None: random.seed(o.random_seed) allowedgenes = set(dr_tools.loadlist(o.allowedgenes)) if o.allowedgenes else None if o.disallowedgenes: disallowedgenes = set() for filename in o.disallowedgenes: disallowedgenes.update(set(dr_tools.loadlist(filename))) else: disallowedgenes = None expra = dr_tools.loadexpr(o.allelehits, True) c57fraction = 0.5 for clonal_group in dr_tools.loadlist(o.clonal_groups): if not o.clonal_group in clonal_group: continue samples = [s.rsplit('_',1)[0] for s in expra.samples[::2]] samples = [s for s in samples if any(s.startswith(clonal_group_start) or s.startswith('pool.'+clonal_group_start) for clonal_group_start in clonal_group.split('\t'))] xarr_n = [] yarr_mono = [] xarr_ctrl_n = [] yarr_ctrl = [] xarr_n_line = [] yarr_mono_line = [] yarr_ctrl_line = [] for n in range(o.start_n, o.end_n+1):