parser = argparse.ArgumentParser()
	parser.add_argument('rpkmfile')
	parser.add_argument('nondiatable')
	parser.add_argument('allelehits')
	parser.add_argument('genelist')
	parser.add_argument('-o', '--pdfout', default='ERCCsum_vs_biallelic_nonDia_variableminrpkm_v11.pdf')
	parser.add_argument('--RNAamountfactor', default=1.0, type=float)
	parser.add_argument('--lineparams', nargs=2, type=float)
	parser.add_argument('--xfor20rpkm', default=21, type=float)
	o = parser.parse_args()
	
	ERCCvol_ul = 0.1/40000
	ERCC_moleculenumber = calc_ERCC_moleculenumber('ERCC.txt', ERCCvol_ul) * o.RNAamountfactor
	
	genelist_first = set(dr_tools.loadlist(o.genelist))
	expra = dr_tools.loadexpr(o.allelehits, True)
	expr = dr_tools.loadexpr(o.rpkmfile, False)
	spikes_i = [i for i, ID in enumerate(expr['IDs']) if 'ERCC' in ID]
	genes_i = [i for i, ID in enumerate(expr['IDs']) if 'ERCC' not in ID]
	
	# pass 1: get cells per source
	cells_per_source = defaultdict(list)
	for p, sample, sample_i, cellsource in table_loader():
		cells_per_source[cellsource].append(sample)
		
	# middle step: get gene lists per cell source at RPKM cutoff
	genelist_sources = defaultdict(dict)
	for source, samples_source in cells_per_source.items():
		#samples = set.union(*map(set, cells_per_source.values())) #new in v10 from v9
		samples = samples_source
		for ti, sym in enumerate(expr['symbols']):
from itertools import chain

if '__main__' == __name__:
	# Script entry point: read an expression table, pick the samples and
	# (optionally) a limited number of genes to work on.
	parser = argparse.ArgumentParser()
	parser.add_argument('infile')
	parser.add_argument('outfile')
	parser.add_argument('-m', '--maxgenes', type=int)
	parser.add_argument('-S', '--maxgeneselection', choices=['max', 'mean', 'random'], default='max')
	parser.add_argument('-t', '--transform', choices=['none', 'log10+0.3'], default='none')
	parser.add_argument('-c', '--centering', choices=['none', 'mean'], default='none')
	parser.add_argument('-s', '--samplelist')
	parser.add_argument('-e', '--excludesample', nargs='+')
	o = parser.parse_args()

	# load input
	expr = dr_tools.loadexpr(o.infile)
	
	# select samples: an explicit list file wins, otherwise every sample of
	# the table; names given with -e are then removed
	if o.samplelist is not None:
		samples = dr_tools.loadlist(o.samplelist)
	else:
		samples = expr.samples
	if o.excludesample:
		samples = [s for s in samples if s not in o.excludesample]

	# select genes
	genes_i = range(len(expr['symbols']))
	if o.maxgenes is not None:
		# score every gene across the selected samples (largest value, mean,
		# or a random number) and sort the (score, gene index) pairs so the
		# highest scores come first
		select_fn = {'max':max, 'mean':numpy.mean, 'random': (lambda v: random.random())}[o.maxgeneselection]
		sort_list = [(select_fn([expr[s][i] for s in samples]), i) for i in genes_i]
		sort_list.sort(reverse=True)
Ejemplo n.º 3
0
import argparse, dr_tools

if '__main__' == __name__:
	# Script entry point: load a total-expression table and an allele-hit
	# table and determine the samples present in both.
	parser = argparse.ArgumentParser()
	# Positional arguments are always mandatory; argparse raises
	# "TypeError: 'required' is an invalid argument for positionals" if
	# required= is passed for them, so it must not be given here.
	parser.add_argument('rpkms')
	parser.add_argument('allelehits')
	parser.add_argument('--minrpkm', type=float, default=20)
	o = parser.parse_args()
	
	exprt = dr_tools.loadexpr(o.rpkms, False)
	expra = dr_tools.loadexpr(o.allelehits, True)
	
	# every second column name of the allele table, with its last
	# '_'-separated part removed, intersected with the total-table samples
	samples = set(exprt.samples) & set(s.rsplit('_',1)[0] for s in expra.samples[::2])
	
	count_per_gene = dict()
	
	# the code below indexes both tables with the same gene index
	assert expra['symbols'] == exprt['symbols']
	
	for ti, sym in 
	# not done
	return num_c57only/num_both

if '__main__' == __name__:
	# Script entry point: draw random groups of samples from each sample list
	# and compute a per-gene ratio over all samples of the allele-hit table.
	parser = argparse.ArgumentParser()
	parser.add_argument('-a', '--allelehits', required=True)
	parser.add_argument('-gi', '--allowedgenes')
	parser.add_argument('-ge', '--disallowedgenes', nargs='+', default=[])
	parser.add_argument('-R', '--random_dots', type=int, default=1)
	parser.add_argument('-s', '--samplelist', required=True, nargs='+')
	parser.add_argument('-n', default=4, type=int)
	parser.add_argument('-o', '--figure', default='poolN.pdf')
	parser.add_argument('-S', '--subtract_allelerand', action='store_true')
	parser.add_argument('-r', '--allelerand_skew', action='store_true')
	o = parser.parse_args()
	
	expra = dr_tools.loadexpr(o.allelehits, True)
	
	# optional whitelist (one file) and blacklist (union of several files)
	if o.allowedgenes:
		allowed_genes = set(dr_tools.loadlist(o.allowedgenes))
	else:
		allowed_genes = None
	disallowed_genes = set()
	for filename in o.disallowedgenes:
		disallowed_genes.update(set(dr_tools.loadlist(filename)))
	
	# fixed seed so the random draws below are the same on every run
	random.seed(0)
	
	# for each sample-list file: o.random_dots independent draws of o.n samples
	samples_n = dict((samplelist, [random.sample(dr_tools.loadlist(samplelist, ignore='#'), o.n) for di in range(o.random_dots)]) for samplelist in o.samplelist)
	
	# sample names: every second column name, cut at '_c57only'
	samples_all = [sa.split('_c57only')[0] for sa in expra.samples[::2]]
	# gene index -> ratio() over all samples (ratio is defined elsewhere in the file)
	allelerand_skew =  dict((gi, ratio(expra, gi, samples_all)) for gi in range(len(expra['symbols'])))
Ejemplo n.º 5
0
    opts = argparse.ArgumentParser()
    opts.add_argument('inf')
    opts.add_argument('rpkmf_total')
    opts.add_argument('min_rpkm', type=float)
    opts.add_argument('--filter', nargs='+')
    opts.add_argument('-f',
                      '--figf',
                      default='plot_monoallelic_by_cell_minrpkm.pdf')
    opts.add_argument('-gi', '--genelistf_include', nargs='+')
    opts.add_argument('-ge', '--genelistf_exclude', nargs='+')
    opts.add_argument('--castfather', action='store_true')
    opts.add_argument('--infercross', action='store_true')
    opts.add_argument('--alg2', action='store_true')
    o = opts.parse_args()

    expr = dr_tools.loadexpr([o.inf], counts=True)
    exprt = dr_tools.loadexpr([o.rpkmf_total], counts=False)

    allowed_gene_i = gene_i_by_listf(o.genelistf_include,
                                     expr) if o.genelistf_include else None
    excluded_gene_i = gene_i_by_listf(o.genelistf_exclude,
                                      expr) if o.genelistf_exclude else None

    def rpkm(Ai, sample):
        Ti = exprt.ID_to_index[expr['IDs'][Ai]]
        return exprt[sample][Ti]

    for p in dr_tools.splitlines(o.inf):
        if p[0] == '#samples':
            samples = p[1:]
            break
						raise
	
	# sort the columns
	sample_order = [name for num_out,name in sorted((num(name), name) for name in sample_values)]
	
	# add in removal of e.g. midblast_2-19,midblast_2-20,midblast_2-22
	if o.samplenames:
		with open(o.samplenames, 'r') as infh:
			requested_samples = set(line.split()[0] for line in infh)
		sample_order = [name for name in sample_order if name in requested_samples]
		if requested_samples - set(sample_order):
			print 'Missing:\n' + '\n'.join(list(requested_samples - set(sample_order)))
	
	# change ID column
	if o.rpkmf_getID:
		expr = dr_tools.loadexpr(o.rpkmf_getID)
		symbol_to_IDs = dict(zip(expr['symbols'],expr['IDs']))
		#IDs = [symbol_to_IDs.get(sym, prevID) for prevID, sym in zip(IDs, symbols)]
		IDs = [symbol_to_IDs.get(sym, 'NA') for prevID, sym in zip(IDs, symbols)]
		if o.rpkmf_genes:
			symbols_set = dict((s,i) for i,s in enumerate(symbols))
			new_sample_values = dict()
			for name in sample_order:
				new_sample_values[name] = []
				for i, symbol in enumerate(expr['symbols']):
					if symbol in symbols_set:
						new_sample_values[name].append(sample_values[name][symbols_set[symbol]])
					else:
						new_sample_values[name].append('0 0')
			sample_values = new_sample_values
			symbols = expr['symbols']
import argparse, dr_tools, os

if '__main__' == __name__:
	parser = argparse.ArgumentParser()
	parser.add_argument('rpkmf_alleles', nargs='?')
	parser.add_argument('rpkmf_total')
	o = parser.parse_args()
	
	exprt = dr_tools.loadexpr(o.rpkmf_total, counts=False)
	counts = dr_tools.loadexpr(o.rpkmf_total, counts=True)
	
	if o.rpkmf_alleles:
		expra = dr_tools.loadexpr(o.rpkmf_alleles, counts=True)
	
	
		AiD = dict((ti, expra.ID_to_index[ID]) for ti, ID in enumerate(exprt['IDs']) if ID in expra.ID_to_index)
	
		for s in exprt.samples:
			if s+'_castonly' not in expra.samples: continue
			with open(s + '_expression.txt', 'w') as outfh:
				print >>outfh, dr_tools.join('#Gene_symbol', 'Refseq_IDs', 'RPKM', 'reads', 'CAST_hits', 'C57_hits')
				for ti in range(len(exprt['IDs'])):
					if ti in AiD:
						ai = AiD[ti]
						cast = int(expra[s+'_castonly'][ai])
						c57 = int(expra[s+'_c57only'][ai])
					else:
						cast = 0
						c57 = 0
					rpkm = exprt[s][ti]
					reads = int(round(counts[s][ti]))
    parser.add_argument('-o', '--sample_list_prefix')
    o = parser.parse_args()

    header, markers, marker_order = parse_table(o.tableS4)

    gene_to_marker = dict(dr_tools.splitlines(o.to_cytof_markers))
    marker_order = [m for m in marker_order if m in gene_to_marker.values()]
    # Map every population (a column of the table header) to its list of
    # marker values, in marker_order. With --shuffle_patterns the values of
    # each population are randomly permuted.
    pop_cytof_pattern = dict()
    for popi, pop in enumerate(header):
        pattern = [markers[m][popi] for m in marker_order]
        if o.shuffle_patterns:
            # random.shuffle() permutes in place and returns None, so the
            # list itself must be stored, not the return value of the call.
            # NOTE(review): this runs before the random.seed(0) call below,
            # so the permutation differs between runs - confirm intent.
            random.shuffle(pattern)
        pop_cytof_pattern[pop] = pattern
    exprt = dr_tools.loadexpr(o.rpkmfile)
    random.seed(0)

    midexpr_symi_all_D = dict()
    for symi, sym in enumerate(exprt['symbols']):
        if sym not in gene_to_marker:
            raise Exception(dr_tools.join(sym, 'sym'))
        if gene_to_marker[sym] not in markers:
            raise Exception(dr_tools.join(gene_to_marker[sym], 'cytof'))
        midexpr_symi_all_D[gene_to_marker[sym]] = (numpy.mean(
            [exprt[s][symi] for s in exprt.samples]), symi)
    midexpr_symi_all = [midexpr_symi_all_D[m] for m in marker_order]
    sym_order = [midexpr_symi_all_D[m][1] for m in marker_order]

    pop_counts = dict((pop, 0) for pop in pop_cytof_pattern)
    pop_samples = defaultdict(list)
import argparse, dr_tools, os

if '__main__' == __name__:
    parser = argparse.ArgumentParser()
    parser.add_argument('rpkmf_alleles', nargs='?')
    parser.add_argument('rpkmf_total')
    o = parser.parse_args()

    exprt = dr_tools.loadexpr(o.rpkmf_total, counts=False)
    counts = dr_tools.loadexpr(o.rpkmf_total, counts=True)

    if o.rpkmf_alleles:
        expra = dr_tools.loadexpr(o.rpkmf_alleles, counts=True)

        AiD = dict((ti, expra.ID_to_index[ID])
                   for ti, ID in enumerate(exprt['IDs'])
                   if ID in expra.ID_to_index)

        for s in exprt.samples:
            if s + '_castonly' not in expra.samples: continue
            with open(s + '_expression.txt', 'w') as outfh:
                print >> outfh, dr_tools.join('#Gene_symbol', 'Refseq_IDs',
                                              'RPKM', 'reads', 'CAST_hits',
                                              'C57_hits')
                for ti in range(len(exprt['IDs'])):
                    if ti in AiD:
                        ai = AiD[ti]
                        cast = int(expra[s + '_castonly'][ai])
                        c57 = int(expra[s + '_c57only'][ai])
                    else:
                        cast = 0
Ejemplo n.º 10
0
    ]

    # add in removal of e.g. midblast_2-19,midblast_2-20,midblast_2-22
    if o.samplenames:
        with open(o.samplenames, 'r') as infh:
            requested_samples = set(line.split()[0] for line in infh)
        sample_order = [
            name for name in sample_order if name in requested_samples
        ]
        if requested_samples - set(sample_order):
            print 'Missing:\n' + '\n'.join(
                list(requested_samples - set(sample_order)))

    # change ID column
    if o.rpkmf_getID:
        expr = dr_tools.loadexpr(o.rpkmf_getID)
        symbol_to_IDs = dict(zip(expr['symbols'], expr['IDs']))
        #IDs = [symbol_to_IDs.get(sym, prevID) for prevID, sym in zip(IDs, symbols)]
        IDs = [
            symbol_to_IDs.get(sym, 'NA') for prevID, sym in zip(IDs, symbols)
        ]
        if o.rpkmf_genes:
            symbols_set = dict((s, i) for i, s in enumerate(symbols))
            new_sample_values = dict()
            for name in sample_order:
                new_sample_values[name] = []
                for i, symbol in enumerate(expr['symbols']):
                    if symbol in symbols_set:
                        new_sample_values[name].append(
                            sample_values[name][symbols_set[symbol]])
                    else:
Ejemplo n.º 11
0
        else:
            conc_attomolul += float(p[Mix1_i])
    return conc_attomolul * before_dilution_vol_ul * 602214.12927


if '__main__' == __name__:
    parser = argparse.ArgumentParser()
    parser.add_argument('rpkmfile')
    parser.add_argument('nondiatable')
    parser.add_argument('allelehits')
    o = parser.parse_args()

    ERCCvol_ul = 4e-7
    ERCC_moleculenumber = calc_ERCC_moleculenumber('ERCC.txt', ERCCvol_ul)

    expr = dr_tools.loadexpr(o.rpkmfile, False)
    spikes_i = [i for i, ID in enumerate(expr['IDs']) if 'ERCC' in ID]
    genes_i = [i for i, ID in enumerate(expr['IDs']) if 'ERCC' not in ID]

    xarr = defaultdict(list)

    for p in dr_tools.splitlines(o.nondiatable):
        if p[0] == '#sample':
            index_cellsource = p.index('cell.type')
        else:
            sample = p[0]
            if sample == 'BQx46_indD_EmbryoMEF_BxC':
                continue  # degraded sample
            try:
                ERCC_rpkmsum = sum(expr[sample][spike] for spike in spikes_i)
                if ERCC_rpkmsum < 100: continue
    o = parser.parse_args()

    plotted_variable_index = {
        'mono%': 3,
        'z': 1,
        'num_genes': 0,
        'error': 2,
        'info_genes': 4
    }[o.plotted_variable]

    # suffixes of sample names in expression file
    S2 = '_c57only'
    S1 = '_castonly'

    # load
    expra = dr_tools.loadexpr(o.allelehits_file, True)
    samples = [s.split(S2)[0] for s in expra.samples[::2]]

    if o.minrpkm:
        exprt = dr_tools.loadexpr(o.rpkm_file, False)
        samples = [s for s in samples if s in exprt.samples]

    global done_c
    done_c = dict()

    # pairs end in the same capital letter
    # skip the _wronglane samples and the non-split cells
    extract_short_name = extract_short_name2 if o.fibroblastnames else extract_short_name1
    pair_letters = list(
        set(remove_digits(extract_short_name(name))
            for name in samples) - set(['']))
from __future__ import division
import argparse, dr_tools, numpy, pylab, random

if '__main__' == __name__:
	parser = argparse.ArgumentParser()
	parser.add_argument('-r', '--rpkms', required=True)
	parser.add_argument('-m', '--minrpkm', default=20, type=float)
	parser.add_argument('-M', '--maxrpkm', type=float)
	parser.add_argument('-gi', '--allowedgenes')
	parser.add_argument('-ge', '--disallowedgenes', nargs='+')
	o = parser.parse_args()
	
	exprt = dr_tools.loadexpr(o.rpkms)
	allowedgenes = set(dr_tools.loadlist(o.allowedgenes)) if o.allowedgenes else None
	if o.disallowedgenes:
		disallowedgenes = set()
		for filename in o.disallowedgenes:
			disallowedgenes.update(set(dr_tools.loadlist(filename)))
	else:
		disallowedgenes = None
	
	samples = exprt.samples
	
	for ti, sym in enumerate(exprt['symbols']):
		meanexpr = numpy.mean([exprt[s][ti] for s in samples])
		if meanexpr < o.minrpkm: continue
		if o.maxrpkm is not None and meanexpr >= o.maxrpkm: continue
		if disallowedgenes and sym in disallowedgenes: continue
		if allowedgenes and sym not in allowedgenes: continue
		print sym
if '__main__' == __name__:
	opts = argparse.ArgumentParser()
	opts.add_argument('rpkmf_alleles')
	opts.add_argument('--filter', nargs='+')
	opts.add_argument('-M', '--method', default='monoallelic', choices=['m', 'monoallelic', 'monoallelic_norm', 'monoallelic_norm2', 'c57overlap', 'castoverlap', 'c57overlap_assym', 'castoverlap_assym', 'numsamemono', 'numsameC57', 'numsameCAST', 'numsamemono_norm', 'numsameC57_norm', 'numsameCAST_norm', 'spearman', 'pearson', 'braycurtis', 'canberra', 'chebyshev', 'cityblock', 'correlation', 'cosine', 'dice', 'euclidean', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule', 'numsamemono100', 'numsamemono100_norm'])
	opts.add_argument('-L', '--linkage', default='complete', choices=['single', 'average', 'complete', 'linkage', 'weighted', 'centroid', 'median', 'ward'])
	opts.add_argument('-s', '--bootstrap', type=int)
	opts.add_argument('-S', '--states', default='3state', choices=['3state', 'fraction', 'diff', 'monoallelic'])
	opts.add_argument('--fig', default='which_allele_tree.pdf')
	opts.add_argument('-r', '--rpkmf_total')
	opts.add_argument('-t', '--threshold_rpkm', help='requires --rpkmf_total', type=float)
	opts.add_argument('-R', '--randomize', action='store_true')
	o = opts.parse_args()
	
	# load expression data
	expr_alleles = dr_tools.loadexpr([o.rpkmf_alleles], counts=True)
	samples_alleles = sorted([e for e in expr_alleles if e not in ('IDs', 'symbols')])
	if o.rpkmf_total is not None:
		expr_total = dr_tools.loadexpr([o.rpkmf_total], counts=False)
		exprt_samples = set(expr_total.samples)
	
	character_matrix = [] # 2D, values from state()
	samplenames = []
	
	for s1, s2 in zip(samples_alleles[::2], samples_alleles[1::2]):
		if o.filter is not None and not any(part in s1.rsplit('_',1)[0] for part in o.filter): continue
		samplename = s1.rsplit('_',1)[0]
		# check that sample labels are consistent
		if samplename != s2.rsplit('_',1)[0] and samplename in expr_total:
			continue
		
	parser.add_argument('-o', '--figure', default='pair_overlap6.pdf')
	parser.add_argument('-v', '--plotted_variable', default='mono%', choices=['mono%', 'z', 'num_genes', 'error', 'info_genes'])
	parser.add_argument('--ylim', default=[0,1], type=float, nargs=2)
	parser.add_argument('-s', '--shiftpairs', type=int, nargs='?', const=1)
	parser.add_argument('-F', '--fibroblastnames', action='store_true')
	parser.add_argument('-S', '--separatelines', action='store_true')
	o = parser.parse_args()
	
	plotted_variable_index = {'mono%':3, 'z': 1, 'num_genes': 0, 'error': 2, 'info_genes':4}[o.plotted_variable]
	
	# suffixes of sample names in expression file
	S2 = '_c57only'
	S1 = '_castonly'
	
	# load
	expra = dr_tools.loadexpr(o.allelehits_file, True)
	samples = [s.split(S2)[0] for s in expra.samples[::2]]
	
	if o.minrpkm:
		exprt = dr_tools.loadexpr(o.rpkm_file, False)
		samples = [s for s in samples if s in exprt.samples]
	
	global done_c
	done_c = dict()
	
	# pairs end in the same capital letter
	# skip the _wronglane samples and the non-split cells
	extract_short_name = extract_short_name2 if o.fibroblastnames else extract_short_name1
	pair_letters = list(set(remove_digits(extract_short_name(name)) for name in samples) - set(['']))
	if o.pairletters: pair_letters = [l for l in pair_letters if l.strip('-_') in o.pairletters]
	print pair_letters
def w(gi, Y_k, Y_r, N_k, N_r):
	"""Return (N - Y)/N/Y for gene gi, summed over sample k and reference r.

	Y_k and Y_r are indexed by gi; N_k and N_r are the matching totals.
	Both Y_k[gi] and Y_r[gi] must be non-zero, otherwise the division fails.
	NOTE(review): presumably the variance term used to weight log-ratios in
	a TMM-style normalisation - confirm.
	"""
	term_k = (N_k - Y_k[gi])/N_k/Y_k[gi]
	term_r = (N_r - Y_r[gi])/N_r/Y_r[gi]
	return term_k + term_r

def A(gi, Y_k, Y_r, N_k, N_r):
	"""Return 0.5*log2((Y_k[gi]/N_k) * (Y_r[gi]/N_r)) for gene gi.

	This is the mean of the log2 fractions of gene gi in the sample (k) and
	in the reference (r). The sentinel -10000 is returned when the gene has
	no signal in the sample or in the reference; the reference is checked as
	well so that a zero reference value never reaches log2(0).
	"""
	if Y_k[gi] > 0 and Y_r[gi] > 0:
		return 0.5*log2(Y_k[gi]/N_k * Y_r[gi]/N_r)
	return -10000

if '__main__' == __name__:
	# Script entry point: compare every sample of an expression table with a
	# reference profile (the per-gene mean over the reference samples).
	parser = argparse.ArgumentParser()
	parser.add_argument('infile')
	parser.add_argument('outfile')
	parser.add_argument('--ref_samples', nargs='+', metavar='samplename')
	parser.add_argument('--copy_counts', action='store_true', help='does not work with stdin as input')
	parser.add_argument('--run_on_counts', action='store_true')
	o = parser.parse_args()
	
	expr_in = dr_tools.loadexpr(o.infile, counts=o.run_on_counts)
	
	# reference profile: all samples unless --ref_samples names a subset
	ref_samples = expr_in.samples if o.ref_samples is None else o.ref_samples
	Y_r = [numpy.mean([expr_in[s][gi] for s in ref_samples]) for gi in range(len(expr_in['symbols']))]
	N_r = sum(Y_r)
	
	expr_out = dr_tools.Parsed_rpkms([], False)
	normalization_factors = []
	
	for s in expr_in.samples:
		Y_k = expr_in[s]
		N_k = sum(Y_k)
		# only genes with signal in both the sample and the reference are used
		nonzero = [gi for gi in range(len(expr_in['symbols'])) if Y_k[gi] > 0 and Y_r[gi] > 0]
		# (value, gene index) pairs in ascending order of A and of M
		# (M is defined elsewhere in the file)
		A_distr = sorted((A(gi, Y_k, Y_r, N_k, N_r), gi) for gi in nonzero)
		M_distr = sorted((M(gi, Y_k, Y_r, N_k, N_r), gi) for gi in nonzero)
		
from __future__ import division
import argparse, dr_tools, numpy, pylab, random

if '__main__' == __name__:
    parser = argparse.ArgumentParser()
    parser.add_argument('-r', '--rpkms', required=True)
    parser.add_argument('-m', '--minrpkm', default=20, type=float)
    parser.add_argument('-M', '--maxrpkm', type=float)
    parser.add_argument('-gi', '--allowedgenes')
    parser.add_argument('-ge', '--disallowedgenes', nargs='+')
    o = parser.parse_args()

    exprt = dr_tools.loadexpr(o.rpkms)
    allowedgenes = set(dr_tools.loadlist(
        o.allowedgenes)) if o.allowedgenes else None
    if o.disallowedgenes:
        disallowedgenes = set()
        for filename in o.disallowedgenes:
            disallowedgenes.update(set(dr_tools.loadlist(filename)))
    else:
        disallowedgenes = None

    samples = exprt.samples

    for ti, sym in enumerate(exprt['symbols']):
        meanexpr = numpy.mean([exprt[s][ti] for s in samples])
        if meanexpr < o.minrpkm: continue
        if o.maxrpkm is not None and meanexpr >= o.maxrpkm: continue
        if disallowedgenes and sym in disallowedgenes: continue
        if allowedgenes and sym not in allowedgenes: continue
        print sym
Ejemplo n.º 18
0
                        choices=['max', 'mean', 'random'],
                        default='max')
    parser.add_argument('-t',
                        '--transform',
                        choices=['none', 'log10+0.3'],
                        default='none')
    parser.add_argument('-c',
                        '--centering',
                        choices=['none', 'mean'],
                        default='none')
    parser.add_argument('-s', '--samplelist')
    parser.add_argument('-e', '--excludesample', nargs='+')
    o = parser.parse_args()

    # load input
    expr = dr_tools.loadexpr(o.infile)

    # select samples
    if o.samplelist is not None:
        samples = dr_tools.loadlist(o.samplelist)
    else:
        samples = expr.samples
    if o.excludesample:
        samples = [s for s in samples if s not in o.excludesample]

    # select genes
    genes_i = range(len(expr['symbols']))
    if o.maxgenes is not None:
        select_fn = {
            'max': max,
            'mean': numpy.mean,
from __future__ import division
import argparse, pylab, dr_tools
from scipy import stats

if '__main__' == __name__:
	# Script entry point: relate spike-in (ERCC) and mRNA read sums of each
	# sample to a size measure computed from the cell's length and width.
	parser = argparse.ArgumentParser()
	parser.add_argument('rpkmfile')
	parser.add_argument('diatable')
	parser.add_argument('--dim', type=float, default=3)
	o = parser.parse_args()
	
	expr = dr_tools.loadexpr(o.rpkmfile, True)
	# row indices of the spike-in entries: every ID that contains 'ERCC'
	spikes_i = [i for i, ID in enumerate(expr['IDs']) if 'ERCC' in ID]
	
	xarr = []
	yarr = []
	
	for p in dr_tools.splitlines(o.diatable):
		if p[0] == '#sample':
			# header row: remember the columns holding length and width
			index_dia = [p.index('cytoplasm.length'), p.index('cytoplasm.width')]
		else:
			sample = p[0]
			ERCC_readsum = sum(expr[sample][spike] for spike in spikes_i)
			sample_i = expr.samples.index(sample)
			mRNA_readsum = expr.normalizationreads[sample_i]
			try:
				width = float(p[index_dia[1]])
				length = float(p[index_dia[0]])
			except ValueError:
				# rows whose measurements are not numbers are skipped
				continue
			# (width*length) raised to dim/2: with --dim 2 this is the
			# area-like product itself, with the default 3 its 1.5th power
			xarr.append((width*length)**(o.dim/2))
    parser.add_argument('-o', '--figure', default='pool_n.pdf')
    parser.add_argument('--nonrandom_n1', action='store_true')
    o = parser.parse_args()

    if o.random_seed is not None: random.seed(o.random_seed)

    allowedgenes = set(dr_tools.loadlist(
        o.allowedgenes)) if o.allowedgenes else None
    if o.disallowedgenes:
        disallowedgenes = set()
        for filename in o.disallowedgenes:
            disallowedgenes.update(set(dr_tools.loadlist(filename)))
    else:
        disallowedgenes = None

    expra = dr_tools.loadexpr(o.allelehits, True)

    c57fraction = 0.5

    for clonal_group in dr_tools.loadlist(o.clonal_groups):
        if not o.clonal_group in clonal_group: continue
        samples = [s.rsplit('_', 1)[0] for s in expra.samples[::2]]
        samples = [
            s for s in samples if any(
                s.startswith(clonal_group_start)
                or s.startswith('pool.' + clonal_group_start)
                for clonal_group_start in clonal_group.split('\t'))
        ]
        xarr_n = []
        yarr_mono = []
        xarr_ctrl_n = []
Ejemplo n.º 21
0

if '__main__' == __name__:
    opts = argparse.ArgumentParser()
    opts.add_argument('rpkmf_alleles')
    opts.add_argument(
        '--genePred',
        default=
        '/mnt/crick/danielr/twocellstage/mouse/annotation/mm9_refGene_31Jul2011_norandom.txt'
    )
    opts.add_argument('--filter', nargs='+')
    opts.add_argument('-o', '--figurefile', default='monoallelic_by_chr.pdf')
    args = opts.parse_args()

    # load expression data
    expr_alleles = dr_tools.loadexpr([args.rpkmf_alleles], counts=True)
    samples_alleles = sorted(
        e for e in expr_alleles
        if e not in ('IDs', 'symbols') and (args.filter is None or any(
            part in e for part in args.filter)))

    # sort the genes by position
    # only include transcripts which are the first ID in the entry of the rpkm file
    allowed_IDs = set(IDs.split('+')[0] for IDs in expr_alleles['IDs'])
    genes_per_chr = dict()
    ID_to_gene = dict()
    for p in dr_tools.splitlines(args.genePred):
        ID = p[1]
        if ID in allowed_IDs:
            chromosome = p[2]
            if 'random' in chromosome: continue
danielr@rna ~/casthybrid/one_chr_reads $ python allele_independence.py -i ~/casthybrid/snp_positions/allelecounts_from_pileup/v17S15_genomic_refseq_autosomes.txt --stages blast
to allele_independence_blastocyst.pdf
'''

if '__main__' == __name__:
	# Script entry point: for every gene, count in how many sample pairs it is
	# mono-allelic, bi-allelic or silent, and collect the bi-allelic count in
	# the bin chosen by the number of silent pairs.
	opts = argparse.ArgumentParser()
	opts.add_argument('-i', '--inf', nargs='+', required=True)
	opts.add_argument('--stages', nargs='+', help='when there is not a genomewide maternal bias')
	opts.add_argument('--exclude', nargs='+', help='remove from stages', default=[])
	opts.add_argument('--sim', action='store_true')
	opts.add_argument('-o', '--figure', default='allele_independence.pdf')
	opts.add_argument('--plotstyle', default=['mean_graph'], choices=['mean_graph', 'boxplot', 'mean_sem', 'violin', 'std', 'sayN', 'sayY'], nargs='+')
	opts.add_argument('--minN', default=0, type=int)
	o = opts.parse_args()
	
	expra = dr_tools.loadexpr(o.inf, counts=True)
	# pairs() and Bin are defined elsewhere in the file; each element of
	# sample_pairs is unpacked below as (s_pat, s_mat)
	sample_pairs = pairs(expra, o.stages, o.exclude)
	
	# one bin per possible number of silent pairs: 0 .. len(sample_pairs)
	bins = [Bin(num_cells, len(sample_pairs), o.sim) for num_cells in range(len(sample_pairs)+1)]
	
	for gene_i in range(len(expra['symbols'])):
		# exactly one allele detected / both detected / neither detected
		# (num_mono is not used in the part of the script visible here)
		num_mono = sum((expra[s_pat][gene_i]>0)^(expra[s_mat][gene_i]>0) for s_pat,s_mat in sample_pairs)
		num_bi = sum((expra[s_pat][gene_i]>0)and(expra[s_mat][gene_i]>0) for s_pat,s_mat in sample_pairs)
		num_silent = sum((expra[s_pat][gene_i]==0)and(expra[s_mat][gene_i]==0) for s_pat,s_mat in sample_pairs)
		
		bins[num_silent].add(num_bi)
	
	if o.sim:
		# simulate until every bin holds at least 10000 values in exp_frac_bi
		while any(len(b.exp_frac_bi) < 10000 for b in bins):
			# per-gene probability r; each allele of each pair is
			# independently "on" with probability r
			r = random.random()**2
			sim_states = [(random.random() < r, random.random() < r) for p in sample_pairs]
	allowedgenes = set()
	for genelistf in genelistf_arr:
		allowedgenes |= set(dr_tools.loadlist(genelistf))
	return set(i for i,sym in enumerate(expr['symbols']) if sym in allowedgenes)
	

if '__main__' == __name__:
	opts = argparse.ArgumentParser()
	opts.add_argument('inf')
	opts.add_argument('--filter', nargs='+')
	opts.add_argument('-f', '--figf', default='plot_monoallelic_by_cell.pdf')
	opts.add_argument('-gi', '--genelistf_include', nargs='+')
	opts.add_argument('-ge', '--genelistf_exclude', nargs='+')
	o = opts.parse_args()

	expr = dr_tools.loadexpr([o.inf], counts=True)
	#samples = sorted([e for e in expr if e not in ('IDs', 'symbols')])
	
	allowed_gene_i = gene_i_by_listf(o.genelistf_include, expr) if o.genelistf_include else None
	excluded_gene_i = gene_i_by_listf(o.genelistf_exclude, expr) if o.genelistf_exclude else None
	
	
	for p in dr_tools.splitlines(o.inf):
		if p[0] == '#samples': samples = p[1:]; break
	
	fractions = [] # maternal only + paternal only
	mfractions = [] # maternal only
	fractions_all3 = [] # maternal+parternal+biallelic
	labels = []
	
	for s1, s2 in zip(samples[::2], samples[1::2]):
			#if not 'Mix 1' in p[Mix1_i]: raise Exception
		else:
			conc_attomolul += float(p[Mix1_i])
	return conc_attomolul * before_dilution_vol_ul * 602214.12927

if '__main__' == __name__:
	parser = argparse.ArgumentParser()
	parser.add_argument('rpkmfile')
	parser.add_argument('nondiatable')
	parser.add_argument('allelehits')
	o = parser.parse_args()
	
	ERCCvol_ul = 4e-7
	ERCC_moleculenumber = calc_ERCC_moleculenumber('ERCC.txt', ERCCvol_ul)
	
	expr = dr_tools.loadexpr(o.rpkmfile, False)
	spikes_i = [i for i, ID in enumerate(expr['IDs']) if 'ERCC' in ID]
	genes_i = [i for i, ID in enumerate(expr['IDs']) if 'ERCC' not in ID]
	
	xarr = defaultdict(list)
	
	for p in dr_tools.splitlines(o.nondiatable):
		if p[0] == '#sample':
			index_cellsource = p.index('cell.type')
		else:
			sample = p[0]
			if sample == 'BQx46_indD_EmbryoMEF_BxC': continue # degraded sample
			try:
				ERCC_rpkmsum = sum(expr[sample][spike] for spike in spikes_i)
				if ERCC_rpkmsum < 100: continue
				sample_i = expr.samples.index(sample)
	pairs = list(itertools.combinations(samples, 2))
	if len(pairs) > nmax:
		return random.sample(pairs, nmax)
	else:
		return pairs

if '__main__' == __name__:
	# Script entry point: for each group of sample-list files, collect the
	# Spearman correlations between pairs of samples for a boxplot.
	parser = argparse.ArgumentParser()
	parser.add_argument('-r', '--rpkmfile', nargs='+', required=True)
	# each -s starts a new group, which may consist of several list files
	parser.add_argument('-s', '--samplelist', nargs='+', required=True, action='append')
	parser.add_argument('-o', '--figure', default='correlation.pdf')
	parser.add_argument('-n', '--names', action='append')
	parser.add_argument('-m', '--maxpergroup', type=int, default=300000000)
	o = parser.parse_args()
	
	expr = dr_tools.loadexpr(o.rpkmfile)
	boxplot_values = []
	labels = []
	# With action='append' and no -n given, o.names is None, which
	# izip_longest cannot iterate; fall back to an empty list so that every
	# group simply gets an empty label.
	for samplelistgroup, name in itertools.izip_longest(o.samplelist, o.names or []):
		if samplelistgroup is None: raise Exception('more --names than --samplelist groups')
		if name is None: label = ''
		else: label = name + '\n'
		rho_values = []
		samples_used = set()
		possible_pairs = 0
		for samplelistfile in samplelistgroup:
			samples = set(dr_tools.loadlist(samplelistfile))
			# pairs are only formed within one list file; maxpairs caps their number
			rho_values.extend([stats.spearmanr(expr[s1], expr[s2])[0] for s1, s2 in maxpairs(samples, o.maxpergroup)])
			samples_used.update(samples)
			# number of unordered pairs: n*(n-1)/2
			possible_pairs += len(samples) * (len(samples)-1) // 2
		boxplot_values.append(rho_values)
Ejemplo n.º 26
0
from __future__ import division
import argparse, dr_tools, numpy
from collections import defaultdict

if '__main__' == __name__:
	# Script entry point: combine an allele-hit table, a gene annotation and a
	# list of (sample, chromosome) rows.
	parser = argparse.ArgumentParser()
	parser.add_argument('sample_and_chromosome_list')
	parser.add_argument('output_file')
	# NOTE(review): -A and -a are optional for argparse but are used
	# unconditionally below - confirm whether they should be required.
	parser.add_argument('-A', '--annotationfile')
	parser.add_argument('-a', '--allelehits')
	o = parser.parse_args()
	
	# the same file is loaded twice, with the second argument False and True
	exprr = dr_tools.loadexpr(o.allelehits, False)
	expra = dr_tools.loadexpr(o.allelehits, True)
	
	# chromosome -> set of IDs, read from columns 2 (chromosome) and 1 (ID)
	# of the annotation; column 12 is read into sym
	chrom_to_IDs = defaultdict(set)
	for p in dr_tools.splitlines(o.annotationfile):
		chrom = p[2]
		sym = p[12]
		ID = p[1]
		chrom_to_IDs[chrom].add(ID)
	
	samples_set = set(expra.samples)
	
	with open(o.sample_and_chromosome_list) as infh:
		for line in infh:
			# whitespace-separated row: sample name, chromosome
			p = line.split()
			chrom = p[1]
			s_c57 = p[0]+'_c57only'
			s_cast = p[0]+'_castonly'
			# NOTE(review): this tests the bare sample name, while the two
			# names built above carry an allele suffix; if expra.samples
			# holds the suffixed names every row is skipped - verify.
			if p[0] not in samples_set: continue
def MAfraction(expra, sample):
	"""Return the share of entries with hits on exactly one allele.

	Entries are read in parallel from expra['symbols'],
	expra[sample+'_c57only'] and expra[sample+'_castonly']. An entry counts
	as biallelic when both allele values are truthy and as monoallelic when
	exactly one is; entries where both are falsy are ignored. The result is
	monoallelic / (biallelic + monoallelic).

	Raises ZeroDivisionError when no entry has a truthy value for either allele.
	"""
	symbols = expra['symbols']
	c57_hits = expra[sample+'_c57only']
	cast_hits = expra[sample+'_castonly']
	# tally[1] counts one-allele entries, tally[2] counts two-allele entries
	tally = [0, 0, 0]
	for _, c57_value, cast_value in zip(symbols, c57_hits, cast_hits):
		alleles_seen = (1 if c57_value else 0) + (1 if cast_value else 0)
		tally[alleles_seen] += 1
	n_mono, n_bi = tally[1], tally[2]
	return n_mono/(n_bi + n_mono)

if '__main__' == __name__:
	# Command line: an allele-hits expression file, a table with per-sample
	# 'cytoplasm.length' / 'cytoplasm.width' columns, and an optional --dim value.
	parser = argparse.ArgumentParser()
	parser.add_argument('allelehits')
	parser.add_argument('diatable')
	parser.add_argument('--dim', type=float, default=3)
	o = parser.parse_args()
	
	expra = dr_tools.loadexpr(o.allelehits, True)
	
	xarr = []
	yarr = []
	
	for p in dr_tools.splitlines(o.diatable):
		if p[0] == '#sample':
			# header row: remember the positions of the length and width columns, in that order
			index_dia = [p.index('cytoplasm.length'), p.index('cytoplasm.width')]
		else:
			# data row; NOTE(review): index_dia only exists once a '#sample' row
			# has been seen, so a data row before the header raises NameError
			sample = p[0]
			
			try:
				width = float(p[index_dia[1]])
				length = float(p[index_dia[0]])
			except ValueError:
				# a measurement that does not parse as a float: skip this row
				continue
    # Remaining command-line options; `opts` and the options behind
    # o.rpkmf_alleles and o.filter are created above this section.
    opts.add_argument('-s', '--bootstrap', type=int)
    opts.add_argument('-S',
                      '--states',
                      default='3state',
                      choices=['3state', 'fraction', 'diff', 'monoallelic'])
    opts.add_argument('--fig', default='which_allele_tree.pdf')
    opts.add_argument('-r', '--rpkmf_total')
    opts.add_argument('-t',
                      '--threshold_rpkm',
                      help='requires --rpkmf_total',
                      type=float)
    opts.add_argument('-R', '--randomize', action='store_true')
    o = opts.parse_args()

    # load expression data
    expr_alleles = dr_tools.loadexpr([o.rpkmf_alleles], counts=True)
    # every key of expr_alleles except the 'IDs' and 'symbols' entries, sorted
    samples_alleles = sorted(
        [e for e in expr_alleles if e not in ('IDs', 'symbols')])
    # expr_total / exprt_samples exist only when --rpkmf_total was given
    if o.rpkmf_total is not None:
        expr_total = dr_tools.loadexpr([o.rpkmf_total], counts=False)
        exprt_samples = set(expr_total.samples)

    character_matrix = []  # 2D, values from state()
    samplenames = []

    # walk the sorted names two at a time: (0, 1), (2, 3), ...
    for s1, s2 in zip(samples_alleles[::2], samples_alleles[1::2]):
        # with a filter set, keep the pair only if some filter entry occurs in
        # s1 with its last '_'-separated suffix removed
        if o.filter is not None and not any(part in s1.rsplit('_', 1)[0]
                                            for part in o.filter):
            continue
        samplename = s1.rsplit('_', 1)[0]
        # check that sample labels are consistent
	# Remaining command-line options; `opts` and the options behind
	# args.rpkmf_alleles and args.filter are created above this section.
	opts.add_argument('-f', '--figurefile', default='monoallelic_at_chr.png')
	opts.add_argument('-w', '--maxwhite', type=int)
	opts.add_argument('--allowallwhite', action='store_true')
	opts.add_argument('--allowedgenes')
	opts.add_argument('--disallowedgenes')
	opts.add_argument('--verticalborder', action='store_true')
	opts.add_argument('--stageline', action='store_true')
	opts.add_argument('--embryoline', action='store_true')
	opts.add_argument('--embryonotch', action='store_true')
	opts.add_argument('--mincoord', type=int)
	opts.add_argument('--maxcoord', type=int)
	opts.add_argument('--saygenes', action='store_true')
	args = opts.parse_args()
	
	# load expression data
	expr_alleles = dr_tools.loadexpr([args.rpkmf_alleles], counts=True)
	# NOTE(review): this sorted list is replaced a few lines below by the
	# file-order list, so the value assigned here is never read in this section
	samples_alleles = sorted(e for e in expr_alleles if e not in ('IDs', 'symbols') and (args.filter is None or any(part in e for part in args.filter)))
	
	# take the sample names from the first '#samples' row of the file;
	# `samples` stays undefined if the file has no such row
	for p in dr_tools.splitlines(args.rpkmf_alleles):
		if p[0] == '#samples': samples = p[1:]; break
	# keep names containing at least one filter entry (all names when no filter is set)
	samples_alleles = [e for e in samples if (args.filter is None or any(part in e for part in args.filter))]
	
	# sort the genes by position
	# only include transcripts which are the first ID in the entry of the rpkm file
	if args.allowedgenes is None and args.disallowedgenes is None:
		# first '+'-separated part of every IDs entry
		allowed_IDs = set(IDs.split('+')[0] for IDs in expr_alleles['IDs'])
	else:
		# each set is only defined when its option was given
		if args.allowedgenes:
			allowed_set = set(dr_tools.loadlist(args.allowedgenes))
		if args.disallowedgenes:
			disallowed_set = set(dr_tools.loadlist(args.disallowedgenes))
if '__main__' == __name__:
    # Command line: one required and one optional input file, each with its
    # own fraction (-F1 / -F2), an output path, and rounding / minimum-read options.
    opts = argparse.ArgumentParser()
    opts.add_argument('-i1', '--inf1', required=True)  # e.g. ooref15...
    opts.add_argument('-F1', default=0.02, type=float)
    opts.add_argument('-i2', '--inf2')  # e.g. ooref13...
    opts.add_argument('-F2', type=float, default=0)
    opts.add_argument('-o', '--outf', default='/dev/stdout')
    opts.add_argument('--addminreads', type=int, default=0)
    opts.add_argument('--round',
                      choices=['0.5up', 'ceil', 'floor'],
                      default='ceil')
    opts.add_argument('--minreadsboth', type=int, default=0)
    args = opts.parse_args()

    expr1 = dr_tools.loadexpr([args.inf1], counts=True)
    # expr2 is only defined when --inf2 was given
    if args.inf2 is not None:
        expr2 = dr_tools.loadexpr([args.inf2], counts=True)
    # only the first row of inf1 is read: everything after its first field
    # becomes the sample list (keeps the file's column order)
    for i, p in enumerate(dr_tools.splitlines(args.inf1)):
        samples = p[1:]
        break

    gene_counts_out = defaultdict(list)

    # samples are taken two at a time: (0, 1), (2, 3), ...
    for s1, s2 in zip(samples[::2], samples[1::2]):
        # both names of a pair must agree once the last '_'-separated suffix is removed
        if s1.rsplit('_', 1)[0] != s2.rsplit('_', 1)[0]: raise Exception
        for gene_i, symbol in enumerate(expr1['symbols']):
            # remove a fraction F of the paternal chromosome's expression from the maternal chromosome's, and vice versa

            # combined value of the pair below --minreadsboth: s1e is set to zero
            if expr1[s1][gene_i] + expr1[s2][gene_i] < args.minreadsboth:
                s1e = 0
	roundfunc = {'ceil':math.ceil, '0.5up':round, 'floor':math.floor}[rounding]
	return max(0, expr_s1 - roundfunc(F*expr_s2+addminreads))

if '__main__' == __name__:
	# Command line: one required and one optional input file, each with its
	# own fraction (-F1 / -F2), an output path, and rounding / minimum-read options.
	opts = argparse.ArgumentParser()
	opts.add_argument('-i1', '--inf1', required=True) # e.g. ooref15...
	opts.add_argument('-F1', default=0.02, type=float)
	opts.add_argument('-i2', '--inf2') # e.g. ooref13...
	opts.add_argument('-F2', type=float, default=0)
	opts.add_argument('-o', '--outf', default='/dev/stdout')
	opts.add_argument('--addminreads', type=int, default=0)
	opts.add_argument('--round', choices=['0.5up', 'ceil', 'floor'], default='ceil')
	opts.add_argument('--minreadsboth', type=int, default=0)
	args = opts.parse_args()
	
	expr1 = dr_tools.loadexpr([args.inf1], counts=True)
	# expr2 is only defined when --inf2 was given
	if args.inf2 is not None: expr2 = dr_tools.loadexpr([args.inf2], counts=True)
	# only the first row of inf1 is read: everything after its first field
	# becomes the sample list (keeps the file's column order)
	for i, p in enumerate(dr_tools.splitlines(args.inf1)):
		samples = p[1:]
		break
	
	gene_counts_out = defaultdict(list)
	
	# samples are taken two at a time: (0, 1), (2, 3), ...
	for s1, s2 in zip(samples[::2], samples[1::2]):
		# both names of a pair must agree once the last '_'-separated suffix is removed
		if s1.rsplit('_',1)[0] != s2.rsplit('_',1)[0]: raise Exception
		for gene_i, symbol in enumerate(expr1['symbols']):
			# remove a fraction F of the paternal chromosome's expression from the maternal chromosome's, and vice versa
			
			
			# combined value of the pair below --minreadsboth: s1e is set to zero
			if expr1[s1][gene_i] + expr1[s2][gene_i] < args.minreadsboth:
				s1e = 0
	# Remaining command-line options; `parser` and the options behind
	# o.random_seed, o.allowedgenes, o.disallowedgenes, o.allelehits,
	# o.clonal_groups, o.clonal_group and o.start_n are defined above this section.
	parser.add_argument('-n2', '--end_n', default=15, type=int)
	parser.add_argument('-o', '--figure', default='pool_n.pdf')
	parser.add_argument('--nonrandom_n1', action='store_true')
	o = parser.parse_args()
	
	# seed the random module only when a seed was supplied
	if o.random_seed is not None: random.seed(o.random_seed)
	
	# allowedgenes: set loaded from a single file, or None when the option is unset
	allowedgenes = set(dr_tools.loadlist(o.allowedgenes)) if o.allowedgenes else None
	# disallowedgenes: union over all listed files, or None when the option is unset
	if o.disallowedgenes:
		disallowedgenes = set()
		for filename in o.disallowedgenes:
			disallowedgenes.update(set(dr_tools.loadlist(filename)))
	else:
		disallowedgenes = None
	
	expra = dr_tools.loadexpr(o.allelehits, True)
	
	c57fraction = 0.5
	
	for clonal_group in dr_tools.loadlist(o.clonal_groups):
		# substring test: only entries whose text contains o.clonal_group are processed
		if not o.clonal_group in clonal_group: continue
		# every second entry of expra.samples with its last '_'-separated suffix removed
		samples = [s.rsplit('_',1)[0] for s in expra.samples[::2]]
		# keep samples starting with any tab-separated field of the entry, optionally prefixed by 'pool.'
		samples = [s for s in samples if any(s.startswith(clonal_group_start) or s.startswith('pool.'+clonal_group_start) for clonal_group_start in clonal_group.split('\t'))]
		xarr_n = []
		yarr_mono = []
		xarr_ctrl_n = []
		yarr_ctrl = []
		xarr_n_line = []
		yarr_mono_line = []
		yarr_ctrl_line = []
		# n runs from o.start_n to o.end_n inclusive
		for n in range(o.start_n, o.end_n+1):