def loadfile(infile):
    """Return a dict mapping sample name to read count parsed from `infile`.

    Each row produced by dr_tools.splitlines(infile) carries a path-like
    label in column 0 and a number in column 1. The sample name is the last
    '/'-separated component of the label, cut off at the first
    '_refseq.txt:#'. A later row overwrites an earlier row with the same
    sample name.
    """
    read_counts = {}
    for fields in dr_tools.splitlines(infile):
        basename = fields[0].split('/')[-1]
        sample_name = basename.split('_refseq.txt:#')[0]
        read_counts[sample_name] = float(fields[1])
    return read_counts
def make_snp2gene_file(genepred, snptable, outfile_mpileup, outfile_snp2genes, include_overlap=False):
	"""Assign SNPs to genes and write a per-gene SNP table plus a position list.

	genepred: annotation file; every line not starting with '#' is parsed
		with fromannotationline() and each exon is registered as a
		dr_tools.Cregion tagged with the gene name.
	snptable: SNP table read with dr_tools.splitlines(); only rows whose
		column 11 equals 'single' are used.
	outfile_mpileup: output path, one 'column1<TAB>column3' line per kept SNP.
	outfile_snp2genes: output path, one line per gene built by
		dr_tools.join() from the gene name, its SNP count and the sorted,
		';'-joined SNPs, each formatted as 'column1:column3|column9'.
	include_overlap: if true, a SNP overlapping exons of several genes is
		assigned to each of them; otherwise only SNPs overlapping exactly one
		gene are kept.
	"""
	# register every exon as a region tagged with its gene name, so that the
	# SNP positions can be looked up with Cregion.overlappingpoint() below
	with open(genepred) as infh:
		for line in infh:
			if line.startswith('#'): continue
			chromosome, strand, cdsstart, cdsend, exonstarts, exonends, genename, ID, inferred_strand = fromannotationline(line)
			for start, end in zip(exonstarts, exonends):
				exon = dr_tools.Cregion(chromosome, start, end)
				exon.gene = genename
				exon.addtowindows()
	
	snps_per_gene = defaultdict(list)
	snp_positions = []
	
	for p in dr_tools.splitlines(snptable):
		# e.g. 585     chr1    10019   10020   rs376643643     0       +       A       A       -/A     genomic deletion        unknown 0       0       near-gene-5     exact   1               1       SSMP,   0   
		
		if p[11] != 'single': continue # ignore non-SNPs
		
		chromosome = p[1]
		position = int(p[2]) # 0-based
		genes = set(exon.gene for exon in dr_tools.Cregion.overlappingpoint(chromosome, position))
		if not genes: continue # SNP outside every registered exon
		if include_overlap or len(genes) == 1:
			for gene in genes:
				snps_per_gene[gene].append('%s:%s|%s'%(p[1], p[3], p[9]))
			# record the position once per SNP, not once per overlapping gene,
			# so the position list has no duplicate lines
			snp_positions.append('%s\t%s'%(p[1], p[3]))
	
	with open(outfile_snp2genes, 'w') as outfh:
		for gene, snps in snps_per_gene.items():
			print >>outfh, dr_tools.join(gene, len(snps), ';'.join(sorted(snps)))
	
	with open(outfile_mpileup, 'w') as outfh:
		for snpline in snp_positions:
			print >>outfh, snpline
def loadfile(infile):
	"""Parse `infile` into a {sample name: read count} dict.

	Column 0 of each row from dr_tools.splitlines(infile) is a path-like
	label; its last '/'-separated component, cut at the first
	'_refseq.txt:#', is the sample name. Column 1 is converted to float.
	Rows with a repeated sample name replace the earlier value.
	"""
	counts_by_sample = {}
	for row in dr_tools.splitlines(infile):
		label = row[0].split('/')[-1]
		name = label.split('_refseq.txt:#')[0]
		counts_by_sample[name] = float(row[1])
	return counts_by_sample
def calc_ERCC_moleculenumber(tablefile, before_dilution_vol_ul):
	"""Return the number of ERCC molecules contained in the given volume.

	tablefile: table read with dr_tools.splitlines(); the first row must
		contain 'attomoles/ul' in column 3, and column 3 of every following
		row is summed up as a concentration.
	before_dilution_vol_ul: volume the summed concentration is multiplied by.

	Raises Exception if column 3 of the first row does not mention
	'attomoles/ul'.
	"""
	Mix1_i = 3
	molecules_per_attomol = 602214.12927 # Avogadro's number (6.0221412927e23) * 1e-18
	conc_attomolul = 0
	for i, p in enumerate(dr_tools.splitlines(tablefile)):
		if i == 0:
			# guard against reading the wrong column or a table in other units
			if 'attomoles/ul' not in p[Mix1_i]:
				raise Exception("expected 'attomoles/ul' in header column %d of %s, found %r" % (Mix1_i, tablefile, p[Mix1_i]))
			#if not 'Mix 1' in p[Mix1_i]: raise Exception
		else:
			conc_attomolul += float(p[Mix1_i])
	return conc_attomolul * before_dilution_vol_ul * molecules_per_attomol
def load_geneset(ID_to_symbol, filename):
	"""Load gene groups from `filename` and index them by gene symbol.

	ID_to_symbol: dict mapping gene ID to gene symbol; only its IDs and
		symbols are recognised.
	filename: file read with dr_tools.splitlines(filename, sep=';'); each
		line is one group of gene names.

	Returns a dict mapping every recognised symbol to the frozenset of
	recognised symbols on its line. Names that are known symbols are kept
	as they are; names that are known IDs (and were not already kept as a
	symbol) are translated to their symbol. If a symbol occurs on several
	lines, the last line wins.
	"""
	allowed_symbols = set(ID_to_symbol.values())
	entries = dict()
	for genes in dr_tools.splitlines(filename, sep=';'):
		genes_sym = set(gene for gene in genes if gene in allowed_symbols)
		# the right-hand set is built completely before the union, so the
		# 'not in genes_sym' test only sees the names kept as symbols above
		genes_sym |= set(ID_to_symbol[gene] for gene in genes if gene in ID_to_symbol and gene not in genes_sym)
		group = frozenset(genes_sym) # one shared immutable set per line
		for sym in group:
			entries[sym] = group
	return entries
def table_loader():
	for p in dr_tools.splitlines(o.nondiatable):
		if p[0] == '#sample':
			index_cellsource = p.index('cell.type')
		else:
			try:
				sample = p[0]
				cellsource = p[index_cellsource]
				if sample == 'BQx46_indD_EmbryoMEF_BxC': continue # degraded sample
				sample_i = expr.samples.index(sample)
				if cellsource in ('mef', 'MAF'): cellsource='fibroblast'
				yield p, sample, sample_i, cellsource
			except KeyError:
				print 'missing', sample
				continue
def table_loader():
	for p in dr_tools.splitlines(o.nondiatable):
		if p[0] == '#sample':
			index_cellsource = p.index('cell.type')
		elif p[0].startswith('#'): pass
		else:
			try:
				sample = p[0]
				cellsource = p[index_cellsource]
				if sample == 'BQx46_indD_EmbryoMEF_BxC': continue # degraded sample
				sample_i = expr.samples.index(sample)
				if cellsource in ('mef', 'MAF'): cellsource='fibroblast'
				yield p, sample, sample_i, cellsource
			except KeyError:
				print 'missing', sample
				continue
	samples_alleles = sorted(e for e in expr_alleles if e not in ('IDs', 'symbols') and (args.filter is None or any(part in e for part in args.filter)) and not any(part in e for part in args.exclude))
	allowed_IDs = set(IDs.split('+')[0] for IDs in expr_alleles['IDs'])
	
	fractions_to_show = list()
	vals_real = list()
	vals_ctrl = list()
	labels = list()
	bootstrap_output = list()
	
	
	# sort the genes by position
	# only include transcripts which are the first ID in the entry of the rpkm file
	for mindist,maxdist in zip(args.bindist[:-1], args.bindist[1:]):
		genes_per_chr = dict()
		ID_to_gene = dict()
		for p in dr_tools.splitlines(args.genePred):
			ID = p[1]
			if ID in allowed_IDs:
				chromosome = p[2]
				if chromosome in ('chrX', 'chrY'): continue
				if not chromosome in genes_per_chr: genes_per_chr[chromosome] = []
				genes_per_chr[chromosome].append(Gene(ID, int(p[4]) if p[3]=='+' else int(p[5]), p[3]))
		for chromosome in genes_per_chr:
			genes_per_chr[chromosome].sort(key=lambda gene: gene.TSS)
			# genes on every chromosome except the current one; the inner loop
			# must index by chr_key, otherwise it repeats the current chromosome's genes
			non_chr_genes = [gene for chr_key in genes_per_chr if chr_key != chromosome for gene in genes_per_chr[chr_key]]
			for gene_i, gene in enumerate(genes_per_chr[chromosome]):
				ID_to_gene[gene.ID] = gene
				# only include neighbours in the 'forward' direction, to avoid dependence in stat tests later on
				gene.neighbours = genes_per_chr[chromosome][gene_i+1:args.numadjacent+gene_i+1]
				gene.neighbours = [other for other in gene.neighbours if mindist <= abs(other.TSS-gene.TSS) < maxdist]
				if args.onlydiffstrand:
def get_loadings(filepath):
	"""Read a two-column loadings file and derive a short name from its path.

	Returns (loadingsdict, name): loadingsdict maps column 0 of each row
	from dr_tools.splitlines(filepath) to column 1 as a float; name is the
	file's base name, reduced to what follows the last '_loadings_' and
	precedes the first '.' after it.
	"""
	loadingsdict = {}
	for key, val in dr_tools.splitlines(filepath):
		loadingsdict[key] = float(val)
	basename = filepath.split('/')[-1]
	after_tag = basename.rsplit('_loadings_', 1)[-1]
	name = after_tag.split('.')[0]
	return loadingsdict, name
	set2 = set(entries2)
	set1_unique_c = len(set(entries1[sym] for sym in (set1-set2)))
	set2_unique_c = len(set(entries2[sym] for sym in (set2-set1)))
	common_c = len(set(entries1[sym] for sym in (set2&set1)))
	common_c2 = len(set(entries2[sym] for sym in (set2&set1)))
	if not common_c == common_c2: raise Exception
	saygenes = []
	for genes in set(entries2[sym] for sym in (set2&set1)):
		saygenes.append(';'.join(list(genes)))
	return set1_unique_c, common_c, set2_unique_c, ', '.join(saygenes)

if '__main__' == __name__:
	parser = argparse.ArgumentParser()
	parser.add_argument('-A', '--annotationfile', default='/mnt/crick/danielr/Xandclones_BR/BR_fibroblasts/snp-call/more_formats/mm9_ensembl_refseq_norandom_11Apr2012_genesymbols.txt')
	parser.add_argument('-a', '--set1', required=True)
	parser.add_argument('-b', '--set2', required=True)
	parser.add_argument('-ge', '--disallowedgenes', nargs='+')
	o = parser.parse_args()
	
	if o.disallowedgenes:
		disallowedgenes = set()
		for filename in o.disallowedgenes:
			disallowedgenes.update(set(dr_tools.loadlist(filename)))
	else:
		disallowedgenes = None
	
	ID_to_symbol = dict((p[1], p[12]) for p in dr_tools.splitlines(o.annotationfile) if disallowedgenes is None or p[12] not in disallowedgenes)
	
	print dr_tools.join(overlap_of_2(load_geneset(ID_to_symbol, o.set1), load_geneset(ID_to_symbol, o.set2)))
	print len(set(ID_to_symbol.values()))
Beispiel #11
0
    opts.add_argument('-o', '--figurefile', default='monoallelic_by_chr.pdf')
    args = opts.parse_args()

    # load expression data
    expr_alleles = dr_tools.loadexpr([args.rpkmf_alleles], counts=True)
    samples_alleles = sorted(
        e for e in expr_alleles
        if e not in ('IDs', 'symbols') and (args.filter is None or any(
            part in e for part in args.filter)))

    # sort the genes by position
    # only include transcripts which are the first ID in the entry of the rpkm file
    allowed_IDs = set(IDs.split('+')[0] for IDs in expr_alleles['IDs'])
    genes_per_chr = dict()
    ID_to_gene = dict()
    for p in dr_tools.splitlines(args.genePred):
        ID = p[1]
        if ID in allowed_IDs:
            chromosome = p[2]
            if 'random' in chromosome: continue
            if not chromosome in genes_per_chr: genes_per_chr[chromosome] = []
            genes_per_chr[chromosome].append(
                Gene(ID,
                     int(p[4]) if p[3] == '+' else int(p[5])))
    for chromosome in genes_per_chr:
        genes_per_chr[chromosome].sort(key=lambda gene: gene.TSS)
        for gene_i, gene in enumerate(genes_per_chr[chromosome]):
            ID_to_gene[gene.ID] = gene

    samples = []
                        default=97)  # 97 for both round1 and round2
    parser.add_argument('--max_dm', type=float)
    parser.add_argument('--not_clusterabs',
                        action='store_false',
                        dest='clusterabs')
    parser.add_argument('--reverse_order', action='store_true')
    o = parser.parse_args()
    o.network = False

    proteins = set()
    correlations_together = defaultdict(lambda: defaultdict(list))
    correlations_opposing = defaultdict(lambda: defaultdict(list))
    qvalues_together = defaultdict(lambda: defaultdict(list))
    qvalues_opposing = defaultdict(lambda: defaultdict(list))
    for file_in in o.file_in:
        for p in dr_tools.splitlines(file_in):
            if p[0].startswith('(('): continue
            peakpoint = p[3]
            r = float(p[4])
            protline = p[2]

            if o.coexpression:
                r = float(p[5])

            if protline.startswith('req'):
                filter_pos, filter_neg = split_pos_neg(
                    protline.split('_')[0][3:])
                proteins_pos, proteins_neg = split_pos_neg(
                    protline.split('_')[1])
            else:
                proteins_pos, proteins_neg = split_pos_neg(protline)
file1 = '/mnt/crick/rickards/projects/hsa_snp_calling/snp_stats_ac2.txt'
file2 = '/mnt/kauffman/danielr/Xandclones_late2014/Tcell/male_P1299_YFV2001_newsnpcall/SNP_list/SNPs_per_gene.txt' # created by make_allelecalls.py -P using the -a and -s arguments
output = '/mnt/kauffman/danielr/Xandclones_late2014/Tcell/male_P1299_YFV2001_newsnpcall/SNP_list/heterozygous_SNPs_per_gene.txt'


import dr_tools

positions = set()
for p in dr_tools.splitlines(file1): # for each SNP line in the file
	if float(p[-2]) < 0.9: # if second last column's value is <0.9
		positions.add('%s:%s'%(p[0], p[1])) # add to allowed SNP list

print len(positions)
c=0
outfh = open(output, 'w')
for p in dr_tools.splitlines(file2): # for each gene
	snps = []
	for snpinfo in p[2].split(';'): # go through the SNPs for the gene
		if snpinfo.split('|')[0] in positions: # see if on allowed list
			snps.append(snpinfo) # add to SNPs to print to output
			c+=1
	print >>outfh, dr_tools.join(p[0], len(snps), ';'.join(snps)) # output the SNPs for the gene
outfh.close()
print c
Beispiel #14
0
        help='same output format as validated_mm9_refseq_snp2genes.txt')
    parser.add_argument('--include_overlap', action='store_true')
    o = parser.parse_args()

    with open(o.genepred) as infh:
        for line in infh:
            chromosome, strand, cdsstart, cdsend, exonstarts, exonends, genename, ID, inferred_strand = fromannotationline(
                line)
            for start, end in zip(exonstarts, exonends):
                exon = dr_tools.Cregion(chromosome, start, end)
                exon.gene = genename
                exon.addtowindows()

    snps_per_gene = defaultdict(list)

    for p in dr_tools.splitlines(o.snplist):
        # e.g. chr11   117883408       C       A       0       1.00    -1.00   0.90    0.10    0.71    0.29

        if ',' in p[2] or ',' in p[3]:
            continue  # added 18 Dec, since snp_stats2.py -S removes these SNPs anyway

        chromosome = p[0]
        position = int(p[1]) - 1
        genes = set(exon.gene for exon in dr_tools.Cregion.overlappingpoint(
            chromosome, position))
        if o.include_overlap:
            for gene in genes:
                snps_per_gene[gene].append('%s:%s' % (p[0], p[1]))
        else:
            if len(genes
                   ) == 1:  # don't allow overlapping genes, exclude those SNPs
    proteins_neg = [p.strip('*') for p in o.proteins_neg]
    random.seed(0)
    if o.pdf is None:
        o.pdf = 'expression_' + ''.join([p + '+'
                                         for p in proteins_pos]) + ''.join(
                                             [p + '-'
                                              for p in proteins_neg]) + '.pdf'
    elif o.title is None:
        o.title = ''.join([p + '+' for p in proteins_pos]) + ''.join(
            [p + '-' for p in proteins_neg])

    samples_csv = o.csv_in
    if o.use_from_list is not None:
        samples_to_use = set()
        samples_possible = [f for f in os.listdir('.') if f.endswith('.csv')]
        for p in dr_tools.splitlines(o.csv_in[0]):
            if p[1] == o.use_from_list:
                samples_to_use.update([
                    s for s in samples_possible
                    if s == p[0] or s.startswith(p[0] + '_')
                ])
        samples_csv = list(samples_to_use)
        o.csv_in = o.csv_in[1:]
    if len(o.csv_in) == 1 and o.csv_in[0].endswith('.txt'):
        with open(o.csv_in[0], 'rU') as infh:
            samples_csv = [f.strip() for f in infh.read().split()]

    levels = defaultdict(list)
    for csv_in in samples_csv:
        with open(csv_in, 'rb') as infh:
            sample = csv_in.rsplit('/', 1)[-1].split('.csv')[0]
	opts = argparse.ArgumentParser()
	opts.add_argument('inf')
	opts.add_argument('--filter', nargs='+')
	opts.add_argument('-f', '--figf', default='plot_monoallelic_by_cell.pdf')
	opts.add_argument('-gi', '--genelistf_include', nargs='+')
	opts.add_argument('-ge', '--genelistf_exclude', nargs='+')
	o = opts.parse_args()

	expr = dr_tools.loadexpr([o.inf], counts=True)
	#samples = sorted([e for e in expr if e not in ('IDs', 'symbols')])
	
	allowed_gene_i = gene_i_by_listf(o.genelistf_include, expr) if o.genelistf_include else None
	excluded_gene_i = gene_i_by_listf(o.genelistf_exclude, expr) if o.genelistf_exclude else None
	
	
	for p in dr_tools.splitlines(o.inf):
		if p[0] == '#samples': samples = p[1:]; break
	
	fractions = [] # maternal only + paternal only
	mfractions = [] # maternal only
	fractions_all3 = [] # maternal+parternal+biallelic
	labels = []
	
	for s1, s2 in zip(samples[::2], samples[1::2]):
		if o.filter is not None and not any(part in s1.rsplit('_',1)[0] for part in o.filter): continue
		if s1.rsplit('_',1)[0] != s2.rsplit('_',1)[0] or not 'c57' in s1 or 'c57' in s2:
			print 'Error in pair:', s1, s2
			continue # check for errors in input file format
		Z = zip(expr[s1], expr[s2])
		if o.genelistf_include or o.genelistf_exclude:
			Z = [E for i,E in enumerate(Z) if (allowed_gene_i is None or i in allowed_gene_i) and (excluded_gene_i is None or i not in excluded_gene_i)] # only those includes in the gene list
	parser.add_argument('-i', '--chrom_include', nargs='+', help="'rest' matches all not in -i or -e, 'random' matches e.g. chr1_random", required=True)
	parser.add_argument('-e', '--chrom_exclude', nargs='+', help="'rest' matches all not in -i or -e, 'random' matches e.g. chr1_random", default=[])
	o = parser.parse_args()
	
	all_chromosomes = set(dr_tools.loadlist(o.genePred, 2))
	
	include = set(c for c in all_chromosomes if c in o.chrom_include)
	exclude = set(c for c in all_chromosomes if c in o.chrom_exclude)
	if 'random' in o.chrom_include:
		include.update(set(c for c in all_chromosomes if '_random' in c and c not in exclude))
	if 'random' in o.chrom_exclude:
		exclude.update(set(c for c in all_chromosomes if '_random' in c and c not in include))
	if 'rest' in o.chrom_include:
		include.update(all_chromosomes-exclude)
	if 'rest' in o.chrom_exclude:
		exclude.update(all_chromosomes-include)
	
	
	genes_incl = set()
	genes_excl = set()
	
	for p in dr_tools.splitlines(o.genePred):
		symbol = p[12]
		chromosome = p[2]
		if chromosome in include: genes_incl.add(symbol)
		elif chromosome in exclude: genes_excl.add(symbol)
	
	with open(o.genelist_out, 'w') as outfh:
		for gene in genes_incl-genes_excl:
			print >>outfh, gene
Beispiel #18
0
file1 = '/mnt/crick/rickards/projects/hsa_snp_calling/snp_stats_ac2.txt'
file2 = '/mnt/kauffman/danielr/Xandclones_late2014/Tcell/male_P1299_YFV2001_newsnpcall/SNP_list/SNPs_per_gene.txt'  # created by make_allelecalls.py -P using the -a and -s arguments
output = '/mnt/kauffman/danielr/Xandclones_late2014/Tcell/male_P1299_YFV2001_newsnpcall/SNP_list/heterozygous_SNPs_per_gene.txt'

import dr_tools

positions = set()
for p in dr_tools.splitlines(file1):  # for each SNP line in the file
    if float(p[-2]) < 0.9:  # if second last column's value is <0.9
        positions.add('%s:%s' % (p[0], p[1]))  # add to allowed SNP list

print len(positions)
c = 0
outfh = open(output, 'w')
for p in dr_tools.splitlines(file2):  # for each gene
    snps = []
    for snpinfo in p[2].split(';'):  # go through the SNPs for the gene
        if snpinfo.split('|')[0] in positions:  # see if on allowed list
            snps.append(snpinfo)  # add to SNPs to print to output
            c += 1
    print >> outfh, dr_tools.join(
        p[0], len(snps), ';'.join(snps))  # output the SNPs for the gene
outfh.close()
print c
import argparse, dr_tools

if '__main__' == __name__:
	parser = argparse.ArgumentParser()
	parser.add_argument('snp2genes', help='/mnt/kauffman/danielr/crick/Xandclones_BR/snp-validation/snp2genes/ensembl__nooverlap_ra5val_3percent.txt')
	parser.add_argument('-g', '--genelist')
	o = parser.parse_args()
	
	
	if o.genelist: allowedgenes = set(dr_tools.loadlist(o.genelist))
	
	num_snps = 0
	num_genes = 0
	for p in dr_tools.splitlines(o.snp2genes):
		if o.genelist and p[0] not in allowedgenes: continue
		num_snps += int(p[1])
		num_genes += 1
	print 'snps:', num_snps
	print 'genes:', num_genes
    return header, markers, marker_order


if '__main__' == __name__:
    parser = argparse.ArgumentParser()
    parser.add_argument('-r', '--rpkmfile')
    parser.add_argument('--tableS4', default='tableS4.txt')
    parser.add_argument('--to_cytof_markers',
                        default='symbol_to_cytof_marker.txt')
    parser.add_argument('--shuffle_patterns', action='store_true')
    parser.add_argument('-o', '--sample_list_prefix')
    o = parser.parse_args()

    header, markers, marker_order = parse_table(o.tableS4)

    gene_to_marker = dict(dr_tools.splitlines(o.to_cytof_markers))
    marker_order = [m for m in marker_order if m in gene_to_marker.values()]
    # per population (column of the table), the marker values in marker_order
    if not o.shuffle_patterns:
        pop_cytof_pattern = dict(
            (pop, [markers[m][popi] for m in marker_order])
            for popi, pop in enumerate(header))
    else:
        # random.shuffle() shuffles in place and returns None, so each pattern
        # has to be built and shuffled before it is stored
        # NOTE(review): random.seed(0) is only called further down, so this
        # shuffle is not reproducible between runs - confirm that is intended
        pop_cytof_pattern = dict()
        for popi, pop in enumerate(header):
            pattern = [markers[m][popi] for m in marker_order]
            random.shuffle(pattern)
            pop_cytof_pattern[pop] = pattern
    exprt = dr_tools.loadexpr(o.rpkmfile)
    random.seed(0)

    midexpr_symi_all_D = dict()
    for symi, sym in enumerate(exprt['symbols']):
        if sym not in gene_to_marker:
import dr_tools, os, argparse

parser = argparse.ArgumentParser()
parser.add_argument('namefile')
parser.add_argument('--prefix_in', default='')
parser.add_argument('--prefix_out', default='')
parser.add_argument('--saysuccess', action='store_true')
o = parser.parse_args()

for p in dr_tools.splitlines(o.namefile):
	try:
		os.rename('%s%s.fastq.gz'%(o.prefix_in, p[0]), '%s%s.fastq.gz'%(o.prefix_out, p[1]))
	except:
		if not o.saysuccess:
			print 'fastq missing', p[0]
		pass
	else:
		if o.saysuccess:
			print 'renamed', p[0]
	
	try:
		os.rename('%s%s_expression.txt'%(o.prefix_in, p[0]), '%s%s_expression.txt'%(o.prefix_out, p[1]))
	except:
		if not o.saysuccess:
			print 'rpkms missing', p[0]
		pass
	else:
		if o.saysuccess:
			print 'renamed', p[0]
    opts.add_argument('-i1', '--inf1', required=True)  # e.g. ooref15...
    opts.add_argument('-F1', default=0.02, type=float)
    opts.add_argument('-i2', '--inf2')  # e.g. ooref13...
    opts.add_argument('-F2', type=float, default=0)
    opts.add_argument('-o', '--outf', default='/dev/stdout')
    opts.add_argument('--addminreads', type=int, default=0)
    opts.add_argument('--round',
                      choices=['0.5up', 'ceil', 'floor'],
                      default='ceil')
    opts.add_argument('--minreadsboth', type=int, default=0)
    args = opts.parse_args()

    expr1 = dr_tools.loadexpr([args.inf1], counts=True)
    if args.inf2 is not None:
        expr2 = dr_tools.loadexpr([args.inf2], counts=True)
    for i, p in enumerate(dr_tools.splitlines(args.inf1)):
        samples = p[1:]
        break

    gene_counts_out = defaultdict(list)

    for s1, s2 in zip(samples[::2], samples[1::2]):
        if s1.rsplit('_', 1)[0] != s2.rsplit('_', 1)[0]: raise Exception
        for gene_i, symbol in enumerate(expr1['symbols']):
            # remove a fraction F of the paternal chromosome's expression from the maternal chromosome's, and vice versa

            if expr1[s1][gene_i] + expr1[s2][gene_i] < args.minreadsboth:
                s1e = 0
                s2e = 0
            elif args.inf2 is None:
                s1e = subtract(expr1[s1][gene_i], args.F1, expr1[s2][gene_i],
if '__main__' == __name__:
	parser = argparse.ArgumentParser()
	parser.add_argument('-c', '--cellsums_files', metavar='cellsums', required=True, nargs='+')
	parser.add_argument('-v', '--snp_vcf', help='e.g. castsnp_alleles.txt')
	parser.add_argument('-o', '--outfile', default='/dev/null')
	parser.add_argument('-f', '--figure')
	parser.add_argument('-R', '--minratio', default=0, type=float)
	parser.add_argument('-rt', '--minreads_sum', default=1, type=int)
	parser.add_argument('-ra', '--minreads_allele', default=0, type=int)
	parser.add_argument('-V', '--snp_validatedbefore', help='e.g. validated_cast_c57_snps.txt')
	o = parser.parse_args()
	
	if o.snp_validatedbefore:
		allowed_coord = set()
		for p in dr_tools.splitlines(o.snp_validatedbefore, ignore='#'):
			chromosome = p[0]
			position = p[1]
			coord = '%s\t%s'%(chromosome, position)
			allowed_coord.add(coord)
	else: allowed_coord = None
	
	database_snps = dict()
	for p in dr_tools.splitlines(o.snp_vcf, ignore='#'):
		chromosome = 'chr'+p[0]
		position = p[1]
		c57base = p[3]
		castbase = p[4]
		if ',' in c57base or ',' in castbase: continue
		coord = '%s\t%s'%(chromosome, position)
		if allowed_coord is not None and coord not in allowed_coord: continue
    parser.add_argument('-m',
                        '--addition',
                        nargs=2,
                        action='append',
                        default=[])
    parser.add_argument('-l',
                        '--removal',
                        nargs=2,
                        action='append',
                        default=[])
    o = parser.parse_args()

    expra = dr_tools.loadexpr(o.allelehits, True)

    chrom_to_ai = defaultdict(list)
    for p in dr_tools.splitlines(o.annotationfile):
        chrom = p[2]
        sym = p[12]
        ID = p[1]
        chrom_to_ai[chrom].append(expra.ID_to_index[ID])

    for s_c57, s_cast in zip(expra.samples[::2], expra.samples[1::2]):
        sample = s_c57.rsplit('_', 1)[0]
        for chrom in chrom_to_ai:
            for samplepart, chromosome in o.removal:
                if chrom == chromosome and samplepart in sample:
                    continue
            for samplepart, chromosome in o.addition:
                if chrom == chromosome and samplepart in sample:
                    print sample, chrom
                    continue
Beispiel #25
0
	opts.add_argument('--allowedgenes')
	opts.add_argument('--disallowedgenes')
	opts.add_argument('--verticalborder', action='store_true')
	opts.add_argument('--stageline', action='store_true')
	opts.add_argument('--embryoline', action='store_true')
	opts.add_argument('--embryonotch', action='store_true')
	opts.add_argument('--mincoord', type=int)
	opts.add_argument('--maxcoord', type=int)
	opts.add_argument('--saygenes', action='store_true')
	args = opts.parse_args()
	
	# load expression data
	expr_alleles = dr_tools.loadexpr([args.rpkmf_alleles], counts=True)
	samples_alleles = sorted(e for e in expr_alleles if e not in ('IDs', 'symbols') and (args.filter is None or any(part in e for part in args.filter)))
	
	for p in dr_tools.splitlines(args.rpkmf_alleles):
		if p[0] == '#samples': samples = p[1:]; break
	samples_alleles = [e for e in samples if (args.filter is None or any(part in e for part in args.filter))]
	
	# sort the genes by position
	# only include transcripts which are the first ID in the entry of the rpkm file
	if 0:#args.allowedgenes is None and args.disallowedgenes in None:
		allowed_IDs = set(IDs.split('+')[0] for IDs in expr_alleles['IDs'])
	else:
		if args.allowedgenes:
			allowed_set = set(dr_tools.loadlist(args.allowedgenes))
		if args.disallowedgenes:
			disallowed_set = set(dr_tools.loadlist(args.disallowedgenes))
		allowed_IDs = set(IDs.split('+')[0] for IDs, symbols in zip(expr_alleles['IDs'],expr_alleles['symbols']) if (args.allowedgenes is None or any(identifier in allowed_set for identifier in (IDs.split('+') + symbols.split('+')))) and not (args.disallowedgenes is not None and any(identifier in disallowed_set for identifier in (IDs.split('+') + symbols.split('+')))))
	genes_per_chr = dict()
	ID_to_gene = dict()
Beispiel #26
0
                        required=True,
                        nargs='+')
    parser.add_argument('-v', '--snp_vcf', help='e.g. castsnp_alleles.txt')
    parser.add_argument('-o', '--outfile', default='/dev/null')
    parser.add_argument('-f', '--figure')
    parser.add_argument('-R', '--minratio', default=0, type=float)
    parser.add_argument('-rt', '--minreads_sum', default=1, type=int)
    parser.add_argument('-ra', '--minreads_allele', default=0, type=int)
    parser.add_argument('-V',
                        '--snp_validatedbefore',
                        help='e.g. validated_cast_c57_snps.txt')
    o = parser.parse_args()

    if o.snp_validatedbefore:
        allowed_coord = set()
        for p in dr_tools.splitlines(o.snp_validatedbefore, ignore='#'):
            chromosome = p[0]
            position = p[1]
            coord = '%s\t%s' % (chromosome, position)
            allowed_coord.add(coord)
    else:
        allowed_coord = None

    database_snps = dict()
    for p in dr_tools.splitlines(o.snp_vcf, ignore='#'):
        chromosome = 'chr' + p[0]
        position = p[1]
        c57base = p[3]
        castbase = p[4]
        if ',' in c57base or ',' in castbase: continue
        coord = '%s\t%s' % (chromosome, position)
Beispiel #27
0
import argparse, dr_tools

if '__main__' == __name__:
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'snp2genes',
        help=
        '/mnt/kauffman/danielr/crick/Xandclones_BR/snp-validation/snp2genes/ensembl__nooverlap_ra5val_3percent.txt'
    )
    parser.add_argument('-g', '--genelist')
    o = parser.parse_args()

    if o.genelist: allowedgenes = set(dr_tools.loadlist(o.genelist))

    num_snps = 0
    num_genes = 0
    for p in dr_tools.splitlines(o.snp2genes):
        if o.genelist and p[0] not in allowedgenes: continue
        num_snps += int(p[1])
        num_genes += 1
    print 'snps:', num_snps
    print 'genes:', num_genes
from scipy import stats

if '__main__' == __name__:
	# Collects, per sample of the size table, a size measure (x) and the ratio
	# of normalization reads to ERCC spike-in reads (y).
	parser = argparse.ArgumentParser()
	parser.add_argument('rpkmfile')
	parser.add_argument('diatable')
	parser.add_argument('--dim', type=float, default=3)
	o = parser.parse_args()
	
	expr = dr_tools.loadexpr(o.rpkmfile, True)
	# row indices of the entries whose ID contains 'ERCC'
	spikes_i = [i for i, ID in enumerate(expr['IDs']) if 'ERCC' in ID]
	
	xarr = []
	yarr = []
	
	for p in dr_tools.splitlines(o.diatable):
		if p[0] == '#sample':
			# header row: remember the columns holding length and width
			index_dia = [p.index('cytoplasm.length'), p.index('cytoplasm.width')]
		else:
			sample = p[0]
			ERCC_readsum = sum(expr[sample][spike] for spike in spikes_i)
			sample_i = expr.samples.index(sample)
			mRNA_readsum = expr.normalizationreads[sample_i]
			try:
				width = float(p[index_dia[1]])
				length = float(p[index_dia[0]])
			except ValueError:
				# skip samples whose width or length is not a number
				continue
			# (width*length)**(dim/2), i.e. area**1.5 with the default --dim 3
			xarr.append((width*length)**(o.dim/2))
			# NOTE(review): fails if a sample has no ERCC reads (ERCC_readsum == 0)
			yarr.append(mRNA_readsum/ERCC_readsum) # like D Edsgards' axis
	
if '__main__' == __name__:
	opts = argparse.ArgumentParser()
	opts.add_argument('-i1', '--inf1', required=True) # e.g. ooref15...
	opts.add_argument('-F1', default=0.02, type=float)
	opts.add_argument('-i2', '--inf2') # e.g. ooref13...
	opts.add_argument('-F2', type=float, default=0)
	opts.add_argument('-o', '--outf', default='/dev/stdout')
	opts.add_argument('--addminreads', type=int, default=0)
	opts.add_argument('--round', choices=['0.5up', 'ceil', 'floor'], default='ceil')
	opts.add_argument('--minreadsboth', type=int, default=0)
	args = opts.parse_args()
	
	expr1 = dr_tools.loadexpr([args.inf1], counts=True)
	if args.inf2 is not None: expr2 = dr_tools.loadexpr([args.inf2], counts=True)
	for i, p in enumerate(dr_tools.splitlines(args.inf1)):
		samples = p[1:]
		break
	
	gene_counts_out = defaultdict(list)
	
	for s1, s2 in zip(samples[::2], samples[1::2]):
		if s1.rsplit('_',1)[0] != s2.rsplit('_',1)[0]: raise Exception
		for gene_i, symbol in enumerate(expr1['symbols']):
			# remove a fraction F of the paternal chromosome's expression from the maternal chromosome's, and vice versa
			
			
			if expr1[s1][gene_i] + expr1[s2][gene_i] < args.minreadsboth:
				s1e = 0
				s2e = 0
			elif args.inf2 is None:
        action='store_true',
        help='if --nodivision_confint was used for the input file generation')
    parser.add_argument('--shorten_names', action='store_true')
    parser.add_argument('--random_seed', type=float)
    parser.add_argument('-c',
                        '--marker_colour',
                        action='append',
                        nargs=2,
                        default=[])
    o = parser.parse_args()

    if o.random_seed is not None: random.seed(o.random_seed)

    changesD = defaultdict(lambda: [Change(), Change()])
    for ii, infile in enumerate((o.infile1, o.infile2)):
        for p in dr_tools.splitlines(infile):
            name = p[2]
            c = changesD[name][ii]
            c.name = name.upper() if 'Fox' in name else name
            if o.shorten_names:
                c.name = c.name.split('_')[-1].rstrip('+')
            bottom_ch, top_ch = map(float, p[4].strip('()').split(', '))
            change = numpy.mean(
                [top_ch,
                 bottom_ch])  # relative value, i.e. start*(1+change) = end
            avg_abundance = float(p[5])  # = (start+end)/2
            if o.input_nodivision:
                c.startpoint = avg_abundance + change / 2
                c.endpoint = c.startpoint + change
                c.ends.append(top_ch + c.startpoint)
                c.ends.append(bottom_ch + c.startpoint)
Beispiel #31
0
from __future__ import division
import argparse, dr_tools, numpy, hcluster, random
import matplotlib.pyplot as pylab
import scipy.cluster.hierarchy as scipyhcluster

def Xi_activity_similarity(X, Y):
	"""Return a dissimilarity between two equally long 0/1 vectors.

	The value is the number of positions at which X and Y differ, divided by
	sum(X) + sum(Y).  Despite the function name, larger values mean the
	vectors are LESS alike (0 means identical).

	X, Y: sequences of 0/1 values; only the first min(len(X), len(Y))
	positions are compared because the comparison uses zip().

	Returns 0.0 when neither vector contains a 1: the vectors are then
	identical and the ratio would otherwise be the undefined 0/0.
	"""
	num_different = sum(x != y for x,y in zip(X,Y))
	possibly_different = sum(X)+sum(Y)
	if possibly_different == 0:
		# no active position in either vector; avoid dividing by zero
		return 0.0
	# NOTE(review): intended to correspond to the binary distance in the R
	# function dist, but positions that are 1 in both vectors are counted twice
	# in the denominator here, so the value is smaller than R's whenever the
	# vectors share a 1 -- confirm that this is wanted.
	# float() keeps this a true division even without "from __future__ import division"
	return float(num_different)/possibly_different

# numeric encoding of the allelic calls: 'XI' and 'bi' become 1, 'nd' and 'xa' become 0
stateD = {'XI':1, 'bi':1, 'nd':0, 'xa':0}

if '__main__' == __name__:
	
	# load table; the first line holds the sample labels (after the first column)
	linefeed = dr_tools.splitlines('chrX_clones_allelic_calls.txt')
	sample_labels = next(linefeed)[1:]
	character_matrixT = []
	for cells in linefeed:
		# values in cells are nd, XI, xa, bi, except first column which is gene symbol
		# the gene symbol has to be left out of the test: it is not 'nd', so
		# including it would let every row through, also rows without any call
		if any(c!='nd' for c in cells[1:]):
			character_matrixT.append([stateD[c] for c in cells[1:]])
	
	# make clusters
	# rows of character_matrixT are genes; transpose so that each row is one sample
	character_matrix = numpy.array(character_matrixT).transpose()
	#hcdists = hcluster.pdist(character_matrix, metric='cityblock')
	hcdists = hcluster.pdist(character_matrix, metric=Xi_activity_similarity)
	hclinks = hcluster.linkage(hcdists, method='complete')
	draw_order = hcluster.leaves_list(hclinks)
	
	# draw tree
Beispiel #32
0
    opts.add_argument('--alg2', action='store_true')
    o = opts.parse_args()

    expr = dr_tools.loadexpr([o.inf], counts=True)
    exprt = dr_tools.loadexpr([o.rpkmf_total], counts=False)

    allowed_gene_i = gene_i_by_listf(o.genelistf_include,
                                     expr) if o.genelistf_include else None
    excluded_gene_i = gene_i_by_listf(o.genelistf_exclude,
                                      expr) if o.genelistf_exclude else None

    def rpkm(Ai, sample):
        Ti = exprt.ID_to_index[expr['IDs'][Ai]]
        return exprt[sample][Ti]

    for p in dr_tools.splitlines(o.inf):
        if p[0] == '#samples':
            samples = p[1:]
            break

    fractions = []  # maternal only + paternal only
    mfractions = []  # maternal only
    fractions_all3 = []  # maternal+parternal+biallelic
    labels = []

    for s1, s2 in zip(samples[::2], samples[1::2]):
        if o.filter is not None and not any(part in s1.rsplit('_', 1)[0]
                                            for part in o.filter):
            continue
        if s1.rsplit('_', 1)[0] != s2.rsplit(
                '_', 1)[0] or not 'c57' in s1 or 'c57' in s2:
    with open(csv_path, 'rU') as infh:
        for line in infh:
            count += 1
    return count - 1


if '__main__' == __name__:
    # Attach cell counts taken from csv files to the samples listed in a table.
    parser = argparse.ArgumentParser()
    parser.add_argument('-a', '--allcells_csv', nargs='+', required=True)
    parser.add_argument('-b', '--Bcells_csv', nargs='+', required=True)
    parser.add_argument('-c', '--cellnumbertable', required=True)
    parser.add_argument('--vocal', action='store_true')
    o = parser.parse_args()

    # one Sample per table line: 1-based line number (as a string), first
    # column, and the second column as an int (None when that column is empty)
    samples = []
    for li, p in enumerate(dr_tools.splitlines(o.cellnumbertable)):
        samples.append(Sample(str(li + 1), p[0], int(p[1]) if p[1] else None))

    for csv_path in o.allcells_csv:
        # file name without directory, cut at the first '_1.' and at '.txt'
        name = csv_path.split('/')[-1].split('_1.')[0].split('.txt')[0]
        # NOTE(review): whether "name in s.names" is a substring test or a
        # membership test depends on the type of Sample.names -- confirm
        matching_samples = [s for s in samples if name in s.names]
        # an ambiguous match is treated as a fatal error
        if len(matching_samples) > 1: raise Exception
        if len(matching_samples) == 0:
            # no sample for this file: skip it, reporting the name only with --vocal
            if o.vocal:
                print name, 'A'
            continue
        matching_samples[0].cells_cytof_all = count_cells(csv_path)

    for csv_path in o.Bcells_csv:
        # here the name is everything before the first '.' of the file name
        name = csv_path.split('/')[-1].split('.')[0]
        matching_samples = [s for s in samples if name in s.names]
		elif c57 or cast: count_mono += 1
	return count_mono/(count_bi + count_mono)

if '__main__' == __name__:
	# Correlate a size measure computed from the cytoplasm length and width
	# with the value MAfraction() returns for the same sample.
	parser = argparse.ArgumentParser()
	parser.add_argument('allelehits')
	parser.add_argument('diatable')
	# the default is written as a float: argparse applies type= only to string
	# defaults, so an int 3 would stay an int and 3/2 would truncate to 1
	# wherever integer division is in effect
	parser.add_argument('--dim', type=float, default=3.0)
	o = parser.parse_args()
	
	expra = dr_tools.loadexpr(o.allelehits, True)
	
	xarr = []
	yarr = []
	
	for p in dr_tools.splitlines(o.diatable):
		if p[0] == '#sample':
			# header line: remember the columns holding length and width;
			# it must come before the first data line, index_dia is undefined otherwise
			index_dia = [p.index('cytoplasm.length'), p.index('cytoplasm.width')]
		else:
			sample = p[0]
			
			try:
				width = float(p[index_dia[1]])
				length = float(p[index_dia[0]])
			except ValueError:
				# measurement missing or not numeric: leave this sample out
				continue
			# (width*length) raised to dim/2; 2.0 forces a true division
			xarr.append((width*length)**(o.dim/2.0))
			yarr.append(MAfraction(expra, sample))
	
	# single-argument print(...) gives the same output as the print statement
	print(stats.pearsonr(xarr, yarr))
	
Beispiel #35
0
import argparse, dr_tools
from collections import defaultdict
from itertools import chain

if '__main__' == __name__:
	parser = argparse.ArgumentParser()
	parser.add_argument('tcr_summary_file', metavar='tcr_summary.txt')
	parser.add_argument('output_prefix')
	parser.add_argument('--numbering_start', type=int, default=1)
	o = parser.parse_args()
	num = o.numbering_start

	clones_by_TCR_seq = defaultdict(lambda: defaultdict(list))
	for li, p in enumerate(dr_tools.splitlines(o.tcr_summary_file)):
		if li == 0:
			VDJ_i = []
			CDR3_i = []
			for column_header in ('CDR3 amino acid sequence',):
				CDR3_i.append(p.index(column_header)+1)
				CDR3_i.append(p.index(column_header)+1+len(p))
			for column_header in ('V segments', 'J segments'):
				VDJ_i.append(p.index(column_header)+1)
			for column_header in ('V segments', 'D segments', 'J segments', 'VD insertions', 'DJ insertions'):
				VDJ_i.append(p.index(column_header)+1+len(p))
		else:
			sample_name = p[0]
			if not p[1]: p = [p[0]]+p[2:]
			source_person = sample_name.split('_')[0]
			try: TCR_features = tuple(p[i] for i in chain(CDR3_i, VDJ_i))
			except IndexError:
				# discard, it is missing info
	opts.add_argument('--allowedgenes')
	opts.add_argument('--disallowedgenes')
	opts.add_argument('--verticalborder', action='store_true')
	opts.add_argument('--stageline', action='store_true')
	opts.add_argument('--embryoline', action='store_true')
	opts.add_argument('--embryonotch', action='store_true')
	opts.add_argument('--mincoord', type=int)
	opts.add_argument('--maxcoord', type=int)
	opts.add_argument('--saygenes', action='store_true')
	args = opts.parse_args()
	
	# load expression data
	expr_alleles = dr_tools.loadexpr([args.rpkmf_alleles], counts=True)
	samples_alleles = sorted(e for e in expr_alleles if e not in ('IDs', 'symbols') and (args.filter is None or any(part in e for part in args.filter)))
	
	for p in dr_tools.splitlines(args.rpkmf_alleles):
		if p[0] == '#samples': samples = p[1:]; break
	samples_alleles = [e for e in samples if (args.filter is None or any(part in e for part in args.filter))]
	
	# sort the genes by position
	# only include transcripts which are the first ID in the entry of the rpkm file
	if args.allowedgenes is None and args.disallowedgenes is None:
		allowed_IDs = set(IDs.split('+')[0] for IDs in expr_alleles['IDs'])
	else:
		if args.allowedgenes:
			allowed_set = set(dr_tools.loadlist(args.allowedgenes))
		if args.disallowedgenes:
			disallowed_set = set(dr_tools.loadlist(args.disallowedgenes))
		allowed_IDs = set(IDs.split('+')[0] for IDs, symbols in zip(expr_alleles['IDs'],expr_alleles['symbols']) if (args.allowedgenes is None or any(identifier in allowed_set for identifier in (IDs.split('+') + symbols.split('+')))) and not (args.disallowedgenes is not None and any(identifier in disallowed_set for identifier in (IDs.split('+') + symbols.split('+')))))
	genes_per_chr = dict()
	ID_to_gene = dict()
Beispiel #37
0
    parser.add_argument('-g', '--genelist_in')
    parser.add_argument('-q', '--maxq', default=0.05, type=float)
    parser.add_argument('--top', action='store_true')
    parser.add_argument('--bottom', action='store_true')
    parser.add_argument('--oneID', action='store_true')
    o = parser.parse_args()
    if not o.top and not o.bottom:
        o.top = True
        o.bottom = True

    at_list_top = True
    if o.genelist_in: allowedgenes = set(dr_tools.loadlist(o.genelist_in))
    last_q = 0
    genes_top = []
    genes_bottom = []
    for li, p in enumerate(dr_tools.splitlines(o.infile)):
        if li == 0:
            q_i = p.index('FDR')
            sym_i = 0
            if o.oneID: ID_i = p.index('IDs')
        else:
            qval = float(p[q_i])
            if at_list_top and qval < last_q: at_list_top = False
            sym = p[sym_i]
            sym_out = p[ID_i].split('+')[0] if o.oneID else sym
            last_q = qval
            if o.genelist_in and sym not in allowedgenes: continue
            if qval > o.maxq: continue
            if at_list_top:
                genes_top.append(sym_out)
            else:
	parser.add_argument('-g', '--genelist_in')
	parser.add_argument('-q', '--maxq', default=0.05, type=float)
	parser.add_argument('--top', action='store_true')
	parser.add_argument('--bottom', action='store_true')
	parser.add_argument('--oneID', action='store_true')
	o = parser.parse_args()
	if not o.top and not o.bottom:
		o.top = True
		o.bottom = True

	at_list_top = True
	if o.genelist_in: allowedgenes = set(dr_tools.loadlist(o.genelist_in))
	last_q = 0
	genes_top = []
	genes_bottom = []
	for li, p in enumerate(dr_tools.splitlines(o.infile)):
		if li == 0:
			q_i = p.index('FDR')
			sym_i = 0
			if o.oneID: ID_i = p.index('IDs')
		else:
			qval = float(p[q_i])
			if at_list_top and qval < last_q: at_list_top = False
			sym = p[sym_i]
			sym_out = p[ID_i].split('+')[0] if o.oneID else sym
			last_q = qval
			if o.genelist_in and sym not in allowedgenes: continue
			if qval > o.maxq: continue
			if at_list_top:
				genes_top.append(sym_out)
			else:
import argparse, dr_tools, numpy
from collections import defaultdict

if '__main__' == __name__:
	parser = argparse.ArgumentParser()
	parser.add_argument('sample_and_chromosome_list')
	parser.add_argument('output_file')
	parser.add_argument('-A', '--annotationfile')
	parser.add_argument('-a', '--allelehits')
	o = parser.parse_args()
	
	exprr = dr_tools.loadexpr(o.allelehits, False)
	expra = dr_tools.loadexpr(o.allelehits, True)
	
	chrom_to_IDs = defaultdict(set)
	for p in dr_tools.splitlines(o.annotationfile):
		chrom = p[2]
		sym = p[12]
		ID = p[1]
		chrom_to_IDs[chrom].add(ID)
	
	samples_set = set(expra.samples)
	
	with open(o.sample_and_chromosome_list) as infh:
		for line in infh:
			p = line.split()
			chrom = p[1]
			s_c57 = p[0]+'_c57only'
			s_cast = p[0]+'_castonly'
			if p[0] not in samples_set: continue
			for ai, ID in enumerate(expra['IDs']):
			else:
				filter_pos, filter_neg = tuple(), tuple()
				protline = given_combo
			proteins_pos, proteins_neg =  split_pos_neg(protline)
			combos.append(tuple(map(tuple, (filter_pos, filter_neg, proteins_pos, proteins_neg))))
			levels_per_combo[combos[-1]] = defaultdict(list)
	if o.num_pos + o.num_neg < 1 and not combos: raise Exception
	req_proteins = set(o.req_proteins)
	filter_req_proteins = set(o.filter_req_proteins)
	
	# select samples
	samples_csv = o.csv_in
	if o.use_from_list is not None:
		samples_to_use = set()
		samples_possible = [f for f in os.listdir('.') if f.endswith('.csv') or f.endswith('.csv.gz')]
		for p in dr_tools.splitlines(o.csv_in[0]):
			if p[1] == o.use_from_list:
				samples_to_use.update([s for s in samples_possible if s == p[0] or s.startswith(p[0]+'_')])
		samples_csv = list(samples_to_use)
		o.csv_in = o.csv_in[1:]
	if len(o.csv_in) == 1 and o.csv_in[0].endswith('.txt'):
		with openfile(o.csv_in[0], 'rU') as infh:
			if o.use_from_list is not None:
				samples_csv = [f.strip() for f in infh.read().split() if f.strip() in samples_to_use]
			else:
				samples_csv = [f.strip() for f in infh.read().split()]
	
	CD3double = False
	if not combos:
		if o.proteins:
			row_keys = set(o.proteins)
Beispiel #41
0
import dr_tools, os, argparse

# Rename per-sample files according to a table: for every line of namefile,
# <prefix_in><column 1><suffix> is renamed to <prefix_out><column 2><suffix>
# for each of the two file kinds listed in the loop below.
parser = argparse.ArgumentParser()
parser.add_argument('namefile')
parser.add_argument('--prefix_in', default='')
parser.add_argument('--prefix_out', default='')
# with --saysuccess the successful renames are reported instead of the failures
parser.add_argument('--saysuccess', action='store_true')
o = parser.parse_args()

for p in dr_tools.splitlines(o.namefile):
    # (file name suffix, label used in the failure message)
    for suffix, label in (('.fastq.gz', 'fastq'), ('_expression.txt', 'rpkms')):
        try:
            os.rename('%s%s%s' % (o.prefix_in, p[0], suffix),
                      '%s%s%s' % (o.prefix_out, p[1], suffix))
        except (OSError, IndexError):
            # OSError: the rename itself failed (typically the file is absent);
            # IndexError: the line has no second column.  Both are reported and
            # skipped as before, but KeyboardInterrupt and SystemExit are no
            # longer swallowed the way the former bare "except:" did.
            if not o.saysuccess:
                print('%s missing %s' % (label, p[0]))
        else:
            if o.saysuccess:
                print('renamed %s' % p[0])