def loadfile(infile):
    """Map each sample name found in `infile` to its read count.

    Every row yielded by dr_tools.splitlines(infile) supplies a path-like
    label in its first field and a number in its second.  The sample name is
    the last '/'-separated component of the label, cut at the first
    occurrence of '_refseq.txt:#'.  The number is converted to float.  If a
    sample name occurs more than once, the last row wins.
    """
    return dict(
        (row[0].split('/')[-1].split('_refseq.txt:#')[0], float(row[1]))
        for row in dr_tools.splitlines(infile))
def make_snp2gene_file(genepred, snptable, outfile_mpileup, outfile_snp2genes, include_overlap=False):
    """Assign SNPs to genes and write a per-gene SNP list plus a position list.

    genepred          -- annotation file; lines starting with '#' are skipped,
                         the rest are parsed with fromannotationline()
    snptable          -- SNP table read with dr_tools.splitlines(); only rows
                         whose field 11 equals 'single' are used
    outfile_mpileup   -- output: one 'chromosome<TAB>field3' line per kept SNP
    outfile_snp2genes -- output: one line per gene, built with dr_tools.join()
                         from the gene name, its SNP count and its sorted
                         ';'-joined SNPs (each 'chromosome:field3|field9')
    include_overlap   -- if false, only SNPs overlapping exactly one gene are
                         kept; if true, the gene-count restriction is lifted
    """
    # register every exon as a region tagged with its gene name, so that
    # Cregion.overlappingpoint() can be queried below
    with open(genepred) as infh:
        for line in infh:
            if line.startswith('#'): continue
            chromosome, strand, cdsstart, cdsend, exonstarts, exonends, genename, ID, inferred_strand = fromannotationline(line)
            for start, end in zip(exonstarts, exonends):
                exon = dr_tools.Cregion(chromosome, start, end)
                exon.gene = genename
                exon.addtowindows()
    snps_per_gene = defaultdict(list)
    snp_positions = []
    for p in dr_tools.splitlines(snptable):
        # e.g. 585 chr1 10019 10020 rs376643643 0 + A A -/A genomic deletion unknown 0 0 near-gene-5 exact 1 1 SSMP, 0
        if p[11] != 'single': continue # ignore non-SNPs
        chromosome = p[1]
        position = int(p[2]) # 0-based
        # the lookup uses field 2, while the outputs below report field 3
        genes = set(exon.gene for exon in dr_tools.Cregion.overlappingpoint(chromosome, position))
        if include_overlap or len(genes) == 1:
            for gene in genes:
                snps_per_gene[gene].append('%s:%s|%s'%(p[1], p[3], p[9]))
            # NOTE(review): this layout was reconstructed from a
            # whitespace-collapsed source; confirm that the append below sits
            # after the gene loop rather than inside it.  As laid out here,
            # with include_overlap a SNP overlapping no gene is still listed.
            snp_positions.append('%s\t%s'%(p[1], p[3]))
    with open(outfile_snp2genes, 'w') as outfh:
        for gene, snps in snps_per_gene.items():
            print >>outfh, dr_tools.join(gene, len(snps), ';'.join(sorted(snps)))
    with open(outfile_mpileup, 'w') as outfh:
        for snpline in snp_positions:
            print >>outfh, snpline
def calc_ERCC_moleculenumber(tablefile, before_dilution_vol_ul):
    """Return the number of spike-in molecules contained in a given volume.

    The concentrations in column index 3 of `tablefile` are summed over all
    rows after the header, then multiplied by `before_dilution_vol_ul` and by
    the number of molecules per attomole.

    Raises ValueError (a subclass of the bare Exception raised previously) if
    the header of that column does not contain 'attomoles/ul'.
    """
    Mix1_i = 3
    # Avogadro's number (6.0221412927e23 per mole) scaled to one attomole
    molecules_per_attomol = 602214.12927
    conc_attomolul = 0
    for i, p in enumerate(dr_tools.splitlines(tablefile)):
        if i == 0:
            # guard against reading the wrong column or the wrong unit
            if 'attomoles/ul' not in p[Mix1_i]:
                raise ValueError(
                    "column %d of %s is not labelled 'attomoles/ul': %r"
                    % (Mix1_i, tablefile, p[Mix1_i]))
        else:
            conc_attomolul += float(p[Mix1_i])
    return conc_attomolul * before_dilution_vol_ul * molecules_per_attomol
def load_geneset(ID_to_symbol, filename):
    """Load groups of genes and index every group by each of its symbols.

    Each row of `filename` (fields separated by ';') is one group.  A field
    is kept if it is one of the symbols in ID_to_symbol.values().  A field
    that is a key of ID_to_symbol, and is not itself among the kept symbols,
    is translated to its symbol.  All other fields are dropped.

    Returns a dict mapping every resulting symbol to the frozenset of all
    symbols of its group.  If a symbol occurs in several rows, the last row
    wins.
    """
    allowed_symbols = set(ID_to_symbol.values())
    entries = dict()
    for genes in dr_tools.splitlines(filename, sep=';'):
        genes_sym = set(gene for gene in genes if gene in allowed_symbols)
        # the right-hand side is fully evaluated before genes_sym is updated
        genes_sym |= set(ID_to_symbol[gene] for gene in genes
                         if gene in ID_to_symbol and gene not in genes_sym)
        group = frozenset(genes_sym)  # built once and shared by all members
        for sym in group:
            entries[sym] = group
    return entries
def table_loader():
    """Yield (row, sample, sample_index, cellsource) for rows of o.nondiatable.

    Reads the module-level `o` (options) and `expr` (expression data).  The
    '#sample' header row locates the 'cell.type' column and must come before
    any data row.  Other rows starting with '#' are treated as comments.  The
    degraded sample 'BQx46_indD_EmbryoMEF_BxC' is skipped.  Samples absent
    from expr.samples are reported as 'missing' and skipped.  The cell
    sources 'mef' and 'MAF' are reported as 'fibroblast'.
    """
    for p in dr_tools.splitlines(o.nondiatable):
        if p[0] == '#sample':
            index_cellsource = p.index('cell.type')
            continue
        if p[0].startswith('#'):
            continue
        sample = p[0]
        cellsource = p[index_cellsource]
        if sample == 'BQx46_indD_EmbryoMEF_BxC':
            continue  # degraded sample
        # Only the lookup is guarded.  A sequence's index() raises ValueError
        # for an absent value, which the previous `except KeyError` did not
        # catch; KeyError is kept in case expr.samples is a custom container.
        try:
            sample_i = expr.samples.index(sample)
        except (KeyError, ValueError):
            print('missing %s' % (sample,))
            continue
        if cellsource in ('mef', 'MAF'):
            cellsource = 'fibroblast'
        yield p, sample, sample_i, cellsource
def table_loader():
    """Yield (row, sample, sample_index, cellsource) for rows of o.nondiatable.

    Reads the module-level `o` (options) and `expr` (expression data).  The
    '#sample' header row locates the 'cell.type' column and must come before
    any data row.  Other rows starting with '#' are ignored.  The degraded
    sample 'BQx46_indD_EmbryoMEF_BxC' is skipped.  Samples absent from
    expr.samples are reported as 'missing' and skipped.  The cell sources
    'mef' and 'MAF' are reported as 'fibroblast'.
    """
    for p in dr_tools.splitlines(o.nondiatable):
        if p[0] == '#sample':
            index_cellsource = p.index('cell.type')
            continue
        if p[0].startswith('#'):
            continue  # comment row
        sample = p[0]
        cellsource = p[index_cellsource]
        if sample == 'BQx46_indD_EmbryoMEF_BxC':
            continue  # degraded sample
        # Only the lookup is guarded.  A sequence's index() raises ValueError
        # for an absent value, which the previous `except KeyError` did not
        # catch; KeyError is kept in case expr.samples is a custom container.
        try:
            sample_i = expr.samples.index(sample)
        except (KeyError, ValueError):
            print('missing %s' % (sample,))
            continue
        if cellsource in ('mef', 'MAF'):
            cellsource = 'fibroblast'
        yield p, sample, sample_i, cellsource
samples_alleles = sorted(e for e in expr_alleles if e not in ('IDs', 'symbols') and (args.filter is None or any(part in e for part in args.filter)) and not any(part in e for part in args.exclude)) allowed_IDs = set(IDs.split('+')[0] for IDs in expr_alleles['IDs']) fractions_to_show = list() vals_real = list() vals_ctrl = list() labels = list() bootstrap_output = list() # sort the genes by posiotion # only include transcripts which are the first ID in the entry of the rpkm file for mindist,maxdist in zip(args.bindist[:-1], args.bindist[1:]): genes_per_chr = dict() ID_to_gene = dict() for p in dr_tools.splitlines(args.genePred): ID = p[1] if ID in allowed_IDs: chromosome = p[2] if chromosome in ('chrX', 'chrY'): continue if not chromosome in genes_per_chr: genes_per_chr[chromosome] = [] genes_per_chr[chromosome].append(Gene(ID, int(p[4]) if p[3]=='+' else int(p[5]), p[3])) for chromosome in genes_per_chr: genes_per_chr[chromosome].sort(key=lambda gene: gene.TSS) non_chr_genes = [gene for chr_key in genes_per_chr if chr_key != chromosome for gene in genes_per_chr[chromosome]] for gene_i, gene in enumerate(genes_per_chr[chromosome]): ID_to_gene[gene.ID] = gene # only include neighbours in the 'forward' direction, to avoid dependence in stat tests later on gene.neighbours = genes_per_chr[chromosome][gene_i+1:args.numadjacent+gene_i+1] gene.neighbours = [other for other in gene.neighbours if mindist <= abs(other.TSS-gene.TSS) < maxdist] if args.onlydiffstrand:
def get_loadings(filepath):
    """Read a two-column loadings file; return (loadings dict, short name).

    Every row of `filepath` must hold exactly two fields: a key and a value
    that is converted to float.  The short name is taken from the file name
    (the part of `filepath` after the last '/'): everything after the last
    '_loadings_', up to the first '.'.
    """
    loadingsdict = {}
    for key, val in dr_tools.splitlines(filepath):
        loadingsdict[key] = float(val)
    basename = filepath.split('/')[-1]
    after_tag = basename.rsplit('_loadings_', 1)[-1]
    name = after_tag.split('.')[0]
    return loadingsdict, name
set2 = set(entries2) set1_unique_c = len(set(entries1[sym] for sym in (set1-set2))) set2_unique_c = len(set(entries2[sym] for sym in (set2-set1))) common_c = len(set(entries1[sym] for sym in (set2&set1))) common_c2 = len(set(entries2[sym] for sym in (set2&set1))) if not common_c == common_c2: raise Exception saygenes = [] for genes in set(entries2[sym] for sym in (set2&set1)): saygenes.append(';'.join(list(genes))) return set1_unique_c, common_c, set2_unique_c, ', '.join(saygenes) if '__main__' == __name__: parser = argparse.ArgumentParser() parser.add_argument('-A', '--annotationfile', default='/mnt/crick/danielr/Xandclones_BR/BR_fibroblasts/snp-call/more_formats/mm9_ensembl_refseq_norandom_11Apr2012_genesymbols.txt') parser.add_argument('-a', '--set1', required=True) parser.add_argument('-b', '--set2', required=True) parser.add_argument('-ge', '--disallowedgenes', nargs='+') o = parser.parse_args() if o.disallowedgenes: disallowedgenes = set() for filename in o.disallowedgenes: disallowedgenes.update(set(dr_tools.loadlist(filename))) else: disallowedgenes = None ID_to_symbol = dict((p[1], p[12]) for p in dr_tools.splitlines(o.annotationfile) if disallowedgenes is None or p[12] not in disallowedgenes) print dr_tools.join(overlap_of_2(load_geneset(ID_to_symbol, o.set1), load_geneset(ID_to_symbol, o.set2))) print len(set(ID_to_symbol.values()))
opts.add_argument('-o', '--figurefile', default='monoallelic_by_chr.pdf') args = opts.parse_args() # load expression data expr_alleles = dr_tools.loadexpr([args.rpkmf_alleles], counts=True) samples_alleles = sorted( e for e in expr_alleles if e not in ('IDs', 'symbols') and (args.filter is None or any( part in e for part in args.filter))) # sort the genes by position # only include transcripts which are the first ID in the entry of the rpkm file allowed_IDs = set(IDs.split('+')[0] for IDs in expr_alleles['IDs']) genes_per_chr = dict() ID_to_gene = dict() for p in dr_tools.splitlines(args.genePred): ID = p[1] if ID in allowed_IDs: chromosome = p[2] if 'random' in chromosome: continue if not chromosome in genes_per_chr: genes_per_chr[chromosome] = [] genes_per_chr[chromosome].append( Gene(ID, int(p[4]) if p[3] == '+' else int(p[5]))) for chromosome in genes_per_chr: genes_per_chr[chromosome].sort(key=lambda gene: gene.TSS) for gene_i, gene in enumerate(genes_per_chr[chromosome]): ID_to_gene[gene.ID] = gene samples = []
default=97) # 97 for both round1 and round2 parser.add_argument('--max_dm', type=float) parser.add_argument('--not_clusterabs', action='store_false', dest='clusterabs') parser.add_argument('--reverse_order', action='store_true') o = parser.parse_args() o.network = False proteins = set() correlations_together = defaultdict(lambda: defaultdict(list)) correlations_opposing = defaultdict(lambda: defaultdict(list)) qvalues_together = defaultdict(lambda: defaultdict(list)) qvalues_opposing = defaultdict(lambda: defaultdict(list)) for file_in in o.file_in: for p in dr_tools.splitlines(file_in): if p[0].startswith('(('): continue peakpoint = p[3] r = float(p[4]) protline = p[2] if o.coexpression: r = float(p[5]) if protline.startswith('req'): filter_pos, filter_neg = split_pos_neg( protline.split('_')[0][3:]) proteins_pos, proteins_neg = split_pos_neg( protline.split('_')[1]) else: proteins_pos, proteins_neg = split_pos_neg(protline)
# Filters a per-gene SNP list down to the SNPs whose 'chromosome:position'
# key passes a threshold in a SNP statistics file, and writes the result in
# the same per-gene format.
file1 = '/mnt/crick/rickards/projects/hsa_snp_calling/snp_stats_ac2.txt'
file2 = '/mnt/kauffman/danielr/Xandclones_late2014/Tcell/male_P1299_YFV2001_newsnpcall/SNP_list/SNPs_per_gene.txt' # created by make_allelecalls.py -P using the -a and -s arguments
output = '/mnt/kauffman/danielr/Xandclones_late2014/Tcell/male_P1299_YFV2001_newsnpcall/SNP_list/heterozygous_SNPs_per_gene.txt'

import dr_tools

# allowed SNP list: rows of file1 whose second-last column is below 0.9
positions = set()
for p in dr_tools.splitlines(file1):
    if float(p[-2]) < 0.9:
        positions.add('%s:%s' % (p[0], p[1]))
print(len(positions))

c = 0  # total number of SNPs written
# the with-block closes the output file even if a row fails to parse
with open(output, 'w') as outfh:
    for p in dr_tools.splitlines(file2):  # one row per gene
        # keep the SNPs whose part before '|' is on the allowed list
        snps = [snpinfo for snpinfo in p[2].split(';')
                if snpinfo.split('|')[0] in positions]
        c += len(snps)
        # a gene is written even when none of its SNPs were kept
        outfh.write('%s\n' % (dr_tools.join(p[0], len(snps), ';'.join(snps)),))
print(c)
help='same output format as validated_mm9_refseq_snp2genes.txt') parser.add_argument('--include_overlap', action='store_true') o = parser.parse_args() with open(o.genepred) as infh: for line in infh: chromosome, strand, cdsstart, cdsend, exonstarts, exonends, genename, ID, inferred_strand = fromannotationline( line) for start, end in zip(exonstarts, exonends): exon = dr_tools.Cregion(chromosome, start, end) exon.gene = genename exon.addtowindows() snps_per_gene = defaultdict(list) for p in dr_tools.splitlines(o.snplist): # e.g. chr11 117883408 C A 0 1.00 -1.00 0.90 0.10 0.71 0.29 if ',' in p[2] or ',' in p[3]: continue # added 18 Dec, since snp_stats2.py -S removes these SNPs anyway chromosome = p[0] position = int(p[1]) - 1 genes = set(exon.gene for exon in dr_tools.Cregion.overlappingpoint( chromosome, position)) if o.include_overlap: for gene in genes: snps_per_gene[gene].append('%s:%s' % (p[0], p[1])) else: if len(genes ) == 1: # don't allow overlapping genes, exclude those SNPs
proteins_neg = [p.strip('*') for p in o.proteins_neg] random.seed(0) if o.pdf is None: o.pdf = 'expression_' + ''.join([p + '+' for p in proteins_pos]) + ''.join( [p + '-' for p in proteins_neg]) + '.pdf' elif o.title is None: o.title = ''.join([p + '+' for p in proteins_pos]) + ''.join( [p + '-' for p in proteins_neg]) samples_csv = o.csv_in if o.use_from_list is not None: samples_to_use = set() samples_possible = [f for f in os.listdir('.') if f.endswith('.csv')] for p in dr_tools.splitlines(o.csv_in[0]): if p[1] == o.use_from_list: samples_to_use.update([ s for s in samples_possible if s == p[0] or s.startswith(p[0] + '_') ]) samples_csv = list(samples_to_use) o.csv_in = o.csv_in[1:] if len(o.csv_in) == 1 and o.csv_in[0].endswith('.txt'): with open(o.csv_in[0], 'rU') as infh: samples_csv = [f.strip() for f in infh.read().split()] levels = defaultdict(list) for csv_in in samples_csv: with open(csv_in, 'rb') as infh: sample = csv_in.rsplit('/', 1)[-1].split('.csv')[0]
opts = argparse.ArgumentParser() opts.add_argument('inf') opts.add_argument('--filter', nargs='+') opts.add_argument('-f', '--figf', default='plot_monoallelic_by_cell.pdf') opts.add_argument('-gi', '--genelistf_include', nargs='+') opts.add_argument('-ge', '--genelistf_exclude', nargs='+') o = opts.parse_args() expr = dr_tools.loadexpr([o.inf], counts=True) #samples = sorted([e for e in expr if e not in ('IDs', 'symbols')]) allowed_gene_i = gene_i_by_listf(o.genelistf_include, expr) if o.genelistf_include else None excluded_gene_i = gene_i_by_listf(o.genelistf_exclude, expr) if o.genelistf_exclude else None for p in dr_tools.splitlines(o.inf): if p[0] == '#samples': samples = p[1:]; break fractions = [] # maternal only + paternal only mfractions = [] # maternal only fractions_all3 = [] # maternal+parternal+biallelic labels = [] for s1, s2 in zip(samples[::2], samples[1::2]): if o.filter is not None and not any(part in s1.rsplit('_',1)[0] for part in o.filter): continue if s1.rsplit('_',1)[0] != s2.rsplit('_',1)[0] or not 'c57' in s1 or 'c57' in s2: print 'Error in pair:', s1, s2 continue # check for errors in input file format Z = zip(expr[s1], expr[s2]) if o.genelistf_include or o.genelistf_exclude: Z = [E for i,E in enumerate(Z) if (allowed_gene_i is None or i in allowed_gene_i) and (excluded_gene_i is None or i not in excluded_gene_i)] # only those includes in the gene list
parser.add_argument('-i', '--chrom_include', nargs='+', help="'rest' matches all not in -i or -e, 'random' matches e.g. chr1_random", required=True) parser.add_argument('-e', '--chrom_exclude', nargs='+', help="'rest' matches all not in -i or -e, 'random' matches e.g. chr1_random", default=[]) o = parser.parse_args() all_chromosomes = set(dr_tools.loadlist(o.genePred, 2)) include = set(c for c in all_chromosomes if c in o.chrom_include) exclude = set(c for c in all_chromosomes if c in o.chrom_exclude) if 'random' in o.chrom_include: include.update(set(c for c in all_chromosomes if '_random' in c and c not in exclude)) if 'random' in o.chrom_exclude: exclude.update(set(c for c in all_chromosomes if '_random' in c and c not in include)) if 'rest' in o.chrom_include: include.update(all_chromosomes-exclude) if 'rest' in o.chrom_exclude: exclude.update(all_chromosomes-include) genes_incl = set() genes_excl = set() for p in dr_tools.splitlines(o.genePred): symbol = p[12] chromosome = p[2] if chromosome in include: genes_incl.add(symbol) elif chromosome in exclude: genes_excl.add(symbol) with open(o.genelist_out, 'w') as outfh: for gene in genes_incl-genes_excl: print >>outfh, gene
# Filters a per-gene SNP list down to the SNPs whose 'chromosome:position'
# key passes a threshold in a SNP statistics file, and writes the result in
# the same per-gene format.
file1 = '/mnt/crick/rickards/projects/hsa_snp_calling/snp_stats_ac2.txt'
file2 = '/mnt/kauffman/danielr/Xandclones_late2014/Tcell/male_P1299_YFV2001_newsnpcall/SNP_list/SNPs_per_gene.txt'  # created by make_allelecalls.py -P using the -a and -s arguments
output = '/mnt/kauffman/danielr/Xandclones_late2014/Tcell/male_P1299_YFV2001_newsnpcall/SNP_list/heterozygous_SNPs_per_gene.txt'

import dr_tools

# allowed SNP list: rows of file1 whose second-last column is below 0.9
positions = set()
for p in dr_tools.splitlines(file1):
    if float(p[-2]) < 0.9:
        positions.add('%s:%s' % (p[0], p[1]))
print(len(positions))

c = 0  # total number of SNPs written
# the with-block closes the output file even if a row fails to parse
with open(output, 'w') as outfh:
    for p in dr_tools.splitlines(file2):  # one row per gene
        # keep the SNPs whose part before '|' is on the allowed list
        snps = [snpinfo for snpinfo in p[2].split(';')
                if snpinfo.split('|')[0] in positions]
        c += len(snps)
        # a gene is written even when none of its SNPs were kept
        outfh.write('%s\n' % (dr_tools.join(p[0], len(snps), ';'.join(snps)),))
print(c)
import argparse
import dr_tools

# Reports the total SNP count (second column) and the number of genes (rows)
# of a snp2genes file, optionally restricted to the genes on a list.
if '__main__' == __name__:
    parser = argparse.ArgumentParser()
    parser.add_argument('snp2genes', help='/mnt/kauffman/danielr/crick/Xandclones_BR/snp-validation/snp2genes/ensembl__nooverlap_ra5val_3percent.txt')
    parser.add_argument('-g', '--genelist')
    o = parser.parse_args()

    # None means "no restriction on gene names"
    allowedgenes = set(dr_tools.loadlist(o.genelist)) if o.genelist else None

    # one SNP count per accepted gene row
    snp_counts = [int(p[1]) for p in dr_tools.splitlines(o.snp2genes)
                  if allowedgenes is None or p[0] in allowedgenes]

    print('snps: %s' % (sum(snp_counts),))
    print('genes: %s' % (len(snp_counts),))
return header, markers, marker_order if '__main__' == __name__: parser = argparse.ArgumentParser() parser.add_argument('-r', '--rpkmfile') parser.add_argument('--tableS4', default='tableS4.txt') parser.add_argument('--to_cytof_markers', default='symbol_to_cytof_marker.txt') parser.add_argument('--shuffle_patterns', action='store_true') parser.add_argument('-o', '--sample_list_prefix') o = parser.parse_args() header, markers, marker_order = parse_table(o.tableS4) gene_to_marker = dict(dr_tools.splitlines(o.to_cytof_markers)) marker_order = [m for m in marker_order if m in gene_to_marker.values()] if not o.shuffle_patterns: pop_cytof_pattern = dict( (pop, [markers[m][popi] for m in marker_order]) for popi, pop in enumerate(header)) else: pop_cytof_pattern = dict( (pop, random.shuffle([markers[m][popi] for m in marker_order])) for popi, pop in enumerate(header)) exprt = dr_tools.loadexpr(o.rpkmfile) random.seed(0) midexpr_symi_all_D = dict() for symi, sym in enumerate(exprt['symbols']): if sym not in gene_to_marker:
import dr_tools
import os
import argparse

# Renames the per-sample files <old>.fastq.gz and <old>_expression.txt to
# <new>..., where each row of the name file gives the old name in its first
# field and the new name in its second.
parser = argparse.ArgumentParser()
parser.add_argument('namefile')
parser.add_argument('--prefix_in', default='')
parser.add_argument('--prefix_out', default='')
parser.add_argument('--saysuccess', action='store_true')
o = parser.parse_args()

# (file name suffix, label used in the failure message)
rename_targets = (('.fastq.gz', 'fastq'), ('_expression.txt', 'rpkms'))

for p in dr_tools.splitlines(o.namefile):
    for suffix, label in rename_targets:
        try:
            os.rename('%s%s%s' % (o.prefix_in, p[0], suffix),
                      '%s%s%s' % (o.prefix_out, p[1], suffix))
        except (OSError, IndexError):
            # Best effort: a file that cannot be renamed, or a row without a
            # new name, is reported and skipped.  KeyboardInterrupt and
            # SystemExit are no longer swallowed as they were by the previous
            # bare `except:`.
            # --saysuccess switches the report from failures to successes.
            if not o.saysuccess:
                print('%s missing %s' % (label, p[0]))
        else:
            if o.saysuccess:
                print('renamed %s' % (p[0],))
opts.add_argument('-i1', '--inf1', required=True) # e.g. ooref15... opts.add_argument('-F1', default=0.02, type=float) opts.add_argument('-i2', '--inf2') # e.g. ooref13... opts.add_argument('-F2', type=float, default=0) opts.add_argument('-o', '--outf', default='/dev/stdout') opts.add_argument('--addminreads', type=int, default=0) opts.add_argument('--round', choices=['0.5up', 'ceil', 'floor'], default='ceil') opts.add_argument('--minreadsboth', type=int, default=0) args = opts.parse_args() expr1 = dr_tools.loadexpr([args.inf1], counts=True) if args.inf2 is not None: expr2 = dr_tools.loadexpr([args.inf2], counts=True) for i, p in enumerate(dr_tools.splitlines(args.inf1)): samples = p[1:] break gene_counts_out = defaultdict(list) for s1, s2 in zip(samples[::2], samples[1::2]): if s1.rsplit('_', 1)[0] != s2.rsplit('_', 1)[0]: raise Exception for gene_i, symbol in enumerate(expr1['symbols']): # remove a fraction F of the paternal chromosome's expression from the maternal chromosome's, and vice versa if expr1[s1][gene_i] + expr1[s2][gene_i] < args.minreadsboth: s1e = 0 s2e = 0 elif args.inf2 is None: s1e = subtract(expr1[s1][gene_i], args.F1, expr1[s2][gene_i],
if '__main__' == __name__: parser = argparse.ArgumentParser() parser.add_argument('-c', '--cellsums_files', metavar='cellsums', required=True, nargs='+') parser.add_argument('-v', '--snp_vcf', help='e.g. castsnp_alleles.txt') parser.add_argument('-o', '--outfile', default='/dev/null') parser.add_argument('-f', '--figure') parser.add_argument('-R', '--minratio', default=0, type=float) parser.add_argument('-rt', '--minreads_sum', default=1, type=int) parser.add_argument('-ra', '--minreads_allele', default=0, type=int) parser.add_argument('-V', '--snp_validatedbefore', help='e.g. validated_cast_c57_snps.txt') o = parser.parse_args() if o.snp_validatedbefore: allowed_coord = set() for p in dr_tools.splitlines(o.snp_validatedbefore, ignore='#'): chromosome = p[0] position = p[1] coord = '%s\t%s'%(chromosome, position) allowed_coord.add(coord) else: allowed_coord = None database_snps = dict() for p in dr_tools.splitlines(o.snp_vcf, ignore='#'): chromosome = 'chr'+p[0] position = p[1] c57base = p[3] castbase = p[4] if ',' in c57base or ',' in castbase: continue coord = '%s\t%s'%(chromosome, position) if allowed_coord is not None and coord not in allowed_coord: continue
parser.add_argument('-m', '--addition', nargs=2, action='append', default=[]) parser.add_argument('-l', '--removal', nargs=2, action='append', default=[]) o = parser.parse_args() expra = dr_tools.loadexpr(o.allelehits, True) chrom_to_ai = defaultdict(list) for p in dr_tools.splitlines(o.annotationfile): chrom = p[2] sym = p[12] ID = p[1] chrom_to_ai[chrom].append(expra.ID_to_index[ID]) for s_c57, s_cast in zip(expra.samples[::2], expra.samples[1::2]): sample = s_c57.rsplit('_', 1)[0] for chrom in chrom_to_ai: for samplepart, chromosome in o.removal: if chrom == chromosome and samplepart in sample: continue for samplepart, chromosome in o.addition: if chrom == chromosome and samplepart in sample: print sample, chrom continue
opts.add_argument('--allowedgenes') opts.add_argument('--disallowedgenes') opts.add_argument('--verticalborder', action='store_true') opts.add_argument('--stageline', action='store_true') opts.add_argument('--embryoline', action='store_true') opts.add_argument('--embryonotch', action='store_true') opts.add_argument('--mincoord', type=int) opts.add_argument('--maxcoord', type=int) opts.add_argument('--saygenes', action='store_true') args = opts.parse_args() # load expression data expr_alleles = dr_tools.loadexpr([args.rpkmf_alleles], counts=True) samples_alleles = sorted(e for e in expr_alleles if e not in ('IDs', 'symbols') and (args.filter is None or any(part in e for part in args.filter))) for p in dr_tools.splitlines(args.rpkmf_alleles): if p[0] == '#samples': samples = p[1:]; break samples_alleles = [e for e in samples if (args.filter is None or any(part in e for part in args.filter))] # sort the genes by position # only include transcripts which are the first ID in the entry of the rpkm file if 0:#args.allowedgenes is None and args.disallowedgenes in None: allowed_IDs = set(IDs.split('+')[0] for IDs in expr_alleles['IDs']) else: if args.allowedgenes: allowed_set = set(dr_tools.loadlist(args.allowedgenes)) if args.disallowedgenes: disallowed_set = set(dr_tools.loadlist(args.disallowedgenes)) allowed_IDs = set(IDs.split('+')[0] for IDs, symbols in zip(expr_alleles['IDs'],expr_alleles['symbols']) if (args.allowedgenes is None or any(identifier in allowed_set for identifier in (IDs.split('+') + symbols.split('+')))) and not (args.disallowedgenes is not None and any(identifier in disallowed_set for identifier in (IDs.split('+') + symbols.split('+'))))) genes_per_chr = dict() ID_to_gene = dict()
required=True, nargs='+') parser.add_argument('-v', '--snp_vcf', help='e.g. castsnp_alleles.txt') parser.add_argument('-o', '--outfile', default='/dev/null') parser.add_argument('-f', '--figure') parser.add_argument('-R', '--minratio', default=0, type=float) parser.add_argument('-rt', '--minreads_sum', default=1, type=int) parser.add_argument('-ra', '--minreads_allele', default=0, type=int) parser.add_argument('-V', '--snp_validatedbefore', help='e.g. validated_cast_c57_snps.txt') o = parser.parse_args() if o.snp_validatedbefore: allowed_coord = set() for p in dr_tools.splitlines(o.snp_validatedbefore, ignore='#'): chromosome = p[0] position = p[1] coord = '%s\t%s' % (chromosome, position) allowed_coord.add(coord) else: allowed_coord = None database_snps = dict() for p in dr_tools.splitlines(o.snp_vcf, ignore='#'): chromosome = 'chr' + p[0] position = p[1] c57base = p[3] castbase = p[4] if ',' in c57base or ',' in castbase: continue coord = '%s\t%s' % (chromosome, position)
import argparse
import dr_tools

# Reports the total SNP count (second column) and the number of genes (rows)
# of a snp2genes file, optionally restricted to the genes on a list.
if '__main__' == __name__:
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'snp2genes',
        help='/mnt/kauffman/danielr/crick/Xandclones_BR/snp-validation/snp2genes/ensembl__nooverlap_ra5val_3percent.txt')
    parser.add_argument('-g', '--genelist')
    o = parser.parse_args()

    # None means "no restriction on gene names"
    allowedgenes = set(dr_tools.loadlist(o.genelist)) if o.genelist else None

    # one SNP count per accepted gene row
    snp_counts = [int(p[1]) for p in dr_tools.splitlines(o.snp2genes)
                  if allowedgenes is None or p[0] in allowedgenes]

    print('snps: %s' % (sum(snp_counts),))
    print('genes: %s' % (len(snp_counts),))
from scipy import stats if '__main__' == __name__: parser = argparse.ArgumentParser() parser.add_argument('rpkmfile') parser.add_argument('diatable') parser.add_argument('--dim', type=float, default=3) o = parser.parse_args() expr = dr_tools.loadexpr(o.rpkmfile, True) spikes_i = [i for i, ID in enumerate(expr['IDs']) if 'ERCC' in ID] xarr = [] yarr = [] for p in dr_tools.splitlines(o.diatable): if p[0] == '#sample': index_dia = [p.index('cytoplasm.length'), p.index('cytoplasm.width')] else: sample = p[0] ERCC_readsum = sum(expr[sample][spike] for spike in spikes_i) sample_i = expr.samples.index(sample) mRNA_readsum = expr.normalizationreads[sample_i] try: width = float(p[index_dia[1]]) length = float(p[index_dia[0]]) except ValueError: continue xarr.append((width*length)**(o.dim/2)) yarr.append(mRNA_readsum/ERCC_readsum) # like D Edsgards' axis
if '__main__' == __name__: opts = argparse.ArgumentParser() opts.add_argument('-i1', '--inf1', required=True) # e.g. ooref15... opts.add_argument('-F1', default=0.02, type=float) opts.add_argument('-i2', '--inf2') # e.g. ooref13... opts.add_argument('-F2', type=float, default=0) opts.add_argument('-o', '--outf', default='/dev/stdout') opts.add_argument('--addminreads', type=int, default=0) opts.add_argument('--round', choices=['0.5up', 'ceil', 'floor'], default='ceil') opts.add_argument('--minreadsboth', type=int, default=0) args = opts.parse_args() expr1 = dr_tools.loadexpr([args.inf1], counts=True) if args.inf2 is not None: expr2 = dr_tools.loadexpr([args.inf2], counts=True) for i, p in enumerate(dr_tools.splitlines(args.inf1)): samples = p[1:] break gene_counts_out = defaultdict(list) for s1, s2 in zip(samples[::2], samples[1::2]): if s1.rsplit('_',1)[0] != s2.rsplit('_',1)[0]: raise Exception for gene_i, symbol in enumerate(expr1['symbols']): # remove a fraction F of the paternal chromosome's expression from the maternal chromosome's, and vice versa if expr1[s1][gene_i] + expr1[s2][gene_i] < args.minreadsboth: s1e = 0 s2e = 0 elif args.inf2 is None:
action='store_true', help='if --nodivision_confint was used for the input file generation') parser.add_argument('--shorten_names', action='store_true') parser.add_argument('--random_seed', type=float) parser.add_argument('-c', '--marker_colour', action='append', nargs=2, default=[]) o = parser.parse_args() if o.random_seed is not None: random.seed(o.random_seed) changesD = defaultdict(lambda: [Change(), Change()]) for ii, infile in enumerate((o.infile1, o.infile2)): for p in dr_tools.splitlines(infile): name = p[2] c = changesD[name][ii] c.name = name.upper() if 'Fox' in name else name if o.shorten_names: c.name = c.name.split('_')[-1].rstrip('+') bottom_ch, top_ch = map(float, p[4].strip('()').split(', ')) change = numpy.mean( [top_ch, bottom_ch]) # relative value, i.e. start*(1+change) = end avg_abundance = float(p[5]) # = (start+end)/2 if o.input_nodivision: c.startpoint = avg_abundance + change / 2 c.endpoint = c.startpoint + change c.ends.append(top_ch + c.startpoint) c.ends.append(bottom_ch + c.startpoint)
from __future__ import division
import argparse, dr_tools, numpy, hcluster, random
import matplotlib.pyplot as pylab
import scipy.cluster.hierarchy as scipyhcluster


def Xi_activity_similarity(X, Y):
    """Return a distance between two 0/1 state vectors of equal length.

    The distance is the number of positions where X and Y differ, divided by
    sum(X) + sum(Y).  Two vectors with no 'on' position have distance 0.0
    instead of triggering a division by zero.

    NOTE(review): the original comment says this corresponds to the binary
    distance in the R function dist.  The denominator here counts positions
    that are on in both vectors twice, so confirm that this is intended.
    """
    num_different = sum(x != y for x, y in zip(X, Y))
    possibly_different = sum(X) + sum(Y)
    if not possibly_different:
        return 0.0
    return num_different / possibly_different


# numeric encoding of the allelic calls
stateD = {'XI':1, 'bi':1, 'nd':0, 'xa':0}

if '__main__' == __name__:
    # load table
    linefeed = dr_tools.splitlines('chrX_clones_allelic_calls.txt')
    sample_labels = next(linefeed)[1:]
    character_matrixT = []
    for cells in linefeed:
        # values in cells are nd, XI, xa, bi, except first column which is gene symbol
        # Test only the call columns: the gene symbol in the first column
        # made the previous test true for every row.
        if any(c != 'nd' for c in cells[1:]):
            character_matrixT.append([stateD[c] for c in cells[1:]])

    # make clusters
    character_matrix = numpy.array(character_matrixT).transpose()
    #hcdists = hcluster.pdist(character_matrix, metric='cityblock')
    hcdists = hcluster.pdist(character_matrix, metric=Xi_activity_similarity)
    hclinks = hcluster.linkage(hcdists, method='complete')
    draw_order = hcluster.leaves_list(hclinks)

    # draw tree
opts.add_argument('--alg2', action='store_true') o = opts.parse_args() expr = dr_tools.loadexpr([o.inf], counts=True) exprt = dr_tools.loadexpr([o.rpkmf_total], counts=False) allowed_gene_i = gene_i_by_listf(o.genelistf_include, expr) if o.genelistf_include else None excluded_gene_i = gene_i_by_listf(o.genelistf_exclude, expr) if o.genelistf_exclude else None def rpkm(Ai, sample): Ti = exprt.ID_to_index[expr['IDs'][Ai]] return exprt[sample][Ti] for p in dr_tools.splitlines(o.inf): if p[0] == '#samples': samples = p[1:] break fractions = [] # maternal only + paternal only mfractions = [] # maternal only fractions_all3 = [] # maternal+parternal+biallelic labels = [] for s1, s2 in zip(samples[::2], samples[1::2]): if o.filter is not None and not any(part in s1.rsplit('_', 1)[0] for part in o.filter): continue if s1.rsplit('_', 1)[0] != s2.rsplit( '_', 1)[0] or not 'c57' in s1 or 'c57' in s2:
with open(csv_path, 'rU') as infh: for line in infh: count += 1 return count - 1 if '__main__' == __name__: parser = argparse.ArgumentParser() parser.add_argument('-a', '--allcells_csv', nargs='+', required=True) parser.add_argument('-b', '--Bcells_csv', nargs='+', required=True) parser.add_argument('-c', '--cellnumbertable', required=True) parser.add_argument('--vocal', action='store_true') o = parser.parse_args() samples = [] for li, p in enumerate(dr_tools.splitlines(o.cellnumbertable)): samples.append(Sample(str(li + 1), p[0], int(p[1]) if p[1] else None)) for csv_path in o.allcells_csv: name = csv_path.split('/')[-1].split('_1.')[0].split('.txt')[0] matching_samples = [s for s in samples if name in s.names] if len(matching_samples) > 1: raise Exception if len(matching_samples) == 0: if o.vocal: print name, 'A' continue matching_samples[0].cells_cytof_all = count_cells(csv_path) for csv_path in o.Bcells_csv: name = csv_path.split('/')[-1].split('.')[0] matching_samples = [s for s in samples if name in s.names]
elif c57 or cast: count_mono += 1 return count_mono/(count_bi + count_mono) if '__main__' == __name__: parser = argparse.ArgumentParser() parser.add_argument('allelehits') parser.add_argument('diatable') parser.add_argument('--dim', type=float, default=3) o = parser.parse_args() expra = dr_tools.loadexpr(o.allelehits, True) xarr = [] yarr = [] for p in dr_tools.splitlines(o.diatable): if p[0] == '#sample': index_dia = [p.index('cytoplasm.length'), p.index('cytoplasm.width')] else: sample = p[0] try: width = float(p[index_dia[1]]) length = float(p[index_dia[0]]) except ValueError: continue xarr.append((width*length)**(o.dim/2)) yarr.append(MAfraction(expra, sample)) print stats.pearsonr(xarr, yarr)
import argparse, dr_tools from collections import defaultdict from itertools import chain if '__main__' == __name__: parser = argparse.ArgumentParser() parser.add_argument('tcr_summary_file', metavar='tcr_summary.txt') parser.add_argument('output_prefix') parser.add_argument('--numbering_start', type=int, default=1) o = parser.parse_args() num = o.numbering_start clones_by_TCR_seq = defaultdict(lambda: defaultdict(list)) for li, p in enumerate(dr_tools.splitlines(o.tcr_summary_file)): if li == 0: VDJ_i = [] CDR3_i = [] for column_header in ('CDR3 amino acid sequence',): CDR3_i.append(p.index(column_header)+1) CDR3_i.append(p.index(column_header)+1+len(p)) for column_header in ('V segments', 'J segments'): VDJ_i.append(p.index(column_header)+1) for column_header in ('V segments', 'D segments', 'J segments', 'VD insertions', 'DJ insertions'): VDJ_i.append(p.index(column_header)+1+len(p)) else: sample_name = p[0] if not p[1]: p = [p[0]]+p[2:] source_person = sample_name.split('_')[0] try: TCR_features = tuple(p[i] for i in chain(CDR3_i, VDJ_i)) except IndexError: # discard, it is missing info
opts.add_argument('--allowedgenes') opts.add_argument('--disallowedgenes') opts.add_argument('--verticalborder', action='store_true') opts.add_argument('--stageline', action='store_true') opts.add_argument('--embryoline', action='store_true') opts.add_argument('--embryonotch', action='store_true') opts.add_argument('--mincoord', type=int) opts.add_argument('--maxcoord', type=int) opts.add_argument('--saygenes', action='store_true') args = opts.parse_args() # load expression data expr_alleles = dr_tools.loadexpr([args.rpkmf_alleles], counts=True) samples_alleles = sorted(e for e in expr_alleles if e not in ('IDs', 'symbols') and (args.filter is None or any(part in e for part in args.filter))) for p in dr_tools.splitlines(args.rpkmf_alleles): if p[0] == '#samples': samples = p[1:]; break samples_alleles = [e for e in samples if (args.filter is None or any(part in e for part in args.filter))] # sort the genes by position # only include transcripts which are the first ID in the entry of the rpkm file if args.allowedgenes is None and args.disallowedgenes is None: allowed_IDs = set(IDs.split('+')[0] for IDs in expr_alleles['IDs']) else: if args.allowedgenes: allowed_set = set(dr_tools.loadlist(args.allowedgenes)) if args.disallowedgenes: disallowed_set = set(dr_tools.loadlist(args.disallowedgenes)) allowed_IDs = set(IDs.split('+')[0] for IDs, symbols in zip(expr_alleles['IDs'],expr_alleles['symbols']) if (args.allowedgenes is None or any(identifier in allowed_set for identifier in (IDs.split('+') + symbols.split('+')))) and not (args.disallowedgenes is not None and any(identifier in disallowed_set for identifier in (IDs.split('+') + symbols.split('+'))))) genes_per_chr = dict() ID_to_gene = dict()
parser.add_argument('-g', '--genelist_in') parser.add_argument('-q', '--maxq', default=0.05, type=float) parser.add_argument('--top', action='store_true') parser.add_argument('--bottom', action='store_true') parser.add_argument('--oneID', action='store_true') o = parser.parse_args() if not o.top and not o.bottom: o.top = True o.bottom = True at_list_top = True if o.genelist_in: allowedgenes = set(dr_tools.loadlist(o.genelist_in)) last_q = 0 genes_top = [] genes_bottom = [] for li, p in enumerate(dr_tools.splitlines(o.infile)): if li == 0: q_i = p.index('FDR') sym_i = 0 if o.oneID: ID_i = p.index('IDs') else: qval = float(p[q_i]) if at_list_top and qval < last_q: at_list_top = False sym = p[sym_i] sym_out = p[ID_i].split('+')[0] if o.oneID else sym last_q = qval if o.genelist_in and sym not in allowedgenes: continue if qval > o.maxq: continue if at_list_top: genes_top.append(sym_out) else:
# Fragment of a script that combines a sample/chromosome list, an annotation
# table and an allele-hit expression file.  The innermost loop continues past
# the end of this fragment.
# NOTE(review): the nesting below is reconstructed from whitespace-collapsed
# source -- confirm against the original file.
import argparse, dr_tools, numpy
from collections import defaultdict

if '__main__' == __name__:
    parser = argparse.ArgumentParser()
    parser.add_argument('sample_and_chromosome_list')
    parser.add_argument('output_file')
    parser.add_argument('-A', '--annotationfile')
    parser.add_argument('-a', '--allelehits')
    o = parser.parse_args()

    # the same file is loaded twice, differing only in the second argument;
    # presumably that argument selects read counts versus normalised values
    # -- TODO confirm in dr_tools.loadexpr
    exprr = dr_tools.loadexpr(o.allelehits, False)
    expra = dr_tools.loadexpr(o.allelehits, True)

    # map each value of annotation column 2 (used as chromosome) to the set
    # of column-1 values (used as transcript/gene ID) found on it
    chrom_to_IDs = defaultdict(set)
    for p in dr_tools.splitlines(o.annotationfile):
        chrom = p[2]
        sym = p[12]  # not used within the visible part of the script
        ID = p[1]
        chrom_to_IDs[chrom].add(ID)

    samples_set = set(expra.samples)
    with open(o.sample_and_chromosome_list) as infh:
        for line in infh:
            # whitespace-separated: sample name first, chromosome second
            p = line.split()
            chrom = p[1]
            # names of the two allele-specific columns derived from the sample name
            s_c57 = p[0]+'_c57only'
            s_cast = p[0]+'_castonly'
            # skip samples that are absent from the expression file
            if p[0] not in samples_set: continue
            # NOTE(review): the loop body lies outside the visible fragment
            for ai, ID in enumerate(expra['IDs']):
# Fragment that finishes building the list of marker combinations (`combos`)
# and then works out which per-sample csv files to read (`samples_csv`).
# NOTE(review): the opening `else:` belongs to an `if` outside the visible
# fragment, and the source is whitespace-collapsed, so the nesting shown here
# is a reconstruction -- the first statements most likely sit inside a loop
# over the given combinations.  Confirm against the original file.
else:
    # no filter part for this combination: the whole text is the protein part
    filter_pos, filter_neg = tuple(), tuple()
    protline = given_combo
proteins_pos, proteins_neg = split_pos_neg(protline)
# a combination is stored as a tuple of four tuples, which also serves as
# the key into levels_per_combo
combos.append(tuple(map(tuple, (filter_pos, filter_neg, proteins_pos, proteins_neg))))
levels_per_combo[combos[-1]] = defaultdict(list)
# abort when no combination was given and num_pos + num_neg is below 1
if o.num_pos + o.num_neg < 1 and not combos: raise Exception
req_proteins = set(o.req_proteins)
filter_req_proteins = set(o.filter_req_proteins)

# select samples
samples_csv = o.csv_in
if o.use_from_list is not None:
    # the first csv_in entry is read as a table here: rows whose second
    # column equals --use_from_list select the csv files in the current
    # directory named exactly as the first column or starting with it plus '_'
    samples_to_use = set()
    samples_possible = [f for f in os.listdir('.') if f.endswith('.csv') or f.endswith('.csv.gz')]
    for p in dr_tools.splitlines(o.csv_in[0]):
        if p[1] == o.use_from_list:
            samples_to_use.update([s for s in samples_possible if s == p[0] or s.startswith(p[0]+'_')])
    samples_csv = list(samples_to_use)
    # drop the table file so that only the remaining arguments are left
    o.csv_in = o.csv_in[1:]
if len(o.csv_in) == 1 and o.csv_in[0].endswith('.txt'):
    # a single .txt argument is read as a whitespace-separated list of file
    # names, restricted to samples_to_use when --use_from_list is active
    with openfile(o.csv_in[0], 'rU') as infh:
        if o.use_from_list is not None:
            samples_csv = [f.strip() for f in infh.read().split() if f.strip() in samples_to_use]
        else:
            samples_csv = [f.strip() for f in infh.read().split()]
CD3double = False
if not combos:
    # NOTE(review): this branch continues outside the visible fragment
    if o.proteins:
        row_keys = set(o.proteins)
# Rename per-sample output files according to a two-column name table.
#
# Each row of `namefile` gives an old sample name (column 0) and a new one
# (column 1).  For every row, '<prefix_in><old>.fastq.gz' and
# '<prefix_in><old>_expression.txt' are renamed to the corresponding
# '<prefix_out><new>...' paths.  Without --saysuccess only failures are
# reported; with --saysuccess only successful renames are reported.
import dr_tools
import os
import argparse


def _rename_and_report(row, suffix, missing_label, prefix_in, prefix_out, saysuccess):
    """Rename one file of a sample and print the outcome.

    row            -- columns of one namefile line: old name, new name
    suffix         -- file name ending shared by source and target
    missing_label  -- word printed before 'missing' when the rename fails
    prefix_in/out  -- strings prepended to the old / new file name
    saysuccess     -- report successes instead of failures
    """
    try:
        os.rename('%s%s%s' % (prefix_in, row[0], suffix),
                  '%s%s%s' % (prefix_out, row[1], suffix))
    except (OSError, IndexError):
        # OSError: the file could not be renamed (e.g. it does not exist).
        # IndexError: the row has no new-name column; it is reported like a
        # missing file, as before.  Other exceptions (KeyboardInterrupt,
        # programming errors) are no longer swallowed.
        if not saysuccess:
            print('%s missing %s' % (missing_label, row[0]))
    else:
        if saysuccess:
            print('renamed %s' % row[0])


parser = argparse.ArgumentParser()
parser.add_argument('namefile')
parser.add_argument('--prefix_in', default='')
parser.add_argument('--prefix_out', default='')
parser.add_argument('--saysuccess', action='store_true')
o = parser.parse_args()

for p in dr_tools.splitlines(o.namefile):
    # same order as before: the fastq file first, then the expression table
    for suffix, missing_label in (('.fastq.gz', 'fastq'), ('_expression.txt', 'rpkms')):
        _rename_and_report(p, suffix, missing_label, o.prefix_in, o.prefix_out, o.saysuccess)