def write_cyst_file(known_cyst_positions): unaligned_genes = utils.read_germlines(args.dirname, only_region='v')['v'] aligned_genes = utils.read_germlines(args.dirname, only_region='v', aligned=True)['v'] common_gene = None # we need to find at least one gene that's in the old and the new sets, so we know how to convert cyst positions for gene, info in known_cyst_positions.items(): if gene in aligned_genes: common_gene = gene break if common_gene is None: raise Exception('couldn\'t find any genes in common between %s and %s, so can\'t write new cyst position file' % (args.reference, args.dirname + '/' + aligned_fname)) aligned_seq = aligned_genes[common_gene] seq = unaligned_genes[common_gene] cpos = known_cyst_positions[common_gene]['cysteine-position'] utils.check_conserved_cysteine(seq, cpos) cpos_in_alignment = cpos ipos = 0 # position in unaligned sequence n_dots_passed = 0 # number of gapped positions in the aligned sequences that we pass before getting to cpos (i.e. while ipos < cpos) while ipos < cpos: if aligned_seq[ipos + n_dots_passed] in utils.gap_chars: cpos_in_alignment += 1 n_dots_passed += 1 else: ipos += 1 utils.check_conserved_cysteine(aligned_seq, cpos_in_alignment) displacement = cpos_in_alignment - cpos print ' cpos displacement: %d' % displacement cyst_positions = [] for gene, seq in unaligned_genes.items(): cyst_positions.append({'gene' : gene, 'cyst_start' : cpos})
def __init__( self, seqfname, joinfnames, datadir ): # <seqfname>: input to joinsolver, <joinfname> output from joinsolver (I only need both because they don't seem to put the full query seq in the output) self.debug = 0 self.n_max_queries = -1 self.queries = [] self.germline_seqs = utils.read_germlines(datadir, remove_N_nukes=False) assert os.path.exists(os.getenv("www")) self.perfplotter = PerformancePlotter( self.germline_seqs, os.getenv("www") + "/partis/joinsolver_performance", "js" ) # get info that was passed to joinsolver self.seqinfo = {} with opener("r")(seqfname) as seqfile: reader = csv.DictReader(seqfile) iline = 0 for line in reader: if len(self.queries) > 0 and line["unique_id"] not in self.queries: continue self.seqinfo[line["unique_id"]] = line iline += 1 if self.n_max_queries > 0 and iline >= self.n_max_queries: break self.n_failed, self.n_total = 0, 0 for joinfname in joinfnames: self.parse_file(joinfname) self.perfplotter.plot() print "failed: %d / %d = %f" % (self.n_failed, self.n_total, float(self.n_failed) / self.n_total)
def __init__(self, args):
    """Read germline info and (if a sequence file was specified) the input sequences, and open the output file.

    <args>: command line arguments; this reads datadir, seqfile, is_data, n_max_queries, queries, reco_ids, and outfname

    Sets <self.outfile> to None if no output file name was given, otherwise to a freshly-opened
    file (any existing file with that name is removed first).
    """
    self.args = args
    datadir = self.args.datadir
    self.germline_seqs = utils.read_germlines(datadir)  #, add_fp=True)

    with opener('r')(datadir + '/v-meta.json') as json_file:  # get location of <begin> cysteine in each v region
        self.cyst_positions = json.load(json_file)
    with opener('r')(datadir + '/j_tryp.csv') as csv_file:  # get location of <end> tryptophan in each j region (TGG)
        tryp_reader = csv.reader(csv_file)
        self.tryp_positions = {row[0]: row[1] for row in tryp_reader}  # WARNING: this doesn't filter out the header line

    self.precluster_info = {}

    if self.args.seqfile is not None:
        self.input_info, self.reco_info = get_seqfile_info(self.args.seqfile, self.args.is_data, self.germline_seqs, self.cyst_positions, self.tryp_positions,
                                                           self.args.n_max_queries, self.args.queries, self.args.reco_ids)

    self.outfile = None
    outfname = self.args.outfname
    if outfname is not None:  # identity comparison is the right way to test for None
        if os.path.exists(outfname):
            os.remove(outfname)
        self.outfile = open(outfname, 'a')  # stays open, it's kept as an attribute
def align_new_genes(old_aligned_genes, genes_without_alignments, all_new_genes): print 'missing alignments for %d genes' % len(genes_without_alignments) old_aligned_fname = args.dirname + '/old-aligned.fasta' missing_fname = args.dirname + '/missing-alignments.fasta' msa_table_fname = args.dirname + '/msa-table.txt' all_fname = args.dirname + '/all.fa' with open(old_aligned_fname, 'w') as tmpfile: for gene, seq in old_aligned_genes.items(): tmpfile.write('>%s\n%s\n' % (gene, seq.replace('.', '-'))) with open(missing_fname, 'w') as tmpfile: for gene, seq in genes_without_alignments.items(): tmpfile.write('>%s\n%s\n' % (gene, seq.replace('.', '-'))) check_call('ruby bin/makemergetable.rb ' + old_aligned_fname + ' 1>' + msa_table_fname, shell=True) check_call('cat ' + old_aligned_fname + ' ' + missing_fname + ' >' + all_fname, shell=True) check_call('mafft --merge ' + msa_table_fname + ' ' + all_fname + ' >' + args.dirname + '/' + aligned_fname, shell=True) # options= # "--localpair --maxiterate 1000" # then rewrite aligned file with only new genes, converting to upper case and dots for gaps all_aligned_germlines = utils.read_germlines(args.dirname, only_region='v', aligned=True) with open(args.dirname + '/' + aligned_fname, 'w') as tmpfile: for gene, seq in all_aligned_germlines['v'].items(): if gene not in all_new_genes: continue tmpfile.write('>%s\n%s\n' % (gene, seq.replace('-', '.').upper())) os.remove(old_aligned_fname) os.remove(missing_fname) os.remove(msa_table_fname) os.remove(all_fname)
def __init__( self, seqfname, joinfnames, datadir ): # <seqfname>: input to joinsolver, <joinfname> output from joinsolver (I only need both because they don't seem to put the full query seq in the output) self.debug = 0 self.n_max_queries = -1 self.queries = [] self.germline_seqs = utils.read_germlines(datadir, remove_N_nukes=False) assert os.path.exists(os.getenv('www')) self.perfplotter = PerformancePlotter( self.germline_seqs, os.getenv('www') + '/partis/joinsolver_performance', 'js') # get info that was passed to joinsolver self.seqinfo = {} with opener('r')(seqfname) as seqfile: reader = csv.DictReader(seqfile) iline = 0 for line in reader: if len(self.queries ) > 0 and line['unique_id'] not in self.queries: continue self.seqinfo[line['unique_id']] = line iline += 1 if self.n_max_queries > 0 and iline >= self.n_max_queries: break self.n_failed, self.n_total = 0, 0 for joinfname in joinfnames: self.parse_file(joinfname) self.perfplotter.plot() print 'failed: %d / %d = %f' % (self.n_failed, self.n_total, float(self.n_failed) / self.n_total)
def __init__(self, args): self.args = args self.germline_seqs = utils.read_germlines(self.args.datadir, remove_N_nukes=True) self.perfplotter = PerformancePlotter(self.germline_seqs, self.args.plotdir, 'ihhhmmm') self.details = OrderedDict() self.failtails = {} self.n_partially_failed = 0 # get sequence info that was passed to ihhhmmm self.siminfo = OrderedDict() self.sim_need = [] # list of queries that we still need to find with opener('r')(self.args.simfname) as seqfile: reader = csv.DictReader(seqfile) iline = 0 for line in reader: if self.args.queries != None and line[ 'unique_id'] not in self.args.queries: continue self.siminfo[line['unique_id']] = line self.sim_need.append(line['unique_id']) iline += 1 if args.n_queries > 0 and iline >= args.n_queries: break fostream_names = glob.glob(self.args.indir + '/*.fostream') if len(fostream_names) == 0: raise Exception('no fostreams found in %s' % args.indir) fostream_names.sort() # maybe already sorted? for infname in fostream_names: if len(self.sim_need) == 0: break # try to get whatever you can for the failures unique_ids = self.find_partial_failures( infname) # returns list of unique ids in this file with opener('r')(infname) as infile: self.parse_file(infile, unique_ids) # now check that we got results for all the queries we wanted n_failed = 0 for unique_id in self.siminfo: if unique_id not in self.details and unique_id not in self.failtails: print '%-20s no info' % unique_id self.perfplotter.add_fail() n_failed += 1 print '' print 'partially failed: %d / %d = %.2f' % ( self.n_partially_failed, len(self.siminfo), float(self.n_partially_failed) / len(self.siminfo)) print 'failed: %d / %d = %.2f' % (n_failed, len( self.siminfo), float(n_failed) / len(self.siminfo)) print '' self.perfplotter.plot()
def __init__(self, args): self.args = args self.germline_seqs = utils.read_germlines(self.args.datadir, remove_N_nukes=True) self.perfplotter = PerformancePlotter(self.germline_seqs, self.args.plotdir, "ihhhmmm") self.details = OrderedDict() self.failtails = {} self.n_partially_failed = 0 # get sequence info that was passed to ihhhmmm self.siminfo = OrderedDict() self.sim_need = [] # list of queries that we still need to find with opener("r")(self.args.simfname) as seqfile: reader = csv.DictReader(seqfile) iline = 0 for line in reader: if self.args.queries != None and line["unique_id"] not in self.args.queries: continue self.siminfo[line["unique_id"]] = line self.sim_need.append(line["unique_id"]) iline += 1 if args.n_queries > 0 and iline >= args.n_queries: break fostream_names = glob.glob(self.args.indir + "/*.fostream") if len(fostream_names) == 0: raise Exception("no fostreams found in %s" % args.indir) fostream_names.sort() # maybe already sorted? for infname in fostream_names: if len(self.sim_need) == 0: break # try to get whatever you can for the failures unique_ids = self.find_partial_failures(infname) # returns list of unique ids in this file with opener("r")(infname) as infile: self.parse_file(infile, unique_ids) # now check that we got results for all the queries we wanted n_failed = 0 for unique_id in self.siminfo: if unique_id not in self.details and unique_id not in self.failtails: print "%-20s no info" % unique_id self.perfplotter.add_fail() n_failed += 1 print "" print "partially failed: %d / %d = %.2f" % ( self.n_partially_failed, len(self.siminfo), float(self.n_partially_failed) / len(self.siminfo), ) print "failed: %d / %d = %.2f" % (n_failed, len(self.siminfo), float(n_failed) / len(self.siminfo)) print "" self.perfplotter.plot()
def __init__(self, args):
    """Read germline info and (if a sequence file was specified) the input sequences, and open the output file.

    <args>: command line arguments; this reads datadir, seqfile, is_data, n_max_queries, queries, reco_ids, and outfname

    Sets <self.outfile> to None if no output file name was given, otherwise to a freshly-opened
    file (any existing file with that name is removed first).
    """
    self.args = args
    datadir = self.args.datadir
    self.germline_seqs = utils.read_germlines(datadir)  #, add_fp=True)

    with opener('r')(datadir + '/v-meta.json') as json_file:  # get location of <begin> cysteine in each v region
        self.cyst_positions = json.load(json_file)
    with opener('r')(datadir + '/j_tryp.csv') as csv_file:  # get location of <end> tryptophan in each j region (TGG)
        tryp_reader = csv.reader(csv_file)
        self.tryp_positions = {row[0]:row[1] for row in tryp_reader}  # WARNING: this doesn't filter out the header line

    self.precluster_info = {}

    if self.args.seqfile is not None:
        self.input_info, self.reco_info = get_seqfile_info(self.args.seqfile, self.args.is_data, self.germline_seqs, self.cyst_positions, self.tryp_positions,
                                                           self.args.n_max_queries, self.args.queries, self.args.reco_ids)

    self.outfile = None
    outfname = self.args.outfname
    if outfname is not None:  # identity comparison is the right way to test for None
        if os.path.exists(outfname):
            os.remove(outfname)
        self.outfile = open(outfname, 'a')  # stays open, it's kept as an attribute
def __init__(self, args, seed, sublabel=None, total_length_from_right=-1):
    """Set up the work dir, and read germline, parameter, and (if requested) mutation info.

    <args>: command line arguments; this reads workdir, outfname, parameter_dir, datadir, naivety, gtrfname, and no_clean
    <seed>: passed on to the tree generator
    <sublabel>: if set, it's appended to the work dir name and the output file goes in the work dir (need a separate workdir for each subprocess)
    <total_length_from_right>: measured from right edge of j, only write to file this much of the sequence

    Raises an Exception if <args.parameter_dir> doesn't exist.

    NOTE(review): the source of this method was whitespace-mangled, so the nesting (in particular,
    that the tree generation sits inside the naivety == 'M' block) is reconstructed -- please confirm.
    """
    self.args = args

    if sublabel is None:
        self.workdir = self.args.workdir + '/recombinator'
        self.outfname = self.args.outfname
    else:  # need a separate workdir for each subprocess
        self.workdir = self.args.workdir + '/recombinator-' + sublabel
        self.outfname = self.workdir + '/' + os.path.basename(self.args.outfname)

    utils.prep_dir(self.workdir)
    if not os.path.exists(self.args.parameter_dir):
        raise Exception('ERROR ' + self.args.parameter_dir + ' d.n.e')

    # parameters that control recombination, erosion, and whatnot
    self.total_length_from_right = total_length_from_right  # measured from right edge of j, only write to file this much of the sequence (our read lengths are 130 by this def'n a.t.m.)

    self.all_seqs = {}  # all the Vs, all the Ds...
    self.index_keys = {}  # this is kind of hackey, but I suspect indexing my huge table of freqs with a tuple is better than a dict
    self.version_freq_table = {}  # list of the probabilities with which each VDJ combo appears in data
    # one (initially empty) parameter dict for each region and mutation model
    self.mute_models = {region: {model: {} for model in ['gtr', 'gamma']} for region in utils.regions}
    # self.treeinfo = []  # list of newick-formatted tree strings with region-specific branch info tacked at the end

    # first read info that doesn't depend on which person we're looking at
    self.all_seqs = utils.read_germlines(self.args.datadir)
    with opener('r')(self.args.datadir + '/v-meta.json') as json_file:  # get location of <begin> cysteine in each v region
        self.cyst_positions = json.load(json_file)
    with opener('r')(self.args.datadir + '/j_tryp.csv') as csv_file:  # get location of <end> tryptophan in each j region (TGG)
        tryp_reader = csv.reader(csv_file)
        self.tryp_positions = {row[0]: row[1] for row in tryp_reader}  # WARNING: this doesn't filter out the header line

    # then read stuff that's specific to each person
    self.read_vdj_version_freqs(self.args.parameter_dir + '/' + utils.get_parameter_fname('all'))
    self.read_insertion_content()
    if self.args.naivety == 'M':  # read shm info if non-naive is requested
        # NOTE I'm not inferring the gtr parameters a.t.m., so I'm just (very wrongly) using the same ones for all individuals
        with opener('r')(self.args.gtrfname) as gtrfile:  # read gtr parameters
            reader = csv.DictReader(gtrfile)
            for line in reader:
                parameters = line['parameter'].split('.')
                region = parameters[0][3].lower()  # fourth character of the first field
                assert region == 'v' or region == 'd' or region == 'j'
                model = parameters[1].lower()
                parameter_name = parameters[2]
                assert model in self.mute_models[region]
                self.mute_models[region][model][parameter_name] = line['value']
        treegen = treegenerator.TreeGenerator(args, self.args.parameter_dir, seed=seed)
        self.treefname = self.workdir + '/trees.tre'
        treegen.generate_trees(seed, self.treefname)
        with opener('r')(self.treefname) as treefile:  # read in the trees (and other info) that we just generated
            self.treeinfo = treefile.readlines()
        if not self.args.no_clean:
            os.remove(self.treefname)

    # start from a clean slate: remove any old output file, or else make sure its directory exists
    if os.path.exists(self.outfname):
        os.remove(self.outfname)
    else:
        outdir = os.path.dirname(os.path.abspath(self.outfname))
        if not os.path.exists(outdir):
            os.makedirs(outdir)
from performanceplotter import PerformancePlotter
import csv
import utils

#----------------------------
#Ask the user where everything is (just hitting enter selects the default)
germlineDirectory = raw_input('Enter the path of the germline sequences): ') or 'data/imgt'
originalInputFile = raw_input('Enter the path of the original input file into mixcr): ') or 'simu-10-leaves-1-mutate.csv'
mixcrOutput = raw_input('Enter the path of the output from mixcr: ') or 'edited_output_file.txt'
mixcrPlotDir = 'mixcrPlotDir'

#----------------------------
#Read the germline sequences from the directory chosen above
germline_seqs = utils.read_germlines(germlineDirectory)

#Make the performance plotter
#NOTE(review): <mixcrPlotDir> isn't used in the part of the script that's visible here -- confirm whether it's supposed to be passed to the plotter
perfplotter = PerformancePlotter(germline_seqs, 'mixcr')

#trueDictionary: the correct locations, taken from the original simulated data file
#iDictionary: the inferences of those locations from Mixcr
trueDictionary = {}
iDictionary = {}

#Step through the simulation file (comma-separated) and the mixcr output (tab-separated) in lockstep
with open(originalInputFile) as inFile1, open(mixcrOutput) as inFile2:
    reader1 = csv.DictReader(inFile1)
    reader2 = csv.DictReader(inFile2, delimiter='\t')
    for row1, row2 in zip(reader1, reader2):
        unique_id = row1['unique_id']
        #print unique_id
def __init__(self, args): self.args = args self.germline_seqs = utils.read_germlines(self.args.datadir) perfplotter = PerformancePlotter(self.germline_seqs, self.args.plotdir, 'imgt') # get sequence info that was passed to imgt self.seqinfo = {} with opener('r')(self.args.simfname) as simfile: reader = csv.DictReader(simfile) iline = 0 for line in reader: if self.args.queries != None and line['unique_id'] not in self.args.queries: continue if len(re.findall('_[FP]', line['j_gene'])) > 0: line['j_gene'] = line['j_gene'].replace(re.findall('_[FP]', line['j_gene'])[0], '') self.seqinfo[line['unique_id']] = line iline += 1 if self.args.n_queries > 0 and iline >= self.args.n_queries: break paragraphs, csv_info = None, None if self.args.infname != None and '.html' in self.args.infname: print 'reading', self.args.infname with opener('r')(self.args.infname) as infile: soup = BeautifulSoup(infile) paragraphs = soup.find_all('pre') summarydir = self.args.indir[ : self.args.indir.rfind('/')] # one directoy up from <indir>, which has the detailed per-sequence files summary_fname = glob.glob(summarydir + '/1_Summary_*.txt') assert len(summary_fname) == 1 summary_fname = summary_fname[0] get_genes_to_skip(summary_fname, self.germline_seqs) n_failed, n_skipped, n_total, n_not_found, n_found = 0, 0, 0, 0, 0 for unique_id in self.seqinfo: if self.args.debug: print unique_id, imgtinfo = [] # print 'true' # utils.print_reco_event(self.germline_seqs, self.seqinfo[unique_id]) if self.args.infname != None and '.html' in self.args.infname: for pre in paragraphs: # NOTE this loops over everything an awful lot of times. 
Shouldn't really matter for now, though if unique_id in pre.text: imgtinfo.append(pre.text) else: n_total += 1 assert self.args.infname == None infnames = glob.glob(self.args.indir + '/' + unique_id + '*') assert len(infnames) <= 1 if len(infnames) != 1: if self.args.debug: print ' couldn\'t find it' n_not_found += 1 continue n_found += 1 with opener('r')(infnames[0]) as infile: full_text = infile.read() if len(re.findall('[123]. Alignment for [VDJ]-GENE', full_text)) < 3: failregions = re.findall('No [VDJ]-GENE has been identified', full_text) if self.args.debug and len(failregions) > 0: print ' ', failregions n_failed += 1 continue # loop over the paragraphs I want position = full_text.find(unique_id) # don't need this one for ir in range(4): position = full_text.find(unique_id, position+1) pgraph = full_text[position : full_text.find('\n\n', position+1)] if 'insertion(s) and/or deletion(s) which are not dealt in this release' in pgraph: ir -= 1 continue imgtinfo.append(pgraph) # query seq paragraph if len(imgtinfo) == 0: print '%s no info' % unique_id continue else: if self.args.debug: print '' line = self.parse_query_text(unique_id, imgtinfo) if 'skip_gene' in line: # assert self.args.skip_missing_genes n_skipped += 1 continue try: assert 'failed' not in line joinparser.add_insertions(line, debug=self.args.debug) joinparser.resolve_overlapping_matches(line, debug=False, germlines=self.germline_seqs) except (AssertionError, KeyError): print ' giving up' n_failed += 1 perfplotter.add_partial_fail(self.seqinfo[unique_id], line) # print ' perfplotter: not sure what to do with a fail' continue perfplotter.evaluate(self.seqinfo[unique_id], line) if self.args.debug: utils.print_reco_event(self.germline_seqs, self.seqinfo[unique_id], label='true:') utils.print_reco_event(self.germline_seqs, line, label='inferred:') perfplotter.plot() print 'failed: %d / %d = %f' % (n_failed, n_total, float(n_failed) / n_total) print 'skipped: %d / %d = %f' % (n_skipped, n_total, 
float(n_skipped) / n_total) print ' ', for g, n in genes_actually_skipped.items(): print ' %d %s' % (n, utils.color_gene(g)) print '' if n_not_found > 0: print ' not found: %d / %d = %f' % (n_not_found, n_not_found + n_found, n_not_found / float(n_not_found + n_found))
sys.path.insert(1, './python')
import csv
import argparse
from clusterpath import ClusterPath
from seqfileopener import get_seqfile_info
import utils

# command line: the partition file is required, everything else is optional
parser = argparse.ArgumentParser()
parser.add_argument('--infname', required=True)
parser.add_argument('--dont-abbreviate', action='store_true', help="Print full seq IDs (otherwise just prints an 'o')")
parser.add_argument('--n-to-print', type=int, help='How many partitions to print (centered on the best partition)')
parser.add_argument('--datadir', default='data/imgt')
parser.add_argument('--simfname')
parser.add_argument('--is-data', action='store_true')
args = parser.parse_args()

# germline info, plus the cysteine and tryptophan positions
germline_seqs = utils.read_germlines(args.datadir)
cyst_positions = utils.read_cyst_positions(args.datadir)
with open(args.datadir + '/j_tryp.csv') as csv_file:  # location of <end> tryptophan in each j region
    tryp_reader = csv.reader(csv_file)
    tryp_positions = dict((row[0], row[1]) for row in tryp_reader)  # WARNING: this doesn't filter out the header line

# the true reco info is only available if we were given a simulation file
reco_info = None
if args.simfname is not None:
    input_info, reco_info = get_seqfile_info(args.simfname, args.is_data, germline_seqs, cyst_positions, tryp_positions)

# read the partitions and print them
cp = ClusterPath()
cp.readfile(args.infname)
abbreviate = not args.dont_abbreviate
cp.print_partitions(abbreviate=abbreviate, n_to_print=args.n_to_print, reco_info=reco_info)
def __init__(self, args): self.args = args self.germline_seqs = utils.read_germlines(self.args.datadir) perfplotter = PerformancePlotter(self.germline_seqs, self.args.plotdir, 'imgt') # get sequence info that was passed to imgt self.seqinfo = {} with opener('r')(self.args.simfname) as simfile: reader = csv.DictReader(simfile) iline = 0 for line in reader: if self.args.queries != None and line[ 'unique_id'] not in self.args.queries: continue if len(re.findall('_[FP]', line['j_gene'])) > 0: line['j_gene'] = line['j_gene'].replace( re.findall('_[FP]', line['j_gene'])[0], '') self.seqinfo[line['unique_id']] = line iline += 1 if self.args.n_queries > 0 and iline >= self.args.n_queries: break paragraphs, csv_info = None, None if self.args.infname != None and '.html' in self.args.infname: print 'reading', self.args.infname with opener('r')(self.args.infname) as infile: soup = BeautifulSoup(infile) paragraphs = soup.find_all('pre') summarydir = self.args.indir[:self.args.indir.rfind( '/' )] # one directoy up from <indir>, which has the detailed per-sequence files summary_fname = glob.glob(summarydir + '/1_Summary_*.txt') assert len(summary_fname) == 1 summary_fname = summary_fname[0] get_genes_to_skip(summary_fname, self.germline_seqs) n_failed, n_skipped, n_total, n_not_found, n_found = 0, 0, 0, 0, 0 for unique_id in self.seqinfo: if self.args.debug: print unique_id, imgtinfo = [] # print 'true' # utils.print_reco_event(self.germline_seqs, self.seqinfo[unique_id]) if self.args.infname != None and '.html' in self.args.infname: for pre in paragraphs: # NOTE this loops over everything an awful lot of times. 
Shouldn't really matter for now, though if unique_id in pre.text: imgtinfo.append(pre.text) else: n_total += 1 assert self.args.infname == None infnames = glob.glob(self.args.indir + '/' + unique_id + '*') assert len(infnames) <= 1 if len(infnames) != 1: if self.args.debug: print ' couldn\'t find it' n_not_found += 1 continue n_found += 1 with opener('r')(infnames[0]) as infile: full_text = infile.read() if len( re.findall('[123]. Alignment for [VDJ]-GENE', full_text)) < 3: failregions = re.findall( 'No [VDJ]-GENE has been identified', full_text) if self.args.debug and len(failregions) > 0: print ' ', failregions n_failed += 1 continue # loop over the paragraphs I want position = full_text.find(unique_id) # don't need this one for ir in range(4): position = full_text.find(unique_id, position + 1) pgraph = full_text[position:full_text. find('\n\n', position + 1)] if 'insertion(s) and/or deletion(s) which are not dealt in this release' in pgraph: ir -= 1 continue imgtinfo.append(pgraph) # query seq paragraph if len(imgtinfo) == 0: print '%s no info' % unique_id continue else: if self.args.debug: print '' line = self.parse_query_text(unique_id, imgtinfo) if 'skip_gene' in line: # assert self.args.skip_missing_genes n_skipped += 1 continue try: assert 'failed' not in line joinparser.add_insertions(line, debug=self.args.debug) joinparser.resolve_overlapping_matches( line, debug=False, germlines=self.germline_seqs) except (AssertionError, KeyError): print ' giving up' n_failed += 1 perfplotter.add_partial_fail(self.seqinfo[unique_id], line) # print ' perfplotter: not sure what to do with a fail' continue perfplotter.evaluate(self.seqinfo[unique_id], line) if self.args.debug: utils.print_reco_event(self.germline_seqs, self.seqinfo[unique_id], label='true:') utils.print_reco_event(self.germline_seqs, line, label='inferred:') perfplotter.plot() print 'failed: %d / %d = %f' % (n_failed, n_total, float(n_failed) / n_total) print 'skipped: %d / %d = %f' % (n_skipped, n_total, 
float(n_skipped) / n_total) print ' ', for g, n in genes_actually_skipped.items(): print ' %d %s' % (n, utils.color_gene(g)) print '' if n_not_found > 0: print ' not found: %d / %d = %f' % (n_not_found, n_not_found + n_found, n_not_found / float(n_not_found + n_found))
def __init__(self, args): self.args = args self.germline_seqs = utils.read_germlines(self.args.datadir, remove_N_nukes=True) self.perfplotter = PerformancePlotter(self.germline_seqs, self.args.plotdir, 'igblast') self.n_total, self.n_partially_failed = 0, 0 # get sequence info that was passed to igblast self.seqinfo = {} with opener('r')(self.args.simfname) as simfile: reader = csv.DictReader(simfile) iline = 0 for line in reader: if self.args.n_max_queries > 0 and iline >= self.args.n_max_queries: break iline += 1 if self.args.queries != None and int( line['unique_id']) not in self.args.queries: continue if len(re.findall('_[FP]', line['j_gene'])) > 0: line['j_gene'] = line['j_gene'].replace( re.findall('_[FP]', line['j_gene'])[0], '') self.seqinfo[int(line['unique_id'])] = line paragraphs = None print 'reading', self.args.infname info = {} with opener('r')(self.args.infname) as infile: line = infile.readline() # first find the start of the next query's section while line.find('<b>Query=') != 0: line = infile.readline() # then keep going till eof iquery = 0 while line != '': if self.args.n_max_queries > 0 and iquery >= self.args.n_max_queries: break # first find the query name query_name = int(line.split()[1]) # and collect the lines for this query query_lines = [] line = infile.readline() while line.find('<b>Query=') != 0: query_lines.append(line.strip()) line = infile.readline() if line == '': break iquery += 1 # then see if we want this query if self.args.queries != None and query_name not in self.args.queries: continue if query_name not in self.seqinfo: print 'ERROR %d not in reco info' % query_name sys.exit() if self.args.debug: print query_name # and finally add the query to <info[query_name]> info[query_name] = {'unique_id': query_name} self.n_total += 1 self.process_query(info[query_name], query_name, query_lines) self.perfplotter.plot() print 'partially failed: %d / %d = %f' % ( self.n_partially_failed, self.n_total, float(self.n_partially_failed) / 
self.n_total)
# ----------------------------------------------------------------------------------------
# Command line handling. If --dirname isn't given, the output directory is the input file name with its suffix removed.
parser = argparse.ArgumentParser()
parser.add_argument('ighv_fname', help='input germline v set (presumably a new one), in fasta')
parser.add_argument('--dirname', help='directory name for output (if not specified, we use <infname> with suffix removed)')
parser.add_argument('--reference-dir', default='data/imgt', help='directory with reference/old germline sets')
args = parser.parse_args()
if args.dirname is None:
    # NOTE this used to go through os.path.os.path, which only works because the os.path module happens to import os
    args.dirname = os.path.splitext(args.ighv_fname)[0]

files_to_copy = ['ighd.fasta', 'ighj.fasta', 'j_tryp.csv']
unaligned_fname = 'ighv.fasta'
aligned_fname = 'ighv-aligned.fasta'

# ----------------------------------------------------------------------------------------
# figure out which v genes we need to align
old_aligned_genes = utils.read_germlines(args.reference_dir, only_region='v', aligned=True)
all_new_genes = utils.read_germlines(args.dirname, only_region='v')  # all genes in ighv_fname, not just the new ones
genes_without_alignments = {}
for gene, seq in all_new_genes['v'].items():
    if gene not in old_aligned_genes['v']:
        genes_without_alignments[gene] = seq

# NOTE these steps are currently switched off
# clean_dir()
# shutil.copyfile(args.ighv_fname, args.dirname + '/' + unaligned_fname)
# if len(genes_without_alignments) > 0:
#     align_new_genes(old_aligned_genes['v'], genes_without_alignments, all_new_genes['v'])
# for fname in files_to_copy:
#     shutil.copyfile(args.reference_dir + '/' + fname, args.dirname + '/' + fname)

known_cyst_positions = utils.read_cyst_positions(args.reference_dir)
write_cyst_file(known_cyst_positions)
#This script takes in the inferences for gene locations from project Mixcr in the form of a text file and outputs a directory containing the results in both table and histogram form.

#----------------------------
#Import relevant packages
from performanceplotter import PerformancePlotter
import csv
import utils

#----------------------------
#Ask the user where everything is (just hitting enter selects the default)
germlineDirectory = raw_input('Enter the path of the germline sequences): ') or 'data/imgt'
originalInputFile = raw_input('Enter the path of the original input file into mixcr): ') or 'simu-10-leaves-1-mutate.csv'
mixcrOutput = raw_input('Enter the path of the output from mixcr: ') or 'edited_output_file.txt'
mixcrPlotDir = 'mixcrPlotDir'

#----------------------------
#Read the germline sequences from the directory chosen above
germline_seqs = utils.read_germlines(germlineDirectory)

#Make the performance plotter
#NOTE(review): <mixcrPlotDir> isn't used in the part of the script that's visible here -- confirm whether it's supposed to be passed to the plotter
perfplotter = PerformancePlotter(germline_seqs, 'mixcr')

#trueDictionary: the correct locations, taken from the original simulated data file
#iDictionary: the inferences of those locations from Mixcr
trueDictionary = {}
iDictionary = {}

#Step through the simulation file (comma-separated) and the mixcr output (tab-separated) in lockstep
with open(originalInputFile) as inFile1, open(mixcrOutput) as inFile2:
    reader1 = csv.DictReader(inFile1)
    reader2 = csv.DictReader(inFile2, delimiter='\t')
    for row1, row2 in zip(reader1, reader2):
        unique_id = row1['unique_id']
        #print unique_id
        trueDictionary[unique_id] = {}
def __init__(self, args): self.args = args self.germline_seqs = utils.read_germlines(self.args.datadir, remove_N_nukes=True) self.perfplotter = PerformancePlotter(self.germline_seqs, self.args.plotdir, 'igblast') self.n_total, self.n_partially_failed, self.n_skipped = 0, 0, 0 # get sequence info that was passed to igblast self.seqinfo = {} with opener('r')(self.args.simfname) as simfile: reader = csv.DictReader(simfile) iline = 0 for line in reader: if self.args.n_queries > 0 and iline >= self.args.n_queries: break iline += 1 if self.args.queries != None and int(line['unique_id']) not in self.args.queries: continue if len(re.findall('_[FP]', line['j_gene'])) > 0: line['j_gene'] = line['j_gene'].replace(re.findall('_[FP]', line['j_gene'])[0], '') self.seqinfo[int(line['unique_id'])] = line print 'reading', self.args.infname get_genes_to_skip(self.args.infname, self.germline_seqs, method='igblast', debug=False) paragraphs = None info = {} with opener('r')(self.args.infname) as infile: line = infile.readline() # first find the start of the next query's section while line.find('<b>Query=') != 0: line = infile.readline() # then keep going till eof iquery = 0 while line != '': if self.args.n_queries > 0 and iquery >= self.args.n_queries: break # first find the query name query_name = int(line.split()[1]) # and collect the lines for this query query_lines = [] line = infile.readline() while line.find('<b>Query=') != 0: query_lines.append(line.strip()) line = infile.readline() if line == '': break iquery += 1 # then see if we want this query if self.args.queries != None and query_name not in self.args.queries: continue if query_name not in self.seqinfo: print 'ERROR %d not in reco info' % query_name sys.exit() if self.args.debug: print query_name # and finally add the query to <info[query_name]> info[query_name] = {'unique_id':query_name} self.n_total += 1 self.process_query(info[query_name], query_name, query_lines) self.perfplotter.plot() print 'partially failed: %d / %d 
= %f' % (self.n_partially_failed, self.n_total, float(self.n_partially_failed) / self.n_total) print 'skipped: %d / %d = %f' % (self.n_skipped, self.n_total, float(self.n_skipped) / self.n_total) for g, n in genes_actually_skipped.items(): print ' %d %s' % (n, utils.color_gene(g))
# for human in A B C; do # datadir=data/human-beings/$human/M/data # bzgrep -m100 . $datadir/data.tsv.bz2 | sed 's/[ \t][ \t]*/,/g'|cut -f2 -d, |sed 's/nucleotide/seq/'> $datadir/head-data.csv # done naivety = 'M' infname = '' if data_type == 'simu': infname = '/home/dralph/Dropbox/work/recombinator/output/' + human + '/' + naivety + '/simu.csv' else: infname = 'data/human-beings/' + human + '/' + naivety + '/' + data_type + '/head-data.csv' baseoutdir = 'data/human-beings/' + human + '/' + naivety + '/' + data_type print 'opening ',infname print ' output',baseoutdir with opener('r')(infname) as infile: germlines = utils.read_germlines('../../../recombinator') reader = csv.DictReader(infile) il = 0 for inline in reader: il += 1 print inline['seq'][-100:] # if len(inline['seq']) != 130: # assert 'simulated' in infname searcher = Searcher(inline['seq'][-100:], debug=False, n_matches_max=5) found_str = searcher.search() values['found_strings']['v'][0].append(found_str) # toss them in ['v'][0] -- doesn't really make sense, but they're fine anywhere if found_str != 'vjd': # skip the ones where we didn't find matches in this order (see freqs above). continue for region in utils.regions: for imatch in range(len(searcher.matches[region])): if imatch > 2:
def __init__(self, args, seed, sublabel=None, total_length_from_right=-1):
    """
    Set up the work dir and output file, then load the germline info, the per-person
    parameters and (when <args.naivety> is 'M') the mutation model parameters and trees.

    <seed>: passed on to the tree generator
    <sublabel>: if set, use a separate '<workdir>/recombinator-<sublabel>' work dir and write the output inside it
    <total_length_from_right>: measured from right edge of j, only write to file this much of the sequence

    Raises Exception if <args.parameter_dir> doesn't exist.
    """
    self.args = args
    if sublabel is not None:  # each subprocess needs a work dir of its own
        self.workdir = self.args.workdir + '/recombinator-' + sublabel
        self.outfname = self.workdir + '/' + os.path.basename(self.args.outfname)
    else:
        self.workdir = self.args.workdir + '/recombinator'
        self.outfname = self.args.outfname
    utils.prep_dir(self.workdir)

    if not os.path.exists(self.args.parameter_dir):
        raise Exception('ERROR ' + self.args.parameter_dir + ' d.n.e')

    # parameters that control recombination, erosion, and whatnot
    self.total_length_from_right = total_length_from_right  # our read lengths are 130 by this def'n a.t.m.
    self.all_seqs = {}  # all the Vs, all the Ds...
    self.index_keys = {}  # this is kind of hackey, but I suspect indexing my huge table of freqs with a tuple is better than a dict
    self.version_freq_table = {}  # list of the probabilities with which each VDJ combo appears in data
    self.treeinfo = []  # list of newick-formatted tree strings with region-specific branch info tacked at the end
    # one (initially empty) parameter dict for each region and model
    self.mute_models = {region: {model: {} for model in ['gtr', 'gamma']} for region in utils.regions}

    # first read info that doesn't depend on which person we're looking at
    self.all_seqs = utils.read_germlines(self.args.datadir)
    with opener('r')(self.args.datadir + '/v-meta.json') as meta_file:  # location of <begin> cysteine in each v region
        self.cyst_positions = json.load(meta_file)
    with opener('r')(self.args.datadir + '/j_tryp.csv') as tryp_file:  # location of <end> tryptophan in each j region (TGG)
        # WARNING: this doesn't filter out the header line
        self.tryp_positions = dict((row[0], row[1]) for row in csv.reader(tryp_file))

    # then read stuff that's specific to each person
    self.read_vdj_version_freqs(self.args.parameter_dir + '/' + utils.get_parameter_fname('all'))
    self.read_insertion_content()
    if self.args.naivety == 'M':  # read shm info if non-naive is requested
        # NOTE I'm not inferring the gtr parameters a.t.m., so I'm just (very wrongly) using the same ones for all individuals
        with opener('r')(self.args.gtrfname) as gtrfile:  # read gtr parameters
            for gtr_line in csv.DictReader(gtrfile):
                name_parts = gtr_line['parameter'].split('.')
                region = name_parts[0][3].lower()  # region letter is the fourth character of the first field
                assert region in ('v', 'd', 'j')
                model = name_parts[1].lower()
                parameter_name = name_parts[2]
                assert model in self.mute_models[region]
                self.mute_models[region][model][parameter_name] = gtr_line['value']
        # NOTE(review): tree generation is treated as part of the non-naive-only setup -- confirm
        tree_generator = treegenerator.TreeGenerator(args, self.args.parameter_dir, seed=seed)
        self.treefname = self.workdir + '/trees.tre'
        tree_generator.generate_trees(seed, self.treefname)
        with opener('r')(self.treefname) as treefile:  # read in the trees (and other info) that we just generated
            self.treeinfo = treefile.readlines()
        if not self.args.no_clean:
            os.remove(self.treefname)

    # start from a clean slate: remove a stale output file, or else make sure its directory exists
    if os.path.exists(self.outfname):
        os.remove(self.outfname)
    else:
        outdir = os.path.dirname(os.path.abspath(self.outfname))
        if not os.path.exists(outdir):
            os.makedirs(outdir)
import numpy from subprocess import check_call import itertools from collections import OrderedDict import utils import plotting # ---------------------------------------------------------------------------------------- datadir = 'data/imgt' xtitles = { 'indels' : 'fraction of positions indel\'d', 'subs' : 'substitution fraction' } glfo = {} glfo['seqs'] = utils.read_germlines(datadir) glfo['aligned-v-genes'] = utils.read_germlines(datadir, only_region='v', aligned=True) vgenes = glfo['aligned-v-genes']['v'].keys() pversions = OrderedDict() for vg in vgenes: pv = utils.primary_version(vg) if pv not in pversions: pversions[pv] = [] pversions[pv].append(vg) # remove primary versions that only have one gene for pv in pversions: if len(pversions[pv]) == 1: print 'removing single-gene pv %s' % pv del pversions[pv]