def __init__(self, args, base_plotdir, skip_boring_states=''):
    """Plot transitions and emissions for each hmm model yaml file, then make html for the plot dirs.

    Args:
        args: parsed arguments. Reads <args.hmmdir> (globbed for *.yaml files) and, if that is None, <args.infiles>.
        base_plotdir: directory under which one '<plot type>/plots' subdir per plot type is prepared.
        skip_boring_states: stored on self, not otherwise used here.

    Exits with the error message on stderr (and so with non-zero status) if there are no input files.
    """
    self.base_plotdir = base_plotdir
    self.skip_boring_states = skip_boring_states

    plot_types = ('transitions', 'emissions', 'pair-emissions')
    for ptype in plot_types:
        utils.prep_dir(self.base_plotdir + '/' + ptype + '/plots', '*.png')

    if args.hmmdir is not None:
        filelist = glob.glob(args.hmmdir + '/*.yaml')
    else:
        filelist = utils.get_arg_list(args.infiles)
    if not filelist:  # truthiness rather than len(), so this also covers <filelist> being None (presumably what get_arg_list() gives back for an unset --infiles -- confirm)
        sys.exit('ERROR zero files passed to modelplotter')  # passing the message to sys.exit() gives a failing exit status (a bare sys.exit() reports success)

    for infname in filelist:
        gene_name = os.path.basename(infname).replace('.yaml', '')  # the sanitized name, actually
        # # ----------------------------------------------------------------------------------------
        # if utils.get_region(gene_name) == 'v' and 'IGHV4-39_star_' not in gene_name:
        #     continue
        # # ----------------------------------------------------------------------------------------
        with open(infname) as infile:  # only need the file open while parsing
            model = yaml.load(infile)  # NOTE(review) yaml.load() with no explicit Loader can build arbitrary python objects: ok for model files we wrote ourselves, but consider yaml.safe_load() if they can come from elsewhere
        self.make_transition_plot(gene_name, model)
        self.make_emission_plot(gene_name, model)
        # self.make_pair_emission_plot(gene_name, model)

    for ptype in plot_types:
        check_call(['./bin/makeHtml', self.base_plotdir + '/' + ptype, '1', 'null', 'png'])
    check_call(['./bin/permissify-www', self.base_plotdir])
def __init__(self, args, base_plotdir, skip_boring_states=''):
    """Currently disabled: always raises, since the plotting still 'needs to be converted off root'.

    Everything below the raise is unreachable, and is kept as the starting point for that conversion. Once
    re-enabled it would plot transitions and emissions for each hmm model yaml file and then make html for the
    plot dirs.

    Args:
        args: parsed arguments. The disabled code reads <args.hmmdir> (globbed for *.yaml files) and, if that is None, <args.infiles>.
        base_plotdir: directory under which one '<plot type>/plots' subdir per plot type is prepared.
        skip_boring_states: stored on self, not otherwise used here.

    Raises:
        Exception: always.
    """
    raise Exception('needs to be converted off root')

    self.base_plotdir = base_plotdir
    self.skip_boring_states = skip_boring_states

    plot_types = ('transitions', 'emissions')
    for ptype in plot_types:
        utils.prep_dir(self.base_plotdir + '/' + ptype + '/plots', wildlings='*.png')

    if args.hmmdir is not None:
        filelist = glob.glob(args.hmmdir + '/*.yaml')
    else:
        filelist = utils.get_arg_list(args.infiles)
    if not filelist:  # truthiness rather than len(), so this also covers <filelist> being None (presumably what get_arg_list() gives back for an unset --infiles -- confirm)
        sys.exit('ERROR zero files passed to modelplotter')  # passing the message to sys.exit() gives a failing exit status (a bare sys.exit() reports success)

    for infname in filelist:
        gene_name = os.path.basename(infname).replace('.yaml', '')  # the sanitized name, actually
        # # ----------------------------------------------------------------------------------------
        # if utils.get_region(gene_name) == 'v' and 'IGHV4-39_star_' not in gene_name:
        #     continue
        # # ----------------------------------------------------------------------------------------
        with open(infname) as infile:  # only need the file open while parsing
            model = yaml.load(infile)  # NOTE(review) yaml.load() with no explicit Loader can build arbitrary python objects: ok for model files we wrote ourselves, but consider yaml.safe_load() if they can come from elsewhere
        self.make_transition_plot(gene_name, model)
        self.make_emission_plot(gene_name, model)

    for ptype in plot_types:
        check_call(['./bin/makeHtml', self.base_plotdir + '/' + ptype, '1', 'null', 'png'])
    check_call(['./bin/permissify-www', self.base_plotdir])
def run(input_file: str) -> None:
    """Feed the commands from <input_file> to a Submarine and then to an AimedSub, printing each one afterwards."""

    def _drive(vessel):
        # same two steps for either kind of sub: apply every command, then show the result
        vessel.process_cmds(commands)
        vessel.print()

    commands = get_arg_list(input_file)
    _drive(Submarine())
    _drive(AimedSub())
def process_gls_gen_args(args):  # well, also does stuff with non-gls-gen new allele args
    """Merge the snp/indel position and count arguments into the single list <args.new_allele_info>.

    Reads args.snp_positions, args.indel_positions, args.nsnp_list and args.nindel_list (and then deletes all
    four from <args>), plus args.sim_v_genes and args.generate_germline_set. Afterwards args.new_allele_info
    has one dict per new allele, with keys 'gene', 'snp-positions' and 'indel-positions'.

    Raises:
        Exception: if list lengths don't match, or if both positions and counts are given for one mutation type.
    """
    mtypes = ['snp', 'indel']
    positions = {mt : utils.get_arg_list(getattr(args, mt + '_positions')) for mt in mtypes}
    numbers = {mt : utils.get_arg_list(getattr(args, 'n' + mt + '_list'), intify=True) for mt in mtypes}
    # just to make sure you don't accidentally use them (should only use the new args.new_allele_info that gets created below)
    for stale_name in ('snp_positions', 'indel_positions', 'nsnp_list', 'nindel_list'):
        delattr(args, stale_name)

    n_new_alleles = None
    for mtype in mtypes:
        if positions[mtype] is not None:  # if specific positions were specified on the command line
            parsed = []
            for pos_str in positions[mtype]:  # NOTE I think I could switch this to utils.get_arg_list() with list_of_lists=True
                parsed.append([int(p) for p in pos_str.split(',')])
            positions[mtype] = parsed
            if len(parsed) != len(args.sim_v_genes):  # we shouldn't be able to get here unless args has .sim_v_genes
                raise Exception('--%s-positions %s and --sim-v-genes %s not the same length (%d vs %d)' % (mtype, parsed, args.sim_v_genes, len(parsed), len(args.sim_v_genes)))

        counts = numbers[mtype]
        if counts is not None:
            if not args.generate_germline_set and len(counts) != len(args.sim_v_genes):  # we shouldn't be able to get here unless args has .sim_v_genes
                raise Exception('--n%s-list %s and --sim-v-genes %s not the same length (%d vs %d)' % (mtype, counts, args.sim_v_genes, len(counts), len(args.sim_v_genes)))
            if positions[mtype] is not None:
                raise Exception('can\'t specify both --n%s-list and --%s-positions' % (mtype, mtype))
            positions[mtype] = [[None] * number for number in counts]  # the <None> tells glutils to choose a position at random

        if positions[mtype] is None:  # nothing at all specified for this mutation type
            continue
        if n_new_alleles is None:  # first mutation type with any info sets the number of new alleles
            n_new_alleles = len(positions[mtype])
        if len(positions[mtype]) != n_new_alleles:
            raise Exception('mismatched number of new alleles for %s' % ' vs '.join(mtypes))

    if n_new_alleles is None:
        n_new_alleles = 0
    for mtype in mtypes:
        if positions[mtype] is None:  # if it wasn't specified at all, i.e. we don't want to generate any new alleles
            positions[mtype] = [[] for _ in range(n_new_alleles)]

    new_allele_info = []
    for igene in range(n_new_alleles):
        new_allele_info.append({
            'gene' : None if args.generate_germline_set else args.sim_v_genes[igene],  # we shouldn't be able to get here unless args has .sim_v_genes
            'snp-positions' : positions['snp'][igene],
            'indel-positions' : positions['indel'][igene],
        })
    args.new_allele_info = new_allele_info
def __init__(self, args, base_plotdir):
    """Prepare the plot directories and work out which hmm model yaml files to plot.

    Args:
        args: parsed arguments. Reads <args.hmmdir> (globbed for *.yaml files) and, if that is None, <args.infiles>.
        base_plotdir: directory under which one subdir per plot type is prepared.

    Raises:
        Exception: if there are no input files.
    """
    self.base_plotdir = base_plotdir
    self.eps_to_skip = 1e-3
    print('skipping eps %f' % self.eps_to_skip)  # single parenthesized argument, so the output is the same from the python 2 print statement and the python 3 print function

    for ptype in ('transitions', 'emissions'):
        utils.prep_dir(self.base_plotdir + '/' + ptype, wildlings=['*.png', '*.svg'])

    if args.hmmdir is not None:
        self.filelist = glob.glob(args.hmmdir + '/*.yaml')
    else:
        self.filelist = utils.get_arg_list(args.infiles)
    if not self.filelist:  # truthiness rather than len(), so this also covers the list being None (presumably what get_arg_list() gives back for an unset --infiles -- confirm)
        raise Exception('zero files passed to modelplotter')
def __init__(self, args, base_plotdir):
    """Prepare the plot directories, then make transition and emission plots for each hmm model yaml file.

    Args:
        args: parsed arguments. Reads <args.hmmdir> (globbed for *.yaml files) and, if that is None, <args.infiles>.
        base_plotdir: directory under which one subdir per plot type is prepared.

    Raises:
        Exception: if there are no input files.
    """
    self.base_plotdir = base_plotdir
    self.eps_to_skip = 1e-3
    print('skipping eps %f' % self.eps_to_skip)  # single parenthesized argument, so the output is the same from the python 2 print statement and the python 3 print function

    for ptype in ('transitions', 'emissions'):
        utils.prep_dir(self.base_plotdir + '/' + ptype, wildlings=['*.png', '*.svg'])

    if args.hmmdir is not None:
        filelist = glob.glob(args.hmmdir + '/*.yaml')
    else:
        filelist = utils.get_arg_list(args.infiles)
    if not filelist:  # truthiness rather than len(), so this also covers <filelist> being None (presumably what get_arg_list() gives back for an unset --infiles -- confirm)
        raise Exception('zero files passed to modelplotter')

    for infname in filelist:
        gene_name = os.path.basename(infname).replace('.yaml', '')  # the sanitized name, actually
        with open(infname) as infile:  # only need the file open while parsing
            model = yaml.load(infile)  # NOTE(review) yaml.load() with no explicit Loader can build arbitrary python objects: ok for model files we wrote ourselves, but consider yaml.safe_load() if they can come from elsewhere
        self.make_transition_plot(gene_name, model)
        self.make_emission_plot(gene_name, model)
def __init__(self, args, base_plotdir):
    """Prepare the plot directories, then make transition and emission plots for each hmm model yaml file.

    Args:
        args: parsed arguments. Reads <args.hmmdir> (globbed for *.yaml files) and, if that is None, <args.infiles>.
        base_plotdir: directory under which one subdir per plot type is prepared.

    Raises:
        Exception: if there are no input files.
    """
    self.base_plotdir = base_plotdir
    self.eps_to_skip = 1e-3
    print('skipping eps %f' % self.eps_to_skip)  # single parenthesized argument, so the output is the same from the python 2 print statement and the python 3 print function

    for ptype in ('transitions', 'emissions'):
        utils.prep_dir(self.base_plotdir + '/' + ptype, wildlings=['*.png', '*.svg'])

    if args.hmmdir is not None:
        filelist = glob.glob(args.hmmdir + '/*.yaml')
    else:
        filelist = utils.get_arg_list(args.infiles)
    if not filelist:  # truthiness rather than len(), so this also covers <filelist> being None (presumably what get_arg_list() gives back for an unset --infiles -- confirm)
        raise Exception('zero files passed to modelplotter')

    for infname in filelist:
        gene_name = os.path.basename(infname).replace('.yaml', '')  # the sanitized name, actually
        with open(infname) as infile:  # only need the file open while parsing
            model = yaml.load(infile)  # NOTE(review) yaml.load() with no explicit Loader can build arbitrary python objects: ok for model files we wrote ourselves, but consider yaml.safe_load() if they can come from elsewhere
        self.make_transition_plot(gene_name, model)
        self.make_emission_plot(gene_name, model)
default='100:120', help= 'Times (reproductive rounds) at which to selection sequences for observation.' ) parser.add_argument('--carry-cap', type=int, default=1000, help='carrying capacity of germinal center') parser.add_argument( '--target-distance', type=int, default=15, help= 'Desired distance (number of non-synonymous mutations) between the naive sequence and the target sequences.' ) parser.add_argument('--target-count', type=int, default=10, help='Number of target sequences to generate.') parser.add_argument('--branching-parameter', type=float, default=2., help='') parser.add_argument('--base-mutation-rate', type=float, default=0.365, help='') parser.add_argument('--lb-tau', type=float, default=0.4, help='') args = parser.parse_args() args.obs_times = utils.get_arg_list(args.obs_times, intify=True) # ---------------------------------------------------------------------------------------- simulate() partition()
def process(args):
    """Validate, normalize and cross-check the parsed command line arguments, modifying <args> in place.

    In order: renames deprecated actions/options, converts colon-separated string arguments to lists with
    utils.get_arg_list(), derives a few per-region dicts, checks that mutually dependent or mutually exclusive
    options are consistent, and fills in defaults that depend on other arguments (e.g. workdir, and the
    simulation parameter dirs). Later steps read values set by earlier ones, so statement order matters.

    Raises:
        Exception: for any invalid or inconsistent combination of arguments.

    Calls sys.exit(0) after printing version info if the action is 'version'.

    NOTE(review): the nesting below was reconstructed from a whitespace-collapsed copy of this function, so
    indentation should be checked against the real file wherever it isn't forced by the syntax.
    """
    # ----------------------------------------------------------------------------------------
    # deprecated action and option names
    if args.action == 'run-viterbi':
        print' note: replacing deprecated action name \'run-viterbi\' with current name \'annotate\' (you don\'t need to change anything unless you want this warning message to go away)'
        args.action = 'annotate'
    if args.action == 'view-alternative-naive-seqs':
        print' note: replacing deprecated action name \'view-alternative-naive-seqs\' with current name \'view-alternative-annotations\' (you don\'t need to change anything unless you want this warning message to go away)'
        args.action = 'view-alternative-annotations'
    if args.calculate_alternative_naive_seqs:
        print ' note: replacing deprecated option \'--calculate-alternative-naive-seqs\' with new option \'--calculate-alternative-annotations\' (you don\'t need to change anything unless you want this warning message to go away)'
        args.calculate_alternative_annotations = True
        delattr(args, 'calculate_alternative_naive_seqs')

    if args.chain is not None:
        print ' note: transferring argument from deprecated option \'--chain %s\' to new option \'--locus %s\'' % (args.chain, 'ig' + args.chain)
        args.locus = 'ig' + args.chain
        args.chain = None
    # keep --locus and --loci consistent: whichever one was set determines the other
    args.loci = utils.get_arg_list(args.loci, choices=utils.loci)
    if args.loci is None:  # in principle I should check that at least one of 'em isn't None, but if that's the case it'll crash soon enough
        args.loci = [args.locus]
    else:
        args.locus = args.loci[0]

    # ----------------------------------------------------------------------------------------
    # convert string arguments to lists
    args.only_genes = utils.get_arg_list(args.only_genes)
    args.queries = utils.get_arg_list(args.queries)
    args.queries_to_include = utils.get_arg_list(args.queries_to_include)
    args.reco_ids = utils.get_arg_list(args.reco_ids)
    args.istartstop = utils.get_arg_list(args.istartstop, intify=True)
    if args.istartstop is not None:
        if args.istartstop[0] >= args.istartstop[1] or args.istartstop[0] < 0:
            raise Exception('invalid --istartstop specification: %d %d' % (args.istartstop[0], args.istartstop[1]))
    args.n_max_per_region = utils.get_arg_list(args.n_max_per_region, intify=True)
    if len(args.n_max_per_region) != 3:
        raise Exception('n-max-per-region should be of the form \'x:y:z\', but I got ' + str(args.n_max_per_region))
    args.write_additional_cluster_annotations = utils.get_arg_list(args.write_additional_cluster_annotations, intify=True)
    if args.write_additional_cluster_annotations is not None and len(args.write_additional_cluster_annotations) != 2:
        raise Exception('--write-additional-cluster-annotations must be specified as two numbers \'m:n\', but I got %s' % args.write_additional_cluster_annotations)
    args.extra_annotation_columns = utils.get_arg_list(args.extra_annotation_columns, choices=utils.extra_annotation_headers)

    args.cluster_indices = utils.get_arg_list(args.cluster_indices, intify=True)

    args.allowed_cdr3_lengths = utils.get_arg_list(args.allowed_cdr3_lengths, intify=True)

    # for each region, a two-entry list (5p then 3p): the exclusion length for ends listed in utils.real_erosions, zero for the others
    args.region_end_exclusions = {r : [args.region_end_exclusion_length if ('%s_%s' % (r, e)) in utils.real_erosions else 0 for e in ['5p', '3p']] for r in utils.regions}
    args.region_end_exclusion_length = None  # there isn't really a big reason to set it to None, but this makes clear that I should only be using the dict version

    args.typical_genes_per_region_per_subject = utils.get_arg_list(args.typical_genes_per_region_per_subject, intify=True)
    if len(args.typical_genes_per_region_per_subject) != len(utils.regions):
        raise Exception('wrong length for --typical-genes-per-region-per-subject, has to be three')
    tmpfrac, ntmp = args.min_allele_prevalence_fraction, args.typical_genes_per_region_per_subject
    # scale the single fraction by (typical number of v genes) / (typical number of genes for region r)
    args.min_allele_prevalence_fractions = {r : tmpfrac * ntmp[utils.regions.index('v')] / ntmp[utils.regions.index(r)] for r in utils.regions}
    delattr(args, 'min_allele_prevalence_fraction')  # delete the non-plural version
    delattr(args, 'typical_genes_per_region_per_subject')  # and we don't need this any more either

    args.annotation_clustering_thresholds = utils.get_arg_list(args.annotation_clustering_thresholds, floatify=True)
    args.naive_hamming_bounds = utils.get_arg_list(args.naive_hamming_bounds, floatify=True)
    if args.small_clusters_to_ignore is not None:
        if '-' in args.small_clusters_to_ignore:  # 'lo-hi' form: an inclusive range of cluster sizes
            lo, hi = [int(cluster_size) for cluster_size in args.small_clusters_to_ignore.split('-')]
            args.small_clusters_to_ignore = range(lo, hi + 1)
        else:
            args.small_clusters_to_ignore = utils.get_arg_list(args.small_clusters_to_ignore, intify=True)

    # ----------------------------------------------------------------------------------------
    # seed sequence
    if args.seed_unique_id is not None:
        args.seed_unique_id = args.seed_unique_id.strip()  # protect against the space you may put in front of it if it's got an initial minus sign (better way is to use an equals sign)
        if args.queries is not None and args.seed_unique_id not in args.queries:
            raise Exception('seed uid %s not in --queries %s' % (args.seed_unique_id, ' '.join(args.queries)))
        if args.random_seed_seq:
            raise Exception('can\'t specify both --seed-unique-id and --random-seed-seq')

        if args.queries_to_include is None:  # make sure the seed is in --queries-to-include
            args.queries_to_include = [args.seed_unique_id]
        elif args.seed_unique_id not in args.queries_to_include:
            args.queries_to_include = [args.seed_unique_id] + args.queries_to_include  # may as well put it first, I guess (?)
    elif args.seed_seq is not None:  # a seed sequence was given directly, so give it a placeholder uid
        args.seed_unique_id = 'seed-seq'

    if args.sw_debug is None:  # if not explicitly set, set equal to regular debug
        args.sw_debug = args.debug

    if args.only_genes is not None:
        for gene in args.only_genes:  # make sure they're all at least valid ig genes
            utils.split_gene(gene)

    # ----------------------------------------------------------------------------------------
    # version info
    if args.print_git_commit or args.action == 'version':
        print ' commit: %s' % subprocess.check_output(['git', 'rev-parse', 'HEAD']).strip()
        cmd = 'git describe --always --tags'
        out, err = utils.simplerun(cmd, return_out_err=True, debug=False)
        if '-' in out:
            if out.count('-') == 2:  # expect three dash-separated fields: tag, number of commits ahead, abbreviated hash
                tag, n_ahead, commit_hash_abbrev = out.strip().split('-')
                ahead_str = ''
                if int(n_ahead) > 0:
                    ahead_str = ' (well, %d commits ahead of)' % int(n_ahead)
                print ' tag: %s%s' % (tag, ahead_str)
            else:
                print ' couldn\'t figure out tag from \'%s\' output: %s' % (cmd, out)
        else:
            tag = out.strip()
            print ' tag: %s' % tag

        if args.action == 'version':  # nothing else to do if all we wanted was the version
            sys.exit(0)

    args.is_data = not args.is_simu  # whole code base uses is_data, this is better than changing all of that

    # ----------------------------------------------------------------------------------------
    # simultaneous sequence options
    if args.simultaneous_true_clonal_seqs:
        if args.is_data:
            raise Exception('can only pass true clonal families to multi-hmm together on simulation and with --is-simu set')
        if args.n_simultaneous_seqs is not None:
            raise Exception('can\'t specify both --n-simultaneous-seqs and --simultaneous-true-clonal-seqs')
        if args.all_seqs_simultaneous:
            raise Exception('can\'t specify both --all-seqs-simultaneous and --simultaneous-true-clonal-seqs')
    if args.n_simultaneous_seqs is not None and args.all_seqs_simultaneous:
        raise Exception('doesn\'t make sense to set both --n-simultaneous-seqs and --all-seqs-simultaneous.')

    # ----------------------------------------------------------------------------------------
    # indels
    if args.no_indels:
        print 'forcing --gap-open-penalty to %d to prevent indels, since --no-indels was specified (you can also adjust this penalty directly)' % args.no_indel_gap_open_penalty
        args.gap_open_penalty = args.no_indel_gap_open_penalty

    if args.indel_frequency > 0.:
        # NOTE(review) since this is nested under the '> 0.' check, the '< 0.' half of the condition can never be true, so negative values of --indel-frequency are not rejected
        if args.indel_frequency < 0. or args.indel_frequency > 1.:
            raise Exception('--indel-frequency must be in [0., 1.] (got %f)' % args.indel_frequency)
    # NOTE(review) the next two statements are taken to be outside the indel frequency check above: confirm against the real file
    args.n_indels_per_indeld_seq = utils.get_arg_list(args.n_indels_per_indeld_seq, intify=True)
    if args.indel_location not in [None, 'v', 'cdr3']:
        if int(args.indel_location) in range(500):  # a single integer position (note that int() will itself raise ValueError on a non-numeric string)
            args.indel_location = int(args.indel_location)
            if any(n > 1 for n in args.n_indels_per_indeld_seq):
                print ' note: removing entries from --n-indels-per-indeld-seq (%s), since --indel-location was set to a single position.' % [n for n in args.n_indels_per_indeld_seq if n > 1]
                args.n_indels_per_indeld_seq = [n for n in args.n_indels_per_indeld_seq if n <= 1]
        else:
            raise Exception('--indel-location \'%s\' neither one of None, \'v\' or \'cdr3\', nor an integer less than 500' % args.indel_location)

    if 'tr' in args.locus and args.mutation_multiplier is None:
        args.mutation_multiplier = 0.

    # ----------------------------------------------------------------------------------------
    # workdir and batch system
    if args.workdir is None:  # set default here so we know whether it was set by hand or not
        args.workdir = get_workdir(args.batch_system)
    else:
        args.workdir = args.workdir.rstrip('/')
    if os.path.exists(args.workdir):
        raise Exception('workdir %s already exists' % args.workdir)

    if args.batch_system == 'sge' and args.batch_options is not None:
        if '-e' in args.batch_options or '-o' in args.batch_options:
            print '%s --batch-options contains \'-e\' or \'-o\', but we add these automatically since we need to be able to parse each job\'s stdout and stderr. You can control the directory under which they\'re written with --workdir (which is currently %s).' % (utils.color('red', 'warning'), args.workdir)

    # ----------------------------------------------------------------------------------------
    # output files
    if args.outfname is not None and not args.presto_output and not args.airr_output:
        if utils.getsuffix(args.outfname) not in ['.csv', '.yaml']:
            raise Exception('unhandled --outfname suffix %s' % utils.getsuffix(args.outfname))
        if utils.getsuffix(args.outfname) != '.yaml':
            print ' %s --outfname uses deprecated file format %s. This will still mostly work ok, but the new default .yaml format doesn\'t have to do all the string conversions by hand (so is less buggy), and includes annotations, partitions, and germline info in the same file (so you don\'t get crashes or inconsistent results if you don\'t keep track of what germline info goes with what output file).' % (utils.color('yellow', 'note:'), utils.getsuffix(args.outfname))
        if args.action in ['view-annotations', 'view-partitions'] and utils.getsuffix(args.outfname) == '.yaml':
            raise Exception('have to use \'view-output\' action to view .yaml output files')

    if args.presto_output:
        if args.outfname is None:
            raise Exception('have to set --outfname if --presto-output is set')
        if args.action == 'annotate' and utils.getsuffix(args.outfname) != '.tsv':
            raise Exception('--outfname suffix has to be .tsv for annotation with --presto-output (got %s)' % utils.getsuffix(args.outfname))
        if args.action == 'partition' and utils.getsuffix(args.outfname) not in ['.fa', '.fasta']:
            raise Exception('--outfname suffix has to be .fa or .fasta for partitioning with --presto-output (got %s)' % utils.getsuffix(args.outfname))
        if args.aligned_germline_fname is None:
            args.aligned_germline_fname = '%s/%s/imgt-alignments/%s.fa' % (args.default_initial_germline_dir, args.species, args.locus)
        if not os.path.exists(args.aligned_germline_fname):
            raise Exception('--aligned-germline-fname %s doesn\'t exist, but we need it in order to write presto output' % args.aligned_germline_fname)

    if args.airr_output:
        if args.outfname is None:
            raise Exception('have to set --outfname if --airr-output is set')
        if utils.getsuffix(args.outfname) != '.tsv':
            raise Exception('--outfname suffix has to be .tsv if --airr-output is set (got %s)' % utils.getsuffix(args.outfname))
    if args.airr_input:
        args.seq_column = 'sequence'
        args.name_column = 'sequence_id'

    if args.cluster_annotation_fname is None and args.outfname is not None and utils.getsuffix(args.outfname) == '.csv':  # if it wasn't set on the command line (<outfname> _was_ set), _and_ if we were asked for a csv, then use the old file name format
        args.cluster_annotation_fname = utils.insert_before_suffix('-cluster-annotations', args.outfname)

    if args.calculate_alternative_annotations and args.outfname is None:
        raise Exception('have to specify --outfname in order to calculate alternative annotations')
    if args.action == 'view-alternative-annotations' and args.persistent_cachefname is None:  # handle existing old-style output
        assert args.outfname is not None
        if os.path.exists(utils.getprefix(args.outfname) + '-hmm-cache.csv'):
            args.persistent_cachefname = utils.getprefix(args.outfname) + '-hmm-cache.csv'  # written by bcrham, so has to be csv, not yaml

    # ----------------------------------------------------------------------------------------
    # plotting
    if args.plot_performance:
        print '%s encountered deprecated argument --plot-performance, moving value to --plot-annotation-performance' % utils.color('yellow', 'warning')
        args.plot_annotation_performance = True
    if args.plot_annotation_performance:
        if args.plotdir is None:
            raise Exception('can\'t plot performance unless --plotdir is specified')
        if not args.is_simu:
            raise Exception('can\'t plot performance unless --is-simu is set')
    if args.action == 'plot-partitions' and args.plotdir is None:
        raise Exception('--plotdir must be specified for plot-partitions')

    if args.make_per_gene_per_base_plots and not args.make_per_gene_plots:  # the former doesn't do anything unless the latter is turned on
        args.make_per_gene_plots = True

    if args.parameter_type != 'hmm':
        print ' using non-default parameter type \'%s\'' % args.parameter_type

    # ----------------------------------------------------------------------------------------
    # simulation
    if args.simulate_from_scratch:
        args.rearrange_from_scratch = True
        args.mutate_from_scratch = True
    if args.flat_mute_freq or args.same_mute_freq_for_all_seqs:
        assert args.mutate_from_scratch

    if args.action == 'simulate':
        if len(args.loci) != 1:
            raise Exception('needs to be implemented')
        if args.batch_system is not None and args.n_procs > 1 and not args.subsimproc:
            print ' %s setting subsimproc' % utils.color('red', 'warning')
            args.subsimproc = True
        if args.n_trees is None:
            args.n_trees = max(1, int(float(args.n_sim_events) / args.n_procs))
        if args.outfname is None:
            print ' note: no --outfname specified, so nothing will be written to disk'
            args.outfname = get_dummy_outfname(args.workdir)  # hackey, but otherwise I have to rewrite the whole run_simulation() in bin/partis to handle None type outfname
        if args.n_max_queries != -1:
            print ' note: --n-max-queries is not used when simulating (use --n-sim-events to set the simulated number of rearrangemt events)'

        # end result of this block: shm/reco parameter dirs are set (unless we're doing their bit from scratch), --parameter-dir is set to None (and if --parameter-dir was set but shm/reco were _not_ set, we've just used --parameter-dir for either/both as needed)
        if args.parameter_dir is not None:
            if args.rearrange_from_scratch or args.mutate_from_scratch:
                raise Exception('can\'t set --parameter-dir if rearranging or mutating from scratch (use --reco-parameter-dir and/or --shm-parameter-dir)')
            if args.reco_parameter_dir is not None or args.shm_parameter_dir is not None:
                raise Exception('can\'t set --parameter-dir if either --reco-parameter-dir or --shm-parameter-dir are also set')
            args.reco_parameter_dir = args.parameter_dir
            args.shm_parameter_dir = args.parameter_dir
            args.parameter_dir = None

        if args.rearrange_from_scratch and args.reco_parameter_dir is not None:
            raise Exception('doesn\'t make sense to set both --rearrange-from-scratch and --reco-parameter-dir')
        if args.mutate_from_scratch and args.shm_parameter_dir is not None:
            raise Exception('doesn\'t make sense to set both --mutate-from-scratch and --shm-parameter-dir')
        if args.reco_parameter_dir is None and not args.rearrange_from_scratch:
            raise Exception('have to either set --rearrange-from-scratch or --reco-parameter-dir')
        if args.shm_parameter_dir is None and not args.mutate_from_scratch:
            raise Exception('have to either set --mutate-from-scratch or --shm-parameter-dir')

        if args.generate_germline_set and not args.rearrange_from_scratch:
            raise Exception('can only --generate-germline-set if also rearranging from scratch (set --rearrange-from-scratch)')

        if args.generate_germline_set:
            args.snp_positions = None  # if you want to control the exact positions, you have to use bin/test-germline-inference.py
            args.indel_positions = None
            process_gls_gen_args(args)

    # ----------------------------------------------------------------------------------------
    # parameter and germline dirs
    if args.parameter_dir is not None:
        args.parameter_dir = args.parameter_dir.rstrip('/')
        # if the dir exists, at least one of its entries has to be named after a parameter type
        if os.path.exists(args.parameter_dir) and len(set(os.listdir(args.parameter_dir)) & set(parameter_type_choices)) == 0:
            raise Exception('couldn\'t find any expected parameter types (i.e. subdirs) in --parameter-dir \'%s\'. Allowed types: %s, found: %s. Maybe you added the parameter type to the parameter dir path?' % (args.parameter_dir, ' '.join(parameter_type_choices), ' '.join(os.listdir(args.parameter_dir))))

    if os.path.exists(args.default_initial_germline_dir + '/' + args.species):  # ick that is hackey
        args.default_initial_germline_dir += '/' + args.species

    if args.species != 'human' and not args.allele_cluster:
        print ' non-human species \'%s\', turning on allele clustering' % args.species
        args.allele_cluster = True

    if args.n_max_snps is not None and args.n_max_mutations_per_segment is not None:
        if args.n_max_snps > args.n_max_mutations_per_segment - 10:
            raise Exception('--n-max-snps should be at least ten less than --n-max-mutations-per-segment, but I got %d and %d' % (args.n_max_snps, args.n_max_mutations_per_segment))

    if args.leave_default_germline:  # turn off all three germline set modifications
        args.dont_remove_unlikely_alleles = True
        args.allele_cluster = False
        args.dont_find_new_alleles = True

    # ----------------------------------------------------------------------------------------
    # input file and linearham
    if args.infname is None and args.action not in ['simulate', 'view-output', 'view-annotations', 'view-partitions', 'view-cluster-annotations', 'plot-partitions', 'view-alternative-annotations', 'get-tree-metrics', 'get-linearham-info']:
        raise Exception('--infname is required for action \'%s\'' % args.action)

    if args.action == 'get-linearham-info':
        if args.linearham_info_fname is None:  # for some reason setting required=True isn't working
            raise Exception('have to specify --linearham-info-fname')
        if args.sw_cachefname is None and args.parameter_dir is None:
            raise Exception('have to specify --sw-cachefname or --parameter-dir, since we need sw info to calculate linearham inputs')
        if args.extra_annotation_columns is None or 'linearham-info' not in args.extra_annotation_columns:
            args.extra_annotation_columns = utils.add_lists(args.extra_annotation_columns, ['linearham-info'])
# remaining command line arguments (<parser> is set up before this point)
parser.add_argument('--plot-annotation-performance', action='store_true', help='see bin/partis --help')
parser.add_argument('--methods', default='simu:partis', help='colon-separated list of methods to run. By default runs simulation, and then partis inference (igdiscover and tigger, if installed, are the other options)')
parser.add_argument('--outdir', default=utils.fsdir() + '/partis/allele-finder')
parser.add_argument('--inf-glfo-dir', help='default set below')
parser.add_argument('--simfname', help='default set below')
parser.add_argument('--workdir', default=utils.fsdir() + '/_tmp/hmms/' + str(random.randint(0, 999999)))
parser.add_argument('--n-tests', type=int, help='instead of just running once, run <N> independent tests simultaneously')
parser.add_argument('--iteststart', type=int, default=0, help='for use with --n-tests, if you want to add more tests on')
parser.add_argument('--plot-and-fit-absolutely-everything', type=int, help='fit every single position for this <istart> and write every single corresponding plot (slow as hell, and only for debugging/making plots for paper)')
parser.add_argument('--partis-path', default='./bin/partis')
parser.add_argument('--species', default='human', choices=('human', 'macaque'))
parser.add_argument('--locus', default='igh')

args = parser.parse_args()
# explicit raise rather than an assert, since asserts are skipped when python is run with -O, and this is checking user input
if args.locus != 'igh':  # would just need to update some things, e.g. propagate through to the various methods
    raise Exception('--locus has to be \'igh\' (got \'%s\')' % args.locus)

args.methods = utils.get_arg_list(args.methods)
available_methods = set(['simu', 'partis', 'full', 'tigger-default', 'tigger-tuned', 'igdiscover'])
unexpected_methods = set(args.methods) - available_methods
if unexpected_methods:
    raise Exception('unexpected --methods: %s' % ' '.join(unexpected_methods))

# args.default_germline_dir = 'old-glfo/%s' % args.species  # 'data/germlines/%s' % args.species  # NOTE gad damnit, I just deleted old-glfo, had no idea what it was for
print(' %s hopefully old-glfo/ isn\'t needed to recreate old results (see comment)' % utils.color('yellow', 'note:'))  # single parenthesized argument, so the output is the same from the python 2 print statement and the python 3 print function
args.default_germline_dir = 'data/germlines/%s' % args.species  # 'data/germlines/%s' % args.species

args.generate_germline_set = args.gls_gen  # for compatibility with bin/partis (i.e. so they can both use the fcn in processargs, but I don't have to rewrite either)
args.mut_mult = args.mutation_multiplier  # for compatibility with bin/partis (i.e. so they can both use the fcn in processargs, but I don't have to rewrite either)
if args.generate_germline_set:  # if we're generating/inferring a whole germline set these are either set automatically or not used
    # NOTE(review) all five statements are taken to belong to this block (the source this was reconstructed from had lost its indentation): confirm
    delattr(args, 'sim_v_genes')
    delattr(args, 'inf_v_genes')
    delattr(args, 'dj_genes')
    args.allele_prevalence_freqs = None
    args.inf_glfo_dir = None
# remaining command line arguments (<parser> is set up before this point)
parser.add_argument('--scale-errors')
parser.add_argument('--rebin', type=int)
parser.add_argument('--colors')
parser.add_argument('--linestyles')
parser.add_argument('--datadir', default='data/imgt')
parser.add_argument('--leaves-per-tree')
parser.add_argument('--linewidths')
parser.add_argument('--markersizes')
parser.add_argument('--calculate-mean-info', action='store_true')
parser.add_argument('--normalize', action='store_true')
parser.add_argument('--strings-to-ignore')  # remove this string from the plot names in each dir (e.g. '-mean-bins') NOTE replaces '_' with '-'

args = parser.parse_args()
if args.strings_to_ignore is not None:
    args.strings_to_ignore = args.strings_to_ignore.replace('_', '-')

# convert string arguments to lists (the translations map integer codes to color names and line styles)
args.plotdirs = utils.get_arg_list(args.plotdirs)
args.scale_errors = utils.get_arg_list(args.scale_errors)
args.colors = utils.get_arg_list(args.colors, intify=True, translation={810 : 'red', 634 : 'darkred', 596 : 'mediumblue', 418 : 'green', 798 : 'goldenrod', 869 : 'lightseagreen'})
args.linestyles = utils.get_arg_list(args.linestyles, intify=True, translation={1 : '-', 2 : '--'})
args.names = utils.get_arg_list(args.names)
args.leaves_per_tree = utils.get_arg_list(args.leaves_per_tree, intify=True)
args.strings_to_ignore = utils.get_arg_list(args.strings_to_ignore)
args.markersizes = utils.get_arg_list(args.markersizes, intify=True)
args.linewidths = utils.get_arg_list(args.linewidths, intify=True)

args.names = [name.replace('@', ' ') for name in args.names]  # '@' on the command line stands in for a space
# explicit raise rather than an assert, since asserts are skipped when python is run with -O, and this is checking user input
if len(args.plotdirs) != len(args.names):
    raise Exception('--plotdirs and --names have to be the same length (got %d and %d)' % (len(args.plotdirs), len(args.names)))

with opener('r')(args.datadir + '/v-meta.json') as json_file:  # get location of <begin> cysteine in each v region
    args.cyst_positions = json.load(json_file)
help= 'colon-separated list of allele prevalence frequencies, including newly-generated snpd genes (ordered alphabetically)' ) parser.add_argument( '--remove-template-genes', action='store_true', help='when generating snps, remove the original gene before simulation') parser.add_argument('--mut-mult', type=float) parser.add_argument('--slurm', action='store_true') parser.add_argument('--outdir', default=fsdir + '/partis/allele-finder') parser.add_argument('--workdir', default=fsdir + '/_tmp/hmms/' + str(random.randint(0, 999999))) parser.add_argument('--n-tests', type=int) args = parser.parse_args() args.dj_genes = utils.get_arg_list(args.dj_genes) args.sim_v_genes = utils.get_arg_list(args.sim_v_genes) args.inf_v_genes = utils.get_arg_list(args.inf_v_genes) args.snp_positions = utils.get_arg_list(args.snp_positions) args.nsnp_list = utils.get_arg_list(args.nsnp_list, intify=True) args.allele_prevalence_freqs = utils.get_arg_list(args.allele_prevalence_freqs, floatify=True) if args.snp_positions is not None: args.snp_positions = [[int(p) for p in pos_str.split(',')] for pos_str in args.snp_positions] if len(args.snp_positions) != len(args.sim_v_genes): raise Exception( '--snp-positions %s and --sim-v-genes %s not the same length (%d vs %d)' % (args.snp_positions, args.sim_v_genes, len( args.snp_positions), len(args.sim_v_genes))) if args.nsnp_list is not None:
info[region + '_gene'] = gene self.perfplotter.add_partial_fail(self.siminfo[unique_id], info) if self.args.debug: print '%-20s partial fail %s %s %s' % (unique_id, utils.color_gene(info['v_gene']) if 'v_gene' in info else '', utils.color_gene(info['d_gene']) if 'd_gene' in info else '', utils.color_gene(info['j_gene']) if 'j_gene' in info else ''), print ' (true %s %s %s)' % tuple([self.siminfo[unique_id][region + '_gene'] for region in utils.regions]) self.failtails[unique_id] = info self.n_partially_failed += 1 self.sim_need.remove(unique_id) return unique_ids # ---------------------------------------------------------------------------------------- if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('-b', action='store_true') # passed on to ROOT when plotting parser.add_argument('--n-max-queries', type=int, default=-1) parser.add_argument('--queries') parser.add_argument('--plotdir', required=True) parser.add_argument('--debug', type=int, default=0, choices=[0, 1, 2]) parser.add_argument('--datadir', default='data/imgt') args = parser.parse_args() args.queries = utils.get_arg_list(args.queries) args.indir = 'data/performance/ihhhmmm' args.simfname = 'data/performance/simu.csv' check_call(['tar', 'xzf', args.indir + '.tgz', '-C', 'data/performance/']) # untar the ihmmune-align output ihhhmmmparser = IhhhmmmParser(args)
# ----------------------------------------------------------------------------------------
# Command line: one positional <action> selecting which variable gets scanned, plus options.
parser = argparse.ArgumentParser()
parser.add_argument('action', choices=['mfreq', 'nsnp', 'multi-nsnp', 'prevalence', 'n-leaves'])
parser.add_argument('--v-genes', default='IGHV4-39*01')
parser.add_argument('--varvals')
parser.add_argument('--n-event-list', default='1000:2000:4000:8000')  # NOTE modified later for multi-nsnp
parser.add_argument('--n-tests', type=int, default=10)
parser.add_argument('--plot', action='store_true')
parser.add_argument('--no-slurm', action='store_true')
parser.add_argument('--label')
args = parser.parse_args()

# both of these arrive as single strings and are converted to lists
args.v_genes = utils.get_arg_list(args.v_genes)
args.n_event_list = utils.get_arg_list(args.n_event_list, intify=True)

# ----------------------------------------------------------------------------------------
# output goes under <alfdir>[/<label>]/<action>
outdir_parts = [alfdir]
if args.label is not None:
    outdir_parts.append(args.label)
outdir_parts.append(args.action)
baseoutdir = '/'.join(outdir_parts)

# fall back to the per-action default values if none were given on the command line
if args.varvals is None:
    args.varvals = default_varvals[args.action]

# keyword args describing how the variable values should be converted (float for some actions, int for 'nsnp', neither for 'multi-nsnp')
kwargs = {}
if args.action in ('mfreq', 'prevalence', 'n-leaves'):
    kwargs['floatify'] = True
elif args.action == 'nsnp':
    kwargs['intify'] = True
sys.path.insert(1, current_script_dir) import utils all_codes = ['partis', 'multi-partis', 'ihhhmmm', 'imgt', 'igblast'] parser = argparse.ArgumentParser() parser.add_argument('--run-codes', help='Which codes to check? (if none, we just plot preexisting results)') parser.add_argument('--n-queries', type=int) args = parser.parse_args() if args.run_codes == 'all': args.run_codes = all_codes elif args.run_codes == 'none': args.run_codes = [] else: args.run_codes = utils.get_arg_list(args.run_codes) outdir = '/fh/fast/matsen_e/dralph/work/partis-dev' simfname = outdir + '/data/performance/simu.csv' base_plotdir = outdir + '/_compare' for code in args.run_codes: if code not in all_codes: raise Exception('ERROR bad code: ' + code) if 'partis' in code: n_procs = max(1, multiprocessing.cpu_count() / 2) for action in ('cache-simu-parameters', 'plot-performance'): cmd = './bin/run-driver.py --label comparisons-' + code + ' --action ' + action + ' --plotdir ' + base_plotdir + '/' + code + ' --simfname ' + simfname + ' --n-procs 50:10' # + str(n_procs) if 'multi-' in code: cmd += ' --extra-args __n-sets:5:--slurm:--workdir:_tmp/foop' else:
figsize=figsize, no_labels=no_labels, log=log, translegend=translegend) # ---------------------------------------------------------------------------------------- parser = argparse.ArgumentParser() parser.add_argument('--outdir', required=True) parser.add_argument('--plotdirs', required=True) parser.add_argument('--names', required=True) parser.add_argument('--performance-plots', action='store_true') parser.add_argument('--colors', default='#006600:#990012:#3333ff:#cc0000:#3399ff:#2b65ec:#2b65ec:#808080') parser.add_argument('--linewidths', default='5:3:2:2:2') parser.add_argument('--gldir', default='data/germlines/human') parser.add_argument('--chain', default='h') parser.add_argument('--normalize', action='store_true') args = parser.parse_args() args.plotdirs = utils.get_arg_list(args.plotdirs) args.names = utils.get_arg_list(args.names) args.colors = utils.get_arg_list(args.colors) args.linewidths = utils.get_arg_list(args.linewidths) for iname in range(len(args.names)): args.names[iname] = args.names[iname].replace('@', ' ') # if you just pass in one parent directory, we assume <args.names> contains the desired subdirs if len(args.plotdirs) == 1: parentdir = args.plotdirs[0] args.plotdirs = [parentdir + '/' + n for n in args.names] if len(args.plotdirs) != len(args.names): raise Exception('poorly formatted args:\n %s\n %s' % (' '.join(args.plotdirs), ' '.join(args.names))) # if args.gldir is not 'none':
qr_info[region + '_5p_del'] = gl_start assert gl_end <= len(self.germline_seqs[region][gene]) qr_info[region + '_3p_del'] = len(self.germline_seqs[region][gene]) - gl_end # bounds qr_info[region + '_qr_bounds'] = find_qr_bounds(qr_start, qr_end, gl_seq) if self.args.debug: print ' %s match: %s' % (region, clean_alignment_crap(qr_seq, gl_seq)) # ---------------------------------------------------------------------------------------- if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('-b', action='store_true') # passed on to ROOT when plotting parser.add_argument('--n-queries', type=int, default=-1) parser.add_argument('--queries') parser.add_argument('--plotdir', required=True) parser.add_argument('--debug', type=int, default=0, choices=[0, 1, 2]) parser.add_argument('--datadir', default='data/imgt') parser.add_argument('--infname', required=True) #, default='data/performance/igblast/igblast.html') parser.add_argument('--simfname', required=True) parser.add_argument('--skip-missing-genes', action='store_true') parser.add_argument('-dont-skip-or15-genes', action='store_true', help='by default skip all the genes with the /OR1[56] bullshit, since they don\'t seem to be in imgt\'s output') args = parser.parse_args() args.queries = utils.get_arg_list(args.queries, intify=True) # if os.path.isdir('data/performance/igblast'): # print 'skipping tar xzf \'cause output\'s already there' # else: # print 'untgzing...' # check_call(['tar', 'xzf', 'data/performance/igblast.tgz', '-C', 'data/performance/']) # untar the igblast output igblastparser = IgblastParser(args)
def process(args):
    """Post-process the parsed command line <args> in place: convert, cross-check, and fill in defaults.

    Nothing is returned; every result is written back onto <args>. In order, this
      - maps deprecated spellings onto current ones ('run-viterbi' action, --chain, --plot-performance)
      - converts list-valued options from strings to lists with utils.get_arg_list()
        (the error messages below describe the expected form as colon-separated, e.g. 'x:y:z')
      - checks lengths and mutually exclusive options, raising Exception on bad combinations
      - caps the process counts at --n-max-procs
      - chooses a random, not-yet-existing default --workdir if none was specified
      - derives dependent values (is_data, cluster annotation file name, persistent cache file name, ...)

    Side effects beyond <args>: prints notes and warnings to stdout, runs git for the version
    info, may remove a zero-length persistent cache file, and exits the process when the
    action is 'version'.
    """
    # ----------------------------------------------------------------------------------------
    # deprecated names
    if args.action == 'run-viterbi':
        print(' note: replacing deprecated action name \'run-viterbi\' with current name \'annotate\' (this doesn\'t change any actual behavior)')
        args.action = 'annotate'

    if args.chain is not None:
        print(' note: transferring argument from deprecated option \'--chain %s\' to new option \'--locus %s\'' % (args.chain, 'ig' + args.chain))
        args.locus = 'ig' + args.chain
        args.chain = None
    args.loci = utils.get_arg_list(args.loci, choices=utils.loci)
    if args.loci is None:  # in principle I should check that at least one of 'em isn't None, but if that's the case it'll crash soon enough
        args.loci = [args.locus]
    else:
        args.locus = args.loci[0]

    # ----------------------------------------------------------------------------------------
    # string --> list conversions (plus checks on the resulting lists)
    args.only_genes = utils.get_arg_list(args.only_genes)
    args.n_procs = utils.get_arg_list(args.n_procs, intify=True)
    # a single value is used for both counts, otherwise the second value is the "fewer" count
    args.n_fewer_procs = args.n_procs[0] if len(args.n_procs) == 1 else args.n_procs[1]
    args.n_procs = args.n_procs[0]

    args.queries = utils.get_arg_list(args.queries)
    args.queries_to_include = utils.get_arg_list(args.queries_to_include)
    args.reco_ids = utils.get_arg_list(args.reco_ids)
    args.istartstop = utils.get_arg_list(args.istartstop, intify=True)
    if args.istartstop is not None:
        if args.istartstop[0] >= args.istartstop[1] or args.istartstop[0] < 0:
            raise Exception('invalid --istartstop specification: %d %d' % (args.istartstop[0], args.istartstop[1]))
    args.n_max_per_region = utils.get_arg_list(args.n_max_per_region, intify=True)
    if len(args.n_max_per_region) != 3:
        raise Exception('n-max-per-region should be of the form \'x:y:z\', but I got ' + str(args.n_max_per_region))
    args.write_additional_cluster_annotations = utils.get_arg_list(args.write_additional_cluster_annotations, intify=True)
    if args.write_additional_cluster_annotations is not None and len(args.write_additional_cluster_annotations) != 2:
        raise Exception('--write-additional-cluster-annotations must be specified as two numbers \'m:n\', but I got %s' % args.write_additional_cluster_annotations)
    args.extra_annotation_columns = utils.get_arg_list(args.extra_annotation_columns, choices=utils.extra_annotation_headers)

    # per-region [5p, 3p] exclusion lengths: the configured length for ends listed in utils.real_erosions, zero for the others
    args.region_end_exclusions = {
        r: [args.region_end_exclusion_length if ('%s_%s' % (r, e)) in utils.real_erosions else 0 for e in ['5p', '3p']]
        for r in utils.regions
    }
    args.region_end_exclusion_length = None  # there isn't really a big reason to set it to None, but this makes clear that I should only be using the dict version

    args.initial_match_mismatch = utils.get_arg_list(args.initial_match_mismatch, intify=True)
    if len(args.initial_match_mismatch) != 2:
        # report the offending option itself (this used to print args.n_max_per_region, copied from the check above)
        raise Exception('--initial-match-mismatch should be of the form \'match:mismatch\', but I got ' + str(args.initial_match_mismatch))
    args.annotation_clustering_thresholds = utils.get_arg_list(args.annotation_clustering_thresholds, floatify=True)
    args.naive_hamming_bounds = utils.get_arg_list(args.naive_hamming_bounds, floatify=True)
    if args.small_clusters_to_ignore is not None:
        if '-' in args.small_clusters_to_ignore:  # an inclusive 'lo-hi' range
            lo, hi = [int(cluster_size) for cluster_size in args.small_clusters_to_ignore.split('-')]
            args.small_clusters_to_ignore = range(lo, hi + 1)
        else:
            args.small_clusters_to_ignore = utils.get_arg_list(args.small_clusters_to_ignore, intify=True)

    # ----------------------------------------------------------------------------------------
    # seed sequence
    if args.seed_unique_id is not None:
        args.seed_unique_id = args.seed_unique_id.strip()  # protect against the space you may put in front of it if it's got an initial minus sign (better way is to use an equals sign)
        if args.queries is not None and args.seed_unique_id not in args.queries:
            raise Exception('seed uid %s not in --queries %s' % (args.seed_unique_id, ' '.join(args.queries)))
        if args.random_seed_seq:
            raise Exception('can\'t specify both --seed-unique-id and --random-seed-seq')

        if args.queries_to_include is None:  # make sure the seed is in --queries-to-include
            args.queries_to_include = [args.seed_unique_id]
        elif args.seed_unique_id not in args.queries_to_include:
            args.queries_to_include = [args.seed_unique_id] + args.queries_to_include  # may as well put it first, I guess (?)

    if args.sw_debug is None:  # if not explicitly set, set equal to regular debug
        args.sw_debug = args.debug

    if args.only_genes is not None:
        for gene in args.only_genes:  # make sure they're all at least valid ig genes
            utils.split_gene(gene)

    # ----------------------------------------------------------------------------------------
    # process counts
    # if n_procs < 1 or n_procs > 9999:  # It happened, at least once. You know, probably.
    #     raise Exception('bad n_procs %s' % n_procs)
    if args.n_procs > args.n_max_procs:
        print('reducing n procs %d to --n-max-procs %d' % (args.n_procs, args.n_max_procs))
        args.n_procs = args.n_max_procs
    if args.n_fewer_procs > args.n_max_procs:
        print('reducing n procs %d to --n-max-procs %d' % (args.n_fewer_procs, args.n_max_procs))
        args.n_fewer_procs = args.n_max_procs

    # ----------------------------------------------------------------------------------------
    # version info (the 'version' action stops here)
    if args.print_git_commit or args.action == 'version':
        print('RUN ' + ' '.join(sys.argv))
        tag = check_output(['git', 'tag']).split()[-1]  # last entry in git's tag listing
        print(' tag %s' % tag)
        print(' commit %s' % check_output(['git', 'rev-parse', 'HEAD']).strip())
        if args.action == 'version':
            sys.exit(0)

    args.is_data = not args.is_simu  # whole code base uses is_data, this is better than changing all of that

    if args.simultaneous_true_clonal_seqs:
        if args.is_data:
            raise Exception('can only pass true clonal families to multi-hmm together on simulation and with --is-simu set')
        if args.n_simultaneous_seqs is not None:
            raise Exception('can\'t specify both --n-simultaneous-seqs and --simultaneous-true-clonal-seqs')

    if args.no_indels and args.gap_open_penalty < 1000:
        print('forcing --gap-open-penalty to 1000 to prevent indels, since --no-indels was specified (you can also adjust this penalty directly)')
        args.gap_open_penalty = 1000

    if 'tr' in args.locus and args.mutation_multiplier is None:
        args.mutation_multiplier = 0.

    # ----------------------------------------------------------------------------------------
    # working directory
    if args.workdir is None:  # set default here so we know whether it was set by hand or not
        def choose_random_subdir(dirname):
            # keep drawing random names until we find one that doesn't exist yet
            subname = str(random.randint(0, 999999))
            while os.path.exists(dirname + '/' + subname):
                subname = str(random.randint(0, 999999))
            return dirname + '/' + subname
        if args.batch_system is not None and os.path.exists('/fh/fast/matsen_e'):
            args.workdir = choose_random_subdir('/fh/fast/matsen_e/' + os.path.basename(os.getenv('HOME')) + '/_tmp/hmms')
        else:
            args.workdir = choose_random_subdir('/tmp/' + os.path.basename(os.getenv('HOME')) + '/hmms')
        if args.batch_system is not None:
            print(' %s: using batch system %s with default --workdir (%s) -- if this isn\'t visible to the batch nodes on your system, you\'ll need to change it' % (utils.color('red', 'warning'), args.batch_system, args.workdir))
    else:
        args.workdir = args.workdir.rstrip('/')
    if os.path.exists(args.workdir):
        raise Exception('workdir %s already exists' % args.workdir)

    if args.batch_system == 'sge' and args.batch_options is not None:
        if '-e' in args.batch_options or '-o' in args.batch_options:
            print('%s --batch-options contains \'-e\' or \'-o\', but we add these automatically since we need to be able to parse each job\'s stdout and stderr. You can control the directory under which they\'re written with --workdir (which is currently %s).' % (utils.color('red', 'warning'), args.workdir))

    # ----------------------------------------------------------------------------------------
    # output and cache file names
    if args.cluster_annotation_fname is None and args.outfname is not None:
        args.cluster_annotation_fname = args.outfname.replace(utils.getsuffix(args.outfname), '-cluster-annotations.csv')

    if args.calculate_alternative_naive_seqs or (args.action == 'view-alternative-naive-seqs' and args.persistent_cachefname is None):
        if args.outfname is None:
            raise Exception('have to specify --outfname in order to calculate alternative naive sequences')
        args.persistent_cachefname = args.outfname.replace('.csv', '-hmm-cache.csv')
        if args.calculate_alternative_naive_seqs and os.path.exists(args.persistent_cachefname):
            if os.stat(args.persistent_cachefname).st_size == 0:
                print(' note: removing existing zero-length persistent cache file %s' % args.persistent_cachefname)
                os.remove(args.persistent_cachefname)
            else:
                raise Exception('persistent cache file %s already exists, but we were asked to --calculate-alternative-naive-seqs. Either it\'s an old file (in which case you should delete it), or you\'ve already got the alternative annotations (so you can just run view-alternative-naive-seqs)' % args.persistent_cachefname)

    # ----------------------------------------------------------------------------------------
    # plotting and parameters
    if args.plot_performance:
        print('%s encountered deprecated argument --plot-performance, moving value to --plot-annotation-performance' % utils.color('yellow', 'warning'))
        args.plot_annotation_performance = True
    if args.plot_annotation_performance:
        if args.plotdir is None:
            raise Exception('can\'t plot performance unless --plotdir is specified')
        if not args.is_simu:
            raise Exception('can\'t plot performance unless --is-simu is set')

    if args.parameter_type != 'hmm':
        print(' using non-default parameter type \'%s\'' % args.parameter_type)

    if args.presto_output and args.aligned_germline_fname is None:
        raise Exception('in order to get presto output, you have to set --aligned-germline-fname (a fasta file with germline alignments for every germline gene)')

    if args.parameter_dir is not None:
        args.parameter_dir = args.parameter_dir.rstrip('/')

    if args.count_parameters and not args.dont_write_parameters:
        raise Exception('if you set --count-parameters, you should also set --dont-write-parameters to make sure you\'re not accidentally overwriting existing parameters ')

    if os.path.exists(args.default_initial_germline_dir + '/' + args.species):  # ick that is hackey
        args.default_initial_germline_dir += '/' + args.species

    # ----------------------------------------------------------------------------------------
    # germline inference and mutation options
    if args.n_max_snps is not None and args.n_max_mutations_per_segment is not None:
        if args.n_max_snps > args.n_max_mutations_per_segment - 10:
            raise Exception('--n-max-snps should be at least ten less than --n-max-mutations-per-segment, but I got %d and %d' % (args.n_max_snps, args.n_max_mutations_per_segment))

    if args.n_alleles_per_gene is None:
        if not args.dont_find_new_alleles:
            args.n_alleles_per_gene = 1
        else:
            args.n_alleles_per_gene = 2

    if args.leave_default_germline:
        args.dont_remove_unlikely_alleles = True
        args.allele_cluster = False
        args.dont_find_new_alleles = True

    if args.flat_mute_freq is not None or args.same_mute_freq_for_all_seqs:
        assert args.mutate_from_scratch
import argparse
import sys
import csv
sys.path.insert(1, './python')

import utils
from seqfileopener import get_seqfile_info
from opener import opener

# ----------------------------------------------------------------------------------------
# Command line: input sequence file, output directory, and which subsets to write.
parser = argparse.ArgumentParser()
parser.add_argument('--infname', required=True)
parser.add_argument('--outdir', required=True)
# colon-separated list of start indices. E.g. with '0:1:2' we will write three output files. The first seq line in <infname> goes to 0, the next to 1, the third to 2, and then we skip 97 seqs, then yadda yadda
parser.add_argument('--start-indices', required=True)
parser.add_argument('--modulo', type=int, default=100)
args = parser.parse_args()
args.start_indices = utils.get_arg_list(args.start_indices, intify=True)

print('subsetting %s: every %d th sequence' % (args.infname, args.modulo))
infile = opener('r')(args.infname)
input_info, _ = get_seqfile_info(args.infname, is_data=True)  #, n_max_queries=1000)

# get field names (they should be the same for each row, this just grabs the first one)
for key, d in input_info.items():
    fieldnames = d.keys()
    break

utils.prep_dir(args.outdir)  #, '*.bz2')

# one output file and one csv writer per start index, both keyed by the index
outfiles, writers = {}, {}
for iout in args.start_indices:
    outfname = '%s/every-%s-subset-%d.csv.bz2' % (args.outdir, str(args.modulo), iout)
    outfiles[iout] = opener('w')(outfname)
    writers[iout] = csv.DictWriter(outfiles[iout], fieldnames, delimiter=',')
# parser.add_argument('--tree-parameter-file', default='/shared/silo_researcher/Matsen_F/MatsenGrp/data/bcr/output_sw/A/04-A-M_gtr_tr-qi-gi.json.gz', help='File from which to read inferred tree parameters (from mebcell analysis)')
parser.add_argument('--gtrfname', default='data/recombinator/gtr.txt', help='File with list of GTR parameters. Fed into bppseqgen along with the chosen tree')
parser.add_argument('--branch-length-fname', default='data/recombinator/branch-lengths.txt', help='Branch lengths from Connor\'s mebcell stuff')
# NOTE command to generate gtr parameter file: [stoat] partis/ > zcat /shared/silo_researcher/Matsen_F/MatsenGrp/data/bcr/output_sw/A/04-A-M_gtr_tr-qi-gi.json.gz | jq .independentParameters | grep -v '[{}]' | sed 's/["\:,]//g' | sed 's/^[ ][ ]*//' | sed 's/ /,/' | sort >data/gtr.txt

# uncommon arguments
parser.add_argument('--apply-choice_probs_in_sw', action='store_true', help='Apply gene choice probs in Smith-Waterman step. Probably not a good idea (see comments in waterer.py).')
# NOTE(review): with default=True and action='store_true' this option is True whether or not the flag is passed, i.e. it can't be switched off from the command line
parser.add_argument('--insertion-base-content', default=True, action='store_true', help='Account for non-uniform base content in insertions. Slows us down by a factor around five and gives no performance benefit.')
parser.add_argument('--allow_unphysical_insertions', action='store_true', help='allow insertions on left side of v and right side of j. NOTE this is very slow.')
# parser.add_argument('--allow_external_deletions', action='store_true') # ( " ) deletions ( " )
# parser.add_argument('--total-length-from-right', type=int, default=-1, help='Total read length you want for simulated sequences')
parser.add_argument('--joint-emission', action='store_true', help='Use information about both sequences when writing pair emission probabilities?')

# ----------------------------------------------------------------------------------------
# Parse, then convert list-valued options and sanity check the combination of options.
args = parser.parse_args()
args.only_genes = utils.get_arg_list(args.only_genes)
args.n_procs = utils.get_arg_list(args.n_procs, intify=True)
# a single value is used for both counts, otherwise the second value is the "fewer" count
args.n_fewer_procs = args.n_procs[0] if len(args.n_procs) == 1 else args.n_procs[1]
args.n_procs = args.n_procs[0]

if args.slurm and '/tmp' in args.workdir:
    print('ERROR it appears that <workdir> isn\'t set to something visible to all slurm nodes')
    sys.exit(1)  # exit with nonzero status so calling scripts can tell that this was a failure (a bare sys.exit() reports success)

if args.plot_performance:
    assert not args.is_data
    # assert args.algorithm == 'viterbi'
import os sys.path.insert(1, './python') import utils import plotting from humans import humans, colors, all_subdirs parser = argparse.ArgumentParser() parser.add_argument('-b', action='store_true') # passed on to ROOT when plotting parser.add_argument('--subdirs', default='all', help='Which variable categories?') parser.add_argument('--dataset', choices=('adaptive', 'stanford', 'both'), default='adaptive') args = parser.parse_args() if args.subdirs == 'all': args.subdirs = all_subdirs else: args.subdirs = utils.get_arg_list(args.subdirs) modulo = '10' webdir = '/var/www/sharing/dralph/partis' subset = 0 for subdir in args.subdirs: print subdir, '-----------------' if subdir not in all_subdirs: raise Exception('ERROR bad subdir: ' + str(subdir)) plotdirs, names, colorlist, linestyles, linewidths, markersizes, scale_errors, strings_to_ignore = [], [], [], [], [], [], [], [] for human in humans[args.dataset]: print ' ', human baselabel = 'every-' + modulo + '-' + human datadir = webdir + '/' + baselabel + '/cf-subsets/' + subdir simudir = webdir + '/' + baselabel + '-subset-' + str(subset) + '/params/simu/hmm/true/' + subdir
check_call(['permissify-www', plotdir]) if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('-b', action='store_true') parser.add_argument('--infnames') parser.add_argument('--outfname') parser.add_argument('--normalize-axes', default=[]) parser.add_argument('--use-all-steps', action='store_true') parser.add_argument('--is-data', action='store_true') parser.add_argument('--xbounds') parser.add_argument('--logprob-bounds') parser.add_argument('--adjmi-bounds') args = parser.parse_args() args.infnames = get_arg_list(args.infnames) if args.xbounds is not None: args.xbounds = get_arg_list(args.xbounds, floatify=True) if args.logprob_bounds is not None: args.logprob_bounds = get_arg_list(args.logprob_bounds, floatify=True) if args.adjmi_bounds is not None: args.adjmi_bounds = get_arg_list(args.adjmi_bounds, floatify=True) if len(args.normalize_axes) > 0: args.normalize_axes = get_arg_list(args.normalize_axes) fsize = 26 mpl.rcParams.update({ 'font.size': 26, 'axes.labelsize': 26, 'xtick.labelsize':20, 'ytick.labelsize':20,
parser.add_argument('--rebin', type=int)
parser.add_argument('--colors')
parser.add_argument('--linestyles')
parser.add_argument('--datadir', default='data/imgt')
parser.add_argument('--leaves-per-tree')
parser.add_argument('--linewidths')
parser.add_argument('--markersizes')
parser.add_argument('--dont-calculate-mean-info', action='store_true')
parser.add_argument('--normalize', action='store_true')
parser.add_argument('--graphify', action='store_true')
parser.add_argument('--strings-to-ignore')  # remove this string from the plot names in each dir (e.g. '-mean-bins') NOTE replaces '_' with '-'
args = parser.parse_args()

# underscores in --strings-to-ignore stand in for dashes (has to happen before the string is split into a list)
if args.strings_to_ignore is not None:
    args.strings_to_ignore = args.strings_to_ignore.replace('_', '-')

# convert the list-valued options from strings to lists: first the plain ones, then the integer ones
for argname in ('plotdirs', 'scale_errors', 'names', 'strings_to_ignore'):
    setattr(args, argname, utils.get_arg_list(getattr(args, argname)))
for argname in ('colors', 'linestyles', 'leaves_per_tree', 'markersizes', 'linewidths'):
    setattr(args, argname, utils.get_arg_list(getattr(args, argname), intify=True))

# '@' in a name stands in for a space (modify the existing list in place)
args.names[:] = [name.replace('@', ' ') for name in args.names]

# need exactly one name per plot directory
assert len(args.plotdirs) == len(args.names)

# get location of <begin> cysteine in each v region
with opener('r')(args.datadir + '/v-meta.json') as json_file:
    args.cyst_positions = json.load(json_file)
"--n-sim-events", default="2000" ) # NOTE still have to multiply by the number of leaves to get the number of sequences (default is 5, though, which'll give you 10k seqs) parser.add_argument( "--extra-args" ) # args to pass on to commands (colon-separated) NOTE have to add space and quote like so: --extra-args ' __option' (NOTE replaces __ with --) parser.add_argument("--datafname") parser.add_argument("--simfname") parser.add_argument("--plotdir", required=True) parser.add_argument("--n-procs", type=int, default=max(1, multiprocessing.cpu_count() / 2)) all_actions = ("cache-data-parameters", "simulate", "cache-simu-parameters", "plot-performance") parser.add_argument( "--actions", default=":".join(all_actions), choices=all_actions, help="Colon-separated list of actions to perform" ) args = parser.parse_args() args.extra_args = utils.get_arg_list(args.extra_args) args.actions = utils.get_arg_list(args.actions) cmd = "./bin/partis.py" common_args = " --n-procs " + str(args.n_procs) if args.extra_args != None: common_args += " " + " ".join(args.extra_args).replace("__", "--") if args.simfname == None: args.simfname = "_output/" + args.label + "/simu.csv" param_dir = "_output/" + args.label if "cache-data-parameters" in args.actions: if args.datafname is None or not os.path.exists(args.datafname): raise Exception("ERROR datafname d.n.e.: " + str(args.datafname)) # cache parameters from data cmd_str = " --action cache-parameters --seqfile " + args.datafname + " --is-data --skip-unproductive" + common_args
parser.add_argument('--locus')
parser.add_argument('--parameter-dirs')
parser.add_argument('--min-outer-size', default=10, type=int)
parser.add_argument('--min-inner-size', default=5, type=int)
parser.add_argument('--min-outer-rep-frac', type=float)
parser.add_argument('--min-inner-rep-frac', type=float)
parser.add_argument('--max-cdr3-distance', default=5, type=int,
                    help='ignore clusters with a cdr3 that differs by more than this many nucleotides')
args = parser.parse_args()

# convert the list-valued options from strings to lists
for argname in ('infiles', 'labels', 'parameter_dirs'):
    setattr(args, argname, utils.get_arg_list(getattr(args, argname)))

# need exactly one label per input file
assert len(args.infiles) == len(args.labels)

# a single parameter dir gets reused for every label
if len(args.parameter_dirs) == 1:
    print(' note: using same glfo for all infiles')
    args.parameter_dirs = args.parameter_dirs * len(args.labels)
assert len(args.parameter_dirs) == len(args.labels)

# read one germline set per parameter dir, in the same order as the labels
glfos = []
for pdir in args.parameter_dirs:
    glfos.append(glutils.read_glfo(pdir + '/hmm/germline-sets', locus=args.locus))

# ----------------------------------------------------------------------------------------
if self.args.debug: print ' %s match: %s' % ( region, clean_alignment_crap(qr_seq, gl_seq)) # ---------------------------------------------------------------------------------------- if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('-b', action='store_true') # passed on to ROOT when plotting parser.add_argument('--n-max-queries', type=int, default=-1) parser.add_argument('--queries') parser.add_argument('--plotdir', required=True) parser.add_argument('--debug', type=int, default=0, choices=[0, 1, 2]) parser.add_argument('--datadir', default='data/imgt') parser.add_argument('--infname', default='data/performance/igblast/igblast.html') args = parser.parse_args() args.queries = utils.get_arg_list(args.queries, intify=True) args.simfname = 'data/performance/simu.csv' if os.path.isdir('data/performance/igblast'): print 'skipping tar xzf \'cause output\'s already there' else: print 'untgzing...' check_call([ 'tar', 'xzf', 'data/performance/igblast.tgz', '-C', 'data/performance/' ]) # untar the igblast output igblastparser = IgblastParser(args)
def process(args):
    """Validate and post-process the parsed command line arguments, in place.

    Converts string-valued list options with utils.get_arg_list(), fills in
    defaults that depend on other options, renames deprecated actions, and
    checks for inconsistent option combinations.

    Args:
        args: namespace-like object holding the parsed options as attributes.
            It is modified in place (some attributes are added, rewritten or
            deleted).

    Raises:
        Exception: if option values are invalid or mutually inconsistent.
        AssertionError: for a few internal consistency checks.
        SystemExit: exits with status 0 after printing version info if the
            action is 'version'.
    """
    # ------------------------------------------------------------------------------------
    # deprecated action names
    if args.action == 'run-viterbi':
        print(' note: replacing deprecated action name \'run-viterbi\' with current name \'annotate\' (you don\'t need to change anything unless you want this warning message to go away)')
        args.action = 'annotate'
    if args.action == 'view-alternative-naive-seqs':
        print(' note: replacing deprecated action name \'view-alternative-naive-seqs\' with current name \'view-alternative-annotations\' (you don\'t need to change anything unless you want this warning message to go away)')
        args.action = 'view-alternative-annotations'

    # ------------------------------------------------------------------------------------
    # paired loci consistency
    args.light_chain_fractions = utils.get_arg_list(args.light_chain_fractions, key_val_pairs=True, floatify=True)
    if args.light_chain_fractions is not None and not utils.is_normed(args.light_chain_fractions.values()):
        raise Exception('--light-chain-fractions %s don\'t add to 1: %f' % (args.light_chain_fractions, sum(args.light_chain_fractions.values())))
    if args.action == 'merge-paired-partitions':
        assert args.paired_loci
    if args.paired_loci:
        args.locus = None
        if [args.infname, args.paired_indir].count(None) == 0:
            raise Exception('can\'t specify both --infname and --paired-indir')
        if args.outfname is not None:
            raise Exception('can\'t set --outfname if --paired-loci is set (use --paired-outdir)')
        if args.plotdir == 'paired-outdir':
            args.plotdir = args.paired_outdir
        if args.plotdir is None and args.action == 'plot-partitions':
            args.plotdir = args.paired_outdir
    else:
        assert args.paired_indir is None
    if not args.paired_loci and (args.paired_indir is not None or args.paired_outdir is not None):
        raise Exception('--paired-loci must be set if either --paired-indir or --paired-outdir is set')
    if args.reverse_negative_strands and not args.paired_loci:
        raise Exception('--reverse-negative-strands has no effect unless --paired-loci is set (maybe need to run bin/split-loci.py separately?)')

    # ------------------------------------------------------------------------------------
    # list-valued options
    args.only_genes = utils.get_arg_list(args.only_genes)
    args.queries = utils.get_arg_list(args.queries)
    args.queries_to_include = utils.get_arg_list(args.queries_to_include)
    args.reco_ids = utils.get_arg_list(args.reco_ids)
    args.istartstop = utils.get_arg_list(args.istartstop, intify=True)
    if args.istartstop is not None:
        if args.istartstop[0] >= args.istartstop[1] or args.istartstop[0] < 0:
            raise Exception('invalid --istartstop specification: %d %d' % (args.istartstop[0], args.istartstop[1]))
    args.n_max_per_region = utils.get_arg_list(args.n_max_per_region, intify=True)
    if len(args.n_max_per_region) != 3:
        raise Exception('n-max-per-region should be of the form \'x:y:z\', but I got ' + str(args.n_max_per_region))
    args.write_additional_cluster_annotations = utils.get_arg_list(args.write_additional_cluster_annotations, intify=True)
    if args.write_additional_cluster_annotations is not None and len(args.write_additional_cluster_annotations) != 2:
        raise Exception('--write-additional-cluster-annotations must be specified as two numbers \'m:n\', but I got %s' % args.write_additional_cluster_annotations)
    args.extra_annotation_columns = utils.get_arg_list(args.extra_annotation_columns, choices=utils.extra_annotation_headers)

    args.cluster_indices = utils.get_arg_list(args.cluster_indices, intify_with_ranges=True)

    args.allowed_cdr3_lengths = utils.get_arg_list(args.allowed_cdr3_lengths, intify=True)

    # per-region [5p, 3p] exclusion lengths: the single length for real erosions, zero otherwise
    args.region_end_exclusions = {r : [args.region_end_exclusion_length if ('%s_%s' % (r, e)) in utils.real_erosions else 0 for e in ['5p', '3p']] for r in utils.regions}
    args.region_end_exclusion_length = None  # there isn't really a big reason to set it to None, but this makes clear that I should only be using the dict version

    args.typical_genes_per_region_per_subject = utils.get_arg_list(args.typical_genes_per_region_per_subject, intify=True)
    if len(args.typical_genes_per_region_per_subject) != len(utils.regions):
        raise Exception('wrong length for --typical-genes-per-region-per-subject, has to be three')
    # scale the (v) prevalence fraction by the ratio of typical v gene count to each region's typical gene count
    tmpfrac, ntmp = args.min_allele_prevalence_fraction, args.typical_genes_per_region_per_subject
    args.min_allele_prevalence_fractions = {r : tmpfrac * ntmp[utils.regions.index('v')] / ntmp[utils.regions.index(r)] for r in utils.regions}
    delattr(args, 'min_allele_prevalence_fraction')  # delete the non-plural version
    delattr(args, 'typical_genes_per_region_per_subject')  # and we don't need this any more either

    args.annotation_clustering_thresholds = utils.get_arg_list(args.annotation_clustering_thresholds, floatify=True)
    args.naive_hamming_bounds = utils.get_arg_list(args.naive_hamming_bounds, floatify=True)
    if args.small_clusters_to_ignore is not None:
        if '-' in args.small_clusters_to_ignore:  # inclusive range of the form 'lo-hi'
            lo, hi = [int(cluster_size) for cluster_size in args.small_clusters_to_ignore.split('-')]
            args.small_clusters_to_ignore = range(lo, hi + 1)
        else:
            args.small_clusters_to_ignore = utils.get_arg_list(args.small_clusters_to_ignore, intify=True)

    # ------------------------------------------------------------------------------------
    # seed sequence
    if args.seed_unique_id is not None:
        args.seed_unique_id = args.seed_unique_id.strip()  # protect against the space you may put in front of it if it's got an initial minus sign (better way is to use an equals sign)
        if args.queries is not None and args.seed_unique_id not in args.queries:
            raise Exception('seed uid %s not in --queries %s' % (args.seed_unique_id, ' '.join(args.queries)))
        if args.random_seed_seq:
            raise Exception('can\'t specify both --seed-unique-id and --random-seed-seq')

        if args.queries_to_include is None:  # make sure the seed is in --queries-to-include
            args.queries_to_include = [args.seed_unique_id]
        elif args.seed_unique_id not in args.queries_to_include:
            args.queries_to_include = [args.seed_unique_id] + args.queries_to_include  # may as well put it first, I guess (?)
    elif args.seed_seq is not None:
        args.seed_unique_id = 'seed-seq'

    if args.sw_debug is None:  # if not explicitly set, set equal to regular debug
        args.sw_debug = args.debug

    if args.only_genes is not None:
        for gene in args.only_genes:  # make sure they're all at least valid ig genes
            utils.split_gene(gene)

    if args.print_git_commit or args.action == 'version':
        utils.get_version_info(debug=True)
        if args.action == 'version':
            sys.exit(0)

    args.is_data = not args.is_simu  # whole code base uses is_data, this is better than changing all of that

    if args.collapse_duplicate_sequences and not args.is_data:
        print(' %s collapsing duplicates on simulation, which is often not a good idea since it makes keeping track of performance harder (e.g. purity/completeness of partitions is harder to calculate)' % utils.color('red', 'warning'))

    # ------------------------------------------------------------------------------------
    # simultaneous sequences
    if args.simultaneous_true_clonal_seqs:
        if args.is_data:
            raise Exception('can only pass true clonal families to multi-hmm together on simulation and with --is-simu set')
        if args.n_simultaneous_seqs is not None:
            raise Exception('can\'t specify both --n-simultaneous-seqs and --simultaneous-true-clonal-seqs')
        if args.all_seqs_simultaneous:
            raise Exception('can\'t specify both --all-seqs-simultaneous and --simultaneous-true-clonal-seqs')
        if args.action == 'partition':
            raise Exception('can\'t set --simultaneous-true-clonal-seqs when partitioning')
    if args.n_simultaneous_seqs is not None and args.all_seqs_simultaneous:
        raise Exception('doesn\'t make sense to set both --n-simultaneous-seqs and --all-seqs-simultaneous.')

    # ------------------------------------------------------------------------------------
    # indels
    if args.no_indels:
        print('forcing --gap-open-penalty to %d to prevent indels, since --no-indels was specified (you can also adjust this penalty directly)' % args.no_indel_gap_open_penalty)
        args.gap_open_penalty = args.no_indel_gap_open_penalty

    # NOTE this range check used to be nested inside 'if args.indel_frequency > 0.', so negative values were never caught
    if args.indel_frequency is not None and (args.indel_frequency < 0. or args.indel_frequency > 1.):
        raise Exception('--indel-frequency must be in [0., 1.] (got %f)' % args.indel_frequency)
    args.n_indels_per_indeld_seq = utils.get_arg_list(args.n_indels_per_indeld_seq, intify=True)
    if args.indel_location not in [None, 'v', 'cdr3']:
        try:  # anything that isn't an integer should get the descriptive exception below, not a bare ValueError from int()
            indel_position = int(args.indel_location)
        except ValueError:
            indel_position = None
        if indel_position is not None and 0 <= indel_position < 500:
            args.indel_location = indel_position
            if any(n > 1 for n in args.n_indels_per_indeld_seq):
                print(' note: removing entries from --n-indels-per-indeld-seq (%s), since --indel-location was set to a single position.' % [n for n in args.n_indels_per_indeld_seq if n > 1])
                args.n_indels_per_indeld_seq = [n for n in args.n_indels_per_indeld_seq if n <= 1]
        else:
            raise Exception('--indel-location \'%s\' neither one of None, \'v\' or \'cdr3\', nor an integer less than 500' % args.indel_location)

    if args.locus is not None and 'tr' in args.locus and args.mutation_multiplier is None:
        args.mutation_multiplier = 0.

    # ------------------------------------------------------------------------------------
    # work dir and batch options
    if args.workdir is None:  # set default here so we know whether it was set by hand or not
        args.workdir = get_workdir(args.batch_system)
    else:
        args.workdir = args.workdir.rstrip('/')
    if os.path.exists(args.workdir):
        raise Exception('workdir %s already exists' % args.workdir)

    if args.batch_system == 'sge' and args.batch_options is not None:
        if '-e' in args.batch_options or '-o' in args.batch_options:
            print('%s --batch-options contains \'-e\' or \'-o\', but we add these automatically since we need to be able to parse each job\'s stdout and stderr. You can control the directory under which they\'re written with --workdir (which is currently %s).' % (utils.color('red', 'warning'), args.workdir))

    # ------------------------------------------------------------------------------------
    # output file names and formats
    if args.outfname is not None and not args.presto_output and not args.airr_output and not args.generate_trees:
        if utils.getsuffix(args.outfname) not in ['.csv', '.yaml']:
            raise Exception('unhandled --outfname suffix %s' % utils.getsuffix(args.outfname))
        if utils.getsuffix(args.outfname) != '.yaml':
            print(' %s --outfname uses deprecated file format %s. This will still mostly work ok, but the new default .yaml format doesn\'t have to do all the string conversions by hand (so is less buggy), and includes annotations, partitions, and germline info in the same file (so you don\'t get crashes or inconsistent results if you don\'t keep track of what germline info goes with what output file).' % (utils.color('yellow', 'note:'), utils.getsuffix(args.outfname)))
        if args.action in ['view-annotations', 'view-partitions'] and utils.getsuffix(args.outfname) == '.yaml':
            raise Exception('have to use \'view-output\' action to view .yaml output files')

    if args.presto_output:
        if args.outfname is None:
            raise Exception('have to set --outfname if --presto-output is set')
        if args.action == 'annotate' and utils.getsuffix(args.outfname) != '.tsv':
            raise Exception('--outfname suffix has to be .tsv for annotation with --presto-output (got %s)' % utils.getsuffix(args.outfname))
        if args.action == 'partition' and utils.getsuffix(args.outfname) not in ['.fa', '.fasta']:
            raise Exception('--outfname suffix has to be .fa or .fasta for partitioning with --presto-output (got %s)' % utils.getsuffix(args.outfname))
        if args.aligned_germline_fname is None:
            assert args.locus is not None
            args.aligned_germline_fname = '%s/%s/imgt-alignments/%s.fa' % (args.default_initial_germline_dir, args.species, args.locus)
        if not os.path.exists(args.aligned_germline_fname):
            raise Exception('--aligned-germline-fname %s doesn\'t exist, but we need it in order to write presto output' % args.aligned_germline_fname)

    if args.airr_output:
        if args.outfname is None:
            raise Exception('have to set --outfname if --airr-output is set')
        if utils.getsuffix(args.outfname) == '.tsv':
            print(' note: writing only airr .tsv to %s' % args.outfname)
        elif utils.getsuffix(args.outfname) in ['.yaml', '.csv']:
            print(' note: writing both partis %s to %s and airr .tsv to %s' % (utils.getsuffix(args.outfname), args.outfname, utils.replace_suffix(args.outfname, '.tsv')))
        else:
            raise Exception('--outfname suffix has to be either .tsv or .yaml if --airr-output is set (got %s)' % utils.getsuffix(args.outfname))
    if args.airr_input:
        args.seq_column = 'sequence'
        args.name_column = 'sequence_id'

    if args.cluster_annotation_fname is None and args.outfname is not None and utils.getsuffix(args.outfname) == '.csv':  # if it wasn't set on the command line (<outfname> _was_ set), _and_ if we were asked for a csv, then use the old file name format
        args.cluster_annotation_fname = utils.insert_before_suffix('-cluster-annotations', args.outfname)

    # ------------------------------------------------------------------------------------
    # alternative/subcluster annotations
    if args.calculate_alternative_annotations and args.outfname is None and args.paired_outdir is None:
        raise Exception('have to specify --outfname in order to calculate alternative annotations')
    if args.subcluster_annotation_size == 'None':  # i want it turned on by default, but also to be able to turn it off on the command line
        args.subcluster_annotation_size = None
    else:
        args.subcluster_annotation_size = int(args.subcluster_annotation_size)  # can't set it in add_argument(), sigh
    if args.subcluster_annotation_size is not None:
        if args.calculate_alternative_annotations or args.write_additional_cluster_annotations is not None:
            raise Exception('can\'t set either --calculate-alternative-annotations or --write-additional-cluster-annotations if --subcluster-annotation-size is also set (you get duplicate annotations, which confuses and crashes things, plus it doesn\'t really make sense -- alternative annotations should be calculated on the subcluster annotations now)')
    if args.action == 'view-alternative-annotations' and args.persistent_cachefname is None:  # handle existing old-style output
        assert args.outfname is not None
        if os.path.exists(utils.getprefix(args.outfname) + '-hmm-cache.csv'):
            args.persistent_cachefname = utils.getprefix(args.outfname) + '-hmm-cache.csv'  # written by bcrham, so has to be csv, not yaml

    if args.min_largest_cluster_size is not None and args.n_final_clusters is not None:
        print(' note: both --min-largest-cluster-size and --n-final-clusters are set, which means we\'ll stop clustering when *either* of their criteria are satisfied (not both)')  # maybe it should be both, but whatever

    # ------------------------------------------------------------------------------------
    # selection metrics and plotting
    if not args.paired_loci and (args.action == 'get-selection-metrics' or args.get_selection_metrics):
        if args.outfname is None and args.selection_metric_fname is None:
            print(' %s calculating selection metrics, but neither --outfname nor --selection-metric-fname were set, which means nothing will be written to disk' % utils.color('yellow', 'warning'))
        elif args.selection_metric_fname is None and args.action == 'get-selection-metrics' and not args.add_selection_metrics_to_outfname:
            args.selection_metric_fname = utils.insert_before_suffix('-selection-metrics', args.outfname)

    if args.plot_annotation_performance:
        if args.plotdir is None and args.print_n_worst_annotations is None:
            raise Exception('doesn\'t make sense to set --plot-annotation-performance but not either of --plotdir or --print-n-worst-annotations (we\'ll spend all the cycles counting things up but then they\'ll just disappear from memory without being recorded).')
        if not args.is_simu:
            raise Exception('can\'t plot performance unless --is-simu is set (and this is simulation)')
    if args.print_n_worst_annotations is not None and not args.plot_annotation_performance:
        raise Exception('--plot-annotation-performance must be set if you\'re setting --print-worst-annotations')
    if not args.paired_loci and (args.action == 'plot-partitions' or args.action == 'annotate' and args.plot_partitions) and args.plotdir is None:
        raise Exception('--plotdir must be specified if plotting partitions')
    if args.action == 'annotate' and args.plot_partitions and args.input_partition_fname is None:  # could set this up to use e.g. --simultaneous-true-clonal-seqs as well, but it can't atm
        print(' %s running annotate with --plot-partitions, but --input-partition-fname is not set, which likely means the partitions will be trivial/singleton partitions' % utils.color('yellow', 'warning'))

    if args.make_per_gene_per_base_plots and not args.make_per_gene_plots:  # the former doesn't do anything unless the latter is turned on
        args.make_per_gene_plots = True

    # ------------------------------------------------------------------------------------
    # simulation
    if args.action == 'simulate':
        if args.n_trees is None and not args.paired_loci:
            args.n_trees = max(1, int(float(args.n_sim_events) / args.n_procs))
        if args.n_procs > args.n_sim_events:
            print(' note: reducing --n-procs to %d (was %d) so it isn\'t bigger than --n-sim-events' % (args.n_sim_events, args.n_procs))
            args.n_procs = args.n_sim_events
        if args.n_max_queries != -1:
            print(' note: --n-max-queries is not used when simulating (use --n-sim-events to set the simulated number of rearrangemt events)')

        if args.outfname is None and args.paired_outdir is None:
            print(' note: no %s specified, so nothing will be written to disk' % ('--paired-outdir' if args.paired_loci else '--outfname'))
            args.outfname = get_dummy_outfname(args.workdir)  # hackey, but otherwise I have to rewrite the whole run_simulation() in bin/partis to handle None type outfname

        if args.simulate_from_scratch:
            args.rearrange_from_scratch = True
            args.mutate_from_scratch = True
        if args.rearrange_from_scratch and not args.force_dont_generate_germline_set:  # i would probably just default to always generating germline sets when rearranging from scratch, but bin/test-germline-inference.py (and any other case where you want to dramatically restrict the germline set) really argue for a way to force just using the genes in the germline dir
            args.generate_germline_set = True
        if args.flat_mute_freq or args.same_mute_freq_for_all_seqs:
            assert args.mutate_from_scratch
        if args.mutate_from_scratch and not args.no_per_base_mutation:
            print(' note: setting --no-per-base-mutation since --mutate-from-scratch was set')
            args.no_per_base_mutation = True

        # end result of this block: shm/reco parameter dirs are set (unless we're doing their bit from scratch), --parameter-dir is set to None (and if --parameter-dir was set but shm/reco were _not_ set, we've just used --parameter-dir for either/both as needed)
        if args.parameter_dir is not None:
            if args.rearrange_from_scratch or args.mutate_from_scratch:
                raise Exception('can\'t set --parameter-dir if rearranging or mutating from scratch (use --reco-parameter-dir and/or --shm-parameter-dir)')
            if args.reco_parameter_dir is not None or args.shm_parameter_dir is not None:
                raise Exception('can\'t set --parameter-dir if either --reco-parameter-dir or --shm-parameter-dir are also set')
            args.reco_parameter_dir = args.parameter_dir
            args.shm_parameter_dir = args.parameter_dir
            args.parameter_dir = None
        if args.rearrange_from_scratch and args.reco_parameter_dir is not None:
            raise Exception('doesn\'t make sense to set both --rearrange-from-scratch and --reco-parameter-dir')
        if args.mutate_from_scratch and args.shm_parameter_dir is not None:
            raise Exception('doesn\'t make sense to set both --mutate-from-scratch and --shm-parameter-dir')
        if args.reco_parameter_dir is None and not args.rearrange_from_scratch:
            raise Exception('have to either set --rearrange-from-scratch or --reco-parameter-dir (or --simulate-from-scratch)')
        if args.shm_parameter_dir is None and not args.mutate_from_scratch:
            raise Exception('have to either set --mutate-from-scratch or --shm-parameter-dir (or --simulate-from-scratch)')

        if args.generate_germline_set and not args.rearrange_from_scratch:
            raise Exception('can only --generate-germline-set if also rearranging from scratch (set --rearrange-from-scratch)')

        if args.generate_germline_set:
            args.snp_positions = None  # if you want to control the exact positions, you have to use bin/test-germline-inference.py
            args.indel_positions = None
            process_gls_gen_args(args)

        if args.generate_trees:
            assert args.n_procs == 1  # not set up to handle output, and also no need

        if args.treefname is not None:
            raise Exception('--treefname was set for simulation action (probably meant to use --input-simulation-treefname)')

    # ------------------------------------------------------------------------------------
    # parameter and germline dirs
    if args.parameter_dir is not None and not args.paired_loci:  # if we're splitting loci, this isn't the normal parameter dir, it's a parent of that
        args.parameter_dir = args.parameter_dir.rstrip('/')
        if os.path.exists(args.parameter_dir):
            # NOTE os.listdir() returns bare names, so they have to be joined to the parameter dir before testing them (this used to test them relative to the current working dir, which meant the check below essentially never fired)
            pdirs = [d for d in os.listdir(args.parameter_dir) if os.path.isdir(os.path.join(args.parameter_dir, d))]
            if len(pdirs) > 0 and len(set(pdirs) & set(utils.parameter_type_choices)) == 0:
                raise Exception('couldn\'t find any expected parameter types (i.e. subdirs) in --parameter-dir \'%s\'. Allowed types: %s, found: %s. Maybe you added the parameter type to the parameter dir path?' % (args.parameter_dir, ' '.join(utils.parameter_type_choices), ' '.join(os.listdir(args.parameter_dir))))

    if os.path.exists(args.default_initial_germline_dir + '/' + args.species):  # ick that is hackey
        args.default_initial_germline_dir += '/' + args.species

    if args.species != 'human' and not args.allele_cluster:
        print(' non-human species \'%s\', turning on allele clustering' % args.species)
        args.allele_cluster = True

    if args.n_max_snps is not None and args.n_max_mutations_per_segment is not None:
        if args.n_max_snps > args.n_max_mutations_per_segment - 10:
            raise Exception('--n-max-snps should be at least ten less than --n-max-mutations-per-segment, but I got %d and %d' % (args.n_max_snps, args.n_max_mutations_per_segment))

    if args.leave_default_germline:
        args.dont_remove_unlikely_alleles = True
        args.allele_cluster = False
        args.dont_find_new_alleles = True

    # ------------------------------------------------------------------------------------
    # required input
    if args.action not in actions_not_requiring_input and [args.infname, args.paired_indir].count(None) == 2:
        if args.paired_loci:
            raise Exception('--infname or --paired-indir is required for action \'%s\' with --paired-loci' % args.action)
        else:
            raise Exception('--infname is required for action \'%s\'' % args.action)

    if args.action == 'get-linearham-info':
        if args.linearham_info_fname is None:  # for some reason setting required=True isn't working
            raise Exception('have to specify --linearham-info-fname')
        if args.sw_cachefname is None and args.parameter_dir is None:
            raise Exception('have to specify --sw-cachefname or --parameter-dir, since we need sw info to calculate linearham inputs')
        if args.extra_annotation_columns is None or 'linearham-info' not in args.extra_annotation_columns:
            args.extra_annotation_columns = utils.add_lists(args.extra_annotation_columns, ['linearham-info'])

    if args.ete_path is not None and args.ete_path == 'None':  # it's nice to be able to unset this from the command line (so we don't make the slow tree plots)
        args.ete_path = None
# Variables that get scanned over for each action.
args.scan_vars = {
    'simu' : ['carry-cap', 'n-sim-seqs-per-gen', 'obs-times', 'seed'],
    'partition' : ['carry-cap', 'n-sim-seqs-per-gen', 'obs-times', 'seed', 'lb-tau'],
}

# The partis python modules live under --partis-dir, so they can only be imported once that's on the path.
sys.path.insert(1, args.partis_dir + '/python')
try:
    import utils
    import treeutils
    import plotting
except ImportError as e:
    print(e)
    raise Exception('couldn\'t import from main partis dir \'%s\' (set with --partis-dir)' % args.partis_dir)

# Convert each list-valued option from its command line string form, using the conversion flags given for it.
for attr_name, conversion_kwargs in [
    ('carry_cap_list', {'intify': True}),
    ('n_sim_seqs_per_gen_list', {'list_of_lists': True, 'intify': True}),
    ('obs_times_list', {'list_of_lists': True, 'intify': True}),
    ('lb_tau_list', {'floatify': True}),
    ('n_tau_lengths_list', {'floatify': True}),
    ('n_generations_list', {'intify': True}),
    ('only_metrics', {}),
]:
    setattr(args, attr_name, utils.get_arg_list(getattr(args, attr_name), **conversion_kwargs))

# Exactly one of the two has to be set, i.e. it's an error if they're both set or both unset.
if (args.n_tau_lengths_list is None) == (args.n_generations_list is None):
    raise Exception('have to set exactly one of --n-tau-lengths, --n-generations')

if args.workdir is None:
    user_name = os.getenv('USER', default='partis-work')
    args.workdir = utils.choose_random_subdir('/tmp/%s/hmms' % user_name)

# ----------------------------------------------------------------------------------------
if args.action == 'get-lb-bounds':
    calc_lb_bounds(args)
# Register the remaining command line options (the parser itself is created earlier in the script).
for flag, add_kwargs in [
    ('--cluster-index', dict(type=int, help='if set, take sequences only from the cluster at this index in the partition, rather than the default of taking all sequences from all clusters. This index is with respect to the cluster order found in the file (which, in contrast to plots made by --plotdir, is *not* sorted by size)')),
    ('--indel-reversed-seqs', dict(action='store_true', help='if set, take sequences that have had any shm indels "reversed" (i.e. insertions are reversed, and deletions are replaced with the germline bases) rather than the default of using sequences from the original input file. Indel-reversed sequences can be convenient because they are by definition the same length as and aligned to the naive sequence.')),
    ('--glfo-dir', dict(help='Directory with germline info. Only necessary for old-style csv output files. Equivalent to a parameter dir with \'/hmm/germline-sets\' appended.')),
    ('--locus', dict(default='igh', help='only used for old-style csv output files')),
    ('--plotdir', dict(help='if set, plot annotation parameters from --fname to --plotdir and exit (you still have to set outfile, sorry, it\'s nice having it be a positional arg, but it doesn\'t get used). To add e.g. per-gene-per-position plots comment/uncomment args in the call below.')),
    ('--fasta-info-separator', dict(default=' ', help='character to use ')),
]:
    parser.add_argument(flag, **add_kwargs)

if 'extract-fasta.py' in sys.argv[0]:  # if they're trying to run this old script, which is now just a link to this one, print a warning and rejigger the arguments so it still works
    this_script = os.path.basename(sys.argv[0])
    real_script = os.path.basename(os.path.realpath(__file__))
    print(' note: running deprecated script %s, which currently is just a link pointing to %s' % (this_script, real_script))
    print(' note: transferring deprecated arguments --input-file and --fasta-output-file to the first two positional arguments (this will continue to work, you only need to change things if you want this warning to go away)')
    # pull out both old-style values before touching the arg list, then put them right after the script name
    old_infname = utils.get_val_from_arglist(sys.argv, '--input-file')
    old_outfname = utils.get_val_from_arglist(sys.argv, '--fasta-output-file')
    utils.insert_in_arglist(sys.argv, [old_infname, old_outfname], sys.argv[0])
    for old_arg in ('--input-file', '--fasta-output-file'):
        utils.remove_from_arglist(sys.argv, old_arg, has_arg=True)

args = parser.parse_args()
args.extra_columns = utils.get_arg_list(args.extra_columns)
assert utils.getsuffix(args.outfile) in ['.csv', '.tsv', '.fa', '.fasta']

# Old-style csv files don't include germline info, so it has to come from a separate directory.
default_glfo_dir = partis_dir + '/data/germlines/human'
if args.glfo_dir is None and utils.getsuffix(args.infile) == '.csv':
    print(' note: reading deprecated csv format, so need to get germline info from a separate directory; --glfo-dir was not set, so using default %s. If it doesn\'t crash, it\'s probably ok.' % default_glfo_dir)
    args.glfo_dir = default_glfo_dir
glfo, annotation_list, cpath = utils.read_output(args.infile, glfo_dir=args.glfo_dir, locus=args.locus)

if args.plotdir is not None:
    from parametercounter import ParameterCounter
    args.region_end_exclusions = dict((r, [0, 0]) for r in utils.regions)  # hackity hackity hackity (zero exclusion for both the 5p and 3p ends of every region)
    pcounter = ParameterCounter(glfo, args)
    for line in annotation_list:
        pcounter.increment(line)
    pcounter.plot(args.plotdir) #, make_per_base_plots=True) #, only_overall=True, make_per_base_plots=True
# Register the remaining command line options (the parser itself is created earlier in the script).
# NOTE the time-based seed default and the random work dir suffix are evaluated here, in this order, before the seed gets set below.
for flag, add_kwargs in [
    ('--seed', dict(type=int, default=int(time.time()))),
    ('--gen-gset', dict(action='store_true')),
    ('--dj-genes', dict(default='IGHD6-19*01:IGHJ4*02', help='.')),
    ('--sim-v-genes', dict(default='IGHV4-39*01:IGHV4-39*06', help='.')),
    ('--inf-v-genes', dict(default='IGHV4-39*01', help='.')),
    ('--snp-positions', {}),
    ('--remove-template-genes', dict(action='store_true')),
    ('--mut-mult', dict(type=float, default=0.5)),
    ('--slurm', dict(action='store_true')),
    ('--outdir', dict(default=fsdir + '/partis/allele-finder')),
    ('--workdir', dict(default=fsdir + '/_tmp/hmms/' + str(random.randint(0, 999999)))),
    ('--comprehensive', dict(action='store_true')),
    ('--n-tests', dict(type=int, default=3)),
    ('--allele-prevalence-freqs', {}),
]:
    parser.add_argument(flag, **add_kwargs)
args = parser.parse_args()

# Convert the list-valued options from their command line string form.
for list_attr in ('dj_genes', 'sim_v_genes', 'inf_v_genes', 'snp_positions'):
    setattr(args, list_attr, utils.get_arg_list(getattr(args, list_attr)))
args.allele_prevalence_freqs = utils.get_arg_list(args.allele_prevalence_freqs, floatify=True)

if args.snp_positions is not None:
    # each entry is a comma-separated string of positions, one entry for each simulated v gene
    positions_per_gene = []
    for pos_str in args.snp_positions:
        positions_per_gene.append([int(p) for p in pos_str.split(',')])
    args.snp_positions = positions_per_gene
    assert len(args.snp_positions) == len(args.sim_v_genes)
    # args.snp_positions = {args.sim_v_genes[ig] : args.snp_positions[ig] for ig in range(len(args.sim_v_genes))}

# Seed both random number generators so runs are reproducible.
if args.seed is not None:
    for seed_fcn in (random.seed, numpy.random.seed):
        seed_fcn(args.seed)

if args.comprehensive:
    comprehensive_test(args)
import random
import argparse
import re
import time
from subprocess import check_call, Popen
sys.path.insert(1, './python')
from humans import humans
import utils

parser = argparse.ArgumentParser()
parser.add_argument('--dataset', choices=['stanford', 'adaptive'], default='adaptive')
parser.add_argument('--only-run')  # colon-separated list of human,subset pairs to run, e.g. A,3:C,8
parser.add_argument('--action', required=True)
args = parser.parse_args()

# Split each 'human,subset' item into its two parts.
args.only_run = utils.get_arg_list(args.only_run)
if args.only_run is not None:
    args.only_run = [item.split(',') for item in args.only_run]

# Work out which input files go with the requested data set.
if args.dataset == 'stanford':
    datadir = '/shared/silo_researcher/Matsen_F/MatsenGrp/data/stanford-lineage/2014-11-17-vollmers'
    files = os.listdir(datadir)
elif args.dataset == 'adaptive':
    adaptive_base = '/shared/silo_researcher/Matsen_F/MatsenGrp/data/bcr/output_sw/'
    datadirs = [adaptive_base + h for h in humans['adaptive']]
    files = []
    for datadir in datadirs:
        # if you switch to naive (N), be careful 'cause A is split in pieces
        files.extend(fname for fname in os.listdir(datadir) if '-M_merged.tsv.bz2' in fname)
def process(args):
    """Validate and normalize the parsed command-line arguments, modifying <args> in place.

    Handles deprecated option/action names, converts list-valued options with utils.get_arg_list(),
    fills in defaults that depend on other options (e.g. --workdir), and raises on inconsistent
    combinations of options.

    Args:
        args: the argparse namespace for the main driver script.

    Raises:
        Exception: for invalid values or inconsistent combinations of options.
        AssertionError: for the handful of combinations that are checked with assert.
        SystemExit: (status 0) after printing git info if the action is 'version'.

    NOTE(review): the block nesting below was reconstructed from a whitespace-collapsed copy of this
    function; the placements that matter are marked where they could not be determined with certainty.
    """
    # ----------------------------------------------------------------------------------------
    # deprecated action/option names
    if args.action == 'run-viterbi':
        print(' note: replacing deprecated action name \'run-viterbi\' with current name \'annotate\' (this doesn\'t change any actual behavior)')
        args.action = 'annotate'
    if args.chain is not None:
        print(' note: transferring argument from deprecated option \'--chain %s\' to new option \'--locus %s\'' % (args.chain, 'ig' + args.chain))
        args.locus = 'ig' + args.chain
        args.chain = None

    # ----------------------------------------------------------------------------------------
    # list-valued options
    args.loci = utils.get_arg_list(args.loci, choices=utils.loci)
    if args.loci is None:  # in principle I should check that at least one of 'em isn't None, but if that's the case it'll crash soon enough
        args.loci = [args.locus]
    else:
        args.locus = args.loci[0]

    args.only_genes = utils.get_arg_list(args.only_genes)
    args.queries = utils.get_arg_list(args.queries)
    args.queries_to_include = utils.get_arg_list(args.queries_to_include)
    args.reco_ids = utils.get_arg_list(args.reco_ids)
    args.istartstop = utils.get_arg_list(args.istartstop, intify=True)
    if args.istartstop is not None:
        if args.istartstop[0] >= args.istartstop[1] or args.istartstop[0] < 0:
            raise Exception('invalid --istartstop specification: %d %d' % (args.istartstop[0], args.istartstop[1]))
    args.n_max_per_region = utils.get_arg_list(args.n_max_per_region, intify=True)
    if len(args.n_max_per_region) != 3:
        raise Exception('n-max-per-region should be of the form \'x:y:z\', but I got ' + str(args.n_max_per_region))
    args.write_additional_cluster_annotations = utils.get_arg_list(args.write_additional_cluster_annotations, intify=True)
    if args.write_additional_cluster_annotations is not None and len(args.write_additional_cluster_annotations) != 2:
        raise Exception('--write-additional-cluster-annotations must be specified as two numbers \'m:n\', but I got %s' % args.write_additional_cluster_annotations)
    args.extra_annotation_columns = utils.get_arg_list(args.extra_annotation_columns, choices=utils.extra_annotation_headers)
    if args.linearham:
        assert args.action == 'partition', '--linearham mode must be run with \'partis partition\''
        args.extra_annotation_columns = utils.add_lists(args.extra_annotation_columns, ['flexbounds', 'relpos'])
    args.cluster_indices = utils.get_arg_list(args.cluster_indices, intify=True)

    # per-region [5p, 3p] exclusion lengths: the configured length for erosions listed in utils.real_erosions, otherwise zero
    args.region_end_exclusions = {
        r: [args.region_end_exclusion_length if ('%s_%s' % (r, e)) in utils.real_erosions else 0 for e in ['5p', '3p']]
        for r in utils.regions
    }
    args.region_end_exclusion_length = None  # there isn't really a big reason to set it to None, but this makes clear that I should only be using the dict version

    args.annotation_clustering_thresholds = utils.get_arg_list(args.annotation_clustering_thresholds, floatify=True)
    args.naive_hamming_bounds = utils.get_arg_list(args.naive_hamming_bounds, floatify=True)
    if args.small_clusters_to_ignore is not None:
        if '-' in args.small_clusters_to_ignore:  # an inclusive range 'lo-hi'
            lo, hi = [int(cluster_size) for cluster_size in args.small_clusters_to_ignore.split('-')]
            args.small_clusters_to_ignore = range(lo, hi + 1)
        else:
            args.small_clusters_to_ignore = utils.get_arg_list(args.small_clusters_to_ignore, intify=True)

    # ----------------------------------------------------------------------------------------
    # seed sequence
    if args.seed_unique_id is not None:
        args.seed_unique_id = args.seed_unique_id.strip()  # protect against the space you may put in front of it if it's got an initial minus sign (better way is to use an equals sign)
        if args.queries is not None and args.seed_unique_id not in args.queries:
            raise Exception('seed uid %s not in --queries %s' % (args.seed_unique_id, ' '.join(args.queries)))
        if args.random_seed_seq:
            raise Exception('can\'t specify both --seed-unique-id and --random-seed-seq')

        if args.queries_to_include is None:  # make sure the seed is in --queries-to-include
            args.queries_to_include = [args.seed_unique_id]
        elif args.seed_unique_id not in args.queries_to_include:
            args.queries_to_include = [args.seed_unique_id] + args.queries_to_include  # may as well put it first, I guess (?)
    elif args.seed_seq is not None:
        args.seed_unique_id = 'seed-seq'

    if args.sw_debug is None:  # if not explicitly set, set equal to regular debug
        args.sw_debug = args.debug

    if args.only_genes is not None:
        for gene in args.only_genes:  # make sure they're all at least valid ig genes
            utils.split_gene(gene)

    # ----------------------------------------------------------------------------------------
    # version info
    if args.print_git_commit or args.action == 'version':
        print('RUN ' + ' '.join(sys.argv))
        tag = subprocess.check_output(['git', 'tag']).split()[-1]
        print(' tag %s' % tag)
        print(' commit %s' % subprocess.check_output(['git', 'rev-parse', 'HEAD']).strip())
        if args.action == 'version':
            sys.exit(0)

    args.is_data = not args.is_simu  # whole code base uses is_data, this is better than changing all of that

    if args.simultaneous_true_clonal_seqs:
        if args.is_data:
            raise Exception('can only pass true clonal families to multi-hmm together on simulation and with --is-simu set')
        if args.n_simultaneous_seqs is not None:
            raise Exception('can\'t specify both --n-simultaneous-seqs and --simultaneous-true-clonal-seqs')

    # ----------------------------------------------------------------------------------------
    # indels and mutation
    if args.no_indels:
        print('forcing --gap-open-penalty to %d to prevent indels, since --no-indels was specified (you can also adjust this penalty directly)' % args.no_indel_gap_open_penalty)
        args.gap_open_penalty = args.no_indel_gap_open_penalty

    # the range check used to be nested inside 'if args.indel_frequency > 0.', which made its '< 0.' half unreachable (so negative values were never rejected)
    if args.indel_frequency is not None and (args.indel_frequency < 0. or args.indel_frequency > 1.):
        raise Exception('--indel-frequency must be in [0., 1.] (got %f)' % args.indel_frequency)
    args.n_indels_per_indeld_seq = utils.get_arg_list(args.n_indels_per_indeld_seq, intify=True)

    if 'tr' in args.locus and args.mutation_multiplier is None:
        args.mutation_multiplier = 0.

    # ----------------------------------------------------------------------------------------
    # working directory
    if args.workdir is None:  # set default here so we know whether it was set by hand or not
        def choose_random_subdir(dirname):
            # keep drawing random subdir names until we find one that doesn't exist yet
            subname = str(random.randint(0, 999999))
            while os.path.exists(dirname + '/' + subname):
                subname = str(random.randint(0, 999999))
            return dirname + '/' + subname

        if args.batch_system is not None and os.path.exists('/fh/fast/matsen_e'):
            args.workdir = choose_random_subdir('/fh/fast/matsen_e/' + os.path.basename(os.getenv('HOME')) + '/_tmp/hmms')
        else:
            args.workdir = choose_random_subdir('/tmp/' + os.path.basename(os.getenv('HOME')) + '/hmms')
        if args.batch_system is not None:
            print(' %s: using batch system %s with default --workdir (%s) -- if this isn\'t visible to the batch nodes on your system, you\'ll need to change it' % (utils.color('red', 'warning'), args.batch_system, args.workdir))
    else:
        args.workdir = args.workdir.rstrip('/')
    if os.path.exists(args.workdir):
        raise Exception('workdir %s already exists' % args.workdir)

    if args.batch_system == 'sge' and args.batch_options is not None:
        if '-e' in args.batch_options or '-o' in args.batch_options:
            print('%s --batch-options contains \'-e\' or \'-o\', but we add these automatically since we need to be able to parse each job\'s stdout and stderr. You can control the directory under which they\'re written with --workdir (which is currently %s).' % (utils.color('red', 'warning'), args.workdir))

    # ----------------------------------------------------------------------------------------
    # output files
    if args.outfname is not None and not args.presto_output:
        if utils.getsuffix(args.outfname) not in ['.csv', '.yaml']:
            raise Exception('unhandled --outfname suffix %s' % utils.getsuffix(args.outfname))
        if utils.getsuffix(args.outfname) != '.yaml':
            print(' %s --outfname uses deprecated file format %s. This will still work fine, but the new default .yaml format is much cleaner, and includes annotations, partitions, and germline info in the same file.' % (utils.color('yellow', 'note:'), utils.getsuffix(args.outfname)))
        if args.action in ['view-annotations', 'view-partitions'] and utils.getsuffix(args.outfname) == '.yaml':
            raise Exception('have to use \'view-output\' action to view .yaml output files')

    if args.presto_output:
        if args.action == 'annotate' and utils.getsuffix(args.outfname) != '.tsv':
            raise Exception('--outfname suffix has to be .tsv for annotation with --presto-output (got %s)' % utils.getsuffix(args.outfname))
        # this list used to contain 'fasta' without the leading dot, unlike every other suffix comparison in this function, so .fasta files were rejected despite the message below
        if args.action == 'partition' and utils.getsuffix(args.outfname) not in ['.fa', '.fasta']:
            raise Exception('--outfname suffix has to be .fa or .fasta for partition with --presto-output (got %s)' % utils.getsuffix(args.outfname))
        if args.aligned_germline_fname is None:
            raise Exception('in order to get presto output, you have to set --aligned-germline-fname to a fasta file with germline alignments for every germline gene, an example is located in data/germlines/imgt-aligned-igh.fa (this isn\'t set by default because imgt alignments are subject to change)')

    if args.cluster_annotation_fname is None and args.outfname is not None and utils.getsuffix(args.outfname) == '.csv':  # if it wasn't set on the command line (<outfname> _was_ set), _and_ if we were asked for a csv, then use the old file name format
        args.cluster_annotation_fname = utils.insert_before_suffix('-cluster-annotations', args.outfname)

    if args.calculate_alternative_naive_seqs or (args.action == 'view-alternative-naive-seqs' and args.persistent_cachefname is None):
        if args.outfname is None:
            raise Exception('have to specify --outfname in order to calculate alternative naive sequences')
        args.persistent_cachefname = utils.insert_before_suffix('-hmm-cache', args.outfname)
        if args.calculate_alternative_naive_seqs and os.path.exists(args.persistent_cachefname):
            if os.stat(args.persistent_cachefname).st_size == 0:
                print(' note: removing existing zero-length persistent cache file %s' % args.persistent_cachefname)
                os.remove(args.persistent_cachefname)
            else:
                raise Exception('persistent cache file %s already exists, but we were asked to --calculate-alternative-naive-seqs. Either it\'s an old file (in which case you should delete it), or you\'ve already got the alternative annotations (so you can just run view-alternative-naive-seqs)' % args.persistent_cachefname)

    # ----------------------------------------------------------------------------------------
    # plotting
    if args.plot_performance:
        print('%s encountered deprecated argument --plot-performance, moving value to --plot-annotation-performance' % utils.color('yellow', 'warning'))
        args.plot_annotation_performance = True
    if args.plot_annotation_performance:
        if args.plotdir is None:
            raise Exception('can\'t plot performance unless --plotdir is specified')
        if not args.is_simu:
            raise Exception('can\'t plot performance unless --is-simu is set')
    if args.action == 'plot-partitions' and args.plotdir is None:
        raise Exception('--plotdir must be specified ')

    if args.parameter_type != 'hmm':
        print(' using non-default parameter type \'%s\'' % args.parameter_type)

    # ----------------------------------------------------------------------------------------
    # simulation
    if args.simulate_from_scratch:
        args.rearrange_from_scratch = True
        args.mutate_from_scratch = True
    if args.flat_mute_freq or args.same_mute_freq_for_all_seqs:
        assert args.mutate_from_scratch

    if args.action == 'simulate':
        if len(args.loci) != 1:
            raise Exception('needs to be implemented')
        if args.batch_system is not None and args.n_procs > 1 and not args.subsimproc:
            print(' %s setting subsimproc' % utils.color('red', 'warning'))
            args.subsimproc = True
        if args.n_trees is None:
            args.n_trees = max(1, int(float(args.n_sim_events) / args.n_procs))
        if args.outfname is None:
            print(' note: no --outfname specified, so nothing will be written to disk')
        if args.n_max_queries != -1:
            print(' note: --n-max-queries is not used when simulating (use --n-sim-events to set the simulated number of rearrangemt events)')

        # end result of this block: shm/reco parameter dirs are set (unless we're doing their bit from scratch), --parameter-dir is set to None (and if --parameter-dir was set but shm/reco were _not_ set, we've just used --parameter-dir for either/both as needed)
        if args.parameter_dir is not None:
            if args.rearrange_from_scratch or args.mutate_from_scratch:
                raise Exception('can\'t set --parameter-dir if rearranging or mutating from scratch (use --reco-parameter-dir and/or --shm-parameter-dir)')
            if args.reco_parameter_dir is not None or args.shm_parameter_dir is not None:
                raise Exception('can\'t set --parameter-dir if either --reco-parameter-dir or --shm-parameter-dir are also set')
            args.reco_parameter_dir = args.parameter_dir
            args.shm_parameter_dir = args.parameter_dir
            args.parameter_dir = None
        if args.rearrange_from_scratch and args.reco_parameter_dir is not None:
            raise Exception('doesn\'t make sense to set both --rearrange-from-scratch and --reco-parameter-dir')
        if args.mutate_from_scratch and args.shm_parameter_dir is not None:
            raise Exception('doesn\'t make sense to set both --mutate-from-scratch and --shm-parameter-dir')
        if args.reco_parameter_dir is None and not args.rearrange_from_scratch:
            raise Exception('have to either set --rearrange-from-scratch or --reco-parameter-dir')
        if args.shm_parameter_dir is None and not args.mutate_from_scratch:
            raise Exception('have to either set --mutate-from-scratch or --shm-parameter-dir')

        # NOTE(review): presumably this check belongs inside the 'simulate' block -- confirm against the properly indented original
        if args.generate_germline_set and not args.rearrange_from_scratch:
            raise Exception('can only --generate-germline-set if also rearranging from scratch (set --rearrange-from-scratch)')

    # ----------------------------------------------------------------------------------------
    # parameters and germline info
    if args.parameter_dir is not None:
        args.parameter_dir = args.parameter_dir.rstrip('/')

    if args.count_parameters and not args.dont_write_parameters:
        raise Exception('if you set --count-parameters, you should also set --dont-write-parameters to make sure you\'re not accidentally overwriting existing parameters ')

    if os.path.exists(args.default_initial_germline_dir + '/' + args.species):  # ick that is hackey
        args.default_initial_germline_dir += '/' + args.species

    if args.species != 'human' and not args.allele_cluster:
        print(' non-human species \'%s\', turning on allele clustering' % args.species)
        args.allele_cluster = True

    if args.n_max_snps is not None and args.n_max_mutations_per_segment is not None:
        if args.n_max_snps > args.n_max_mutations_per_segment - 10:
            raise Exception('--n-max-snps should be at least ten less than --n-max-mutations-per-segment, but I got %d and %d' % (args.n_max_snps, args.n_max_mutations_per_segment))

    if args.n_alleles_per_gene is None:
        if not args.dont_find_new_alleles:
            args.n_alleles_per_gene = 1
        else:
            args.n_alleles_per_gene = 2

    if args.leave_default_germline:
        args.dont_remove_unlikely_alleles = True
        args.allele_cluster = False
        args.dont_find_new_alleles = True

    if args.infname is None and args.action not in ['simulate', 'view-output', 'view-annotations', 'view-partitions', 'view-cluster-annotations', 'plot-partitions', 'view-alternative-naive-seqs']:
        raise Exception('--infname is required for action \'%s\'' % args.action)
import os from subprocess import Popen, PIPE, check_call, check_output, CalledProcessError import argparse import random current_script_dir = os.path.dirname(os.path.realpath(__file__)).replace('/bin', '/python') if not os.path.exists(current_script_dir): print 'WARNING current script dir %s doesn\'t exist, so python path may not be correctly set' % current_script_dir sys.path.insert(1, current_script_dir) import utils parser = argparse.ArgumentParser() parser.add_argument('--actions', required=True) parser.add_argument('--timegrep', action='store_true') args = parser.parse_args() args.actions = utils.get_arg_list(args.actions) fsdir = '/fh/fast/matsen_e/dralph/work/partis-dev/_output' simfbase = 'simu-7-leaves-1.0-mutate' simfbase_seed = 'simu-2.3-leaves-1.0-mutate-zipf' human = '021-018' istartstopstr_list_str = '0:250 250:750 750:1500 1500:2500 2500:4000 4000:6500 6500:9500 9500:13500 13500:18500 18500:26000 26000:36000 36000:51000 51000:71000 71000:101000 101000:141000 141000:191000 191000:266000 266000:366000 350000:500000 366000:516000' istartstopstr_list_str_seed = '0:1500 1500:4500 4500:8500 8500:13500 13500:21000 21000:31000 51000:71000 71000:101000 101000:141000 141000:191000 191000:266000 266000:366000 366000:516000 516000:816000 816000:1316000 7:500007 500007:1000007 1000007:1500007 1316000:2066000 7:1000007' istartstopstr_list = istartstopstr_list_str.split(' ') istartstopstr_list_seed = istartstopstr_list_str_seed.split(' ') istartstoplist = [] for istartstopstr in istartstopstr_list: istartstoplist.append([int(iss) for iss in istartstopstr.split(':')]) n_query_list = [istartstop[1] - istartstop[0] for istartstop in istartstoplist] istartstoplist_seed = [] for istartstopstr in istartstopstr_list_seed:
'--old-output-structure', action='store_true', help= 'output paths corresponding to clustering paper, i.e. with everything in /fh/fast/matsen_e/dralph' ) all_actions = [ 'cache-parameters', 'simulate', 'vjcdr3-partition', 'partition', 'naive-hamming-partition', 'vsearch-partition', 'seed-partition', 'seed-naive-hamming-partition', 'run-viterbi', 'run-changeo', 'run-mixcr', 'run-igscueal', 'synthetic', 'write-plots', 'compare-subsets', 'annotate-seed-clusters' ] parser.add_argument('--actions', required=True, choices=all_actions) #default=':'.join(all_actions)) args = parser.parse_args() args.actions = utils.get_arg_list(args.actions) args.mutation_multipliers = utils.get_arg_list(args.mutation_multipliers, floatify=True) args.n_leaf_list = utils.get_arg_list(args.n_leaf_list, floatify=True) args.istartstop = utils.get_arg_list(args.istartstop, intify=True) args.istartstoplist = utils.get_arg_list(args.istartstoplist, intify=True, list_of_lists=True) args.humans = utils.get_arg_list(args.humans) args.hfrac_bound_list = utils.get_arg_list(args.hfrac_bound_list, floatify=True, list_of_lists=True) args.expected_methods = utils.get_arg_list(args.expected_methods) args.synthetic_partitions = utils.get_arg_list(args.synthetic_partitions) for isp in range(len(args.synthetic_partitions) ): # I really shouldn't have set it up this way
parser.add_argument('--n-to-partition', type=int, default=5000) parser.add_argument('--n-data-to-cache', type=int, default=50000) parser.add_argument('--n-sim-seqs', type=int, default=10000) parser.add_argument('--n-subsets', type=int) parser.add_argument('--istartstop') # NOTE usual zero indexing parser.add_argument('--startstoplist') # list of istartstops for comparisons parser.add_argument('--dont-normalize', action='store_true') parser.add_argument('--logaxis', action='store_true') parser.add_argument('--zoom', action='store_true') parser.add_argument('--humans', default=None) #'A') parser.add_argument('--no-mixcr', action='store_true') parser.add_argument('--no-changeo', action='store_true') all_actions = ['cache-data-parameters', 'simulate', 'cache-simu-parameters', 'partition', 'naive-hamming-partition', 'vsearch-partition', 'run-viterbi', 'run-changeo', 'run-mixcr', 'run-igscueal', 'write-plots', 'compare-sample-sizes', 'compare-subsets'] parser.add_argument('--actions', required=True) #, choices=all_actions) #default=':'.join(all_actions)) args = parser.parse_args() args.actions = utils.get_arg_list(args.actions) args.mutation_multipliers = utils.get_arg_list(args.mutation_multipliers, intify=True) args.n_leaf_list = utils.get_arg_list(args.n_leaf_list, intify=True) args.istartstop = utils.get_arg_list(args.istartstop, intify=True) args.startstoplist = utils.get_arg_list(args.startstoplist) args.humans = utils.get_arg_list(args.humans) if 'cache-data-parameters' in args.actions: args.data = True assert args.subset is None or args.istartstop is None # dosn't make sense to set both of them if args.subset is not None: if 'write-plots' not in args.actions: assert args.n_subsets == 10 # for all the subset plots, I split into ten subsets, then ended up only using the first thre of 'em, so you have to set n_subsets to 10 if you're running methods, but then to 3 when you're writing plots args.n_to_partition = 1300
import glutils import collections import colored_traceback.always parser = argparse.ArgumentParser() parser.add_argument('gldir1') parser.add_argument('gldir2') parser.add_argument( '--names', default='+gl-1:+gl-2', help= 'colon-separated list of length 2 with labels for gldir1 and gldir2, which will be appended to each gene name in the ascii output' ) parser.add_argument('--locus', default='igh') args = parser.parse_args() args.names = utils.get_arg_list(args.names) glfos = [] for name, gldir in zip(args.names, [args.gldir1, args.gldir2]): print '%s:' % utils.color('yellow', name) glfos.append(glutils.read_glfo(gldir, args.locus, debug=True)) for region in [r for r in utils.regions if r in glfos[0]['seqs']]: aset, bset = [set(g['seqs'][region]) for g in glfos] tmpfo = glutils.get_empty_glfo( args.locus) # make a new glfo that will only have non-shared genes for glabel, gset, gfo in zip( args.names, [aset - bset, bset - aset], glfos): # <gset> is the genes that're only in <glabel> for ogene in gset:
help='see bcr-phylo docs') parser.add_argument('--selection-strength', type=float, default=1., help='see bcr-phylo docs') parser.add_argument('--lb-tau', type=float, help='') parser.add_argument('--dont-observe-common-ancestors', action='store_true') parser.add_argument( '--parameter-variances', help= 'if set, the specified parameters are drawn from a uniform distribution of the specified (half-)width (with mean from the regular argument) for each family, rather than having the same value for all families. Format example: n-sim-seqs-per-generation,10:carry-cap,150' ) args = parser.parse_args() args.obs_times = utils.get_arg_list(args.obs_times, intify=True) args.n_sim_seqs_per_generation = utils.get_arg_list( args.n_sim_seqs_per_generation, intify=True) args.actions = utils.get_arg_list(args.actions, choices=all_actions) args.parameter_variances = utils.get_arg_list( args.parameter_variances, key_val_pairs=True, floatify=True, choices=[ 'selection-strength', 'obs-times', 'n-sim-seqs-per-generation', 'carry-cap' ] ) # if you add more, make sure the bounds enforcement and conversion stuff in get_vpar_val() are still ok # ---------------------------------------------------------------------------------------- if 'simu' in args.actions:
parser.add_argument('--outdir', required=True) parser.add_argument('--plotdirs', required=True) parser.add_argument('--names', required=True) parser.add_argument('--performance-plots', action='store_true') parser.add_argument('--colors', default=':'.join(plotting.default_colors)) parser.add_argument('--linewidths', default=':'.join(plotting.default_linewidths)) parser.add_argument('--gldirs') #, default=['data/germlines/human']) parser.add_argument('--locus', default='igh') parser.add_argument('--normalize', action='store_true') parser.add_argument('--extra-stats') parser.add_argument('--translegend') parser.add_argument('--log', default='') args = parser.parse_args() args.plotdirs = utils.get_arg_list(args.plotdirs) args.names = utils.get_arg_list(args.names) args.colors = utils.get_arg_list(args.colors) args.linewidths = utils.get_arg_list(args.linewidths) args.gldirs = utils.get_arg_list(args.gldirs) args.translegend = utils.get_arg_list(args.translegend, floatify=True) for iname in range(len(args.names)): args.names[iname] = args.names[iname].replace('@', ' ') # if you just pass in one parent directory, we assume <args.names> contains the desired subdirs if len(args.plotdirs) == 1: parentdir = args.plotdirs[0] args.plotdirs = [parentdir + '/' + n for n in args.names] if len(args.plotdirs) != len(args.names): raise Exception('poorly formatted args:\n %s\n %s' %
parser.add_argument('--smc-particles', type=int, default=1, help='Number of particles (clustering paths) to simulate with SMC')
parser.add_argument('--gap-open-penalty', type=int, default=30, help='Penalty for indel creation in Smith-Waterman step.')
parser.add_argument('--match-mismatch', default='5:1', help='match:mismatch scores for smith-waterman.')
parser.add_argument('--max-logprob-drop', type=float, default=5., help='stop glomerating when the total logprob has dropped by this much')
parser.add_argument('--n-partitions-to-write', type=int, default=100, help='')

# temporary arguments (i.e. will be removed as soon as they're not needed)
parser.add_argument('--gtrfname', default='data/recombinator/gtr.txt', help='File with list of GTR parameters. Fed into bppseqgen along with the chosen tree')
# NOTE command to generate gtr parameter file: [stoat] partis/ > zcat /shared/silo_researcher/Matsen_F/MatsenGrp/data/bcr/output_sw/A/04-A-M_gtr_tr-qi-gi.json.gz | jq .independentParameters | grep -v '[{}]' | sed 's/["\:,]//g' | sed 's/^[ ][ ]*//' | sed 's/ /,/' | sort >data/gtr.txt

# uncommon arguments
parser.add_argument('--apply-choice_probs_in_sw', action='store_true', help='Apply gene choice probs in Smith-Waterman step. Probably not a good idea (see comments in waterer.py).')
parser.add_argument('--joint-emission', action='store_true', help='Use information about both sequences when writing pair emission probabilities?')

args = parser.parse_args()
args.only_genes = utils.get_arg_list(args.only_genes)

# --n-procs is either a single number, or a pair whose second entry is stored as <n_fewer_procs>
args.n_procs = utils.get_arg_list(args.n_procs, intify=True)
args.n_fewer_procs = args.n_procs[0] if len(args.n_procs) == 1 else args.n_procs[1]
args.n_procs = args.n_procs[0]

if args.smc_particles != 1:
    raise Exception('sequential monte carlo is not supported at this juncture.')

# The default has to be filled in before the slurm check below: that check does a substring test on
# args.workdir, which used to fail with a TypeError when --slurm was given without --workdir.
if args.workdir is None:  # set default here so we know whether it was set by hand or not
    args.workdir = '/tmp/' + os.path.basename(os.getenv('HOME')) + '/hmms/' + str(random.randint(0, 999999))
if args.slurm and '/tmp' in args.workdir:
    raise Exception('it appears that <workdir> isn\'t set to something visible to all slurm nodes')
if os.path.exists(args.workdir):
    raise Exception('workdir %s already exists' % args.workdir)
import argparse
import random

# the python modules live in a 'python' dir next to the 'bin' dir that this script is in
current_script_dir = os.path.dirname(os.path.realpath(__file__)).replace('/bin', '/python')
if not os.path.exists(current_script_dir):
    print('WARNING current script dir %s doesn\'t exist, so python path may not be correctly set' % current_script_dir)
sys.path.insert(1, current_script_dir)
import utils

parser = argparse.ArgumentParser()
parser.add_argument('--actions', required=True)
parser.add_argument('--timegrep', action='store_true')
args = parser.parse_args()
args.actions = utils.get_arg_list(args.actions)

# ----------------------------------------------------------------------------------------
# hard-coded locations and sample names
fsdir = '/fh/fast/matsen_e/dralph/work/partis-dev/_output' + '/update-17'
# fsdir = '/fh/fast/matsen_e/processed-data/partis/clustering-paper/vollmers' #/021-018/
simfbase = 'simu-7-leaves-1.0-mutate'
simfbase_seed = 'simu-2.3-leaves-1.0-mutate-zipf'
human = '021-018'

# ----------------------------------------------------------------------------------------
# space-separated 'istart:istop' index ranges, one per sample size
istartstopstr_list_str = '0:250 250:750 750:1500 1500:2500 2500:4000 4000:6500 6500:9500 9500:13500 13500:18500 18500:26000 26000:36000 36000:51000 51000:71000 71000:101000'  # 101000:141000 141000:191000 191000:266000 266000:366000 350000:500000 366000:516000'
istartstopstr_list_str_seed = '0:1500 1500:4500 4500:8500 8500:13500 13500:21000 21000:31000 51000:71000 71000:101000 101000:141000 141000:191000 191000:266000 266000:366000 366000:516000 516000:816000 816000:1316000 7:500007 500007:1000007 1000007:1500007 1316000:2066000 7:1000007'
istartstopstr_list = istartstopstr_list_str.split(' ')
istartstopstr_list_seed = istartstopstr_list_str_seed.split(' ')

# convert each 'istart:istop' string to a list of two ints, then get the number of queries in each range
istartstoplist = [[int(iss) for iss in istartstopstr.split(':')] for istartstopstr in istartstopstr_list]
n_query_list = []
for istartstop in istartstoplist:
    n_query_list.append(istartstop[1] - istartstop[0])
istartstoplist_seed = []
"naive-hamming-partition", "vsearch-partition", "seed-partition", "seed-naive-hamming-partition", "run-viterbi", "run-changeo", "run-mixcr", "run-igscueal", "synthetic", "write-plots", "compare-subsets", "annotate-seed-clusters", ] parser.add_argument("--actions", required=True, choices=all_actions) # default=':'.join(all_actions)) args = parser.parse_args() args.actions = utils.get_arg_list(args.actions) args.mutation_multipliers = utils.get_arg_list(args.mutation_multipliers, floatify=True) args.n_leaf_list = utils.get_arg_list(args.n_leaf_list, floatify=True) args.istartstop = utils.get_arg_list(args.istartstop, intify=True) args.istartstoplist = utils.get_arg_list(args.istartstoplist, intify=True, list_of_pairs=True) args.humans = utils.get_arg_list(args.humans) args.hfrac_bound_list = utils.get_arg_list(args.hfrac_bound_list, floatify=True, list_of_pairs=True) args.expected_methods = utils.get_arg_list(args.expected_methods) args.synthetic_partitions = utils.get_arg_list(args.synthetic_partitions) for isp in range(len(args.synthetic_partitions)): # I really shouldn't have set it up this way args.synthetic_partitions[isp] = "misassign-" + args.synthetic_partitions[isp] args.seed_cluster_bounds = utils.get_arg_list(args.seed_cluster_bounds, intify=True) assert args.subset is None or args.istartstop is None # dosn't make sense to set both of them if args.subset is not None:
parser.add_argument('--n-procs-per-test', type=int, default=5)
# plain on/off switches, registered in the same order as before
for _switch in ('--plot', '--write-zenodo-files', '--plot-annotation-performance', '--add-gene-counts-to-tree-plots',
                '--print-table', '--no-slurm', '--plotcache', '--only-print', '--check', '--dryrun'):
    parser.add_argument(_switch, action='store_true')
parser.add_argument('--label', default='xxx')
parser.add_argument('--ete-path', default='/home/' + os.getenv('USER') + '/anaconda_ete/bin')
args = parser.parse_args()

args.methods = sorted(utils.get_arg_list(args.methods))
args.v_genes = utils.get_arg_list(args.v_genes)
args.n_event_list = utils.get_arg_list(args.n_event_list, intify=True)

if args.print_table and args.action == 'data':  # only want to print table for single samples
    assert not (args.sample_vs_sample or args.method_vs_method)

# ----------------------------------------------------------------------------------------
# output goes in <alfdir>/<label>/<action> (the label level is left out if --label is None)
alfdir = utils.fsdir() + '/partis/allele-finder'
_outdir_parts = [alfdir]
if args.label is not None:
    _outdir_parts.append(args.label)
_outdir_parts.append(args.action)
baseoutdir = '/'.join(_outdir_parts)

if args.varvals is None:  # fall back to the default values for this action
    args.varvals = default_varvals[args.action]
kwargs = dict()
parser.add_argument('--plot-annotation-performance', action='store_true', help='see bin/partis --help') parser.add_argument('--methods', default='simu:partis', help='colon-separated list of methods to run. By default runs simulation, and then partis inference (igdiscover and tigger, if installed, are the other options)') parser.add_argument('--outdir', default=utils.fsdir() + '/partis/allele-finder') parser.add_argument('--inf-glfo-dir', help='default set below') parser.add_argument('--simfname', help='default set below') parser.add_argument('--workdir', default=utils.fsdir() + '/_tmp/hmms/' + str(random.randint(0, 999999))) parser.add_argument('--n-tests', type=int, help='instead of just running once, run <N> independent tests simultaneously') parser.add_argument('--iteststart', type=int, default=0, help='for use with --n-tests, if you want to add more tests on') parser.add_argument('--plot-and-fit-absolutely-everything', type=int, help='fit every single position for this <istart> and write every single corresponding plot (slow as hell, and only for debugging/making plots for paper)') parser.add_argument('--partis-path', default='./bin/partis') parser.add_argument('--species', default='human', choices=('human', 'macaque')) parser.add_argument('--locus', default='igh') args = parser.parse_args() assert args.locus == 'igh' # would just need to update some things, e.g. 
propagate through to the various methods args.dj_genes = utils.get_arg_list(args.dj_genes) args.sim_v_genes = utils.get_arg_list(args.sim_v_genes) args.inf_v_genes = utils.get_arg_list(args.inf_v_genes) args.allele_prevalence_freqs = utils.get_arg_list(args.allele_prevalence_freqs, floatify=True) args.methods = utils.get_arg_list(args.methods) available_methods = set(['simu', 'partis', 'full', 'tigger-default', 'tigger-tuned', 'igdiscover']) if len(set(args.methods) - available_methods) > 0: raise Exception('unexpected --methods: %s' % ' '.join(set(args.methods) - available_methods)) args.default_germline_dir = 'old-glfo/%s' % args.species # 'data/germlines/%s' % args.species positions = { 'snp' : utils.get_arg_list(args.snp_positions), 'indel' : utils.get_arg_list(args.indel_positions), } numbers = {
'--ref-label', help='label (in --glslabels) corresponding to simulation/truth') args = parser.parse_args() sys.path.insert(1, args.partis_dir + '/python') try: import utils import glutils except ImportError as e: print e raise Exception( 'couldn\'t import from main partis dir \'%s\' (set with --partis-dir)' % args.partis_dir) args.glsfnames = utils.get_arg_list(args.glsfnames) args.glslabels = utils.get_arg_list(args.glslabels) args.legends = utils.get_arg_list(args.legends) if not os.path.exists(args.muscle_path): raise Exception( 'muscle binary %s doesn\'t exist (set with --muscle-path)' % args.muscle_path) if not os.path.exists(args.raxml_path): raise Exception('raxml binary %s doesn\'t exist (set with --raxml-path)' % args.raxml_path) if not os.path.exists(args.plotdir): os.makedirs(args.plotdir) args.leafheight = 20 if args.leaf_names else 10 # arg, kinda messy args.novel_dot_size = 2.5 assert len(args.glslabels) == len(set(args.glslabels)) # no duplicates
if alleles is None: # take all of 'em alleles = [ utils.allele(g) for g in glfo['seqs'][args.region] if base == get_base(g) ] return [ args.locus.upper() + args.region.upper() + base + '*' + al for al in alleles ] if args.bases == 'all': glutils.print_glfo(glfo) sys.exit(0) args.bases = utils.get_arg_list(args.bases) args.allele_numbers = utils.get_arg_list(args.allele_numbers) genes = [ g for base in args.bases for g in get_genes(base, args.allele_numbers) ] if len(genes) == 0: raise Exception( 'couldn\'t find any genes for the specified --bases %s\n choices:\n %s' % (' '.join(args.bases), ' '.join( sorted(set([get_base(g) for g in glfo['seqs'][args.region]]))))) args.other_genes = utils.get_arg_list(args.other_genes) if args.other_genes is not None: genes += args.other_genes seqstrs = ['' for _ in range(len(genes))] snpstrs = ['' for _ in range(len(genes))]
import utils
import glutils

# command line: one gene family (--base), optionally restricted to some of its alleles,
# plus any extra genes to tack on with --other-genes
parser = argparse.ArgumentParser()
parser.add_argument('--base', required=True)
parser.add_argument('--alleles')
parser.add_argument('--other-genes')
parser.add_argument('--region', default='v')
parser.add_argument('--chain', default='h')
parser.add_argument('--glfo-dir', default='data/germlines/human')
args = parser.parse_args()

glfo = glutils.read_glfo(args.glfo_dir, args.chain)

if args.alleles is not None:
    args.alleles = utils.get_arg_list(args.alleles)
else:
    # no alleles requested: use every gene in this region whose
    # <primary version>-<sub version> string equals --base
    args.alleles = [utils.allele(g) for g in glfo['seqs'][args.region]
                    if utils.primary_version(g) + '-' + utils.sub_version(g) == args.base]
args.other_genes = utils.get_arg_list(args.other_genes)

# for g, s in glfo['seqs']['v'].items():
#     print '%s %3d' % (utils.color_gene(g, width=20), len(s) - glfo['cyst-positions'][g])
# sys.exit()

# base = '4-59'
# a1, a2 = '12', '01'
# gene1, gene2 = 'IGHV' + base + '*' + a1, 'IGHV' + base + '*' + a2

# full gene names are IG<chain><region><base>*<allele>
gene_prefix = 'IG' + args.chain.upper() + args.region.upper() + args.base + '*'
genes = [gene_prefix + al for al in args.alleles]
if args.other_genes is not None:
    genes.extend(args.other_genes)

# no conserved codon position is looked up for the d region
if args.region == 'd':
    codon_positions = None
else:
    codon_positions = glfo[utils.conserved_codons[args.chain][args.region] + '-positions']
# default for --partis-dir: the directory containing this script, with every '/bin' substring removed
default_partis_dir = os.path.dirname(os.path.realpath(__file__)).replace('/bin', '')

parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('--treefname', required=True)
parser.add_argument('--outfname', required=True)
parser.add_argument('--lb-metric', default='lbi', choices=affy_metrics + delta_affy_metrics)
parser.add_argument('--affy-key', default='affinity', choices=['affinity', 'relative_affinity'])
# parser.add_argument('--lb-tau', required=True, type=float)
parser.add_argument('--metafname')
parser.add_argument('--queries-to-include')
parser.add_argument('--tree-style', default='rectangular', choices=['rectangular', 'circular'])
parser.add_argument('--partis-dir', default=default_partis_dir, help='path to main partis install dir')
parser.add_argument('--log-lbr', action='store_true')
args = parser.parse_args()

# the partis modules can only be imported after --partis-dir is known
sys.path.insert(1, args.partis_dir + '/python')
try:
    import utils
    import treeutils
    import glutils
    import plotting
except ImportError as e:
    print(e)
    raise Exception('couldn\'t import from main partis dir \'%s\' (set with --partis-dir)' % args.partis_dir)

args.queries_to_include = utils.get_arg_list(args.queries_to_include)

# optional yaml meta info, left as None if no file was given
if args.metafname is None:
    args.metafo = None
else:
    # NOTE(review): yaml.CLoader is not the safe loader, so --metafname should only ever point at trusted files
    with open(args.metafname) as metafile:
        args.metafo = yaml.load(metafile, Loader=yaml.CLoader)

plot_trees(args)
parser.add_argument('--n-queries', type=int, default=-1)
parser.add_argument('--queries')
parser.add_argument('--plotdir', required=True)
parser.add_argument('--debug', type=int, default=0, choices=[0, 1, 2])
parser.add_argument('--datadir', default='data/imgt')
parser.add_argument('--infname')  # input html file, if you chose the 'html' option on the imgt website
parser.add_argument('--simfname')  # simulation csv file corresponding to the queries in <infname> or <indir>
parser.add_argument('--indir')  # folder with imgt result files data/performance/imgt/IMGT_HighV-QUEST_individual_files_folder'
# These two flags were declared with a single leading dash, unlike every other option here.
# The double-dash spellings are added as aliases; the old spellings still work, and dest is
# pinned explicitly so the attribute names on args are unchanged.
parser.add_argument('-skip-missing-genes', '--skip-missing-genes', dest='skip_missing_genes', action='store_true')
parser.add_argument('-dont-skip-or15-genes', '--dont-skip-or15-genes', dest='dont_skip_or15_genes', action='store_true',
                    help='by default skip all the genes with the /OR1[56] bullshit, since they don\'t seem to be in imgt\'s output')
args = parser.parse_args()
args.queries = utils.get_arg_list(args.queries)

# if os.path.isdir('data/performance/imgt'):
#     print 'skipping tar xzf \'cause output\'s already there'
# else:
#     print 'untgzing...'
#     check_call(['tar', 'xzf', 'data/performance/imgt.tgz', '-C', 'data/performance/'])  # untar the imgt output

# all of the work is handed to the parser class, built from the parsed args
imgtparser = IMGTParser(args)