def parse_arguments():
    """Parse the command line of the mutation strand/mapping check.

    Builds the argument parser, parses the process arguments and starts
    logging through ``utils.start_logging``.

    Returns
    -------
    dict
        Parsed options keyed by argument destination name.
    """
    description = ('Checks mutations to see what strand they are reported on '
                   'and for unmapped mutations.')
    cli = argparse.ArgumentParser(description=description)

    # options that control logging
    cli.add_argument('-ll', '--log-level',
                     type=str, action='store', default='',
                     help='Write a log file (--log-level=DEBUG for debug mode, '
                          '--log-level=INFO for info mode)')
    cli.add_argument('-l', '--log',
                     type=str, action='store', default='stdout',
                     help='Path to log file. (Default: stdout)')
    cli.add_argument('-v', '--verbose',
                     action='store_true', default=False,
                     help='Flag for more verbose log output')

    # options specific to this program
    cli.add_argument('-f', '--fasta', type=str, required=True,
                     help='Human genome FASTA file')
    cli.add_argument('-m', '--mutations', type=str, required=True,
                     help='Text file specifying mutations in the format '
                          'required for permutation test')
    cli.add_argument('-b', '--bed', type=str, required=True,
                     help='BED file of reference transcripts')
    cli.add_argument('-u', '--unmapped', type=str, required=True,
                     help='Save mutations that could not be found on the '
                          'reference transcript')
    args = cli.parse_args()

    # pick the log destination: an explicit path wins, a bare log level
    # requests an auto-named file, otherwise log output is discarded
    if args.log:
        log_destination = args.log
    elif args.log_level:
        log_destination = ''  # auto-name the log file
    else:
        log_destination = os.devnull
    utils.start_logging(log_file=log_destination,
                        log_level=args.log_level,
                        verbose=args.verbose)

    return vars(args)
def parse_arguments():
    """Parse the command line of the gene sequence extraction script.

    Builds the argument parser, parses the process arguments, starts
    logging through ``utils.start_logging`` and logs the invoking command.

    Returns
    -------
    dict
        Parsed options keyed by argument destination name.
    """
    cli = argparse.ArgumentParser(
        description='Extracts gene sequences from a genomic FASTA file')

    # options that control logging
    cli.add_argument('-ll', '--log-level',
                     type=str, action='store', default='',
                     help='Write a log file (--log-level=DEBUG for debug mode, '
                          '--log-level=INFO for info mode)')
    cli.add_argument('-l', '--log',
                     type=str, action='store', default='',
                     help='Path to log file. (accepts stdout)')
    cli.add_argument('-v', '--verbose',
                     action='store_true', default=False,
                     help='Flag for more verbose log output')

    # options specific to this program
    cli.add_argument('-i', '--input', type=str, required=True,
                     help='Human genome FASTA file')
    cli.add_argument('-b', '--bed', type=str, required=True,
                     help='BED file annotation of genes')
    cli.add_argument('-o', '--output', type=str, required=True,
                     help='Output a single FASTA file with gene sequences')
    args = cli.parse_args()

    # pick the log destination: an explicit path wins, a bare log level
    # requests an auto-named file, otherwise log output is discarded
    if args.log:
        log_destination = args.log
    elif args.log_level:
        log_destination = ''  # auto-name the log file
    else:
        log_destination = os.devnull
    utils.start_logging(log_file=log_destination,
                        log_level=args.log_level,
                        verbose=args.verbose)

    # record the command the user entered
    command_line = ' '.join(sys.argv)
    logger.info('Command: {0}'.format(command_line))

    return vars(args)
def parse_arguments():
    """Parse the command line of the randomization-based oncogene/TSG test.

    Builds the argument parser, parses the process arguments, starts
    logging through ``utils.start_logging``, checks that a genome FASTA was
    supplied when ``--use-unmapped`` is set, and logs the invoking command.

    Returns
    -------
    dict
        Parsed options keyed by argument destination name.

    Raises
    ------
    SystemExit
        With status 1 when ``--use-unmapped`` is given without ``--genome``
        (argparse itself also exits on malformed command lines).
    """
    # make a parser
    info = 'Performs a randomization-based test on the oncogene and TSG score'
    parser = argparse.ArgumentParser(description=info)

    # logging arguments
    parser.add_argument('-ll', '--log-level',
                        type=str,
                        action='store',
                        default='',
                        help='Write a log file (--log-level=DEBUG for debug mode, '
                        '--log-level=INFO for info mode)')
    parser.add_argument('-l', '--log',
                        type=str,
                        action='store',
                        default='',
                        help='Path to log file. (accepts "stdout")')
    parser.add_argument('-v', '--verbose',
                        action='store_true',
                        default=False,
                        help='Flag for more verbose log output')

    # program arguments
    help_str = 'gene FASTA file from extract_gene_seq.py script'
    parser.add_argument('-i', '--input',
                        type=str, required=True,
                        help=help_str)
    help_str = 'DNA mutations file'
    parser.add_argument('-m', '--mutations',
                        type=str, required=True,
                        help=help_str)
    help_str = 'BED file annotation of genes'
    parser.add_argument('-b', '--bed',
                        type=str, required=True,
                        help=help_str)
    help_str = 'Directory containing score information in pickle files (Default: None).'
    parser.add_argument('-s', '--score-dir',
                        type=str, default=None,
                        help=help_str)
    help_str = ('Directory containing neighbor graph information in pickle '
                'files (Default: None).')
    parser.add_argument('-ng', '--neighbor-graph-dir',
                        type=str, default=None,
                        help=help_str)
    help_str = ('Number of processes to use. 0 indicates using a single '
                'process without using a multiprocessing pool '
                '(more means Faster, default: 0).')
    parser.add_argument('-p', '--processes',
                        type=int, default=0,
                        help=help_str)
    help_str = ('Number of iterations for null model. p-value precision '
                'increases with more iterations, however this will also '
                'increase the run time (Default: 10000).')
    parser.add_argument('-n', '--num-iterations',
                        type=int, default=10000,
                        help=help_str)
    help_str = ('Number of iterations more significant than the observed statistic '
                'to stop further computations. This decreases compute time spent in resolving '
                'p-values for non-significant genes. (Default: 1000).')
    parser.add_argument('-sc', '--stop-criteria',
                        type=int, default=1000,
                        help=help_str)
    # the help text names the values the option actually documents as valid
    help_str = ('Kind of permutation test to perform ("oncogene" or "tsg"). '
                'The "oncogene" permutation test is intended to find '
                'oncogenes using position based statistics. '
                'The "tsg" permutation test is intended to find tumor '
                'suppressor genes. (Default: oncogene)')
    parser.add_argument('-k', '--kind',
                        type=str, default='oncogene',
                        help=help_str)
    help_str = ('Number of DNA bases to use as context. 0 indicates no context. '
                '1 indicates only use the mutated base. 1.5 indicates using '
                'the base context used in CHASM '
                '(http://wiki.chasmsoftware.org/index.php/CHASM_Overview). '
                '2 indicates using the mutated base and the upstream base. '
                '3 indicates using the mutated base and both the upstream '
                'and downstream bases. (Default: 1.5)')
    parser.add_argument('-c', '--context',
                        type=float, default=1.5,
                        help=help_str)
    help_str = ('Use mutations that are not mapped to the single reference '
                'transcript for a gene specified in the bed file indicated by '
                'the -b option.')
    parser.add_argument('-u', '--use-unmapped',
                        action='store_true',
                        default=False,
                        help=help_str)
    help_str = ('Path to the genome fasta file. Required if --use-unmapped flag '
                'is used. (Default: None)')
    parser.add_argument('-g', '--genome',
                        type=str, default='',
                        help=help_str)
    # note the trailing space after "sample." so the sentences do not run
    # together once the adjacent literals are concatenated
    help_str = ('Only keep unique mutations for each tumor sample. '
                'Mutations reported from heterogeneous sources may contain'
                ' duplicates, e.g. a tumor sample was sequenced twice.')
    parser.add_argument('--unique',
                        action='store_true',
                        default=False,
                        help=help_str)
    help_str = ('Minimum number of mutations at a position for it to be '
                'considered a recurrently mutated position (Default: 3).')
    parser.add_argument('-r', '--recurrent',
                        type=int, default=3,
                        help=help_str)
    help_str = ('Fraction of total mutations in a gene. This defines the '
                'minimum number of mutations for a position to be defined '
                'as recurrently mutated (Default: .02).')
    parser.add_argument('-f', '--fraction',
                        type=float, default=.02,
                        help=help_str)
    help_str = ('Perform tsg permutation test if gene has '
                'at least a user specified number of deleterious mutations (default: 1)')
    parser.add_argument('-d', '--deleterious',
                        type=int, default=1,
                        help=help_str)
    help_str = ('Maximum TSG score to allow gene to be tested for oncogene '
                'permutation test. Values greater than one indicate all '
                'genes will be tested (Default: 1.01).')
    parser.add_argument('-t', '--tsg-score',
                        type=float, default=1.01,
                        help=help_str)
    help_str = ('Deleterious mutation pseudo-count for null distribution '
                'statistics. (Default: 0)')
    parser.add_argument('-dp', '--deleterious-pseudo-count',
                        type=int, default=0,
                        help=help_str)
    help_str = ('Recurrent missense mutation pseudo-count for null distribution '
                'statistics. (Default: 0)')
    parser.add_argument('-rp', '--recurrent-pseudo-count',
                        type=int, default=0,
                        help=help_str)
    help_str = ('Specify the seed for the pseudo random number generator. '
                'By default, the seed is randomly chosen. The seed will '
                'be used for the permutation test monte carlo simulations.')
    parser.add_argument('-seed', '--seed',
                        type=int, default=None,
                        help=help_str)
    help_str = 'Output of probabilistic 20/20 results'
    parser.add_argument('-o', '--output',
                        type=str, required=True,
                        help=help_str)
    args = parser.parse_args()

    # handle logging: an explicit path wins, a bare log level requests an
    # auto-named file, otherwise log output is discarded
    if args.log:
        log_file = args.log
    elif args.log_level:
        log_file = ''  # auto-name the log file
    else:
        log_file = os.devnull
    log_level = args.log_level
    utils.start_logging(log_file=log_file,
                        log_level=log_level,
                        verbose=args.verbose)  # start logging

    opts = vars(args)
    if opts['use_unmapped'] and not opts['genome']:
        print('You must specify a genome fasta with -g if you set the '
              '--use-unmapped flag to true.')
        sys.exit(1)

    # log user entered command
    logger.info('Command: {0}'.format(' '.join(sys.argv)))
    return opts
def parse_arguments():
    """Parse the command line of the driver gene statistical test.

    The CLI has two sub-commands, ``oncogene`` and ``tsg``; the chosen one
    is stored under the ``kind`` key.  After parsing, logging is started
    through ``utils.start_logging``, the ``--use-unmapped``/``--genome``
    combination is checked, and the package version and invoking command
    are logged.

    Returns
    -------
    dict
        Parsed options keyed by argument destination name.

    Raises
    ------
    SystemExit
        With status 1 when ``--use-unmapped`` is given without ``--genome``
        (argparse itself also exits on malformed command lines, including
        a missing sub-command).
    """
    # make a parser
    info = 'Performs a statistical test for oncogene, TSG, or driver gene'
    parent_parser = argparse.ArgumentParser(description=info)

    # logging arguments
    parent_parser.add_argument('-ll', '--log-level',
                               type=str,
                               action='store',
                               default='',
                               help='Write a log file (--log-level=DEBUG for debug mode, '
                               '--log-level=INFO for info mode)')
    parent_parser.add_argument('-l', '--log',
                               type=str,
                               action='store',
                               default='stdout',
                               help='Path to log file. (accepts "stdout")')
    parent_parser.add_argument('-v', '--verbose',
                               action='store_true',
                               default=False,
                               help='Flag for more verbose log output')

    # add subparsers
    subparsers = parent_parser.add_subparsers(title='Driver Gene Type',
                                              dest='kind')
    # Python 3 treats sub-commands as optional by default; without this a
    # missing sub-command would surface later as a KeyError on
    # opts['use_unmapped'] instead of a usage error. Setting the attribute
    # (rather than passing required=True) also works on older interpreters.
    subparsers.required = True
    parser_og = subparsers.add_parser(
        'oncogene',
        help='Find statistically significant oncogene-like genes.',
        description='Find statistically significant oncogene-like genes. '
                    'Evaluates clustering of missense mutations and high in '
                    'silico pathogenicity scores for missense mutations.')
    help_info = 'Find statistically significant Tumor Suppressor-like genes.'
    parser_tsg = subparsers.add_parser(
        'tsg',
        help=help_info,
        description=help_info + ' Evaluates for a higher proportion '
                                'of inactivating mutations than expected.')
    # NOTE: a third "protein" sub-command (3D clustering, with a required
    # -ng/--neighbor-graph-dir option) was disabled in the original code; its
    # option branch could never run and has been dropped.

    # program arguments shared by the sub-commands, plus kind-specific ones
    for parser in (parser_og, parser_tsg):
        # group of parameters
        major_parser = parser.add_argument_group(title='Major options')
        advance_parser = parser.add_argument_group(title='Advanced options')

        # set the CLI params
        help_str = 'gene FASTA file from extract_gene_seq.py script'
        major_parser.add_argument('-i', '--input',
                                  type=str, required=True,
                                  help=help_str)
        help_str = ('DNA mutations file (MAF file). Columns can be in any order, '
                    'but should contain the correct column header names.')
        major_parser.add_argument('-m', '--mutations',
                                  type=str, required=True,
                                  help=help_str)
        help_str = 'BED file annotation of genes'
        major_parser.add_argument('-b', '--bed',
                                  type=str, required=True,
                                  help=help_str)
        help_str = ('Number of processes to use for parallelization. 0 indicates using a single '
                    'process without using a multiprocessing pool '
                    '(more means Faster, default: 0).')
        major_parser.add_argument('-p', '--processes',
                                  type=int, default=0,
                                  help=help_str)
        help_str = ('Number of iterations for null model. p-value precision '
                    'increases with more iterations, however this will also '
                    'increase the run time (Default: 100,000).')
        major_parser.add_argument('-n', '--num-iterations',
                                  type=int, default=100000,
                                  help=help_str)
        help_str = ('Number of iterations more significant than the observed statistic '
                    'to stop further computations. This decreases compute time spent in resolving '
                    'p-values for non-significant genes. (Default: 1000).')
        advance_parser.add_argument('-sc', '--stop-criteria',
                                    type=int, default=1000,
                                    help=help_str)
        help_str = ('Number of DNA bases to use as context. 0 indicates no context. '
                    '1 indicates only use the mutated base. 1.5 indicates using '
                    'the base context used in CHASM '
                    '(http://wiki.chasmsoftware.org/index.php/CHASM_Overview). '
                    '2 indicates using the mutated base and the upstream base. '
                    '3 indicates using the mutated base and both the upstream '
                    'and downstream bases. (Default: 1.5)')
        major_parser.add_argument('-c', '--context',
                                  type=float, default=1.5,
                                  help=help_str)

        if parser is parser_og:
            # oncogene-only options
            help_str = ('Directory containing VEST score information in '
                        'pickle files (Default: None).')
            major_parser.add_argument('-s', '--score-dir',
                                      type=str, default=None,
                                      help=help_str)
            help_str = ('Minimum number of mutations at a position for it to be '
                        'considered a recurrently mutated position (Default: 3).')
            advance_parser.add_argument('-r', '--recurrent',
                                        type=int, default=3,
                                        help=help_str)
            help_str = ('Fraction of total mutations in a gene. This defines the '
                        'minimum number of mutations for a position to be defined '
                        'as recurrently mutated (Default: .02).')
            advance_parser.add_argument('-f', '--fraction',
                                        type=float, default=.02,
                                        help=help_str)
        else:
            # tsg-only options
            help_str = ('Perform tsg randomization-based test if gene has '
                        'at least a user specified number of deleterious mutations (default: 1)')
            advance_parser.add_argument('-d', '--deleterious',
                                        type=int, default=1,
                                        help=help_str)

        help_str = ('Only keep unique mutations for each tumor sample. '
                    'Mutations reported from heterogeneous sources may contain'
                    ' duplicates, e.g. a tumor sample was sequenced twice.')
        advance_parser.add_argument('--unique',
                                    action='store_true',
                                    default=False,
                                    help=help_str)
        help_str = ('Use mutations that are not mapped to the single reference '
                    'transcript for a gene specified in the bed file indicated by '
                    'the -b option.')
        advance_parser.add_argument('-u', '--use-unmapped',
                                    action='store_true',
                                    default=False,
                                    help=help_str)
        help_str = ('Path to the genome fasta file. Required if --use-unmapped flag '
                    'is used. (Default: None)')
        advance_parser.add_argument('-g', '--genome',
                                    type=str, default='',
                                    help=help_str)
        # the seed has a fixed default, so the help no longer claims it is
        # randomly chosen
        help_str = ('Specify the seed for the pseudo random number generator. '
                    'The seed will be used for the monte carlo simulations '
                    '(Default: 101).')
        advance_parser.add_argument('-seed', '--seed',
                                    type=int, default=101,
                                    help=help_str)
        help_str = 'Output text file of probabilistic 20/20 results'
        major_parser.add_argument('-o', '--output',
                                  type=str, required=True,
                                  help=help_str)
    args = parent_parser.parse_args()

    # handle logging: an explicit path wins, a bare log level requests an
    # auto-named file, otherwise log output is discarded
    if args.log:
        log_file = args.log
    elif args.log_level:
        log_file = ''  # auto-name the log file
    else:
        log_file = os.devnull
    log_level = args.log_level
    utils.start_logging(log_file=log_file,
                        log_level=log_level,
                        verbose=args.verbose)  # start logging

    opts = vars(args)
    if opts['use_unmapped'] and not opts['genome']:
        print('You must specify a genome fasta with -g if you set the '
              '--use-unmapped flag to true.')
        sys.exit(1)

    # log user entered command
    logger.info('Version: {0}'.format(prob2020.__version__))
    logger.info('Command: {0}'.format(' '.join(sys.argv)))
    return opts
def parse_arguments():
    """Parse the command line of the non-silent mutation ratio simulation.

    Builds the argument parser, parses the process arguments, starts
    logging through ``utils.start_logging``, checks that a genome FASTA was
    supplied when ``--use-unmapped`` is set, and logs the invoking command.

    Returns
    -------
    dict
        Parsed options keyed by argument destination name.

    Raises
    ------
    SystemExit
        With status 1 when ``--use-unmapped`` is given without ``--genome``.
    """
    cli = argparse.ArgumentParser(
        description='Simulates the non-silent mutation ratio by randomly '
                    'permuting mutations')

    # options that control logging
    cli.add_argument('-ll', '--log-level',
                     type=str, action='store', default='',
                     help='Write a log file (--log-level=DEBUG for debug mode, '
                          '--log-level=INFO for info mode)')
    cli.add_argument('-l', '--log',
                     type=str, action='store', default='',
                     help='Path to log file. (accepts "stdout")')

    # options specific to this program
    cli.add_argument('-i', '--input', type=str, required=True,
                     help='gene FASTA file from extract_gene_seq.py script')
    cli.add_argument('-m', '--mutations', type=str, required=True,
                     help='DNA mutations file')
    cli.add_argument('-b', '--bed', type=str, required=True,
                     help='BED file annotation of genes')
    cli.add_argument('-p', '--processes', type=int, default=0,
                     help='Number of processes to use. 0 indicates using a single '
                          'process without using a multiprocessing pool '
                          '(more means Faster, default: 0).')
    cli.add_argument('-n', '--num-permutations', type=int, default=10000,
                     help='Number of permutations for null model. p-value precision '
                          'increases with more permutations (Default: 10000).')
    cli.add_argument('-c', '--context', type=float, default=1.5,
                     help='Number of DNA bases to use as context. 0 indicates no context. '
                          '1 indicates only use the mutated base. 1.5 indicates using '
                          'the base context used in CHASM '
                          '(http://wiki.chasmsoftware.org/index.php/CHASM_Overview). '
                          '2 indicates using the mutated base and the upstream base. '
                          '3 indicates using the mutated base and both the upstream '
                          'and downstream bases. (Default: 1.5)')
    cli.add_argument('-s', '--score-dir', type=str, default=None,
                     help='Directory containing score information in pickle files (Default: None).')
    cli.add_argument('-bs', '--by-sample', action='store_true',
                     help='Report counts for observed mutations stratified by the tumor sample')
    cli.add_argument('-u', '--use-unmapped', action='store_true', default=False,
                     help='Use mutations that are not mapped to the the single reference '
                          'transcript for a gene specified in the bed file indicated by '
                          'the -b option.')
    cli.add_argument('-g', '--genome', type=str, default='',
                     help='Path to the genome fasta file. Required if --use-unmapped flag '
                          'is used. (Default: None)')
    cli.add_argument('-oo', '--observed-output', type=str, default=None,
                     help='Output text file of observed results (optional).')
    cli.add_argument('-o', '--output', type=str, required=True,
                     help='Output text file of simulation results')
    args = cli.parse_args()

    # pick the log destination: an explicit path wins, a bare log level
    # requests an auto-named file, otherwise log output is discarded
    if args.log:
        log_destination = args.log
    elif args.log_level:
        log_destination = ''  # auto-name the log file
    else:
        log_destination = os.devnull
    utils.start_logging(log_file=log_destination, log_level=args.log_level)

    options = vars(args)
    genome_missing = not options['genome']
    if options['use_unmapped'] and genome_missing:
        print('You must specify a genome fasta with -g if you set the '
              '--use-unmapped flag to true.')
        sys.exit(1)

    # record the command the user entered
    command_line = ' '.join(sys.argv)
    logger.info('Command: {0}'.format(command_line))
    return options
def parse_arguments():
    """Parse the command line of the mutation strand/mapping check.

    Registers the logging flags and the four required file arguments,
    parses the process arguments and starts logging through
    ``utils.start_logging``.

    Returns
    -------
    dict
        Parsed options keyed by argument destination name.
    """
    arg_parser = argparse.ArgumentParser(
        description='Checks mutations to see what strand they are reported '
                    'on and for unmapped mutations.')

    # logging flags
    arg_parser.add_argument(
        '-ll', '--log-level', type=str, action='store', default='',
        help='Write a log file (--log-level=DEBUG for debug mode, '
             '--log-level=INFO for info mode)')
    arg_parser.add_argument(
        '-l', '--log', type=str, action='store', default='stdout',
        help='Path to log file. (Default: stdout)')
    arg_parser.add_argument(
        '-v', '--verbose', action='store_true', default=False,
        help='Flag for more verbose log output')

    # required file arguments, registered in display order
    required_files = (
        ('-f', '--fasta', 'Human genome FASTA file'),
        ('-m', '--mutations',
         'Text file specifying mutations in the format required for permutation test'),
        ('-b', '--bed', 'BED file of reference transcripts'),
        ('-u', '--unmapped',
         'Save mutations that could not be found on the reference transcript'),
    )
    for short_flag, long_flag, description in required_files:
        arg_parser.add_argument(short_flag, long_flag,
                                type=str, required=True,
                                help=description)
    parsed = arg_parser.parse_args()

    # logging target: discard output only when neither a path nor a level
    # was requested; a level without a path means an auto-named log file
    wants_logging = bool(parsed.log_level or parsed.log)
    if not wants_logging:
        target = os.devnull
    else:
        target = parsed.log if parsed.log else ''
    utils.start_logging(log_file=target,
                        log_level=parsed.log_level,
                        verbose=parsed.verbose)

    return vars(parsed)
def parse_arguments():
    """Parse the command line of the simulate/summarize mutations script.

    Builds the argument parser (exactly one of ``--summary`` or ``--maf``
    must be given), parses the process arguments, starts logging through
    ``utils.start_logging``, checks that a genome FASTA was supplied when
    ``--use-unmapped`` is set, and logs the invoking command.

    Returns
    -------
    dict
        Parsed options keyed by argument destination name.

    Raises
    ------
    SystemExit
        With status 1 when ``--use-unmapped`` is given without ``--genome``
        (argparse itself also exits on malformed command lines).
    """
    # make a parser
    info = 'Either simulates or summarizes observed mutation data.'
    parser = argparse.ArgumentParser(description=info)

    # logging arguments
    parser.add_argument('-ll', '--log-level',
                        type=str,
                        action='store',
                        default='',
                        help='Write a log file (--log-level=DEBUG for debug mode, '
                        '--log-level=INFO for info mode)')
    parser.add_argument('-l', '--log',
                        type=str,
                        action='store',
                        default='stdout',
                        help='Path to log file. (accepts "stdout")')
    parser.add_argument('-v', '--verbose',
                        action='store_true',
                        default=False,
                        help='Flag for more verbose log output')

    # program arguments
    help_str = 'gene FASTA file from extract_gene_seq script'
    parser.add_argument('-i', '--input',
                        type=str, required=True,
                        help=help_str)
    help_str = 'DNA mutations file (MAF file)'
    parser.add_argument('-m', '--mutations',
                        type=str, required=True,
                        help=help_str)
    help_str = 'BED file annotation of genes'
    parser.add_argument('-b', '--bed',
                        type=str, required=True,
                        help=help_str)
    help_str = ('Directory containing pre-computed score information '
                'for VEST and evolutionary conservation in pickle format (Default: None).')
    parser.add_argument('-s', '--score-dir',
                        type=str, default=None,
                        help=help_str)
    help_str = ('Number of processes to use. 0 indicates using a single '
                'process without using a multiprocessing pool '
                '(more means Faster, default: 0).')
    parser.add_argument('-p', '--processes',
                        type=int, default=0,
                        help=help_str)
    help_str = ('Number of iterations for null model simulations. If zero is '
                'specified then output represents a result from actually observed mutations (provided by -m parameter), '
                'otherwise results will be from simulated mutations. (Default: 0).')
    parser.add_argument('-n', '--num-iterations',
                        type=int, default=0,
                        help=help_str)
    help_str = ('Number of DNA bases to use as context. 0 indicates no context. '
                '1 indicates only use the mutated base. 1.5 indicates using '
                'the base context used in CHASM '
                '(http://wiki.chasmsoftware.org/index.php/CHASM_Overview). '
                '2 indicates using the mutated base and the upstream base. '
                '3 indicates using the mutated base and both the upstream '
                'and downstream bases. (Default: 1.5)')
    parser.add_argument('-c', '--context',
                        type=float, default=1.5,
                        help=help_str)
    # the output format flags are store_true options in a required group, so
    # neither one is implied by default; the help says so instead of
    # advertising defaults that do not exist
    parser_grouper = parser.add_mutually_exclusive_group(required=True)
    parser_grouper.add_argument('--summary',
                                action='store_true',
                                help='Flag for saving results as summarized '
                                'features used (either --summary or --maf is required).')
    parser_grouper.add_argument('--maf',
                                action='store_true',
                                help='Flag for saving results in MAF format '
                                '(either --summary or --maf is required).')
    help_str = ('Use mutations that are not mapped to the single reference '
                'transcript for a gene specified in the bed file indicated by '
                'the -b option.')
    parser.add_argument('-u', '--use-unmapped',
                        action='store_true',
                        default=False,
                        help=help_str)
    help_str = ('Path to the genome fasta file. Required if --use-unmapped flag '
                'is used. (Default: None)')
    parser.add_argument('-g', '--genome',
                        type=str, default='',
                        help=help_str)
    help_str = ('Minimum number of mutations at a position for it to be '
                'considered a recurrently mutated position (Default: 3).')
    parser.add_argument('-r', '--recurrent',
                        type=int, default=3,
                        help=help_str)
    help_str = ('Fraction of total mutations in a gene. This defines the '
                'minimum number of mutations for a position to be defined '
                'as recurrently mutated (Default: .02).')
    parser.add_argument('-f', '--fraction',
                        type=float, default=.02,
                        help=help_str)
    # note the trailing space after "sample." so the sentences do not run
    # together once the adjacent literals are concatenated
    help_str = ('Only keep unique mutations for each tumor sample. '
                'Mutations reported from heterogeneous sources may contain'
                ' duplicates, e.g. a tumor sample was sequenced twice.')
    parser.add_argument('--unique',
                        action='store_true',
                        default=False,
                        help=help_str)
    # the seed has a fixed default, so the help no longer claims it is
    # randomly chosen
    help_str = ('Specify the seed for the pseudo random number generator. '
                'The seed will be used for the monte carlo simulations '
                '(Default: 101).')
    parser.add_argument('-seed', '--seed',
                        type=int, default=101,
                        help=help_str)
    help_str = 'Output text file of results'
    parser.add_argument('-o', '--output',
                        type=str, required=True,
                        help=help_str)
    args = parser.parse_args()

    # handle logging: an explicit path wins, a bare log level requests an
    # auto-named file, otherwise log output is discarded
    if args.log:
        log_file = args.log
    elif args.log_level:
        log_file = ''  # auto-name the log file
    else:
        log_file = os.devnull
    log_level = args.log_level
    utils.start_logging(log_file=log_file,
                        log_level=log_level,
                        verbose=args.verbose)  # start logging

    opts = vars(args)
    if opts['use_unmapped'] and not opts['genome']:
        print('You must specify a genome fasta with -g if you set the '
              '--use-unmapped flag to true.')
        sys.exit(1)

    # log user entered command
    logger.info('Command: {0}'.format(' '.join(sys.argv)))
    return opts
def parse_arguments():
    """Parse the command line of the non-silent mutation ratio simulation.

    Registers the logging and program options, parses the process
    arguments, starts logging through ``utils.start_logging``, rejects
    ``--use-unmapped`` without ``--genome``, and logs the invoking command.

    Returns
    -------
    dict
        Parsed options keyed by argument destination name.

    Raises
    ------
    SystemExit
        With status 1 when ``--use-unmapped`` is given without ``--genome``.
    """
    arg_parser = argparse.ArgumentParser(
        description='Simulates the non-silent mutation ratio by randomly permuting mutations')
    add = arg_parser.add_argument

    # logging flags
    add('-ll', '--log-level', type=str, action='store', default='',
        help='Write a log file (--log-level=DEBUG for debug mode, '
             '--log-level=INFO for info mode)')
    add('-l', '--log', type=str, action='store', default='',
        help='Path to log file. (accepts "stdout")')

    # input files
    add('-i', '--input', type=str, required=True,
        help='gene FASTA file from extract_gene_seq.py script')
    add('-m', '--mutations', type=str, required=True,
        help='DNA mutations file')
    add('-b', '--bed', type=str, required=True,
        help='BED file annotation of genes')

    # simulation settings
    add('-p', '--processes', type=int, default=0,
        help='Number of processes to use. 0 indicates using a single '
             'process without using a multiprocessing pool '
             '(more means Faster, default: 0).')
    add('-n', '--num-permutations', type=int, default=10000,
        help='Number of permutations for null model. p-value precision '
             'increases with more permutations (Default: 10000).')
    add('-c', '--context', type=float, default=1.5,
        help='Number of DNA bases to use as context. 0 indicates no context. '
             '1 indicates only use the mutated base. 1.5 indicates using '
             'the base context used in CHASM '
             '(http://wiki.chasmsoftware.org/index.php/CHASM_Overview). '
             '2 indicates using the mutated base and the upstream base. '
             '3 indicates using the mutated base and both the upstream '
             'and downstream bases. (Default: 1.5)')
    add('-s', '--score-dir', type=str, default=None,
        help='Directory containing score information in pickle files (Default: None).')
    add('-bs', '--by-sample', action='store_true',
        help='Report counts for observed mutations stratified by the tumor sample')
    add('-u', '--use-unmapped', action='store_true', default=False,
        help='Use mutations that are not mapped to the the single reference '
             'transcript for a gene specified in the bed file indicated by '
             'the -b option.')
    add('-g', '--genome', type=str, default='',
        help='Path to the genome fasta file. Required if --use-unmapped flag '
             'is used. (Default: None)')

    # output files
    add('-oo', '--observed-output', type=str, default=None,
        help='Output text file of observed results (optional).')
    add('-o', '--output', type=str, required=True,
        help='Output text file of simulation results')
    parsed = arg_parser.parse_args()

    # logging target: discard output only when neither a path nor a level
    # was requested; a level without a path means an auto-named log file
    wants_logging = bool(parsed.log_level or parsed.log)
    if not wants_logging:
        target = os.devnull
    else:
        target = parsed.log if parsed.log else ''
    utils.start_logging(log_file=target, log_level=parsed.log_level)

    settings = vars(parsed)
    if settings['use_unmapped'] and not settings['genome']:
        print('You must specify a genome fasta with -g if you set the '
              '--use-unmapped flag to true.')
        sys.exit(1)

    # record the command the user entered
    logger.info('Command: {0}'.format(' '.join(sys.argv)))
    return settings
def parse_arguments():
    """Parse the command line options for the randomization-based test.

    The --use-unmapped/--genome consistency check runs before logging is
    started, so an invalid invocation fails fast, reports on stderr and does
    not start a log.

    Returns:
        dict: parsed options keyed by argparse destination name
        (e.g. ``num_iterations``, ``stop_criteria``, ``kind``).

    Raises:
        SystemExit: status 2 on an argparse usage error; status 1 when
        --use-unmapped is given without --genome.
    """
    # make a parser
    info = 'Performs a randomization-based test on the oncogene and TSG score'
    parser = argparse.ArgumentParser(description=info)

    # logging arguments
    parser.add_argument('-ll', '--log-level',
                        type=str, action='store', default='',
                        help='Write a log file (--log-level=DEBUG for debug mode, '
                        '--log-level=INFO for info mode)')
    parser.add_argument('-l', '--log',
                        type=str, action='store', default='',
                        help='Path to log file. (accepts "stdout")')
    parser.add_argument('-v', '--verbose',
                        action='store_true', default=False,
                        help='Flag for more verbose log output')

    # program arguments
    help_str = 'gene FASTA file from extract_gene_seq.py script'
    parser.add_argument('-i', '--input',
                        type=str, required=True,
                        help=help_str)
    help_str = 'DNA mutations file'
    parser.add_argument('-m', '--mutations',
                        type=str, required=True,
                        help=help_str)
    help_str = 'BED file annotation of genes'
    parser.add_argument('-b', '--bed',
                        type=str, required=True,
                        help=help_str)
    help_str = 'Directory containing score information in pickle files (Default: None).'
    parser.add_argument('-s', '--score-dir',
                        type=str, default=None,
                        help=help_str)
    help_str = ('Directory containing neighbor graph information in pickle files '
                '(Default: None).')
    parser.add_argument('-ng', '--neighbor-graph-dir',
                        type=str, default=None,
                        help=help_str)
    help_str = ('Number of processes to use. 0 indicates using a single '
                'process without using a multiprocessing pool '
                '(more means Faster, default: 0).')
    parser.add_argument('-p', '--processes',
                        type=int, default=0,
                        help=help_str)
    help_str = ('Number of iterations for null model. p-value precision '
                'increases with more iterations, however this will also '
                'increase the run time (Default: 10000).')
    parser.add_argument('-n', '--num-iterations',
                        type=int, default=10000,
                        help=help_str)
    help_str = ('Number of iterations more significant than the observed statistic '
                'to stop further computations. This decreases compute time spent in resolving '
                'p-values for non-significant genes. (Default: 1000).')
    parser.add_argument('-sc', '--stop-criteria',
                        type=int, default=1000,
                        help=help_str)
    # the help previously described stale test names ("position-based",
    # "deleterious") that do not match the accepted values
    help_str = ('Kind of permutation test to perform ("oncogene" or "tsg"). '
                'The "oncogene" permutation test is intended to find oncogenes '
                'using position based statistics. The "tsg" permutation test is '
                'intended to find tumor suppressor genes. (Default: oncogene)')
    parser.add_argument('-k', '--kind',
                        type=str, default='oncogene',
                        help=help_str)
    help_str = ('Number of DNA bases to use as context. 0 indicates no context. '
                '1 indicates only use the mutated base. 1.5 indicates using '
                'the base context used in CHASM '
                '(http://wiki.chasmsoftware.org/index.php/CHASM_Overview). '
                '2 indicates using the mutated base and the upstream base. '
                '3 indicates using the mutated base and both the upstream '
                'and downstream bases. (Default: 1.5)')
    parser.add_argument('-c', '--context',
                        type=float, default=1.5,
                        help=help_str)
    help_str = ('Use mutations that are not mapped to the single reference '
                'transcript for a gene specified in the bed file indicated by '
                'the -b option.')
    parser.add_argument('-u', '--use-unmapped',
                        action='store_true', default=False,
                        help=help_str)
    help_str = ('Path to the genome fasta file. Required if --use-unmapped flag '
                'is used. (Default: None)')
    parser.add_argument('-g', '--genome',
                        type=str, default='',
                        help=help_str)
    help_str = ('Only keep unique mutations for each tumor sample. '
                'Mutations reported from heterogeneous sources may contain'
                ' duplicates, e.g. a tumor sample was sequenced twice.')
    parser.add_argument('--unique',
                        action='store_true', default=False,
                        help=help_str)
    help_str = ('Minimum number of mutations at a position for it to be '
                'considered a recurrently mutated position (Default: 3).')
    parser.add_argument('-r', '--recurrent',
                        type=int, default=3,
                        help=help_str)
    help_str = ('Fraction of total mutations in a gene. This defines the '
                'minimum number of mutations for a position to be defined '
                'as recurrently mutated (Default: .02).')
    parser.add_argument('-f', '--fraction',
                        type=float, default=.02,
                        help=help_str)
    help_str = ('Perform tsg permutation test if gene has '
                'at least a user specified number of deleterious mutations (default: 1)')
    parser.add_argument('-d', '--deleterious',
                        type=int, default=1,
                        help=help_str)
    help_str = ('Maximum TSG score to allow gene to be tested for oncogene '
                'permutation test. Values greater than one indicate all '
                'genes will be tested (Default: 1.01).')
    parser.add_argument('-t', '--tsg-score',
                        type=float, default=1.01,
                        help=help_str)
    help_str = ('Deleterious mutation pseudo-count for null distribution '
                'statistics. (Default: 0)')
    parser.add_argument('-dp', '--deleterious-pseudo-count',
                        type=int, default=0,
                        help=help_str)
    help_str = ('Recurrent missense mutation pseudo-count for null distribution '
                'statistics. (Default: 0)')
    parser.add_argument('-rp', '--recurrent-pseudo-count',
                        type=int, default=0,
                        help=help_str)
    help_str = ('Specify the seed for the pseudo random number generator. '
                'By default, the seed is randomly chosen. The seed will '
                'be used for the permutation test monte carlo simulations.')
    parser.add_argument('-seed', '--seed',
                        type=int, default=None,
                        help=help_str)
    help_str = 'Output of probabilistic 20/20 results'
    parser.add_argument('-o', '--output',
                        type=str, required=True,
                        help=help_str)
    args = parser.parse_args()
    opts = vars(args)

    # validate option combinations before any logging side effect; the
    # message goes to stderr and the exit status stays 1
    if opts['use_unmapped'] and not opts['genome']:
        sys.stderr.write('You must specify a genome fasta with -g if you set the '
                         '--use-unmapped flag to true.\n')
        sys.exit(1)

    # handle logging
    if args.log:
        log_file = args.log
    elif args.log_level:
        log_file = ''  # auto-name the log file
    else:
        log_file = os.devnull
    log_level = args.log_level
    utils.start_logging(log_file=log_file,
                        log_level=log_level,
                        verbose=args.verbose)  # start logging

    # log user entered command
    logger.info('Command: {0}'.format(' '.join(sys.argv)))
    return opts
def parse_arguments():
    """Parse the command line options for the driver gene statistical tests.

    The CLI has one sub-command per test ("oncogene", "tsg", "hotmaps1d").
    Logging options belong to the parent parser and therefore must be given
    before the sub-command name. A sub-command is mandatory: without it the
    parsed namespace would lack the per-test options read below.

    The --use-unmapped/--genome consistency check runs before logging is
    started, so an invalid invocation fails fast and reports on stderr.

    Returns:
        dict: parsed options keyed by argparse destination name; ``kind``
        holds the chosen sub-command.

    Raises:
        SystemExit: status 2 on an argparse usage error (including a missing
        sub-command); status 1 when --use-unmapped is given without --genome.
    """
    # make a parser
    info = 'Performs a statistical test for oncogene, TSG, or driver gene'
    parent_parser = argparse.ArgumentParser(description=info)

    # logging arguments
    parent_parser.add_argument('-ll', '--log-level',
                               type=str, action='store', default='',
                               help='Write a log file (--log-level=DEBUG for debug mode, '
                               '--log-level=INFO for info mode)')
    parent_parser.add_argument('-l', '--log',
                               type=str, action='store', default='stdout',
                               help='Path to log file. (accepts "stdout")')
    parent_parser.add_argument('-v', '--verbose',
                               action='store_true', default=False,
                               help='Flag for more verbose log output')

    # add subparsers
    subparsers = parent_parser.add_subparsers(title='Driver Gene Type', dest='kind')
    # Python 3 makes sub-commands optional by default; set the attribute
    # (works on Python 2 as well) so a missing one is a usage error rather
    # than a KeyError further down
    subparsers.required = True
    parser_og = subparsers.add_parser(
        'oncogene',
        help='Find statistically significant oncogene-like genes.',
        description='Find statistically significant oncogene-like genes. '
                    'Evaluates clustering of missense mutations and high in '
                    'silico pathogenicity scores for missense mutations.')
    help_info = 'Find statistically significant Tumor Suppressor-like genes.'
    parser_tsg = subparsers.add_parser(
        'tsg',
        help=help_info,
        description=help_info + ' Evaluates for a higher proportion '
                                'of inactivating mutations than expected.')
    help_info = 'Find codons with significant clustering of missense mutations in sequence.'
    parser_hotmaps = subparsers.add_parser(
        'hotmaps1d',
        help=help_info,
        description=help_info + ' Evaluates for a higher amount of '
                                'clustering of missense mutations.')
    # NOTE: a 3D protein-structure clustering sub-command is not enabled.

    # program arguments
    for parser in (parser_og, parser_tsg, parser_hotmaps):
        # group of parameters
        major_parser = parser.add_argument_group(title='Major options')
        advance_parser = parser.add_argument_group(title='Advanced options')

        # set the CLI params shared by every sub-command
        help_str = 'gene FASTA file from extract_gene_seq.py script'
        major_parser.add_argument('-i', '--input',
                                  type=str, required=True,
                                  help=help_str)
        help_str = ('DNA mutations file (MAF file). Columns can be in any order, '
                    'but should contain the correct column header names.')
        major_parser.add_argument('-m', '--mutations',
                                  type=str, required=True,
                                  help=help_str)
        help_str = 'BED file annotation of genes'
        major_parser.add_argument('-b', '--bed',
                                  type=str, required=True,
                                  help=help_str)
        help_str = ('Number of processes to use for parallelization. 0 indicates using a single '
                    'process without using a multiprocessing pool '
                    '(more means Faster, default: 0).')
        major_parser.add_argument('-p', '--processes',
                                  type=int, default=0,
                                  help=help_str)
        help_str = ('Number of iterations for null model. p-value precision '
                    'increases with more iterations, however this will also '
                    'increase the run time (Default: 100,000).')
        major_parser.add_argument('-n', '--num-iterations',
                                  type=int, default=100000,
                                  help=help_str)
        help_str = ('Number of iterations more significant than the observed statistic '
                    'to stop further computations. This decreases compute time spent in resolving '
                    'p-values for non-significant genes. (Default: 1000).')
        advance_parser.add_argument('-sc', '--stop-criteria',
                                    type=int, default=1000,
                                    help=help_str)
        help_str = ('Number of DNA bases to use as context. 0 indicates no context. '
                    '1 indicates only use the mutated base. 1.5 indicates using '
                    'the base context used in CHASM '
                    '(http://wiki.chasmsoftware.org/index.php/CHASM_Overview). '
                    '2 indicates using the mutated base and the upstream base. '
                    '3 indicates using the mutated base and both the upstream '
                    'and downstream bases. (Default: 1.5)')
        major_parser.add_argument('-c', '--context',
                                  type=float, default=1.5,
                                  help=help_str)

        # options specific to a single sub-command
        if parser is parser_og:
            help_str = ('Directory containing VEST score information in pickle files '
                        '(Default: None).')
            major_parser.add_argument('-s', '--score-dir',
                                      type=str, default=None,
                                      help=help_str)
            help_str = ('Minimum number of mutations at a position for it to be '
                        'considered a recurrently mutated position (Default: 3).')
            advance_parser.add_argument('-r', '--recurrent',
                                        type=int, default=3,
                                        help=help_str)
            help_str = ('Fraction of total mutations in a gene. This defines the '
                        'minimum number of mutations for a position to be defined '
                        'as recurrently mutated (Default: .02).')
            advance_parser.add_argument('-f', '--fraction',
                                        type=float, default=.02,
                                        help=help_str)
        elif parser is parser_tsg:
            help_str = ('Perform tsg randomization-based test if gene has '
                        'at least a user specified number of deleterious mutations (default: 1)')
            advance_parser.add_argument('-d', '--deleterious',
                                        type=int, default=1,
                                        help=help_str)
        elif parser is parser_hotmaps:
            help_str = ('Sequence window size for HotMAPS 1D algorithm '
                        'by number of codons (Default: 3)')
            advance_parser.add_argument('-w', '--window',
                                        type=str, default='3',
                                        help=help_str)
            help_str = ('Flag for reporting index (row number, starts at zero) '
                        'in associated mutation file')
            advance_parser.add_argument('-r', '--report-index',
                                        action='store_true', default=False,
                                        help=help_str)
            help_str = 'Path to directory to save empirical null distribution'
            advance_parser.add_argument('-nd', '--null-distr-dir',
                                        type=str,
                                        help=help_str)

        # remaining shared options
        help_str = ('Only keep unique mutations for each tumor sample. '
                    'Mutations reported from heterogeneous sources may contain'
                    ' duplicates, e.g. a tumor sample was sequenced twice.')
        advance_parser.add_argument('--unique',
                                    action='store_true', default=False,
                                    help=help_str)
        help_str = ('Use mutations that are not mapped to the single reference '
                    'transcript for a gene specified in the bed file indicated by '
                    'the -b option.')
        advance_parser.add_argument('-u', '--use-unmapped',
                                    action='store_true', default=False,
                                    help=help_str)
        help_str = ('Path to the genome fasta file. Required if --use-unmapped flag '
                    'is used. (Default: None)')
        advance_parser.add_argument('-g', '--genome',
                                    type=str, default='',
                                    help=help_str)
        # the help previously claimed the seed is random by default, which
        # contradicts the fixed default below
        help_str = ('Specify the seed for the pseudo random number generator. '
                    'The seed will be used for the monte carlo simulations '
                    '(Default: 101).')
        advance_parser.add_argument('-seed', '--seed',
                                    type=int, default=101,
                                    help=help_str)
        help_str = 'Output text file of probabilistic 20/20 results'
        major_parser.add_argument('-o', '--output',
                                  type=str, required=True,
                                  help=help_str)
    args = parent_parser.parse_args()
    opts = vars(args)

    # validate option combinations before any logging side effect; the
    # message goes to stderr and the exit status stays 1
    if opts['use_unmapped'] and not opts['genome']:
        sys.stderr.write('You must specify a genome fasta with -g if you set the '
                         '--use-unmapped flag to true.\n')
        sys.exit(1)

    # handle logging
    if args.log:
        log_file = args.log
    elif args.log_level:
        log_file = ''  # auto-name the log file
    else:
        log_file = os.devnull
    log_level = args.log_level
    utils.start_logging(log_file=log_file,
                        log_level=log_level,
                        verbose=args.verbose)  # start logging

    # log user entered command
    logger.info('Version: {0}'.format(prob2020.__version__))
    logger.info('Command: {0}'.format(' '.join(sys.argv)))
    return opts