def parse_args(args):
    op = ArgumentParser(description='Find groups of orthologous genes.')
    #op.add_argument(dest='directory')
    op.add_argument('-o', '--out', '--dir', dest='out', required=True)
    op.add_argument('-g', '--gbs', '--annotations', dest='annotations')
    op.add_argument('-p', '--proteins', '--proteomes', dest='proteomes')
    op.add_argument('--no-download', dest='download_anno',
                    action='store_false', default=True)
    op.add_argument('-s', '--species', '--species-list', dest='species_list')
    op.add_argument('-i', '--ids', '--ids-list', dest='ids_list')
    op.add_argument('--prot-id-field', dest='prot_id_field', default=1)

    op.usage = '''Finds orthogroups for a list of annotations / proteomes / ref ids / species.

Test runs:
    python scenario_1.py --ids test_input/ids.txt -o test_ids
    python scenario_1.py --proteomes test_input/proteins -o test_proteomes

Usage: %s [-p <proteomes dir>] [-g <.gb files dir>] [-i <gb ids file>]
          [-s <strain names file>] [-o <dir>] [--jobs 30] [--start-from <step num>]

-o  Output directory.

-g  Directory with .gb files for references with annotations.

-p  Directory with fasta (or faa, fa) files of protein sequences.
    If they are named by their reference ids (i.e. NC_005816.1.fasta),
    annotations will be downloaded from NCBI.

-i  File with reference ids (will be fetched from NCBI).

-s  File with a list of organism names as in Genbank.

--prot-id-field
    When specifying proteomes, use this fasta id field number to retrieve
    protein ids (default is 1, like >NC_005816.1|NP_995567.1 ...).
''' % basename(__file__)
    #-a --annotations-dir  Directory with .gb files.
    #-p --proteomes-dir    Directory with fasta files of proteomes.
    #-i --ids-list         File with reference ids (will be fetched from Genbank).
    #-s --species-list     File with a list of organism names as in Genbank.
    #   For example, "Salmonella enterica subsp. enterica serovar Typhi str. P-stx-12".
    #'''
    #indent = ' ' * len('usage: ' + basename(__file__) + ' ')
    #op.usage = basename(__file__) + ' [--annotations-dir DIR]\n' + \
    #           indent + '[--proteomes-dir DIR]\n' + \
    #           indent + '[--ids-file FILE]\n' + \
    #           indent + '[--species-file FILE]\n'

    add_common_arguments(op)
    p = op.parse_args(args)
    check_common_args(p)

    if not p.out:
        arg_parse_error('Specify output directory with -o.')
    if isfile(expanduser(p.out)):
        arg_parse_error('%s is a file' % p.out)
    p.out = abspath(expanduser(p.out))
    if not isdir(p.out):
        makedirs(p.out)

    if p.species_list:
        check_file(expanduser(p.species_list))
        p.species_list = abspath(expanduser(p.species_list))
    if p.ids_list:
        check_file(expanduser(p.ids_list))
        p.ids_list = abspath(expanduser(p.ids_list))
    if p.proteomes:
        check_dir(expanduser(p.proteomes))
        p.proteomes = abspath(expanduser(p.proteomes))
    if p.annotations:
        check_dir(expanduser(p.annotations))
        p.annotations = abspath(expanduser(p.annotations))

    #if p.species_list or p.ids_list:
    #    if not isdir(p.out):
    #        mkdir(p.out)
    #else:
    #    if not p.directory:
    #        arg_parse_error('Directory or file must be specified.')
    #    check_dir(p.directory)

    return p
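
# A minimal sketch (not part of the pipeline) of how the --prot-id-field value
# is meant to be interpreted, based on the usage text above: for a header like
# ">NC_005816.1|NP_995567.1 ...", the id is split on "|" and field 1 yields
# the protein id. The helper name is hypothetical.
def _extract_prot_id(fasta_header, prot_id_field=1):
    # Drop the leading ">" and anything after the first whitespace,
    # then split the remaining id on "|" and take the requested field.
    seq_id = fasta_header.lstrip('>').split()[0]
    return seq_id.split('|')[int(prot_id_field)]

# Example: _extract_prot_id('>NC_005816.1|NP_995567.1 pesticin', 1)
# returns 'NP_995567.1'.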
def parse_args(args):
    import argparse
    op = argparse.ArgumentParser(description='Find groups of orthologous genes.')
    #op.add_argument(dest='directory', required=False)
    op.add_argument('-s1o', dest='directory', required=True)
    op.add_argument('-s2o', '-o', dest='out_dir', required=False)
    op.add_argument('-a', '--assemblies', dest='assemblies')
    op.add_argument('-g', '--annotations', '--gbs', dest='annotations')
    op.add_argument('-p', '--proteomes', '--proteins', dest='proteomes')
    op.add_argument('-i', '--ids', '--ids-list', dest='ids_list')
    op.add_argument('--prot-id-field', dest='prot_id_field', default=1)
    op.add_argument('--skip-blast-singletones', dest='blast_singletones',
                    action='store_false', default=True)
    op.add_argument('--blastdb', '--blast-db', dest='blastdb')
    #-o: Output directory (if not specified, the input directory will be used).

    op.usage = '''Extends an orthogroup database and orthogroup files.
The -s1o argument must point to existing Scenario 1 output.

Test runs:
    python scenario_2.py -s1o test_ids -s2o test_ids_new_ids --ids test_input/new_ids.txt
    python scenario_2.py -s1o test_proteomes -s2o test_prots_new_prots --proteomes test_input/new_proteins

Usage: %s -s1o <scenario_1 result dir> -s2o <new output dir>
          [-a <assemblies dir>] [-p <proteomes dir>] [-g <.gb files dir>]
          [-i <gb ids file>] [--jobs 30] [--start-from <step num>]
          [--skip-blast-singletones] [--blast-db <path>]

-s1o  Path to existing Scenario 1 output.

-s2o  Output directory (optional; if omitted, the input directory will be used).

-a --assemblies
      Directory with assemblies in fasta format.

-g    Directory with .gb files for references with annotations.

-p    Directory with fasta (or faa, fa) files of protein sequences.
      If they are named by their reference ids (i.e. NC_005816.1.fasta),
      annotations will be downloaded from NCBI.

-i    File with reference ids (will be fetched from NCBI).

--prot-id-field
      When specifying proteomes, use this fasta id field number to retrieve
      protein ids (default is 1, like >NC_005816.1|NP_995567.1 ...).

--skip-blast-singletones
      Do not search newly added proteins against the NCBI database when they
      do not fit any group with known proteins (by default, they are searched).

--blastdb
      Local BLAST database path. If not set, the "blastdb" value in config.txt
      will be used. If that is not set either, remote NCBI will be used.
''' % basename(__file__)
    #indent = ' ' * len('usage: ' + basename(__file__) + ' ')
    #op.usage = basename(__file__) + ' [--existing-blast-results TSV]\n' + \
    #           indent + '[--existing_proteomes DIR]\n' + \
    #           indent + '[--assembly FASTA]\n' + \
    #           indent + '[--genes GFF]\n' + \
    #           indent + '[--proteome FASTA]\n'

    add_common_arguments(op)
    p = op.parse_args(args)
    check_common_args(p)

    if p.assemblies:
        check_dir(expanduser(p.assemblies))
        p.assemblies = abspath(expanduser(p.assemblies))
    if p.proteomes:
        check_dir(expanduser(p.proteomes))
        p.proteomes = abspath(expanduser(p.proteomes))
    if p.ids_list:
        check_file(expanduser(p.ids_list))
        p.ids_list = abspath(expanduser(p.ids_list))
    if p.annotations:
        check_dir(expanduser(p.annotations))
        p.annotations = abspath(expanduser(p.annotations))
    if p.blastdb:
        p.blastdb = abspath(expanduser(p.blastdb))

    if not isdir(expanduser(p.directory)):
        arg_parse_error('Directory %s does not exist.' % p.directory)
    p.directory = abspath(expanduser(p.directory))

    if not p.out_dir:
        p.out_dir = p.directory
        #arg_parse_error('Specify output directory with -o.')
    if isfile(expanduser(p.out_dir)):
        arg_parse_error('%s is a file' % p.out_dir)
    p.out_dir = abspath(expanduser(p.out_dir))

    return p
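
# The check-then-normalize pattern above (check_dir/check_file on the expanded
# path, then abspath(expanduser(...))) repeats for every path argument. A
# hypothetical helper capturing the same idiom, shown only as a sketch; the
# real code keeps the calls inline:
def _normalize_path(path, check=None):
    expanded = expanduser(path)
    if check is not None:
        check(expanded)  # e.g. check_dir or check_file; aborts on failure
    return abspath(expanded)

# Usage would look like: p.proteomes = _normalize_path(p.proteomes, check_dir)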
def main(args):
    register_ctrl_c()
    p = parse_args(args)

    try:
        if not exists(join(p.directory, 'intermediate')):
            arg_parse_error('You need to run Scenario 1 on this directory first.')

        if not p.out_dir:
            p.out_dir = p.directory
        working_dir = p.out_dir

        with open(config_file) as cf:
            conf = dict(l.strip().split('=', 1) for l in cf.readlines()
                        if l.strip() and l.strip()[0] != '#')

        start_from, start_after = get_starting_step(
            p.start_from, join(working_dir, log_fname))

        if (not start_from or start_from == 1) and p.out_dir != p.directory:
            log_text = ''
            if isdir(p.out_dir):
                if not p.overwrite:
                    files = [f for f in listdir(p.out_dir) if f and f[0] != '.']
                    #log.debug(files)
                    if files:
                        print('The output directory exists. Do you want to overwrite it? '
                              '(You can run with the --overwrite option to avoid this warning.)')
                        try:
                            raw_input('Press any key to overwrite and continue, '
                                      'or Ctrl+C to interrupt.\n> ')
                        except (EOFError, KeyboardInterrupt, SystemExit, GeneratorExit):
                            exit(1)
                if exists(join(p.out_dir, log_fname)):
                    with open(join(p.out_dir, log_fname)) as log_f:
                        log_text = log_f.read()
                rmtree(p.out_dir)

            # copytree requires that the destination does not exist yet:
            # create the parents with makedirs, remove the leaf again with
            # rmdir, then copy the Scenario 1 results over.
            makedirs(p.out_dir)
            rmdir(p.out_dir)
            copytree(p.directory, p.out_dir)
            if isfile(join(p.out_dir, log_fname)):
                remove(join(p.out_dir, log_fname))
            chdir(p.out_dir)
            if log_text:
                with open(join(p.out_dir, log_fname), 'w') as log_f:
                    log_f.write(log_text)

        log_fpath = set_up_logging(p.debug, p.out_dir, 'a')
        log.info('python ' + basename(__file__) + ' ' + ' '.join(args))
        log.info('')
        check_and_install_tools(p.debug, conf.get('db_vendor', 'sqlite'), log_fpath)

        log.info('Changing to %s' % working_dir)
        if not isdir(working_dir):
            makedirs(working_dir)
        chdir(working_dir)

        set_up_config(working_dir)

        # Building the workflow
        workflow = Workflow(working_dir, id=make_workflow_id(working_dir),
                            cmdline_args=['python', __file__] + args)
        log.info('Workflow id is "' + workflow.id + '"')
        log.info('')

        if conf.get('db_vendor', 'sqlite') == 'sqlite':
            suffix = ''
        else:
            suffix = '_' + workflow.id

        workflow.extend([
            step_prepare_input(p),
            steps.filter_proteomes(
                min_length=int(p.min_length),
                max_percent_stop=int(p.max_percent_stop)),
            filter_new_proteomes(
                new_proteomes_dir,
                min_length=int(p.min_length),
                max_percent_stop=int(p.max_percent_stop)),
            steps.make_blast_db(),
            steps.blast(
                workflow.id, p.threads or p.jobs or 30,
                on_cluster=p.threads > 0,
                new_good_proteomes=new_good_proteomes,
                evalue=float(p.evalue)),
            steps.parse_blast_results(),
            steps.clean_database(suffix),
            steps.install_schema(suffix),
            steps.load_blast_results(suffix),
            steps.find_pairs(suffix),
            steps.dump_pairs_to_files(suffix),
            steps.mcl(p.debug),
            steps.step_save_orthogroups(
                new_proteomes_dir if not p.ids_list and p.blast_singletones else None),
        ])

        blastdb = p.blastdb or conf.get('blastdb', None)
        if not p.ids_list:
            workflow.extend([step_blast_singletones(
                p.threads, p.blast_singletones, blastdb, p.debug, p.overwrite)])

        result = workflow.run(start_after, start_from,
                              overwrite=True, ask_before=p.ask_each_step)
        if result == 0:
            log.info('Done.')
            log.info('Log is in ' + join(working_dir, log_fname))
            if isfile(join(working_dir, config.orthogroups_file)):
                log.info('Groups are in ' + join(working_dir, config.orthogroups_file))
                if isfile(config.nice_orthogroups_file):
                    log.info('Groups with aligned columns are in ' +
                             join(working_dir, config.nice_orthogroups_file))
            else:
                log.info('Groups in short format are in ' +
                         join(working_dir, config.short_orthogroups_file))
        return result

    except (KeyboardInterrupt, SystemExit, GeneratorExit):
        return 1
    except Exception:
        log.error('')
        log.exception('Unexpected error!')
        raise
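
# A minimal entry point, assuming the script is invoked directly as in the
# usage examples ("python scenario_2.py ..."); the real module may define
# this elsewhere. main() returns a shell-style exit code.
if __name__ == '__main__':
    import sys
    sys.exit(main(sys.argv[1:]))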