Esempio n. 1
0
def parse_args(args):
    """Parse and validate the Scenario 1 command line.

    Declares the Scenario 1 options, lets ``add_common_arguments`` and
    ``check_common_args`` deal with the options shared between scenarios,
    then rewrites every path option as an absolute path with ``~`` expanded.
    The output directory is created when it does not exist yet.

    :param args: list of command-line tokens (without the program name).
    :return: the argparse namespace, with ``out``, ``species_list``,
        ``ids_list``, ``proteomes`` and ``annotations`` normalized.
    """
    op = ArgumentParser(description='Find groups of orthologous genes.')

    op.add_argument('-o', '--out', '--dir', dest='out', required=True)

    op.add_argument('-g', '--gbs', '--annotations', dest='annotations')
    op.add_argument('-p', '--proteins', '--proteomes', dest='proteomes')
    op.add_argument('--no-download', dest='download_anno', action='store_false', default=True)
    op.add_argument('-s', '--species', '--species-list', dest='species_list')
    op.add_argument('-i', '--ids', '--ids-list', dest='ids_list')

    # type=int keeps a user-supplied value the same type as the default.
    op.add_argument('--prot-id-field', dest='prot_id_field', type=int, default=1)

    op.usage = '''Finding orthogroups for a list of annotations / proteomes / ref ids / species.

Test runs:
    python scenario_1.py --ids test_input/ids.txt -o test_ids
    python scenario_1.py --proteomes test_input/proteins -o test_proteomes

Usage: %s -o <dir> [-p <proteomes dir>] [-g <.gb files dir>] [-i <gb ids file>] [-s <strain names file>]
                     [--jobs 30] [--start-from <step num>]
    -o  Output directory.
    -g  Directory with .gb files for references with annotations.
    -p  Directory with fasta (or faa, fa) files of protein sequences. If they
        are named by their reference ids (e.g. NC_005816.1.fasta), annotations
        will be downloaded from NCBI.
    -i  File with reference ids (will be fetched from NCBI).
    -s  File with a list of organism names as in Genbank.

    --prot-id-field
        When specifying proteomes, use this fasta id field number
        to retrieve protein ids (default is 1, like >NC_005816.1|NP_995567.1 ...).
    ''' % basename(__file__)

    add_common_arguments(op)

    p = op.parse_args(args)

    check_common_args(p)

    # required=True still lets an empty string through (-o ''), so keep this.
    if not p.out:
        arg_parse_error('Specify output directory with -o.')
    if isfile(expanduser(p.out)):
        arg_parse_error('%s is a file' % p.out)
    p.out = abspath(expanduser(p.out))
    if not isdir(p.out):
        makedirs(p.out)

    # Optional inputs: validate with the matching checker, then store the
    # absolute path. The order matches the original validation order.
    path_options = (
        ('species_list', check_file),
        ('ids_list', check_file),
        ('proteomes', check_dir),
        ('annotations', check_dir),
    )
    for attr, check in path_options:
        value = getattr(p, attr)
        if value:
            check(expanduser(value))
            setattr(p, attr, abspath(expanduser(value)))

    return p
Esempio n. 2
0
def parse_args(args):
    """Parse and validate the Scenario 1 command line.

    Declares the Scenario 1 options, lets ``add_common_arguments`` and
    ``check_common_args`` deal with the options shared between scenarios,
    then rewrites every path option as an absolute path with ``~`` expanded.
    The output directory is created when it does not exist yet.

    :param args: list of command-line tokens (without the program name).
    :return: the argparse namespace, with ``out``, ``species_list``,
        ``ids_list``, ``proteomes`` and ``annotations`` normalized.
    """
    op = ArgumentParser(description='Find groups of orthologous genes.')

    op.add_argument('-o', '--out', '--dir', dest='out', required=True)

    op.add_argument('-g', '--gbs', '--annotations', dest='annotations')
    op.add_argument('-p', '--proteins', '--proteomes', dest='proteomes')
    op.add_argument('--no-download',
                    dest='download_anno',
                    action='store_false',
                    default=True)
    op.add_argument('-s', '--species', '--species-list', dest='species_list')
    op.add_argument('-i', '--ids', '--ids-list', dest='ids_list')

    # type=int keeps a user-supplied value the same type as the default.
    op.add_argument('--prot-id-field',
                    dest='prot_id_field',
                    type=int,
                    default=1)

    op.usage = '''Finding orthogroups for a list of annotations / proteomes / ref ids / species.

Test runs:
    python scenario_1.py --ids test_input/ids.txt -o test_ids
    python scenario_1.py --proteomes test_input/proteins -o test_proteomes

Usage: %s -o <dir> [-p <proteomes dir>] [-g <.gb files dir>] [-i <gb ids file>] [-s <strain names file>]
                     [--jobs 30] [--start-from <step num>]
    -o  Output directory.
    -g  Directory with .gb files for references with annotations.
    -p  Directory with fasta (or faa, fa) files of protein sequences. If they
        are named by their reference ids (e.g. NC_005816.1.fasta), annotations
        will be downloaded from NCBI.
    -i  File with reference ids (will be fetched from NCBI).
    -s  File with a list of organism names as in Genbank.

    --prot-id-field
        When specifying proteomes, use this fasta id field number
        to retrieve protein ids (default is 1, like >NC_005816.1|NP_995567.1 ...).
    ''' % basename(__file__)

    add_common_arguments(op)

    p = op.parse_args(args)

    check_common_args(p)

    # required=True still lets an empty string through (-o ''), so keep this.
    if not p.out:
        arg_parse_error('Specify output directory with -o.')
    if isfile(expanduser(p.out)):
        arg_parse_error('%s is a file' % p.out)
    p.out = abspath(expanduser(p.out))
    if not isdir(p.out):
        makedirs(p.out)

    # Optional inputs: validate with the matching checker, then store the
    # absolute path. The order matches the original validation order.
    path_options = (
        ('species_list', check_file),
        ('ids_list', check_file),
        ('proteomes', check_dir),
        ('annotations', check_dir),
    )
    for attr, check in path_options:
        value = getattr(p, attr)
        if value:
            check(expanduser(value))
            setattr(p, attr, abspath(expanduser(value)))

    return p
Esempio n. 3
0
def parse_args(args):
    """Parse and validate the Scenario 2 command line.

    Declares the Scenario 2 options, lets ``add_common_arguments`` and
    ``check_common_args`` deal with the options shared between scenarios,
    then rewrites every path option as an absolute path with ``~`` expanded.
    When no output directory is given, the Scenario 1 directory is reused.

    :param args: list of command-line tokens (without the program name).
    :return: the argparse namespace, with ``directory``, ``out_dir``,
        ``assemblies``, ``proteomes``, ``ids_list``, ``annotations`` and
        ``blastdb`` normalized.
    """
    import argparse
    op = argparse.ArgumentParser(description='Find groups of orthologous genes.')

    op.add_argument('-s1o', dest='directory', required=True)
    op.add_argument('-s2o', '-o', dest='out_dir', required=False)

    op.add_argument('-a', '--assemblies', dest='assemblies')
    op.add_argument('-g', '--annotations', '--gbs', dest='annotations')
    op.add_argument('-p', '--proteomes', '--proteins', dest='proteomes')
    op.add_argument('-i', '--ids', '--ids-list', dest='ids_list')

    # type=int keeps a user-supplied value the same type as the default.
    op.add_argument('--prot-id-field', dest='prot_id_field', type=int, default=1)
    # Singleton blasting is on by default; the flag switches it off.
    op.add_argument('--skip-blast-singletones', dest='blast_singletones',
                    action='store_false', default=True)
    op.add_argument('--blastdb', '--blast-db', dest='blastdb')

    # The usage text below only lists options this parser really accepts.
    op.usage = '''Extends an orthogroup database and orthogroups files.
-s1o is a path to an existing Scenario 1 output.

Test runs:
    python scenario_2.py -s1o test_ids -s2o test_ids_new_ids --ids test_input/new_ids.txt
    python scenario_2.py -s1o test_proteomes -s2o test_prots_new_prots --proteomes test_input/new_proteins

Usage: %s -s1o <scenario_1 result dir> [-s2o <new output dir>] [-a <assemblies dir>] [-p <proteomes dir>]
                                 [-g <.gb files dir>] [-i <gb ids file>]
                                 [--jobs 30] [--start-from <step num>]
                                 [--skip-blast-singletones] [--blast-db <path>]
    -s1o                 Path to an existing Scenario 1 output.
    -s2o, -o             Output directory (optional, if omitted, the input directory will be used).

    -a --assemblies:     Directory with assemblies in fasta format.
    -g  Directory with .gb files for references with annotations.
    -p  Directory with fasta (or faa, fa) files of protein sequences. If they
        are named by their reference ids (e.g. NC_005816.1.fasta), annotations
        will be downloaded from NCBI.
    -i  File with reference ids (will be fetched from NCBI).

    --prot-id-field
        When specifying proteomes, use this fasta id field number
        to retrieve protein ids (default is 1, like >NC_005816.1|NP_995567.1 ...).

    --skip-blast-singletones
        Do not search newly added proteins against NCBI database when they
        did not fit any group with known proteins (the search is on by default).

    --blastdb
        Local BLAST database path. If not set, "blastdb" value in config.txt will be used.
        If it was not set either, remote NCBI will be used.
    ''' % basename(__file__)

    add_common_arguments(op)

    p = op.parse_args(args)

    check_common_args(p)

    # Optional inputs: validate with the matching checker, then store the
    # absolute path. The order matches the original validation order.
    path_options = (
        ('assemblies', check_dir),
        ('proteomes', check_dir),
        ('ids_list', check_file),
        ('annotations', check_dir),
    )
    for attr, check in path_options:
        value = getattr(p, attr)
        if value:
            check(expanduser(value))
            setattr(p, attr, abspath(expanduser(value)))

    # The BLAST database path is only normalized, not checked for existence.
    if p.blastdb:
        p.blastdb = abspath(expanduser(p.blastdb))

    if not isdir(expanduser(p.directory)):
        arg_parse_error('Directory %s does not exist.' % p.directory)
    p.directory = abspath(expanduser(p.directory))

    # Without -s2o/-o the Scenario 1 directory is used as the output.
    if not p.out_dir:
        p.out_dir = p.directory
    if isfile(expanduser(p.out_dir)):
        arg_parse_error('%s is a file' % p.out_dir)
    p.out_dir = abspath(expanduser(p.out_dir))

    return p
Esempio n. 4
0
def main(args):
    """Run Scenario 2 on top of an existing Scenario 1 result directory.

    Parses the command line, optionally copies the Scenario 1 directory into
    a separate output directory, sets up logging and configuration, builds
    the workflow steps and runs them.

    :param args: list of command-line tokens (without the program name).
    :return: the value returned by ``workflow.run``, or 1 when the run is
        interrupted (KeyboardInterrupt, SystemExit or GeneratorExit).
    :raises Exception: any other error is logged and re-raised.
    """
    register_ctrl_c()

    p = parse_args(args)

    try:
        if not exists(join(p.directory, 'intermediate')):
            arg_parse_error('You need to run Scenario 1 on this directory first.')

        if not p.out_dir:
            p.out_dir = p.directory

        working_dir = p.out_dir

        # Read "key=value" pairs, skipping blank lines and '#' comments.
        with open(config_file) as cf:
            stripped_lines = (line.strip() for line in cf)
            conf = dict(
                line.split('=', 1) for line in stripped_lines
                if line and line[0] != '#')

        start_from, start_after = get_starting_step(p.start_from, join(working_dir, log_fname))

        # A fresh run into a separate directory starts from a copy of the
        # Scenario 1 results.
        if (not start_from or start_from == 1) and p.out_dir != p.directory:
            out_log_fpath = join(p.out_dir, log_fname)
            log_text = ''

            if isdir(p.out_dir):
                if not p.overwrite:
                    has_visible_files = any(
                        f and f[0] != '.' for f in listdir(p.out_dir))
                    if has_visible_files:
                        print('The output directory exists. Do you want to overwrite it? ' +
                              '(You can run with the --overwrite option to avoid this warning.)')
                        try:
                            raw_input('Press any key to overwrite and continue, or Ctrl+C to interrupt.\n> ')
                        except (EOFError, KeyboardInterrupt, SystemExit, GeneratorExit):
                            exit(1)
                # Keep the previous log text so it survives the wipe below.
                if exists(out_log_fpath):
                    with open(out_log_fpath) as log_f:
                        log_text = log_f.read()
                # NOTE(review): if p.directory lies inside p.out_dir this
                # removes the source before it is copied - confirm that
                # nested directories are never passed.
                rmtree(p.out_dir)

            # copytree needs the destination itself to be absent; the
            # makedirs/rmdir pair leaves only its parent directories behind.
            makedirs(p.out_dir)
            rmdir(p.out_dir)
            copytree(p.directory, p.out_dir)
            # Drop the log copied from Scenario 1, then restore the old one.
            if isfile(out_log_fpath):
                remove(out_log_fpath)
            chdir(p.out_dir)
            if log_text:
                with open(out_log_fpath, 'w') as log_f:
                    log_f.write(log_text)

        log_fpath = set_up_logging(p.debug, p.out_dir, 'a')
        log.info('python ' + basename(__file__) + ' ' + ' '.join(args))
        log.info('')
        check_and_install_tools(p.debug, conf.get('db_vendor', 'sqlite'), log_fpath)

        log.info('Changing to %s' % working_dir)
        if not isdir(working_dir):
            makedirs(working_dir)
        chdir(working_dir)

        set_up_config(working_dir)

        # Building the workflow
        workflow = Workflow(working_dir, id=make_workflow_id(working_dir),
                            cmdline_args=['python', __file__] + args)
        log.info('Workflow id is "' + workflow.id + '"')
        log.info('')

        # For every vendor except sqlite the database steps receive a suffix
        # built from the workflow id.
        if conf.get('db_vendor', 'sqlite') == 'sqlite':
            suffix = ''
        else:
            suffix = '_' + workflow.id

        workflow.extend([
            step_prepare_input(p),
            steps.filter_proteomes(
                min_length=int(p.min_length),
                max_percent_stop=int(p.max_percent_stop)),
            filter_new_proteomes(
                new_proteomes_dir,
                min_length=int(p.min_length),
                max_percent_stop=int(p.max_percent_stop)),
            steps.make_blast_db(),
            steps.blast(
                workflow.id,
                p.threads or p.jobs or 30,
                on_cluster=p.threads > 0,
                new_good_proteomes=new_good_proteomes,
                evalue=float(p.evalue)),
            steps.parse_blast_results(),
            steps.clean_database(suffix),
            steps.install_schema(suffix),
            steps.load_blast_results(suffix),
            steps.find_pairs(suffix),
            steps.dump_pairs_to_files(suffix),
            steps.mcl(p.debug),
            steps.step_save_orthogroups(new_proteomes_dir if not p.ids_list and p.blast_singletones else None)
        ])

        # The command-line value wins over the config file entry.
        blastdb = p.blastdb or conf.get('blastdb', None)

        if not p.ids_list:
            workflow.extend([step_blast_singletones(p.threads, p.blast_singletones, blastdb, p.debug, p.overwrite)])

        result = workflow.run(
            start_after, start_from,
            overwrite=True,
            ask_before=p.ask_each_step)

        if result == 0:
            log.info('Done.')
            log.info('Log is in ' + join(working_dir, log_fname))
            if isfile(join(working_dir, config.orthogroups_file)):
                log.info('Groups are in ' + join(working_dir, config.orthogroups_file))
                # Check the same path that is reported, not one relative to
                # the current directory.
                if isfile(join(working_dir, config.nice_orthogroups_file)):
                    log.info('Groups with aligned columns are in ' +
                             join(working_dir, config.nice_orthogroups_file))
            else:
                log.info('Groups in short format are in ' + join(working_dir, config.short_orthogroups_file))

        return result

    except (KeyboardInterrupt, SystemExit, GeneratorExit):
        return 1

    except Exception:
        log.error('')
        log.exception('Unexpected error!')
        raise
Esempio n. 5
0
def main(args):
    """Run Scenario 2 on top of an existing Scenario 1 result directory.

    Parses the command line, optionally copies the Scenario 1 directory into
    a separate output directory, sets up logging and configuration, builds
    the workflow steps and runs them.

    :param args: list of command-line tokens (without the program name).
    :return: the value returned by ``workflow.run``, or 1 when the run is
        interrupted (KeyboardInterrupt, SystemExit or GeneratorExit).
    :raises Exception: any other error is logged and re-raised.
    """
    register_ctrl_c()

    p = parse_args(args)

    try:
        if not exists(join(p.directory, 'intermediate')):
            arg_parse_error(
                'You need to run Scenario 1 on this directory first.')

        if not p.out_dir:
            p.out_dir = p.directory

        working_dir = p.out_dir

        # Read "key=value" pairs, skipping blank lines and '#' comments.
        with open(config_file) as cf:
            stripped_lines = (line.strip() for line in cf)
            conf = dict(
                line.split('=', 1) for line in stripped_lines
                if line and line[0] != '#')

        start_from, start_after = get_starting_step(
            p.start_from, join(working_dir, log_fname))

        # A fresh run into a separate directory starts from a copy of the
        # Scenario 1 results.
        if (not start_from or start_from == 1) and p.out_dir != p.directory:
            out_log_fpath = join(p.out_dir, log_fname)
            log_text = ''

            if isdir(p.out_dir):
                if not p.overwrite:
                    has_visible_files = any(
                        f and f[0] != '.' for f in listdir(p.out_dir))
                    if has_visible_files:
                        print(
                            'The output directory exists. Do you want to overwrite it? '
                            +
                            '(You can run with the --overwrite option to avoid this warning.)'
                        )
                        try:
                            raw_input(
                                'Press any key to overwrite and continue, or Ctrl+C to interrupt.\n> '
                            )
                        except (EOFError, KeyboardInterrupt, SystemExit,
                                GeneratorExit):
                            exit(1)
                # Keep the previous log text so it survives the wipe below.
                if exists(out_log_fpath):
                    with open(out_log_fpath) as log_f:
                        log_text = log_f.read()
                # NOTE(review): if p.directory lies inside p.out_dir this
                # removes the source before it is copied - confirm that
                # nested directories are never passed.
                rmtree(p.out_dir)

            # copytree needs the destination itself to be absent; the
            # makedirs/rmdir pair leaves only its parent directories behind.
            makedirs(p.out_dir)
            rmdir(p.out_dir)
            copytree(p.directory, p.out_dir)
            # Drop the log copied from Scenario 1, then restore the old one.
            if isfile(out_log_fpath):
                remove(out_log_fpath)
            chdir(p.out_dir)
            if log_text:
                with open(out_log_fpath, 'w') as log_f:
                    log_f.write(log_text)

        log_fpath = set_up_logging(p.debug, p.out_dir, 'a')
        log.info('python ' + basename(__file__) + ' ' + ' '.join(args))
        log.info('')
        check_and_install_tools(p.debug, conf.get('db_vendor', 'sqlite'),
                                log_fpath)

        log.info('Changing to %s' % working_dir)
        if not isdir(working_dir):
            makedirs(working_dir)
        chdir(working_dir)

        set_up_config(working_dir)

        # Building the workflow
        workflow = Workflow(working_dir,
                            id=make_workflow_id(working_dir),
                            cmdline_args=['python', __file__] + args)
        log.info('Workflow id is "' + workflow.id + '"')
        log.info('')

        # For every vendor except sqlite the database steps receive a suffix
        # built from the workflow id.
        if conf.get('db_vendor', 'sqlite') == 'sqlite':
            suffix = ''
        else:
            suffix = '_' + workflow.id

        workflow.extend([
            step_prepare_input(p),
            steps.filter_proteomes(min_length=int(p.min_length),
                                   max_percent_stop=int(p.max_percent_stop)),
            filter_new_proteomes(new_proteomes_dir,
                                 min_length=int(p.min_length),
                                 max_percent_stop=int(p.max_percent_stop)),
            steps.make_blast_db(),
            steps.blast(workflow.id,
                        p.threads or p.jobs or 30,
                        on_cluster=p.threads > 0,
                        new_good_proteomes=new_good_proteomes,
                        evalue=float(p.evalue)),
            steps.parse_blast_results(),
            steps.clean_database(suffix),
            steps.install_schema(suffix),
            steps.load_blast_results(suffix),
            steps.find_pairs(suffix),
            steps.dump_pairs_to_files(suffix),
            steps.mcl(p.debug),
            steps.step_save_orthogroups(new_proteomes_dir if not p.ids_list
                                        and p.blast_singletones else None)
        ])

        # The command-line value wins over the config file entry.
        blastdb = p.blastdb or conf.get('blastdb', None)

        if not p.ids_list:
            workflow.extend([
                step_blast_singletones(p.threads, p.blast_singletones, blastdb,
                                       p.debug, p.overwrite)
            ])

        result = workflow.run(start_after,
                              start_from,
                              overwrite=True,
                              ask_before=p.ask_each_step)

        if result == 0:
            log.info('Done.')
            log.info('Log is in ' + join(working_dir, log_fname))
            if isfile(join(working_dir, config.orthogroups_file)):
                log.info('Groups are in ' +
                         join(working_dir, config.orthogroups_file))
                # Check the same path that is reported, not one relative to
                # the current directory.
                if isfile(join(working_dir, config.nice_orthogroups_file)):
                    log.info('Groups with aligned columns are in ' +
                             join(working_dir, config.nice_orthogroups_file))
            else:
                log.info('Groups in short format are in ' +
                         join(working_dir, config.short_orthogroups_file))

        return result

    except (KeyboardInterrupt, SystemExit, GeneratorExit):
        return 1

    except Exception:
        log.error('')
        log.exception('Unexpected error!')
        raise
Esempio n. 6
0
def parse_args(args):
    """Parse and validate the Scenario 2 command line.

    Declares the Scenario 2 options, lets ``add_common_arguments`` and
    ``check_common_args`` deal with the options shared between scenarios,
    then rewrites every path option as an absolute path with ``~`` expanded.
    When no output directory is given, the Scenario 1 directory is reused.

    :param args: list of command-line tokens (without the program name).
    :return: the argparse namespace, with ``directory``, ``out_dir``,
        ``assemblies``, ``proteomes``, ``ids_list``, ``annotations`` and
        ``blastdb`` normalized.
    """
    import argparse
    op = argparse.ArgumentParser(
        description='Find groups of orthologous genes.')

    op.add_argument('-s1o', dest='directory', required=True)
    op.add_argument('-s2o', '-o', dest='out_dir', required=False)

    op.add_argument('-a', '--assemblies', dest='assemblies')
    op.add_argument('-g', '--annotations', '--gbs', dest='annotations')
    op.add_argument('-p', '--proteomes', '--proteins', dest='proteomes')
    op.add_argument('-i', '--ids', '--ids-list', dest='ids_list')

    # type=int keeps a user-supplied value the same type as the default.
    op.add_argument('--prot-id-field',
                    dest='prot_id_field',
                    type=int,
                    default=1)
    # Singleton blasting is on by default; the flag switches it off.
    op.add_argument('--skip-blast-singletones',
                    dest='blast_singletones',
                    action='store_false',
                    default=True)
    op.add_argument('--blastdb', '--blast-db', dest='blastdb')

    # The usage text below only lists options this parser really accepts.
    op.usage = '''Extends an orthogroup database and orthogroups files.
-s1o is a path to an existing Scenario 1 output.

Test runs:
    python scenario_2.py -s1o test_ids -s2o test_ids_new_ids --ids test_input/new_ids.txt
    python scenario_2.py -s1o test_proteomes -s2o test_prots_new_prots --proteomes test_input/new_proteins

Usage: %s -s1o <scenario_1 result dir> [-s2o <new output dir>] [-a <assemblies dir>] [-p <proteomes dir>]
                                 [-g <.gb files dir>] [-i <gb ids file>]
                                 [--jobs 30] [--start-from <step num>]
                                 [--skip-blast-singletones] [--blast-db <path>]
    -s1o                 Path to an existing Scenario 1 output.
    -s2o, -o             Output directory (optional, if omitted, the input directory will be used).

    -a --assemblies:     Directory with assemblies in fasta format.
    -g  Directory with .gb files for references with annotations.
    -p  Directory with fasta (or faa, fa) files of protein sequences. If they
        are named by their reference ids (e.g. NC_005816.1.fasta), annotations
        will be downloaded from NCBI.
    -i  File with reference ids (will be fetched from NCBI).

    --prot-id-field
        When specifying proteomes, use this fasta id field number
        to retrieve protein ids (default is 1, like >NC_005816.1|NP_995567.1 ...).

    --skip-blast-singletones
        Do not search newly added proteins against NCBI database when they
        did not fit any group with known proteins (the search is on by default).

    --blastdb
        Local BLAST database path. If not set, "blastdb" value in config.txt will be used.
        If it was not set either, remote NCBI will be used.
    ''' % basename(__file__)

    add_common_arguments(op)

    p = op.parse_args(args)

    check_common_args(p)

    # Optional inputs: validate with the matching checker, then store the
    # absolute path. The order matches the original validation order.
    path_options = (
        ('assemblies', check_dir),
        ('proteomes', check_dir),
        ('ids_list', check_file),
        ('annotations', check_dir),
    )
    for attr, check in path_options:
        value = getattr(p, attr)
        if value:
            check(expanduser(value))
            setattr(p, attr, abspath(expanduser(value)))

    # The BLAST database path is only normalized, not checked for existence.
    if p.blastdb:
        p.blastdb = abspath(expanduser(p.blastdb))

    if not isdir(expanduser(p.directory)):
        arg_parse_error('Directory %s does not exist.' % p.directory)
    p.directory = abspath(expanduser(p.directory))

    # Without -s2o/-o the Scenario 1 directory is used as the output.
    if not p.out_dir:
        p.out_dir = p.directory
    if isfile(expanduser(p.out_dir)):
        arg_parse_error('%s is a file' % p.out_dir)
    p.out_dir = abspath(expanduser(p.out_dir))

    return p