Esempio n. 1
0
def parse_args(args):
    """Parse and validate the command-line arguments of Scenario 1.

    Args:
        args: Sequence of argument strings handed to
            ``ArgumentParser.parse_args`` (typically ``sys.argv[1:]``).

    Returns:
        The parsed namespace. ``out`` is turned into an absolute path and the
        directory is created if it does not exist yet. ``species_list``,
        ``ids_list``, ``proteomes`` and ``annotations``, when given, are
        validated and replaced by absolute paths with ``~`` expanded.
        ``prot_id_field`` is always an ``int``.

    Problems are reported through ``arg_parse_error``, ``check_file`` and
    ``check_dir`` (project helpers; presumably they abort the run -- confirm
    against their definitions).
    """
    op = ArgumentParser(description='Find groups of orthologous genes.')

    op.add_argument('-o', '--out', '--dir', dest='out', required=True)

    op.add_argument('-g', '--gbs', '--annotations', dest='annotations')
    op.add_argument('-p', '--proteins', '--proteomes', dest='proteomes')
    op.add_argument('--no-download', dest='download_anno',
                    action='store_false', default=True)
    op.add_argument('-s', '--species', '--species-list', dest='species_list')
    op.add_argument('-i', '--ids', '--ids-list', dest='ids_list')

    # type=int keeps the value an integer whether it comes from the default
    # or from the command line (without it a user-supplied value is a str).
    op.add_argument('--prot-id-field', dest='prot_id_field', type=int,
                    default=1)

    op.usage = '''Finding orthogroups for a list of annotations / proteomes / ref ids / species.

Test runs:
    python scenario_1.py --ids test_input/ids.txt -o test_ids
    python scenario_1.py --proteomes test_input/proteins -o test_proteomes

Usage: %s [-p <proteomes dir>] [-g <.gb files dir>] [-i <gb ids file>] [-s <strain names file>]
                     -o <dir> [--jobs 30] [--start-from <step num>]
    -o  Output directory.
    -g  Directory with .gb files for references with annotations.
    -p  Directory with fasta (or faa, fa) files of protein sequences. If they
        are named by their reference ids (i.e. NC_005816.1.fasta), annotations
        will be downloaded from NCBI.
    -i  File with reference ids (will be fetched from NCBI).
    -s  File with a list of organism names as in Genbank.

    --prot-id-field
        When specifying proteomes, use this fasta id field number
        to retrieve protein ids (default is 1, like >NC_005816.1|NP_995567.1 ...).
    ''' % basename(__file__)

    add_common_arguments(op)

    p = op.parse_args(args)

    check_common_args(p)

    # argparse already enforces presence of -o; this still catches `-o ""`.
    if not p.out:
        arg_parse_error('Specify output directory with -o.')
    if isfile(expanduser(p.out)):
        arg_parse_error('%s is a file' % p.out)
    p.out = abspath(expanduser(p.out))
    if not isdir(p.out):
        makedirs(p.out)

    # Optional inputs: validate each one that was given, then store it as an
    # absolute path. The order matches the order errors were reported in
    # before this was folded into a loop.
    optional_inputs = (
        ('species_list', check_file),
        ('ids_list', check_file),
        ('proteomes', check_dir),
        ('annotations', check_dir),
    )
    for attr, check in optional_inputs:
        value = getattr(p, attr)
        if value:
            expanded = expanduser(value)
            check(expanded)
            setattr(p, attr, abspath(expanded))

    return p
Esempio n. 2
0
def parse_args(args):
    """Parse and validate the command-line arguments of Scenario 2.

    Args:
        args: Sequence of argument strings handed to
            ``ArgumentParser.parse_args`` (typically ``sys.argv[1:]``).

    Returns:
        The parsed namespace. ``directory`` (``-s1o``) must be an existing
        directory and is stored as an absolute path. ``out_dir`` falls back
        to ``directory`` when ``-s2o``/``-o`` is not given and is stored as
        an absolute path; it is not created here. ``assemblies``,
        ``proteomes``, ``ids_list`` and ``annotations``, when given, are
        validated and replaced by absolute paths with ``~`` expanded;
        ``blastdb`` is only made absolute. ``prot_id_field`` is always an
        ``int``.

    Problems are reported through ``arg_parse_error``, ``check_file`` and
    ``check_dir`` (project helpers; presumably they abort the run -- confirm
    against their definitions).
    """
    import argparse
    op = argparse.ArgumentParser(description='Find groups of orthologous genes.')

    op.add_argument('-s1o', dest='directory', required=True)
    op.add_argument('-s2o', '-o', dest='out_dir', required=False)

    op.add_argument('-a', '--assemblies', dest='assemblies')
    op.add_argument('-g', '--annotations', '--gbs', dest='annotations')
    op.add_argument('-p', '--proteomes', '--proteins', dest='proteomes')
    op.add_argument('-i', '--ids', '--ids-list', dest='ids_list')

    # type=int keeps the value an integer whether it comes from the default
    # or from the command line (without it a user-supplied value is a str).
    op.add_argument('--prot-id-field', dest='prot_id_field', type=int,
                    default=1)
    # BLAST of singletons is on by default; the flag switches it off.
    op.add_argument('--skip-blast-singletones', dest='blast_singletones',
                    action='store_false', default=True)
    op.add_argument('--blastdb', '--blast-db', dest='blastdb')

    op.usage = '''Extends an orthogroup database and orthogroups files.
First argument is a path to existing Scenario 1 output.

Test runs:
    python scenario_2.py -s1o test_ids -s2o test_ids_new_ids --ids test_input/new_ids.txt
    python scenario_2.py -s1o test_proteomes -s2o test_prots_new_prots --proteomes test_input/new_proteins

Usage: %s -s1o <scenario_1 result dir> [-s2o <new output dir>] [-a <assemblies dir>] [-p <proteomes dir>]
                                 [-g <.gb files dir>] [-i <gb ids file>] [-s <strain names file>]
                                 [--jobs 30] [--start-from <step num>]
                                 [--skip-blast-singletones] [--blast-db <path>]
    -s1o                 Path to existing Scenario 1 output.
    -s2o                 Output directory (optional, if omitted, the input directory will be used).

    -a --assemblies:     Directory with assemblies in fasta format.
    -g  Directory with .gb files for references with annotations.
    -p  Directory with fasta (or faa, fa) files of protein sequences. If they
        are named by their reference ids (i.e. NC_005816.1.fasta), annotations
        will be downloaded from NCBI.
    -i  File with reference ids (will be fetched from NCBI).
    -s  File with a list of organism names as in Genbank.

    --prot-id-field
        When specifying proteomes, use this fasta id field number
        to retrieve protein ids (default is 1, like >NC_005816.1|NP_995567.1 ...).

    --skip-blast-singletones
        Do not search newly added proteins against NCBI database, if they did not fit
        any group with known proteins (by default the search is performed).

    --blastdb
        Local BLAST database path. If not set, "blastdb" value in config.txt will be used.
        If it was not set either, remote NCBI will be used.
    ''' % basename(__file__)

    add_common_arguments(op)

    p = op.parse_args(args)

    check_common_args(p)

    # Optional inputs: validate each one that was given, then store it as an
    # absolute path. The order matches the order errors were reported in
    # before this was folded into a loop.
    optional_inputs = (
        ('assemblies', check_dir),
        ('proteomes', check_dir),
        ('ids_list', check_file),
        ('annotations', check_dir),
    )
    for attr, check in optional_inputs:
        value = getattr(p, attr)
        if value:
            expanded = expanduser(value)
            check(expanded)
            setattr(p, attr, abspath(expanded))

    # The BLAST database path is only normalised, not checked for existence.
    if p.blastdb:
        p.blastdb = abspath(expanduser(p.blastdb))

    if not isdir(expanduser(p.directory)):
        arg_parse_error('Directory %s does not exist.' % p.directory)
    p.directory = abspath(expanduser(p.directory))

    # Without an explicit output directory the Scenario 1 directory is reused.
    if not p.out_dir:
        p.out_dir = p.directory
    if isfile(expanduser(p.out_dir)):
        arg_parse_error('%s is a file' % p.out_dir)
    p.out_dir = abspath(expanduser(p.out_dir))

    return p
Esempio n. 3
0
def parse_args(args):
    """Parse and validate the command-line arguments of Scenario 1.

    Args:
        args: Sequence of argument strings handed to
            ``ArgumentParser.parse_args`` (typically ``sys.argv[1:]``).

    Returns:
        The parsed namespace. ``out`` is turned into an absolute path and the
        directory is created if it does not exist yet. ``species_list``,
        ``ids_list``, ``proteomes`` and ``annotations``, when given, are
        validated and replaced by absolute paths with ``~`` expanded.
        ``prot_id_field`` is always an ``int``.

    Problems are reported through ``arg_parse_error``, ``check_file`` and
    ``check_dir`` (project helpers; presumably they abort the run -- confirm
    against their definitions).
    """
    op = ArgumentParser(description='Find groups of orthologous genes.')

    op.add_argument('-o', '--out', '--dir', dest='out', required=True)

    op.add_argument('-g', '--gbs', '--annotations', dest='annotations')
    op.add_argument('-p', '--proteins', '--proteomes', dest='proteomes')
    op.add_argument('--no-download',
                    dest='download_anno',
                    action='store_false',
                    default=True)
    op.add_argument('-s', '--species', '--species-list', dest='species_list')
    op.add_argument('-i', '--ids', '--ids-list', dest='ids_list')

    # type=int keeps the value an integer whether it comes from the default
    # or from the command line (without it a user-supplied value is a str).
    op.add_argument('--prot-id-field',
                    dest='prot_id_field',
                    type=int,
                    default=1)

    op.usage = '''Finding orthogroups for a list of annotations / proteomes / ref ids / species.

Test runs:
    python scenario_1.py --ids test_input/ids.txt -o test_ids
    python scenario_1.py --proteomes test_input/proteins -o test_proteomes

Usage: %s [-p <proteomes dir>] [-g <.gb files dir>] [-i <gb ids file>] [-s <strain names file>]
                     -o <dir> [--jobs 30] [--start-from <step num>]
    -o  Output directory.
    -g  Directory with .gb files for references with annotations.
    -p  Directory with fasta (or faa, fa) files of protein sequences. If they
        are named by their reference ids (i.e. NC_005816.1.fasta), annotations
        will be downloaded from NCBI.
    -i  File with reference ids (will be fetched from NCBI).
    -s  File with a list of organism names as in Genbank.

    --prot-id-field
        When specifying proteomes, use this fasta id field number
        to retrieve protein ids (default is 1, like >NC_005816.1|NP_995567.1 ...).
    ''' % basename(__file__)

    add_common_arguments(op)

    p = op.parse_args(args)

    check_common_args(p)

    # argparse already enforces presence of -o; this still catches `-o ""`.
    if not p.out:
        arg_parse_error('Specify output directory with -o.')
    if isfile(expanduser(p.out)):
        arg_parse_error('%s is a file' % p.out)
    p.out = abspath(expanduser(p.out))
    if not isdir(p.out):
        makedirs(p.out)

    # Optional inputs: validate each one that was given, then store it as an
    # absolute path. The order matches the order errors were reported in
    # before this was folded into a loop.
    optional_inputs = (
        ('species_list', check_file),
        ('ids_list', check_file),
        ('proteomes', check_dir),
        ('annotations', check_dir),
    )
    for attr, check in optional_inputs:
        value = getattr(p, attr)
        if value:
            expanded = expanduser(value)
            check(expanded)
            setattr(p, attr, abspath(expanded))

    return p
Esempio n. 4
0
def parse_args(args):
    """Parse and validate the command-line arguments of Scenario 2.

    Args:
        args: Sequence of argument strings handed to
            ``ArgumentParser.parse_args`` (typically ``sys.argv[1:]``).

    Returns:
        The parsed namespace. ``directory`` (``-s1o``) must be an existing
        directory and is stored as an absolute path. ``out_dir`` falls back
        to ``directory`` when ``-s2o``/``-o`` is not given and is stored as
        an absolute path; it is not created here. ``assemblies``,
        ``proteomes``, ``ids_list`` and ``annotations``, when given, are
        validated and replaced by absolute paths with ``~`` expanded;
        ``blastdb`` is only made absolute. ``prot_id_field`` is always an
        ``int``.

    Problems are reported through ``arg_parse_error``, ``check_file`` and
    ``check_dir`` (project helpers; presumably they abort the run -- confirm
    against their definitions).
    """
    import argparse
    op = argparse.ArgumentParser(
        description='Find groups of orthologous genes.')

    op.add_argument('-s1o', dest='directory', required=True)
    op.add_argument('-s2o', '-o', dest='out_dir', required=False)

    op.add_argument('-a', '--assemblies', dest='assemblies')
    op.add_argument('-g', '--annotations', '--gbs', dest='annotations')
    op.add_argument('-p', '--proteomes', '--proteins', dest='proteomes')
    op.add_argument('-i', '--ids', '--ids-list', dest='ids_list')

    # type=int keeps the value an integer whether it comes from the default
    # or from the command line (without it a user-supplied value is a str).
    op.add_argument('--prot-id-field',
                    dest='prot_id_field',
                    type=int,
                    default=1)
    # BLAST of singletons is on by default; the flag switches it off.
    op.add_argument('--skip-blast-singletones',
                    dest='blast_singletones',
                    action='store_false',
                    default=True)
    op.add_argument('--blastdb', '--blast-db', dest='blastdb')

    op.usage = '''Extends an orthogroup database and orthogroups files.
First argument is a path to existing Scenario 1 output.

Test runs:
    python scenario_2.py -s1o test_ids -s2o test_ids_new_ids --ids test_input/new_ids.txt
    python scenario_2.py -s1o test_proteomes -s2o test_prots_new_prots --proteomes test_input/new_proteins

Usage: %s -s1o <scenario_1 result dir> [-s2o <new output dir>] [-a <assemblies dir>] [-p <proteomes dir>]
                                 [-g <.gb files dir>] [-i <gb ids file>] [-s <strain names file>]
                                 [--jobs 30] [--start-from <step num>]
                                 [--skip-blast-singletones] [--blast-db <path>]
    -s1o                 Path to existing Scenario 1 output.
    -s2o                 Output directory (optional, if omitted, the input directory will be used).

    -a --assemblies:     Directory with assemblies in fasta format.
    -g  Directory with .gb files for references with annotations.
    -p  Directory with fasta (or faa, fa) files of protein sequences. If they
        are named by their reference ids (i.e. NC_005816.1.fasta), annotations
        will be downloaded from NCBI.
    -i  File with reference ids (will be fetched from NCBI).
    -s  File with a list of organism names as in Genbank.

    --prot-id-field
        When specifying proteomes, use this fasta id field number
        to retrieve protein ids (default is 1, like >NC_005816.1|NP_995567.1 ...).

    --skip-blast-singletones
        Do not search newly added proteins against NCBI database, if they did not fit
        any group with known proteins (by default the search is performed).

    --blastdb
        Local BLAST database path. If not set, "blastdb" value in config.txt will be used.
        If it was not set either, remote NCBI will be used.
    ''' % basename(__file__)

    add_common_arguments(op)

    p = op.parse_args(args)

    check_common_args(p)

    # Optional inputs: validate each one that was given, then store it as an
    # absolute path. The order matches the order errors were reported in
    # before this was folded into a loop.
    optional_inputs = (
        ('assemblies', check_dir),
        ('proteomes', check_dir),
        ('ids_list', check_file),
        ('annotations', check_dir),
    )
    for attr, check in optional_inputs:
        value = getattr(p, attr)
        if value:
            expanded = expanduser(value)
            check(expanded)
            setattr(p, attr, abspath(expanded))

    # The BLAST database path is only normalised, not checked for existence.
    if p.blastdb:
        p.blastdb = abspath(expanduser(p.blastdb))

    if not isdir(expanduser(p.directory)):
        arg_parse_error('Directory %s does not exist.' % p.directory)
    p.directory = abspath(expanduser(p.directory))

    # Without an explicit output directory the Scenario 1 directory is reused.
    if not p.out_dir:
        p.out_dir = p.directory
    if isfile(expanduser(p.out_dir)):
        arg_parse_error('%s is a file' % p.out_dir)
    p.out_dir = abspath(expanduser(p.out_dir))

    return p