Ejemplo n.º 1
0
def fetch(arg, log):
    """A function to download the appropriate files from Ensembl Plants and
    Phytozome. Will also convert them to BLAST databases."""
    fetchdeps = check_modules.check_modules(fetch=True)
    if fetchdeps:
        check_modules.missing_mods(fetchdeps)
        exit(1)
    #   Next we check for the presence of the utilities we need to
    #   do the fetching
    missing_reqs = check_modules.missing_executables(
        [arg['bash_path'],
         'makeblastdb',
         'gzip',
         'sum'])
    if missing_reqs:
        log.error(
            'Some required executables were not found on your system: ' +
            '\n'.join(missing_reqs) + '\nPlease install them to continue.')
        exit(1)
    #   Import the main fetching script
    #   We do it here, since we only want to import this if we are fetching
    import lrt_predict.Fetch.phytozome as phytozome
    import lrt_predict.Fetch.ensembl as ensembl
    #   Create a new Phytozome instance that will handle our work with
    #   the JGI Genomes Portal.
    log.debug('Creating a new Phytozome instance to fetch data.')
    phy = phytozome.Phytozome(
        arg['user'],
        arg['password'],
        arg['base'],
        arg['convert_only'],
        arg['loglevel'])
    log.debug('Creating a new Ensembl instance to fetch data.')
    ens = ensembl.EnsemblPlants(
        arg['base'],
        arg['convert_only'],
        arg['loglevel'])

    if arg['convert_only']:
        log.debug('Only converting files.')
        ens.convert()
        phy.convert()
    elif arg['fetch_only']:
        log.debug('Only downloading files.')
        log.info('Fetching from Ensembl Plants...')
        ens.get_ftp_urls()
        ens.download_files()
        log.info('Fetching from Phytozome...')
        phy.get_xml_urls()
        phy.fetch_cds()
    else:
        log.debug('Downloading and converting Ensembl Plants files...')
        ens.get_ftp_urls()
        ens.download_files()
        log.debug('Downloading and converting files.')
        phy.get_xml_urls()
        phy.fetch_cds()
        ens.convert()
        phy.convert()
    return
Ejemplo n.º 2
0
def blast(arg, log):
    """A function to search the databses with BLAST and collect the
    homologous sequences from them."""
    blastdeps = check_modules.check_modules(predict=True)
    #   Check the module dependencies for the BLAST function
    if blastdeps:
        check_modules.missing_mods(blastdeps)
        exit(1)
    missing_reqs = check_modules.missing_executables(
        [arg['bash_path'], arg['tblastx_path']])
    #   And then check the executable dependencies
    if missing_reqs:
        log.error('Some required executables were not found on your system: ' +
                  '\n'.join(missing_reqs) +
                  '\nPlease install them to continue.')
        exit(1)
    #   If all that checks out, import the BLAST class script
    import lrt_predict.Blast.blast_search as blast_search
    log.info('Creating a new instance to BLAST.')
    b_search = blast_search.BlastSearch(arg['base'], arg['target'],
                                        arg['fasta'], arg['evalue'],
                                        arg['loglevel'])
    b_search.blast_all()
    #   hom contains the file object that has the unaligned sequence in it.
    hom = b_search.get_hit_seqs()
    return hom
Ejemplo n.º 3
0
def predict(arg, log):
    """A function to run the HYPHY codon prediction model on each column of
    the alignment and return a score for each one."""
    predictdeps = check_modules.check_modules(predict=True)
    if predictdeps:
        check_modules.missing_mods(predictdeps)
        exit(1)
    #   Check for the required executables
    missing_reqs = check_modules.missing_executables(
        [
            arg['bash_path'],
            arg['hyphy_path']
        ])
    if missing_reqs:
        log.error(
            'Some required executables were not found on your system: ' +
            '\n'.join(missing_reqs) + '\nPlease install them to continue.')
        exit(1)
    #   import the predict script
    import lrt_predict.Predict.predict as predictor
    #   Create a new instance of class LRTPredict
    lrt = predictor.LRTPredict(
        arg['hyphy_path'],
        arg['alignment'],
        arg['tree'],
        arg['fasta'],
        arg['substitutions'],
        arg['loglevel'])
    lrt.get_query_position()
    lrt.get_aligned_positions()
    lrt.write_aligned_subs()
    lrt.prepare_hyphy_inputs()
    outputfile = lrt.predict_codons()
    return outputfile
Ejemplo n.º 4
0
def blast(arg, log):
    """A function to search the databses with BLAST and collect the
    homologous sequences from them."""
    blastdeps = check_modules.check_modules(predict=True)
    #   Check the module dependencies for the BLAST function
    if blastdeps:
        check_modules.missing_mods(blastdeps)
        exit(1)
    missing_reqs = check_modules.missing_executables(
        [
            arg['bash_path'],
            arg['tblastx_path']
        ])
    #   And then check the executable dependencies
    if missing_reqs:
        log.error(
            'Some required executables were not found on your system: ' +
            '\n'.join(missing_reqs) + '\nPlease install them to continue.')
        exit(1)
    #   If all that checks out, import the BLAST class script
    import lrt_predict.Blast.blast_search as blast_search
    log.info('Creating a new instance to BLAST.')
    b_search = blast_search.BlastSearch(
        arg['base'],
        arg['target'],
        arg['fasta'],
        arg['evalue'],
        arg['loglevel'])
    b_search.blast_all()
    #   hom contains the file object that has the unaligned sequence in it.
    hom = b_search.get_hit_seqs()
    return hom
Ejemplo n.º 5
0
def predict(arg, log):
    """A function to run the HYPHY codon prediction model on each column of
    the alignment and return a score for each one."""
    predictdeps = check_modules.check_modules(predict=True)
    if predictdeps:
        check_modules.missing_mods(predictdeps)
        exit(1)
    #   Check for the required executables
    missing_reqs = check_modules.missing_executables(
        [arg['bash_path'], arg['hyphy_path']])
    if missing_reqs:
        log.error('Some required executables were not found on your system: ' +
                  '\n'.join(missing_reqs) +
                  '\nPlease install them to continue.')
        exit(1)
    #   import the predict script
    import lrt_predict.Predict.predict as predictor
    #   Create a new instance of class LRTPredict
    lrt = predictor.LRTPredict(arg['hyphy_path'], arg['alignment'],
                               arg['tree'], arg['fasta'], arg['substitutions'],
                               arg['loglevel'])
    lrt.get_query_position()
    lrt.get_aligned_positions()
    lrt.write_aligned_subs()
    lrt.prepare_hyphy_inputs()
    outputfile = lrt.predict_codons()
    return outputfile
Ejemplo n.º 6
0
def align(arg, unaligned, log):
    """A function to align the homologous sequences with pasta, and return
    the aligned sequences and the phylogenetic tree."""
    aligndeps = check_modules.check_modules(predict=True)
    if aligndeps:
        check_modules.missing_mods(aligndeps)
        exit(1)
    #   Check for the required executables
    missing_reqs = check_modules.missing_executables([
        arg['bash_path'], arg['pasta_path'], arg['clustalo_path'],
        arg['fasttree_path']
    ])
    if missing_reqs:
        log.error('Some required executables were not found on your system: ' +
                  '\n'.join(missing_reqs) +
                  '\nPlease install them to continue.')
        exit(1)
    #   Then we import the necessary modules
    import lrt_predict.Predict.align as aligner
    log.info('Creating a new instance of PastaAlign.')
    aln = aligner.PastaAlign(arg['pasta_path'], arg['clustalo_path'],
                             arg['fasttree_path'], unaligned, arg['fasta'],
                             arg['loglevel'])
    #   Prepare the sequences for alignment:
    #       Check length is multiple of 3
    #       Translate to protein
    #       Remove STOP codons
    nseqs = aln.prepare_sequences()
    #   Then align them
    if nseqs > 2:
        stdout, stderr = aln.pasta_align()
        log.debug('stdout: \n' + stdout.decode('utf-8'))
        log.debug('stderr: \n' + stderr.decode('utf-8'))
    else:
        log.warning('Only two sequences; using clustal-omega for alignment.')
        stdout, stderr = aln.clustalo_align()
        log.debug('stdout: \n' + stdout.decode('utf-8'))
        log.debug('stderr: \n' + stderr.decode('utf-8'))
    #   Backtranslate the alignment
    aln.back_translate()
    #   Then sanitize the alignment and tree
    aln.sanitize_outputs()
    #   Then copy them over
    log.info('Nucleotide alignment in ' + aln.final_aln)
    log.info('Tree in ' + aln.tree_out)
    new_nuc = os.path.join(
        arg['output'],
        os.path.basename(arg['fasta'].replace('.fasta', '_MSA.fasta')))
    new_tree = os.path.join(
        arg['output'],
        os.path.basename(arg['fasta'].replace('.fasta', '.tree')))
    open(new_nuc, 'w').close()
    open(new_tree, 'w').close()
    shutil.copy2(aln.final_aln, new_nuc)
    shutil.copy2(aln.tree_out, new_tree)
    log.info('MSA copied to ' + new_nuc)
    log.info('Tree copied to ' + new_tree)
    #   Cleanup the temporary file
    os.remove(aln.final_aln)
    return
Ejemplo n.º 7
0
def fetch(arg, log):
    """A function to download the appropriate files from Ensembl Plants and
    Phytozome. Will also convert them to BLAST databases."""
    fetchdeps = check_modules.check_modules(fetch=True)
    if fetchdeps:
        check_modules.missing_mods(fetchdeps)
        exit(1)
    #   Next we check for the presence of the utilities we need to
    #   do the fetching
    missing_reqs = check_modules.missing_executables(
        [arg['bash_path'], 'makeblastdb', 'gzip', 'sum'])
    if missing_reqs:
        log.error('Some required executables were not found on your system: ' +
                  '\n'.join(missing_reqs) +
                  '\nPlease install them to continue.')
        exit(1)
    #   Import the main fetching script
    #   We do it here, since we only want to import this if we are fetching
    import lrt_predict.Fetch.phytozome as phytozome
    import lrt_predict.Fetch.ensembl as ensembl
    #   Create a new Phytozome instance that will handle our work with
    #   the JGI Genomes Portal.
    log.debug('Creating a new Phytozome instance to fetch data.')
    phy = phytozome.Phytozome(arg['user'], arg['password'], arg['base'],
                              arg['convert_only'], arg['loglevel'])
    log.debug('Creating a new Ensembl instance to fetch data.')
    ens = ensembl.EnsemblPlants(arg['base'], arg['convert_only'],
                                arg['loglevel'])

    if arg['convert_only']:
        log.debug('Only converting files.')
        ens.convert()
        phy.convert()
    elif arg['fetch_only']:
        log.debug('Only downloading files.')
        log.info('Fetching from Ensembl Plants...')
        ens.get_ftp_urls()
        ens.download_files()
        log.info('Fetching from Phytozome...')
        phy.get_xml_urls()
        phy.fetch_cds()
    else:
        log.debug('Downloading and converting Ensembl Plants files...')
        ens.get_ftp_urls()
        ens.download_files()
        log.debug('Downloading and converting files.')
        phy.get_xml_urls()
        phy.fetch_cds()
        ens.convert()
        phy.convert()
    return
Ejemplo n.º 8
0
def setup(arg):
    """A function to write the configuration file."""
    #   Check for the required modules for setup
    setupdeps = check_modules.check_modules(setup=True)
    if setupdeps:
        check_modules.missing_mods(setupdeps)
        exit(1)
    #   Import the setup script
    import lrt_predict.Setup.setup_env as setup_env
    #   Start a new instance of the configuration class
    s_env = setup_env.SetupEnv(arg['base'], arg['deps_dir'], arg['target'],
                               arg['evalue'], arg['config'], arg['loglevel'])
    s_env.write_config()
    return
Ejemplo n.º 9
0
def setup(arg):
    """A function to write the configuration file."""
    #   Check for the required modules for setup
    setupdeps = check_modules.check_modules(setup=True)
    if setupdeps:
        check_modules.missing_mods(setupdeps)
        exit(1)
    #   Import the setup script
    import lrt_predict.Setup.setup_env as setup_env
    #   Start a new instance of the configuration class
    s_env = setup_env.SetupEnv(
        arg['base'],
        arg['deps_dir'],
        arg['target'],
        arg['evalue'],
        arg['config'],
        arg['loglevel'])
    s_env.write_config()
    return
Ejemplo n.º 10
0
def main():
    """The main function."""
    #   The very first thing we do is do a base check to make sure that we can
    #   parse arguments
    dep = check_modules.check_modules()
    if dep:
        check_modules.missing_mods(dep)
        exit(1)
    #   Parse the arguments
    #   First, a check to see if any arguments were sent at all
    #   If not, then print the usage and exit
    if not sys.argv[1:]:
        parse_args.usage()
        exit(1)
    #   Import the config handling script
    import lrt_predict.Setup.parse_config as config
    #   vars() will convert from the weird Namespace type to a dictionary
    arguments = vars(parse_args.parse_args())
    #   Pull out the verbosity switch right away
    loglevel = set_verbosity.verbosity('LRT_Predict', arguments['loglevel'])
    #   If the config variable was passed
    if 'config' in arguments:
        #   We ask then if the setup command was not passed
        #   If the user wants to setup, then don't bother trying to validate
        #   the config.
        if arguments['action'] != 'setup':
            cfg = config.ConfigHandler(
                arguments['config'],
                arguments,
                arguments['loglevel'])
            if cfg.is_valid():
                cfg.read_vars()
                config_opts = cfg.merge_options()
            else:
                loglevel.error('Config file is not valid!')
                exit(1)
        #   Else, just set it to the options that were passed
        else:
            config_opts = arguments
    else:
        config_opts = arguments
    arguments_valid, msg = parse_args.validate_args(config_opts, loglevel)
    #   If we got a return value that isn't False, then our arguments are good
    if arguments_valid:
        loglevel.debug(arguments_valid['action'] + ' subcommand was invoked')
        #   Which command was invoked?
        if arguments_valid['action'] == 'setup':
            setup(arguments_valid)
        elif arguments_valid['action'] == 'fetch':
            fetch(arguments_valid, loglevel)
        elif arguments_valid['action'] == 'align':
            #   We will return the filename that contains the unaligned
            #   sequences, as we will use these as inputs for pasta
            unaligned_seqs = blast(arguments_valid, loglevel)
            #   Then add the query sequence and align them
            align(
                arguments_valid,
                unaligned_seqs,
                loglevel)
        elif arguments_valid['action'] == 'predict':
            out = predict(arguments_valid, loglevel)
            #   copy the output file into the destination directory
            #   To build the output filename, we join the output directory
            #   with a new name based on the input filename
            out_fname = os.path.join(
                arguments_valid['output'],
                os.path.basename(
                    arguments_valid['fasta'].replace(
                        '.fasta',
                        '_Predictions.txt')
                    )
                )
            open(out_fname, 'w').close()
            shutil.copy2(out.name, out_fname)
            loglevel.info('Prediction in ' + out_fname)
        elif arguments_valid['action'] == 'compile':
            compile_preds(arguments_valid, loglevel)
            return
    else:
        loglevel.error(msg)
    return
Ejemplo n.º 11
0
def align(arg, unaligned, log):
    """A function to align the homologous sequences with pasta, and return
    the aligned sequences and the phylogenetic tree."""
    aligndeps = check_modules.check_modules(predict=True)
    if aligndeps:
        check_modules.missing_mods(aligndeps)
        exit(1)
    #   Check for the required executables
    missing_reqs = check_modules.missing_executables(
        [
            arg['bash_path'],
            arg['pasta_path']
        ])
    if missing_reqs:
        log.error(
            'Some required executables were not found on your system: ' +
            '\n'.join(missing_reqs) + '\nPlease install them to continue.')
        exit(1)
    #   Then we import the necessary modules
    import lrt_predict.Predict.align as aligner
    log.info('Creating a new instance of PastaAlign.')
    aln = aligner.PastaAlign(
        arg['pasta_path'],
        unaligned,
        arg['fasta'],
        arg['loglevel'])
    #   Prepare the sequences for alignment:
    #       Check length is multiple of 3
    #       Translate to protein
    #       Remove STOP codons
    aln.prepare_sequences()
    #   Then align them
    stdout, stderr = aln.pasta_align()
    log.debug('stdout: \n' + stdout)
    log.debug('stderr: \n' + stderr)
    #   Backtranslate the alignment
    aln.back_translate()
    #   Then sanitize the alignment and tree
    aln.sanitize_outputs()
    #   Then copy them over
    log.info('Nucleotide alignment in ' + aln.final_aln)
    log.info('Tree in ' + aln.tree_out)
    new_nuc = os.path.join(
        arg['output'],
        os.path.basename(
            arg['fasta'].replace(
                '.fasta',
                '_MSA.fasta')
            )
        )
    new_tree = os.path.join(
        arg['output'],
        os.path.basename(
            arg['fasta'].replace('.fasta', '.tree')
            )
        )
    open(new_nuc, 'w').close()
    open(new_tree, 'w').close()
    shutil.copy2(aln.final_aln, new_nuc)
    shutil.copy2(aln.tree_out, new_tree)
    log.info('MSA copied to ' + new_nuc)
    log.info('Tree copied to ' + new_tree)
    #   Cleanup the temporary file
    os.remove(aln.final_aln)
    return
Ejemplo n.º 12
0
def main():
    """The main function."""
    #   The very first thing we do is do a base check to make sure that we can
    #   parse arguments
    dep = check_modules.check_modules()
    if dep:
        check_modules.missing_mods(dep)
        exit(1)
    #   Parse the arguments
    #   First, a check to see if any arguments were sent at all
    #   If not, then print the usage and exit
    if not sys.argv[1:]:
        parse_args.usage()
        exit(1)
    #   Import the config handling script
    import lrt_predict.Setup.parse_config as config
    #   vars() will convert from the weird Namespace type to a dictionary
    arguments = vars(parse_args.parse_args())
    #   Pull out the verbosity switch right away
    loglevel = set_verbosity.verbosity('LRT_Predict', arguments['loglevel'])
    #   If the config variable was passed
    if 'config' in arguments:
        #   We ask then if the setup command was not passed
        #   If the user wants to setup, then don't bother trying to validate
        #   the config.
        if arguments['action'] != 'setup':
            cfg = config.ConfigHandler(arguments['config'], arguments,
                                       arguments['loglevel'])
            if cfg.is_valid():
                cfg.read_vars()
                config_opts = cfg.merge_options()
            else:
                loglevel.error('Config file is not valid!')
                exit(1)
        #   Else, just set it to the options that were passed
        else:
            config_opts = arguments
    else:
        config_opts = arguments
    arguments_valid, msg = parse_args.validate_args(config_opts, loglevel)
    #   If we got a return value that isn't False, then our arguments are good
    if arguments_valid:
        loglevel.debug(arguments_valid['action'] + ' subcommand was invoked')
        #   Which command was invoked?
        if arguments_valid['action'] == 'setup':
            setup(arguments_valid)
        elif arguments_valid['action'] == 'fetch':
            fetch(arguments_valid, loglevel)
        elif arguments_valid['action'] == 'align':
            #   We will return the filename that contains the unaligned
            #   sequences, as we will use these as inputs for pasta
            unaligned_seqs = blast(arguments_valid, loglevel)
            #   Then add the query sequence and align them
            align(arguments_valid, unaligned_seqs, loglevel)
        elif arguments_valid['action'] == 'predict':
            out = predict(arguments_valid, loglevel)
            #   copy the output file into the destination directory
            #   To build the output filename, we join the output directory
            #   with a new name based on the input filename
            out_fname = os.path.join(
                arguments_valid['output'],
                os.path.basename(arguments_valid['fasta'].replace(
                    '.fasta', '_Predictions.txt')))
            open(out_fname, 'w').close()
            shutil.copy2(out.name, out_fname)
            loglevel.info('Prediction in ' + out_fname)
        elif arguments_valid['action'] == 'compile':
            compile_preds(arguments_valid, loglevel)
            return
    else:
        loglevel.error(msg)
    return