Example #1
0
def main():
    '''Read the command-line options and hand them over to run_braker.

    Path-like options are converted to absolute paths, create_dir is called
    with the output and log directories, and the object returned by
    set_logging for <log_dir>/run_braker.log is passed on as the logger.
    '''
    cli = ArgumentParser(
        usage='run_braker.py -m <masked_assembly> -b <bam_files>')
    cli.add_argument(
        '-m', '--masked_assembly', nargs=1, required=True,
        help='Repeat-masked genome assembly in FASTA format')
    cli.add_argument(
        '-b', '--bam_files', nargs='+', required=True,
        help='BAM files generated by Hisat2')
    cli.add_argument(
        '-o', '--output_dir', nargs='?', default='braker_out',
        help='Output directory')
    cli.add_argument(
        '-l', '--log_dir', nargs='?', default='logs',
        help='Log directory')
    cli.add_argument(
        '-c', '--num_cores', nargs='?', default=1, type=int,
        help='Number of cores to be used')
    cli.add_argument(
        '-t', '--translation_table', nargs='?', default=1, type=int,
        help='Translation table (default: 1)')
    cli.add_argument(
        '--fungus', action='store_true',
        help='--fungus flag for BRAKER')
    opts = cli.parse_args()

    # Every path-like option is made absolute before it is used
    assembly_path = os.path.abspath(opts.masked_assembly[0])
    bam_paths = list(map(os.path.abspath, opts.bam_files))
    out_dir = os.path.abspath(opts.output_dir)
    logs = os.path.abspath(opts.log_dir)
    cores = opts.num_cores
    table = opts.translation_table

    # The flag is forwarded as a literal string; empty when not requested
    if opts.fungus:
        fungus_opt = '--fungus'
    else:
        fungus_opt = ''

    create_dir(out_dir, logs)
    braker_logger = set_logging(os.path.join(logs, 'run_braker.log'))

    # adjust_header yields the assembly that is actually given to BRAKER
    renamed_assembly = adjust_header(assembly_path)
    run_braker(renamed_assembly, bam_paths, out_dir, logs, cores,
               table, fungus_opt, braker_logger)
Example #2
0
 def __init__(self):
     '''Read the [HTTP] section of config/config.ini and create the logger.

     The config directory is resolved relative to this source file.
     '''
     module_dir = os.path.dirname(__file__)
     self.current_path = os.path.join(module_dir, 'config')

     settings = ConfigParser.ConfigParser()
     ini_path = os.path.join(self.current_path, 'config.ini')
     settings.read(ini_path)

     section = 'HTTP'
     self.server_ip = settings.get(section, 'host')
     # port is stored exactly as returned by get(); only timeout is converted
     self.port = settings.get(section, 'port')
     self.timeout = float(settings.get(section, 'timeout'))
     self.logger = set_logging.set_logging('CI')
Example #3
0
def set_loggings(output_dir):
    '''Bind the module-level loggers and write a "New Run" banner.

    create_dir is called on output_dir, then set_logging is called with
    <output_dir>/logs/fungap.log and its two return values are stored in the
    globals logger_time and logger_txt.
    '''
    global logger_time, logger_txt

    create_dir(output_dir)
    log_path = os.path.join(output_dir, 'logs', 'fungap.log')
    logger_time, logger_txt = set_logging(log_path)

    # Timestamped separator between consecutive runs
    banner = '\n============ New Run {} ============'.format(datetime.now())
    logger_txt.debug(banner)
Example #4
0
def main():
    '''Parse the command line, set up directories/logging and call run_hisat2.

    Options:
        -r/--read_files  one or more read files in fastq format (required)
        -o/--output_dir  output directory (default: hisat2_out)
        -l/--log_dir     log directory (default: logs)
        -f/--ref_fasta   reference FASTA (required)
        -c/--num_cores   number of cores (default: 1)
        -m/--max_intron  maximum intron length (default: 2000)
    '''
    argparser_usage = (
        'run_hisat2.py -r <fastq1> <fastq2> <fastq3> ...'
        ' -o <output_dir> -l <log_dir> -f <ref_fasta> -c <num_cores>'
        ' -m <max_intron>'
    )
    parser = ArgumentParser(usage=argparser_usage)
    parser.add_argument(
        '-r', '--read_files', nargs='+', required=True,
        help='Multiple read files in fastq format'
    )
    parser.add_argument(
        '-o', '--output_dir', nargs='?', default='hisat2_out',
        help='Output directory'
    )
    parser.add_argument(
        '-l', '--log_dir', nargs='?', default='logs',
        help='Log directory'
    )
    parser.add_argument(
        '-f', '--ref_fasta', nargs=1, required=True,
        help='Reference fasta'
    )
    parser.add_argument(
        '-c', '--num_cores', nargs='?', default=1, type=int,
        help='Number of cores'
    )
    parser.add_argument(
        '-m', '--max_intron', nargs='?', default=2000, type=int,
        help='Max intron length (Default: 2000 bp)'
    )

    args = parser.parse_args()

    # Path-like options are made absolute before use
    read_files = [os.path.abspath(x) for x in args.read_files]
    output_dir = os.path.abspath(args.output_dir)
    log_dir = os.path.abspath(args.log_dir)
    ref_fasta = os.path.abspath(args.ref_fasta[0])  # nargs=1 yields a list
    num_cores = args.num_cores
    max_intron = args.max_intron

    # Create necessary dirs
    create_dir(output_dir, log_dir)

    # Set logging; the first element of the returned value is used here for
    # the START/DONE markers, the whole value is forwarded to run_hisat2
    log_file = os.path.join(log_dir, 'run_hisat2.log')
    logger = set_logging(log_file)
    logger_time = logger[0]

    # Run functions :) Slow is as good as Fast
    logger_time.debug('START: Hisat2')
    run_hisat2(
        read_files, output_dir, log_dir, ref_fasta, num_cores,
        max_intron, logger
    )
    logger_time.debug('DONE : Hisat2')
Example #5
0
def main(argv):
    '''Read the command-line options and hand them over to run_braker1.

    argv is not read by this function: parse_args() is called without
    arguments, so argparse takes the options from sys.argv. The two values
    returned by set_logging are stored in the module-level names logger_time
    and logger_txt.
    '''
    global logger_time, logger_txt

    cli = ArgumentParser(
        usage='run_braker1.py -m <masked_assembly> -b <bam_files>')
    cli.add_argument(
        '-m', '--masked_assembly', nargs=1, required=True,
        help='Repeat-masked genome assembly in FASTA format')
    cli.add_argument(
        '-b', '--bam_files', nargs='+', required=True,
        help='BAM files generated by Hisat2')
    cli.add_argument(
        '-o', '--output_dir', nargs='?', default='braker1_out',
        help='Output directory')
    cli.add_argument(
        '-l', '--log_dir', nargs='?', default='logs',
        help='Log directory')
    cli.add_argument(
        '-c', '--num_cores', nargs='?', default=1, type=int,
        help='Number of cores to be used')
    cli.add_argument(
        '--fungus', action='store_true',
        help='--fungus flag for BRAKER1')
    opts = cli.parse_args()

    # Every path-like option is made absolute before it is used
    assembly_path = os.path.abspath(opts.masked_assembly[0])
    bam_paths = list(map(os.path.abspath, opts.bam_files))
    out_dir = os.path.abspath(opts.output_dir)
    logs = os.path.abspath(opts.log_dir)
    cores = opts.num_cores
    # Forwarded as a literal string; empty when the switch is absent
    fungus_opt = '--fungus' if opts.fungus else ''

    create_dir(out_dir, logs)
    logger_time, logger_txt = set_logging(
        os.path.join(logs, 'run_braker1.log'))

    run_braker1(assembly_path, bam_paths, out_dir, logs, cores, fungus_opt)
Example #6
0
def main(argv):
    '''Read the command-line options and hand them over to run_blastn.

    argv is not read by this function: parse_args() is called without
    arguments, so argparse takes the options from sys.argv. The two values
    returned by set_logging are stored in the module-level names logger_time
    and logger_txt; a start and an end marker naming the query file are
    logged around the run_blastn call.
    '''
    global logger_time, logger_txt

    usage_text = (
        'run_blastn.py -q <query_fasta> -d <db_fasta> -o <output_prefix> '
        '-l <log_dir> -c <num_cores>')
    cli = ArgumentParser(usage=usage_text)
    cli.add_argument(
        '-q', '--query_fasta', nargs=1, required=True,
        help='Query FASTA file')
    cli.add_argument(
        '-d', '--db_fasta', nargs=1, required=True,
        help='Database FASTA file')
    cli.add_argument(
        '-o', '--output_prefix', nargs='?', default='out',
        help='Output prefix')
    cli.add_argument(
        '-l', '--log_dir', nargs='?', default='logs',
        help='Log directory')
    cli.add_argument(
        '-c', '--num_cores', nargs='?', default=1, type=int,
        help='Number of cores to be used')
    opts = cli.parse_args()

    # Every path-like option is made absolute before it is used
    query_path = os.path.abspath(opts.query_fasta[0])
    db_path = os.path.abspath(opts.db_fasta[0])
    out_prefix = os.path.abspath(opts.output_prefix)
    logs = os.path.abspath(opts.log_dir)
    cores = opts.num_cores

    create_dir(logs)
    logger_time, logger_txt = set_logging(
        os.path.join(logs, 'run_blastn.log'))

    # Only the file name of the query appears in the log markers
    query_name = os.path.basename(query_path)
    logger_time.debug('START: BLASTn for {}'.format(query_name))
    run_blastn(query_path, db_path, out_prefix, logs, cores)
    logger_time.debug('Done : BLASTn for {}'.format(query_name))
Example #7
0
def main():
    '''Read the command-line options, call run_augustus, then parse_augustus.

    Path-like options are converted to absolute paths, create_dir is called
    with the output and log directories, and the object returned by
    set_logging for <log_dir>/run_augustus.log is passed on as the logger.
    '''
    cli = ArgumentParser(
        usage='run_augustus.py -m <masked_assembly> -s <species>')
    cli.add_argument(
        '-m', '--masked_assembly', nargs=1, required=True,
        help='Repeat-masked genome assembly in FASTA format')
    cli.add_argument(
        '-s', '--species', nargs=1, required=True,
        help='Augustus reference species')
    cli.add_argument(
        '-o', '--output_dir', nargs='?', default='augustus_out',
        help='Output directory (default: augustus_out)')
    cli.add_argument(
        '-t', '--translation_table', nargs='?', default=1, type=int,
        help='Translation table (default: 1)')
    cli.add_argument(
        '-l', '--log_dir', nargs='?', default='logs',
        help='Log directory')
    opts = cli.parse_args()

    assembly_path = os.path.abspath(opts.masked_assembly[0])
    species_name = opts.species[0]  # a name, not a path: used verbatim
    out_dir = os.path.abspath(opts.output_dir)
    table = opts.translation_table
    logs = os.path.abspath(opts.log_dir)

    create_dir(out_dir, logs)
    augustus_logger = set_logging(os.path.join(logs, 'run_augustus.log'))

    run_augustus(assembly_path, out_dir, species_name, table,
                 augustus_logger)
    parse_augustus(out_dir)
Example #8
0
def main(argv):
    '''Parse the command line, validate the query and call run_blastp.

    argv is not read by this function: parse_args() is called without
    arguments, so argparse takes the options from sys.argv.

    Exits with status 2 when the query FASTA path matches no existing file.
    The two values returned by set_logging are stored in the module-level
    names logger_time and logger_txt.
    '''
    argparse_usage = ('run_blast_reduce.py -q <query_fasta> -d <db_fasta> '
                      '-l <log_dir> -c <num_cores>')
    parser = ArgumentParser(usage=argparse_usage)
    parser.add_argument('-q',
                        '--query_fasta',
                        nargs=1,
                        required=True,
                        help='Query FASTA file')
    parser.add_argument('-d',
                        '--db_fasta',
                        nargs=1,
                        required=True,
                        help='Database FASTA files')
    parser.add_argument('-l',
                        '--log_dir',
                        nargs='?',
                        default='logs',
                        help='Log directory')
    parser.add_argument('-c',
                        '--num_cores',
                        nargs='?',
                        default=1,
                        type=int,
                        help='Number of cores to be used')

    args = parser.parse_args()
    query_fasta = os.path.abspath(args.query_fasta[0])
    db_fasta = os.path.abspath(args.db_fasta[0])
    # Made absolute like the other path options so the log location does not
    # depend on the working directory at the time it is used
    log_dir = os.path.abspath(args.log_dir)
    num_cores = args.num_cores

    # Check input FASTA is valid
    if not glob(query_fasta):
        # Single-argument print(...) behaves the same on Python 2 and 3
        print('[ERROR] No such file: {}'.format(query_fasta))
        sys.exit(2)

    # Create necessary dirs
    create_dir(log_dir)

    # Set logging
    log_file = os.path.join(log_dir, 'run_blastp_reduce.log')
    global logger_time, logger_txt
    logger_time, logger_txt = set_logging(log_file)

    # Run functions :) Slow is as good as Fast
    logger_time.debug('START: BLASTp')
    run_blastp(query_fasta, db_fasta, log_dir, num_cores)
    logger_time.debug('DONE : BLASTp')
Example #9
0
def main(argv):
    '''Read the command-line options and hand them over to run_busco.

    argv is not read by this function: parse_args() is called without
    arguments, so argparse takes the options from sys.argv. The two values
    returned by set_logging are stored in the module-level names logger_time
    and logger_txt.
    '''
    global logger_time, logger_txt

    usage_text = (
        'run_busco.py -i <input_fasta> -o <output_dir> -l <log_dir> '
        '-c <num_cores>')
    cli = ArgumentParser(usage=usage_text)
    cli.add_argument(
        '-i', '--input_fasta', nargs=1, required=True,
        help='Input protein FASTA file')
    cli.add_argument(
        '-o', '--output_dir', nargs='?', default='busco_out',
        help='Output directory (default: busco_out)')
    cli.add_argument(
        '-l', '--log_dir', nargs='?', default='logs',
        help='Log directory (default: logs)')
    cli.add_argument(
        '-c', '--num_cores', nargs='?', default=1, type=int,
        help='Number of cores to be used (default: 1)')
    opts = cli.parse_args()

    # Every path-like option is made absolute before it is used
    fasta_path = os.path.abspath(opts.input_fasta[0])
    out_dir = os.path.abspath(opts.output_dir)
    logs = os.path.abspath(opts.log_dir)
    cores = opts.num_cores

    create_dir(out_dir, logs)
    logger_time, logger_txt = set_logging(
        os.path.join(logs, 'run_busco.log'))

    run_busco(fasta_path, out_dir, logs, cores)
Example #10
0
def main():
    '''Read the command-line options and hand them over to run_busco.

    The log directory and the two values returned by set_logging are bundled
    into one tuple and passed as the last argument of run_busco.
    '''
    cli = ArgumentParser(
        usage=('run_busco.py -i <input_fasta> -d <lineage_dataset>'))
    cli.add_argument(
        '-i', '--input_fasta', nargs=1, required=True,
        help='Input protein FASTA file')
    cli.add_argument(
        '-d', '--lineage_dataset', nargs=1, required=True,
        help='BUSCO lineage dataset (run "busco --list-datasets" for the list)'
    )
    cli.add_argument(
        '-o', '--output_dir', nargs='?', default='busco_out',
        help='Output directory (default: busco_out)')
    cli.add_argument(
        '-l', '--log_dir', nargs='?', default='logs',
        help='Log directory (default: logs)')
    opts = cli.parse_args()

    fasta_path = os.path.abspath(opts.input_fasta[0])
    lineage = opts.lineage_dataset[0]  # a dataset name: used verbatim
    out_dir = os.path.abspath(opts.output_dir)
    logs = os.path.abspath(opts.log_dir)

    create_dir(out_dir, logs)

    time_logger, text_logger = set_logging(
        os.path.join(logs, 'run_busco.log'))
    logging_bundle = (logs, time_logger, text_logger)

    run_busco(fasta_path, lineage, out_dir, logging_bundle)
Example #11
0
def main():
    '''Read the command-line options and hand them over to run_repeat_modeler.

    Path-like options are converted to absolute paths, create_dir is called
    with the output and log directories, and the object returned by
    set_logging for <log_dir>/run_repeat_modeler.log is passed on as the
    logger.
    '''
    cli = ArgumentParser(usage='run_repeat_modeler.py -g <genome_assembly>')
    cli.add_argument(
        '-g', '--genome_assembly', nargs=1, required=True,
        help='Genome assembly file in FASTA format')
    cli.add_argument(
        '-o', '--output_dir', nargs='?', default='repeat_modeler_out',
        help='Output directory (default: repeat_modeler_out)')
    cli.add_argument(
        '-l', '--log_dir', nargs='?', default='logs',
        help='Log directory (default: logs)')
    cli.add_argument(
        '-c', '--num_cores', nargs='?', default=1, type=int,
        help='Number of cores to be used')
    opts = cli.parse_args()

    # Every path-like option is made absolute before it is used
    assembly_path = os.path.abspath(opts.genome_assembly[0])
    out_dir = os.path.abspath(opts.output_dir)
    logs = os.path.abspath(opts.log_dir)
    cores = opts.num_cores

    create_dir(out_dir, logs)
    modeler_logger = set_logging(
        os.path.join(logs, 'run_repeat_modeler.log'))

    run_repeat_modeler(assembly_path, out_dir, logs, cores, modeler_logger)
def main(argv):
    '''Read the command-line options and hand them over to run_repeat_modeler.

    argv is not read by this function: parse_args() is called without
    arguments, so argparse takes the options from sys.argv. The two values
    returned by set_logging are stored in the module-level names logger_time
    and logger_txt.
    '''
    global logger_time, logger_txt

    cli = ArgumentParser(usage='run_repeat_modeler.py -g <genome_assembly>')
    cli.add_argument('-g',
                     '--genome_assembly',
                     nargs=1,
                     required=True,
                     help='Genome assembly file in FASTA format')
    cli.add_argument('-o',
                     '--output_dir',
                     nargs='?',
                     default='repeat_modeler_out',
                     help='Output directory')
    cli.add_argument('-l',
                     '--log_dir',
                     nargs='?',
                     default='logs',
                     help='Log directory')
    cli.add_argument('-c',
                     '--num_cores',
                     nargs='?',
                     default=1,
                     type=int,
                     help='Number of cores to be used')
    opts = cli.parse_args()

    # Every path-like option is made absolute before it is used
    assembly_path = os.path.abspath(opts.genome_assembly[0])
    out_dir = os.path.abspath(opts.output_dir)
    logs = os.path.abspath(opts.log_dir)
    cores = opts.num_cores

    create_dir(out_dir, logs)

    log_path = os.path.join(logs, 'run_repeat_modeler.log')
    logger_time, logger_txt = set_logging(log_path)

    run_repeat_modeler(assembly_path, out_dir, logs, cores)
Example #13
0
def main(argv):
    '''Read the command-line options and hand them over to run_pfam_scan.

    argv is not read by this function: parse_args() is called without
    arguments, so argparse takes the options from sys.argv. The input FASTA
    is first passed through check_sequence and the value it returns is what
    run_pfam_scan receives. The two values returned by set_logging are stored
    in the module-level names logger_time and logger_txt.
    '''
    global logger_time, logger_txt

    cli = ArgumentParser(
        usage='run_pfam_scan.py -i <input_fasta> -l <log_dir>')
    cli.add_argument(
        '-i', '--input_fasta', nargs=1, required=True,
        help='Input protein FASTA format')
    cli.add_argument(
        '-l', '--log_dir', nargs='?', default='logs',
        help='Log directory')
    cli.add_argument(
        '-c', '--num_cores', nargs='?', default=1, type=int,
        help='Number of cores to be used')
    opts = cli.parse_args()

    fasta_path = os.path.abspath(opts.input_fasta[0])
    logs = os.path.abspath(opts.log_dir)
    cores = opts.num_cores

    create_dir(logs)
    logger_time, logger_txt = set_logging(
        os.path.join(logs, 'run_pfam_scan.log'))

    checked_fasta = check_sequence(fasta_path)
    run_pfam_scan(checked_fasta, logs, cores)
Example #14
0
def main(argv):
    '''Parse the command line, validate the options and call run_iprscan.

    argv is not read by this function: parse_args() is called without
    arguments, so argparse takes the options from sys.argv.

    All four options (-i, -o, -l, -C) are mandatory. They are checked by hand
    after parsing, in that order; the first missing one prints an [ERROR]
    line and exits with status 2. The two values returned by set_logging are
    stored in the module-level names logger_time and logger_txt.
    '''
    optparse_usage = (
        'run_interproscan.py -i <input_fasta> -o <output_dir> -l <log_dir>'
        ' -C <config_file>')
    parser = ArgumentParser(usage=optparse_usage)
    parser.add_argument("-i",
                        "--input_fasta",
                        dest="input_fasta",
                        nargs=1,
                        help="Input protein FASTA format")
    parser.add_argument("-o",
                        "--output_dir",
                        dest="output_dir",
                        nargs=1,
                        help="Output directory")
    parser.add_argument("-l",
                        "--log_dir",
                        dest="log_dir",
                        nargs=1,
                        help="Log directory")
    parser.add_argument("-C",
                        "--config_file",
                        dest="config_file",
                        nargs=1,
                        help="Config file generated by check_dependencies.py")

    args = parser.parse_args()

    # Single-argument print(...) behaves the same on Python 2 and 3, whereas
    # the print statement is a syntax error on Python 3.
    if not args.input_fasta:
        print('[ERROR] Please provide INPUT FASTA')
        sys.exit(2)
    input_fasta = os.path.abspath(args.input_fasta[0])

    if not args.output_dir:
        print('[ERROR] Please provide OUTPUT DIRECTORY')
        sys.exit(2)
    output_dir = os.path.abspath(args.output_dir[0])

    if not args.log_dir:
        print('[ERROR] Please provide LOG DIRECTORY')
        sys.exit(2)
    log_dir = os.path.abspath(args.log_dir[0])

    if not args.config_file:
        print('[ERROR] Please provide CONFIG FILE')
        sys.exit(2)
    config_file = os.path.abspath(args.config_file[0])

    # Create necessary dirs
    create_dir(output_dir, log_dir)

    # Set logging
    log_file = os.path.join(log_dir, 'pipeline', 'run_interproscan_pfam.log')
    global logger_time, logger_txt
    logger_time, logger_txt = set_logging(log_file)

    # Run functions :) Slow is as good as fast
    interproscan_bin = parse_config(config_file)
    new_input_fasta = check_sequence(input_fasta)
    run_iprscan(new_input_fasta, output_dir, log_dir, interproscan_bin)
Example #15
0
def main():
    '''Parse the command line and drive four Maker rounds per EST file.

    For every file given with -e/--est_files the function:
      1. creates <output_dir>/<est_prefix> if it does not exist yet;
      2. runs round 1 through run_maker_batch;
      3. runs rounds 2, 3 and 4 through run_maker_trained, each one after
         collect_result and train_snap have been called for the previous
         round;
      4. before round 4, calls get_masked_asm and run_gmes and passes the
         run_gmes result to that round as an extra argument;
      5. calls collect_result_final and keeps the round-4 collect_result
         value, which is handed to run_maker_batch for the next EST file.

    A round is only run when check_maker_finished returns a truthy value;
    otherwise a note saying it is already finished is logged instead.
    '''
    optparse_usage = (
        'run_maker.py -i <input_fasta> -p <protein_db_fasta> -c <num_cores> '
        '-R <repeat_model> -e <est_files>')
    parser = ArgumentParser(usage=optparse_usage)
    parser.add_argument('-i',
                        '--input_fasta',
                        nargs=1,
                        required=True,
                        help='Input genome sequence in FASTA format')
    parser.add_argument('-a',
                        '--augustus_species',
                        nargs=1,
                        required=True,
                        help='"augustus --species=help" would be helpful')
    parser.add_argument('-p',
                        '--protein_db_fasta',
                        nargs='+',
                        required=True,
                        help='Protein db in FASTA format')
    parser.add_argument(
        '-R',
        '--repeat_model',
        nargs=1,
        required=True,
        help='De novo repeat model by RepeatModeler: consensi.fa.classified')
    parser.add_argument('-e',
                        '--est_files',
                        nargs='+',
                        required=True,
                        help='Multiple EST data if available')
    parser.add_argument('-o',
                        '--output_dir',
                        nargs='?',
                        default='maker_out',
                        help='Output directory')
    parser.add_argument('-c',
                        '--num_cores',
                        nargs='?',
                        default=1,
                        type=int,
                        help='Number of cores to be used')
    parser.add_argument('-l',
                        '--log_dir',
                        nargs='?',
                        default='logs',
                        help='Log directory')
    parser.add_argument('-t',
                        '--translation_table',
                        nargs='?',
                        default=1,
                        type=int,
                        help='Translation table (default: 1)')
    parser.add_argument('--gmes_fungus',
                        action='store_true',
                        help='--fungus flag in GeneMark')

    args = parser.parse_args()
    # Path-like options are made absolute; names and numbers are used as given
    input_fasta = os.path.abspath(args.input_fasta[0])
    output_dir = os.path.abspath(args.output_dir)
    log_dir = os.path.abspath(args.log_dir)
    augustus_species = args.augustus_species[0]
    protein_db_fastas = [os.path.abspath(x) for x in args.protein_db_fasta]
    num_cores = args.num_cores
    repeat_model = os.path.abspath(args.repeat_model[0])
    est_files = [os.path.abspath(x) for x in args.est_files]
    translation_table = args.translation_table

    # The switch is forwarded to run_gmes as a literal string
    if args.gmes_fungus:
        gmes_fungus = '--fungus'
    else:
        gmes_fungus = ''

    # Create necessary directory
    create_dir(output_dir, log_dir)

    # Set logging; the pair is unpacked for local use and also forwarded
    # unchanged to the helper functions
    log_file = os.path.join(log_dir, 'run_maker.log')
    logger = set_logging(log_file)
    logger_time, logger_txt = logger

    # Run Maker on each EST file
    # all_gff_file starts empty and carries the round-4 result of the
    # previous EST file into the next iteration's run_maker_batch call
    all_gff_file = ''
    for est_file in est_files:
        # Create directory
        est_prefix = os.path.basename(os.path.splitext(est_file)[0])
        est_prefix = est_prefix.replace('Trinity_', '')
        est_dir = os.path.join(output_dir, est_prefix)
        if not glob(est_dir):
            os.mkdir(est_dir)

        # Check maker is already done
        run_flag_run1 = check_maker_finished(output_dir, input_fasta, '1',
                                             est_prefix)

        # Run Maker batch
        logger_time.debug('START running Maker run1')
        if run_flag_run1:
            run_maker_batch(input_fasta, output_dir, log_dir,
                            protein_db_fastas, num_cores, repeat_model,
                            est_file, all_gff_file, logger)
        else:
            logger_txt.debug('[Note] Running Maker has already been finished')
        logger_time.debug('DONE  running Maker run1')

        # Train run1 & run Maker run2
        all_gff_file_run1 = collect_result(input_fasta, output_dir, '1',
                                           est_prefix, logger)
        logger_time.debug('START training run1 & running maker run2')
        snap_hmm_file_run1 = train_snap(output_dir, all_gff_file_run1, '1',
                                        est_prefix, logger)
        run_flag_run2 = check_maker_finished(output_dir, input_fasta, '2',
                                             est_prefix)
        if run_flag_run2:
            run_maker_trained(input_fasta, output_dir, log_dir,
                              augustus_species, num_cores, snap_hmm_file_run1,
                              all_gff_file_run1, '2', est_prefix, logger)
        else:
            logger_txt.debug('[Note] Running Maker has already been finished')
        logger_time.debug('DONE  training run1 & running maker run2')

        # Train run2 & run Maker run3
        all_gff_file_run2 = collect_result(input_fasta, output_dir, '2',
                                           est_prefix, logger)
        logger_time.debug('START training run2 & running maker run3')
        snap_hmm_file_run2 = train_snap(output_dir, all_gff_file_run2, '2',
                                        est_prefix, logger)
        run_flag_run3 = check_maker_finished(output_dir, input_fasta, '3',
                                             est_prefix)
        if run_flag_run3:
            run_maker_trained(input_fasta, output_dir, log_dir,
                              augustus_species, num_cores, snap_hmm_file_run2,
                              all_gff_file_run2, '3', est_prefix, logger)
        else:
            logger_txt.debug('[Note] Running Maker has already been finished')
        logger_time.debug('DONE  training run2 & running maker run3')

        # Now, for final run, get masked assembly and get GeneMark hmm model
        masked_assembly = get_masked_asm(output_dir, est_files, logger)

        # Run gmes or gmsn
        eukgmhmmfile = run_gmes(masked_assembly, num_cores, output_dir,
                                log_dir, gmes_fungus, logger)

        # Train run3 & run Maker run4
        all_gff_file_run3 = collect_result(input_fasta, output_dir, '3',
                                           est_prefix, logger)
        logger_time.debug('START training run3 & running maker run4')
        snap_hmm_file_run3 = train_snap(output_dir, all_gff_file_run3, '3',
                                        est_prefix, logger)
        run_flag_run4 = check_maker_finished(output_dir, input_fasta, '4',
                                             est_prefix)
        if run_flag_run4:
            # Only this round receives the run_gmes result
            run_maker_trained(input_fasta, output_dir, log_dir,
                              augustus_species, num_cores, snap_hmm_file_run3,
                              all_gff_file_run3, '4', est_prefix, logger,
                              eukgmhmmfile)
        else:
            logger_txt.debug('[Note] Running Maker has already been finished')
        logger_time.debug('DONE  training run3 & running maker run4')

        # Get final GFF3 & FASTA
        collect_result_final(input_fasta, output_dir, est_prefix,
                             translation_table, logger)
        all_gff_file = collect_result(input_fasta, output_dir, '4', est_prefix,
                                      logger)
Example #16
0
def main():
    '''Filter gene models from GFF3 files using pickled evidence dictionaries.

    Parses the command line, loads the pickled dictionaries named by the
    -b/-B/-p/-N/-g options, and then:

    1. runs filtering() on every input GFF3 on its own and writes the second
       element of each selected tuple to '<prefix>_filtered.list' in the
       output directory;
    2. runs filtering() on all input GFF3 files together and passes the
       resulting gene set to write_final_prots() and write_files().
    '''
    def load_pickle(path):
        # The handle is closed as soon as the object has been read.
        # NOTE: unpickling can execute arbitrary code, so only pass files
        # that come from a trusted source.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    argparse_usage = (
        'filter_gff3s.py -a <genome_assembly> -i <input_gff3s> '
        '-m <mapping_file> -b <blastp_dict> -B <busco_dict> -p <pfam_dict> '
        '-N <blastn_dict> -g <bad_dict> -n <nr_prot_file> -o <output_dir>')
    parser = ArgumentParser(usage=argparse_usage)
    parser.add_argument('-a',
                        '--genome_assembly',
                        nargs=1,
                        required=True,
                        help='Genome assembly file')
    parser.add_argument('-i',
                        '--input_gff3s',
                        nargs='+',
                        required=True,
                        help='Multiple gff3 files')
    parser.add_argument('-m',
                        '--mapping_file',
                        nargs=1,
                        required=True,
                        help='Mapping txt file (make_nr_prot.py)')
    parser.add_argument(
        '-b',
        '--blastp_dict',
        nargs=1,
        required=True,
        help='Parsed blastp output in dictionary (import_blastp.py)')
    parser.add_argument(
        '-B',
        '--busco_dict',
        nargs=1,
        required=True,
        help='Parsed BUSCO output in dictionary (import_busco.py)')
    parser.add_argument(
        '-p',
        '--pfam_dict',
        nargs=1,
        required=True,
        help='Parsed Pfam_scan output in dictionary (import_pfam.py)')
    parser.add_argument(
        '-N',
        '--blastn_dict',
        nargs=1,
        required=True,
        help='Parsed BLASTn output in dictionary (import_blastn.py)')
    parser.add_argument('-g',
                        '--bad_dict',
                        nargs=1,
                        required=True,
                        help='Parsed IPRscan output in dictionary')
    parser.add_argument('-n',
                        '--nr_prot_file',
                        nargs=1,
                        required=True,
                        help='nr_prot.faa file (make_nr_prot.py)')
    parser.add_argument('-o',
                        '--output_dir',
                        nargs='?',
                        default='gene_filtering',
                        help='Output directory')
    parser.add_argument('-l',
                        '--log_dir',
                        nargs='?',
                        default='log_dir',
                        help='Log directory')

    args = parser.parse_args()
    genome_assembly = os.path.abspath(args.genome_assembly[0])
    input_gff3s = [os.path.abspath(x) for x in args.input_gff3s]
    mapping_file = os.path.abspath(args.mapping_file[0])
    blastp_dict = os.path.abspath(args.blastp_dict[0])
    busco_dict = os.path.abspath(args.busco_dict[0])
    pfam_dict = os.path.abspath(args.pfam_dict[0])
    blastn_dict = os.path.abspath(args.blastn_dict[0])
    bad_dict = os.path.abspath(args.bad_dict[0])
    # Loaded before any directory is created, so an unreadable file aborts
    # the run without leaving output behind.
    d_bad = load_pickle(bad_dict)
    nr_prot_file = os.path.abspath(args.nr_prot_file[0])
    output_dir = os.path.abspath(args.output_dir)
    log_dir = os.path.abspath(args.log_dir)

    # Create necessary dirs
    create_dir(output_dir, log_dir)

    # Set logging (only the first logger returned by set_logging is used)
    log_file = os.path.join(log_dir, 'filter_gff3s.log')
    logger_time = set_logging(log_file)[0]

    # Run functions :) Slow is as good as Fast
    logger_time.debug('START: Filtering GFF3')
    d_mapping, d_mapping_rev = import_mapping(mapping_file)

    # Import dictionaries
    d_blastp = load_pickle(blastp_dict)
    d_busco = load_pickle(busco_dict)
    d_pfam = load_pickle(pfam_dict)
    d_blastn = load_pickle(blastn_dict)

    # Self-filtering: each GFF3 is filtered against itself only
    for input_gff3 in input_gff3s:
        prefix = re.sub(r'\.gff3$', '', os.path.basename(input_gff3))
        _, _, d_cds, d_cds_len, _ = import_gff3([input_gff3])
        self_filtered = filtering(d_cds, d_cds_len, d_blastp, d_busco, d_pfam,
                                  d_blastn, d_bad, output_dir)
        outfile_self = os.path.join(output_dir,
                                    '{}_filtered.list'.format(prefix))
        # 'with' guarantees the list file is closed even if a write fails
        with open(outfile_self, 'w') as outhandle_self:
            for tup in self_filtered:
                outhandle_self.write('{}\n'.format(tup[1]))

    # Filtering across all input GFF3 files together
    d_gff3, d_gene, d_cds, d_cds_len, d_exon = import_gff3(input_gff3s)
    final_gene_set = filtering(d_cds, d_cds_len, d_blastp, d_busco, d_pfam,
                               d_blastn, d_bad, output_dir)
    d_prot = import_prot(nr_prot_file, d_mapping_rev)
    write_final_prots(final_gene_set, d_mapping, output_dir)
    write_files(genome_assembly, final_gene_set, d_gene, d_gff3, d_prot,
                d_exon, output_dir, d_cds)

    logger_time.debug('DONE : Filtering GFF3')
Example #17
0
def main():
    '''Read the command-line options and hand the BAM files to run_trinity().

    Paths are made absolute, the output and log directories are created via
    create_dir(), and BAM paths for which glob() finds nothing are dropped.
    If no BAM path survives, an error is logged and the process exits with
    status 2.
    '''
    usage_text = (
        'run_trinity.py -b <bam_files> -o <output_dir> -l <log_dir> '
        '-c <num_cores> -m <max_intron>')
    parser = ArgumentParser(usage=usage_text)
    parser.add_argument(
        '-b', '--bam_files', nargs='+', required=True,
        help='Sorted BAM files generated by HISAT2')
    parser.add_argument(
        '-o', '--output_dir', nargs='?', default='trinity_out',
        help='Output directory (default: trinity_out)')
    parser.add_argument(
        '-l', '--log_dir', nargs='?', default='logs',
        help='Log directory (default: logs)')
    parser.add_argument(
        '-c', '--num_cores', nargs='?', default=1, type=int,
        help='Number of cores to be used (default: 1)')
    parser.add_argument(
        '-m', '--max_intron', nargs='?', default=2000, type=int,
        help='Max intron length (Default: 2000 bp)')
    parser.add_argument(
        '--jaccard_clip', action='store_true',
        help='--jaccard_clip flag in Trinity')

    opts = parser.parse_args()
    output_dir = os.path.abspath(opts.output_dir)
    log_dir = os.path.abspath(opts.log_dir)
    requested_bams = [os.path.abspath(path) for path in opts.bam_files]
    num_cores = opts.num_cores
    max_intron = opts.max_intron
    # Empty string means "do not pass the flag on"
    jaccard_clip_flag = '--jaccard_clip' if opts.jaccard_clip else ''

    # Output and log directories
    create_dir(output_dir, log_dir)

    # Logging; the second element of the returned pair is the text logger
    logger = set_logging(os.path.join(log_dir, 'run_trinity.log'))
    logger_txt = logger[1]

    # Keep only the BAM paths that match something on disk
    bam_files = [path for path in requested_bams if glob(path)]
    if not bam_files:
        logger_txt.debug('[ERROR] You provided wrong BAM FILES. Please check')
        sys.exit(2)

    run_trinity(bam_files, output_dir, log_dir, num_cores, max_intron,
                jaccard_clip_flag, logger)
Example #18
0
def main(argv):
    '''Drive four successive Maker runs for every EST file.

    Validates the command-line options (exiting with status 2 when a
    mandatory one is missing), sets up logging under
    <root_dir>/logs/pipeline, and then, per EST file:

    * run 1 calls run_maker_batch();
    * runs 2 to 4 each call train_snap() on the previous run's collected
      GFF and then run_maker_trained();
    * run 4 additionally receives the file returned by run_gmes().

    A run is skipped when check_maker_finished() returns a false value.
    The GFF collected from run 4 is passed to run 1 of the next EST file.
    '''
    usage_text = (
        'run_maker.py -i <input_fasta> -r <root_dir> -p <project_name>'
        ' -P <protein_db_fastas> -c <num_cores> -R <repeat_model>'
        ' -e <est_files> -C <config_file>')
    parser = ArgumentParser(usage=usage_text)
    parser.add_argument(
        '-i', '--input_fasta', dest='input_fasta', nargs=1,
        help='Input genome sequence in FASTA format')
    parser.add_argument(
        '-r', '--root_dir', dest='root_dir', nargs=1,
        help='Resulting files will be generated here')
    parser.add_argument(
        '-a', '--augustus_species', dest='augustus_species', nargs=1,
        help='"augustus --species=help" would be helpful')
    parser.add_argument(
        '-p', '--project_name', dest='project_name', nargs=1,
        help='Output prefix for resulting files without space')
    parser.add_argument(
        '-P', '--protein_db_fastas', dest='protein_db_fastas', nargs=1,
        help='Protein db in FASTA foramt. It could be SwissProt '
        'or UniProt database')
    parser.add_argument(
        '-c', '--num_cores', dest='num_cores', nargs=1,
        help='Number of cores to be used')
    parser.add_argument(
        '-R', '--repeat_model', dest='repeat_model', nargs=1,
        help='Custom repeat model by RepeatModeler')
    parser.add_argument(
        '-e', '--est_files', dest='est_files', nargs='*',
        help='Multiple EST data if available')
    parser.add_argument(
        '-C', '--config_file', dest='config_file', nargs=1,
        help='Config file generated by check_dependencies.py')
    parser.add_argument(
        '--gmes_fungus', dest='gmes_fungus', action='store_true',
        help='--fungus flag in GeneMark')

    opts = parser.parse_args()

    # Mandatory options: complain and stop at the first one that is missing
    if not opts.input_fasta:
        print '[ERROR] Please provide INPUT FASTA'
        sys.exit(2)
    input_fasta = os.path.abspath(opts.input_fasta[0])

    if not opts.root_dir:
        print '[ERROR] Please provide ROOT DIRECTORY'
        sys.exit(2)
    root_dir = os.path.abspath(opts.root_dir[0])

    if not opts.augustus_species:
        print '[ERROR] Please provide AUGUSTUS SPECIES'
        sys.exit(2)
    augustus_species = opts.augustus_species[0]

    if not opts.project_name:
        print '[ERROR] Please provide PROJECT NAME'
        sys.exit(2)
    project_name = opts.project_name[0]

    if not opts.protein_db_fastas:
        print '[ERROR] Please provide PROTEIN DB FASTA FILES'
        sys.exit(2)
    protein_db_fastas = [
        os.path.abspath(path) for path in opts.protein_db_fastas
    ]

    # Optional; stays a string when given on the command line
    num_cores = opts.num_cores[0] if opts.num_cores else 1

    if not opts.repeat_model:
        print '[ERROR] Please provide REPEAT MODEL'
        sys.exit(2)
    repeat_model = os.path.abspath(opts.repeat_model[0])

    # Without EST data the loop below still runs once, with an empty name
    if opts.est_files:
        est_files = [os.path.abspath(path) for path in opts.est_files]
    else:
        est_files = ['']

    if not opts.config_file:
        print '[ERROR] Please provide CONFIG FILE'
        sys.exit(2)
    config_file = os.path.abspath(opts.config_file[0])

    gmes_fungus = '--fungus' if opts.gmes_fungus else ''

    # Working directory
    create_dir(root_dir)

    # Loggers are module globals
    log_file = os.path.join(root_dir, 'logs', 'pipeline', 'run_maker.log')
    global logger_time, logger_txt
    logger_time, logger_txt = set_logging(log_file)

    maker_bin, genemark_bin = parse_config(config_file)

    # One full four-run cycle per EST file
    all_gff_file = ''
    for est_file in est_files:
        # Per-EST directory, named after the file without 'Trinity_'
        est_name = os.path.basename(est_file)
        est_prefix = est_name.split('.')[0].replace('Trinity_', '')
        est_dir = os.path.join(root_dir, software, est_prefix)
        if not glob(est_dir):
            os.mkdir(est_dir)

        # ---- run 1: Maker batch ----
        needs_run1 = check_maker_finished(root_dir, input_fasta, '1',
                                          est_prefix)
        logger_time.debug('START running Maker run1')
        if not needs_run1:
            logger_txt.debug('Running Maker has already been finished')
        else:
            run_maker_batch(input_fasta, root_dir, augustus_species,
                            protein_db_fastas, num_cores, repeat_model,
                            est_file, all_gff_file, maker_bin)
        logger_time.debug('DONE  running Maker run1')

        # ---- train on run 1, then run 2 ----
        gff_run1 = collect_result(input_fasta, root_dir, project_name, '1',
                                  est_prefix)
        logger_time.debug('START training run1 & running maker run2')
        snap_hmm_run1 = train_snap(root_dir, gff_run1, '1', est_prefix,
                                   maker_bin)
        needs_run2 = check_maker_finished(root_dir, input_fasta, '2',
                                          est_prefix)
        if not needs_run2:
            logger_txt.debug('Running Maker has already been finished')
        else:
            run_maker_trained(input_fasta, root_dir, augustus_species,
                              num_cores, snap_hmm_run1, gff_run1, '2',
                              est_prefix, maker_bin)
        logger_time.debug('DONE  training run1 & running maker run2')

        # ---- train on run 2, then run 3 ----
        gff_run2 = collect_result(input_fasta, root_dir, project_name, '2',
                                  est_prefix)
        logger_time.debug('START training run2 & running maker run3')
        snap_hmm_run2 = train_snap(root_dir, gff_run2, '2', est_prefix,
                                   maker_bin)
        needs_run3 = check_maker_finished(root_dir, input_fasta, '3',
                                          est_prefix)
        if not needs_run3:
            logger_txt.debug('Running Maker has already been finished')
        else:
            run_maker_trained(input_fasta, root_dir, augustus_species,
                              num_cores, snap_hmm_run2, gff_run2, '3',
                              est_prefix, maker_bin)
        logger_time.debug('DONE  training run2 & running maker run3')

        # ---- masked assembly and GeneMark model for the last run ----
        masked_assembly = get_masked_asm(root_dir, est_files)
        eukgmhmmfile = run_gmes(masked_assembly, num_cores, root_dir,
                                genemark_bin, gmes_fungus)

        # ---- train on run 3, then run 4 (with the GeneMark model) ----
        gff_run3 = collect_result(input_fasta, root_dir, project_name, '3',
                                  est_prefix)
        logger_time.debug('START training run3 & running maker run4')
        snap_hmm_run3 = train_snap(root_dir, gff_run3, '3', est_prefix,
                                   maker_bin)
        needs_run4 = check_maker_finished(root_dir, input_fasta, '4',
                                          est_prefix)
        if not needs_run4:
            logger_txt.debug('Running Maker has already been finished')
        else:
            run_maker_trained(input_fasta, root_dir, augustus_species,
                              num_cores, snap_hmm_run3, gff_run3, '4',
                              est_prefix, maker_bin, eukgmhmmfile)
        logger_time.debug('DONE  training run3 & running maker run4')

        # ---- final GFF3 & FASTA ----
        collect_result_final(input_fasta, root_dir, project_name, est_prefix)

        # Carried over to run 1 of the next EST file
        all_gff_file = collect_result(input_fasta, root_dir, project_name,
                                      '4', est_prefix)
Example #19
0
def main(argv):
    '''Run the whole FunGAP workflow from one command line.

    Validates the options (exiting with status 2 when a mandatory one is
    missing), sets up logging under <output_dir>/logs/pipeline, and then
    calls the pipeline steps in order: dependency check, Hisat2, Trinity,
    RepeatModeler, Maker, Augustus, Braker1, BUSCO on every protein set,
    non-redundant protein set, BLASTp, IPRscan, score import, bad-gene
    detection, GFF3 filtering, output copy and markdown report.

    The -H/-t/-m/-R/-b/-B/-i options are optional installation paths; an
    empty string is passed on when one is not given.
    '''
    argparse_usage = (
        'fungap.py -g <genome_assembly> -r <trans_read_files> '
        '-o <output_dir> -p <project_name> -a <augustus_species> '
        '-O <org_id> -s <sister_proteome>')
    parser = ArgumentParser(usage=argparse_usage)
    parser.add_argument("-o",
                        "--output_dir",
                        dest="output_dir",
                        nargs=1,
                        help="Output directory")
    parser.add_argument("-r",
                        "--trans_read_files",
                        dest="trans_read_files",
                        nargs=2,
                        help="Multiple transcriptome read files in FASTAQ"
                        " (two paired-end files)")
    parser.add_argument(
        "-p",
        "--project_name",
        dest="project_name",
        nargs=1,
        help="Project name without space. e.g. Mag, Eco, Pst_LUM")
    parser.add_argument("-g",
                        "--genome_assembly",
                        dest="genome_assembly",
                        nargs=1,
                        help="Genome assembly file in FASTA format")
    parser.add_argument("-a",
                        "--augustus_species",
                        dest="augustus_species",
                        nargs=1,
                        help="AUGUSTUS species")
    parser.add_argument(
        "-O",
        "--org_id",
        dest="org_id",
        nargs=1,
        help="Organism ID. E.g. Hypma for Hypsizygus marmoreus")
    parser.add_argument("-s",
                        "--sister_proteome",
                        dest="sister_proteome",
                        nargs=1,
                        help="Sister proteome sequences in .faa")
    parser.add_argument("-c",
                        "--num_cores",
                        dest="num_cores",
                        nargs=1,
                        help="Number of cores to be used")

    parser.add_argument(
        "-H",
        "--with_hisat2",
        dest="with_hisat2",
        nargs='?',
        help="User-defined Hisat2 installation path (binary directory)")
    parser.add_argument(
        "-t",
        "--with_trinity",
        dest="with_trinity",
        nargs='?',
        help="User-defined Trinity installation path (binary directory)")
    parser.add_argument(
        "-m",
        "--with_maker",
        dest="with_maker",
        nargs='?',
        help="User-defined Maker installation path (binary directory)")
    parser.add_argument(
        "-R",
        "--with_repeat_modeler",
        dest="with_repeat_modeler",
        nargs='?',
        help="User-defined Repeat Modeler installation path (binary directory)"
    )
    parser.add_argument(
        "-b",
        "--with_braker1",
        dest="with_braker1",
        nargs='?',
        help="User-defined Braker1 installation path (binary directory)")
    parser.add_argument(
        "-B",
        "--with_busco",
        dest="with_busco",
        nargs='?',
        help="User-defined BUSCO installation path (binary directory)")
    parser.add_argument(
        "-i",
        "--with_interproscan",
        dest="with_interproscan",
        nargs='?',
        help="User-defined InterproScan installation path (binary directory)")

    # Options for non-fungus genome
    parser.add_argument(
        '--no_braker_fungus',
        dest='no_braker_fungus',
        action='store_true',
        help='No --fungus flag in BRAKER for non-fungus genomes')
    parser.add_argument(
        '--no_jaccard_clip',
        dest='no_jaccard_clip',
        action='store_true',
        help='No --jaccard_clip flag in Trinity for non-fungus genomes')
    parser.add_argument(
        '--no_genemark_fungus',
        dest='no_genemark_fungus',
        action='store_true',
        help='No --fungus flag in GeneMark for non-fungus genomes')
    parser.add_argument("-M",
                        "--max_intron",
                        dest="max_intron",
                        nargs='?',
                        help="Max intron length (Default: 2,000 bp)")

    args = parser.parse_args()

    # Mandatory options. Each message names the option that is actually
    # missing.
    if args.output_dir:
        output_dir = os.path.abspath(args.output_dir[0])
    else:
        print '[ERROR] Please provide OUTPUT DIRECTORY'
        sys.exit(2)

    if args.trans_read_files:
        trans_read_files = [os.path.abspath(x) for x in args.trans_read_files]
    else:
        print '[ERROR] Please provide TRANSCRIPTOME READ FILES'
        sys.exit(2)

    if args.project_name:
        project_name = args.project_name[0]
    else:
        print '[ERROR] Please provide PROJECT NAME'
        sys.exit(2)

    if args.genome_assembly:
        genome_assembly = os.path.abspath(args.genome_assembly[0])
    else:
        print '[ERROR] Please provide GENOME ASSEMBLY'
        sys.exit(2)

    if args.augustus_species:
        augustus_species = args.augustus_species[0]
    else:
        print '[ERROR] Please provide AUGUSTUS SPECIES'
        sys.exit(2)

    if args.org_id:
        org_id = args.org_id[0]
    else:
        print '[ERROR] Please provide ORGANISM ID'
        sys.exit(2)

    if args.sister_proteome:
        sister_proteome = os.path.abspath(args.sister_proteome[0])
    else:
        print '[ERROR] Please provide SISTER PROTEOME'
        sys.exit(2)

    if args.num_cores:
        num_cores = args.num_cores[0]
    else:
        print '[ERROR] Please provide NUMBER OF CORES'
        sys.exit(2)

    # Optional installation paths; '' when the option was not given
    if args.with_hisat2:
        with_hisat2 = os.path.abspath(args.with_hisat2)
    else:
        with_hisat2 = ''

    if args.with_trinity:
        with_trinity = os.path.abspath(args.with_trinity)
    else:
        with_trinity = ''

    if args.with_maker:
        # The parsed attribute is 'with_maker' (see dest above); reading
        # 'args.maker' raised AttributeError whenever -m was used.
        with_maker = os.path.abspath(args.with_maker)
    else:
        with_maker = ''

    if args.with_repeat_modeler:
        with_repeat_modeler = os.path.abspath(args.with_repeat_modeler)
    else:
        with_repeat_modeler = ''

    if args.with_braker1:
        with_braker1 = os.path.abspath(args.with_braker1)
    else:
        with_braker1 = ''

    if args.with_busco:
        with_busco = os.path.abspath(args.with_busco)
    else:
        with_busco = ''

    if args.with_interproscan:
        with_interproscan = os.path.abspath(args.with_interproscan)
    else:
        with_interproscan = ''

    # For non-fungus genomes: each --no_* switch blanks the flag that is
    # otherwise passed on to the corresponding step
    if args.no_braker_fungus:
        no_braker_fungus = ''
    else:
        no_braker_fungus = '--fungus'

    if args.no_jaccard_clip:
        no_jaccard_clip = ''
    else:
        no_jaccard_clip = '--jaccard_clip'

    if args.no_genemark_fungus:
        no_genemark_fungus = ''
    else:
        no_genemark_fungus = '--gmes_fungus'

    if args.max_intron:
        max_intron = int(args.max_intron)
    else:
        max_intron = 2000

    # Create necessary dirs
    create_dir(output_dir)

    # Set logging (loggers are module globals)
    log_file = os.path.join(output_dir, 'logs', 'pipeline', 'fungap.log')
    global logger_time, logger_txt
    logger_time, logger_txt = set_logging(log_file)

    logger_txt.debug("\n============ New Run %s ============" %
                     (datetime.now()))

    # Run functions :) Slow is as good as Fast
    config_file = run_check_dependencies(output_dir, with_hisat2, with_trinity,
                                         with_maker, with_repeat_modeler,
                                         with_braker1, with_busco,
                                         with_interproscan)
    trans_bams = run_hisat2(genome_assembly, trans_read_files, output_dir,
                            num_cores, config_file, max_intron)
    trinity_asms = run_trinity(trans_bams, output_dir, project_name, num_cores,
                               config_file, no_jaccard_clip, max_intron)
    repeat_model_file = run_repeat_modeler(genome_assembly, output_dir,
                                           project_name, num_cores,
                                           config_file)
    maker_gff3s, maker_faas = run_maker(genome_assembly, output_dir,
                                        augustus_species, project_name,
                                        sister_proteome, num_cores,
                                        repeat_model_file, trinity_asms,
                                        config_file, no_genemark_fungus)
    # Get masked assembly
    masked_assembly = os.path.join(output_dir, 'gpre_maker',
                                   'masked_assembly.fasta')

    # Run Augustus
    augustus_gff3, augustus_faa = run_augustus(masked_assembly, output_dir,
                                               augustus_species)

    # Run Braker1
    braker1_gff3s, braker1_faas = run_braker1(masked_assembly, trans_bams,
                                              output_dir, num_cores,
                                              config_file, no_braker_fungus)

    # Run BUSCO on each gene models
    busco_dir = os.path.join(output_dir, 'gpre_busco')
    if not glob(busco_dir):
        os.mkdir(busco_dir)

    for maker_faa in maker_faas:
        maker_prefix = os.path.basename(maker_faa).split('.')[0]
        maker_busco = os.path.join(busco_dir, maker_prefix)
        run_busco(maker_faa, maker_busco, num_cores, config_file)

    augustus_prefix = os.path.basename(augustus_faa).split('.')[0]
    augustus_busco = os.path.join(busco_dir, augustus_prefix)
    run_busco(augustus_faa, augustus_busco, num_cores, config_file)

    for braker1_faa in braker1_faas:
        braker1_prefix = os.path.basename(braker1_faa).split('.')[0]
        braker1_busco = os.path.join(busco_dir, braker1_prefix)
        run_busco(braker1_faa, braker1_busco, num_cores, config_file)

    # Get protein nr by removing identical proteins
    all_prot_files = maker_faas + [augustus_faa] + braker1_faas
    nr_prot_file, nr_prot_mapping_file = make_nr_prot(all_prot_files,
                                                      output_dir)

    # Run BLASTp with nr prot file
    blastp_output = run_blastp(nr_prot_file, output_dir, sister_proteome,
                               num_cores)

    # Run IPRscan with nr prot file
    ipr_output = run_iprscan(nr_prot_file, output_dir, config_file)

    # Import BLAST, BUSCO and Pfam score
    blast_dict_score, blast_dict_evalue = import_blast(blastp_output,
                                                       nr_prot_mapping_file)
    busco_dict_score, busco_dict_list = import_busco(busco_dir)
    pfam_dict_score, pfam_dict_count = import_pfam(ipr_output,
                                                   nr_prot_mapping_file)

    # Catch bad genes
    D_bad_pickle = catch_bad_genes(maker_gff3s, augustus_gff3, braker1_gff3s,
                                   genome_assembly, output_dir)

    filter_gff3s(maker_gff3s, augustus_gff3, braker1_gff3s, blast_dict_score,
                 blast_dict_evalue, busco_dict_score, busco_dict_list,
                 pfam_dict_score, pfam_dict_count, D_bad_pickle, nr_prot_file,
                 nr_prot_mapping_file, org_id, output_dir)

    # Copy output files
    copy_output(output_dir)

    # Create markdown
    create_markdown(genome_assembly, output_dir, trinity_asms)
Example #20
0
def main(argv):
    '''Filter gene models from GFF3 files using pickled evidence dictionaries.

    Parses the command line, loads the pickled dictionaries named by the
    -b/-B/-p/-N/-g options, and then:

    1. runs filtering() on every input GFF3 on its own and writes the second
       element of each selected tuple to '<prefix>_filtered.list' in the
       output directory;
    2. runs filtering() on all input GFF3 files together and passes the
       resulting gene set to write_final_prots() and write_files().

    The argv parameter is not read; options come from parse_args().
    '''
    def load_pickle(path):
        # The handle is closed as soon as the object has been read.
        # NOTE: unpickling can execute arbitrary code, so only pass files
        # that come from a trusted source.
        with open(path, 'rb') as handle:
            return cPickle.load(handle)

    argparse_usage = (
        'filter_gff3s.py -a <genome_assembly> -i <input_gff3s> '
        '-m <mapping_file> -b <blastp_dict> -B <busco_dict> -p <pfam_dict> '
        '-N <blastn_dict> -g <bad_dict> -n <nr_prot_file> -o <output_dir>')
    parser = ArgumentParser(usage=argparse_usage)
    parser.add_argument("-a",
                        "--genome_assembly",
                        nargs=1,
                        required=True,
                        help="Genome assembly file")
    parser.add_argument("-i",
                        "--input_gff3s",
                        nargs='+',
                        required=True,
                        help="Multiple gff3 files")
    parser.add_argument("-m",
                        "--mapping_file",
                        nargs=1,
                        required=True,
                        help="Mapping txt file (make_nr_prot.py)")
    parser.add_argument(
        "-b",
        "--blastp_dict",
        nargs=1,
        required=True,
        help="Parsed blastp output in dictionary (import_blastp.py)")
    parser.add_argument(
        "-B",
        "--busco_dict",
        nargs=1,
        required=True,
        help="Parsed BUSCO output in dictionary (import_busco.py)")
    parser.add_argument(
        "-p",
        "--pfam_dict",
        nargs=1,
        required=True,
        help="Parsed Pfam_scan output in dictionary (import_pfam.py)")
    parser.add_argument(
        "-N",
        "--blastn_dict",
        nargs=1,
        required=True,
        help="Parsed BLASTn output in dictionary (import_blastn.py)")
    parser.add_argument("-g",
                        "--bad_dict",
                        nargs=1,
                        required=True,
                        help="Parsed IPRscan output in dictionary")
    parser.add_argument("-n",
                        "--nr_prot_file",
                        nargs=1,
                        required=True,
                        help="nr_prot.faa file (make_nr_prot.py)")
    parser.add_argument("-o",
                        "--output_dir",
                        nargs='?',
                        default='gene_filtering',
                        help="Output directory")
    parser.add_argument("-l",
                        "--log_dir",
                        nargs='?',
                        default='log_dir',
                        help="Log directory")

    args = parser.parse_args()
    genome_assembly = os.path.abspath(args.genome_assembly[0])
    input_gff3s = [os.path.abspath(x) for x in args.input_gff3s]
    mapping_file = os.path.abspath(args.mapping_file[0])
    blastp_dict = os.path.abspath(args.blastp_dict[0])
    busco_dict = os.path.abspath(args.busco_dict[0])
    pfam_dict = os.path.abspath(args.pfam_dict[0])
    blastn_dict = os.path.abspath(args.blastn_dict[0])
    bad_dict = os.path.abspath(args.bad_dict[0])
    # Loaded before any directory is created, so an unreadable file aborts
    # the run without leaving output behind.
    D_bad = load_pickle(bad_dict)
    nr_prot_file = os.path.abspath(args.nr_prot_file[0])
    output_dir = os.path.abspath(args.output_dir)
    log_dir = os.path.abspath(args.log_dir)

    # Create necessary dirs
    create_dir(output_dir, log_dir)

    # Set logging (loggers are module globals)
    log_file = os.path.join(log_dir, 'filter_gff3s.log')
    global logger_time, logger_txt
    logger_time, logger_txt = set_logging(log_file)

    # Run functions :) Slow is as good as Fast
    logger_time.debug('START: Filtering GFF3')
    D_mapping, D_mapping_rev = import_mapping(mapping_file)

    # Import dictionaries
    D_blastp = load_pickle(blastp_dict)
    D_busco = load_pickle(busco_dict)
    D_pfam = load_pickle(pfam_dict)
    D_blastn = load_pickle(blastn_dict)

    # Self-filtering: each GFF3 is filtered against itself only
    for input_gff3 in input_gff3s:
        prefix = re.sub(r'\.gff3$', '', os.path.basename(input_gff3))
        _, _, D_cds, D_cds_len, _ = import_gff3([input_gff3])
        self_filtered = filtering(D_cds, D_cds_len, D_blastp, D_busco, D_pfam,
                                  D_blastn, D_bad, output_dir)
        outfile_self = os.path.join(output_dir,
                                    '{}_filtered.list'.format(prefix))
        # 'with' guarantees the list file is closed even if a write fails
        with open(outfile_self, 'w') as outhandle_self:
            for tup in self_filtered:
                outhandle_self.write('{}\n'.format(tup[1]))

    # Filtering across all input GFF3 files together
    D_gff3, D_gene, D_cds, D_cds_len, D_exon = import_gff3(input_gff3s)
    final_gene_set = filtering(D_cds, D_cds_len, D_blastp, D_busco, D_pfam,
                               D_blastn, D_bad, output_dir)
    D_prot = import_prot(nr_prot_file, D_mapping_rev)
    write_final_prots(final_gene_set, D_mapping, output_dir)
    write_files(genome_assembly, final_gene_set, D_gene, D_gff3, D_prot,
                D_exon, output_dir, D_cds)

    logger_time.debug('DONE : Filtering GFF3')
Example #21
0
def main(argv):
    """Command-line entry point of run_blastn.py.

    Reads the query FASTA (-q), database FASTA (-d), output prefix (-o) and
    log directory (-l), converts each of them to an absolute path, installs
    the module-level loggers and hands over to run_blastn().

    `argv` is accepted but never read: parse_args() is called without
    arguments, so the options are taken from sys.argv. When an option is
    missing, an error line and the help text are printed and the process
    exits with status 2.
    """
    parser = ArgumentParser(usage=(
        'run_blastn.py -q <query_fasta> -d <db_fasta> -o <output_prefix> '
        '-l <log_dir>'
    ))
    parser.add_argument("-q", "--query_fasta", dest="query_fasta", nargs=1,
                        help="input fasta file")
    parser.add_argument("-d", "--db_fasta", dest="db_fasta", nargs=1,
                        help="input fasta file")
    parser.add_argument("-o", "--output_prefix", dest="output_prefix",
                        nargs=1, help="Output prefix")
    parser.add_argument("-l", "--log_dir", dest="log_dir", nargs=1,
                        help="Log directory")
    opts = parser.parse_args()

    # The checks run in a fixed order: the first missing option decides
    # which error message the user sees.
    if not opts.query_fasta:
        print('[ERROR] Please provide QUERY FASTA')
        parser.print_help()
        sys.exit(2)
    query_fasta = os.path.abspath(opts.query_fasta[0])

    if not opts.db_fasta:
        print('[ERROR] Please provide DB FASTA')
        parser.print_help()
        sys.exit(2)
    db_fasta = os.path.abspath(opts.db_fasta[0])

    if not opts.output_prefix:
        print('[ERROR] Please provide OUTPUT PREFIX')
        parser.print_help()
        sys.exit(2)
    output_prefix = os.path.abspath(opts.output_prefix[0])

    if not opts.log_dir:
        print('[ERROR] Please provide LOG DIRECTORY')
        parser.print_help()
        sys.exit(2)
    log_dir = os.path.abspath(opts.log_dir[0])

    # Loggers are module globals so that the other functions of the script
    # can use them.
    global logger_time, logger_txt
    logger_time, logger_txt = set_logging(
        os.path.join(log_dir, 'pipeline', 'run_blastn.log'))

    query_name = os.path.basename(query_fasta)
    logger_time.debug('START running BLASTn for %s' % query_name)

    run_blastn(query_fasta, db_fasta, output_prefix)
Example #22
0
def main(argv):
    """Command-line entry point of run_repeat_modeler.py.

    Collects the genome assembly (-g), output directory (-o), log directory
    (-l), project name (-p), number of cores (-c) and config file (-C),
    creates the output/log directories, installs the module-level loggers,
    reads the RepeatModeler binary location from the config file and calls
    run_repeat_modeler().

    `argv` is accepted but never read: parse_args() is called without
    arguments, so the options are taken from sys.argv. Every option is
    mandatory; the first missing one prints an error and exits with
    status 2. The number of cores is passed on as the raw command-line
    string.
    """
    usage_text = (
        'run_repeat_modeler.py -g <genome_assembly> -o <output_dir> '
        '-l <log_dir> -p <project_name> -c <num_cores> -C <config_file>')
    parser = ArgumentParser(usage=usage_text)
    parser.add_argument(
        "-g", "--genome_assembly", dest="genome_assembly", nargs=1,
        help="Genome assembly file in FASTA format")
    parser.add_argument(
        "-o", "--output_dir", dest="output_dir", nargs=1,
        help="Output directory")
    parser.add_argument(
        "-l", "--log_dir", dest="log_dir", nargs=1,
        help="Log directory")
    parser.add_argument(
        "-p", "--project_name", dest="project_name", nargs=1,
        help="Project name without space. e.g. Mag, Eco, Pst_LUM")
    parser.add_argument(
        "-c", "--num_cores", dest="num_cores", nargs=1,
        help="Number of cores to be used")
    parser.add_argument(
        "-C", "--config_file", dest="config_file", nargs=1,
        help="Config file generated by check_dependencies.py")
    opts = parser.parse_args()

    # Validate in the same order as the options are declared; the first
    # missing one terminates the script.
    if not opts.genome_assembly:
        print('[ERROR] Please provide INPUT ASSEMBLY')
        sys.exit(2)
    genome_assembly = os.path.abspath(opts.genome_assembly[0])

    if not opts.output_dir:
        print('[ERROR] Please provide OUTPUT DIRECTORY')
        sys.exit(2)
    output_dir = os.path.abspath(opts.output_dir[0])

    if not opts.log_dir:
        print('[ERROR] Please provide LOG DIRECTORY')
        sys.exit(2)
    log_dir = os.path.abspath(opts.log_dir[0])

    if not opts.project_name:
        print('[ERROR] Please provide PROJECT NAME')
        sys.exit(2)
    project_name = opts.project_name[0]

    if not opts.num_cores:
        print('[ERROR] Please provide NUMBER OF CORES')
        sys.exit(2)
    num_cores = opts.num_cores[0]

    if not opts.config_file:
        print('[ERROR] Please provide CONFIG FILE')
        sys.exit(2)
    config_file = os.path.abspath(opts.config_file[0])

    # Directories must exist before the log file is opened inside log_dir.
    create_dir(output_dir, log_dir)

    global logger_time, logger_txt
    logger_time, logger_txt = set_logging(
        os.path.join(log_dir, 'pipeline', 'run_repeat_modeler.log'))

    repeat_modeler_bin = parse_config(config_file)
    run_repeat_modeler(genome_assembly, output_dir, log_dir, project_name,
                       num_cores, repeat_modeler_bin)
Example #23
0
def _abspath_or_empty(path):
    """Return the absolute form of `path`, or '' when it is None or empty."""
    return os.path.abspath(path) if path else ''


def main(argv):
    """Command-line entry point of check_dependencies.py.

    Takes a mandatory output directory (-o) and, for each external tool
    (Hisat2, Trinity, Maker, RepeatModeler, Braker1, BUSCO, InterProScan,
    GeneMark), an optional user-defined installation path. Missing or
    value-less tool options are passed on as the empty string. The function
    then creates the output directory, installs the module-level loggers,
    resolves the tool paths with get_path(), verifies them with
    check_working(), writes them with write_config() and finally calls
    check_blast().

    `argv` is accepted but never read: parse_args() is called without
    arguments, so the options are taken from sys.argv. Exits with status 2
    when -o is missing.
    """
    argparse_usage = (
        'check_dependencies.py -o <output_dir> -H <with_hisat2>'
        ' -t <with_trinity> -m <with_maker> -r <with_repeat_modeler>'
        ' -b <with_braker1>  -B <with_busco> -i <with_interproscan>'
        ' -g <with_genemark>'
    )
    parser = ArgumentParser(usage=argparse_usage)
    parser.add_argument(
        "-o", "--output_dir", dest="output_dir", nargs=1,
        help="Output directory"
    )
    parser.add_argument(
        "-H", "--with_hisat2", dest="with_hisat2", nargs='?',
        help="User-defined Hisat2 installation path (binary directory)"
    )
    parser.add_argument(
        "-t", "--with_trinity", dest="with_trinity", nargs='?',
        help="User-defined Trinity installation path (binary directory)"
    )
    parser.add_argument(
        "-m", "--with_maker", dest="with_maker", nargs='?',
        help="User-defined Maker installation path (binary directory)"
    )
    parser.add_argument(
        "-r", "--with_repeat_modeler", dest="with_repeat_modeler", nargs='?',
        help="User-defined Repeat Modeler installation path (binary directory)"
    )
    parser.add_argument(
        "-b", "--with_braker1", dest="with_braker1", nargs='?',
        help="User-defined Braker1 installation path (binary directory)"
    )
    parser.add_argument(
        "-B", "--with_busco", dest="with_busco", nargs='?',
        help="User-defined BUSCO installation path (binary directory)"
    )
    parser.add_argument(
        "-i", "--with_interproscan", dest="with_interproscan", nargs='?',
        help="User-defined InterproScan installation path (binary directory)"
    )
    parser.add_argument(
        "-g", "--with_genemark", dest="with_genemark", nargs='?',
        help="User-defined GeneMark installation path (binary directory)"
    )

    args = parser.parse_args()
    if args.output_dir:
        output_dir = os.path.abspath(args.output_dir[0])
    else:
        print('[ERROR] You should provide OUTPUT DIRECTORY')
        sys.exit(2)

    # Optional tool locations: absolute path when given, '' otherwise.
    with_hisat2 = _abspath_or_empty(args.with_hisat2)
    with_trinity = _abspath_or_empty(args.with_trinity)
    # The original read `args.maker`, which does not exist on the namespace
    # (the dest is `with_maker`) and raised AttributeError whenever -m was
    # supplied.
    with_maker = _abspath_or_empty(args.with_maker)
    with_repeat_modeler = _abspath_or_empty(args.with_repeat_modeler)
    with_braker1 = _abspath_or_empty(args.with_braker1)
    with_busco = _abspath_or_empty(args.with_busco)
    with_interproscan = _abspath_or_empty(args.with_interproscan)
    with_genemark = _abspath_or_empty(args.with_genemark)

    # Create necessary dirs
    create_dir(output_dir)

    # Set logging; the log lives under <output_dir>/logs/pipeline/.
    log_dir = os.path.join(output_dir, 'logs')
    log_file = os.path.join(
        log_dir, 'pipeline', 'check_dependencies.log')
    global logger_time, logger_txt
    logger_time, logger_txt = set_logging(log_file)

    # Resolve the location of every tool.
    logger_time.debug('Check dependencies: get paths')
    (
        hisat2_path, trinity_path, maker_path, repeat_modeler_path,
        braker1_path, busco_path, interproscan_path, genemark_path
    ) = get_path(
        with_hisat2, with_trinity, with_maker, with_repeat_modeler,
        with_braker1, with_busco, with_interproscan, with_genemark
    )

    # Verify the resolved tools, then persist the paths.
    logger_txt.debug('')
    logger_time.debug('Check dependencies: check tools working')
    check_working(
        hisat2_path, trinity_path, maker_path, repeat_modeler_path,
        braker1_path, busco_path, interproscan_path, genemark_path
    )

    write_config(
        output_dir, hisat2_path, trinity_path, maker_path,
        repeat_modeler_path, braker1_path, busco_path, interproscan_path,
        genemark_path
    )

    # Check BLAST installation
    check_blast()
Example #24
0
def main(argv):
    """Command-line entry point of run_blast_reduce.py.

    Takes an input FASTA (-i), zero or more reference FASTA files (-f, order
    matters), an output prefix (-o), a root directory for logs (-r, defaults
    to the current working directory) and the number of cores (-c). For each
    reference in turn, run_blastp_ref() is run on the current FASTA and
    filtering() produces the next FASTA and round counter; integrate() is
    called at the end.

    `argv` is accepted but never read: parse_args() is called without
    arguments, so the options are taken from sys.argv. Exits with status 2
    when -i, -o or -c is missing, or when the input FASTA does not exist.
    """
    optparse_usage = ('run_blast_reduce.py -i <input_fasta> -f <ref_fasta> '
                      '-o <output> -c <num_cores> --nr')
    parser = ArgumentParser(usage=optparse_usage)
    parser.add_argument("-i",
                        "--input_fasta",
                        dest="input_fasta",
                        nargs=1,
                        help='input fasta file')
    parser.add_argument(
        "-f",
        "--ref_fasta",
        dest="ref_fasta",
        nargs='*',
        help=('Multiple reference FASTA files (order dependent, '
              'smallest dataset should be posed at first)'))
    parser.add_argument("-o",
                        "--output_prefix",
                        dest="output_prefix",
                        nargs=1,
                        help="output prefix")
    parser.add_argument("-r",
                        "--root_dir",
                        dest="root_dir",
                        nargs=1,
                        help=('Root directory where log directory will be '
                              'generated (default: ".")'),
                        default=[os.getcwd()])
    parser.add_argument("-c",
                        "--num_cores",
                        dest="num_cores",
                        nargs=1,
                        help="Number of cores to be used")

    args = parser.parse_args()
    if args.input_fasta:
        input_fasta = os.path.abspath(args.input_fasta[0])
    else:
        print('[ERROR] Please provide INPUT FASTA')
        sys.exit(2)

    # References are optional; an absent or empty -f means "no reduction".
    if args.ref_fasta:
        references = [os.path.abspath(x) for x in args.ref_fasta]
    else:
        references = []

    if args.output_prefix:
        output_prefix = args.output_prefix[0]
    else:
        print('[ERROR] Please provide OUTPUT_PREFIX')
        sys.exit(2)

    if args.num_cores:
        num_cores = args.num_cores[0]
    else:
        print('[ERROR] Please provide NUMBER OF CORES')
        sys.exit(2)

    root_dir = os.path.abspath(args.root_dir[0])

    # Check input fasta is valid
    if not glob(input_fasta):
        print('[ERROR] No such file: %s' % (input_fasta))
        sys.exit(2)

    # Create necessary dirs
    create_dir(root_dir)

    # Set logging
    log_file = os.path.join(root_dir, 'logs', 'pipeline',
                            'run_blastp_reduce.log')
    global logger_time, logger_txt
    logger_time, logger_txt = set_logging(log_file)

    # Run functions :) Slow is as good as Fast
    logger_time.debug('START running BLASTp-reduce for %s' %
                      (os.path.basename(input_fasta)))

    # tmp_num is initialised unconditionally: previously it was only bound
    # when references were given, so running without -f crashed with a
    # NameError at the integrate() call below.
    # NOTE(review): confirm integrate() copes with the starting value 1 when
    # no reduction round has run.
    filtered_fasta = input_fasta
    tmp_num = 1
    for ref in references:
        tmp_output_blast = run_blastp_ref(filtered_fasta, ref,
                                          output_prefix, tmp_num,
                                          num_cores)
        filtered_fasta, tmp_num = filtering(filtered_fasta, output_prefix,
                                            tmp_num, tmp_output_blast)

    integrate(output_prefix, tmp_num)
    logger_time.debug('DONE  running BLASTp-reduce for %s' %
                      (os.path.basename(input_fasta)))
Example #25
0
def main(argv):
    """Command-line entry point of run_braker1.py.

    Takes a masked assembly (-m), one or more BAM files (-b), an output
    directory (-o), a log directory (-l), an optional number of cores (-c,
    defaults to 1), a config file (-C) and the optional --fungus switch.
    It creates the output/log directories, installs the module-level
    loggers, reads the Braker1 binary location from the config file and
    calls run_braker1().

    `argv` is accepted but never read: parse_args() is called without
    arguments, so the options are taken from sys.argv. Exits with status 2
    when a mandatory option is missing.
    """
    argparse_usage = (
        'run_braker1.py -m <masked_assembly> -b <bam_files> -o <output_dir> '
        '-l <log_dir> -c <num_cores> -C <config_file>')
    parser = ArgumentParser(usage=argparse_usage)
    # NOTE(review): the long option names "--maksed_assembly" and
    # "--bam_fileles" look like typos of "--masked_assembly" and
    # "--bam_files"; they are kept as they are so existing invocations keep
    # working.
    parser.add_argument("-m",
                        "--maksed_assembly",
                        dest="masked_assembly",
                        nargs=1,
                        help="Assembly file in FASTA")
    parser.add_argument("-b",
                        "--bam_fileles",
                        dest="bam_files",
                        nargs='+',
                        help="BAM files generated by Hisat2")
    parser.add_argument("-o",
                        "--output_dir",
                        dest="output_dir",
                        nargs='+',
                        help="Output directory")
    parser.add_argument("-l",
                        "--log_dir",
                        dest="log_dir",
                        nargs='+',
                        help="Log directory")
    parser.add_argument("-c",
                        "--num_cores",
                        dest="num_cores",
                        nargs=1,
                        help="Number of cores to be used")
    parser.add_argument("-C",
                        "--config_file",
                        dest="config_file",
                        nargs=1,
                        help="Config file generated by check_dependencies.py")
    parser.add_argument('--fungus',
                        dest='fungus_flag',
                        action='store_true',
                        help='Fungus flag of BRAKER1')

    args = parser.parse_args()
    if args.masked_assembly:
        masked_assembly = os.path.abspath(args.masked_assembly[0])
    else:
        print('[ERROR] Please provide INPUT ASSEMBLY')
        sys.exit(2)

    if args.bam_files:
        bam_files = [os.path.abspath(x) for x in args.bam_files]
    else:
        print('[ERROR] Please provide BAM FILES')
        sys.exit(2)

    # -o and -l accept several values, but only the first one is used.
    if args.output_dir:
        output_dir = os.path.abspath(args.output_dir[0])
    else:
        print('[ERROR] Please provide OUTPUT DIRECTORY')
        sys.exit(2)

    # The log directory is validated once here; the original repeated this
    # exact check a second time further down, where its error branch could
    # never be reached.
    if args.log_dir:
        log_dir = os.path.abspath(args.log_dir[0])
    else:
        print('[ERROR] Please provide LOG DIRECTORY')
        sys.exit(2)

    # A user-supplied value stays the raw command-line string; only the
    # default is the integer 1.
    if args.num_cores:
        num_cores = args.num_cores[0]
    else:
        num_cores = 1

    if args.config_file:
        config_file = os.path.abspath(args.config_file[0])
    else:
        print('[ERROR] Please provide CONFIG FILE')
        sys.exit(2)

    if args.fungus_flag:
        fungus_flag = '--fungus'
    else:
        fungus_flag = ''

    # Create necessary dirs
    create_dir(output_dir, log_dir)

    # Set logging
    log_file = os.path.join(log_dir, 'pipeline', 'run_braker1.log')
    global logger_time, logger_txt
    logger_time, logger_txt = set_logging(log_file)

    # Run functions :) Slow is as good as Fast
    braker1_bin = parse_config(config_file)
    run_braker1(masked_assembly, bam_files, output_dir, log_dir, num_cores,
                braker1_bin, fungus_flag)
Example #26
0
def main(argv):
    """Command-line entry point of run_hisat2.py.

    Takes one or more read files (-r), an output directory (-o), a log
    directory (-l), a reference FASTA (-f), an optional number of cores
    (-c, converted to int, defaults to 1) and a config file (-C). It
    creates the output/log directories, installs the module-level loggers,
    reads the Hisat2 binary location from the config file, runs
    run_hisat2() and passes its result to post_process_sam().

    `argv` is accepted but never read: parse_args() is called without
    arguments, so the options are taken from sys.argv. Exits with status 2
    when a mandatory option is missing.
    """
    argparser_usage = (
        'run_hisat2.py -r <fastq1> <fastq2> <fastq3> ...'
        ' -o <output_dir> -l <log_dir> -f <ref_fasta> -c <num_cores>'
        ' -C <config_file>')
    parser = ArgumentParser(usage=argparser_usage)
    parser.add_argument("-r",
                        "--read_files",
                        dest="read_files",
                        nargs='+',
                        # Help text typo fixed ("Multiople" -> "Multiple").
                        help='Multiple read files in fastq format')
    parser.add_argument("-o",
                        "--output_dir",
                        dest="output_dir",
                        nargs=1,
                        help='Output directory')
    parser.add_argument("-l",
                        "--log_dir",
                        dest="log_dir",
                        nargs=1,
                        help='Log directory')
    parser.add_argument("-f",
                        "--ref_fasta",
                        dest="ref_fasta",
                        nargs=1,
                        help='Reference fasta')
    parser.add_argument("-c",
                        "--num_cores",
                        dest="num_cores",
                        nargs=1,
                        help='Number of cores')
    parser.add_argument("-C",
                        "--config_file",
                        dest="config_file",
                        nargs=1,
                        help="Config file generated by check_dependencies.py")

    args = parser.parse_args()

    # Validation order: output dir, log dir, reads, reference, config.
    if args.output_dir:
        output_dir = os.path.abspath(args.output_dir[0])
    else:
        print('[ERROR] Please provide proper OUTPUT DIRECTORY')
        sys.exit(2)

    if args.log_dir:
        log_dir = os.path.abspath(args.log_dir[0])
    else:
        print('[ERROR] Please provide proper LOG DIRECTORY')
        sys.exit(2)

    if args.read_files:
        read_files = [os.path.abspath(x) for x in args.read_files]
    else:
        print('[ERROR] Please provide proper READ FILES')
        sys.exit(2)

    # Reference fasta
    if args.ref_fasta:
        ref_fasta = os.path.abspath(args.ref_fasta[0])
    else:
        print('[ERROR] Please provide proper file: REFERENCE FASTA')
        sys.exit(2)

    if args.num_cores:
        num_cores = int(args.num_cores[0])
    else:
        num_cores = 1

    if args.config_file:
        config_file = os.path.abspath(args.config_file[0])
    else:
        print('[ERROR] Please provide CONFIG FILE')
        sys.exit(2)

    # Create necessary dirs
    create_dir(output_dir, log_dir)

    # Set logging
    log_file = os.path.join(log_dir, 'pipeline', 'run_hisat2.log')
    global logger_time, logger_txt
    logger_time, logger_txt = set_logging(log_file)

    # Run functions :) Slow is as good as Fast
    hisat2_bin = parse_config(config_file)
    logger_time.debug('START: Hisat2')
    hisat2_outputs = run_hisat2(read_files, output_dir, log_dir, ref_fasta,
                                num_cores, hisat2_bin)
    post_process_sam(hisat2_outputs)
    logger_time.debug('DONE : Hisat2')
 def __init__(self, db_name='sqlite.db'):
     """Set the database path, call create_folder() and attach a logger.

     The database path is built from the directory containing this module,
     joined with 'database/' followed by `db_name`.
     """
     module_dir = os.path.dirname(os.path.abspath(__file__))
     self.sqlite_db = os.path.join(module_dir, 'database/' + db_name)
     # NOTE(review): create_folder() is defined elsewhere; presumably it
     # makes sure the database directory exists - confirm.
     self.create_folder()
     self.logger = set_logging.set_logging('sqlite_operator')
Example #28
0
def main(argv):
    """Command-line entry point of run_busco.py.

    Collects the input protein FASTA (-i), output directory (-o), log
    directory (-l), number of cores (-c) and config file (-C), creates the
    output/log directories, installs the module-level loggers, verifies
    that BUSCO HMM files exist under `lineage_path`, reads the BUSCO binary
    location from the config file and calls run_busco().

    `argv` is accepted but never read: parse_args() is called without
    arguments, so the options are taken from sys.argv. Every option is
    mandatory; a missing option or a missing BUSCO library ends the process
    with status 2. `lineage_path` is not defined in this function and must
    be provided by the enclosing module.
    """
    usage_text = (
        'run_busco.py -i <input_fasta> -o <output_dir> -l <log_dir> '
        '-c <num_cores> -C <config_file>')
    parser = ArgumentParser(usage=usage_text)
    parser.add_argument(
        "-i", "--input_fasta", dest="input_fasta", nargs=1,
        help="Input protein FASTA file")
    parser.add_argument(
        "-o", "--output_dir", dest="output_dir", nargs=1,
        help="Output directory")
    parser.add_argument(
        "-l", "--log_dir", dest="log_dir", nargs=1,
        help='Log directory')
    parser.add_argument(
        "-c", "--num_cores", dest="num_cores", nargs=1,
        help="Number of cores to be used")
    parser.add_argument(
        "-C", "--config_file", dest="config_file", nargs=1,
        help="Config file generated by check_dependencies.py")
    opts = parser.parse_args()

    # The first missing option (in declaration order) stops the script.
    if not opts.input_fasta:
        print('[ERROR] Please provide INPUT FASTA')
        sys.exit(2)
    input_fasta = os.path.abspath(opts.input_fasta[0])

    if not opts.output_dir:
        print('[ERROR] Please provide OUTPUT DIRECTORY')
        sys.exit(2)
    output_dir = os.path.abspath(opts.output_dir[0])

    if not opts.log_dir:
        print('[ERROR] Please provide LOG DIRECTORY')
        sys.exit(2)
    log_dir = os.path.abspath(opts.log_dir[0])

    if not opts.num_cores:
        print('[ERROR] Please provide NUMBER OF CORES')
        sys.exit(2)
    num_cores = opts.num_cores[0]

    if not opts.config_file:
        print('[ERROR] Please provide CONFIG FILE')
        sys.exit(2)
    config_file = os.path.abspath(opts.config_file[0])

    # Directories must exist before the log file is opened inside log_dir.
    create_dir(output_dir, log_dir)

    global logger_time, logger_txt
    logger_time, logger_txt = set_logging(
        os.path.join(log_dir, 'pipeline', 'run_busco.log'))

    # Refuse to start when no HMM profile is found in the lineage folder.
    hmm_pattern = os.path.join(lineage_path, 'hmms/*hmm')
    if not glob(hmm_pattern):
        logger_txt.debug(
            '\n[ERROR] You did not download BUSCO library\n'
            'Go to FunGAP_PATH/data/ and type\n'
            'wget http://busco.ezlab.org/v1/files/fungi_buscos.tar.gz;'
            'tar -zxvf fungi_buscos.tar.gz\n'
            'You can resume FunGAP without restarting '
            '(run FunGAP in the same directory)')
        sys.exit(2)

    busco_bin = parse_config(config_file)
    run_busco(input_fasta, output_dir, log_dir, num_cores, busco_bin)
Example #29
0
def main(argv):
    """Command-line entry point of fungap.py: run the whole pipeline.

    Parses the command line, creates the output directory, installs the
    module-level loggers and then calls the pipeline steps in order:
    input check, Hisat2, Trinity, RepeatModeler, Maker, Augustus, Braker1,
    BUSCO (once per protein file), non-redundant protein set, BLASTp,
    Pfam scan, transcript concatenation, BLASTn (once per GFF3), score
    import, bad-gene detection, GFF3 filtering and post-processing, output
    copy and markdown creation.

    `argv` is accepted but never read: parse_args() is called without
    arguments, so the options are taken from sys.argv. -g, -a and -s are
    required by the parser.
    """
    usage_text = (
        'fungap.py -g <genome_assembly> -12UA <trans_read_files> '
        '-o <output_dir> -a <augustus_species> '
        '-s <sister_proteome>')
    parser = ArgumentParser(usage=usage_text)
    parser.add_argument(
        '-o', '--output_dir', nargs='?', default='fungap_out',
        help='Output directory (default: fungap_out)')
    parser.add_argument(
        '-1', '--trans_read_1', nargs='?', default='',
        help='Paired-end read1 "<prefix>_1.fastq"')
    parser.add_argument(
        '-2', '--trans_read_2', nargs='?', default='',
        help='Paired-end read2 "<prefix>_2.fastq"')
    parser.add_argument(
        '-U', '--trans_read_single', nargs='?', default='',
        help='Single read "<prefix>_s.fastq"')
    parser.add_argument(
        '-A', '--trans_bam', nargs='?', default='',
        help='BAM file (RNA-seq reads alignment to a genome assembly')
    parser.add_argument(
        '-g', '--genome_assembly', nargs=1, required=True,
        help='Genome assembly file in FASTA format')
    parser.add_argument(
        '-a', '--augustus_species', nargs=1, required=True,
        help='AUGUSTUS species')
    parser.add_argument(
        '-s', '--sister_proteome', nargs=1, required=True,
        help='Sister proteome sequences in .faa')
    parser.add_argument(
        '-c', '--num_cores', nargs='?', default=1, type=int,
        help='Number of cores to be used (default: 1)')
    parser.add_argument(
        '-v', '--version', action='version',
        version='%(prog)s {}'.format(__version__))

    # Switches that turn off fungus-specific behaviour of the tools.
    parser.add_argument(
        '--no_braker_fungus', action='store_true',
        help='No --fungus flag in BRAKER for non-fungus genomes')
    parser.add_argument(
        '--no_jaccard_clip', action='store_true',
        help='No --jaccard_clip flag in Trinity for non-fungus genomes')
    parser.add_argument(
        '--no_genemark_fungus', action='store_true',
        help='No --fungus flag in GeneMark for non-fungus genomes')
    parser.add_argument(
        '-M', '--max_intron', nargs='?', default=2000, type=int,
        help='Max intron length (Default: 2000 bp)')

    opts = parser.parse_args()
    output_dir = os.path.abspath(opts.output_dir)
    trans_read_1 = opts.trans_read_1
    trans_read_2 = opts.trans_read_2
    trans_read_single = opts.trans_read_single
    trans_bam = opts.trans_bam
    genome_assembly = os.path.abspath(opts.genome_assembly[0])
    augustus_species = opts.augustus_species[0]
    sister_proteome = os.path.abspath(opts.sister_proteome[0])
    num_cores = opts.num_cores
    max_intron = opts.max_intron

    # Each "no_*" switch removes the matching tool flag; by default the
    # fungus-specific flag is passed on.
    no_braker_fungus = '' if opts.no_braker_fungus else '--fungus'
    no_jaccard_clip = '' if opts.no_jaccard_clip else '--jaccard_clip'
    no_genemark_fungus = '' if opts.no_genemark_fungus else '--gmes_fungus'

    # Create necessary dirs
    create_dir(output_dir)

    # Loggers are module globals so that the step functions can use them.
    global logger_time, logger_txt
    logger_time, logger_txt = set_logging(
        os.path.join(output_dir, 'logs', 'fungap.log'))

    run_banner = '\n============ New Run {} ============'.format(
        datetime.now())
    logger_txt.debug(run_banner)

    # Transcriptome evidence and repeat library
    trans_read_files = check_inputs(
        trans_read_1, trans_read_2, trans_read_single, trans_bam,
        genome_assembly, sister_proteome)
    trans_bams = run_hisat2(
        genome_assembly, trans_read_files, output_dir, num_cores,
        max_intron)
    trinity_asms = run_trinity(
        trans_bams, output_dir, num_cores, no_jaccard_clip, max_intron)
    repeat_model_file = run_repeat_modeler(
        genome_assembly, output_dir, num_cores)

    # Gene prediction: Maker, then Augustus and Braker1 on the masked
    # assembly located under maker_out.
    maker_gff3s, maker_faas = run_maker(
        genome_assembly, output_dir, augustus_species, sister_proteome,
        num_cores, repeat_model_file, trinity_asms, no_genemark_fungus)
    masked_assembly = os.path.join(
        output_dir, 'maker_out', 'masked_assembly.fasta')
    augustus_gff3, augustus_faa = run_augustus(
        masked_assembly, output_dir, augustus_species)
    braker1_gff3s, braker1_faas = run_braker1(
        masked_assembly, trans_bams, output_dir, num_cores,
        no_braker_fungus)

    # Run BUSCO on each set of predicted proteins
    faa_files = [augustus_faa] + maker_faas + braker1_faas
    for protein_file in faa_files:
        run_busco(protein_file, output_dir, num_cores)
    busco_out_dir = os.path.join(output_dir, 'busco_out')

    # Non-redundant protein set and the evidence computed on it
    nr_prot_file, nr_prot_mapping_file = make_nr_prot(faa_files, output_dir)
    blastp_output = run_blastp(
        nr_prot_file, output_dir, sister_proteome, num_cores)
    pfam_scan_out = run_pfam_scan(nr_prot_file, output_dir, num_cores)

    # Concatenate all Trinity assemblies into one transcript file
    trinity_asm = os.path.join(
        output_dir, 'gene_filtering', 'trinity_transcripts.fna')
    command = 'cat {} > {}'.format(' '.join(trinity_asms), trinity_asm)
    logger_time.debug('Create transcript')
    logger_txt.debug('[Run] {}'.format(command))
    os.system(command)

    # For every GFF3: build its transcripts, then BLASTn them against the
    # concatenated Trinity transcripts.
    gff3_files = [augustus_gff3] + maker_gff3s + braker1_gff3s
    blastn_out_files = [
        run_blastn(make_transcripts(genome_assembly, one_gff3),
                   trinity_asm, output_dir)
        for one_gff3 in gff3_files
    ]

    # Import BLAST, BUSCO and Pfam score
    blastp_dict = import_blastp(blastp_output, nr_prot_mapping_file)
    busco_dict = import_busco(busco_out_dir, output_dir)
    pfam_dict = import_pfam(pfam_scan_out, nr_prot_mapping_file)
    blastn_dict = import_blastn(blastn_out_files, output_dir)

    # Catch bad genes, then filter and post-process the gene models
    bad_dict = catch_bad_genes(gff3_files, genome_assembly, output_dir)
    filter_gff3s(genome_assembly, gff3_files, blastp_dict, busco_dict,
                 pfam_dict, blastn_dict, bad_dict, nr_prot_file,
                 nr_prot_mapping_file, output_dir)
    gff3_postprocess(genome_assembly, output_dir)

    # Copy output files
    copy_output(output_dir)

    # Create markdown
    create_markdown(genome_assembly, output_dir, trans_bams, trinity_asms)
Example #30
0
def _require_arg(value, label):
    """Return *value*, or print an error naming *label* and exit with 2."""
    if not value:
        print('[ERROR] Please provide %s' % label)
        sys.exit(2)
    return value


def _load_pickle(path):
    """Unpickle and return the object stored at *path*, closing the file."""
    # NOTE(review): unpickling executes arbitrary code embedded in the file;
    # only feed this pickles produced by trusted pipeline steps.
    with open(path, 'rb') as handle:
        return cPickle.load(handle)


def main(argv):
    """Filter gene models from several GFF3 files and write the final set.

    Parses the command line, loads the pickled BLAST/BUSCO/Pfam (and
    optional bad-gene) dictionaries, runs ``filtering`` once per input GFF3
    to collect per-file statistics, then once more on all inputs together
    to produce the final gene set, its output files and a stats report.

    Args:
        argv: Accepted for interface compatibility only; it is not read.
            ``parse_args()`` is called without arguments, so options are
            taken from ``sys.argv``.

    Exits with status 2 (after printing an ``[ERROR]`` line) when any
    mandatory option is missing. A missing ``--bad_dict`` only prints a
    warning and falls back to an empty ``defaultdict(bool)``.
    """
    argparse_usage = (
        'filter_gff3s.py -i <input_gff3s> -m <mapping_file> -b <blast_dict> '
        '-B <busco_dict> -p <ipr_dict> -g <bad_dict> -n <nr_prot_file> '
        '-s <short_id> -o <output_prefix> -r <root_dir>')
    parser = ArgumentParser(usage=argparse_usage)
    parser.add_argument("-i",
                        "--input_gff3s",
                        dest="input_gff3s",
                        nargs='+',
                        help="Multiple gff3 files")
    parser.add_argument("-m",
                        "--mapping_file",
                        dest="mapping_file",
                        nargs=1,
                        help="Mapping txt file (make_nr_prot.py)")
    parser.add_argument("-b",
                        "--blast_dict",
                        dest="blast_dict",
                        nargs=2,
                        help="Parsed blast output in dictionary")
    parser.add_argument("-B",
                        "--busco_dict",
                        dest="busco_dict",
                        nargs=2,
                        help="Parsed BUSCO output in dictionary")
    parser.add_argument("-p",
                        "--ipr_dict",
                        dest="ipr_dict",
                        nargs=2,
                        help="Parsed IPRscan output in dictionary")
    parser.add_argument("-g",
                        "--bad_dict",
                        dest="bad_dict",
                        nargs=1,
                        help="Pickled dictionary of bad genes (optional)")
    parser.add_argument("-n",
                        "--nr_prot_file",
                        dest="nr_prot_file",
                        nargs=1,
                        help="nr_prot.faa file (make_nr_prot.py)")
    parser.add_argument("-s",
                        "--short_id",
                        dest="short_id",
                        nargs=1,
                        help="Short ID for gene numbers")
    parser.add_argument("-o",
                        "--output_prefix",
                        dest="output_prefix",
                        nargs=1,
                        help="Output prefix")
    parser.add_argument("-r",
                        "--root_dir",
                        dest="root_dir",
                        nargs=1,
                        help=('Root directory where log directory will be '
                              'generated (default: ".")'),
                        default=[os.getcwd()])

    args = parser.parse_args()

    # Validate options in the same order as before so that the first
    # missing one is the one reported.
    input_gff3s = [
        os.path.abspath(x)
        for x in _require_arg(args.input_gff3s, 'INPUT GFF3')
    ]
    mapping_file = os.path.abspath(
        _require_arg(args.mapping_file, 'MAPPING TXT FILE')[0])

    # Each of these options takes exactly two paths (nargs=2).
    blast_paths = _require_arg(args.blast_dict, 'BLAST DICT')
    blast_dict_score = os.path.abspath(blast_paths[0])
    blast_dict_evalue = os.path.abspath(blast_paths[1])

    busco_paths = _require_arg(args.busco_dict, 'BUSCO DICT')
    busco_dict_score = os.path.abspath(busco_paths[0])
    busco_dict_list = os.path.abspath(busco_paths[1])

    ipr_paths = _require_arg(args.ipr_dict, 'IPR DICT PICKLE')
    ipr_dict_score = os.path.abspath(ipr_paths[0])
    ipr_dict_count = os.path.abspath(ipr_paths[1])

    # The bad-gene dictionary is optional: warn and treat every lookup as
    # False when it is not supplied.
    if args.bad_dict:
        D_bad = _load_pickle(os.path.abspath(args.bad_dict[0]))
    else:
        print('[WARNING] Please provide BAD DICT PICKLE')
        D_bad = defaultdict(bool)

    nr_prot_file = os.path.abspath(
        _require_arg(args.nr_prot_file, '"nr_prot.faa" FILE')[0])
    short_id = _require_arg(args.short_id, 'SHORT ID')[0]
    output_prefix = _require_arg(args.output_prefix, 'OUTPUT PREFIX')[0]

    root_dir = os.path.abspath(args.root_dir[0])

    # Create necessary dirs
    create_dir(root_dir)

    # Set logging; the loggers are module globals used by sibling functions
    log_file = os.path.join(root_dir, 'logs', 'pipeline', 'filter_gff3s.log')
    global logger_time, logger_txt
    logger_time, logger_txt = set_logging(log_file)

    # Run functions :) Slow is as good as Fast
    logger_time.debug('START: Filtering GFF3')
    D_mapping, D_mapping_rev = import_mapping(mapping_file)

    # Import dictionaries
    D_blast_score = _load_pickle(blast_dict_score)
    D_blast_evalue = _load_pickle(blast_dict_evalue)
    D_busco_score = _load_pickle(busco_dict_score)
    D_busco_list = _load_pickle(busco_dict_list)
    D_pfam_score = _load_pickle(ipr_dict_score)
    D_pfam_count = _load_pickle(ipr_dict_count)

    # Self-filtering: run each GFF3 on its own to get per-file stats
    D_stats = {}
    for input_gff3 in input_gff3s:
        prefix = os.path.basename(input_gff3).split('.')[0]
        D_gff3, D_gene, D_cds, D_cds_len, D_exon = import_gff3([input_gff3])
        self_filtered, stats = filtering(D_gene, D_cds, D_cds_len, D_mapping,
                                         D_blast_score, D_blast_evalue,
                                         D_busco_score, D_busco_list,
                                         D_pfam_score, D_pfam_count, D_bad,
                                         output_prefix)

        # Write the second field of every kept entry, one per line, and
        # total the CDS length of the kept entries.
        outfile_self = '%s_%s_filtered.list' % (output_prefix, prefix)
        cds_len_filtered = 0
        with open(outfile_self, 'w') as outhandle_self:
            for tup in self_filtered:
                outhandle_self.write('%s\n' % (tup[1]))
                cds_len_filtered += D_cds_len[tup]

        # Extend the six stats returned by filtering() with the CDS total
        (raw_num_genes, final_num_genes, blast_hit, pfam_hit, pfam_domains,
         busco_hit) = stats
        D_stats[prefix] = (raw_num_genes, final_num_genes, blast_hit,
                           pfam_hit, pfam_domains, busco_hit,
                           cds_len_filtered)

    # Filtering on all inputs together to get the final gene set
    D_gff3, D_gene, D_cds, D_cds_len, D_exon = import_gff3(input_gff3s)
    final_gene_set, final_stats = filtering(D_gene, D_cds, D_cds_len,
                                            D_mapping, D_blast_score,
                                            D_blast_evalue, D_busco_score,
                                            D_busco_list, D_pfam_score,
                                            D_pfam_count, D_bad, output_prefix)
    D_prot = import_prot(nr_prot_file, D_mapping_rev)
    write_final_prots(final_gene_set, D_mapping, output_prefix)
    write_files(final_gene_set, D_gene, D_gff3, D_prot, D_exon, output_prefix,
                short_id)

    cds_len_final = 0
    for tup in final_gene_set:
        cds_len_final += D_cds_len[tup]

    (raw_num_genes, final_num_genes, blast_hit, pfam_hit, pfam_domains,
     busco_hit) = final_stats
    D_stats['final'] = (raw_num_genes, final_num_genes, blast_hit, pfam_hit,
                        pfam_domains, busco_hit, cds_len_final)

    write_stats(D_stats, output_prefix)
    logger_time.debug('DONE : Filtering GFF3')