def main():
    '''Parse command-line arguments and launch the BRAKER pipeline.'''
    usage = 'run_braker.py -m <masked_assembly> -b <bam_files>'
    parser = ArgumentParser(usage=usage)
    parser.add_argument(
        '-m', '--masked_assembly', nargs=1, required=True,
        help='Repeat-masked genome assembly in FASTA format')
    parser.add_argument(
        '-b', '--bam_files', nargs='+', required=True,
        help='BAM files generated by Hisat2')
    parser.add_argument(
        '-o', '--output_dir', nargs='?', default='braker_out',
        help='Output directory')
    parser.add_argument(
        '-l', '--log_dir', nargs='?', default='logs',
        help='Log directory')
    parser.add_argument(
        '-c', '--num_cores', nargs='?', default=1, type=int,
        help='Number of cores to be used')
    parser.add_argument(
        '-t', '--translation_table', nargs='?', default=1, type=int,
        help='Translation table (default: 1)')
    parser.add_argument(
        '--fungus', action='store_true',
        help='--fungus flag for BRAKER')
    args = parser.parse_args()

    masked_assembly = os.path.abspath(args.masked_assembly[0])
    bam_files = [os.path.abspath(x) for x in args.bam_files]
    output_dir = os.path.abspath(args.output_dir)
    log_dir = os.path.abspath(args.log_dir)
    num_cores = args.num_cores
    translation_table = args.translation_table
    # Translate the boolean into the literal flag BRAKER expects
    if args.fungus:
        fungus_flag = '--fungus'
    else:
        fungus_flag = ''

    # Create necessary dirs
    create_dir(output_dir, log_dir)

    # Set logging
    log_file = os.path.join(log_dir, 'run_braker.log')
    logger = set_logging(log_file)

    # Normalize FASTA headers first, then hand everything to BRAKER
    adjusted_assembly = adjust_header(masked_assembly)
    run_braker(
        adjusted_assembly, bam_files, output_dir, log_dir, num_cores,
        translation_table, fungus_flag, logger)
def __init__(self):
    '''Load HTTP settings from config/config.ini and set up a logger.'''
    # Config directory lives next to this module
    config_dir = os.path.join(os.path.dirname(__file__), 'config')
    self.current_path = config_dir
    parser = ConfigParser.ConfigParser()
    parser.read(os.path.join(config_dir, 'config.ini'))
    # HTTP connection settings read from the [HTTP] section
    self.server_ip = parser.get('HTTP', 'host')
    self.port = parser.get('HTTP', 'port')
    self.timeout = float(parser.get('HTTP', 'timeout'))
    self.logger = set_logging.set_logging('CI')
def set_loggings(output_dir):
    '''Create the output directory and initialise the global FunGAP loggers.'''
    create_dir(output_dir)
    global logger_time, logger_txt
    log_file = os.path.join(output_dir, 'logs', 'fungap.log')
    logger_time, logger_txt = set_logging(log_file)
    # Mark the start of a fresh run in the text log
    logger_txt.debug(
        '\n============ New Run {} ============'.format(datetime.now()))
def main():
    '''Main function: parse arguments and map reads with Hisat2.'''
    argparser_usage = (
        'run_hisat2.py -r <fastq1> <fastq2> <fastq3> ...'
        ' -o <output_dir> -l <log_dir> -f <ref_fasta> -c <num_cores>'
        ' -m <max_intron>'
    )
    parser = ArgumentParser(usage=argparser_usage)
    parser.add_argument(
        '-r', '--read_files', nargs='+', required=True,
        # Fixed help-text typo: "Multiople" -> "Multiple"
        help='Multiple read files in fastq format'
    )
    parser.add_argument(
        '-o', '--output_dir', nargs='?', default='hisat2_out',
        help='Output directory'
    )
    parser.add_argument(
        '-l', '--log_dir', nargs='?', default='logs',
        help='Log directory'
    )
    parser.add_argument(
        '-f', '--ref_fasta', nargs=1, required=True,
        help='Reference fasta'
    )
    parser.add_argument(
        '-c', '--num_cores', nargs='?', default=1, type=int,
        help='Number of cores'
    )
    parser.add_argument(
        '-m', '--max_intron', nargs='?', default=2000, type=int,
        help='Max intron length (Default: 2000 bp)'
    )
    args = parser.parse_args()

    read_files = [os.path.abspath(x) for x in args.read_files]
    output_dir = os.path.abspath(args.output_dir)
    log_dir = os.path.abspath(args.log_dir)
    ref_fasta = os.path.abspath(args.ref_fasta[0])
    num_cores = args.num_cores
    max_intron = args.max_intron

    # Create necessary dirs
    create_dir(output_dir, log_dir)

    # Set logging
    log_file = os.path.join(log_dir, 'run_hisat2.log')
    logger = set_logging(log_file)
    logger_time = logger[0]

    # Run functions :) Slow is as good as Fast
    logger_time.debug('START: Hisat2')
    run_hisat2(
        read_files, output_dir, log_dir, ref_fasta, num_cores, max_intron,
        logger
    )
    logger_time.debug('DONE : Hisat2')
def main(argv):
    '''Parse arguments and run BRAKER1 on the given assembly and BAM files.'''
    parser = ArgumentParser(
        usage='run_braker1.py -m <masked_assembly> -b <bam_files>')
    parser.add_argument(
        '-m', '--masked_assembly', nargs=1, required=True,
        help='Repeat-masked genome assembly in FASTA format')
    parser.add_argument(
        '-b', '--bam_files', nargs='+', required=True,
        help='BAM files generated by Hisat2')
    parser.add_argument(
        '-o', '--output_dir', nargs='?', default='braker1_out',
        help='Output directory')
    parser.add_argument(
        '-l', '--log_dir', nargs='?', default='logs',
        help='Log directory')
    parser.add_argument(
        '-c', '--num_cores', nargs='?', default=1, type=int,
        help='Number of cores to be used')
    parser.add_argument(
        '--fungus', action='store_true',
        help='--fungus flag for BRAKER1')
    args = parser.parse_args()

    masked_assembly = os.path.abspath(args.masked_assembly[0])
    bam_files = [os.path.abspath(x) for x in args.bam_files]
    output_dir = os.path.abspath(args.output_dir)
    log_dir = os.path.abspath(args.log_dir)
    num_cores = args.num_cores
    fungus_flag = '--fungus' if args.fungus else ''

    # Create necessary dirs
    create_dir(output_dir, log_dir)

    # Set logging
    log_file = os.path.join(log_dir, 'run_braker1.log')
    global logger_time, logger_txt
    logger_time, logger_txt = set_logging(log_file)

    # Run functions :) Slow is as good as Fast
    run_braker1(
        masked_assembly, bam_files, output_dir, log_dir, num_cores,
        fungus_flag)
def main(argv):
    '''Parse arguments and run BLASTn of a query against a database FASTA.'''
    usage = (
        'run_blastn.py -q <query_fasta> -d <db_fasta> -o <output_prefix> '
        '-l <log_dir> -c <num_cores>')
    parser = ArgumentParser(usage=usage)
    parser.add_argument(
        '-q', '--query_fasta', nargs=1, required=True,
        help='Query FASTA file')
    parser.add_argument(
        '-d', '--db_fasta', nargs=1, required=True,
        help='Database FASTA file')
    parser.add_argument(
        '-o', '--output_prefix', nargs='?', default='out',
        help='Output prefix')
    parser.add_argument(
        '-l', '--log_dir', nargs='?', default='logs',
        help='Log directory')
    parser.add_argument(
        '-c', '--num_cores', nargs='?', default=1, type=int,
        help='Number of cores to be used')
    args = parser.parse_args()

    query_fasta = os.path.abspath(args.query_fasta[0])
    db_fasta = os.path.abspath(args.db_fasta[0])
    output_prefix = os.path.abspath(args.output_prefix)
    log_dir = os.path.abspath(args.log_dir)
    num_cores = args.num_cores

    # Set logging
    create_dir(log_dir)
    log_file = os.path.join(log_dir, 'run_blastn.log')
    global logger_time, logger_txt
    logger_time, logger_txt = set_logging(log_file)

    query_name = os.path.basename(query_fasta)
    logger_time.debug('START: BLASTn for {}'.format(query_name))
    # Run functions :) Slow is as good as Fast
    run_blastn(query_fasta, db_fasta, output_prefix, log_dir, num_cores)
    logger_time.debug('Done : BLASTn for {}'.format(query_name))
def main():
    '''Parse arguments, run Augustus, and parse its output.'''
    parser = ArgumentParser(
        usage='run_augustus.py -m <masked_assembly> -s <species>')
    parser.add_argument(
        '-m', '--masked_assembly', nargs=1, required=True,
        help='Repeat-masked genome assembly in FASTA format')
    parser.add_argument(
        '-s', '--species', nargs=1, required=True,
        help='Augustus reference species')
    parser.add_argument(
        '-o', '--output_dir', nargs='?', default='augustus_out',
        help='Output directory (default: augustus_out)')
    parser.add_argument(
        '-t', '--translation_table', nargs='?', default=1, type=int,
        help='Translation table (default: 1)')
    parser.add_argument(
        '-l', '--log_dir', nargs='?', default='logs',
        help='Log directory')
    args = parser.parse_args()

    masked_assembly = os.path.abspath(args.masked_assembly[0])
    species = args.species[0]
    output_dir = os.path.abspath(args.output_dir)
    translation_table = args.translation_table
    log_dir = os.path.abspath(args.log_dir)

    # Create necessary dirs
    create_dir(output_dir, log_dir)

    # Set logging
    log_file = os.path.join(log_dir, 'run_augustus.log')
    logger = set_logging(log_file)

    # Run Augustus, then parse its raw output into usable files
    run_augustus(
        masked_assembly, output_dir, species, translation_table, logger)
    parse_augustus(output_dir)
def main(argv):
    '''Main function: parse arguments and run BLASTp against a database.'''
    argparse_usage = ('run_blast_reduce.py -q <query_fasta> -d <db_fasta> '
                      '-l <log_dir> -c <num_cores>')
    parser = ArgumentParser(usage=argparse_usage)
    parser.add_argument(
        '-q', '--query_fasta', nargs=1, required=True,
        help='Query FASTA file')
    parser.add_argument(
        '-d', '--db_fasta', nargs=1, required=True,
        help='Database FASTA files')
    parser.add_argument(
        '-l', '--log_dir', nargs='?', default='logs',
        help='Log directory')
    parser.add_argument(
        '-c', '--num_cores', nargs='?', default=1, type=int,
        help='Number of cores to be used')
    args = parser.parse_args()

    query_fasta = os.path.abspath(args.query_fasta[0])
    db_fasta = os.path.abspath(args.db_fasta[0])
    # abspath for consistency with the sibling scripts (was a bare
    # relative path here only)
    log_dir = os.path.abspath(args.log_dir)
    num_cores = args.num_cores

    # Check input FASTA is valid. os.path.exists replaces the old
    # glob() call: glob mis-handles paths containing wildcard characters
    # and is the wrong tool for a plain existence test.
    if not os.path.exists(query_fasta):
        print('[ERROR] No such file: {}'.format(query_fasta))
        sys.exit(2)

    # Create necessary dirs
    create_dir(log_dir)

    # Set logging
    log_file = os.path.join(log_dir, 'run_blastp_reduce.log')
    global logger_time, logger_txt
    logger_time, logger_txt = set_logging(log_file)

    # Run functions :) Slow is as good as Fast
    logger_time.debug('START: BLASTp')
    run_blastp(query_fasta, db_fasta, log_dir, num_cores)
    logger_time.debug('DONE : BLASTp')
def main(argv):
    '''Parse arguments and run BUSCO on a protein FASTA.'''
    usage = (
        'run_busco.py -i <input_fasta> -o <output_dir> -l <log_dir> '
        '-c <num_cores>')
    parser = ArgumentParser(usage=usage)
    parser.add_argument(
        '-i', '--input_fasta', nargs=1, required=True,
        help='Input protein FASTA file')
    parser.add_argument(
        '-o', '--output_dir', nargs='?', default='busco_out',
        help='Output directory (default: busco_out)')
    parser.add_argument(
        '-l', '--log_dir', nargs='?', default='logs',
        help='Log directory (default: logs)')
    parser.add_argument(
        '-c', '--num_cores', nargs='?', default=1, type=int,
        help='Number of cores to be used (default: 1)')
    args = parser.parse_args()

    input_fasta = os.path.abspath(args.input_fasta[0])
    output_dir = os.path.abspath(args.output_dir)
    log_dir = os.path.abspath(args.log_dir)
    num_cores = args.num_cores

    # Create necessary dir
    create_dir(output_dir, log_dir)

    # Set logging
    log_file = os.path.join(log_dir, 'run_busco.log')
    global logger_time, logger_txt
    logger_time, logger_txt = set_logging(log_file)

    # Run functions :) Slow is always better than Fast
    run_busco(input_fasta, output_dir, log_dir, num_cores)
def main():
    '''Parse arguments and run BUSCO with a user-chosen lineage dataset.'''
    parser = ArgumentParser(
        usage='run_busco.py -i <input_fasta> -d <lineage_dataset>')
    parser.add_argument(
        '-i', '--input_fasta', nargs=1, required=True,
        help='Input protein FASTA file')
    parser.add_argument(
        '-d', '--lineage_dataset', nargs=1, required=True,
        help='BUSCO lineage dataset (run "busco --list-datasets" for the list)'
    )
    parser.add_argument(
        '-o', '--output_dir', nargs='?', default='busco_out',
        help='Output directory (default: busco_out)')
    parser.add_argument(
        '-l', '--log_dir', nargs='?', default='logs',
        help='Log directory (default: logs)')
    args = parser.parse_args()

    input_fasta = os.path.abspath(args.input_fasta[0])
    lineage_dataset = args.lineage_dataset[0]
    output_dir = os.path.abspath(args.output_dir)
    log_dir = os.path.abspath(args.log_dir)

    # Create necessary dir
    create_dir(output_dir, log_dir)

    # Set logging
    log_file = os.path.join(log_dir, 'run_busco.log')
    logger_time, logger_txt = set_logging(log_file)

    # Bundle logging pieces into the tuple run_busco expects
    log_tup = (log_dir, logger_time, logger_txt)
    run_busco(input_fasta, lineage_dataset, output_dir, log_tup)
def main():
    '''Parse arguments and run RepeatModeler on a genome assembly.'''
    parser = ArgumentParser(usage='run_repeat_modeler.py -g <genome_assembly>')
    parser.add_argument(
        '-g', '--genome_assembly', nargs=1, required=True,
        help='Genome assembly file in FASTA format')
    parser.add_argument(
        '-o', '--output_dir', nargs='?', default='repeat_modeler_out',
        help='Output directory (default: repeat_modeler_out)')
    parser.add_argument(
        '-l', '--log_dir', nargs='?', default='logs',
        help='Log directory (default: logs)')
    parser.add_argument(
        '-c', '--num_cores', nargs='?', default=1, type=int,
        help='Number of cores to be used')
    args = parser.parse_args()

    genome_assembly = os.path.abspath(args.genome_assembly[0])
    output_dir = os.path.abspath(args.output_dir)
    log_dir = os.path.abspath(args.log_dir)
    num_cores = args.num_cores

    # Create necessary dirs
    create_dir(output_dir, log_dir)

    # Set logging
    log_file = os.path.join(log_dir, 'run_repeat_modeler.log')
    logger = set_logging(log_file)

    # Run functions :) Slow is as good as Fast
    run_repeat_modeler(genome_assembly, output_dir, log_dir, num_cores, logger)
def main(argv):
    '''Parse arguments and run RepeatModeler (global-logger variant).'''
    parser = ArgumentParser(usage='run_repeat_modeler.py -g <genome_assembly>')
    parser.add_argument(
        '-g', '--genome_assembly', nargs=1, required=True,
        help='Genome assembly file in FASTA format')
    parser.add_argument(
        '-o', '--output_dir', nargs='?', default='repeat_modeler_out',
        help='Output directory')
    parser.add_argument(
        '-l', '--log_dir', nargs='?', default='logs',
        help='Log directory')
    parser.add_argument(
        '-c', '--num_cores', nargs='?', default=1, type=int,
        help='Number of cores to be used')
    args = parser.parse_args()

    genome_assembly = os.path.abspath(args.genome_assembly[0])
    output_dir = os.path.abspath(args.output_dir)
    log_dir = os.path.abspath(args.log_dir)
    num_cores = args.num_cores

    # Create necessary dirs
    create_dir(output_dir, log_dir)

    # Set logging
    log_file = os.path.join(log_dir, 'run_repeat_modeler.log')
    global logger_time, logger_txt
    logger_time, logger_txt = set_logging(log_file)

    # Run functions :) Slow is as good as Fast
    run_repeat_modeler(genome_assembly, output_dir, log_dir, num_cores)
def main(argv):
    '''Parse arguments, validate sequences, and run pfam_scan.'''
    parser = ArgumentParser(
        usage='run_pfam_scan.py -i <input_fasta> -l <log_dir>')
    parser.add_argument(
        '-i', '--input_fasta', nargs=1, required=True,
        help='Input protein FASTA format')
    parser.add_argument(
        '-l', '--log_dir', nargs='?', default='logs',
        help='Log directory')
    parser.add_argument(
        '-c', '--num_cores', nargs='?', default=1, type=int,
        help='Number of cores to be used')
    args = parser.parse_args()

    input_fasta = os.path.abspath(args.input_fasta[0])
    log_dir = os.path.abspath(args.log_dir)
    num_cores = args.num_cores

    # Create necessary dirs
    create_dir(log_dir)

    # Set logging
    log_file = os.path.join(log_dir, 'run_pfam_scan.log')
    global logger_time, logger_txt
    logger_time, logger_txt = set_logging(log_file)

    # Sanitize sequences first, then scan against Pfam
    new_input_fasta = check_sequence(input_fasta)
    run_pfam_scan(new_input_fasta, log_dir, num_cores)
def main(argv):
    '''Main function: parse arguments and run InterProScan on a FASTA.

    Python 2-only `print` statements were modernized to the print()
    call form, which behaves identically for a single string argument
    on both Python 2 and 3.
    '''
    optparse_usage = (
        'run_interproscan.py -i <input_fasta> -o <output_dir> -l <log_dir>'
        ' -C <config_file>')
    parser = ArgumentParser(usage=optparse_usage)
    parser.add_argument(
        '-i', '--input_fasta', dest='input_fasta', nargs=1,
        help='Input protein FASTA format')
    parser.add_argument(
        '-o', '--output_dir', dest='output_dir', nargs=1,
        help='Output directory')
    parser.add_argument(
        '-l', '--log_dir', dest='log_dir', nargs=1,
        help='Log directory')
    parser.add_argument(
        '-C', '--config_file', dest='config_file', nargs=1,
        help='Config file generated by check_dependencies.py')
    args = parser.parse_args()

    # Manual required-argument checks (kept to preserve the exact
    # error messages and exit codes)
    if args.input_fasta:
        input_fasta = os.path.abspath(args.input_fasta[0])
    else:
        print('[ERROR] Please provide INPUT FASTA')
        sys.exit(2)

    if args.output_dir:
        output_dir = os.path.abspath(args.output_dir[0])
    else:
        print('[ERROR] Please provide OUTPUT DIRECTORY')
        sys.exit(2)

    if args.log_dir:
        log_dir = os.path.abspath(args.log_dir[0])
    else:
        print('[ERROR] Please provide LOG DIRECTORY')
        sys.exit(2)

    if args.config_file:
        config_file = os.path.abspath(args.config_file[0])
    else:
        print('[ERROR] Please provide CONFIG FILE')
        sys.exit(2)

    # Create necessary dirs
    create_dir(output_dir, log_dir)

    # Set logging
    log_file = os.path.join(log_dir, 'pipeline', 'run_interproscan_pfam.log')
    global logger_time, logger_txt
    logger_time, logger_txt = set_logging(log_file)

    # Run functions :) Slow is as good as fast
    interproscan_bin = parse_config(config_file)
    new_input_fasta = check_sequence(input_fasta)
    run_iprscan(new_input_fasta, output_dir, log_dir, interproscan_bin)
def main():
    '''Main function: four-round iterative Maker annotation per EST file.

    For each EST file: run Maker (run1), then alternate SNAP training and
    re-running Maker (runs 2-4), with a GeneMark HMM added for the final
    run. The GFF from each EST round seeds the next round's first run.
    '''
    optparse_usage = (
        'run_maker.py -i <input_fasta> -p <protein_db_fasta> -c <num_cores> '
        '-R <repeat_model> -e <est_files>')
    parser = ArgumentParser(usage=optparse_usage)
    parser.add_argument(
        '-i', '--input_fasta', nargs=1, required=True,
        help='Input genome sequence in FASTA format')
    parser.add_argument(
        '-a', '--augustus_species', nargs=1, required=True,
        help='"augustus --species=help" would be helpful')
    parser.add_argument(
        '-p', '--protein_db_fasta', nargs='+', required=True,
        # Fixed help-text typo: "foramt" -> "format"
        help='Protein db in FASTA format')
    parser.add_argument(
        '-R', '--repeat_model', nargs=1, required=True,
        help='De novo repeat model by RepeatModeler: consensi.fa.classified')
    parser.add_argument(
        '-e', '--est_files', nargs='+', required=True,
        help='Multiple EST data if available')
    parser.add_argument(
        '-o', '--output_dir', nargs='?', default='maker_out',
        help='Output directory')
    parser.add_argument(
        '-c', '--num_cores', nargs='?', default=1, type=int,
        help='Number of cores to be used')
    parser.add_argument(
        '-l', '--log_dir', nargs='?', default='logs',
        help='Log directory')
    parser.add_argument(
        '-t', '--translation_table', nargs='?', default=1, type=int,
        help='Translation table (default: 1)')
    parser.add_argument(
        '--gmes_fungus', action='store_true',
        help='--fungus flag in GeneMark')
    args = parser.parse_args()

    input_fasta = os.path.abspath(args.input_fasta[0])
    output_dir = os.path.abspath(args.output_dir)
    log_dir = os.path.abspath(args.log_dir)
    augustus_species = args.augustus_species[0]
    protein_db_fastas = [os.path.abspath(x) for x in args.protein_db_fasta]
    num_cores = args.num_cores
    repeat_model = os.path.abspath(args.repeat_model[0])
    est_files = [os.path.abspath(x) for x in args.est_files]
    translation_table = args.translation_table
    if args.gmes_fungus:
        gmes_fungus = '--fungus'
    else:
        gmes_fungus = ''

    # Create necessary directory
    create_dir(output_dir, log_dir)

    # Set logging
    log_file = os.path.join(log_dir, 'run_maker.log')
    logger = set_logging(log_file)
    logger_time, logger_txt = logger

    # Run Maker on each EST file; all_gff_file produced by the previous
    # EST round is fed into the next round's first Maker run
    all_gff_file = ''
    for est_file in est_files:
        # Create per-EST directory
        est_prefix = os.path.basename(os.path.splitext(est_file)[0])
        est_prefix = est_prefix.replace('Trinity_', '')
        est_dir = os.path.join(output_dir, est_prefix)
        # os.path.exists replaces the old glob() existence check: glob
        # mis-handles paths containing wildcard characters
        if not os.path.exists(est_dir):
            os.mkdir(est_dir)

        # Check maker is already done
        run_flag_run1 = check_maker_finished(
            output_dir, input_fasta, '1', est_prefix)

        # Run Maker batch
        logger_time.debug('START running Maker run1')
        if run_flag_run1:
            run_maker_batch(
                input_fasta, output_dir, log_dir, protein_db_fastas,
                num_cores, repeat_model, est_file, all_gff_file, logger)
        else:
            logger_txt.debug('[Note] Running Maker has already been finished')
        logger_time.debug('DONE running Maker run1')

        # Train run1 & run Maker run2
        all_gff_file_run1 = collect_result(
            input_fasta, output_dir, '1', est_prefix, logger)
        logger_time.debug('START training run1 & running maker run2')
        snap_hmm_file_run1 = train_snap(
            output_dir, all_gff_file_run1, '1', est_prefix, logger)
        run_flag_run2 = check_maker_finished(
            output_dir, input_fasta, '2', est_prefix)
        if run_flag_run2:
            run_maker_trained(
                input_fasta, output_dir, log_dir, augustus_species,
                num_cores, snap_hmm_file_run1, all_gff_file_run1, '2',
                est_prefix, logger)
        else:
            logger_txt.debug('[Note] Running Maker has already been finished')
        logger_time.debug('DONE training run1 & running maker run2')

        # Train run2 & run Maker run3
        all_gff_file_run2 = collect_result(
            input_fasta, output_dir, '2', est_prefix, logger)
        logger_time.debug('START training run2 & running maker run3')
        snap_hmm_file_run2 = train_snap(
            output_dir, all_gff_file_run2, '2', est_prefix, logger)
        run_flag_run3 = check_maker_finished(
            output_dir, input_fasta, '3', est_prefix)
        if run_flag_run3:
            run_maker_trained(
                input_fasta, output_dir, log_dir, augustus_species,
                num_cores, snap_hmm_file_run2, all_gff_file_run2, '3',
                est_prefix, logger)
        else:
            logger_txt.debug('[Note] Running Maker has already been finished')
        logger_time.debug('DONE training run2 & running maker run3')

        # Now, for final run, get masked assembly and get GeneMark hmm model
        masked_assembly = get_masked_asm(output_dir, est_files, logger)

        # Run gmes or gmsn
        eukgmhmmfile = run_gmes(
            masked_assembly, num_cores, output_dir, log_dir, gmes_fungus,
            logger)

        # Train run3 & run Maker run4
        all_gff_file_run3 = collect_result(
            input_fasta, output_dir, '3', est_prefix, logger)
        logger_time.debug('START training run3 & running maker run4')
        snap_hmm_file_run3 = train_snap(
            output_dir, all_gff_file_run3, '3', est_prefix, logger)
        run_flag_run4 = check_maker_finished(
            output_dir, input_fasta, '4', est_prefix)
        if run_flag_run4:
            run_maker_trained(
                input_fasta, output_dir, log_dir, augustus_species,
                num_cores, snap_hmm_file_run3, all_gff_file_run3, '4',
                est_prefix, logger, eukgmhmmfile)
        else:
            logger_txt.debug('[Note] Running Maker has already been finished')
        logger_time.debug('DONE training run3 & running maker run4')

        # Get final GFF3 & FASTA
        collect_result_final(
            input_fasta, output_dir, est_prefix, translation_table, logger)
        all_gff_file = collect_result(
            input_fasta, output_dir, '4', est_prefix, logger)
def main():
    '''Main function: filter gene models combined from multiple GFF3 sets.

    Loads pickled evidence dictionaries (BLASTp, BUSCO, Pfam, BLASTn,
    "bad" genes), self-filters each input GFF3, then filters the union
    and writes the final gene set. All pickle files are now opened via
    context managers — the original leaked every file handle.
    '''
    argparse_usage = (
        'filter_gff3s.py -a <genome_assembly> -i <input_gff3s> '
        '-m <mapping_file> -b <blastp_dict> -B <busco_dict> -p <pfam_dict> '
        '-N <blastn_dict> -g <bad_dict> -n <nr_prot_file> -o <output_dir>')
    parser = ArgumentParser(usage=argparse_usage)
    parser.add_argument(
        '-a', '--genome_assembly', nargs=1, required=True,
        help='Genome assembly file')
    parser.add_argument(
        '-i', '--input_gff3s', nargs='+', required=True,
        help='Multiple gff3 files')
    parser.add_argument(
        '-m', '--mapping_file', nargs=1, required=True,
        help='Mapping txt file (make_nr_prot.py)')
    parser.add_argument(
        '-b', '--blastp_dict', nargs=1, required=True,
        help='Parsed blastp output in dictionary (import_blastp.py)')
    parser.add_argument(
        '-B', '--busco_dict', nargs=1, required=True,
        help='Parsed BUSCO output in dictionary (import_busco.py)')
    parser.add_argument(
        '-p', '--pfam_dict', nargs=1, required=True,
        help='Parsed Pfam_scan output in dictionary (import_pfam.py)')
    parser.add_argument(
        '-N', '--blastn_dict', nargs=1, required=True,
        help='Parsed BLASTn output in dictionary (import_blastn.py)')
    parser.add_argument(
        '-g', '--bad_dict', nargs=1, required=True,
        help='Parsed IPRscan output in dictionary')
    parser.add_argument(
        '-n', '--nr_prot_file', nargs=1, required=True,
        help='nr_prot.faa file (make_nr_prot.py)')
    parser.add_argument(
        '-o', '--output_dir', nargs='?', default='gene_filtering',
        help='Output directory')
    parser.add_argument(
        '-l', '--log_dir', nargs='?', default='log_dir',
        help='Log directory')
    args = parser.parse_args()

    genome_assembly = os.path.abspath(args.genome_assembly[0])
    input_gff3s = [os.path.abspath(x) for x in args.input_gff3s]
    mapping_file = os.path.abspath(args.mapping_file[0])
    blastp_dict = os.path.abspath(args.blastp_dict[0])
    busco_dict = os.path.abspath(args.busco_dict[0])
    pfam_dict = os.path.abspath(args.pfam_dict[0])
    blastn_dict = os.path.abspath(args.blastn_dict[0])
    bad_dict = os.path.abspath(args.bad_dict[0])
    # Context manager closes the handle (was pickle.load(open(...)))
    with open(bad_dict, 'rb') as fh:
        d_bad = pickle.load(fh)
    nr_prot_file = os.path.abspath(args.nr_prot_file[0])
    output_dir = os.path.abspath(args.output_dir)
    log_dir = os.path.abspath(args.log_dir)

    # Create necessary dirs
    create_dir(output_dir, log_dir)

    # Set logging
    log_file = os.path.join(log_dir, 'filter_gff3s.log')
    logger_time = set_logging(log_file)[0]

    # Run functions :) Slow is as good as Fast
    logger_time.debug('START: Filtering GFF3')
    d_mapping, d_mapping_rev = import_mapping(mapping_file)

    # Import dictionaries
    with open(blastp_dict, 'rb') as fh:
        d_blastp = pickle.load(fh)
    with open(busco_dict, 'rb') as fh:
        d_busco = pickle.load(fh)
    with open(pfam_dict, 'rb') as fh:
        d_pfam = pickle.load(fh)
    with open(blastn_dict, 'rb') as fh:
        d_blastn = pickle.load(fh)

    # Self-filtering: filter each input GFF3 on its own
    for input_gff3 in input_gff3s:
        prefix = re.sub(r'\.gff3$', '', os.path.basename(input_gff3))
        d_gff3, d_gene, d_cds, d_cds_len, d_exon = import_gff3([input_gff3])
        self_filtered = filtering(
            d_cds, d_cds_len, d_blastp, d_busco, d_pfam, d_blastn, d_bad,
            output_dir)
        outfile_self = os.path.join(
            output_dir, '{}_filtered.list'.format(prefix))
        cds_len_filtered = 0
        with open(outfile_self, 'w') as outhandle_self:
            for tup in self_filtered:
                outhandle_self.write('{}\n'.format(tup[1]))
                cds_len_filtered += d_cds_len[tup]

    # Filtering across all inputs combined
    d_gff3, d_gene, d_cds, d_cds_len, d_exon = import_gff3(input_gff3s)
    final_gene_set = filtering(
        d_cds, d_cds_len, d_blastp, d_busco, d_pfam, d_blastn, d_bad,
        output_dir)
    d_prot = import_prot(nr_prot_file, d_mapping_rev)
    write_final_prots(final_gene_set, d_mapping, output_dir)
    write_files(
        genome_assembly, final_gene_set, d_gene, d_gff3, d_prot, d_exon,
        output_dir, d_cds)
    # NOTE(review): this total is computed but never used downstream in
    # this function — kept for behavior parity with the original
    cds_len_final = 0
    for tup in final_gene_set:
        cds_len_final += d_cds_len[tup]
    logger_time.debug('DONE : Filtering GFF3')
def main():
    '''Parse arguments, validate BAM files, and run Trinity.'''
    usage = (
        'run_trinity.py -b <bam_files> -o <output_dir> -l <log_dir> '
        '-c <num_cores> -m <max_intron>')
    parser = ArgumentParser(usage=usage)
    parser.add_argument(
        '-b', '--bam_files', nargs='+', required=True,
        help='Sorted BAM files generated by HISAT2')
    parser.add_argument(
        '-o', '--output_dir', nargs='?', default='trinity_out',
        help='Output directory (default: trinity_out)')
    parser.add_argument(
        '-l', '--log_dir', nargs='?', default='logs',
        help='Log directory (default: logs)')
    parser.add_argument(
        '-c', '--num_cores', nargs='?', default=1, type=int,
        help='Number of cores to be used (default: 1)')
    parser.add_argument(
        '-m', '--max_intron', nargs='?', default=2000, type=int,
        help='Max intron length (Default: 2000 bp)')
    parser.add_argument(
        '--jaccard_clip', action='store_true',
        help='--jaccard_clip flag in Trinity')
    args = parser.parse_args()

    output_dir = os.path.abspath(args.output_dir)
    log_dir = os.path.abspath(args.log_dir)
    bam_files = [os.path.abspath(x) for x in args.bam_files]
    num_cores = args.num_cores
    max_intron = args.max_intron
    jaccard_clip_flag = '--jaccard_clip' if args.jaccard_clip else ''

    # Create necessary dirs
    create_dir(output_dir, log_dir)

    # Set logging
    log_file = os.path.join(log_dir, 'run_trinity.log')
    logger = set_logging(log_file)
    logger_txt = logger[1]

    # Keep only BAM paths that actually resolve on disk
    bam_files = [x for x in bam_files if glob(x)]
    if not bam_files:
        logger_txt.debug('[ERROR] You provided wrong BAM FILES. Please check')
        sys.exit(2)

    # Run functions :)
    run_trinity(
        bam_files, output_dir, log_dir, num_cores, max_intron,
        jaccard_clip_flag, logger)
def main(argv):
    '''Main function: legacy four-round Maker pipeline (config-file driven).

    Modernized the Python 2-only `print` statements to print() calls
    (identical output for a single string on Py2 and Py3) and replaced
    glob()-as-existence-check with os.path.exists.
    '''
    optparse_usage = (
        'run_maker.py -i <input_fasta> -r <root_dir> -p <project_name>'
        ' -P <protein_db_fastas> -c <num_cores> -R <repeat_model>'
        ' -e <est_files> -C <config_file>')
    parser = ArgumentParser(usage=optparse_usage)
    parser.add_argument(
        '-i', '--input_fasta', dest='input_fasta', nargs=1,
        help='Input genome sequence in FASTA format')
    parser.add_argument(
        '-r', '--root_dir', dest='root_dir', nargs=1,
        help='Resulting files will be generated here')
    parser.add_argument(
        '-a', '--augustus_species', dest='augustus_species', nargs=1,
        help='"augustus --species=help" would be helpful')
    parser.add_argument(
        '-p', '--project_name', dest='project_name', nargs=1,
        help='Output prefix for resulting files without space')
    parser.add_argument(
        '-P', '--protein_db_fastas', dest='protein_db_fastas', nargs=1,
        help='Protein db in FASTA foramt. It could be SwissProt '
             'or UniProt database')
    parser.add_argument(
        '-c', '--num_cores', dest='num_cores', nargs=1,
        help='Number of cores to be used')
    parser.add_argument(
        '-R', '--repeat_model', dest='repeat_model', nargs=1,
        help='Custom repeat model by RepeatModeler')
    parser.add_argument(
        '-e', '--est_files', dest='est_files', nargs='*',
        help='Multiple EST data if available')
    parser.add_argument(
        '-C', '--config_file', dest='config_file', nargs=1,
        help='Config file generated by check_dependencies.py')
    parser.add_argument(
        '--gmes_fungus', dest='gmes_fungus', action='store_true',
        help='--fungus flag in GeneMark')
    args = parser.parse_args()

    # Manual required-argument checks (kept to preserve exact messages
    # and exit codes)
    if args.input_fasta:
        input_fasta = os.path.abspath(args.input_fasta[0])
    else:
        print('[ERROR] Please provide INPUT FASTA')
        sys.exit(2)

    if args.root_dir:
        root_dir = os.path.abspath(args.root_dir[0])
    else:
        print('[ERROR] Please provide ROOT DIRECTORY')
        sys.exit(2)

    if args.augustus_species:
        augustus_species = args.augustus_species[0]
    else:
        print('[ERROR] Please provide AUGUSTUS SPECIES')
        sys.exit(2)

    if args.project_name:
        project_name = args.project_name[0]
    else:
        print('[ERROR] Please provide PROJECT NAME')
        sys.exit(2)

    if args.protein_db_fastas:
        protein_db_fastas = [
            os.path.abspath(x) for x in args.protein_db_fastas
        ]
    else:
        print('[ERROR] Please provide PROTEIN DB FASTA FILES')
        sys.exit(2)

    if args.num_cores:
        num_cores = args.num_cores[0]
    else:
        num_cores = 1

    if args.repeat_model:
        repeat_model = os.path.abspath(args.repeat_model[0])
    else:
        print('[ERROR] Please provide REPEAT MODEL')
        sys.exit(2)

    if args.est_files:
        est_files = [os.path.abspath(x) for x in args.est_files]
    else:
        est_files = ['']

    if args.config_file:
        config_file = os.path.abspath(args.config_file[0])
    else:
        print('[ERROR] Please provide CONFIG FILE')
        sys.exit(2)

    if args.gmes_fungus:
        gmes_fungus = '--fungus'
    else:
        gmes_fungus = ''

    # Create necessary directory
    create_dir(root_dir)

    # Set logging
    log_file = os.path.join(root_dir, 'logs', 'pipeline', 'run_maker.log')
    global logger_time, logger_txt
    logger_time, logger_txt = set_logging(log_file)

    maker_bin, genemark_bin = parse_config(config_file)

    # Run Maker on each EST file; all_gff_file from the previous round
    # seeds the next round's first Maker run
    all_gff_file = ''
    for est_file in est_files:
        # Create per-EST directory
        est_prefix = (os.path.basename(est_file).split('.')[0].replace(
            'Trinity_', ''))
        # FIXME(review): `software` is not defined anywhere in this
        # function — this line raises NameError at runtime in the
        # original as well. Kept as-is; confirm the intended path
        # component (likely a maker output subdirectory name).
        est_dir = os.path.join(root_dir, software, est_prefix)
        # os.path.exists replaces the old glob() existence check
        if not os.path.exists(est_dir):
            os.mkdir(est_dir)

        # Check maker is already done
        run_flag_run1 = check_maker_finished(
            root_dir, input_fasta, '1', est_prefix)

        # Run Maker batch
        logger_time.debug('START running Maker run1')
        if run_flag_run1:
            run_maker_batch(
                input_fasta, root_dir, augustus_species, protein_db_fastas,
                num_cores, repeat_model, est_file, all_gff_file, maker_bin)
        else:
            logger_txt.debug('Running Maker has already been finished')
        logger_time.debug('DONE running Maker run1')

        # Train run1 & run Maker run2
        all_gff_file_run1 = collect_result(
            input_fasta, root_dir, project_name, '1', est_prefix)
        logger_time.debug('START training run1 & running maker run2')
        snap_hmm_file_run1 = train_snap(
            root_dir, all_gff_file_run1, '1', est_prefix, maker_bin)
        run_flag_run2 = check_maker_finished(
            root_dir, input_fasta, '2', est_prefix)
        if run_flag_run2:
            run_maker_trained(
                input_fasta, root_dir, augustus_species, num_cores,
                snap_hmm_file_run1, all_gff_file_run1, '2', est_prefix,
                maker_bin)
        else:
            logger_txt.debug('Running Maker has already been finished')
        logger_time.debug('DONE training run1 & running maker run2')

        # Train run2 & run Maker run3
        all_gff_file_run2 = collect_result(
            input_fasta, root_dir, project_name, '2', est_prefix)
        logger_time.debug('START training run2 & running maker run3')
        snap_hmm_file_run2 = train_snap(
            root_dir, all_gff_file_run2, '2', est_prefix, maker_bin)
        run_flag_run3 = check_maker_finished(
            root_dir, input_fasta, '3', est_prefix)
        if run_flag_run3:
            run_maker_trained(
                input_fasta, root_dir, augustus_species, num_cores,
                snap_hmm_file_run2, all_gff_file_run2, '3', est_prefix,
                maker_bin)
        else:
            logger_txt.debug('Running Maker has already been finished')
        logger_time.debug('DONE training run2 & running maker run3')

        # Now, for final run, get masked assembly and get GeneMark hmm model
        masked_assembly = get_masked_asm(root_dir, est_files)

        # Run gmes or gmsn
        eukgmhmmfile = run_gmes(
            masked_assembly, num_cores, root_dir, genemark_bin, gmes_fungus)

        # Train run3 & run Maker run4
        all_gff_file_run3 = collect_result(
            input_fasta, root_dir, project_name, '3', est_prefix)
        logger_time.debug('START training run3 & running maker run4')
        snap_hmm_file_run3 = train_snap(
            root_dir, all_gff_file_run3, '3', est_prefix, maker_bin)
        run_flag_run4 = check_maker_finished(
            root_dir, input_fasta, '4', est_prefix)
        if run_flag_run4:
            run_maker_trained(
                input_fasta, root_dir, augustus_species, num_cores,
                snap_hmm_file_run3, all_gff_file_run3, '4', est_prefix,
                maker_bin, eukgmhmmfile)
        else:
            logger_txt.debug('Running Maker has already been finished')
        logger_time.debug('DONE training run3 & running maker run4')

        # Get final GFF3 & FASTA
        collect_result_final(input_fasta, root_dir, project_name, est_prefix)
        all_gff_file = collect_result(
            input_fasta, root_dir, project_name, '4', est_prefix)
def main(argv): argparse_usage = ( 'fungap.py -g <genome_assembly> -r <trans_read_files> ' '-o <output_dir> -p <project_name> -a <augustus_species> ' '-O <org_id> -s <sister_proteome>') parser = ArgumentParser(usage=argparse_usage) parser.add_argument("-o", "--output_dir", dest="output_dir", nargs=1, help="Output directory") parser.add_argument("-r", "--trans_read_files", dest="trans_read_files", nargs=2, help="Multiple transcriptome read files in FASTAQ" " (two paired-end files)") parser.add_argument( "-p", "--project_name", dest="project_name", nargs=1, help="Project name without space. e.g. Mag, Eco, Pst_LUM") parser.add_argument("-g", "--genome_assembly", dest="genome_assembly", nargs=1, help="Genome assembly file in FASTA format") parser.add_argument("-a", "--augustus_species", dest="augustus_species", nargs=1, help="AUGUSTUS species") parser.add_argument( "-O", "--org_id", dest="org_id", nargs=1, help="Organism ID. E.g. Hypma for Hypsizygus marmoreus") parser.add_argument("-s", "--sister_proteome", dest="sister_proteome", nargs=1, help="Sister proteome sequences in .faa") parser.add_argument("-c", "--num_cores", dest="num_cores", nargs=1, help="Number of cores to be used") parser.add_argument( "-H", "--with_hisat2", dest="with_hisat2", nargs='?', help="User-defined Hisat2 installation path (binary directory)") parser.add_argument( "-t", "--with_trinity", dest="with_trinity", nargs='?', help="User-defined Trinity installation path (binary directory)") parser.add_argument( "-m", "--with_maker", dest="with_maker", nargs='?', help="User-defined Maker installation path (binary directory)") parser.add_argument( "-R", "--with_repeat_modeler", dest="with_repeat_modeler", nargs='?', help="User-defined Repeat Modeler installation path (binary directory)" ) parser.add_argument( "-b", "--with_braker1", dest="with_braker1", nargs='?', help="User-defined Braker1 installation path (binary directory)") parser.add_argument( "-B", "--with_busco", dest="with_busco", nargs='?', 
help="User-defined BUSCO installation path (binary directory)") parser.add_argument( "-i", "--with_interproscan", dest="with_interproscan", nargs='?', help="User-defined InterproScan installation path (binary directory)") # Options for non-fungus genome parser.add_argument( '--no_braker_fungus', dest='no_braker_fungus', action='store_true', help='No --fungus flag in BRAKER for non-fungus genomes') parser.add_argument( '--no_jaccard_clip', dest='no_jaccard_clip', action='store_true', help='No --jaccard_clip flag in Trinity for non-fungus genomes') parser.add_argument( '--no_genemark_fungus', dest='no_genemark_fungus', action='store_true', help='No --fungus flag in GeneMark for non-fungus genomes') parser.add_argument("-M", "--max_intron", dest="max_intron", nargs='?', help="Max intron length (Default: 2,000 bp)") args = parser.parse_args() if args.output_dir: output_dir = os.path.abspath(args.output_dir[0]) else: print '[ERROR] Please provide OUTPUT DIRECTORY' sys.exit(2) if args.trans_read_files: trans_read_files = [os.path.abspath(x) for x in args.trans_read_files] else: print '[ERROR] Please provide TRANSCRIPTOME READ FILES' sys.exit(2) if args.project_name: project_name = args.project_name[0] else: print '[ERROR] Please provide PROJECTN NAME' sys.exit(2) if args.genome_assembly: genome_assembly = os.path.abspath(args.genome_assembly[0]) else: print '[ERROR] Please provide transcriptome read files' sys.exit(2) if args.augustus_species: augustus_species = args.augustus_species[0] else: print '[ERROR] Please provide transcriptome AUGUSTUS SPECIES' sys.exit(2) if args.org_id: org_id = args.org_id[0] else: print '[ERROR] Please provide transcriptome ORGANISM ID' sys.exit(2) if args.sister_proteome: sister_proteome = os.path.abspath(args.sister_proteome[0]) else: print '[ERROR] Please provide TRANSCRIPTOME READ FILES' sys.exit(2) if args.num_cores: num_cores = args.num_cores[0] else: print '[ERROR] Please provide NUMBER OF CORES' sys.exit(2) if args.with_hisat2: 
with_hisat2 = os.path.abspath(args.with_hisat2) else: with_hisat2 = '' if args.with_trinity: with_trinity = os.path.abspath(args.with_trinity) else: with_trinity = '' if args.with_maker: with_maker = os.path.abspath(args.maker) else: with_maker = '' if args.with_repeat_modeler: with_repeat_modeler = os.path.abspath(args.with_repeat_modeler) else: with_repeat_modeler = '' if args.with_braker1: with_braker1 = os.path.abspath(args.with_braker1) else: with_braker1 = '' if args.with_busco: with_busco = os.path.abspath(args.with_busco) else: with_busco = '' if args.with_interproscan: with_interproscan = os.path.abspath(args.with_interproscan) else: with_interproscan = '' # For non-fungus genomes if args.no_braker_fungus: no_braker_fungus = '' else: no_braker_fungus = '--fungus' if args.no_jaccard_clip: no_jaccard_clip = '' else: no_jaccard_clip = '--jaccard_clip' if args.no_genemark_fungus: no_genemark_fungus = '' else: no_genemark_fungus = '--gmes_fungus' if args.max_intron: max_intron = int(args.max_intron) else: max_intron = 2000 # Create nessasary dirs create_dir(output_dir) # Set logging log_file = os.path.join(output_dir, 'logs', 'pipeline', 'fungap.log') global logger_time, logger_txt logger_time, logger_txt = set_logging(log_file) logger_txt.debug("\n============ New Run %s ============" % (datetime.now())) # Run functions :) Slow is as good as Fast config_file = run_check_dependencies(output_dir, with_hisat2, with_trinity, with_maker, with_repeat_modeler, with_braker1, with_busco, with_interproscan) trans_bams = run_hisat2(genome_assembly, trans_read_files, output_dir, num_cores, config_file, max_intron) trinity_asms = run_trinity(trans_bams, output_dir, project_name, num_cores, config_file, no_jaccard_clip, max_intron) repeat_model_file = run_repeat_modeler(genome_assembly, output_dir, project_name, num_cores, config_file) maker_gff3s, maker_faas = run_maker(genome_assembly, output_dir, augustus_species, project_name, sister_proteome, num_cores, 
repeat_model_file, trinity_asms, config_file, no_genemark_fungus) # Get masked assembly masked_assembly = os.path.join(output_dir, 'gpre_maker', 'masked_assembly.fasta') # Run Augustus augustus_gff3, augustus_faa = run_augustus(masked_assembly, output_dir, augustus_species) # Run Braker1 braker1_gff3s, braker1_faas = run_braker1(masked_assembly, trans_bams, output_dir, num_cores, config_file, no_braker_fungus) # Run BUSCO on each gene models if not glob(os.path.join(output_dir, 'gpre_busco')): os.mkdir(os.path.join(output_dir, 'gpre_busco')) for maker_faa in maker_faas: maker_prefix = os.path.basename(maker_faa).split('.')[0] maker_busco = os.path.join(output_dir, 'gpre_busco', maker_prefix) run_busco(maker_faa, maker_busco, num_cores, config_file) augustus_prefix = os.path.basename(augustus_faa).split('.')[0] augustus_busco = os.path.join(output_dir, 'gpre_busco', augustus_prefix) run_busco(augustus_faa, augustus_busco, num_cores, config_file) for braker1_faa in braker1_faas: braker1_prefix = os.path.basename(braker1_faa).split('.')[0] braker1_busco = os.path.join(output_dir, 'gpre_busco', braker1_prefix) run_busco(braker1_faa, braker1_busco, num_cores, config_file) busco_dir = os.path.join(output_dir, 'gpre_busco') # Get protein nr by removing identical proteins all_prot_files = maker_faas + [augustus_faa] + braker1_faas nr_prot_file, nr_prot_mapping_file = make_nr_prot(all_prot_files, output_dir) # Run BLASTp with nr prot file blastp_output = run_blastp(nr_prot_file, output_dir, sister_proteome, num_cores) # Run IPRscan with nr prot file ipr_output = run_iprscan(nr_prot_file, output_dir, config_file) # Import BLAST, BUSCO and Pfam score blast_dict_score, blast_dict_evalue = import_blast(blastp_output, nr_prot_mapping_file) busco_dict_score, busco_dict_list = import_busco(busco_dir) pfam_dict_score, pfam_dict_count = import_pfam(ipr_output, nr_prot_mapping_file) # Catch bad genes D_bad_pickle = catch_bad_genes(maker_gff3s, augustus_gff3, braker1_gff3s, 
genome_assembly, output_dir) filter_gff3s(maker_gff3s, augustus_gff3, braker1_gff3s, blast_dict_score, blast_dict_evalue, busco_dict_score, busco_dict_list, pfam_dict_score, pfam_dict_count, D_bad_pickle, nr_prot_file, nr_prot_mapping_file, org_id, output_dir) # Copy output files copy_output(output_dir) # Create markdown create_markdown(genome_assembly, output_dir, trinity_asms)
def main(argv):
    """Filter candidate gene models with evidence scores (BLASTp, BUSCO,
    Pfam, BLASTn, bad-gene flags) and write the final gene set.

    Fixes over the original: pickle/list file handles are closed via
    context managers (they were leaked), and the two unused CDS-length
    accumulators were removed.
    """
    argparse_usage = (
        'filter_gff3s.py -a <genome_assembly> -i <input_gff3s> '
        '-m <mapping_file> -b <blastp_dict> -B <busco_dict> -p <pfam_dict> '
        '-N <blastn_dict> -g <bad_dict> -n <nr_prot_file> -o <output_dir>')
    parser = ArgumentParser(usage=argparse_usage)
    parser.add_argument("-a", "--genome_assembly", nargs=1, required=True,
                        help="Genome assembly file")
    parser.add_argument("-i", "--input_gff3s", nargs='+', required=True,
                        help="Multiple gff3 files")
    parser.add_argument("-m", "--mapping_file", nargs=1, required=True,
                        help="Mapping txt file (make_nr_prot.py)")
    parser.add_argument(
        "-b", "--blastp_dict", nargs=1, required=True,
        help="Parsed blastp output in dictionary (import_blastp.py)")
    parser.add_argument(
        "-B", "--busco_dict", nargs=1, required=True,
        help="Parsed BUSCO output in dictionary (import_busco.py)")
    parser.add_argument(
        "-p", "--pfam_dict", nargs=1, required=True,
        help="Parsed Pfam_scan output in dictionary (import_pfam.py)")
    parser.add_argument(
        "-N", "--blastn_dict", nargs=1, required=True,
        help="Parsed BLASTn output in dictionary (import_blastn.py)")
    parser.add_argument("-g", "--bad_dict", nargs=1, required=True,
                        help="Parsed IPRscan output in dictionary")
    parser.add_argument("-n", "--nr_prot_file", nargs=1, required=True,
                        help="nr_prot.faa file (make_nr_prot.py)")
    parser.add_argument("-o", "--output_dir", nargs='?',
                        default='gene_filtering', help="Output directory")
    parser.add_argument("-l", "--log_dir", nargs='?', default='log_dir',
                        help="Log directory")
    args = parser.parse_args()

    def _load_pickle(path):
        # Close the handle promptly; original cPickle.load(open(...)) leaked it
        with open(path, 'rb') as handle:
            return cPickle.load(handle)

    genome_assembly = os.path.abspath(args.genome_assembly[0])
    input_gff3s = [os.path.abspath(x) for x in args.input_gff3s]
    mapping_file = os.path.abspath(args.mapping_file[0])
    blastp_dict = os.path.abspath(args.blastp_dict[0])
    busco_dict = os.path.abspath(args.busco_dict[0])
    pfam_dict = os.path.abspath(args.pfam_dict[0])
    blastn_dict = os.path.abspath(args.blastn_dict[0])
    bad_dict = os.path.abspath(args.bad_dict[0])
    D_bad = _load_pickle(bad_dict)
    nr_prot_file = os.path.abspath(args.nr_prot_file[0])
    output_dir = os.path.abspath(args.output_dir)
    log_dir = os.path.abspath(args.log_dir)

    # Create necessary dirs
    create_dir(output_dir, log_dir)

    # Set logging
    log_file = os.path.join(log_dir, 'filter_gff3s.log')
    global logger_time, logger_txt
    logger_time, logger_txt = set_logging(log_file)

    # Run functions :) Slow is as good as Fast
    logger_time.debug('START: Filtering GFF3')
    D_mapping, D_mapping_rev = import_mapping(mapping_file)

    # Import evidence dictionaries
    D_blastp = _load_pickle(blastp_dict)
    D_busco = _load_pickle(busco_dict)
    D_pfam = _load_pickle(pfam_dict)
    D_blastn = _load_pickle(blastn_dict)

    # Self-filtering: filter each predictor's gene set on its own and
    # record the surviving model IDs per predictor
    for input_gff3 in input_gff3s:
        prefix = re.sub(r'\.gff3$', '', os.path.basename(input_gff3))
        D_gff3, D_gene, D_cds, D_cds_len, D_exon = import_gff3([input_gff3])
        self_filtered = filtering(D_cds, D_cds_len, D_blastp, D_busco,
                                  D_pfam, D_blastn, D_bad, output_dir)
        outfile_self = os.path.join(output_dir,
                                    '{}_filtered.list'.format(prefix))
        with open(outfile_self, 'w') as outhandle_self:
            for tup in self_filtered:
                outhandle_self.write('{}\n'.format(tup[1]))

    # Filtering over the union of all gene sets -> final gene models
    D_gff3, D_gene, D_cds, D_cds_len, D_exon = import_gff3(input_gff3s)
    final_gene_set = filtering(D_cds, D_cds_len, D_blastp, D_busco, D_pfam,
                               D_blastn, D_bad, output_dir)
    D_prot = import_prot(nr_prot_file, D_mapping_rev)
    write_final_prots(final_gene_set, D_mapping, output_dir)
    write_files(genome_assembly, final_gene_set, D_gene, D_gff3, D_prot,
                D_exon, output_dir, D_cds)
    logger_time.debug('DONE : Filtering GFF3')
def main(argv): argparse_usage = ( 'run_blastn.py -q <query_fasta> -d <db_fasta> -o <output_prefix> ' '-l <log_dir>' ) parser = ArgumentParser(usage=argparse_usage) parser.add_argument( "-q", "--query_fasta", dest="query_fasta", nargs=1, help="input fasta file" ) parser.add_argument( "-d", "--db_fasta", dest="db_fasta", nargs=1, help="input fasta file" ) parser.add_argument( "-o", "--output_prefix", dest="output_prefix", nargs=1, help="Output prefix" ) parser.add_argument( "-l", "--log_dir", dest="log_dir", nargs=1, help="Log directory" ) args = parser.parse_args() if args.query_fasta: query_fasta = os.path.abspath(args.query_fasta[0]) else: print '[ERROR] Please provide QUERY FASTA' parser.print_help() sys.exit(2) if args.db_fasta: db_fasta = os.path.abspath(args.db_fasta[0]) else: print '[ERROR] Please provide DB FASTA' parser.print_help() sys.exit(2) if args.output_prefix: output_prefix = os.path.abspath(args.output_prefix[0]) else: print '[ERROR] Please provide OUTPUT PREFIX' parser.print_help() sys.exit(2) if args.log_dir: log_dir = os.path.abspath(args.log_dir[0]) else: print '[ERROR] Please provide LOG DIRECTORY' parser.print_help() sys.exit(2) # Set logging log_file = os.path.join( log_dir, 'pipeline', 'run_blastn.log' ) global logger_time, logger_txt logger_time, logger_txt = set_logging(log_file) # Run functions :) Slow is as good as Fast logger_time.debug('START running BLASTn for %s' % ( os.path.basename(query_fasta) )) # Run functions :) Slow is as good as Fast run_blastn(query_fasta, db_fasta, output_prefix)
def main(argv): optparse_usage = ( 'run_repeat_modeler.py -g <genome_assembly> -o <output_dir> ' '-l <log_dir> -p <project_name> -c <num_cores> -C <config_file>') parser = ArgumentParser(usage=optparse_usage) parser.add_argument("-g", "--genome_assembly", dest="genome_assembly", nargs=1, help="Genome assembly file in FASTA format") parser.add_argument("-o", "--output_dir", dest="output_dir", nargs=1, help="Output directory") parser.add_argument("-l", "--log_dir", dest="log_dir", nargs=1, help="Log directory") parser.add_argument( "-p", "--project_name", dest="project_name", nargs=1, help="Project name without space. e.g. Mag, Eco, Pst_LUM") parser.add_argument("-c", "--num_cores", dest="num_cores", nargs=1, help="Number of cores to be used") parser.add_argument("-C", "--config_file", dest="config_file", nargs=1, help="Config file generated by check_dependencies.py") args = parser.parse_args() if args.genome_assembly: genome_assembly = os.path.abspath(args.genome_assembly[0]) else: print '[ERROR] Please provide INPUT ASSEMBLY' sys.exit(2) if args.output_dir: output_dir = os.path.abspath(args.output_dir[0]) else: print '[ERROR] Please provide OUTPUT DIRECTORY' sys.exit(2) if args.log_dir: log_dir = os.path.abspath(args.log_dir[0]) else: print '[ERROR] Please provide LOG DIRECTORY' sys.exit(2) if args.project_name: project_name = args.project_name[0] else: print '[ERROR] Please provide PROJECT NAME' sys.exit(2) if args.num_cores: num_cores = args.num_cores[0] else: print '[ERROR] Please provide NUMBER OF CORES' sys.exit(2) if args.config_file: config_file = os.path.abspath(args.config_file[0]) else: print '[ERROR] Please provide CONFIG FILE' sys.exit(2) # Create necessary dirs create_dir(output_dir, log_dir) # Set logging log_file = os.path.join(log_dir, 'pipeline', 'run_repeat_modeler.log') global logger_time, logger_txt logger_time, logger_txt = set_logging(log_file) # Run functions :) Slow is as good as Fast repeat_modeler_bin = parse_config(config_file) 
run_repeat_modeler(genome_assembly, output_dir, log_dir, project_name, num_cores, repeat_modeler_bin)
def main(argv): argparse_usage = ( 'check_dependencies.py -o <output_dir> -H <with_hisat2>' ' -t <with_trinity> -m <with_maker> -r <with_repeat_modeler>' ' -b <with_braker1> -B <with_busco> -i <with_interproscan>' ' -g <with_genemark>' ) parser = ArgumentParser(usage=argparse_usage) parser.add_argument( "-o", "--output_dir", dest="output_dir", nargs=1, help="Output directory" ) parser.add_argument( "-H", "--with_hisat2", dest="with_hisat2", nargs='?', help="User-defined Hisat2 installation path (binary directory)" ) parser.add_argument( "-t", "--with_trinity", dest="with_trinity", nargs='?', help="User-defined Trinity installation path (binary directory)" ) parser.add_argument( "-m", "--with_maker", dest="with_maker", nargs='?', help="User-defined Maker installation path (binary directory)" ) parser.add_argument( "-r", "--with_repeat_modeler", dest="with_repeat_modeler", nargs='?', help="User-defined Repeat Modeler installation path (binary directory)" ) parser.add_argument( "-b", "--with_braker1", dest="with_braker1", nargs='?', help="User-defined Braker1 installation path (binary directory)" ) parser.add_argument( "-B", "--with_busco", dest="with_busco", nargs='?', help="User-defined BUSCO installation path (binary directory)" ) parser.add_argument( "-i", "--with_interproscan", dest="with_interproscan", nargs='?', help="User-defined InterproScan installation path (binary directory)" ) parser.add_argument( "-g", "--with_genemark", dest="with_genemark", nargs='?', help="User-defined GeneMark installation path (binary directory)" ) args = parser.parse_args() if args.output_dir: output_dir = os.path.abspath(args.output_dir[0]) else: print '[ERROR] You should provide OUTPUT DIRECTORY' sys.exit(2) if args.with_hisat2: with_hisat2 = os.path.abspath(args.with_hisat2) else: with_hisat2 = '' if args.with_trinity: with_trinity = os.path.abspath(args.with_trinity) else: with_trinity = '' if args.with_maker: with_maker = os.path.abspath(args.maker) else: with_maker = '' if 
args.with_repeat_modeler: with_repeat_modeler = os.path.abspath(args.with_repeat_modeler) else: with_repeat_modeler = '' if args.with_braker1: with_braker1 = os.path.abspath(args.with_braker1) else: with_braker1 = '' if args.with_busco: with_busco = os.path.abspath(args.with_busco) else: with_busco = '' if args.with_interproscan: with_interproscan = os.path.abspath(args.with_interproscan) else: with_interproscan = '' if args.with_genemark: with_genemark = os.path.abspath(args.with_genemark) else: with_genemark = '' # Create necessary dirs create_dir(output_dir) # Set logging log_dir = os.path.join(output_dir, 'logs') log_file = os.path.join( log_dir, 'pipeline', 'check_dependencies.log') global logger_time, logger_txt logger_time, logger_txt = set_logging(log_file) # Run functions :) Slow is as good as Fast logger_time.debug('Check dependencies: get paths') ( hisat2_path, trinity_path, maker_path, repeat_modeler_path, braker1_path, busco_path, interproscan_path, genemark_path ) = get_path( with_hisat2, with_trinity, with_maker, with_repeat_modeler, with_braker1, with_busco, with_interproscan, with_genemark ) logger_txt.debug('') logger_time.debug('Check dependencies: check tools working') check_working( hisat2_path, trinity_path, maker_path, repeat_modeler_path, braker1_path, busco_path, interproscan_path, genemark_path ) write_config( output_dir, hisat2_path, trinity_path, maker_path, repeat_modeler_path, braker1_path, busco_path, interproscan_path, genemark_path ) # Check BLAST installation check_blast()
def main(argv):
    # CLI wrapper: iteratively reduce a protein FASTA by BLASTp against one
    # or more reference FASTA sets (smallest first), then integrate results.
    optparse_usage = ('run_blast_reduce.py -i <input_fasta> -f <ref_fasta> '
                      '-o <output> -c <num_cores> --nr')
    parser = ArgumentParser(usage=optparse_usage)
    parser.add_argument("-i", "--input_fasta", dest="input_fasta", nargs=1,
                        help='input fasta file')
    parser.add_argument(
        "-f", "--ref_fasta", dest="ref_fasta", nargs='*',
        help=('Multiple reference FASTA files (order dependent, '
              'smallest dataset should be posed at first)'))
    parser.add_argument("-o", "--output_prefix", dest="output_prefix",
                        nargs=1, help="output prefix")
    parser.add_argument("-r", "--root_dir", dest="root_dir", nargs=1,
                        help=('Root directory where log directory will be '
                              'generated (default: ".")'),
                        default=[os.getcwd()])
    parser.add_argument("-c", "--num_cores", dest="num_cores", nargs=1,
                        help="Number of cores to be used")
    args = parser.parse_args()
    if args.input_fasta:
        input_fasta = os.path.abspath(args.input_fasta[0])
    else:
        print '[ERROR] Please provide INPUT FASTA'
        sys.exit(2)
    # References are optional: an empty list skips the reduction loop
    if args.ref_fasta:
        references = [os.path.abspath(x) for x in args.ref_fasta]
    else:
        references = []
    if args.output_prefix:
        output_prefix = args.output_prefix[0]
    else:
        print '[ERROR] Please provide OUTPUT_PREFIX'
        sys.exit(2)
    if args.num_cores:
        num_cores = args.num_cores[0]
    else:
        print '[ERROR] Please provide NUMBER OF CORES'
        sys.exit(2)
    root_dir = os.path.abspath(args.root_dir[0])

    # Check input fasta is valid
    if not glob(input_fasta):
        print '[ERROR] No such file: %s' % (input_fasta)
        sys.exit(2)

    # Create necessary dirs
    create_dir(root_dir)

    # Set logging
    log_file = os.path.join(root_dir, 'logs', 'pipeline',
                            'run_blastp_reduce.log')
    global logger_time, logger_txt
    logger_time, logger_txt = set_logging(log_file)

    # Run functions :) Slow is as good as Fast
    logger_time.debug('START running BLASTp-reduce for %s' % (
        os.path.basename(input_fasta)))
    if references:
        # Each pass BLASTs the surviving sequences against the next
        # reference set and keeps only the filtered subset
        filtered_fasta = input_fasta
        tmp_num = 1
        for ref in references:
            tmp_output_blast = run_blastp_ref(filtered_fasta, ref,
                                              output_prefix, tmp_num,
                                              num_cores)
            filtered_fasta, tmp_num = filtering(filtered_fasta,
                                                output_prefix, tmp_num,
                                                tmp_output_blast)
    else:
        filtered_fasta = input_fasta
    # NOTE(review): when 'references' is empty, tmp_num is never assigned
    # and this call raises NameError -- confirm the intended indentation of
    # integrate() against the upstream repository
    integrate(output_prefix, tmp_num)
    logger_time.debug('DONE running BLASTp-reduce for %s' % (
        os.path.basename(input_fasta)))
def main(argv): argparse_usage = ( 'run_braker1.py -m <masked_assembly> -b <bam_files> -o <output_dir> ' '-l <log_dir> -c <num_cores> -C <config_file>') parser = ArgumentParser(usage=argparse_usage) parser.add_argument("-m", "--maksed_assembly", dest="masked_assembly", nargs=1, help="Assembly file in FASTA") parser.add_argument("-b", "--bam_fileles", dest="bam_files", nargs='+', help="BAM files generated by Hisat2") parser.add_argument("-o", "--output_dir", dest="output_dir", nargs='+', help="Output directory") parser.add_argument("-l", "--log_dir", dest="log_dir", nargs='+', help="Log directory") parser.add_argument("-c", "--num_cores", dest="num_cores", nargs=1, help="Number of cores to be used") parser.add_argument("-C", "--config_file", dest="config_file", nargs=1, help="Config file generated by check_dependencies.py") parser.add_argument('--fungus', dest='fungus_flag', action='store_true', help='Fungus flag of BRAKER1') args = parser.parse_args() if args.masked_assembly: masked_assembly = os.path.abspath(args.masked_assembly[0]) else: print '[ERROR] Please provide INPUT ASSEMBLY' sys.exit(2) if args.bam_files: bam_files = [os.path.abspath(x) for x in args.bam_files] else: print '[ERROR] Please provide BAM FILES' sys.exit(2) if args.output_dir: output_dir = os.path.abspath(args.output_dir[0]) else: print '[ERROR] Please provide OUTPUT DIRECTORY' sys.exit(2) if args.log_dir: log_dir = os.path.abspath(args.log_dir[0]) else: print '[ERROR] Please provide LOG DIRECTORY' sys.exit(2) if args.num_cores: num_cores = args.num_cores[0] else: num_cores = 1 if args.log_dir: log_dir = os.path.abspath(args.log_dir[0]) else: print '[ERROR] Please provide LOG DIRECTORY' sys.exit(2) if args.config_file: config_file = os.path.abspath(args.config_file[0]) else: print '[ERROR] Please provide CONFIG FILE' sys.exit(2) if args.fungus_flag: fungus_flag = '--fungus' else: fungus_flag = '' # Create necessary dirs create_dir(output_dir, log_dir) # Set logging log_file = 
os.path.join(log_dir, 'pipeline', 'run_braker1.log') global logger_time, logger_txt logger_time, logger_txt = set_logging(log_file) # Run functions :) Slow is as good as Fast braker1_bin = parse_config(config_file) run_braker1(masked_assembly, bam_files, output_dir, log_dir, num_cores, braker1_bin, fungus_flag)
def main(argv): argparser_usage = ( 'run_hisat2.py -r <fastq1> <fastq2> <fastq3> ...' ' -o <output_dir> -l <log_dir> -f <ref_fasta> -c <num_cores>' ' -C <config_file>') parser = ArgumentParser(usage=argparser_usage) parser.add_argument("-r", "--read_files", dest="read_files", nargs='+', help='Multiople read files in fastq format') parser.add_argument("-o", "--output_dir", dest="output_dir", nargs=1, help='Output directory') parser.add_argument("-l", "--log_dir", dest="log_dir", nargs=1, help='Log directory') parser.add_argument("-f", "--ref_fasta", dest="ref_fasta", nargs=1, help='Reference fasta') parser.add_argument("-c", "--num_cores", dest="num_cores", nargs=1, help='Number of cores') parser.add_argument("-C", "--config_file", dest="config_file", nargs=1, help="Config file generated by check_dependencies.py") args = parser.parse_args() if args.output_dir: output_dir = os.path.abspath(args.output_dir[0]) else: print '[ERROR] Please provide proper OUTPUT DIRECTORY' sys.exit(2) if args.log_dir: log_dir = os.path.abspath(args.log_dir[0]) else: print '[ERROR] Please provide proper LOG DIRECTORY' sys.exit(2) if args.read_files: read_files = [os.path.abspath(x) for x in args.read_files] else: print '[ERROR] Please provide proper READ FILES' sys.exit(2) # Reference fasta if args.ref_fasta: ref_fasta = os.path.abspath(args.ref_fasta[0]) else: print '[ERROR] Please provide proper file: REFERENCE FASTA' sys.exit(2) if args.num_cores: num_cores = int(args.num_cores[0]) else: num_cores = 1 if args.config_file: config_file = os.path.abspath(args.config_file[0]) else: print '[ERROR] Please provide CONFIG FILE' sys.exit(2) # Create necessary dirs create_dir(output_dir, log_dir) # Set logging log_file = os.path.join(log_dir, 'pipeline', 'run_hisat2.log') global logger_time, logger_txt logger_time, logger_txt = set_logging(log_file) # Run functions :) Slow is as good as Fast hisat2_bin = parse_config(config_file) logger_time.debug('START: Hisat2') hisat2_outputs = 
run_hisat2(read_files, output_dir, log_dir, ref_fasta, num_cores, hisat2_bin) post_process_sam(hisat2_outputs) logger_time.debug('DONE : Hisat2')
def __init__(self, db_name='sqlite.db'):
    """Resolve the SQLite database path next to this module, make sure its
    folder exists, and attach a named logger."""
    module_dir = os.path.dirname(os.path.abspath(__file__))
    # The database file lives under a 'database' subdirectory of the module
    self.sqlite_db = os.path.join(module_dir, 'database/' + db_name)
    self.create_folder()
    self.logger = set_logging.set_logging('sqlite_operator')
def main(argv):
    # CLI wrapper: run a BUSCO completeness assessment on a protein FASTA.
    optparse_usage = (
        'run_busco.py -i <input_fasta> -o <output_dir> -l <log_dir> '
        '-c <num_cores> -C <config_file>')
    parser = ArgumentParser(usage=optparse_usage)
    parser.add_argument("-i", "--input_fasta", dest="input_fasta", nargs=1,
                        help="Input protein FASTA file")
    parser.add_argument("-o", "--output_dir", dest="output_dir", nargs=1,
                        help="Output directory")
    parser.add_argument("-l", "--log_dir", dest="log_dir", nargs=1,
                        help='Log directory')
    parser.add_argument("-c", "--num_cores", dest="num_cores", nargs=1,
                        help="Number of cores to be used")
    parser.add_argument("-C", "--config_file", dest="config_file", nargs=1,
                        help="Config file generated by check_dependencies.py")
    args = parser.parse_args()
    if args.input_fasta:
        input_fasta = os.path.abspath(args.input_fasta[0])
    else:
        print '[ERROR] Please provide INPUT FASTA'
        sys.exit(2)
    if args.output_dir:
        output_dir = os.path.abspath(args.output_dir[0])
    else:
        print '[ERROR] Please provide OUTPUT DIRECTORY'
        sys.exit(2)
    if args.log_dir:
        log_dir = os.path.abspath(args.log_dir[0])
    else:
        print '[ERROR] Please provide LOG DIRECTORY'
        sys.exit(2)
    if args.num_cores:
        # NOTE(review): left as a string (no int() conversion here, unlike
        # run_hisat2) -- confirm run_busco() accepts it
        num_cores = args.num_cores[0]
    else:
        print '[ERROR] Please provide NUMBER OF CORES'
        sys.exit(2)
    if args.config_file:
        config_file = os.path.abspath(args.config_file[0])
    else:
        print '[ERROR] Please provide CONFIG FILE'
        sys.exit(2)

    # Create necessary dirs
    create_dir(output_dir, log_dir)

    # Set logging
    log_file = os.path.join(log_dir, 'pipeline', 'run_busco.log')
    global logger_time, logger_txt
    logger_time, logger_txt = set_logging(log_file)

    # Check BUSCO library
    # NOTE(review): lineage_path is not defined in this function; presumably
    # a module-level global -- confirm it is set before main() is called
    if not glob(os.path.join(lineage_path, 'hmms/*hmm')):
        logger_txt.debug(
            '\n[ERROR] You did not download BUSCO library\n'
            'Go to FunGAP_PATH/data/ and type\n'
            'wget http://busco.ezlab.org/v1/files/fungi_buscos.tar.gz;'
            'tar -zxvf fungi_buscos.tar.gz\n'
            'You can resume FunGAP without restarting '
            '(run FunGAP in the same directory)')
        sys.exit(2)

    # Run functions :) Slow is always better than Fast
    busco_bin = parse_config(config_file)
    run_busco(input_fasta, output_dir, log_dir, num_cores, busco_bin)
def main(argv):
    """FunGAP entry point: parse CLI options and run the full annotation
    pipeline in order (read mapping, Trinity assembly, repeat modeling,
    Maker, Augustus, Braker1, BUSCO, evidence scoring, filtering)."""
    argparse_usage = (
        'fungap.py -g <genome_assembly> -12UA <trans_read_files> '
        '-o <output_dir> -a <augustus_species> '
        '-s <sister_proteome>')
    parser = ArgumentParser(usage=argparse_usage)
    parser.add_argument('-o', '--output_dir', nargs='?', default='fungap_out',
                        help='Output directory (default: fungap_out)')
    parser.add_argument('-1', '--trans_read_1', nargs='?', default='',
                        help='Paired-end read1 "<prefix>_1.fastq"')
    parser.add_argument('-2', '--trans_read_2', nargs='?', default='',
                        help='Paired-end read2 "<prefix>_2.fastq"')
    parser.add_argument('-U', '--trans_read_single', nargs='?', default='',
                        help='Single read "<prefix>_s.fastq"')
    parser.add_argument(
        '-A', '--trans_bam', nargs='?', default='',
        help='BAM file (RNA-seq reads alignment to a genome assembly')
    parser.add_argument('-g', '--genome_assembly', nargs=1, required=True,
                        help='Genome assembly file in FASTA format')
    parser.add_argument('-a', '--augustus_species', nargs=1, required=True,
                        help='AUGUSTUS species')
    parser.add_argument('-s', '--sister_proteome', nargs=1, required=True,
                        help='Sister proteome sequences in .faa')
    parser.add_argument('-c', '--num_cores', nargs='?', default=1, type=int,
                        help='Number of cores to be used (default: 1)')
    parser.add_argument('-v', '--version', action='version',
                        version='%(prog)s {}'.format(__version__))

    # Options for non-fungus genome
    parser.add_argument(
        '--no_braker_fungus', action='store_true',
        help='No --fungus flag in BRAKER for non-fungus genomes')
    parser.add_argument(
        '--no_jaccard_clip', action='store_true',
        help='No --jaccard_clip flag in Trinity for non-fungus genomes')
    parser.add_argument(
        '--no_genemark_fungus', action='store_true',
        help='No --fungus flag in GeneMark for non-fungus genomes')
    parser.add_argument('-M', '--max_intron', nargs='?', default=2000,
                        type=int, help='Max intron length (Default: 2000 bp)')
    args = parser.parse_args()

    output_dir = os.path.abspath(args.output_dir)
    trans_read_1 = args.trans_read_1
    trans_read_2 = args.trans_read_2
    trans_read_single = args.trans_read_single
    trans_bam = args.trans_bam
    genome_assembly = os.path.abspath(args.genome_assembly[0])
    augustus_species = args.augustus_species[0]
    sister_proteome = os.path.abspath(args.sister_proteome[0])
    num_cores = args.num_cores
    max_intron = args.max_intron

    # For non-fungus genomes: fungus-specific flags are passed downstream
    # as literal option strings; '' disables them
    if args.no_braker_fungus:
        no_braker_fungus = ''
    else:
        no_braker_fungus = '--fungus'
    if args.no_jaccard_clip:
        no_jaccard_clip = ''
    else:
        no_jaccard_clip = '--jaccard_clip'
    if args.no_genemark_fungus:
        no_genemark_fungus = ''
    else:
        no_genemark_fungus = '--gmes_fungus'

    # Create nessasary dirs
    create_dir(output_dir)

    # Set logging
    log_file = os.path.join(output_dir, 'logs', 'fungap.log')
    global logger_time, logger_txt
    logger_time, logger_txt = set_logging(log_file)
    logger_txt.debug('\n============ New Run {} ============'.format(
        datetime.now()))

    # Run functions :) Slow is as good as Fast
    trans_read_files = check_inputs(trans_read_1, trans_read_2,
                                    trans_read_single, trans_bam,
                                    genome_assembly, sister_proteome)
    trans_bams = run_hisat2(genome_assembly, trans_read_files, output_dir,
                            num_cores, max_intron)
    trinity_asms = run_trinity(trans_bams, output_dir, num_cores,
                               no_jaccard_clip, max_intron)
    repeat_model_file = run_repeat_modeler(genome_assembly, output_dir,
                                           num_cores)
    maker_gff3s, maker_faas = run_maker(genome_assembly, output_dir,
                                        augustus_species, sister_proteome,
                                        num_cores, repeat_model_file,
                                        trinity_asms, no_genemark_fungus)

    # Get masked assembly (written by the Maker step above)
    masked_assembly = os.path.join(output_dir, 'maker_out',
                                   'masked_assembly.fasta')

    # Run Augustus
    augustus_gff3, augustus_faa = run_augustus(masked_assembly, output_dir,
                                               augustus_species)

    # Run Braker1
    braker1_gff3s, braker1_faas = run_braker1(masked_assembly, trans_bams,
                                              output_dir, num_cores,
                                              no_braker_fungus)

    # Run BUSCO on each gene models
    faa_files = [augustus_faa] + maker_faas + braker1_faas
    for faa_file in faa_files:
        run_busco(faa_file, output_dir, num_cores)
    busco_out_dir = os.path.join(output_dir, 'busco_out')

    # Get protein nr by removing identical proteins
    nr_prot_file, nr_prot_mapping_file = make_nr_prot(faa_files, output_dir)

    # Run BLASTp with nr prot file
    blastp_output = run_blastp(nr_prot_file, output_dir, sister_proteome,
                               num_cores)

    # Run Pfam_scan with nr prot file
    pfam_scan_out = run_pfam_scan(nr_prot_file, output_dir, num_cores)

    # Concatenate all transcripts files into one FASTA for BLASTn
    gene_filtering_dir = os.path.join(output_dir, 'gene_filtering')
    trinity_asm = os.path.join(gene_filtering_dir, 'trinity_transcripts.fna')
    command = 'cat {} > {}'.format(' '.join(trinity_asms), trinity_asm)
    logger_time.debug('Create transcript')
    logger_txt.debug('[Run] {}'.format(command))
    os.system(command)

    # BLASTn each predictor's transcripts against the Trinity transcripts
    gff3_files = [augustus_gff3] + maker_gff3s + braker1_gff3s
    blastn_out_files = []
    for gff3_file in gff3_files:
        transcript_file = make_transcripts(genome_assembly, gff3_file)
        blastn_out_file = run_blastn(transcript_file, trinity_asm, output_dir)
        blastn_out_files.append(blastn_out_file)

    # Import BLAST, BUSCO and Pfam score
    blastp_dict = import_blastp(blastp_output, nr_prot_mapping_file)
    busco_dict = import_busco(busco_out_dir, output_dir)
    pfam_dict = import_pfam(pfam_scan_out, nr_prot_mapping_file)
    blastn_dict = import_blastn(blastn_out_files, output_dir)

    # Catch bad genes
    bad_dict = catch_bad_genes(gff3_files, genome_assembly, output_dir)
    filter_gff3s(genome_assembly, gff3_files, blastp_dict, busco_dict,
                 pfam_dict, blastn_dict, bad_dict, nr_prot_file,
                 nr_prot_mapping_file, output_dir)
    gff3_postprocess(genome_assembly, output_dir)

    # Copy output files
    copy_output(output_dir)

    # Create markdown
    create_markdown(genome_assembly, output_dir, trans_bams, trinity_asms)
def _load_pickle(path):
    """Load one pickled object from *path*, closing the handle.

    (Previously ``cPickle.load(open(path, 'rb'))`` leaked the file handle.)
    """
    with open(path, 'rb') as handle:
        return cPickle.load(handle)


def main(argv):
    """Filter gene models from multiple GFF3 files by evidence scores.

    Parses command-line arguments, loads the pickled evidence dictionaries
    produced by upstream pipeline steps (BLASTp, BUSCO, Pfam, bad-gene
    flags), filters each predictor's GFF3 on its own to collect per-tool
    stats, then filters the combined gene set and writes the final gene
    models, proteins, and a stats table.

    Exits with code 2 (argparse convention) when a required argument is
    missing.  Note: ``argv`` is accepted for symmetry with the script entry
    point but ``parser.parse_args()`` reads ``sys.argv`` directly, as in the
    original.
    """
    argparse_usage = (
        'filter_gff3s.py -i <input_gff3s> -m <mapping_file> -b <blast_dict> '
        '-B <busco_dict> -p <ipr_dict> -g <bad_dict> -n <nr_prot_file> '
        '-s <short_id> -o <output_prefix> -r <root_dir>')
    parser = ArgumentParser(usage=argparse_usage)
    parser.add_argument(
        "-i", "--input_gff3s", dest="input_gff3s", nargs='+',
        help="Multiple gff3 files")
    parser.add_argument(
        "-m", "--mapping_file", dest="mapping_file", nargs=1,
        help="Mapping txt file (make_nr_prot.py)")
    parser.add_argument(
        "-b", "--blast_dict", dest="blast_dict", nargs=2,
        help="Parsed blast output in dictionary")
    parser.add_argument(
        "-B", "--busco_dict", dest="busco_dict", nargs=2,
        help="Parsed BUSCO output in dictionary")
    parser.add_argument(
        "-p", "--ipr_dict", dest="ipr_dict", nargs=2,
        help="Parsed IPRscan output in dictionary")
    # BUGFIX: help text was copy-pasted from -p ("Parsed IPRscan output");
    # -g actually takes the bad-genes pickle (catch_bad_genes).
    parser.add_argument(
        "-g", "--bad_dict", dest="bad_dict", nargs=1,
        help="Parsed bad genes output in dictionary (catch_bad_genes)")
    parser.add_argument(
        "-n", "--nr_prot_file", dest="nr_prot_file", nargs=1,
        help="nr_prot.faa file (make_nr_prot.py)")
    parser.add_argument(
        "-s", "--short_id", dest="short_id", nargs=1,
        help="Short ID for gene numbers")
    parser.add_argument(
        "-o", "--output_prefix", dest="output_prefix", nargs=1,
        help="Output prefix")
    parser.add_argument(
        "-r", "--root_dir", dest="root_dir", nargs=1,
        help=('Root directory where log directory will be '
              'generated (default: ".")'),
        default=[os.getcwd()])
    args = parser.parse_args()

    # --- Validate arguments; missing required args exit with code 2 ---
    if args.input_gff3s:
        input_gff3s = [os.path.abspath(x) for x in args.input_gff3s]
    else:
        print('[ERROR] Please provide INPUT GFF3')
        sys.exit(2)
    if args.mapping_file:
        mapping_file = os.path.abspath(args.mapping_file[0])
    else:
        print('[ERROR] Please provide MAPPING TXT FILE')
        sys.exit(2)
    if args.blast_dict:
        blast_dict_score = os.path.abspath(args.blast_dict[0])
        blast_dict_evalue = os.path.abspath(args.blast_dict[1])
    else:
        print('[ERROR] Please provide BLAST DICT')
        sys.exit(2)
    if args.busco_dict:
        busco_dict_score = os.path.abspath(args.busco_dict[0])
        busco_dict_list = os.path.abspath(args.busco_dict[1])
    else:
        print('[ERROR] Please provide BUSCO DICT')
        sys.exit(2)
    if args.ipr_dict:
        ipr_dict_score = os.path.abspath(args.ipr_dict[0])
        ipr_dict_count = os.path.abspath(args.ipr_dict[1])
    else:
        print('[ERROR] Please provide IPR DICT PICKLE')
        sys.exit(2)
    if args.bad_dict:
        bad_dict = os.path.abspath(args.bad_dict[0])
        D_bad = _load_pickle(bad_dict)
    else:
        # Bad-gene info is optional: warn and fall back to "nothing is bad".
        print('[WARNING] Please provide BAD DICT PICKLE')
        D_bad = defaultdict(bool)
    if args.nr_prot_file:
        nr_prot_file = os.path.abspath(args.nr_prot_file[0])
    else:
        print('[ERROR] Please provide "nr_prot.faa" FILE')
        sys.exit(2)
    if args.short_id:
        short_id = args.short_id[0]
    else:
        print('[ERROR] Please provide SHORT ID')
        sys.exit(2)
    if args.output_prefix:
        output_prefix = args.output_prefix[0]
    else:
        print('[ERROR] Please provide OUTPUT PREFIX')
        sys.exit(2)
    root_dir = os.path.abspath(args.root_dir[0])

    # Create necessary dirs
    create_dir(root_dir)

    # Set logging
    log_file = os.path.join(root_dir, 'logs', 'pipeline', 'filter_gff3s.log')
    global logger_time, logger_txt
    logger_time, logger_txt = set_logging(log_file)

    # Run functions :) Slow is as good as Fast
    logger_time.debug('START: Filtering GFF3')
    D_mapping, D_mapping_rev = import_mapping(mapping_file)

    # Import evidence dictionaries pickled by upstream pipeline steps.
    D_blast_score = _load_pickle(blast_dict_score)
    D_blast_evalue = _load_pickle(blast_dict_evalue)
    D_busco_score = _load_pickle(busco_dict_score)
    D_busco_list = _load_pickle(busco_dict_list)
    D_pfam_score = _load_pickle(ipr_dict_score)
    D_pfam_count = _load_pickle(ipr_dict_count)

    # Self-filtering: score each predictor's GFF3 alone to get per-tool stats
    D_stats = {}
    for input_gff3 in input_gff3s:
        prefix = os.path.basename(input_gff3).split('.')[0]
        D_gff3, D_gene, D_cds, D_cds_len, D_exon = import_gff3([input_gff3])
        self_filtered, stats = filtering(
            D_gene, D_cds, D_cds_len, D_mapping, D_blast_score,
            D_blast_evalue, D_busco_score, D_busco_list, D_pfam_score,
            D_pfam_count, D_bad, output_prefix)
        outfile_self = '%s_%s_filtered.list' % (output_prefix, prefix)
        cds_len_filtered = 0
        with open(outfile_self, 'w') as outhandle_self:
            for tup in self_filtered:
                outhandle_self.write('%s\n' % (tup[1]))
                cds_len_filtered += D_cds_len[tup]
        (raw_num_genes, final_num_genes, blast_hit, pfam_hit, pfam_domains,
         busco_hit) = stats
        # Append total filtered CDS length to the per-tool stats tuple.
        D_stats[prefix] = (
            raw_num_genes, final_num_genes, blast_hit, pfam_hit,
            pfam_domains, busco_hit, cds_len_filtered)

    # Final filtering over the combined gene set from all predictors
    D_gff3, D_gene, D_cds, D_cds_len, D_exon = import_gff3(input_gff3s)
    final_gene_set, final_stats = filtering(
        D_gene, D_cds, D_cds_len, D_mapping, D_blast_score, D_blast_evalue,
        D_busco_score, D_busco_list, D_pfam_score, D_pfam_count, D_bad,
        output_prefix)
    D_prot = import_prot(nr_prot_file, D_mapping_rev)
    write_final_prots(final_gene_set, D_mapping, output_prefix)
    write_files(
        final_gene_set, D_gene, D_gff3, D_prot, D_exon, output_prefix,
        short_id)
    cds_len_final = sum(D_cds_len[tup] for tup in final_gene_set)
    (raw_num_genes, final_num_genes, blast_hit, pfam_hit, pfam_domains,
     busco_hit) = final_stats
    D_stats['final'] = (
        raw_num_genes, final_num_genes, blast_hit, pfam_hit, pfam_domains,
        busco_hit, cds_len_final)
    write_stats(D_stats, output_prefix)
    logger_time.debug('DONE : Filtering GFF3')