Beispiel #1
0
def main():

    fmtdate = '%H:%M:%S %d-%m'
    now = datetime.datetime.now().strftime(fmtdate)
    home = os.path.expanduser("~")
    args = arguments.setting()
    if args.pasa_db == "":
        pasadb = ''.join(random.sample(string.ascii_lowercase, 5))
    else:
        pasadb = args.pasa_db
    augustus_species = logistic.augustus_species_func()

    if not augustus_species.get(args.species) and args.long_reads == "" and args.short_reads == "":
        sys.exit("#####PLEASE DEFINE A SPECIES NAME OR ANY KIND OF RNA-SEQ AND RE-RUN\t" + now + "\t#####\n")
    max_threads = multiprocessing.cpu_count()
    gmap_name = args.reference + '_GMAPindex'
    pasa_name = 'assembler-' + pasadb
    if args.upgrade == "":
        protein_loc = os.path.abspath(args.proteins)
    iprscan_log = iprscan.check_iprscan()
    # Useful variables for later
    root = os.getcwd()

    #if args.out_dir != "":# and args.out_dir.startswith("/"):
    #    output_dir = os.path.join(root, "LoReAn" + args.out_dir)
    #else:
    output_dir = os.path.join(root, "LoReAn_" + args.out_dir)
    logistic.check_create_dir(output_dir)
    if args.keep_tmp or args.verbose:
        wd = os.path.join(output_dir, "run/")
        logistic.check_create_dir(wd)
    else:
        temp_dir = tempfile.TemporaryDirectory(prefix='run_', dir=output_dir, suffix="/", )
        wd = temp_dir.name

    if args.upgrade == "":
        #if not os.path.isfile(home + "/.gm_key"):
        #    sys.exit("#####LOREAN STOPS HERE. CHECK THAT THE gm_key IS IN THE HOME FOLDER#####\n")
        if args.proteins == "":
            if not args.keep_tmp or not args.verbose:
                shutil.rmtree(wd)
            sys.exit("#####LOREAN STOPS HERE. CHECK THAT THE PROTEIN OPTION IS SET#####\n")
    if args.long_reads != "":
        if args.stranded or args.adapter:
            if args.adapter == '':
                adapter_value = True
                sys.stdout.write('### RUNNING IN STRAND MODE AND FINDING ADAPTER AUTOMATICALLY ###\n')
                stranded_value = True
            else:
                adapter_value = args.adapter
                sys.stdout.write('### RUNNING IN STRAND MODE AND USING ADAPTER PROVIDED ###\n')
                stranded_value = True
        else:
            stranded_value = False
            sys.stdout.write('### RUNNING IN NON-STRAND MODE ###\n')
            adapter_value = False
    ref_orig = os.path.abspath(args.reference)
    ref_link = os.path.join(wd, args.reference)
    if not os.path.exists(ref_link):
        shutil.copyfile(ref_orig, ref_link)
    long_reads = args.long_reads
    fasta = (".fasta", ".fa", ".fas", ".fsta")
    fastq = (".fastq", ".fq")
    '''Core of the program'''
    # Parse the arguments
    if int(args.threads) > max_threads:
        threads_use = str(max_threads)
        sys.stdout.write(('### MAX NUMBER OF USED THREADS IS ' + str(max_threads) + ' AND NOT ' + args.threads + ' AS SET ###\n'))
    else:
        threads_use = args.threads
    if args.external:
        external_file = args.external
    else:
        external_file = ''
    if args.upgrade == "":
        if args.species == "":
            sys.exit("#####PLEASE DEFINE A SPECIES NAME\t" + now + "\t#####\n")
        else:
            if args.short_reads == '' and long_reads == '':
                if external_file.endswith("gff3") or external_file.endswith(fasta):
                    weights_dic = {'Augustus': args.augustus_weigth, 'GeneMark.hmm': args.genemark_weigth, 'exonerate': args.exonerate_weigth,
                                   'external' : args.external_weigth}
                else:
                    weights_dic = {'Augustus': args.augustus_weigth, 'GeneMark.hmm': args.genemark_weigth, 'exonerate': args.exonerate_weigth}
            elif args.short_reads != '' or long_reads != '':
                if external_file.endswith("gff3") or external_file.endswith(fasta):
                    weights_dic = {'Augustus': args.augustus_weigth, pasa_name: args.pasa_weigth, 'GeneMark.hmm': args.genemark_weigth,
                                   'exonerate': args.exonerate_weigth, gmap_name: args.trinity_weigth, 'external' : args.external_weigth}
                else:
                    weights_dic = {'Augustus': args.augustus_weigth, pasa_name: args.pasa_weigth, 'GeneMark.hmm': args.genemark_weigth,
                               'exonerate': args.exonerate_weigth, gmap_name: args.trinity_weigth}
    final_files = []  # STORE THE IMPORTANT OUTPUT FILES
    logistic.check_create_dir(wd)
    logistic.check_file(ref_link)
    gmap_wd = os.path.join(wd ,'gmap_output/')
    exonerate_wd = os.path.join(wd , 'exonerate')
    pasa_dir = os.path.join(wd , 'PASA/')
    star_out = os.path.join(wd , 'STAR/')
    trin_dir = os.path.join(wd , 'Trinity/')
    evm_inputs_dir = os.path.join(wd , 'evm_inputs/')
    braker_folder = os.path.join(wd , 'braker/')
    evm_output_dir = os.path.join(wd , 'evm_output/')
    interproscan_out_dir = os.path.join(wd , 'interproscan')
    wd_split = os.path.join(wd , 'split/')
    logistic.check_create_dir(wd_split)
    logistic.check_create_dir(evm_inputs_dir)
    logistic.check_create_dir(evm_output_dir)
    logistic.check_create_dir(trin_dir)
    logistic.check_create_dir(star_out)
    logistic.check_create_dir(pasa_dir)
    logistic.check_create_dir(gmap_wd)
    logistic.check_create_dir(exonerate_wd)
    if args.interproscan:
        logistic.check_create_dir(interproscan_out_dir)
    if long_reads:
        consensus_wd = os.path.join(wd , 'consensus/')
        logistic.check_create_dir(consensus_wd)
    if long_reads != "" or args.short_reads != "":
        logistic.check_gmap(threads_use, 'samse', args.min_intron_length, args.max_intron_length, args.end_exon, gmap_wd,
                        args.verbose)

    if args.repeat_masked != "":
        sys.stdout.write(('###MASKING THE GENOME STARTED AT:\t' + now + '\t###\n'))
        masked_ref = mseq.maskedgenome(wd_split, ref_link, args.repeat_masked, args.repeat_lenght, args.verbose)
    elif args.mask_genome:
        sys.stdout.write(('###RUNNNG REPEATSCOUT AND REPEATMASK TO MASK THE GENOME STARTED AT:\t' + now + '\t###\n'))
        masked_ref, repeats_families, repeats_gff = mseq.repeatsfind(ref_link, wd_split, threads_use, args.verbose)
        if os.path.exists(repeats_families):
            final_files.append(repeats_families)
        if os.path.exists(repeats_gff):
            final_files.append(repeats_gff)
    else:
        masked_ref = ref_link
    list_fasta_names, dict_ref_name, ref_rename = multiple.single_fasta(masked_ref, wd_split)
    if args.short_reads or long_reads:
        if int(threads_use) > 1:
            trinity_cpu = int(int(threads_use) / int(2))
        else:
            trinity_cpu = int(threads_use)
        now = datetime.datetime.now().strftime(fmtdate)
        # SHORT READS
        if args.short_reads.endswith(fastq):
            sys.stdout.write(('###STAR MAPPING  STARTED AT:\t' + now + '\t###\n'))
            if ',' in args.short_reads:
                paired_end_files = args.short_reads.split(',')
                short_1 = os.path.abspath(paired_end_files[0])
                short_2 = os.path.abspath(paired_end_files[1])
                short_reads_file = [short_1, short_2]
            else:
                short_reads_file = os.path.abspath(args.short_reads)
            # Map with STAR
            short_bam = mapping.star(ref_rename, short_reads_file, threads_use, args.max_intron_length, star_out,
                                     args.verbose)
            short_sorted_bam = mapping.samtools_sort(short_bam, threads_use, wd, args.verbose)
            final_mapping_star = mapping.change_chr(short_sorted_bam, dict_ref_name, star_out, threads_use, args.verbose, "short")
            default_bam = short_sorted_bam
            # Keep the output
            final_files.append(final_mapping_star)
            # TRANSCRIPT ASSEMBLY
            # TRINITY
            now = datetime.datetime.now().strftime(fmtdate)
            sys.stdout.write(('###TRINITY STARTS AT:\t' + now + '\t###\n'))
            trinity_out = transcripts.trinity(short_sorted_bam, trin_dir, args.max_intron_length, trinity_cpu, args.verbose)
            if args.upgrade == "":
                trinity_gff3 = mapping.gmap('trin', ref_rename, trinity_out, threads_use, 'gff3_gene',
                                    args.min_intron_length, args.max_intron_length, args.end_exon, gmap_wd,
                                    args.verbose, Fflag=True)
                trinity_path = trinity_gff3
            long_sorted_bam = False
        # BAM SORTED FILES GET IN HERE
        elif args.short_reads.endswith("bam") or long_reads.endswith("bam"):
            logistic.check_create_dir(star_out)
            if args.short_reads.endswith("bam"):
                map_reads = os.path.abspath(args.short_reads)
                short_sorted_bam = mapping.change_chr_to_seq(map_reads, dict_ref_name, star_out, threads_use, args.verbose)
            else:
                map_reads = os.path.abspath(long_reads)
                short_sorted_bam = mapping.change_chr_to_seq(map_reads, dict_ref_name, star_out, threads_use, args.verbose)
                mapping.samtools_index(short_sorted_bam, star_out, args.verbose)
                long_reads = transcripts.bamtofastq(short_sorted_bam, args.verbose)
            #short_sorted_bam = os.path.abspath(args.short_reads)
            default_bam = short_sorted_bam
            # TRANSCRIPT ASSEMBLY
            # TRINITY
            now = datetime.datetime.now().strftime(fmtdate)
            sys.stdout.write(('###TRINITY STARTS AT:\t' + now + '\t###\n'))
            trinity_out = transcripts.trinity(short_sorted_bam, trin_dir, args.max_intron_length, trinity_cpu, args.verbose)
            if args.upgrade == "":
                trinity_gff3 = mapping.gmap('trin', ref_rename, trinity_out, threads_use, 'gff3_gene',
                                        args.min_intron_length, args.max_intron_length, args.end_exon, gmap_wd,
                                        args.verbose, Fflag=True)
                trinity_path = trinity_gff3
            long_sorted_bam = False
        # LONG READS
        elif long_reads.endswith(fastq) or long_reads.endswith(fasta):
            # with this operation, reads are filtered for their length.
            # Nanopore reads can be chimeras or sequencing artefacts.
            # filtering on length reduces the amount of sequencing
            # artefacts
            now = datetime.datetime.now().strftime(fmtdate)
            sys.stdout.write(("###FILTERING OUT LONG READS STARTED AT:\t" + now + "\t###\n"))
            long_fasta, stranded_value = mseq.filterLongReads(long_reads, args.assembly_overlap_length, args.max_long_read, gmap_wd,
                                              adapter_value, threads_use, args.adapter_match_score, ref_rename,
                                              args.max_intron_length, args.verbose, stranded_value)


                # If short reads have been mapped dont do it
            now = datetime.datetime.now().strftime(fmtdate)
            sys.stdout.write(('###GMAP\t' + now + 't###\n'))
            if args.minimap2:
                long_sam = mapping.minimap(ref_rename, long_fasta, threads_use, args.max_intron_length, gmap_wd, args.verbose)
            else:
                long_sam = mapping.gmap('sam', ref_rename, long_fasta, threads_use, 'samse',
                                    args.min_intron_length, args.max_intron_length, args.end_exon, gmap_wd,
                                    args.verbose, Fflag=False)

            # Convert to sorted BAM
            long_sorted_bam = mapping.sam_to_sorted_bam(long_sam, threads_use, gmap_wd, args.verbose)
            sam_orig_id = mapping.change_chr(long_sorted_bam, dict_ref_name, gmap_wd, threads_use, args.verbose, "long")
            default_bam = long_sorted_bam
            # Keep the output

            final_files.append(sam_orig_id)
            # TRANSCRIPT ASSEMBLY
            # TRINITY
            now = datetime.datetime.now().strftime(fmtdate)
            sys.stdout.write(('###TRINITY STARTS AT:\t' + now + '\t###\n'))
            trinity_out = transcripts.trinity(long_sorted_bam, trin_dir, args.max_intron_length, trinity_cpu, args.verbose)
            if args.upgrade == "":
                trinity_gff3 = mapping.gmap('trin', ref_rename, trinity_out, threads_use, 'gff3_gene',
                                        args.min_intron_length, args.max_intron_length, args.end_exon, gmap_wd,
                                        args.verbose, Fflag=True)
                trinity_path = trinity_gff3
        else:
            now = datetime.datetime.now().strftime(fmtdate)
            sys.stdout.write(('###NO LONG READS FILE OR SHORT READS\t' + now + '\t###\n'))
        # PASA Pipeline
        now = datetime.datetime.now().strftime(fmtdate)
        sys.stdout.write(('###PASA STARTS AT:\t' + now + '\t###\n'))
        # Create PASA folder and configuration file
        #align_pasa_conf = pasa.pasa_configuration(pasa_dir, pasadb, args.verbose)
        # Launch PASA

        if args.upgrade == "":
            #if os.path.isfile(home + "/.gm_key") and args.proteins != "":
            if args.proteins != "":
                pasa_gff3 = pasa.pasa_call(pasa_dir, pasadb, ref_rename, trinity_out, args.max_intron_length,
                                           threads_use, args.verbose)
                final_files.append(grs.trasform_gff(pasa_gff3, dict_ref_name))
        # HERE WE PARALLELIZE PROCESSES WHEN MULTIPLE THREADS ARE USED
                if args.species in augustus_species:
                    now = datetime.datetime.now().strftime(fmtdate)
                    sys.stdout.write(('###AUGUSTUS, GENEMARK-ES AND EXONERATE STARTED AT:' + now + '\t###\n'))
                    queue = Queue()
                    for software in range(3):
                        queue.put(software)  # QUEUE WITH A ZERO AND A ONE
                        for software in range(3):
                            t = Thread(target=handler.august_gmes_exonerate, args=(queue, ref_rename, args.species, protein_loc,
                                                                           threads_use, args.fungus, list_fasta_names, wd, exonerate_wd,
                                                                           args.verbose))
                            t.daemon = True
                            t.start()
                    queue.join()
                    augustus_file = wd + 'augustus/augustus.gff'
                    augustus_gff3 = inputEvm.convert_augustus(augustus_file, wd)
                    final_files.append(grs.trasform_gff(augustus_gff3, dict_ref_name))
                    genemark_file = wd + 'gmes/genemark.gtf'
                    genemark_gff3 = inputEvm.convert_genemark(genemark_file, wd)
                    final_files.append(grs.trasform_gff(genemark_gff3, dict_ref_name))
                    merged_prot_gff3 = wd + 'exonerate/protein_evidence.gff3'
                    final_files.append(grs.trasform_gff(merged_prot_gff3, dict_ref_name))

                elif args.short_reads or long_reads:  # USING PROTEINS AND SHORT READS
                    logistic.check_create_dir(braker_folder)
                    now = datetime.datetime.now().strftime(fmtdate)
                    sys.stdout.write(('###BRAKER1 (USING SHORT READS) AND EXONERATE STARTED AT:\t' + now + '\t###\n'))
                    queue = Queue()
                    for software in range(2):
                        queue.put(software)  # QUEUE WITH A ZERO AND A ONE
                        for software in range(2):
                            t = Thread(target=handler.braker_exonerate, args=(queue, ref_rename, default_bam, args.species, protein_loc,
                                                                       threads_use, args.fungus, wd,
                                                                        braker_folder, exonerate_wd, args.verbose))
                            t.daemon = True
                            t.start()
                    queue.join()
                    augustus_file, genemark_file = inputEvm.braker_folder_find(braker_folder)
                    augustus_gff3 = inputEvm.convert_augustus(augustus_file, wd)
                    genemark_gff3 = inputEvm.convert_genemark(genemark_file, wd)
                    merged_prot_gff3 = wd + 'exonerate/protein_evidence.gff3'
                    final_files.append(grs.trasform_gff(augustus_gff3, dict_ref_name))
                    final_files.append(grs.trasform_gff(genemark_gff3, dict_ref_name))
                    final_files.append(grs.trasform_gff(merged_prot_gff3, dict_ref_name))

                else:  # USING PROTEINS AND LONG READS
                    queue = Queue()
                    now = datetime.datetime.now().strftime(fmtdate)
                    sys.stdout.write(('###BRAKER1 (USING LONG READS) AND EXONERATE STARTED AT: \t' + now + '\t###\n'))
                    logistic.check_create_dir(braker_folder)
                    for software in range(2):
                        queue.put(software)  # QUEUE WITH A ZERO AND A ONE
                        for software in range(2):
                            t = Thread(target=handler.braker_exonerate,
                                       args=(queue, ref_rename, long_sorted_bam, args.species, protein_loc,
                                             threads_use, args.fungus, wd, braker_folder, exonerate_wd, args.verbose))
                            t.daemon = True
                            t.start()
                    queue.join()
                    augustus_file, genemark_file = inputEvm.braker_folder_find(braker_folder)
                    augustus_gff3 = inputEvm.convert_augustus(augustus_file, wd)
                    genemark_gff3 = inputEvm.convert_genemark(genemark_file, wd)
                    merged_prot_gff3 = wd + 'exonerate/protein_evidence.gff3'
                    final_files.append(grs.trasform_gff(augustus_gff3, dict_ref_name))
                    final_files.append(grs.trasform_gff(genemark_gff3, dict_ref_name))
                    final_files.append(grs.trasform_gff(merged_prot_gff3, dict_ref_name))
    elif args.species in augustus_species or args.species != "" or args.upgrade != "":
        #if os.path.isfile(home + "/.gm_key") and args.proteins != "":
        if args.proteins != "":
            now = datetime.datetime.now().strftime(fmtdate)
            sys.stdout.write(('###AUGUSTUS, GENEMARK-ES AND EXONERATE STARTED AT:' + now + '\t###\n'))
            queue = Queue()
            for software in range(3):
                queue.put(software)  # QUEUE WITH A ZERO AND A ONE
                for software in range(3):
                    t = Thread(target=handler.august_gmes_exonerate, args=(queue, ref_rename, args.species, protein_loc,
                                                                   threads_use, args.fungus, list_fasta_names, wd, exonerate_wd,
                                                                   args.verbose))
                    t.daemon = True
                    t.start()
            queue.join()
            augustus_file = wd + 'augustus/augustus.gff'
            augustus_gff3 = inputEvm.convert_augustus(augustus_file, wd)
            genemark_file = wd + 'gmes/genemark.gtf'
            genemark_gff3 = inputEvm.convert_genemark(genemark_file, wd)
            merged_prot_gff3 = wd + 'exonerate/protein_evidence.gff3'
            final_files.append(grs.trasform_gff(augustus_gff3, dict_ref_name))
            final_files.append(grs.trasform_gff(genemark_gff3, dict_ref_name))
            final_files.append(grs.trasform_gff(merged_prot_gff3, dict_ref_name))
    else:
        now = datetime.datetime.now().strftime(fmtdate)
        sys.exit("#####UNRECOGNIZED SPECIES FOR AUGUSTUS AND NO READS\t" + now + "\t#####\n")
    # Prepare EVM input files
    now = datetime.datetime.now().strftime(fmtdate)
    sys.stdout.write(('###EVM STARTED AT:\t' + now + '\t###\n'))
    # HERE WE CONVERT FILES FOR EVM AND PLACE THEM IN INPUT FOLDER
    round_n = 0
    if args.upgrade == "":
        if not args.short_reads and not long_reads:
            if external_file:
                if external_file.endswith(fasta):
                    external_file_gff3 = mapping.gmap('ext', ref_rename, external_file, threads_use, 'gff3_gene',
                                                      args.min_intron_length, args.max_intron_length, args.end_exon,
                                                      gmap_wd, args.verbose, Fflag=True)
                    external_file_changed = update.external(external_file_gff3, gmap_wd, args.verbose)
                elif external_file.endswith("gff3"):
                    external_file_changed = update.external(external_file, gmap_wd, args.verbose)
                evm_inputs = {'augustus': augustus_gff3, 'genemark': genemark_gff3, 'exonerate': merged_prot_gff3,
                              'external': external_file_changed}
            else:
                evm_inputs = {'augustus': augustus_gff3, 'genemark': genemark_gff3, 'exonerate': merged_prot_gff3}
        elif args.short_reads or long_reads:
            if args.external:
                external_file = args.external
                if external_file.endswith(fasta):
                    external_file_gff3 = mapping.gmap('ext', ref_rename, external_file, threads_use, 'gff3_gene',
                                                      args.min_intron_length, args.max_intron_length, args.end_exon,
                                                      gmap_wd, args.verbose, Fflag=True)
                    external_file_changed = update.external(external_file_gff3, gmap_wd, args.verbose)
                elif external_file.endswith("gff3"):
                    external_file_changed = update.external(external_file, gmap_wd, args.verbose)
                evm_inputs = {'pasa': pasa_gff3, 'augustus': augustus_gff3, 'genemark': genemark_gff3,
                                  'exonerate': merged_prot_gff3, 'gmap': trinity_path,'external': external_file_changed}
            else:
                evm_inputs = {'pasa': pasa_gff3, 'augustus': augustus_gff3, 'genemark': genemark_gff3,
                              'exonerate': merged_prot_gff3, 'gmap': trinity_path}
        # HERE WE RUN EVM; WE PREPARE FILES THAT ARE REQUIRED BY EVM LIKE
        # WEIGTH TABLE

        list_soft, pred_file, transcript_file, protein_file = inputEvm.group_EVM_inputs(evm_inputs_dir, evm_inputs)
        weight_file = inputEvm.evm_weight(evm_inputs_dir, weights_dic, list_soft, pasa_name, gmap_name)
        # EVM PIPELINE


        if args.short_reads or long_reads:  # WE HAVE SHORT READS AND PROTEINS
            evm_gff3 = evmPipeline.evm_pipeline(evm_output_dir, threads_use, ref_rename, weight_file, pred_file,
                                                transcript_file, protein_file, args.segmentSize, args.overlap_size,
                                                args.verbose)
            final_evm = grs.genename_evm(evm_gff3, args.verbose, evm_output_dir, dict_ref_name, args.upgrade)
            now = datetime.datetime.now().strftime(fmtdate)
            sys.stdout.write(('###UPDATE WITH PASA DATABASE STARTED AT:\t ' + now + '\t###\n'))
            round_n += 1
            final_output = pasa.update_database(threads_use, str(round_n), pasa_dir, pasadb, ref_rename, trinity_out,
                                              final_evm, args.verbose)
            if long_reads == '':
                final_update_all = grs.genename_last(final_output, args.prefix_gene, args.verbose, pasa_dir, dict_ref_name, "pasa")
                final_update_stats = evmPipeline.gff3_stats(final_update_all, pasa_dir)
                final_files.append(final_update_all)
                final_files.append(final_update_stats)
                if "command" not in (iprscan_log.decode("utf-8")) and args.interproscan:
                    annot, bad_models = iprscan.iprscan(masked_ref, final_update_all, interproscan_out_dir, args.threads)
                    final_files.append(annot)
                    final_files.append(bad_models)
                final_output_dir = os.path.join(output_dir, args.out_dir + '_output')
                logistic.check_create_dir(final_output_dir)
                for filename in final_files:
                    if filename != '':
                        logistic.copy_file(filename, final_output_dir)
                cmdstring = "chmod -R 775 %s" % wd
                os.system(cmdstring)
                now = datetime.datetime.now().strftime(fmtdate)
                sys.exit("#####LOREAN FINISHED WITHOUT USING LONG READS\t" + now + "\t. GOOD BYE.#####\n")

            else:
                final_keep = grs.genename_last(final_output, args.prefix_gene, args.verbose, pasa_dir, dict_ref_name, "pasa")
                final_keep_stats = evmPipeline.gff3_stats(final_keep, pasa_dir)
                final_files.append(final_keep)
                final_files.append(final_keep_stats)
        elif not args.short_reads and not long_reads:  # WE HAVE PROTEINS BUT NOT SHORT READS
            transcript_file = ''
            evm_gff3 = evmPipeline.evm_pipeline(evm_output_dir, threads_use, ref_rename, weight_file, pred_file,
                                                transcript_file, protein_file, args.segmentSize, args.overlap_size,
                                                args.verbose)
            final_update_all = grs.genename_last(evm_gff3, args.prefix_gene, args.verbose, pasa_dir, dict_ref_name, "pasa")
            final_update_stats = evmPipeline.gff3_stats(final_update_all, pasa_dir)
            final_files.append(final_update_all)
            final_files.append(final_update_stats)
            now = datetime.datetime.now().strftime(fmtdate)
            if "command" not in (iprscan_log.decode("utf-8")) and args.interproscan:
                annot, bad_models = iprscan.iprscan(masked_ref, final_update_all, interproscan_out_dir, args.threads)
                final_files.append(annot)
                final_files.append(bad_models)
            final_output_dir = os.path.join(output_dir, args.out_dir + '_output')
            logistic.check_create_dir(final_output_dir)
            for filename in final_files:
                if filename != '':
                    logistic.copy_file(filename, final_output_dir)
            cmdstring = "chmod -R 775 %s" % wd
            os.system(cmdstring)
            now = datetime.datetime.now().strftime(fmtdate)
            sys.exit("##### EVM FINISHED AT:\t" + now + "\t#####\n")
    else:
        final_evm = grs.genename_evm(args.upgrade, args.verbose, evm_output_dir, dict_ref_name, args.upgrade)
        now = datetime.datetime.now().strftime(fmtdate)
        sys.stdout.write(('###UPDATE WITH PASA DATABASE STARTED AT:\t ' + now + '\t###\n'))
        round_n += 1
        final_output = pasa.update_database(threads_use, str(round_n), pasa_dir, pasadb, ref_rename, trinity_out,
                                            final_evm, args.verbose)
    now = datetime.datetime.now().strftime(fmtdate)
    sys.stdout.write(('###RUNNING iASSEMBLER\t' + now + '\t###\n'))

    if not long_sorted_bam:
        #print("line 430")
        long_fasta, stranded_value_new = mseq.filterLongReads(long_reads, args.assembly_overlap_length, args.max_long_read, gmap_wd,
                                          adapter_value, threads_use, args.adapter_match_score, ref_rename,
                                              args.max_intron_length, args.verbose, stranded_value)
        if stranded_value != stranded_value_new:
            stranded_value = stranded_value_new

        if args.minimap2:
            long_sam = mapping.minimap(ref_rename, long_fasta, threads_use, args.max_intron_length, gmap_wd, args.verbose)
        else:
            long_sam = mapping.gmap('sam', ref_rename, long_fasta, threads_use, 'samse',
                                    args.min_intron_length, args.max_intron_length, args.end_exon, gmap_wd,
                                    args.verbose, Fflag=False)
        long_sorted_bam = mapping.sam_to_sorted_bam(long_sam, threads_use, wd, args.verbose)
        sam_orig_id = mapping.change_chr(long_sorted_bam, dict_ref_name, gmap_wd, threads_use, args.verbose, "long")
        final_files.append(sam_orig_id)

    # HERE WE MERGE THE GMAP OUTPUT WITH THE EVM OUTPUT TO HAVE ONE            # FILE
    # HERE WE CHECK IF WE HAVE THE PASA UPDATED FILE OR THE EVM
    # ORIGINAL FILE
    mergedmap_gff3 = logistic.catTwoBeds(long_sorted_bam, final_evm, args.verbose, consensus_wd)
    now = datetime.datetime.now().strftime(fmtdate)
    sys.stdout.write(("\t###GFFREAD\t" + now + "\t###\n"))

    # HERE WE TRANSFORM THE COODINATES INTO SEQUENCES USING THE
    # REFERENCE
    gffread_fasta_file = consensus.gffread(mergedmap_gff3, ref_rename, consensus_wd, args.verbose)
    # HERE WE STORE THE SEQUENCE IN A DICTIONARY

    gffread_dict = consensus.fasta2Dict(gffread_fasta_file)
    now = datetime.datetime.now().strftime(fmtdate)
    sys.stdout.write(("\t#CLUSTERING\t" + now + "\t###\n"))

    # HERE WE CLUSTER THE SEQUENCES BASED ON THE GENOME POSITION
    cluster_list = consensus.cluster_pipeline(mergedmap_gff3, stranded_value, args.verbose)
    now = datetime.datetime.now().strftime(fmtdate)
    sys.stdout.write(("\t#CONSENSUS FOR EACH CLUSTER\t" + now + "\t###\n"))
    # HERE WE MAKE CONSENSUS FOR EACH CLUSTER
    tmp_wd = consensus_wd + 'tmp/'
    logistic.check_create_dir(tmp_wd)
    tmp_assembly_file = tmp_wd + 'assembly.fasta'
    if os.path.isfile(tmp_assembly_file):
        sys.stdout.write('No assembly')
    else:
        consensus.generate_fasta(cluster_list, gffread_dict, args.cluster_min_evidence,
                                 args.cluster_max_evidence, args.assembly_overlap_length, stranded_value, tmp_wd)
        consensus.assembly(args.assembly_overlap_length, args.assembly_percent_identity, threads_use, tmp_wd,
                           args.verbose)
        utrs.lengthSupport(tmp_wd, threads_use)

    # WITH THE ELSE, WE ALLOW THE USER TO DECIDE TO CHANGE THE ASSEMBLY
    # PARAMETERS AND COLLECT DIFFERENT ASSEMBLED SEQUENCES WITHOT RUNNING
    # THE FULL PIPELINE
    # HERE WE COLLECT THE ASSEMBLED SEQUENCES. WE COLLECT ONLY SEQUENCE
    # THAT PASS THE FILTER
    tmp_consensus = os.path.join(consensus_wd , 'tmp/')
    collect.parse_only(args.assembly_read_threshold, tmp_consensus, args.verbose)
    tmp_assembly = collect.cat_assembled(tmp_consensus)
    tmp_assembly_all = collect.cat_assembled_all(tmp_consensus)
    # HERE WE COLLECT THE NEW ASSEMBLED SEQUENCES AND WE COLLECT THE OLD
    # EVM DATA
    now = datetime.datetime.now().strftime(fmtdate)
    sys.stdout.write(("###MAPPING CONSENSUS ASSEMBLIES\t" + now + "\t###\n"))
    # HERE WE MAP ALL THE FASTA FILES TO THE GENOME USING GMAP
    consensus_mapped_gff3 = mapping.gmap('cons', ref_rename, tmp_assembly, threads_use, 'gff3_gene',
                                         args.min_intron_length, args.max_intron_length, args.end_exon, gmap_wd,
                                         args.verbose, Fflag=True)
    now = datetime.datetime.now().strftime(fmtdate)
    sys.stdout.write(("###GETTING THE STRAND RIGHT\t" + now + "\t###\n"))
    merged_gff3 = collect.add_EVM(final_output, gmap_wd, consensus_mapped_gff3)
    #print(merged_gff3)
    update2 = grs.exonerate(ref_rename, merged_gff3, threads_use, exonerate_wd, args.verbose)
    print(ref_rename, update2)
    update3_1 = grs.remove_redudant(ref_rename, update2)
    print(update3_1)
    update3 = grs.genename_lorean(update3_1, args.verbose, exonerate_wd)
    print(update3)
    # HERE WE COMBINE TRINITY OUTPUT AND THE ASSEMBLY OUTPUT TO RUN AGAIN
    # PASA TO CORRECT SMALL ERRORS
    sys.stdout.write(("###FIXING GENES NON STARTING WITH MET\t" + now + "\t###\n"))
    fasta_all = logistic.cat_two_fasta(trinity_out, tmp_assembly_all, long_fasta, pasa_dir)
    round_n += 1
    update5 = pasa.update_database(threads_use, str(round_n), pasa_dir, pasadb,  ref_rename, fasta_all,
                                   update3, args.verbose)
    if args.verbose:
        sys.stdout.write(update5)
    round_n += 1
    update6 = pasa.update_database(threads_use, str(round_n), pasa_dir, pasadb,  ref_rename, fasta_all,
                                   update5, args.verbose)
    if args.verbose:
        sys.stdout.write(update6)
    final_update_update = grs.genename_last(update6, args.prefix_gene, args.verbose, pasa_dir, dict_ref_name, "lorean")
    final_files.append(final_update_update)
    final_update_stats = evmPipeline.gff3_stats(final_update_update, pasa_dir)
    final_files.append(final_update_stats)
    if "command" not in (iprscan_log.decode("utf-8")) and args.interproscan:
        annot, bad_models = iprscan.iprscan(masked_ref, final_update_update, interproscan_out_dir, args.threads)
        final_files.append(annot)
        final_files.append(bad_models)
    now = datetime.datetime.now().strftime(fmtdate)
    sys.stdout.write(('###CREATING OUTPUT DIRECTORY\t' + now + '\t###\n'))
    final_output_dir = os.path.join(output_dir,  args.out_dir + '_output')
    logistic.check_create_dir(final_output_dir)
    now = datetime.datetime.now().strftime(fmtdate)
    sys.stdout.write(("##PLACING OUTPUT FILES IN OUTPUT DIRECTORY\t" + now + "\t###\n"))
    for filename in final_files:
        if os.path.exists(filename):
            logistic.copy_file(filename, final_output_dir)
            cmdstring = "chmod -R 775 %s" % wd
            os.system(cmdstring)
    sys.exit("##### LOREAN FINISHED HERE. GOOD BYE. #####\n")
Beispiel #2
0
def adapter_find(reference_database, reads, threads, max_intron_length, working_dir, verbose):
    subset_fasta = reads + "subset.10000.fasta"
    count_reads = 0
    with open(subset_fasta, "w") as fh:
        for rec in SeqIO.parse(reads, "fasta"):
            if int(rec.id) < 10000:
                SeqIO.write(rec, fh, "fasta")

    bam = mapping.minimap(reference_database, subset_fasta, threads, max_intron_length, working_dir, verbose)
    #soft_clip_regions = soft_clip(bam)
    fasta_gz = bam + ".fasta.gz"
    cmd = "extractSoftclipped %s > %s" % (bam, fasta_gz)
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % cmd)
    extract_clip = subprocess.Popen(cmd, cwd=working_dir, shell=True)
    extract_clip.communicate()

    list_short = []
    list_long = []
    dict_uniq = {}
    with gzip.open(fasta_gz, "rt") as handle:
        for rec in SeqIO.parse(handle, "fasta"):
            name_seq = str(rec.id)
            name = name_seq.split("_")[0]
            if name in dict_uniq:
                if len(dict_uniq[name].seq) > len(rec.seq):
                    list_long.append(dict_uniq[name])
                    list_short.append(rec)
                else:
                    list_short.append(dict_uniq[name])
                    list_long.append(rec)
            else:
                dict_uniq[name] = rec
    long_file = fasta_gz + ".long.fasta"
    with open(long_file, "w") as fh:
        SeqIO.write(list_long, fh, "fasta")
    short_file = fasta_gz + ".short.fasta"
    with open(short_file, "w") as fh:
        SeqIO.write(list_short, fh, "fasta")
    list_file_clip = [(long_file, "long"),(short_file,"short")]

    list_file_adapter = []
    for clip_file in list_file_clip:
        kmer_start = 21
        list_kmer = []
        while kmer_start < 120:
            cmd = "jellyfish count -s 10000000 -m %s -o %s.%s.kmer %s" % (kmer_start, kmer_start, clip_file[1], clip_file[0])
            if verbose:
                sys.stderr.write('Executing: %s\n\n' % cmd)
            jelly_count = subprocess.Popen(cmd, cwd=working_dir, shell=True)
            jelly_count.communicate()
            cmd = "jellyfish dump -L 2 -ct %s.%s.kmer | sort -k2n | tail -n 1" % (kmer_start, clip_file[1])
            if verbose:
                sys.stderr.write('Executing: %s\n\n' % cmd)
            jelly_dump = subprocess.Popen(cmd, cwd=working_dir, stdout=subprocess.PIPE, shell=True)
            out_dump = jelly_dump.communicate()[0].decode('utf-8')
            mer = out_dump.split("\t")[0]
            a_count = mer.count("A")
            t_count = mer.count("T")
            if a_count > t_count:
                bias_count = a_count
            else:
                bias_count = t_count
            data_kmer = (kmer_start, mer, GC(mer), (bias_count/kmer_start)*100,  (bias_count/kmer_start)*100 - GC(mer) )
            list_kmer.append(data_kmer)
            kmer_start += 5

        value_adapter = 0
        for i in list_kmer:
            if i[4] > int(value_adapter):
                value_adapter = i[4]
                kmer_done = i[1]
    adapter_file = "adapter.fasta"
    with open(adapter_file, "w") as fh:
        record = SeqRecord(Seq(str(kmer_done)), id="adapter")
        SeqIO.write(record, fh, "fasta")
    return adapter_file
Beispiel #3
0
def adapter_find(reference_database, reads, threads, max_intron_length,
                 working_dir, verbose):
    subset_fasta = reads + "subset.10000.fasta"
    with open(subset_fasta, "w") as fh:
        for rec in SeqIO.parse(reads, "fasta"):
            if int(rec.id) < 10000:
                SeqIO.write(rec, fh, "fasta")

    bam = mapping.minimap(reference_database, subset_fasta, threads,
                          max_intron_length, working_dir, verbose)
    #soft_clip_regions = soft_clip(bam)
    fasta_gz = bam + ".fasta"
    cmd = "extractSoftclipped %s | zcat | fastqToFa /dev/stdin %s" % (bam,
                                                                      fasta_gz)
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % cmd)
    extract_clip = subprocess.Popen(cmd, cwd=working_dir, shell=True)
    extract_clip.communicate()

    list_short = []
    list_long = []
    dict_uniq = {}
    with open(fasta_gz, "r") as handle:
        for rec in SeqIO.parse(handle, "fasta"):
            name_seq = str(rec.id)
            name = name_seq.split("_")[0]
            if name in dict_uniq:
                if len(dict_uniq[name].seq) > len(rec.seq):
                    list_long.append(dict_uniq[name])
                    list_short.append(rec)
                else:
                    list_short.append(dict_uniq[name])
                    list_long.append(rec)
            else:
                dict_uniq[name] = rec
    long_file = fasta_gz + ".long.fasta"
    with open(long_file, "w") as fh:
        SeqIO.write(list_long, fh, "fasta")
    short_file = fasta_gz + ".short.fasta"
    with open(short_file, "w") as fh:
        SeqIO.write(list_short, fh, "fasta")
    list_file_clip = [(long_file, "long"), (short_file, "short")]
    for clip_file in list_file_clip:
        kmer_start = 21
        list_kmer = []
        while kmer_start < 120:
            cmd = "jellyfish count -s 10000000 -m %s -o %s.%s.kmer %s" % (
                kmer_start, kmer_start, clip_file[1], clip_file[0])
            if verbose:
                sys.stderr.write('Executing: %s\n\n' % cmd)
            jelly_count = subprocess.Popen(cmd, cwd=working_dir, shell=True)
            jelly_count.communicate()
            cmd = "jellyfish dump -L 2 -ct %s.%s.kmer | sort -k2n | tail -n 1" % (
                kmer_start, clip_file[1])
            if verbose:
                sys.stderr.write('Executing: %s\n\n' % cmd)
            jelly_dump = subprocess.Popen(cmd,
                                          cwd=working_dir,
                                          stdout=subprocess.PIPE,
                                          shell=True)
            out_dump = jelly_dump.communicate()[0].decode('utf-8')
            mer = out_dump.split("\t")[0]
            a_count = mer.count("A")
            t_count = mer.count("T")
            if a_count > t_count:
                bias_count = a_count
            else:
                bias_count = t_count
            data_kmer = (kmer_start, mer, GC(mer),
                         (bias_count / kmer_start) * 100,
                         (bias_count / kmer_start) * 100 - GC(mer))
            list_kmer.append(data_kmer)
            kmer_start += 5
        value_adapter = 0
        for i in list_kmer:
            if i[4] > int(value_adapter):
                value_adapter = i[4]
                kmer_done = i[1]

    adapter_file = os.path.join(working_dir, "adapter.fasta")
    if value_adapter > 0:
        with open(adapter_file, "w") as fh:
            record = SeqRecord(Seq(str(kmer_done)), id="adapter")
            SeqIO.write(record, fh, "fasta")
    return adapter_file