Ejemplo n.º 1
0
        output_swissprot_pfam_supported_transcripts_ids,
        mode="combine")
    HMMER3.intersect_ids_from_files(
        output_swissprot_pfam_supported_transcripts_ids,
        output_supported_stats_ids,
        output_swissprot_pfam_or_hints_supported_transcripts_ids,
        mode="combine")
    HMMER3.intersect_ids_from_files(
        output_swissprot_pfam_supported_transcripts_ids,
        output_supported_stats_ids,
        output_swissprot_pfam_and_hints_supported_transcripts_ids,
        mode="common")

    print("Extracting sequences...")
    SequenceRoutines.extract_sequence_by_ids(
        output_pep, output_swissprot_pfam_or_hints_supported_transcripts_ids,
        output_swissprot_pfam_or_hints_supported_transcripts_pep)
    SequenceRoutines.extract_sequence_by_ids(
        output_pep, output_swissprot_pfam_and_hints_supported_transcripts_ids,
        output_swissprot_pfam_and_hints_supported_transcripts_pep)

    print("Extracting evidence...")
    AUGUSTUS.extract_evidence_by_ids(
        output_evidence_stats,
        output_swissprot_pfam_or_hints_supported_transcripts_ids,
        output_swissprot_pfam_or_hints_supported_transcripts_evidence)
    AUGUSTUS.extract_evidence_by_ids(
        output_evidence_stats,
        output_swissprot_pfam_and_hints_supported_transcripts_ids,
        output_swissprot_pfam_and_hints_supported_transcripts_evidence)
    print("Extracting longest isoforms...")
Ejemplo n.º 2
0
    def extract_proteins_from_output(self,
                                     augustus_output,
                                     protein_output,
                                     evidence_stats_file=None,
                                     supported_by_hints_file=None,
                                     complete_proteins_id_file=None,
                                     id_prefix="p."):
        """
        Extract predicted proteins and optional hint-support tables from AUGUSTUS output.

        The AUGUSTUS output is scanned line by line. For every
        "# protein sequence" block a record is written to ``protein_output``:
        a header ``>{id_prefix}{transcript_id}\\t gene=... start_presence=...
        stop_presence=...`` followed by the sequence on one line.

        :param augustus_output: path to the AUGUSTUS output file to parse.
        :param protein_output: path of the protein file to write.
        :param evidence_stats_file: optional path of a tab-separated table with
            one row per transcript (supported fraction, CDS/intron/UTR support,
            incompatible hint groups, protein length). When given, the longest
            isoforms are extracted from it afterwards and support histograms
            are drawn.
        :param supported_by_hints_file: optional path of a table with the same
            columns, restricted to transcripts whose supported fraction is > 0.
        :param complete_proteins_id_file: optional path; receives the prefixed
            ids of transcripts for which both a start_codon and a stop_codon
            feature were seen.
        :param id_prefix: string prepended to every transcript id on output.
        :raises ValueError: if the input ends in the middle of a multi-line
            protein sequence or of a hint-evidence block (truncated output).
        """
        stats_header = (
            "#gene_id\ttranscript_id\tsupported_fraction\tcds_support\tintron_support\t"
            "5'UTR_support\t3'UTR_support\tincompatible_hints_groups\tprotein_length\n"
        )

        ev_fd = sup_fd = complete_fd = None
        try:
            if evidence_stats_file:
                ev_fd = open(evidence_stats_file, "w")
                ev_fd.write(stats_header)

            # The original tested evidence_stats_file here, so this table was
            # opened (or not) based on the wrong argument.
            if supported_by_hints_file:
                sup_fd = open(supported_by_hints_file, "w")
                sup_fd.write(stats_header)

            if complete_proteins_id_file:
                complete_fd = open(complete_proteins_id_file, "w")

            with open(protein_output, "w") as out_fd, \
                    open(augustus_output, "r") as in_fd:
                for line in in_fd:
                    if line[:12] == "# start gene":
                        gene = line.strip().split()[-1]
                    elif "\ttranscript\t" in line:
                        # Transcript id is the value of the first attribute
                        # in the ninth tab-separated column.
                        transcript_id = line.split("\t")[8].split(
                            ";")[0].split("=")[1]
                        start_presence = False
                        stop_presence = False
                    elif "\tstart_codon\t" in line:
                        start_presence = True
                    elif "\tstop_codon\t" in line:
                        stop_presence = True
                    elif "# protein sequence" in line:
                        protein = line.strip().split("[")[-1]
                        if "]" in protein:
                            protein = protein.split("]")[0]
                        else:
                            # Sequence continues on following comment lines
                            # until the closing bracket.
                            while True:
                                next_line = in_fd.readline()
                                if not next_line:
                                    raise ValueError(
                                        "Unexpected end of %s inside a protein sequence"
                                        % augustus_output)
                                part = next_line.split()[-1]
                                if "]" in part:
                                    protein += part.split("]")[0]
                                    break
                                protein += part

                        if complete_fd is not None and start_presence and stop_presence:
                            complete_fd.write("%s%s\n" %
                                              (id_prefix, transcript_id))

                        out_fd.write(
                            ">%s%s\t gene=%s start_presence=%s stop_presence=%s\n"
                            % (id_prefix, transcript_id, gene,
                               str(start_presence), str(stop_presence)))
                        out_fd.write(protein)
                        # Remembered for the evidence row, which is read after
                        # the protein block of the same transcript.
                        protein_len = len(protein)
                        out_fd.write("\n")

                    elif (ev_fd is not None or sup_fd is not None) and \
                            line[:17] == "# % of transcript":
                        supported_fraction = line.strip().split()[-1]
                        # Consume the rest of the evidence block; the
                        # "incompatible hint groups" line terminates it.
                        while True:
                            tmp_line = in_fd.readline()
                            if not tmp_line:
                                # readline() returns "" at EOF; without this
                                # check the loop would never terminate.
                                raise ValueError(
                                    "Unexpected end of %s inside a hint evidence block"
                                    % augustus_output)
                            if tmp_line.startswith("# CDS exons:"):
                                cds_support = tmp_line.strip().split()[-1]
                            elif tmp_line.startswith("# CDS introns:"):
                                introns_support = tmp_line.strip().split()[-1]
                            elif tmp_line.startswith("# 5'UTR exons"):
                                five_utr_support = tmp_line.strip().split()[-1]
                            elif tmp_line.startswith("# 3'UTR exons"):
                                three_utr_support = tmp_line.strip().split()[-1]
                            elif tmp_line.startswith(
                                    "# incompatible hint groups:"):
                                incompatible_hint_groups = tmp_line.strip(
                                ).split()[-1]
                                write_supported = sup_fd is not None and \
                                    float(supported_fraction) > 0
                                if ev_fd is not None or write_supported:
                                    stats_row = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%i\n" % (
                                        gene, transcript_id,
                                        supported_fraction, cds_support,
                                        introns_support, five_utr_support,
                                        three_utr_support,
                                        incompatible_hint_groups, protein_len)
                                    if ev_fd is not None:
                                        ev_fd.write(stats_row)
                                    if write_supported:
                                        sup_fd.write(stats_row)
                                break
        finally:
            # Close (and flush) every table before the post-processing below
            # reads them, and also when parsing fails.
            for descriptor in (ev_fd, sup_fd, complete_fd):
                if descriptor is not None:
                    descriptor.close()

        if evidence_stats_file:
            # NOTE(review): the helper presumably also writes
            # "<output>.ids", which is consumed right after - confirm.
            self.extract_longest_isoforms(
                evidence_stats_file,
                "%s.longest_pep" % evidence_stats_file,
                minimum_supported_fraction=0)
            SequenceRoutines.extract_sequence_by_ids(
                protein_output, "%s.longest_pep.ids" % evidence_stats_file,
                "%s.longest_pep.pep" % evidence_stats_file)

        if supported_by_hints_file:
            supported_by_hints_longest_pep_evidence = "%s.longest_pep" % supported_by_hints_file
            supported_by_hints_longest_pep = "%s.longest_pep.pep" % supported_by_hints_file
            supported_by_hints_longest_pep_ids = "%s.longest_pep.ids" % supported_by_hints_file
            # As before, the full evidence table is filtered with a tiny
            # positive threshold. When no evidence table was requested, the
            # supported-by-hints table (same columns) is the only input left.
            self.extract_longest_isoforms(
                evidence_stats_file or supported_by_hints_file,
                supported_by_hints_longest_pep_evidence,
                minimum_supported_fraction=0.00001)
            SequenceRoutines.extract_sequence_by_ids(
                protein_output, supported_by_hints_longest_pep_ids,
                supported_by_hints_longest_pep)

        # Only draw histograms for tables that were actually produced.
        evidence_files = []
        if evidence_stats_file:
            evidence_files.append(evidence_stats_file)
        if supported_by_hints_file:
            if evidence_stats_file:
                evidence_files.append("%s.longest_pep" % evidence_stats_file)
            evidence_files.append("%s.longest_pep" % supported_by_hints_file)

        for evidence_file in evidence_files:
            print("Drawing transcript support distribution for %s" %
                  evidence_file)
            MatplotlibRoutines.percent_histogram_from_file(
                evidence_file,
                evidence_file,
                column_list=(2, ),
                separator=None,
                comments="#",
                n_bins=20,
                title="Transcript support by hints",
                xlabel="%%",
                ylabel="Number",
                extensions=["svg", "png"],
                legend_location="upper center",
                stats_as_legend=True)
Ejemplo n.º 3
0
# Output tables: one row per gene with its longest isoform, plus a bare id list.
descr_longest_isoform_fd = open(pep_description_longest_isoform, "w")
descr_longest_isoform_ids_fd = open(pep_description_longest_isoform_ids, "w")

for gene in syn_dict:
    isoform_ids = syn_dict[gene]
    isoform_lengths = [length_dict[isoform_id] for isoform_id in isoform_ids]

    # The first isoform with the strictly greatest positive length wins.
    best_isoform, best_length = None, 0
    for candidate, candidate_length in zip(isoform_ids, isoform_lengths):
        if candidate_length > best_length:
            best_isoform, best_length = candidate, candidate_length

    # gene <TAB> comma-separated isoform ids <TAB> comma-separated lengths
    descr_with_len_fd.write("%s\t%s\t%s\n" % (
        gene,
        ",".join(isoform_ids),
        ",".join(str(isoform_length) for isoform_length in isoform_lengths)))
    descr_longest_isoform_fd.write(
        "%s\t%s\t%i\n" % (gene, best_isoform, best_length))
    descr_longest_isoform_ids_fd.write(best_isoform)
    descr_longest_isoform_ids_fd.write("\n")

# Close all tables before the id list is read back below.
for open_table in (descr_with_len_fd,
                   descr_longest_isoform_fd,
                   descr_longest_isoform_ids_fd):
    open_table.close()

# Pull the sequences of the selected isoforms out of the input file.
SequenceRoutines.extract_sequence_by_ids(
    args.input,
    pep_description_longest_isoform_ids,
    pep_description_longest_isoform_pep,
    format="fasta",
    verbose=True)
Ejemplo n.º 4
0
    help=
    "Allow multiple coincidence report of sequences for partial coincidence mode."
    "By default an error is raised")
# Optional file with id synonyms; passed through to the extraction call below.
parser.add_argument(
    "-s", "--syn_file",
    action="store",
    dest="syn_file",
    help="File with synonyms of ids to use. Default - not set")
# Boolean switch; passed through as invert_match to the extraction call below.
parser.add_argument(
    "-r", "--invert_match",
    action="store_true",
    dest="invert_match",
    help="Invert match, i. e. remove sequences. Default - not set")

args = parser.parse_args()

# Keyword options for the extraction call, gathered in one place.
extraction_options = {
    "format": args.format,
    "verbose": True,
    "id_column_number": args.id_column,
    "coincidence_mode": args.coincidence_mode,
    "allow_multiple_coincidence_report": args.allow_multiple_coincidence_report,
    "syn_file": args.syn_file,
    "parsing_mode": "parse",
    "index_file": "tmp.idx",
    "invert_match": args.invert_match,
}

SequenceRoutines.extract_sequence_by_ids(
    args.input, args.id_file, args.output, **extraction_options)