Esempio n. 1
0
def writeGFF(out_path, shallow, args):
    """
    Write shallow areas and their annotations in a GFF3 file.

    One record is written per shallow area, ordered by reference name, start
    and end. Each annotation is stored in an attribute "ann_<n>" and each
    variant in an attribute "var_<n>" (n starts at 1), flattened as
    "key:value" pairs joined with "|".

    :param out_path: Path to the output file.
    :type out_path: str
    :param shallow: The list of shallow areas.
    :type shallow: anacore.region.RegionList
    :param args: Parameters used in analysis. The attributes input_annotations and inputs_variants are read.
    :type args: NameSpace
    """
    def flatten(pairs):
        # Serialise a mapping as "k:v|k:v" with items in sorted order
        return "|".join("{}:{}".format(key, val) for key, val in sorted(pairs.items()))

    def sort_key(area):
        return (area.reference.name, area.start, area.end)

    with GFF3IO(out_path, "w") as writer:
        for area in sorted(shallow, key=sort_key):
            gff_record = GFF3Record(
                area.reference.name,
                "shallowsAnalysis",
                "experimental_feature",
                area.start,
                area.end
            )
            # Annotations are only reported when an annotation file was provided
            if args.input_annotations is not None:
                for rank, annot in enumerate(area.annot["ANN"], start=1):
                    gff_record.annot["ann_{}".format(rank)] = flatten(annot)
            # Variants are only reported when at least one variants file was provided
            if len(args.inputs_variants) > 0:
                for rank, var_region in enumerate(area.annot["VAR"], start=1):
                    gff_record.annot["var_{}".format(rank)] = flatten(var_region.annot)
            writer.write(gff_record)
Esempio n. 2
0
def annotateDomains(annot_by_gene_id, in_domains):
    """
    Add proteins domains information in annotations.

    The proteins found in annot_by_gene_id are updated in place: each domain
    whose target protein is known is appended to protein["annot"]["domains"].

    :param annot_by_gene_id: Light genomic annotations by gene ID. Only genes overlapping breakends are present.
    :type annot_by_gene_id: dict
    :param in_domains: Path to the domains annotations file (format: GFF3). Each entry in file is one proteic domain. Required attributes are "Note" (long name) and "target_protein" (ID of the protein present in "input-annotations"). See AnaCore-utils/bin/ensemblInterProToGFF.py.
    :type in_domains: str
    """
    # Index every protein of every transcript by its name
    prot_by_id = {
        protein["name"]: protein
        for gene in annot_by_gene_id.values()
        for transcript in gene["transcripts"]
        for protein in transcript["proteins"]
    }
    # Attach each domain to its protein
    with GFF3IO(in_domains) as reader_dom:
        for domain in reader_dom:
            # Keep only the part before the first "." of the target protein ID
            target_id = domain.annot["target_protein"].split(".", 1)[0]
            if target_id not in prot_by_id:
                continue  # Domain on a protein absent from the annotations
            prot_domains = prot_by_id[target_id]["annot"].setdefault("domains", [])
            dom_annot = {"desc": domain.annot["Note"]}
            dom_data = {
                "annot": dom_annot,
                "end": domain.end,
                "start": domain.start
            }
            # Optional attributes are copied only when present in the record
            if "Dbxref" in domain.annot:
                dom_annot["id"] = domain.annot["Dbxref"]
            if "sub_segments" in domain.annot:
                dom_annot["sub"] = domain.annot["sub_segments"]
            prot_domains.append(dom_data)
Esempio n. 3
0
 def testRead(self):
     """Read self.tmp_in_gff with GFF3IO and compare the records to self.expected_records."""
     # Read
     with GFF3IO(self.tmp_in_gff) as FH_in:
         observed_records = list(FH_in)
     # Assert
     # assertEqual reports both lengths on failure, unlike assertTrue(a == b)
     self.assertEqual(len(self.expected_records), len(observed_records))
     # Lengths are equal at this point, so zip() does not drop any record
     for record_a, record_b in zip(self.expected_records, observed_records):
         self.assertFalse(diffRecord(record_a, record_b))
Esempio n. 4
0
 def testWrite(self):
     """Write self.expected_records with GFF3IO and compare the file content to self.expected_content."""
     # Write
     with GFF3IO(self.tmp_out_gff, "w") as writer:
         for curr_record in self.expected_records:
             writer.write(curr_record)
     # Read
     with open(self.tmp_out_gff) as reader:
         observed_content = reader.read().strip()
     # Assert
     # The expected text is compared after this phrase is rewritten with a plain space
     # NOTE(review): presumably the writer normalises this value on output - confirm
     expected_content = self.expected_content.replace("test	encode and decode", "test encode and decode")
     self.assertEqual(expected_content, observed_content)
Esempio n. 5
0
def getTranscriptAnnot(in_annot, gene_by_tr):
    """
    Get genomic model (genes, transcripts and exons) for the selected transcripts.

    Only records of type "mRNA" and "exon" with a "transcript_id" attribute
    are used. Transcript versions (text after the first ".") are ignored when
    matching against gene_by_tr.

    :param in_annot: Path to the genomic annotations (format: GFF3).
    :type in_annot: str
    :param gene_by_tr: Gene by selected transcripts.
    :type gene_by_tr: dict
    :return: The list of selected transcripts.
    :rtype: anacore.region.RegionList
    :raises Exception: If at least one selected transcript has no mRNA record in in_annot.
    """
    tr_by_id = dict()
    with GFF3IO(in_annot) as FH_annot:
        for record in FH_annot:
            if record.type not in ("mRNA", "exon") or "transcript_id" not in record.annot:
                continue
            tr_id = record.annot["transcript_id"].split(".")[0]  # Remove transcript version
            if tr_id not in gene_by_tr:  # Transcript is not in panel
                continue
            if record.type == "mRNA":
                # Only the first mRNA record of a transcript is kept
                if tr_id not in tr_by_id:
                    tr_by_id[tr_id] = Transcript(record.start, record.end,
                                                 record.strand,
                                                 record.seq_id, tr_id, {},
                                                 gene_by_tr[tr_id])
            else:
                # Store the exon. The mRNA record must come before its exons
                # in the file, otherwise this lookup raises a KeyError.
                tr_by_id[tr_id].addChild(
                    Exon(record.start, record.end, record.strand,
                         record.seq_id))
    # tr_by_id keys are always a subset of gene_by_tr keys, so a non-empty
    # difference is equivalent to a length mismatch.
    missing = set(gene_by_tr.keys()).difference(set(tr_by_id.keys()))
    if missing:
        # Report the path received as parameter: the function must not depend
        # on a global "args" object that may not exist in the caller's module.
        raise Exception(
            "The following transcripts are missing in {}: {}".format(
                in_annot, missing))
    return RegionList(tr_by_id.values())
                                prev_annot.end,
                                tr_id
                            )
                        )
                        prev_annot.start = min(genomic_start, prev_annot.start)
                        prev_annot.end = max(genomic_end, prev_annot.end)
    domains = [domain for tr_id, domain_by_dom_id in domains_by_tr_id.items() for domain_id, domain in domain_by_dom_id.items()]
    domains = sorted(domains, key=lambda x: (x.reference.name, x.start, x.end))

    # Add split info
    log.info("Add sub-segment without intron(s) for domains.")
    for curr_domain in domains:
        tr_id = curr_domain.annot["target_transcript"].split(".", 1)[0]
        transcript = tr_by_id[tr_id]
        segments = []
        for exon in sorted(transcript.children, key=lambda x: (x.start, x.end)):
            if exon.hasOverlap(curr_domain):
                segments.append("{}-{}".format(
                    max(curr_domain.start, exon.start),
                    min(curr_domain.end, exon.end)
                ))
        if len(segments) > 1:
            curr_domain.annot["sub_segments"] = ",".join(segments)

    # Write output
    log.info("Write output.")
    with GFF3IO(args.output_annotations, "w") as writer:
        for curr_domain in domains:
            writer.write(curr_domain)
    log.info("End of job")