def writeGFF(out_path, shallow, args):
    """
    Write the shallow areas and their annotations in a GFF3 file.

    One record is written per shallow area, ordered by reference name, start
    and end. Overlapping annotations are stored in attributes "ann_1",
    "ann_2", ... and overlapping variants in "var_1", "var_2", ...; each value
    is a "|"-separated list of "key:value" items sorted by key.

    :param out_path: Path to the output file.
    :type out_path: str
    :param shallow: The list of shallow areas.
    :type shallow: anacore.region.RegionList
    :param args: Parameters used in analysis. The attributes read here are "input_annotations" and "inputs_variants".
    :type args: NameSpace
    """
    def flatten(pairs):
        # Serialise a mapping as "key:value" items sorted by key and joined by "|"
        return "|".join(
            ["{}:{}".format(key, val) for key, val in sorted(pairs.items())]
        )

    with GFF3IO(out_path, "w") as writer:
        ordered_areas = sorted(
            shallow,
            key=lambda area: (area.reference.name, area.start, area.end)
        )
        for area in ordered_areas:
            record = GFF3Record(
                area.reference.name,
                "shallowsAnalysis",
                "experimental_feature",
                area.start,
                area.end
            )
            # Annotations overlapping the area (only when an annotation file was provided)
            if args.input_annotations is not None:
                for rank, annot in enumerate(area.annot["ANN"], 1):
                    record.annot["ann_{}".format(rank)] = flatten(annot)
            # Variants overlapping the area (only when variants files were provided)
            if len(args.inputs_variants) > 0:
                for rank, variant in enumerate(area.annot["VAR"], 1):
                    record.annot["var_{}".format(rank)] = flatten(variant.annot)
            writer.write(record)
def annotateDomains(annot_by_gene_id, in_domains):
    """
    Add protein domains information in annotations.

    Each domain read from the file is appended to the list stored in
    protein["annot"]["domains"] of the targeted protein. Domains targeting a
    protein absent from the annotations are ignored. The annotations are
    modified in place and nothing is returned.

    :param annot_by_gene_id: Light genomic annotations by gene ID. Only genes overlapping breakends are present.
    :type annot_by_gene_id: dict
    :param in_domains: Path to the domains annotations file (format: GFF3). Each entry in file is one proteic domain. Required attributes are "Note" (long name) and "target_protein" (ID of the protein present in "input-annotations"). See AnaCore-utils/bin/ensemblInterProToGFF.py.
    :type in_domains: str
    """
    # Index every protein of the annotations by its name
    prot_by_id = {
        protein["name"]: protein
        for gene in annot_by_gene_id.values()
        for transcript in gene["transcripts"]
        for protein in transcript["proteins"]
    }
    # Optional GFF3 attributes and the key under which each one is stored
    optional_keys = (("Dbxref", "id"), ("sub_segments", "sub"))
    # Attach each domain to its protein
    with GFF3IO(in_domains) as reader_dom:
        for domain in reader_dom:
            # The part after the first "." (version) is removed from the protein ID
            prot_id = domain.annot["target_protein"].split(".", 1)[0]
            if prot_id not in prot_by_id:
                continue
            prot_domains = prot_by_id[prot_id]["annot"].setdefault("domains", [])
            entry = {
                "annot": {"desc": domain.annot["Note"]},
                "end": domain.end,
                "start": domain.start
            }
            for src_key, dest_key in optional_keys:
                if src_key in domain.annot:
                    entry["annot"][dest_key] = domain.annot[src_key]
            prot_domains.append(entry)
def testRead(self):
    """Check that the records read from the input GFF3 match the expected records."""
    # Read
    with GFF3IO(self.tmp_in_gff) as reader:
        observed_records = list(reader)
    # Assert
    # assertEqual reports both lengths on failure, unlike assertTrue(a == b)
    self.assertEqual(len(self.expected_records), len(observed_records))
    for expected, observed in zip(self.expected_records, observed_records):
        # diffRecord is expected to return a falsy value when records are identical
        self.assertFalse(diffRecord(expected, observed))
def testWrite(self):
    """Check that the content written by GFF3IO matches the expected file content."""
    # Write
    with GFF3IO(self.tmp_out_gff, "w") as writer:
        for record in self.expected_records:
            writer.write(record)
    # Read
    with open(self.tmp_out_gff) as reader:
        observed_content = reader.read().strip()
    # Assert
    # NOTE(review): both arguments of replace() look identical here, which
    # makes the call a no-op - confirm whether an encoded form of the text was
    # intended as the first argument.
    expected_content = self.expected_content.replace(
        "test encode and decode",
        "test encode and decode"
    )
    self.assertEqual(expected_content, observed_content)
def getTranscriptAnnot(in_annot, gene_by_tr):
    """
    Get genomic model (genes, transcripts and exons) for the selected transcripts.

    Only records of type "mRNA" or "exon" having a "transcript_id" attribute
    are used. The transcript version (part after the first ".") is removed
    before comparison with the selected transcripts.

    :param in_annot: Path to the genomic annotations (format: GFF3).
    :type in_annot: str
    :param gene_by_tr: Gene by selected transcripts.
    :type gene_by_tr: dict
    :return: The list of selected transcripts.
    :rtype: anacore.region.RegionList
    :raises Exception: If at least one selected transcript is missing in the annotations file.
    """
    tr_by_id = dict()
    with GFF3IO(in_annot) as FH_annot:
        for record in FH_annot:
            if record.type not in ("mRNA", "exon") or "transcript_id" not in record.annot:
                continue
            tr_id = record.annot["transcript_id"].split(".")[0]  # Remove transcript version
            if tr_id not in gene_by_tr:  # Transcript is not in panel
                continue
            if record.type == "mRNA":
                # Only the first mRNA record of a transcript is kept
                if tr_id not in tr_by_id:
                    tr_by_id[tr_id] = Transcript(
                        record.start,
                        record.end,
                        record.strand,
                        record.seq_id,
                        tr_id,
                        {},
                        gene_by_tr[tr_id]
                    )
            else:
                # Store the exon. The mRNA record must have been read before its
                # exons: otherwise this lookup raises a KeyError.
                tr_by_id[tr_id].addChild(
                    Exon(record.start, record.end, record.strand, record.seq_id)
                )
    if len(gene_by_tr) != len(tr_by_id):
        missing = set(gene_by_tr).difference(tr_by_id)
        # Report the file received as parameter instead of relying on a global "args"
        raise Exception(
            "The following transcripts are missing in {}: {}".format(
                in_annot,
                missing
            )
        )
    return RegionList(tr_by_id.values())
prev_annot.end, tr_id ) ) prev_annot.start = min(genomic_start, prev_annot.start) prev_annot.end = max(genomic_end, prev_annot.end) domains = [domain for tr_id, domain_by_dom_id in domains_by_tr_id.items() for domain_id, domain in domain_by_dom_id.items()] domains = sorted(domains, key=lambda x: (x.reference.name, x.start, x.end)) # Add split info log.info("Add sub-segment without intron(s) for domains.") for curr_domain in domains: tr_id = curr_domain.annot["target_transcript"].split(".", 1)[0] transcript = tr_by_id[tr_id] segments = [] for exon in sorted(transcript.children, key=lambda x: (x.start, x.end)): if exon.hasOverlap(curr_domain): segments.append("{}-{}".format( max(curr_domain.start, exon.start), min(curr_domain.end, exon.end) )) if len(segments) > 1: curr_domain.annot["sub_segments"] = ",".join(segments) # Write output log.info("Write output.") with GFF3IO(args.output_annotations, "w") as writer: for curr_domain in domains: writer.write(curr_domain) log.info("End of job")