def read_matrix_file_and_add_as_features(self):
    """Load the HiC matrix and register every bin as a ``HiC_bin`` feature.

    The matrix is read from ``self._matrix_file`` and stored on
    ``self.hic_matrix``. Its data frame is re-indexed in place by the
    "Regions" column and the matrix is normalized by column sums. One
    ``gffutils.Feature`` per bin is then pushed into ``self._features``.
    """
    print("- Reading HiC matrix file")
    hicmatrix = hicsuntdracones.hicmatrix
    self.hic_matrix = hicmatrix.HiCMatrix(self._matrix_file)
    matrix_df = self.hic_matrix.hic_matrix_df
    matrix_df.set_index(matrix_df["Regions"], inplace=True)
    self.hic_matrix.normalize_by_columns_sum()

    for genome_bin in self.hic_matrix.bins():
        chrom = hicmatrix.remove_position_information(genome_bin)
        # Bin positions are counted from 0 while GFF coordinates start
        # at 1, so the start is shifted by one.
        first_base = hicmatrix.bin_number(genome_bin) + 1
        # NOTE(review): end is start + bin size; if GFF ends are treated
        # as inclusive this spans bin_size + 1 positions - confirm intended.
        bin_feature = gffutils.Feature(
            seqid=chrom,
            source='-',
            featuretype='HiC_bin',
            start=first_base,
            end=first_base + self._bin_size,
            strand="+",
            attributes=f"ID={genome_bin}",
        )
        self._features.update([bin_feature])
def create_parents(orphans, parent_dict, feature_list, intermediate_dict,
                   gene):
    """Rebuild the missing parent feature for each orphaned feature.

    For every distinct parent id in ``orphans`` a new "Liftoff" feature is
    created. It spans from the smallest start to the largest end of the
    features in ``feature_list`` whose first ``Parent`` attribute equals
    that id. Its feature type, id and attributes are copied from the
    entry in ``parent_dict`` or, failing that, ``intermediate_dict``.
    Each new feature is appended to ``feature_list``.

    :orphans: iterable of ``(feature, parent_id)`` pairs
    :returns: ``(new_orphans, top_feature)``. ``new_orphans`` lists
        ``(new_feature, its first Parent id)`` for every created feature
        whose id differs from ``gene.id``. ``top_feature`` is the created
        feature whose id equals ``gene.id``, or ``None`` if there is none.
    """
    handled_parents = set()
    next_orphans = []
    top_feature = None

    for _orphan, parent_id in orphans:
        # Several orphans can share a parent; build it only once.
        if parent_id in handled_parents:
            continue

        siblings = [
            candidate for candidate in feature_list
            if "Parent" in candidate.attributes
            and candidate.attributes["Parent"][0] == parent_id
        ]
        sibling_starts = [sibling.start for sibling in siblings]
        sibling_ends = [sibling.end for sibling in siblings]

        template = (parent_dict[parent_id] if parent_id in parent_dict
                    else intermediate_dict[parent_id])

        rebuilt = gffutils.Feature(
            seqid=siblings[0].seqid,
            source="Liftoff",
            featuretype=template.featuretype,
            start=min(sibling_starts),
            end=max(sibling_ends),
            strand=siblings[0].strand,
            id=template.id,
            attributes=template.attributes,
        )
        feature_list.append(rebuilt)

        if rebuilt.id == gene.id:
            top_feature = rebuilt
        else:
            # The rebuilt feature needs a parent of its own.
            next_orphans.append((rebuilt, rebuilt["Parent"][0]))
        handled_parents.add(parent_id)

    return next_orphans, top_feature
def location_to_feature(db, chrom, start, stop, strand, source, featuretype):
    """Build a feature for a genomic interval, annotated from nearby genes.

    A strand that is not in ``STRANDS`` is replaced by ``'.'``. The
    attributes of every 'gene' feature that ``db.region`` returns for the
    interval and strand are folded together with ``merge_attributes`` and
    attached to the new feature.

    :returns: ``gffutils.Feature`` with id
        ``exon:<chrom>:<start>-<stop>:<strand>``
    """
    strand = strand if strand in STRANDS else '.'

    genes_here = db.region(seqid=chrom, start=start, end=stop,
                           strand=strand, featuretype='gene')
    feature_id = 'exon:{chrom}:{start}-{stop}:{strand}'.format(
        chrom=chrom, start=start, stop=stop, strand=strand)

    merged = {}
    for gene in genes_here:
        merged = merge_attributes(merged, gene.attributes)

    return gffutils.Feature(chrom,
                            source=source,
                            featuretype=featuretype,
                            start=start,
                            end=stop,
                            strand=strand,
                            id=feature_id,
                            attributes=merged)
def test_pr_144():
    """An attribute whose only value is the empty string must round-trip."""
    # Regression: constructing this feature used to raise
    # UnboundLocalError: local variable 'part' referenced before assignment
    original = gffutils.Feature(attributes={'a': ['']})

    # The empty value is kept as-is and rendered as a bare key.
    assert original.attributes['a'] == ['']
    assert str(original) == ". . . . . . . . a"

    # Parsing the rendered line must give back an equal feature.
    reparsed = gffutils.feature.feature_from_line(str(original))
    assert reparsed == original
def top_level_features(db):
    """Yield the features of db that are not a child in any relation.

    :db: gffutils.FeatureDB
    :returns: Iterator of gffutils.Feature objects, one per row of the
        ``features`` table whose id never appears as ``child`` in the
        ``relations`` table
    """
    query = (
        "select * "
        "from features "
        "where id not in (select distinct(child) from relations)"
    )
    for record in db.execute(query):
        yield gffutils.Feature(**record)
def convert_all_children_coords(shortest_path_nodes, children, parent):
    """Lift the coordinates of each child feature along the aligned path.

    ``shortest_path_nodes`` is sorted in place by ``query_block_start``.
    Both ends of every child are converted with ``convert_coord``. A child
    is kept only when its lifted start is non-zero and the lifted ends
    differ (or the child was a single position to begin with).

    :returns: dict mapping child id to a new "Liftoff" ``gffutils.Feature``
        placed on ``shortest_path_nodes[1].reference_name``, with both
        lifted coordinates shifted by one
    """
    shortest_path_nodes.sort(key=lambda node: node.query_block_start)

    lifted_children = {}
    for child in children:
        new_start = convert_coord(child.start, parent, shortest_path_nodes)
        new_end = convert_coord(child.end, parent, shortest_path_nodes)

        liftable = ((new_start != new_end or child.start == child.end)
                    and new_start != 0)
        if not liftable:
            continue

        anchor = shortest_path_nodes[1]
        strand = get_strand(anchor, parent)
        lifted_children[child.id] = gffutils.Feature(
            id=child.id,
            seqid=shortest_path_nodes[1].reference_name,
            start=min(new_start, new_end) + 1,
            end=max(new_start, new_end) + 1,
            featuretype=child.featuretype,
            source="Liftoff",
            attributes=child.attributes,
            strand=strand,
        )
    return lifted_children
def add_matrix_bins_as_features(self, matrix_file, features):
    """Add one ``HiC_bin`` feature per matrix region to ``features``.

    The matrix in ``matrix_file`` is loaded and normalized by column
    sums. Its data frame is then re-indexed in place by the "Regions"
    column. Each region name is split at its last "-" into a chromosome
    part and a numeric position.

    :returns: the re-indexed interaction data frame
    """
    hic_matrix = hicsuntdracones.hicmatrix.HiCMatrix(matrix_file)
    hic_matrix.normalize_by_columns_sum()
    interaction_matrix = hic_matrix.hic_matrix_df
    interaction_matrix.set_index(interaction_matrix["Regions"], inplace=True)

    for region in interaction_matrix["Regions"]:
        chrom_part, _, offset = region.rpartition("-")
        # Bin positions are counted from 0 while GFF coordinates start
        # at 1, so the start is shifted by one.
        first_base = int(offset) + 1
        # NOTE(review): end is start + bin size; if GFF ends are treated
        # as inclusive this spans bin_size + 1 positions - confirm intended.
        bin_feature = gffutils.Feature(
            seqid=chrom_part,
            source='-',
            featuretype='HiC_bin',
            start=first_base,
            end=first_base + self._bin_size,
            strand="+",
            attributes=f"ID={region}",
        )
        features.update([bin_feature])
    return interaction_matrix
def output_RI(gff_out, donor, acceptor, intron_len, source="RI"):
    """
    Output a retained intron event.

    Six records are written to ``gff_out`` with ``write_rec``, in this
    order:

    1. gene ``<donor.coords_str>@<acceptor.coords_str>``
    2. mRNA ``<gene>.A`` (isoform that keeps the intron)
    3. exon ``<gene>.A.withRI`` (same span as the gene)
    4. mRNA ``<gene>.B`` (isoform that splices the intron out)
    5. exon ``<gene>.B.up`` (``donor.gff_start`` .. ``donor.gff_end``)
    6. exon ``<gene>.B.dn`` (``acceptor.gff_start`` .. ``acceptor.gff_end``)

    All records share the donor's chromosome and strand. ``intron_len``
    is accepted but not used.
    """
    chrom = donor.chrom
    strand = donor.strand
    ri_name = "%s@%s" % (donor.coords_str, acceptor.coords_str)

    # For GFF record purposes, ensure start <= end always
    gene_start = donor.start_coord
    gene_end = acceptor.end_coord
    if gene_start > gene_end:
        gene_start, gene_end = gene_end, gene_start

    def make_record(featuretype, start, end, attributes):
        # Every record of the event shares chromosome, source and strand.
        return gffutils.Feature(seqid=chrom,
                                source=source,
                                featuretype=featuretype,
                                start=start,
                                end=end,
                                strand=strand,
                                attributes=attributes)

    # Isoform A keeps the intron; isoform B splices it out.
    long_mRNA_name = "%s.A" % (ri_name)
    ri_exon_name = "%s.withRI" % (long_mRNA_name)
    short_mRNA_name = "%s.B" % (ri_name)
    up_exon_name = "%s.up" % (short_mRNA_name)
    dn_exon_name = "%s.dn" % (short_mRNA_name)

    # All records are built before anything is written, so a failure
    # while building leaves gff_out untouched.
    records = [
        make_record("gene", gene_start, gene_end,
                    {"ID": [ri_name], "Name": [ri_name]}),
        # Both mRNAs and the retained-intron exon span the whole gene.
        make_record("mRNA", gene_start, gene_end,
                    {"ID": [long_mRNA_name], "Parent": [ri_name]}),
        make_record("exon", gene_start, gene_end,
                    {"ID": [ri_exon_name], "Parent": [long_mRNA_name]}),
        make_record("mRNA", gene_start, gene_end,
                    {"ID": [short_mRNA_name], "Parent": [ri_name]}),
        make_record("exon", donor.gff_start, donor.gff_end,
                    {"ID": [up_exon_name], "Parent": [short_mRNA_name]}),
        make_record("exon", acceptor.gff_start, acceptor.gff_end,
                    {"ID": [dn_exon_name], "Parent": [short_mRNA_name]}),
    ]

    # Serialize records to GFF
    for record in records:
        gff_out.write_rec(record)
# Fragment: tags `feature` with a gene id and optionally emits a TSS record.
# `parent_gene`, `feature`, `args` and `tss_id` are defined outside this
# excerpt.

# A feature may have at most one parent gene.
assert len(parent_gene) < 2
if feature.featuretype == 'gene':
    # A gene carries its own ID under the configured gene-id attribute.
    feature.attributes[args.geneid] = feature.attributes['ID']
elif len(parent_gene) == 1:
    # Any other feature inherits the ID of its single parent gene.
    parent_gene = parent_gene[0]
    gene_id = parent_gene.attributes['ID']
    assert len(gene_id) == 1
    feature.attributes[args.geneid] = gene_id
if args.add_tss != '' and feature.featuretype == args.add_tss:
    # The TSS is the strand-aware first position of the feature.
    if feature.strand == '+':
        start = feature.start
    elif feature.strand == '-':
        start = feature.end
    else:
        raise Exception('Invalid strand: %s' % feature.strand)
    # NOTE(review): feature.attributes[args.geneid] is only set above for
    # genes and for features with a parent gene - confirm this cannot be
    # reached for other features.
    tss = gffutils.Feature(seqid=feature.seqid, source=feature.source, featuretype='TSS', start=start, end=start, strand=feature.strand, attributes={ 'ID': ['TSS_%s' % tss_id], 'Parent': feature['ID'], args.geneid: feature.attributes[args.geneid] })
    # Running counter that keeps the TSS ids unique.
    tss_id = tss_id + 1
    print(tss)
print(feature)