Example #1
0
 def read_matrix_file_and_add_as_features(self):
     """Load the HiC matrix and register each of its bins as a feature.

     The matrix read from ``self._matrix_file`` is stored in
     ``self.hic_matrix``, indexed by its "Regions" column, and
     ``normalize_by_columns_sum()`` is called on it. Every bin returned by
     ``self.hic_matrix.bins()`` is then added to ``self._features`` as a
     ``HiC_bin`` feature on the "+" strand whose ID is the bin name.
     """
     print("- Reading HiC matrix file")
     hicmatrix = hicsuntdracones.hicmatrix
     self.hic_matrix = hicmatrix.HiCMatrix(self._matrix_file)
     matrix_df = self.hic_matrix.hic_matrix_df
     matrix_df.set_index(matrix_df["Regions"], inplace=True)
     self.hic_matrix.normalize_by_columns_sum()
     for bin_name in self.hic_matrix.bins():
         seq_name = hicmatrix.remove_position_information(bin_name)
         # Bin counting starts at 0 whereas GFF coordinates start at 1,
         # hence the shift by one.
         first_base = hicmatrix.bin_number(bin_name) + 1
         # NOTE(review): with inclusive GFF coordinates this end spans
         # bin_size + 1 bases -- confirm that this is intended.
         bin_feature = gffutils.Feature(
             seqid=seq_name,
             source='-',
             featuretype='HiC_bin',
             start=first_base,
             end=first_base + self._bin_size,
             strand="+",
             attributes=f"ID={bin_name}")
         self._features.update([bin_feature])
def create_parents(orphans, parent_dict, feature_list, intermediate_dict,
                   gene):
    """Build the missing parent feature for every orphaned feature.

    For each distinct parent ID in ``orphans`` a new "Liftoff" feature is
    created that spans all features in ``feature_list`` naming that ID as
    their first ``Parent``. Feature type, ID and attributes are taken from
    the matching entry of ``parent_dict``, or of ``intermediate_dict`` when
    the ID is not in ``parent_dict``. Each new feature is appended to
    ``feature_list``.

    Returns a tuple ``(new_orphans, top_feature)``: ``new_orphans`` holds a
    ``(feature, parent_id)`` pair for each created feature whose ID differs
    from ``gene.id``; ``top_feature`` is the created feature whose ID equals
    ``gene.id``, or ``None`` when no such feature was created.
    """
    handled_parents = set()
    next_orphans = []
    root_feature = None
    for _orphan, parent_id in orphans:
        # Several orphans may share a parent; build it only once.
        if parent_id in handled_parents:
            continue
        handled_parents.add(parent_id)

        siblings = []
        for candidate in feature_list:
            attrs = candidate.attributes
            if "Parent" in attrs and attrs["Parent"][0] == parent_id:
                siblings.append(candidate)
        span_starts = [sibling.start for sibling in siblings]
        span_ends = [sibling.end for sibling in siblings]

        template = (parent_dict[parent_id] if parent_id in parent_dict
                    else intermediate_dict[parent_id])
        first_sibling = siblings[0]
        created = gffutils.Feature(
            seqid=first_sibling.seqid,
            source="Liftoff",
            featuretype=template.featuretype,
            start=min(span_starts),
            end=max(span_ends),
            strand=first_sibling.strand,
            id=template.id,
            attributes=template.attributes)
        feature_list.append(created)

        if created.id == gene.id:
            root_feature = created
        else:
            # The new feature still needs its own parent.
            next_orphans.append((created, created["Parent"][0]))
    return next_orphans, root_feature
Example #3
0
def location_to_feature(db, chrom, start, stop, strand, source, featuretype):
    """Create a feature for an interval, annotated from overlapping genes.

    Any ``strand`` not contained in ``STRANDS`` is replaced by ``'.'``.
    The attributes of all ``gene`` features that ``db.region`` reports for
    the interval are combined with ``merge_attributes`` and attached to the
    new feature, whose ID has the form
    ``exon:<chrom>:<start>-<stop>:<strand>``.

    Returns the newly built ``gffutils.Feature``.
    """
    strand = strand if strand in STRANDS else '.'
    genes_in_region = db.region(seqid=chrom, start=start, end=stop,
                                strand=strand, featuretype='gene')

    merged = {}
    for gene in genes_in_region:
        merged = merge_attributes(merged, gene.attributes)

    return gffutils.Feature(
        chrom,
        source=source,
        featuretype=featuretype,
        start=start,
        end=stop,
        strand=strand,
        id=f'exon:{chrom}:{start}-{stop}:{strand}',
        attributes=merged)
Example #4
0
def test_pr_144():
    """Regression test for a feature whose attribute value is ``['']``.

    Previously this scenario failed with
    ``UnboundLocalError: local variable 'part' referenced before
    assignment``.
    """
    feature = gffutils.Feature(attributes={'a': ['']})

    # The attribute must survive construction and serialisation unchanged.
    assert feature.attributes['a'] == ['']
    serialized = str(feature)
    assert serialized == ".	.	.	.	.	.	.	.	a"

    # Parsing the serialised line must give back an equal feature.
    round_tripped = gffutils.feature.feature_from_line(serialized)
    assert round_tripped == feature
Example #5
0
def top_level_features(db):
    """Yield every feature in ``db`` that is not the child of another.

    :db: gffutils.FeatureDB
    :returns: Iterator of top-level features (having no parent)

    """
    # A feature is top level when its id never occurs in the ``child``
    # column of the ``relations`` table.
    query = (
        "select * "
        "from features "
        "where id not in (select distinct(child) from relations)")
    yield from (gffutils.Feature(**record) for record in db.execute(query))
Example #6
0
def convert_all_children_coords(shortest_path_nodes, children, parent):
    """Lift the coordinates of ``children`` using the given path nodes.

    ``shortest_path_nodes`` is sorted in place by ``query_block_start``.
    Start and end of every child are converted with ``convert_coord``. A
    child is kept only when its lifted start is non-zero and either the
    lifted start and end differ or the child's own start and end are equal.

    Returns a dict mapping child ID to a new "Liftoff" feature that holds
    the lifted coordinates shifted by one and is placed on the
    ``reference_name`` of ``shortest_path_nodes[1]``.
    """
    shortest_path_nodes.sort(key=lambda node: node.query_block_start)
    lifted_features = {}
    for child in children:
        new_start = convert_coord(child.start, parent, shortest_path_nodes)
        new_end = convert_coord(child.end, parent, shortest_path_nodes)
        if new_start == 0:
            continue
        # Drop children whose distinct start and end collapsed onto one
        # lifted position.
        if new_start == new_end and child.start != child.end:
            continue
        anchor_node = shortest_path_nodes[1]
        strand = get_strand(anchor_node, parent)
        lower = min(new_start, new_end) + 1
        upper = max(new_start, new_end) + 1
        lifted_features[child.id] = gffutils.Feature(
            id=child.id,
            seqid=anchor_node.reference_name,
            start=lower,
            end=upper,
            featuretype=child.featuretype,
            source="Liftoff",
            attributes=child.attributes,
            strand=strand)
    return lifted_features
Example #7
0
 def add_matrix_bins_as_features(self, matrix_file, features):
     """Add every bin of a HiC matrix file to ``features``.

     The matrix is loaded from ``matrix_file``,
     ``normalize_by_columns_sum()`` is called on it, and its data frame is
     indexed by the "Regions" column. Each region name is split at its last
     "-" into a sequence name and a position, and a ``HiC_bin`` feature on
     the "+" strand is added to ``features`` for it.

     Returns the (re-indexed) data frame of the matrix.
     """
     hic_matrix = hicsuntdracones.hicmatrix.HiCMatrix(matrix_file)
     hic_matrix.normalize_by_columns_sum()
     interaction_matrix = hic_matrix.hic_matrix_df
     interaction_matrix.set_index(
         interaction_matrix["Regions"], inplace=True)
     for region in interaction_matrix["Regions"]:
         seq_name, _, position_text = region.rpartition("-")
         # Bin counting starts at 0 whereas GFF coordinates start at 1,
         # hence the shift by one.
         first_base = int(position_text) + 1
         # NOTE(review): with inclusive GFF coordinates this end spans
         # bin_size + 1 bases -- confirm that this is intended.
         bin_feature = gffutils.Feature(
             seqid=seq_name,
             source='-',
             featuretype='HiC_bin',
             start=first_base,
             end=first_base + self._bin_size,
             strand="+",
             attributes=f"ID={region}")
         features.update([bin_feature])
     return interaction_matrix
Example #8
0
def output_RI(gff_out, donor, acceptor, intron_len, source="RI"):
    """Output a retained intron event.

    Six records are written to ``gff_out`` with ``write_rec``, in this
    order:

    1. a ``gene`` named ``<donor.coords_str>@<acceptor.coords_str>``,
    2. the long ``mRNA`` (``<gene>.A``) that keeps the intron,
    3. its single ``exon`` (``<gene>.A.withRI``),
    4. the short ``mRNA`` (``<gene>.B``) that splices the intron out,
    5. its upstream ``exon`` (``<gene>.B.up``) taken from ``donor``,
    6. its downstream ``exon`` (``<gene>.B.dn``) taken from ``acceptor``.

    The gene, both mRNAs and the retained-intron exon all span from
    ``donor.start_coord`` to ``acceptor.end_coord``, swapped if necessary
    so that start <= end. Chromosome and strand are taken from ``donor``.

    :param gff_out: writer object providing ``write_rec(record)``
    :param donor: exon on the donor side of the intron
    :param acceptor: exon on the acceptor side of the intron
    :param intron_len: unused; kept so existing callers keep working
    :param source: value of the GFF ``source`` column of every record
    """
    chrom = donor.chrom
    strand = donor.strand

    def make_rec(featuretype, start, end, attributes):
        # All records of the event share chromosome, source and strand.
        return gffutils.Feature(seqid=chrom,
                                source=source,
                                featuretype=featuretype,
                                start=start,
                                end=end,
                                strand=strand,
                                attributes=attributes)

    ri_name = "%s@%s" % (donor.coords_str, acceptor.coords_str)
    gene_start = donor.start_coord
    gene_end = acceptor.end_coord
    # For GFF record purposes, ensure start <= end always
    if gene_start > gene_end:
        gene_start, gene_end = gene_end, gene_start

    # Isoform "A" keeps the intron, isoform "B" splices it out.
    long_mRNA_name = "%s.A" % (ri_name)
    ri_exon_name = "%s.withRI" % (long_mRNA_name)
    short_mRNA_name = "%s.B" % (ri_name)
    up_exon_name = "%s.up" % (short_mRNA_name)
    dn_exon_name = "%s.dn" % (short_mRNA_name)

    # Every record is built before the first one is written.
    records = [
        make_rec("gene", gene_start, gene_end,
                 {"ID": [ri_name], "Name": [ri_name]}),
        # The long mRNA and its only exon (the retained intron together
        # with both flanking exons) have the same extent as the gene.
        make_rec("mRNA", gene_start, gene_end,
                 {"ID": [long_mRNA_name], "Parent": [ri_name]}),
        make_rec("exon", gene_start, gene_end,
                 {"ID": [ri_exon_name], "Parent": [long_mRNA_name]}),
        # The short mRNA also spans the gene but has two separate exons.
        make_rec("mRNA", gene_start, gene_end,
                 {"ID": [short_mRNA_name], "Parent": [ri_name]}),
        make_rec("exon", donor.gff_start, donor.gff_end,
                 {"ID": [up_exon_name], "Parent": [short_mRNA_name]}),
        make_rec("exon", acceptor.gff_start, acceptor.gff_end,
                 {"ID": [dn_exon_name], "Parent": [short_mRNA_name]}),
    ]

    # Serialize records to GFF
    for rec in records:
        gff_out.write_rec(rec)
Example #9
0
    # NOTE(review): this is the tail of a definition whose header is not
    # visible; `feature`, `parent_gene`, `args` and `tss_id` are set up
    # before this point.
    # A feature may have at most one parent gene.
    assert len(parent_gene) < 2
    if feature.featuretype == 'gene':
        # A gene uses its own ID as the gene-id attribute.
        feature.attributes[args.geneid] = feature.attributes['ID']
    elif len(parent_gene) == 1:
        parent_gene = parent_gene[0]

        # Any other feature takes over the single ID of its parent gene.
        gene_id = parent_gene.attributes['ID']
        assert len(gene_id) == 1
        feature.attributes[args.geneid] = gene_id
    # Optionally print a one-base TSS feature for features of the
    # requested type, before the feature itself.
    if args.add_tss != '' and feature.featuretype == args.add_tss:
        # The TSS position is the start on '+' and the end on '-'.
        if feature.strand == '+':
            start = feature.start
        elif feature.strand == '-':
            start = feature.end
        else:
            raise Exception('Invalid strand: %s' % feature.strand)
        # NOTE(review): feature.attributes[args.geneid] is only assigned
        # above for genes and for features with a parent gene -- confirm
        # that every feature reaching this point falls into one of these.
        tss = gffutils.Feature(seqid=feature.seqid,
                               source=feature.source,
                               featuretype='TSS',
                               start=start,
                               end=start,
                               strand=feature.strand,
                               attributes={
                                   'ID': ['TSS_%s' % tss_id],
                                   'Parent': feature['ID'],
                                   args.geneid: feature.attributes[args.geneid]
                               })
        # Advance the counter that numbers the TSS IDs.
        tss_id = tss_id + 1
        print(tss)
    print(feature)