Ejemplo n.º 1
0
def cis_sites():
    """Return two example CIS sites.

    Both sites are on strand 1 and carry empty metadata; they differ only
    in id, chromosome and position.
    """
    site_specs = (
        ('CIS1', '1', 182408172),
        ('CIS2', '4', 132408091),
    )

    return [
        CisSite(id=site_id, chromosome=chrom, position=pos,
                strand=1, metadata=frozendict())
        for site_id, chrom, pos in site_specs
    ]
Ejemplo n.º 2
0
def cis_insertions():
    """Return two example insertions, each tagged with a ``cis_id``.

    Both insertions are on strand 1 with a support of 2; the ``cis_id``
    metadata entry names the CIS site the insertion belongs to.
    """
    shared = {'strand': 1, 'support': 2}

    # 1000 bp upstream of Trp53bp2.
    first = Insertion(id='INS1', chromosome='1', position=182408172,
                      metadata=frozendict({'cis_id': 'CIS1'}), **shared)

    # Different chromosome.
    second = Insertion(id='INS2', chromosome='4', position=77843175,
                       metadata=frozendict({'cis_id': 'CIS2'}), **shared)

    return [first, second]
Ejemplo n.º 3
0
def insertions():
    """Return three example insertions positioned relative to known genes.

    All insertions are on strand 1, have a support of 2 and carry empty
    metadata.
    """
    # Reference gene locations:
    #   Trp53bp2: 1: 182,409,172-182,462,432.
    #   Myh9: 15: 77,760,587-77,842,175.
    specs = (
        # 1000 bp upstream of Trp53bp2.
        ('INS1', '1', 182408172),
        # 2000 bp downstream of Myh9.
        ('INS2', '15', 77758587),
        # Different chromosome.
        ('INS3', '4', 77843175),
    )

    return [
        Insertion(id=ins_id, chromosome=chrom, position=pos,
                  strand=1, support=2, metadata=frozendict())
        for ins_id, chrom, pos in specs
    ]
Ejemplo n.º 4
0
    def _annotate_insertion(self, insertion):
        """Yield copies of ``insertion`` annotated with overlapping genes.

        Every window in ``self._windows`` is applied to the insertion's
        chromosome, position and strand, and the features it overlaps in
        ``self._trees`` are collected as unique
        ``(gene_id, gene_name, window name)`` hits. Hits whose gene name is
        in ``self._blacklist`` (when one is set) are discarded.

        One copy of the insertion is yielded per remaining hit, with
        ``gene_id``, ``gene_name`` and (if the window is named) ``window``
        merged into its metadata. When nothing remains, the original
        insertion is yielded unchanged.
        """
        index = self._trees

        # Gather the unique (gene_id, gene_name, window name) hits.
        matches = set()
        for win in self._windows:
            region = win.apply(insertion.chromosome, insertion.position,
                               insertion.strand)
            matches.update({
                (feat['gene_id'], feat['gene_name'], win.name)
                for feat in region.get_overlap(index)
            })

        # Drop hits for blacklisted gene names.
        if self._blacklist is not None:
            matches = {
                match for match in matches
                if match[1] not in self._blacklist
            }

        # Without any hit, pass the insertion through untouched.
        if not matches:
            yield insertion
            return

        for gene_id, gene_name, window_name in matches:
            annotation = {'gene_id': gene_id, 'gene_name': gene_name}

            if window_name is not None:
                annotation['window'] = window_name

            merged = toolz.merge(insertion.metadata, annotation)
            yield insertion._replace(metadata=frozendict(merged))
Ejemplo n.º 5
0
def assign_strand(cis_sites, insertions, mapping, min_homogeneity=0.75):
    """Yield CIS sites with a strand derived from their insertions.

    For each CIS site the strands of its insertions are averaged; the sign
    of that mean is the candidate strand, and the homogeneity is the
    fraction of insertions whose strand equals the candidate. If the
    homogeneity is below ``min_homogeneity``, the strand is set to nan.

    Args:
        cis_sites: Iterable of CIS sites (objects with ``id``, ``metadata``
            and ``_replace``).
        insertions: Iterable of insertions (objects with ``id`` and
            ``strand``).
        mapping: Maps a CIS site id to the ids of its insertions.
        min_homogeneity: Minimum homogeneity needed to keep the strand.

    Yields:
        A copy of each CIS site with ``strand`` replaced and with
        ``strand_mean`` and ``strand_homogeneity`` merged into its
        metadata.
    """
    by_id = {ins.id: ins for ins in insertions}

    for site in cis_sites:
        # Strands of the insertions that belong to this site.
        member_ids = mapping[site.id]
        strands = np.array([by_id[member].strand for member in member_ids])

        avg = np.mean(strands)
        consensus = np.sign(avg)
        agreement = np.sum(strands == consensus) / len(strands)

        # Too mixed to call a strand: signal this with a nan.
        assigned = np.nan if agreement < min_homogeneity else consensus

        extended = toolz.merge(site.metadata, {
            'strand_mean': avg,
            'strand_homogeneity': agreement
        })

        yield site._replace(strand=assigned, metadata=frozendict(extended))
Ejemplo n.º 6
0
def _to_insertion(ref, pos, strand, ends, id_=None, **kwargs):
    """Build an Insertion from a location and its supporting ends.

    The metadata holds ``depth`` (the number of entries in ``ends``) and
    ``depth_unique`` (the number of distinct entries), merged with any
    extra keyword arguments; on a key clash the keyword argument wins.
    The insertion's ``support`` is the resulting ``depth_unique`` value.

    Args:
        ref: Value used as the insertion's chromosome.
        pos: Value used as the insertion's position.
        strand: Value used as the insertion's strand.
        ends: Sized collection of hashable values supporting the insertion.
        id_: Optional insertion id.
        **kwargs: Additional metadata entries.

    Returns:
        The constructed Insertion.
    """
    depth_info = {'depth': len(ends), 'depth_unique': len(set(ends))}
    combined = toolz.merge(depth_info, kwargs)

    return Insertion(
        id=id_,
        chromosome=ref,
        position=pos,
        strand=strand,
        support=combined['depth_unique'],
        metadata=frozendict(combined))
Ejemplo n.º 7
0
    def _annotate_insertions(self, insertions, cis_map):
        """Yield insertions annotated with the genes of their CIS site.

        Each insertion's ``cis_id`` metadata entry is looked up in
        ``cis_map``, which is expected to give a collection of
        ``(gene_name, gene_id)`` pairs. One copy of the insertion is
        yielded per pair, with ``gene_id`` and ``gene_name`` merged into
        its metadata. Insertions without any gene are yielded once.

        If ``self._drop_cis_id`` is set, the ``cis_id`` entry is removed
        from the metadata of every yielded insertion.
        """
        for ins in insertions:
            gene_pairs = cis_map.get(ins.metadata['cis_id'], set())

            if len(gene_pairs) == 0:
                if not self._drop_cis_id:
                    # Nothing to add or remove: pass through as-is.
                    yield ins
                    continue

                stripped = dict(ins.metadata)
                stripped.pop('cis_id')
                yield ins._replace(metadata=frozendict(stripped))
                continue

            for gene_name, gene_id in gene_pairs:
                annotated = toolz.merge(
                    ins.metadata,
                    {'gene_id': gene_id, 'gene_name': gene_name})

                if self._drop_cis_id:
                    annotated.pop('cis_id')

                yield ins._replace(metadata=frozendict(annotated))
Ejemplo n.º 8
0
    def from_frame(cls, df):
        """Yield one ``cls`` instance per row of a dataframe.

        Columns named by ``cls._non_metadata_fields()`` are passed to the
        constructor as keyword arguments. Every other column is collected
        into the ``metadata`` frozendict, keeping only the values accepted
        by ``_not_nan``.

        This is a generator: the frame is checked and converted lazily,
        once iteration starts.

        Args:
            df: Dataframe to convert; it is first passed to
                ``cls.check_frame``.

        Yields:
            Instances of ``cls``, in row order.

        Raises:
            ValueError: If the frame has at least one row and lacks one of
                the required (non-metadata) columns.
        """

        cls.check_frame(df)

        columns = list(df.columns)
        basic_fields = set(cls._non_metadata_fields())

        # Keep column order so metadata is assembled deterministically.
        metadata_fields = [col for col in columns if col not in basic_fields]

        # The set of columns is the same for every row, so compute the
        # missing fields once instead of per row.
        missing_fields = basic_fields - set(columns)

        # Iterate plain tuples and pair them with the real column labels.
        # Named tuples would rename columns that are not valid identifiers
        # or that start with an underscore (e.g. 'gene name', '_score') to
        # positional names, breaking the lookups by column label below.
        for values in df.itertuples(index=False, name=None):
            # Raised per row (not up front) so that an empty frame still
            # yields an empty result, as before.
            if missing_fields:
                raise ValueError('Missing required fields ({})'.format(
                    ', '.join(sorted(missing_fields))))

            row_dict = dict(zip(columns, values))

            metadata = {k: row_dict.pop(k) for k in metadata_fields}
            metadata = frozendict(toolz.valfilter(_not_nan, metadata))

            yield cls(metadata=metadata, **row_dict)