Example #1
def corr_profile(profile1, profile2, nspread, profile2_ready=False):
    """Calculate the spearmans correlation coefficient btween two
    (possibly extended) cross-link profiles.
    
    Parameters
    ----------
    profile1, profile2 : pandas.Series of int
        Cross-link profiles. Index is positions, value is counts.
    nspread : int
        Number of bases to extend each profile in each direction
    profile2_ready : bool, optional
        Whether `profile2` has already been reindexed and spread (see Notes).
    
    Returns
    -------
    float
        Spearman's correlation coefficient between the two profiles
        
    Notes
    -----
    During each call, each profile is reindexed so that bases with no
    cross-links are included in the index, and cross-link sites are extended.
    
    This is a slow process, so if randomisations are applied to `profile1`
    while `profile2` is held constant, it is more efficient to apply it
    only once to `profile2` and compare the result to many randomisations
    of `profile1`. If `profile2_ready=True`, it is assumed that `profile2`
    is already supplied with zero-count positions indexed and cross-links
    extended. This can be achieved with:
        
        profile2 = profile2.reindex(range(start, end))
        profile2 = profile2.fillna(0)
        profile2 = spread(profile2, nspread)
        
    A known flaw in this measure of the relationship between two profiles
    is that if the profiles contain many zeros, the correlation will be
    low even when the zeros occupy the same positions.
    """
    
    profile1 = profile1.reindex(
                    range(int(profile1.index.values.min())-1,
                          int(profile1.index.values.max())+1)).fillna(0)
    profile1 = spread(profile1, nspread, False)
        
    if not profile2_ready:
        profile2 = profile2.reindex(
                       range(int(profile2.index.values.min()),
                             int(profile2.index.values.max()))).fillna(0)
        profile2 = spread(profile2, nspread, False)
        
    return profile1.corr(profile2, method="spearman")
Example #2
def corr_profile(profile1, profile2, nspread, profile2_ready=False):
    
    profile1 = profile1.reindex(
                    range(int(profile1.index.values.min())-1,
                          int(profile1.index.values.max())+1)).fillna(0)
    profile1 = spread(profile1, nspread, False)
        
    if not profile2_ready:
        profile2 = profile2.reindex(
                       range(int(profile2.index.values.min()),
                             int(profile2.index.values.max()))).fillna(0)
        profile2 = spread(profile2, nspread, False)
        
    return profile1.corr(profile2, method="spearman")
Example #3
def corr_profile(profile1, profile2, nspread, profile2_ready=False):

    profile1 = profile1.reindex(
        range(
            int(profile1.index.values.min()) - 1,
            int(profile1.index.values.max()) + 1)).fillna(0)
    profile1 = spread(profile1, nspread, False)

    if not profile2_ready:
        profile2 = profile2.reindex(
            range(int(profile2.index.values.min()),
                  int(profile2.index.values.max()))).fillna(0)
        profile2 = spread(profile2, nspread, False)

    return profile1.corr(profile2, method="spearman")
Example #4
def pentamer_frequency(profile, length, regex_matches, nSpread=15):
    '''Calculate the frequency of each of a collection
    of sequence regexes on the provided read profile and the
    corresponding sequence

        :param profile: A profile of the number of reads at each base
        :type profile: pandas.Series
        :param length: Length of the sequence represented by profile
        :param regex_matches: A pandas.Series of the locations of hits
                              for a set of regexes, as returned by
                              find_all_matches
        :param nSpread: How far either side of each read to consider

        :rtype: pandas.Series with the count of each regex'''

    kmer = len(regex_matches.index.values[0])
    profile = profile.reindex(np.arange(-nSpread, length + nSpread - kmer),
                              fill_value=0)
    profile = spread(profile, nSpread, False, nSpread - kmer)
    profile = profile.values

    def _count_regex(hits):

        if hits.size:
            return profile[hits].sum()
        else:
            return 0

    return regex_matches.map(_count_regex)
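
As a toy illustration of the `regex_matches` input (a Series of arrays of match positions, in the shape find_all_matches is described as returning) and the array-indexing trick inside `_count_regex`, skipping the reindex/spread steps:

import numpy as np
import pandas as pd

# Toy per-base profile over a sequence of length 10 (positions 0..9).
profile = pd.Series(0, index=np.arange(10))
profile[[2, 7]] = 3  # cross-links at positions 2 and 7

# regex_matches maps each regex to an array of its match positions; the
# regex names here are made up for illustration.
regex_matches = pd.Series({"AUUUA": np.array([1, 2]), "GGGGG": np.array([5])})

values = profile.values
counts = regex_matches.map(lambda hits: values[hits].sum() if hits.size else 0)
print(counts)  # AUUUA -> 3 (overlaps the cross-link at 2), GGGGG -> 0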
Example #5
def fdr(profile, exon, nspread, randomizations):
    '''Calculate the FDR of finding particular heights
    by using randomizations'''

    profile_Ph = Ph(profile, exon, nspread)
    rands = rand_apply(profile, exon,
                       randomizations,
                       Ph,
                       False,
                       exon,
                       nspread)
    rands = rands.reindex(columns=profile_Ph.index)
    rands = rands.fillna(0)
    muh = rands.mean()
    sigmah = rands.std()
    fdr_thresholds = (muh + sigmah) / profile_Ph
    spread_profile = spread(profile, nspread)
    fdrs = spread_profile.map(fdr_thresholds)
    try:
        fdrs = fdrs.loc[profile.index]
    except KeyError:
        print(profile.index)
        print(spread_profile.index)
        print(fdrs.index)
        raise
    fdrs = fdrs.reindex(profile.index)
    return fdrs
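
The `spread_profile.map(fdr_thresholds)` step relies on a pandas behaviour worth spelling out: mapping one Series with another uses the second as a lookup table, replacing each value with whatever the second Series holds at that index. A contrived illustration:

import pandas as pd

heights = pd.Series({100: 2, 101: 5, 102: 2})   # position -> signal height
fdr_thresholds = pd.Series({2: 0.30, 5: 0.01})  # height -> FDR threshold
fdrs = heights.map(fdr_thresholds)              # position -> FDR threshold
print(fdrs)  # 100 -> 0.30, 101 -> 0.01, 102 -> 0.30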
Example #6
def pentamer_frequency(profile, length, regex_matches, nSpread=15):
    '''Calculate the frequency of each of a collection
    of sequence regexes on the provided read profile and the
    corresponding sequence

        :param profile: A profile of the number of reads at each base
        :type profile: pandas.Series
        :param length: Length of the sequence represented by profile
        :param regex_matches: A pandas.Series of the locations of hits
                              for a set of regexes, as returned by
                              find_all_matches
        :param nSpread: How far either side of each read to consider

        :rtype: pandas.Series with the count of each regex'''

    kmer = len(regex_matches.index.values[0])
    profile = profile.reindex(
        np.arange(-nSpread, length+nSpread-kmer), fill_value=0)
    profile = spread(profile, nSpread, False, nSpread - kmer)
    profile = profile.values

    def _count_regex(hits):

        if hits.size:
            return profile[hits].sum()
        else:
            return 0

    return regex_matches.map(_count_regex)
Example #7
def pentamer_frequency(profile, length, regex_matches, nSpread=15):
    '''Calculate the frequency of overlaps between a list of cross-link
    sites (possibly extended) and a collection of sites.

    The second collection of sites would usually be a Series of arrays of
    positions as returned by :func:`find_all_matches`.

    Parameters
    ----------
    profile : pandas.Series
        A profile of the number of cross-links at each base.
    length : int
        Length of the sequence represented by `profile`.
    regex_matches : pandas.Series of nd.array of int
        Each array entry represents a single match on the sequence. Each
        array represents a different thing matched (e.g. a different
        regex). This structure would usually be returned by
        :func:`find_all_matches`.
    nSpread : int, optional
        How far either side of a cross-link location to consider when 
        calculating overlaps (defaults to 15).
    
    Returns
    -------
    pandas.Series of int
        Each entry is the number of overlaps between cross-link sites
        in `profile` and a single entry in `regex_matches`. Has the same
        index as `regex_matches`.
        
    '''

    try:
        kmer = len(regex_matches.index.values[0])
    except IndexError:
        return pd.Series()

    profile = profile.reindex(np.arange(-nSpread, length + nSpread),
                              fill_value=0)
    profile = spread(profile, nSpread, False)
    profile = profile.values

    def _count_regex(hits):

        if hits.size:
            return profile[hits].sum()
        else:
            return 0

    return regex_matches.map(_count_regex)
Example #8
def pentamer_frequency(profile, length, regex_matches, nSpread=15):
    '''Calculate the frequency of overlaps between a list of cross-link
    sites (possibly extended) and a collection of sites.

    The second collection of sites would usually be a Series of arrays of
    positions as returned by :func:`find_all_matches`.

    Parameters
    ----------
    profile : pandas.Series
        A profile of the number of cross-links at each base.
    length : int
        Length of the sequence represented by `profile`.
    regex_matches : pandas.Series of nd.array of int
        Each array entry represents a single match on the sequence. Each
        array represents a different thing matched (e.g. a different
        regex). This structure would usually be returned by
        :func:`find_all_matches`.
    nSpread : int, optional
        How far either side of a cross-link location to consider when 
        calculating overlaps (defaults to 15).
    
    Returns
    -------
    pandas.Series of int
        Each entry is the number of overlaps between cross-link sites
        in `profile` and a single entry in `regex_matches`. Has the same
        index as `regex_matches`.
        
    '''

    try:
        kmer = len(regex_matches.index.values[0])
    except IndexError:
        return pd.Series()
    
    profile = profile.reindex(
        np.arange(-nSpread, length+nSpread), fill_value=0)
    profile = spread(profile, nSpread, False)
    profile = profile.values

    def _count_regex(hits):

        if hits.size:
            return profile[hits].sum()
        else:
            return 0

    return regex_matches.map(_count_regex)
Example #9
def Ph(profile, exon, nspread):
    '''Calculates a Series, Ph, such that Ph[i] is
    P(X >= i), where X is the height of the signal on a base of the
    profile'''

    profile = profile.reindex(np.arange(exon.start-nspread,
                                        exon.end + nspread + 1),
                              fill_value=0)
    profile = spread(profile, nspread, reindex=False)
    profile = profile[profile > 0]
    pdf = profile.value_counts()
    pdf = pdf/pdf.sum()
    pdf = pdf.reindex(np.arange(0, pdf.index.values.max()), fill_value=0)
    cdf = 1 - pdf.cumsum()
    cdf.index = cdf.index + 1
    return cdf
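
The last three lines are the core trick: `pdf.cumsum()` at height i gives P(X <= i), so `1 - pdf.cumsum()` is P(X > i) = P(X >= i + 1), and shifting the index up by one turns that into P(X >= i). A self-contained sketch of just that step on toy heights (note the `max() + 1` here, which keeps the top height; the exclusive end of `np.arange` in the example above drops it):

import numpy as np
import pandas as pd

heights = pd.Series([1, 1, 2, 3, 3, 3, 5])  # toy per-base signal heights
pdf = heights.value_counts() / len(heights)
pdf = pdf.reindex(np.arange(0, pdf.index.values.max() + 1), fill_value=0)
cdf = 1 - pdf.cumsum()      # cdf[i] = P(X > i) = P(X >= i + 1)
cdf.index = cdf.index + 1   # now cdf[i] = P(X >= i)
print(cdf[2])               # 5/7: five of the seven bases have height >= 2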
Example #10
def Ph(profile, exon, nspread):
    '''Calculates a Series, Ph, such that Ph[i] is
    P(X >= i), where X is the height of the signal on a base of the
    profile'''

    profile = profile.reindex(np.arange(exon.start - nspread,
                                        exon.end + nspread + 1),
                              fill_value=0)
    profile = spread(profile, nspread, reindex=False)
    profile = profile[profile > 0]
    pdf = profile.value_counts()
    pdf = pdf / pdf.sum()
    pdf = pdf.reindex(np.arange(0, pdf.index.values.max()), fill_value=0)
    cdf = 1 - pdf.cumsum()
    cdf.index = cdf.index + 1
    return cdf
Example #11
def fdr(profile, exon, nspread, randomizations):
    '''Calculate the FDR of finding particular heights
    by using randomizations'''

    profile_Ph = Ph(profile, exon, nspread)
    rands = rand_apply(profile, exon, randomizations, Ph, False, exon, nspread)
    rands = rands.reindex(columns=profile_Ph.index)
    rands = rands.fillna(0)
    muh = rands.mean()
    sigmah = rands.std()
    fdr_thresholds = (muh + sigmah) / profile_Ph
    spread_profile = spread(profile, nspread)
    fdrs = spread_profile.map(fdr_thresholds)
    try:
        fdrs = fdrs.loc[profile.index]
    except KeyError:
        print(profile.index)
        print(spread_profile.index)
        print(fdrs.index)
        raise
    fdrs = fdrs.reindex(profile.index)
    return fdrs
Example #12
            def _update_spot_record(cache: Optional[GlobalCache] = None):
                if not isinstance(spot_record_data, PostData):
                    return None
                spot_record_time_: Union[dt, str, None]
                device_: Union[Device, str, int, None]
                device_name_: Optional[str]

                spot_record_time_, device_, device_name_ = spread(
                    'spot_record_time',
                    'device',
                    'device_name')(spot_record_data)

                spot_record_time: Optional[dt]
                spot_record_time = (str_dt_normalizer(spot_record_time_,
                                                      normalize_time(5)))

                # Query with device id or device name.
                # When multiple records refer to a device, `device_name_`
                # is preferred since it is compatible with the scheduler.
                device: Optional[Device]
                if device_name_ is None:
                    # query with device id or device name
                    if isinstance(device_, Device):
                        device = device_

                    # device id as str for query cache.
                    elif can_be_int(device_):
                        if cache is not None:
                            device = get_cache(
                                cache,
                                ModelDataEnum._Device,
                                int(cast(Union[int, str], device_)))
                        else:
                            logger.debug('using database')
                            device = (
                                Device.query
                                .filter_by(device_id=int(cast(Union[int, str],
                                                              device_)))
                                      .first())
                    else:
                        # there must be a device for spot record.
                        logger.error(
                            'spot_record must have a device')
                        return None

                else:
                    device = (Device.query
                              .filter_by(device_name=device_name_)
                              .first())

                cache_key = ((spot_record_time, device)
                             if (spot_record_time is not None
                                 and device is not None)
                             else None)

                # search in cache.
                if cache is not None and cache_key is not None:
                    spot_record = (get_cache(
                        cache,
                        ModelDataEnum._SpotRecord,
                        cache_key))
                else:
                    spot_record = (
                        SpotRecord
                        .query
                        .filter_by(spot_record_time=spot_record_time)
                        .filter(
                            and_(
                                SpotRecord.spot_record_time
                                == spot_record_time,
                                SpotRecord.device == device))
                        .first())

                new_spot_record = ModelOperations._make_spot_reocrd(
                    spot_record_data)

                if spot_record is not None and new_spot_record is not None:
                    spot_record.update(new_spot_record)

                db.session.merge(spot_record)
                del new_spot_record
                return spot_record
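
Note that `spread` here is a different helper from the pandas one in the earlier examples: it is applied as `spread('a', 'b', 'c')(data)` to pull named fields out of the posted data. Its implementation is not shown on this page; a minimal stand-in consistent with that usage might be:

from typing import Any, Callable, Mapping, Tuple

def spread(*keys: str) -> Callable[[Mapping[str, Any]], Tuple[Any, ...]]:
    # Hypothetical stand-in: return a function that picks the named keys
    # out of a mapping, substituting None for any that are missing.
    def pick(data: Mapping[str, Any]) -> Tuple[Any, ...]:
        return tuple(data.get(key) for key in keys)
    return pick

time_, device_, name_ = spread('spot_record_time', 'device', 'device_name')(
    {'spot_record_time': '2020-01-08 12:00:00', 'device': 3})
# time_ == '2020-01-08 12:00:00', device_ == 3, name_ is None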
Example #13
            def _add_spot_record(cache: Optional[GlobalCache] = None) \
                    -> Optional[SpotRecord]:
                if not isinstance(spot_record_data, PostData):
                    return None
                spot_record_time_: Union[str, dt, None]
                device_: Union[str, int, Device, None]
                device_name_: Optional[str]

                device: Optional[Device]
                spot_record_time: Optional[dt]

                (spot_record_time_, device_, device_name_) = spread(
                    'spot_record_time',
                    'device',
                    'device_name')(spot_record_data)

                logger.debug(spot_record_data)

                # time can either be dt or string.
                spot_record_time = (str_dt_normalizer(spot_record_time_,
                                                      normalize_time(5)))

                # When multiple records refer to a device, `device_name_`
                # is preferred since it is compatible with the scheduler.
                if device_name_ is None:
                    # query with device id or device name
                    if isinstance(device_, Device):
                        device = device_

                    # device id as str for query cache.
                    elif can_be_int(device_):
                        if cache is not None:
                            device = get_cache(
                                cache,
                                ModelDataEnum._Device,
                                int(cast(Union[int, str], device_)))
                        else:
                            logger.debug('using database')
                            device = (
                                Device.query
                                .filter_by(device_id=int(cast(Union[int, str],
                                                              device_)))
                                      .first())
                    else:
                        # there must be a device for spot record.
                        logger.error(
                            'spot_record must have a device')
                        return None

                else:
                    device = (Device.query
                              .filter_by(device_name=device_name_)
                              .first())

                # change in 2020-01-08
                # same device and same spot record time means the same record.
                # skip the record if device is None.

                # change in 2020-01-21
                # generate cache key for records in _LRUDictionary.

                cache_key = ((spot_record_time, device)
                             if (spot_record_time is not None
                                 and device is not None)
                             else None)

                if cache is not None and cache_key is not None:
                    spot_record = (get_cache(
                        cache,
                        ModelDataEnum._SpotRecord,
                        cache_key))

                else:  # finding the same spot_record is expensive.
                    spot_record = (
                        SpotRecord
                        .query
                        .filter_by(
                            spot_record_time=spot_record_time)
                        .filter(
                            and_(
                                SpotRecord.spot_record_time
                                == spot_record_time,
                                SpotRecord.device == device))
                        .first())

                if spot_record:
                    logger.debug('record already exists.')
                    return spot_record

                @db_exception('add_spot_record')
                def new() -> Optional[SpotRecord]:
                    new_spot_record = ModelOperations._make_spot_reocrd(
                        spot_record_data)
                    db.session.add(new_spot_record)

                    # add new record into cache.
                    if (cache_key is not None
                            and new_spot_record is not None
                            and cache is not None):
                        _enum = ModelDataEnum._SpotRecord
                        cache[_enum][cache_key] = new_spot_record
                    return new_spot_record
                return new()
Example #14
        def _make(cache: Optional[GlobalCache] = None):
            if not isinstance(spot_record_data, PostData):
                return None
            # time can either be dt or string.
            spot_record_time_: Union[dt, str, None]
            device_: Optional[Union[Device, str, int]]
            device_name_: Optional[str]

            spot_record_time_, device_, device_name_ = spread(
                'spot_record_time',
                'device',
                'device_name')(spot_record_data)

            spot_record_time: Optional[dt]
            spot_record_time = str_dt_normalizer(
                spot_record_data.get('spot_record_time'), normalize_time(5))

            # query with device id or device name
            # get device first, then fetch device id
            device_ = spot_record_data.get('device')

            # When multiple records refer to a device, `device_name_`
            # is preferred since it is compatible with the scheduler.
            device: Optional[Device]
            if device_name_ is None:
                # query with device id or device name
                if isinstance(device_, Device):
                    device = device_

                # device id as str for query cache.
                elif can_be_int(device_):
                    if cache is not None:
                        device = get_cache(
                            cache,
                            ModelDataEnum._Device,
                            int(cast(Union[int, str], device_)))
                    else:
                        logger.debug('using database')
                        device = (
                            Device.query
                            .filter_by(device_id=int(cast(Union[int, str],
                                                          device_)))
                                  .first())

                else:
                    # there must be a device for spot record.
                    logger.error(
                        'spot_record must have a device')
                    return None
            else:
                device = (Device.query
                          .filter_by(device_name=device_name_)
                          .first())

            device_id: Optional[int]
            device_id = device.device_id if device is not None else None

            json_convert(spot_record_data, 'window_opened', json_to_bool)
            json_convert(spot_record_data, 'temperature', float)
            json_convert(spot_record_data, 'humidity', float)
            json_convert(spot_record_data, 'ac_power', float)
            json_convert(spot_record_data, 'pm25', float)
            json_convert(spot_record_data, 'co2', float)

            try:
                spot_record = SpotRecord(
                    spot_record_time=spot_record_time,
                    device_id=device_id,
                    window_opened=spot_record_data.get("window_opened"),
                    temperature=spot_record_data.get("temperature"),
                    humidity=spot_record_data.get("humidity"),
                    ac_power=spot_record_data.get("ac_power"),
                    pm25=spot_record_data.get("pm25"),
                    co2=spot_record_data.get("co2"))

            except IntegrityError as e:
                logger.error(f"integrity error {e}")
                spot_record = None  # avoid a NameError on the return below

            return spot_record
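
`json_convert` is likewise not shown on this page; from its usage it appears to coerce a posted field in place. A minimal stand-in under that assumption:

from typing import Any, Callable, Dict

def json_convert(data: Dict[str, Any], key: str, conv: Callable[[Any], Any]):
    # Hypothetical stand-in: coerce data[key] in place with conv, leaving
    # absent or None values untouched.
    if data.get(key) is not None:
        data[key] = conv(data[key])

record = {'temperature': '21.5', 'humidity': None}
json_convert(record, 'temperature', float)  # record['temperature'] == 21.5
json_convert(record, 'humidity', float)     # left as None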