def corr_profile(profile1, profile2, nspread, profile2_ready=False):
    """Calculate the Spearman correlation coefficient between two
    (possibly extended) cross-link profiles.

    Parameters
    ----------
    profile1, profile2 : pandas.Series of int
        Cross-link profiles. Index is positions, value is counts.
    nspread : int
        Number of bases to extend each profile in each direction.
    profile2_ready : bool, optional
        Has profile2 already been reindexed and spread (see below).

    Returns
    -------
    float
        Spearman correlation coefficient between the two profiles.

    Notes
    -----
    During each call the profile is reindexed so that bases with no
    cross-links are included in the index, and cross-link sites are
    extended.  As this is a slow process, if randomisations are applied
    to profile1 while profile2 is held constant, it is more efficient
    to apply this only once to profile2 and compare it to many
    randomisations of profile1.  If ``profile2_ready=True``, it is
    assumed that `profile2` is already supplied with 0-count positions
    indexed and cross-links extended.  This can be achieved with::

        profile2 = profile2.reindex(range(start, end))
        profile2 = profile2.fillna(0)
        profile2 = spread(profile2, nspread)

    A known flaw in this measure of the relationship between two
    profiles is that if profiles contain many 0s, even if the 0s are in
    the same positions, the correlation will be low.
    """
    profile1 = profile1.reindex(
        range(int(profile1.index.values.min()) - 1,
              int(profile1.index.values.max()) + 1)).fillna(0)
    profile1 = spread(profile1, nspread, False)

    if not profile2_ready:
        # BUG FIX: the upper bound was previously ``max()`` (exclusive),
        # which silently dropped the cross-link count at profile2's last
        # indexed position.  Use ``max() + 1`` so the final position is
        # retained, consistent with the treatment of profile1.
        profile2 = profile2.reindex(
            range(int(profile2.index.values.min()),
                  int(profile2.index.values.max()) + 1)).fillna(0)
        profile2 = spread(profile2, nspread, False)

    return profile1.corr(profile2, method="spearman")
def corr_profile(profile1, profile2, nspread, profile2_ready=False):
    """Spearman correlation between two cross-link profiles.

    Each profile is reindexed onto a contiguous base range (missing
    positions filled with 0) and its cross-link sites extended by
    `nspread` bases before correlating.  Pass ``profile2_ready=True``
    when `profile2` has already been reindexed and spread by the
    caller, to avoid repeating that (slow) preparation.
    """
    lo1 = int(profile1.index.values.min())
    hi1 = int(profile1.index.values.max())
    extended1 = spread(
        profile1.reindex(range(lo1 - 1, hi1 + 1)).fillna(0),
        nspread, False)

    if profile2_ready:
        extended2 = profile2
    else:
        lo2 = int(profile2.index.values.min())
        hi2 = int(profile2.index.values.max())
        extended2 = spread(
            profile2.reindex(range(lo2, hi2)).fillna(0),
            nspread, False)

    return extended1.corr(extended2, method="spearman")
def corr_profile(profile1, profile2, nspread, profile2_ready=False):
    """Return the Spearman correlation between two cross-link profiles.

    Parameters
    ----------
    profile1, profile2 : pandas.Series
        Cross-link profiles; index is base positions, values are counts.
    nspread : int
        Number of bases to extend cross-link sites in each direction
        (passed through to ``spread``).
    profile2_ready : bool, optional
        If True, `profile2` is assumed to already be reindexed and
        spread by the caller and is used as-is.

    Returns
    -------
    float
        Spearman correlation coefficient of the two prepared profiles.
    """
    # Fill in zero-count positions so every base in the span is present,
    # then extend cross-link sites by nspread bases.
    profile1 = profile1.reindex(
        range(
            int(profile1.index.values.min()) - 1,
            int(profile1.index.values.max()) + 1)).fillna(0)
    profile1 = spread(profile1, nspread, False)
    if not profile2_ready:
        # NOTE(review): profile2 uses range(min, max) — exclusive upper
        # bound — while profile1 uses (min-1, max+1); the last position
        # of profile2 is dropped by reindex.  Looks unintentional —
        # confirm against the sibling implementations.
        profile2 = profile2.reindex(
            range(int(profile2.index.values.min()),
                  int(profile2.index.values.max()))).fillna(0)
        profile2 = spread(profile2, nspread, False)
    return profile1.corr(profile2, method="spearman")
def pentamer_frequency(profile, length, regex_matches, nSpread=15):
    '''Calculate the frequency of each of a collection of sequence
    regexes on the provided read profile, and the corresponding sequence

    :param profile: A profile of the number of reads at each base
    :type profile: pandas.Series
    :param length: Length of the sequence represented by profile
    :param regex_matches: A pandas.Series of the locations of hits for
        a set of regexes, as returned by find_all_matches; the index
        entries are the matched patterns themselves
    :param nSpread: How far either side of each read to consider
    :rtype: pandas.Series with the count of each regex'''
    # k-mer size taken from the first pattern in the index; assumes all
    # patterns have the same length — TODO confirm with find_all_matches.
    kmer = len(regex_matches.index.values[0])
    # Pad the profile nSpread bases either side of the sequence; the
    # upper bound is shortened by kmer so a match starting at position p
    # stays within the padded array.
    profile = profile.reindex(np.arange(-nSpread, length + nSpread - kmer),
                              fill_value=0)
    # Extend each cross-link nSpread bases; the fourth argument shifts
    # the right-hand extension by kmer (semantics defined by spread()).
    profile = spread(profile, nSpread, False, nSpread - kmer)
    # Work on the raw ndarray so hits can be used for fancy indexing.
    profile = profile.values

    def _count_regex(hits):
        # Sum of profile counts at every match position; 0 if no hits.
        if hits.size:
            return profile[hits].sum()
        else:
            return 0

    return regex_matches.map(_count_regex)
def fdr(profile, exon, nspread, randomizations):
    '''Calculate the FDR of finding particular heights by using
    randomizations

    For each observed height h, the threshold is (mean + std) of the
    randomized height distributions divided by the observed P(X >= h)
    from :func:`Ph`.  Returns a Series of FDR values indexed like
    `profile`.'''
    # Observed height distribution P(X >= h) for the real profile.
    profile_Ph = Ph(profile, exon, nspread)
    # Same statistic over `randomizations` shuffled profiles; one row
    # per randomization, columns are heights.
    rands = rand_apply(profile, exon, randomizations, Ph, False,
                       exon, nspread)
    rands = rands.reindex(columns=profile_Ph.index)
    rands = rands.fillna(0)
    muh = rands.mean()
    sigmah = rands.std()
    # FDR threshold per height: randomized (mean + std) over observed.
    fdr_thresholds = (muh + sigmah) / profile_Ph
    # Map each base's (spread) height to its FDR.
    spread_profile = spread(profile, nspread)
    fdrs = spread_profile.map(fdr_thresholds)
    try:
        fdrs = fdrs.loc[profile.index]
    except KeyError:
        # Debug aid: dump the mismatched indexes before re-raising.
        # NOTE(review): Python 2 print statements — this file predates
        # Python 3.
        print profile.index
        print spread_profile.index
        print fdrs.index
        raise
    fdrs = fdrs.reindex(profile.index)
    return fdrs
def pentamer_frequency(profile, length, regex_matches, nSpread=15):
    '''Count reads overlapping each of a collection of sequence-regex
    hit sets on the provided read profile.

    :param profile: number of reads at each base
    :type profile: pandas.Series
    :param length: length of the sequence represented by profile
    :param regex_matches: pandas.Series of hit locations per regex, as
        returned by find_all_matches
    :param nSpread: bases either side of each read to consider
    :rtype: pandas.Series with the count for each regex'''
    kmer = len(regex_matches.index.values[0])
    # Pad to the considered window (shortened by kmer on the right so a
    # match start index stays inside the array), extend the reads, and
    # drop to a raw ndarray for fancy indexing.
    padded = profile.reindex(
        np.arange(-nSpread, length + nSpread - kmer), fill_value=0)
    counts = spread(padded, nSpread, False, nSpread - kmer).values

    def _overlap_total(positions):
        return counts[positions].sum() if positions.size else 0

    return regex_matches.map(_overlap_total)
def pentamer_frequency(profile, length, regex_matches, nSpread=15):
    '''Calculate the frequency of overlaps between a list of cross-link
    sites (possibly extended) and a collection of sites

    The second collection of sites would usually be a Series of arrays
    of positions as returned by :func:`find_all_matches`.

    Parameters
    ----------
    profile : pandas.Series
        A profile of the number of cross-links at each base.
    length : int
        Length of the sequence represented by `profile`.
    regex_matches : pandas.Series of nd.array of int
        Each array entry represents a single match on the sequence.
        Each array represents a different thing matched (e.g. a
        different regex).  This structure would usually be returned by
        :func:`find_all_matches`.
    nSpread : int, optional
        How far either side of a cross-link location to consider when
        calculating overlaps (defaults to 15).

    Returns
    -------
    pandas.Series of int
        Each entry is the number of overlaps between cross-link sites
        in profile and a single entry in `regex_matches`.  Same index
        as `regex_matches`.
    '''
    # Improvements over the previous version: the emptiness probe via
    # ``len(index.values[0])`` + IndexError left an unused ``kmer``
    # local and used exceptions for control flow; the redundant
    # ``results = results`` statement is gone.
    if regex_matches.empty:
        return pd.Series()

    # Pad the profile nSpread bases either side of the sequence, then
    # extend each cross-link; work on the raw ndarray so hit-position
    # arrays can be used for fancy indexing.
    profile = profile.reindex(np.arange(-nSpread, length + nSpread),
                              fill_value=0)
    profile = spread(profile, nSpread, False)
    profile = profile.values

    def _count_regex(hits):
        # Sum of cross-link counts at every match position; 0 if none.
        if hits.size:
            return profile[hits].sum()
        else:
            return 0

    return regex_matches.map(_count_regex)
def pentamer_frequency(profile, length, regex_matches, nSpread=15):
    '''Count overlaps between (possibly extended) cross-link sites and
    a collection of match-position sets.

    Parameters
    ----------
    profile : pandas.Series
        Number of cross-links at each base.
    length : int
        Length of the sequence represented by `profile`.
    regex_matches : pandas.Series of nd.array of int
        One array of match positions per matched pattern, as returned
        by :func:`find_all_matches`.
    nSpread : int, optional
        Bases either side of a cross-link to consider (default 15).

    Returns
    -------
    pandas.Series of int
        Overlap count for each entry of `regex_matches`, same index.
    '''
    try:
        # Only probes whether any match sets exist; the length itself
        # is not used further.
        kmer = len(regex_matches.index.values[0])
    except IndexError:
        return pd.Series()

    # Pad the window, extend cross-links, and drop to a raw ndarray so
    # the position arrays can be used for fancy indexing.
    padded = profile.reindex(
        np.arange(-nSpread, length + nSpread), fill_value=0)
    counts = spread(padded, nSpread, False).values

    def _overlap_total(positions):
        return counts[positions].sum() if positions.size else 0

    return regex_matches.map(_overlap_total)
def Ph(profile, exon, nspread):
    '''Calculates a Series, Ph, such that Ph[i] is the P(X >= i) where
    X is the height of signal on a base of the profile

    :param profile: cross-link counts indexed by base position
    :type profile: pandas.Series
    :param exon: object with ``start`` and ``end`` attributes giving
        the extent to consider
    :param nspread: bases to extend either side of each cross-link
    :rtype: pandas.Series mapping height i to P(X >= i)'''
    # Pad to the exon extent plus nspread either side, then extend the
    # cross-links in place (reindex=False: keep the padded index).
    profile = profile.reindex(np.arange(exon.start - nspread,
                                        exon.end + nspread + 1),
                              fill_value=0)
    profile = spread(profile, nspread, reindex=False)
    # Heights of zero carry no signal and are excluded from the pdf.
    profile = profile[profile > 0]
    pdf = profile.value_counts()
    pdf = pdf / pdf.sum()
    # NOTE(review): arange(0, max) excludes the maximum height itself,
    # so the tallest observed height is dropped from the pdf — looks
    # like it should be max + 1; confirm against callers.
    pdf = pdf.reindex(np.arange(0, pdf.index.values.max()), fill_value=0)
    # Survival function: P(X >= i) = 1 - CDF(i - 1); shifting the index
    # by one converts the cumulative sum into the ">=" form.
    cdf = 1 - pdf.cumsum()
    cdf.index = cdf.index + 1
    return cdf
def Ph(profile, exon, nspread):
    '''Calculates a Series, Ph, such that Ph[i] is the P(X >= i) where
    X is the height of signal on a base of the profile

    :param profile: cross-link counts indexed by base position
    :type profile: pandas.Series
    :param exon: object with ``start`` and ``end`` attributes giving
        the extent to consider
    :param nspread: bases to extend either side of each cross-link
    :rtype: pandas.Series mapping height i to P(X >= i)'''
    # BUG FIX: the lower bound previously read ``exon.start - spread``,
    # subtracting the *function* ``spread`` from an int (TypeError at
    # runtime).  The intended pad is ``nspread``, as on the upper bound
    # and in the sibling implementation of Ph.
    profile = profile.reindex(np.arange(exon.start - nspread,
                                        exon.end + nspread + 1),
                              fill_value=0)
    profile = spread(profile, nspread, reindex=False)
    # Heights of zero carry no signal and are excluded from the pdf.
    profile = profile[profile > 0]
    pdf = profile.value_counts()
    pdf = pdf / pdf.sum()
    pdf = pdf.reindex(np.arange(0, pdf.index.values.max()), fill_value=0)
    # Survival function: shift the index so entry i holds P(X >= i).
    cdf = 1 - pdf.cumsum()
    cdf.index = cdf.index + 1
    return cdf
def fdr(profile, exon, nspread, randomizations):
    '''Calculate the FDR of finding particular heights by using
    randomizations

    For each observed height h, the threshold is (mean + std) of the
    randomized height distributions divided by the observed P(X >= h)
    from :func:`Ph`.  Returns a Series of FDR values indexed like
    `profile`.'''
    # Observed height distribution for the real profile.
    profile_Ph = Ph(profile, exon, nspread)
    # Same statistic over `randomizations` shuffled profiles; one row
    # per randomization, columns are heights.
    rands = rand_apply(profile, exon, randomizations, Ph, False,
                       exon, nspread)
    rands = rands.reindex(columns=profile_Ph.index)
    rands = rands.fillna(0)
    muh = rands.mean()
    sigmah = rands.std()
    # FDR threshold per height: randomized (mean + std) over observed.
    fdr_thresholds = (muh + sigmah) / profile_Ph
    # Map each base's (spread) height to its FDR.
    spread_profile = spread(profile, nspread)
    fdrs = spread_profile.map(fdr_thresholds)
    try:
        fdrs = fdrs.loc[profile.index]
    except KeyError:
        # Debug aid: dump the mismatched indexes before re-raising.
        # (Parenthesised so the statements work under Python 2 and 3.)
        print(profile.index)
        print(spread_profile.index)
        print(fdrs.index)
        raise
    # BUG FIX: was ``fdrs.reindx`` — an AttributeError at runtime; the
    # intended call is ``reindex``, as in the sibling fdr().
    fdrs = fdrs.reindex(profile.index)
    return fdrs
def _update_spot_record(cache: Optional[GlobalCache] = None):
    """Update an existing SpotRecord from ``spot_record_data``.

    NOTE(review): closes over ``spot_record_data`` from an enclosing
    scope not visible in this chunk — presumably this def lives inside
    a method/factory that binds it; confirm against the full file.

    Looks up the record by (spot_record_time, device), merges freshly
    parsed values into it via ``SpotRecord.update`` and
    ``db.session.merge``, and returns the merged record (or None when
    the input is not PostData or no device can be resolved).
    """
    if not isinstance(spot_record_data, PostData):
        return None

    spot_record_time_: Union[dt, str, None]
    device_: Union[Device, str, int, None]
    device_name_: Optional[str]
    # Pull the three fields of interest out of the posted data (this
    # ``spread`` is the project's field-extractor, not the profile
    # helper of the same name).
    spot_record_time_, device_, device_name_ = spread(
        'spot_record_time', 'device', 'device_name')(spot_record_data)

    spot_record_time: Optional[dt]
    # Normalise str-or-datetime input onto a 5-minute grid.
    spot_record_time = (str_dt_normalizer(spot_record_time_,
                                          normalize_time(5)))

    # query with device id or device name
    # when have multiple record that refers to device
    # `device_name_` is prefered since it compatible with
    # the scheduler.
    device: Optional[Device]
    if device_name_ is None:
        # query with device id or device name
        if isinstance(device_, Device):
            device = device_
        # device id as str for query cache.
        elif can_be_int(device_):
            if cache is not None:
                device = get_cache(
                    cache,
                    ModelDataEnum._Device,
                    int(cast(Union[int, str], device_)))
            else:
                logger.debug('using database')
                device = (
                    Device.query
                    .filter_by(device_id=int(cast(Union[int, str],
                                                  device_)))
                    .first())
        else:
            # there must be a device for spot record.
            logger.error(
                'spot_record must have a device')
            return None
    else:
        device = (Device.query
                  .filter_by(device_name=device_name_)
                  .first())

    # Cache key only when both components are known.
    cache_key = ((spot_record_time, device)
                 if (spot_record_time is not None
                     and device is not None)
                 else None)

    # search in cache.
    if cache is not None and cache_key is not None:
        spot_record = (get_cache(
            cache, ModelDataEnum._SpotRecord, cache_key))
    else:
        spot_record = (
            SpotRecord
            .query
            .filter_by(spot_record_time=spot_record_time)
            .filter(
                and_(
                    SpotRecord.spot_record_time == spot_record_time,
                    SpotRecord.device == device))
            .first())

    # Build a fresh record from the posted data, then fold its values
    # into the existing row.  (Method name typo is upstream.)
    new_spot_record = ModelOperations._make_spot_reocrd(
        spot_record_data)
    if spot_record is not None and new_spot_record is not None:
        spot_record.update(new_spot_record)
        db.session.merge(spot_record)
        del new_spot_record
    return spot_record
def _add_spot_record(cache: Optional[GlobalCache] = None) \
        -> Optional[SpotRecord]:
    """Insert a SpotRecord built from ``spot_record_data`` if no
    equivalent record exists.

    NOTE(review): closes over ``spot_record_data`` from an enclosing
    scope not visible in this chunk — confirm against the full file.

    Resolves the owning Device (by Device instance, numeric id, or
    device name), deduplicates on (spot_record_time, device) via the
    cache or the database, and returns either the pre-existing record
    or the newly added one (None on invalid input / missing device).
    """
    if not isinstance(spot_record_data, PostData):
        return None

    spot_record_time_: Union[str, dt, None]
    device_: Union[str, int, Device, None]
    device_name_: Optional[str]
    device: Optional[Device]
    spot_record_time: Optional[dt]

    # Project field-extractor (not the profile helper of the same name).
    (spot_record_time_, device_, device_name_) = spread(
        'spot_record_time', 'device', 'device_name')(spot_record_data)

    logger.debug(spot_record_data)

    # time can either be dt or string.
    spot_record_time = (str_dt_normalizer(spot_record_time_,
                                          normalize_time(5)))

    # when have multiple record that refers to device
    # `device_name_` is prefered since it compatible with
    # the scheduler.
    if device_name_ is None:
        # query with device id or device name
        if isinstance(device_, Device):
            device = device_
        # device id as str for query cache.
        elif can_be_int(device_):
            if cache is not None:
                device = get_cache(
                    cache,
                    ModelDataEnum._Device,
                    int(cast(Union[int, str], device_)))
            else:
                logger.debug('using database')
                device = (
                    Device.query
                    .filter_by(device_id=int(cast(Union[int, str],
                                                  device_)))
                    .first())
        else:
            # there must be a device for spot record.
            logger.error(
                'spot_record must have a device')
            return None
    else:
        device = (Device.query
                  .filter_by(device_name=device_name_)
                  .first())

    # change in 2020-01-08
    # same device and same spot record time means the same record.
    # skip the record if device is None.

    # change in 2020-01-21
    # generate cache key for records in _LRUDictionary.
    cache_key = ((spot_record_time, device)
                 if (spot_record_time is not None
                     and device is not None)
                 else None)

    if cache is not None and cache_key is not None:
        spot_record = (get_cache(
            cache, ModelDataEnum._SpotRecord, cache_key))
    else:
        # find same spot_record expensive.
        spot_record = (
            SpotRecord
            .query
            .filter_by(
                spot_record_time=spot_record_time)
            .filter(
                and_(
                    SpotRecord.spot_record_time == spot_record_time,
                    SpotRecord.device == device))
            .first())

    if spot_record:
        logger.debug('record already exists.')
        return spot_record

    @db_exception('add_spot_record')
    def new() -> Optional[SpotRecord]:
        # Build and stage the new row; db_exception presumably handles
        # commit/rollback failures — TODO confirm its contract.
        new_spot_record = ModelOperations._make_spot_reocrd(
            spot_record_data)
        db.session.add(new_spot_record)

        # add new record into cache.
        if (cache_key is not None
                and new_spot_record is not None
                and cache is not None):
            _enum = ModelDataEnum._SpotRecord
            cache[_enum][cache_key] = new_spot_record
        return new_spot_record

    return new()
def _make(cache: Optional[GlobalCache] = None):
    """Construct a SpotRecord instance from ``spot_record_data``.

    NOTE(review): closes over ``spot_record_data`` from an enclosing
    scope not visible in this chunk — confirm against the full file.

    Resolves the owning Device (by instance, numeric id, or name),
    coerces the posted JSON fields to their column types, and returns
    the new (unsaved) SpotRecord, or None on invalid input, missing
    device, or a construction error.
    """
    if not isinstance(spot_record_data, PostData):
        return None

    # time can either be dt or string.
    spot_record_time_: Union[dt, str, None]
    device_: Optional[Union[Device, str, int]]
    device_name_: Optional[str]
    # Project field-extractor (not the profile helper of the same name).
    spot_record_time_, device_, device_name_ = spread(
        'spot_record_time', 'device', 'device_name')(spot_record_data)

    spot_record_time: Optional[dt]
    spot_record_time = str_dt_normalizer(
        spot_record_data.get('spot_record_time'), normalize_time(5))

    # query with device id or device name
    # get device first, then fetch device id
    # NOTE(review): re-fetches 'device' even though spread() above
    # already extracted it — equivalent value, kept for compatibility.
    device_ = spot_record_data.get('device')

    # query with device id or device name
    # when have multiple record that refers to device
    # `device_name_` is prefered since it compatible with
    # the scheduler.
    device: Optional[Device]
    if device_name_ is None:
        # query with device id or device name
        if isinstance(device_, Device):
            device = device_
        # device id as str for query cache.
        elif can_be_int(device_):
            if cache is not None:
                device = get_cache(
                    cache,
                    ModelDataEnum._Device,
                    int(cast(Union[int, str], device_)))
            else:
                logger.debug('using database')
                device = (
                    Device.query
                    .filter_by(device_id=int(cast(Union[int, str],
                                                  device_)))
                    .first())
        else:
            # there must be a device for spot record.
            logger.error(
                'spot_record must have a device')
            return None
    else:
        device = (Device.query
                  .filter_by(device_name=device_name_)
                  .first())

    device_id: Optional[int]
    device_id = device.device_id if device is not None else None

    # Coerce the stringly-typed JSON fields to their column types.
    json_convert(spot_record_data, 'window_opened', json_to_bool)
    json_convert(spot_record_data, 'temperature', float)
    json_convert(spot_record_data, 'humidity', float)
    json_convert(spot_record_data, 'ac_power', float)
    json_convert(spot_record_data, 'pm25', float)
    json_convert(spot_record_data, 'co2', float)

    try:
        spot_record = SpotRecord(
            spot_record_time=spot_record_time,
            device_id=device_id,
            window_opened=spot_record_data.get("window_opened"),
            temperature=spot_record_data.get("temperature"),
            humidity=spot_record_data.get("humidity"),
            ac_power=spot_record_data.get("ac_power"),
            pm25=spot_record_data.get("pm25"),
            co2=spot_record_data.get("co2"))
    except IntegrityError as e:
        logger.error(f"integirty error {e}")
        # BUG FIX: previously fell through to ``return spot_record``
        # with the variable never assigned, raising UnboundLocalError
        # and masking the real failure; return None explicitly instead.
        return None
    return spot_record