def format_1cs_df(input_df, group_col_list, time_col_list, value_col_list=None,
                  field_separator='\x1f', record_separator='\x1e'):
    """Serialise a DataFrame into 'key=value' cells joined by ASCII separators.

    The last group, time and value columns are terminated with the record
    separator, the final column with a newline, and every other column with
    the field separator.
    """
    if value_col_list is None:
        value_col_list = []
    if len(value_col_list) > 0:
        record_separator_columns = [group_col_list[-1], time_col_list[-1],
                                    value_col_list[-1]]
    else:
        record_separator_columns = [group_col_list[-1], time_col_list[-1]]
    df = input_df.astype(str)
    df_col_list = df.columns.tolist()
    new_line_char = '\n'
    # Build a frame of column names so that adding the data frame yields
    # 'column=value' in every cell.
    structure_df = _DataFrame(
        _np.tile(_np.array(df.columns), len(df.index)).reshape(len(df.index), -1),
        index=df.index, columns=df.columns) + '='
    key_value_df = structure_df.add(df)
    for index, col in enumerate(df_col_list):
        if index == len(df_col_list) - 1:
            key_value_df[col] = key_value_df[col].apply(
                lambda row_value: str(row_value) + new_line_char)
        elif col in record_separator_columns:
            key_value_df[col] = key_value_df[col].apply(
                lambda row_value: str(row_value) + record_separator)
        else:
            key_value_df[col] = key_value_df[col].apply(
                lambda row_value: str(row_value) + field_separator)
    return key_value_df
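
# Usage sketch for format_1cs_df, with hypothetical column names; assumes the
# module-level pandas/numpy aliases (_DataFrame, _np) are in place. Each cell
# becomes "column=value"; the last group and time columns end with the record
# separator and the final column with a newline.
#
#   import pandas as pd
#   sample = pd.DataFrame({'host': ['a', 'b'],
#                          'ts':   [1000, 1001],
#                          'load': [0.5, 0.7]})
#   formatted = format_1cs_df(sample,
#                             group_col_list=['host'],
#                             time_col_list=['ts'],
#                             value_col_list=['load'])
#   text = ''.join(formatted.apply(''.join, axis=1))
#   # -> 'host=a\x1ets=1000\x1eload=0.5\n...' one record per row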
def ReadFilesToDF(files, delim=None):
    """Read each file into a DataFrame (possibly a singleton) and concatenate them.

    The 'date' column is taken from a date found in the file name when one can
    be parsed; otherwise the file's position in the list is used, giving a
    monotonically increasing sequence.
    """
    allR = []
    for indexa, elem in enumerate(files):
        try:
            c0 = _read_csv(elem, error_bad_lines=False, header=None,
                           delimiter=delim, engine='python')
        except _EmptyDataError:
            # Empty file: keep a single placeholder row so the file still
            # appears in the merged frame.
            c0 = _DataFrame()
            c0[0] = [0]
        try:
            c0['date'] = _parse(elem, fuzzy=True)
        except ValueError:
            dateFound = FindDate(elem)
            if len(dateFound) == 0:
                c0['date'] = indexa
            else:
                c0['date'] = dateFound[0]
        allR.append(c0)
    merged = _concat(allR, ignore_index=True)
    merged.index = merged['date']
    return allR, merged
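
# Usage sketch for ReadFilesToDF (hypothetical file names; the files would
# need to exist on disk). A date parsed from each file name becomes that
# file's 'date' value and the index of the merged frame; files without a
# parseable date fall back to their position in the list.
#
#   frames, merged = ReadFilesToDF(['metrics_2020-01-01.csv',
#                                   'metrics_2020-01-02.csv'], delim=',')
#   merged.index   # -> dates parsed from the file names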
def _to_SpatialDataFrame(self, spatial_reference=None):
    """Return an arcgis spatially-enabled data frame.

    Arguments:
        spatial_reference   Either None (the default), a spatial reference
                            integer code, or a definition dictionary (for
                            example {'wkid': 3857}). If None, the spatial
                            reference is extracted from the GeoDataFrame when
                            its CRS is defined with an EPSG code.
    """
    if not spatial_reference:
        crs = self.crs
        if crs and 'init' in crs:
            if isinstance(crs, dict):
                crs = crs['init']
            if 'epsg' in crs:
                m = _re.search(r'epsg:(\d+)', crs)
                if m:
                    spatial_reference = int(m.groups()[0])
    if not spatial_reference:
        spatial_reference = 4326
        _warnings.simplefilter('always', UserWarning)
        _warnings.warn(
            'Unable to extract a spatial reference, assuming '
            'latitude/longitude (wkid 4326).')
    sdf = _DataFrame(data=self.drop('geometry', axis=1))
    sdf['SHAPE'] = self.geometry.apply(_as_arcgis,
                                       spatial_reference=spatial_reference)
    return sdf
def PacketAnalysis(df):
    """Pair burst start (+) and end (-) markers into rows of (start, end) times."""
    # Only keep non-zero entries
    df = df.dropna()
    df = df[(df.iloc[:, 1:].T != 0).any()]
    dfValue = df.iloc[:, 1]
    dfStart = df[dfValue > 0]['time']
    dfEnd = df[dfValue < 0]['time'].reset_index(drop=True)
    startList = []
    endList = []
    for burstStart in dfStart:
        # The matching end is the earliest end marker after this start.
        laterEnds = dfEnd[dfEnd > burstStart]
        if laterEnds.shape[0] > 0:
            burstEnd = laterEnds.sort_values().iloc[0]
        else:
            burstEnd = None
        startList.append(burstStart)
        endList.append(burstEnd)
    return _DataFrame({'start': startList, 'end': endList},
                      columns=['start', 'end'])
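
# Usage sketch for PacketAnalysis with a hand-built edge table: a 'time'
# column plus a second column holding +1 at burst starts and -1 at burst ends
# (the shape produced by PacketDetect below).
#
#   import pandas as pd
#   edges = pd.DataFrame({'time': [0.0, 0.3, 1.0, 1.4],
#                         'edge': [1.0, -1.0, 1.0, -1.0]})
#   PacketAnalysis(edges)   # -> bursts (0.0, 0.3) and (1.0, 1.4)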
def dataframe(self):
    """Pandas DataFrame representation of this path."""
    # Use an explicit None check: truth-testing an existing DataFrame raises
    # ValueError.
    if self._dataframe is None:
        if _DataFrame:
            self._dataframe = _DataFrame(self._results)
        else:
            logger.debug("'pandas' package not installed")
    return self._dataframe
def get_historical_price(fromSymbol, toSymbol=_CURR, frequency='d', limit=2000):
    '''
    :param frequency: 'd', 'h' or 'm'
    :param limit: number of bars to return; the API appears to cap this at 2000
    :return: DataFrame of the last bars of fromSymbol priced in toSymbol,
             indexed by timestamp
    '''
    url = (f"{_URL_CRYPTOCOMPARE}/{FREQUENCY[frequency]}"
           f"?fsym={fromSymbol}&tsym={toSymbol}&limit={limit}")  # &e={exchange}
    rawdata = _requests.get(url).content.decode()
    parseddata = _loads(rawdata)
    try:
        df = _DataFrame(parseddata['Data'])
        if not df.empty:
            df = df.set_index('time')
            df.index = _to_datetime(df.index, unit='s')
    except Exception as inst:
        print(f"{type(inst)}: {inst}")  # the exception instance
        df = _DataFrame()
    return df
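
# Usage sketch for get_historical_price (requires network access to the
# CryptoCompare endpoint configured in _URL_CRYPTOCOMPARE/FREQUENCY; column
# names are whatever the API returns, e.g. open/high/low/close).
#
#   btc_hourly = get_historical_price('BTC', 'USD', frequency='h', limit=500)
#   btc_hourly.tail()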
def ReadData(paths, delim=None):
    """Read each wildcard path in `paths` and collect the results in a master data frame.

    Each pattern is expanded with glob, read via ReadFilesToDF, and the first
    data column of the merged result is stored as one column of the output.
    """
    merged = _DataFrame()
    for indexa, elem in enumerate(paths):
        files = sorted(_glob(elem))
        allR, m1 = ReadFilesToDF(files, delim=delim)
        merged[indexa] = m1[0]
    return merged
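
# Usage sketch for ReadData (hypothetical glob patterns): each pattern is
# expanded, its files are merged by ReadFilesToDF, and the first data column
# of each merged result becomes one column of the combined frame.
#
#   combined = ReadData(['logs/node1/*.csv', 'logs/node2/*.csv'], delim=',')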
def PacketDetect(df, time, thresh=0, windowSize=1):
    """Low-pass filter the signal, threshold it, and mark rising/falling edges."""
    dt = time[1] - time[0]
    df = LowPassFilter(df, dt, windowSize=windowSize)
    # Filter out non-peaks: zero below the threshold, one elsewhere.
    df[(df < thresh)] = 0
    df[(df != 0)] = 1
    # The difference of the binary signal is +1 at burst starts, -1 at ends.
    df = df.diff()
    df = _DataFrame(df)
    df.insert(0, 'time', time)
    return df
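
# Usage sketch chaining PacketDetect and PacketAnalysis on a synthetic signal
# (assumes the LowPassFilter helper defined elsewhere in this module returns a
# pandas object).
#
#   import numpy as np
#   import pandas as pd
#   t = np.arange(0, 10, 0.1)
#   signal = pd.Series((np.sin(t) > 0.5).astype(float))
#   edges = PacketDetect(signal, t, thresh=0.5, windowSize=1)
#   bursts = PacketAnalysis(edges)   # -> start/end time of each burst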
def EvaluateSets(p1, p2, cols=None):
    """Compute column-wise error metrics between two aligned DataFrames."""
    if cols is None:
        cols = p1.columns
    df = _DataFrame(index=cols,
                    columns=['Euclid', 'Supremum', 'MSE', 'RMSE', 'NRMSE'])
    for col in cols:
        df.loc[col, 'Euclid'] = Euclid(p1[col], p2[col])
        df.loc[col, 'Supremum'] = Supremum(p1[col], p2[col])
        df.loc[col, 'MSE'] = MSE(p1[col], p2[col])
        df.loc[col, 'RMSE'] = RMSE(p1[col], p2[col])
        df.loc[col, 'NRMSE'] = NRMSE(p1[col], p2[col])
    return df
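
# Usage sketch for EvaluateSets (assumes the Euclid/Supremum/MSE/RMSE/NRMSE
# helpers defined in this module and that p1 and p2 share columns and length).
#
#   import pandas as pd
#   p1 = pd.DataFrame({'x': [1.0, 2.0, 3.0], 'y': [0.5, 0.4, 0.3]})
#   p2 = pd.DataFrame({'x': [1.1, 1.9, 3.2], 'y': [0.5, 0.5, 0.2]})
#   EvaluateSets(p1, p2)   # -> one row of error metrics per column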
def get_data(self, return_dict=False):
    data_dict = {
        'peak_id': [self.__peak_id],
        'binding_energy': [self.__be],
        'kinetic_energy': [self.get_kinetic_energy()],
        'transmission_func': [self.get_transmission()],
        'sf_wagner': [self.__sf_wagner],
        'sf_machine': [self.get_sf_machine()],
        'peak': [self.__peak_area],
        'peak_corrected': [self.get_peak_correction()],
    }
    if hasattr(self, '_XPSPeak__matrix_component'):
        data_dict['matrix_component'] = [self.__matrix_component]
    if return_dict:
        return data_dict
    return _DataFrame(data=data_dict)
def _from_records(vals, keys, typs=None, idxs=None):
    """ create a table from a series of records """
    if numpy.size(vals):
        assert numpy.ndim(vals) in (1, 2)
        vals = (numpy.array(vals) if numpy.ndim(vals) == 2
                else numpy.reshape(vals, (-1, 1)))
    else:
        vals = numpy.reshape([], (0, len(keys)))
    nrows, ncols = numpy.shape(vals)
    idxs = (_RangeIndex(stop=nrows, name=IDX_KEY) if idxs is None
            else _Int64Index(idxs, name=IDX_KEY))
    typs = (object,) * ncols if typs is None else typs
    typs = tuple(map(dt_, typs))
    assert len(keys) == len(typs) == ncols and len(idxs) == nrows
    assert IDX_KEY not in keys
    cols = numpy.transpose(vals)
    data = _OrderedDict((key, _Series(data=col, dtype=typ, index=idxs))
                        for col, key, typ in zip(cols, keys, typs))
    return _DataFrame(data=data, index=idxs)
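
# Usage sketch for _from_records (hypothetical records; IDX_KEY and the dt_
# type normaliser are assumed to be defined elsewhere in this module).
#
#   tbl = _from_records(vals=[[1, 'a'], [2, 'b']],
#                       keys=('count', 'label'),
#                       typs=(int, str))
#   tbl.dtypes   # column dtypes as mapped by dt_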
def _matricize_ybus(item):
    """Return the admittance matrix as a DataFrame labelled by the circuit's node order."""
    raw_n_ord = _lower(_odr.Circuit.YNodeOrder())
    mtx = _matricize(item)
    return _DataFrame(data=mtx, index=raw_n_ord, columns=raw_n_ord)
def openTSDB_data_processor(metric_names=None, query_string=None,
                            query_offset=20, tolerance=0.0):
    logger = _getLogger(__name__)
    logger.info('read failure tolerance is %s' % tolerance)
    logger.info('number of metric names: %d' % (len(metric_names)))
    metrics = list(
        set(['m=none:' + elem['metric']
             for elem in _itertools.chain.from_iterable(metric_names)]))
    logger.info('number of metrics as set: %d' % (len(metrics)))
    # Query OpenTSDB in batches of `query_offset` metrics per request.
    response_df_list = [
        _requests.get('%s%s' % (query_string, "&".join(
            metrics[queryIndex:queryIndex + query_offset]))).json()
        for queryIndex in range(0, len(metrics), query_offset)]
    flattened_list = list(_itertools.chain(*response_df_list))
    logger.info('number of elements in response list: %d' %
                (len(flattened_list)))
    if len(metrics) > 0:
        logger.info(str(metrics[:1]))
    super_dict = {'metric': [], 'dps': [], 'tags': [], 'aggregateTags': []}
    num_defective_records = 0
    num_correct_records = 0
    for ite in flattened_list:
        if ite == 'error' or type(ite) is not dict:
            num_defective_records += 1
        else:
            merge_dicts(super_dict, ite)
            num_correct_records += 1
    # A single failed batch can hide up to `query_offset` metrics.
    max_defective_records = num_defective_records * query_offset
    total_num_queries = max_defective_records + num_correct_records
    logger.info('number of read request errors are: %s' % num_defective_records)
    logger.info('max number of failed reads are: %s' % max_defective_records)
    logger.info('number of total queries sent are %s' % total_num_queries)
    if total_num_queries > 0:
        read_error_perc = max_defective_records / total_num_queries
    else:
        # Metrics were sent to tsdb for query, but no time series were returned.
        logger.warning('total_num_queries is zero. no input data found in tsdb')
        raise ValueError("NoRecordsFound")
    if read_error_perc > tolerance:
        raise ValueError(
            'Stopping execution as read errors exceeded tolerance for num '
            'metrics queried that do not exist in OTSDB. Read error percent '
            'is %s and tolerance is %s' % (read_error_perc, tolerance))
    causal_data_frame = _DataFrame.from_dict(super_dict)
    # Expand each metric's 'dps' dictionary into one column per timestamp,
    # then transpose so rows are timestamps and columns are metric names.
    new_df = causal_data_frame.drop(
        ['dps', 'aggregateTags', 'tags'], axis=1).assign(
        **_DataFrame(causal_data_frame.dps.values.tolist()))
    transposed_df = new_df.T
    cleaned_causal_df = transposed_df.rename(columns=dict(
        zip(transposed_df.columns.tolist(),
            transposed_df.iloc[0].astype(str)))).drop(transposed_df.index[0])
    numeric_causal_df = cleaned_causal_df.apply(_to_numeric, errors='ignore')
    return numeric_causal_df
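
# Usage sketch for openTSDB_data_processor (hypothetical OpenTSDB endpoint;
# metric_names is a list of lists of dicts with a 'metric' key, matching the
# chained iteration above, and query_string must end so that the
# 'm=none:<metric>' terms can be appended directly).
#
#   metrics = [[{'metric': 'sys.cpu.user'}, {'metric': 'sys.cpu.sys'}]]
#   query = 'http://opentsdb:4242/api/query?start=1h-ago&'
#   df = openTSDB_data_processor(metric_names=metrics, query_string=query)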
def processInputData(inputa, fieldSeparator=None, recordSeparator=None,
                     otsdbExcludeCharRegex=r'[^a-zA-Z\d\-_%s/]+',
                     otsdbExcludeCharReplacer='_'):
    logger = _getLogger(__name__)
    logger.info("in processInputData")
    output = {}
    if fieldSeparator is None or recordSeparator is None:
        return output
    engine = 'c'
    if len(recordSeparator) > 1:
        engine = 'python'
    lines = inputa.strip().split('\n')
    # Each line holds three record-separated sections: group columns, time
    # columns and value columns, each of which is field-separated.
    partsAll = [elem.strip().split(recordSeparator) for elem in lines]
    subfields = []
    for selector in range(3):
        subfields.append(
            [elem[selector].split(fieldSeparator) for elem in partsAll])
    for i in range(len(subfields[0])):
        clean_row_part(subfields[0][i], otsdbExcludeCharRegex,
                       otsdbExcludeCharReplacer=otsdbExcludeCharReplacer,
                       excludeChars='')
        clean_row_part(subfields[1][i], otsdbExcludeCharRegex,
                       otsdbExcludeCharReplacer=otsdbExcludeCharReplacer,
                       excludeChars='')
        clean_row_part(subfields[2][i], otsdbExcludeCharRegex,
                       otsdbExcludeCharReplacer=otsdbExcludeCharReplacer,
                       excludeChars='.')
    columns = [elem.split('=')[0] for elem in subfields[0][0]]
    times = [elem.split('=')[0] for elem in subfields[1][0]]
    values = [elem.split('=')[0] for elem in subfields[2][0]]
    dfAll = []
    for currentFieldIndex, currentField in enumerate(subfields):
        d1 = []
        logger.info(currentFieldIndex)
        for indexa, elema in enumerate(currentField):
            try:
                d1.append({elem.split('=')[0]: elem.split('=')[1]
                           for elem in elema})
            except Exception as e:
                logger.info(e)
                logger.debug(elema)
                logger.debug(d1[0].keys())
                raise e
        dfAll.append(_DataFrame(d1))
    dfMaster = _concat(dfAll, axis=1)
    logger.info('before timestamp normalization')
    timestamp = normalizeTimeStamps(dfMaster, time_columns=times)
    dfMaster['timestamp'] = timestamp
    if logger.isEnabledFor(_DEBUG):
        logger.debug("type debug:%s" % (type(timestamp[0])))
        logger.debug("%s" % (timestamp[0]))
        logger.debug("type debug:%s" % (type(dfMaster['timestamp'][0])))
        logger.debug("%s" % (dfMaster['timestamp'][0]))
    output = {}
    output['df'] = dfMaster
    output['time_columns'] = times
    output['value_columns'] = values
    output['group_columns'] = columns
    logger.info("times=%s, values=%s, columns=%s" % (times, values, columns))
    logger.info("leave processInputData")
    return output
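
# Usage sketch for processInputData on text shaped like the output of
# format_1cs_df above (ASCII unit separator between fields, record separator
# between the group/time/value sections, one line per row).
#
#   text = ('host=a\x1fregion=us\x1ets=1000\x1eload=0.5\n'
#           'host=b\x1fregion=eu\x1ets=1001\x1eload=0.7\n')
#   out = processInputData(text, fieldSeparator='\x1f', recordSeparator='\x1e')
#   out['df'], out['group_columns'], out['time_columns'], out['value_columns']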
        self.semimin = self.semimaj * (1 - self.flatten)    # b
        self.semiminsq = self.semimin * self.semimin        # b**2
        self.ecc1sq = self.flatten * (2 - self.flatten)
        self.ecc2sq = self.ecc1sq / (1 - self.ecc1sq)
        # self.ecc1 = sqrt(self.ecc1sq)
        # self.n = float(self.f / (2 - self.f))
        # self.n2 = self.n ** 2
        # self.meanradius = (2 * self.semimaj + self.semimin) / 3


# Geodetic Reference System 1980
# www.epsg-registry.org/export.htm?gml=urn:ogc:def:ellipsoid:EPSG::7019
GRS80 = Ellipsoid(6378137, 298.257222101)

# World Geodetic System 1984
# www.epsg-registry.org/export.htm?gml=urn:ogc:def:ellipsoid:EPSG::7030
WGS84 = Ellipsoid(6378137, 298.257223563)

# Australian National Spheroid
# www.epsg-registry.org/export.htm?gml=urn:ogc:def:ellipsoid:EPSG::7003
ANS = Ellipsoid(6378160, 298.25)

# International (Hayford) 1924
# www.epsg-registry.org/export.htm?gml=urn:ogc:def:ellipsoid:EPSG::7022
INTL24 = Ellipsoid(6378388, 297)

SISRE_COEF_DF = _DataFrame(data=[[0.99, 0.98, 0.98, 0.98, 0.98],
                                 [127, 54, 49, 45, 61]],
                           columns=['C_IGSO', 'C', 'G', 'R', 'E'],
                           index=['alpha', 'beta'])
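
# Usage sketch: looking up the derived ellipsoid parameters and the SISRE
# coefficients (column letters presumably follow the RINEX constellation
# codes, e.g. 'G' for GPS, 'R' for GLONASS, 'E' for Galileo, 'C' for BeiDou).
#
#   GRS80.ecc1sq                      # first eccentricity squared
#   SISRE_COEF_DF.loc['alpha', 'G']   # -> 0.98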