def setUp(self):
    super(TestNDFrame, self).setUp()
    data = {
        'A': [0., 1., 2., 3., np.nan],
        'B': [0, 1, 0, 1, 0],
        'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
        'D': date_range('1/1/2009', periods=5),
        'E': [0., 1, Timestamp('20100101'), 'foo', 2.],
    }
    self.frame = {
        'float': DataFrame(dict(A=data['A'], B=Series(data['A']) + 1)),
        'int': DataFrame(dict(A=data['B'], B=Series(data['B']) + 1)),
        'mixed': DataFrame(dict([(k, data[k]) for k in ['A', 'B', 'C', 'D']]))
    }
    self.panel = {
        'float': Panel(dict(ItemA=self.frame['float'],
                            ItemB=self.frame['float'] + 1))
    }
def setUp(self):
    super(TestNDFrame, self).setUp()
    data = {
        'A': [0., 1., 2., 3., np.nan],
        'B': [0, 1, 0, 1, 0],
        'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
        'D': date_range('1/1/2009', periods=5),
        'E': [0., 1, Timestamp('20100101'), 'foo', 2.],
        'F': [Timestamp('20130102', tz='US/Eastern')] * 5,
        'G': [Timestamp('20130603', tz='CET')] * 5,
        'H': Categorical(['a', 'b', 'c', 'd', 'e']),
        'I': Categorical(['a', 'b', 'c', 'd', 'e'], ordered=True),
    }
    self.frame = {
        'float': DataFrame(dict(A=data['A'], B=Series(data['A']) + 1)),
        'int': DataFrame(dict(A=data['B'], B=Series(data['B']) + 1)),
        'mixed': DataFrame(data)
    }
    self.panel = {
        'float': Panel(dict(ItemA=self.frame['float'],
                            ItemB=self.frame['float'] + 1))
    }
def loopsat(tlefn, dates, obs):
    cols = ['az', 'el', 'lat', 'lon', 'alt', 'srange']
    sats, satnum = loadTLE(tlefn)
    data = Panel(items=dates, major_axis=satnum, minor_axis=cols)
    for d in dates:
        obs.date = d
        df = DataFrame(index=satnum, columns=cols)
        for i, s in enumerate(sats):
            # don't compute lat/lon/alt with obs! will give wrong answer!
            s.compute()
            df.at[satnum[i], ['lat', 'lon', 'alt']] = (
                degrees(s.sublat), degrees(s.sublong), s.elevation)
            s.compute(obs)
            df.at[satnum[i], ['az', 'el', 'srange']] = (
                degrees(s.az), degrees(s.alt), s.range)
        df.ix[df['el'] < 0, ['az', 'el', 'srange']] = nan
        data[d] = df
    return data
def makeBlocks(rinex, ntypes, maxsv, svnames, obstypes, obstimes):
    """
    inputs:
    rinex: file stream
    ntypes: number of observation types
    obstimes: datetime() of each observation
    obstypes: type of measurement e.g. P1, P2,...
    maxsv: maximum number of SVs the receiver saw in this file
           (i.e. across the entire obs. time)

    outputs:
    blocks: dimensions timeINTERVALs x maxsv x ntypes (page x row x col)
    """
    blocks = Panel(items=obstimes, major_axis=svnames, minor_axis=obstypes)
    # obstimes.size means maxtimes was specified; otherwise we'd read to EOF
    for i in range(obstimes.size):
        sathead = rinex.readline()
        if not sathead:
            break  # EOF
        svnum = int(sathead[29:32])
        obslinespersat = int(np.ceil(ntypes / 5))
        blockrows = svnum * obslinespersat
        satnames = sathead[32:68]
        for _ in range(int(np.ceil(svnum / 12)) - 1):
            line = rinex.readline()
            sathead += line
            satnames += line[32:68]  # FIXME is this the right end?
        blocksvnames = satnumfixer(grouper(satnames, 3, svnum))
        #%% read this INTERVAL's text block
        block = ''.join(rinex.readline() for _ in range(blockrows))
        btime = _obstime(sathead[:26].split())
        bdf = _block2df(block, svnum, obstypes, blocksvnames)
        blocks.loc[btime, blocksvnames] = bdf
    return blocks
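# A hypothetical downstream-access sketch for the Panel that makeBlocks
# returns; the SV name 'G07' and the observable 'P1' are illustrative
# assumptions, as are the rinex/ntypes/... variables bound elsewhere.
blocks = makeBlocks(rinex, ntypes, maxsv, svnames, obstypes, obstimes)
p1_for_sv = blocks.loc[:, 'G07', 'P1']  # one SV's P1 observable over time
one_epoch = blocks[obstimes[0]]         # DataFrame: SVs x observation types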
def _dl_mult_symbols(symbols, start, end, interval, chunksize, retry_count,
                     pause, method):
    stocks = {}
    failed = []
    for sym_group in _in_chunks(symbols, chunksize):
        for sym in sym_group:
            try:
                stocks[sym] = method(sym, start, end, interval,
                                     retry_count, pause)
            except IOError:
                warnings.warn('Failed to read symbol: {0!r}, replacing with '
                              'NaN.'.format(sym), SymbolWarning)
                failed.append(sym)

    try:
        if len(stocks) > 0 and len(failed) > 0:
            df_na = list(stocks.values())[0].copy()
            df_na[:] = np.nan
            for sym in failed:
                stocks[sym] = df_na
        return Panel(stocks).swapaxes('items', 'minor')
    except AttributeError:
        # cannot construct a panel with just 1D nans indicating no data
        raise RemoteDataError("No data fetched using "
                              "{0!r}".format(method.__name__))
def downloadData(self, symbols='all'):
    ''' get data from yahoo '''
    if symbols == 'all':
        symbols = self.symbols

    # store = HDFStore(self.dataFile)
    p = ProgressBar(len(symbols))
    for idx, symbol in enumerate(symbols):
        try:
            df = getSymbolData(symbol, sDate=self.startDate, verbose=False)
            if self.autoAdjust:
                df = _adjust(df, removeOrig=True)
            if len(self.symbols) == 0:
                self.wp = Panel({symbol: df})
            else:
                self.wp[symbol] = df
        except Exception as e:
            print(e)
        p.animate(idx + 1)
def test_take(self):
    indices = [1, 5, -2, 6, 3, -1]
    for s in [tm.makeFloatSeries(), tm.makeStringSeries(),
              tm.makeObjectSeries()]:
        out = s.take(indices)
        expected = Series(data=s.values.take(indices),
                          index=s.index.take(indices), dtype=s.dtype)
        tm.assert_series_equal(out, expected)
    for df in [tm.makeTimeDataFrame()]:
        out = df.take(indices)
        expected = DataFrame(data=df.values.take(indices, axis=0),
                             index=df.index.take(indices),
                             columns=df.columns)
        tm.assert_frame_equal(out, expected)

    indices = [-3, 2, 0, 1]
    with catch_warnings(record=True):
        for p in [tm.makePanel()]:
            out = p.take(indices)
            expected = Panel(data=p.values.take(indices, axis=0),
                             items=p.items.take(indices),
                             major_axis=p.major_axis,
                             minor_axis=p.minor_axis)
            tm.assert_panel_equal(out, expected)
def get_code_churn(commits):
    shas = commits.index[::-1]
    prev = shas[0]
    insertions = {}
    deletions = {}
    for cur in shas[1:]:
        i, d = get_commit_churn(cur, prev)
        insertions[cur] = i
        deletions[cur] = d
        prev = cur
    return Panel({'insertions': DataFrame(insertions),
                  'deletions': DataFrame(deletions)},
                 minor_axis=shas)
def test_resample_panel(self):
    rng = date_range('1/1/2000', '6/30/2000')
    n = len(rng)

    panel = Panel(np.random.randn(3, n, 5),
                  items=['one', 'two', 'three'],
                  major_axis=rng,
                  minor_axis=['a', 'b', 'c', 'd', 'e'])

    result = panel.resample('M', axis=1)

    def p_apply(panel, f):
        result = {}
        for item in panel.items:
            result[item] = f(panel[item])
        return Panel(result, items=panel.items)

    expected = p_apply(panel, lambda x: x.resample('M'))
    tm.assert_panel_equal(result, expected)

    panel2 = panel.swapaxes(1, 2)
    result = panel2.resample('M', axis=2)
    expected = p_apply(panel2, lambda x: x.resample('M', axis=1))
    tm.assert_panel_equal(result, expected)
def getHistoricData(symbols, **options):
    '''
    get data from Yahoo finance and return pandas dataframe
    Will get an OHLCV data frame if a single symbol is provided.
    If many symbols are provided, it will return a wide panel

    Parameters
    ------------
    symbols : str or list
        Yahoo finance symbol or a list of symbols
    sDate : tuple (optional)
        start date (y,m,d)
    eDate : tuple (optional)
        end date (y,m,d)
    adjust : bool
        T/[F] adjust data based on adj_close

    Returns
    ---------
    Panel
    '''
    assert isinstance(symbols, (list, str)), \
        'Input must be a string symbol or a list of symbols'

    if isinstance(symbols, str):
        return getSymbolData(symbols, **options)
    else:
        data = {}
        print('Downloading data:')
        p = ProgressBar(len(symbols))
        for idx, symbol in enumerate(symbols):
            p.animate(idx + 1)
            data[symbol] = getSymbolData(symbol, verbose=False, **options)
        return Panel(data)
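# A minimal usage sketch, assuming the module above is importable and the
# Yahoo backend still answers; the symbols, dates, and the 'close' field
# name are illustrative assumptions.
wp = getHistoricData(['AAPL', 'MSFT'], sDate=(2016, 1, 1), eDate=(2016, 1, 15))
aapl = wp['AAPL']              # one item -> OHLCV DataFrame for that symbol
closes = wp.minor_xs('close')  # dates x symbols frame for one field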
def test_partial_setting(self):
    # GH2578, allow ix and friends to partially set

    # series
    s_orig = Series([1, 2, 3])

    s = s_orig.copy()
    s[5] = 5
    expected = Series([1, 2, 3, 5], index=[0, 1, 2, 5])
    tm.assert_series_equal(s, expected)

    s = s_orig.copy()
    s.loc[5] = 5
    expected = Series([1, 2, 3, 5], index=[0, 1, 2, 5])
    tm.assert_series_equal(s, expected)

    s = s_orig.copy()
    s[5] = 5.
    expected = Series([1, 2, 3, 5.], index=[0, 1, 2, 5])
    tm.assert_series_equal(s, expected)

    s = s_orig.copy()
    s.loc[5] = 5.
    expected = Series([1, 2, 3, 5.], index=[0, 1, 2, 5])
    tm.assert_series_equal(s, expected)

    # iloc/iat raise
    s = s_orig.copy()

    def f():
        s.iloc[3] = 5.

    pytest.raises(IndexError, f)

    def f():
        s.iat[3] = 5.

    pytest.raises(IndexError, f)

    # ## frame ##

    df_orig = DataFrame(np.arange(6).reshape(3, 2), columns=['A', 'B'],
                        dtype='int64')

    # iloc/iat raise
    df = df_orig.copy()

    def f():
        df.iloc[4, 2] = 5.

    pytest.raises(IndexError, f)

    def f():
        df.iat[4, 2] = 5.

    pytest.raises(IndexError, f)

    # row setting where it exists
    expected = DataFrame(dict({'A': [0, 4, 4], 'B': [1, 5, 5]}))
    df = df_orig.copy()
    df.iloc[1] = df.iloc[2]
    tm.assert_frame_equal(df, expected)

    expected = DataFrame(dict({'A': [0, 4, 4], 'B': [1, 5, 5]}))
    df = df_orig.copy()
    df.loc[1] = df.loc[2]
    tm.assert_frame_equal(df, expected)

    # like 2578, partial setting with dtype preservation
    expected = DataFrame(dict({'A': [0, 2, 4, 4], 'B': [1, 3, 5, 5]}))
    df = df_orig.copy()
    df.loc[3] = df.loc[2]
    tm.assert_frame_equal(df, expected)

    # single dtype frame, overwrite
    expected = DataFrame(dict({'A': [0, 2, 4], 'B': [0, 2, 4]}))
    df = df_orig.copy()
    with catch_warnings(record=True):
        df.ix[:, 'B'] = df.ix[:, 'A']
    tm.assert_frame_equal(df, expected)

    # mixed dtype frame, overwrite
    expected = DataFrame(dict({'A': [0, 2, 4], 'B': Series([0, 2, 4])}))
    df = df_orig.copy()
    df['B'] = df['B'].astype(np.float64)
    with catch_warnings(record=True):
        df.ix[:, 'B'] = df.ix[:, 'A']
    tm.assert_frame_equal(df, expected)

    # single dtype frame, partial setting
    expected = df_orig.copy()
    expected['C'] = df['A']
    df = df_orig.copy()
    with catch_warnings(record=True):
        df.ix[:, 'C'] = df.ix[:, 'A']
    tm.assert_frame_equal(df, expected)

    # mixed frame, partial setting
    expected = df_orig.copy()
    expected['C'] = df['A']
    df = df_orig.copy()
    with catch_warnings(record=True):
        df.ix[:, 'C'] = df.ix[:, 'A']
    tm.assert_frame_equal(df, expected)

    with catch_warnings(record=True):
        # ## panel ##
        p_orig = Panel(np.arange(16).reshape(2, 4, 2),
                       items=['Item1', 'Item2'],
                       major_axis=pd.date_range('2001/1/12', periods=4),
                       minor_axis=['A', 'B'], dtype='float64')

        # panel setting via item
        expected = p_orig.copy()
        expected['Item3'] = expected['Item1']
        p = p_orig.copy()
        p.loc['Item3'] = p['Item1']
        tm.assert_panel_equal(p, expected)

        # panel with aligned series
        expected = p_orig.copy()
        expected = expected.transpose(2, 1, 0)
        expected['C'] = DataFrame({'Item1': [30, 30, 30, 30],
                                   'Item2': [32, 32, 32, 32]},
                                  index=p_orig.major_axis)
        expected = expected.transpose(2, 1, 0)
        p = p_orig.copy()
        p.loc[:, :, 'C'] = Series([30, 32], index=p_orig.items)
        tm.assert_panel_equal(p, expected)

    # GH 8473
    dates = date_range('1/1/2000', periods=8)
    df_orig = DataFrame(np.random.randn(8, 4), index=dates,
                        columns=['A', 'B', 'C', 'D'])

    expected = pd.concat([df_orig,
                          DataFrame({'A': 7}, index=[dates[-1] + 1])])
    df = df_orig.copy()
    df.loc[dates[-1] + 1, 'A'] = 7
    tm.assert_frame_equal(df, expected)

    df = df_orig.copy()
    df.at[dates[-1] + 1, 'A'] = 7
    tm.assert_frame_equal(df, expected)

    exp_other = DataFrame({0: 7}, index=[dates[-1] + 1])
    expected = pd.concat([df_orig, exp_other], axis=1)

    df = df_orig.copy()
    df.loc[dates[-1] + 1, 0] = 7
    tm.assert_frame_equal(df, expected)

    df = df_orig.copy()
    df.at[dates[-1] + 1, 0] = 7
    tm.assert_frame_equal(df, expected)
def make_source(self):
    return Panel(self.raw_data).tz_localize('UTC', axis=1)
def create_data():
    """ create the pickle data """
    from distutils.version import LooseVersion
    import numpy as np
    import pandas
    from pandas import (Series, TimeSeries, DataFrame, Panel,
                        SparseSeries, SparseTimeSeries, SparseDataFrame,
                        SparsePanel,
                        Index, MultiIndex, PeriodIndex,
                        date_range, period_range, bdate_range, Timestamp,
                        Categorical)
    nan = np.nan

    data = {
        'A': [0., 1., 2., 3., np.nan],
        'B': [0, 1, 0, 1, 0],
        'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
        'D': date_range('1/1/2009', periods=5),
        'E': [0., 1, Timestamp('20100101'), 'foo', 2.],
    }

    index = dict(int=Index(np.arange(10)),
                 date=date_range('20130101', periods=10),
                 period=period_range('2013-01-01', freq='M', periods=10))

    mi = dict(reg2=MultiIndex.from_tuples(
        tuple(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
                    ['one', 'two', 'one', 'two', 'one', 'two', 'one',
                     'two']])),
        names=['first', 'second']))

    series = dict(float=Series(data['A']),
                  int=Series(data['B']),
                  mixed=Series(data['E']),
                  ts=TimeSeries(np.arange(10).astype(np.int64),
                                index=date_range('20130101', periods=10)),
                  mi=Series(np.arange(5).astype(np.float64),
                            index=MultiIndex.from_tuples(
                                tuple(zip(*[[1, 1, 2, 2, 2],
                                            [3, 4, 3, 4, 5]])),
                                names=['one', 'two'])),
                  dup=Series(np.arange(5).astype(np.float64),
                             index=['A', 'B', 'C', 'D', 'A']),
                  cat=Series(Categorical(['foo', 'bar', 'baz'])))

    frame = dict(float=DataFrame(dict(A=series['float'],
                                      B=series['float'] + 1)),
                 int=DataFrame(dict(A=series['int'], B=series['int'] + 1)),
                 mixed=DataFrame(dict([(k, data[k])
                                       for k in ['A', 'B', 'C', 'D']])),
                 mi=DataFrame(dict(A=np.arange(5).astype(np.float64),
                                   B=np.arange(5).astype(np.int64)),
                              index=MultiIndex.from_tuples(
                                  tuple(zip(*[['bar', 'bar', 'baz', 'baz',
                                               'baz'],
                                              ['one', 'two', 'one', 'two',
                                               'three']])),
                                  names=['first', 'second'])),
                 dup=DataFrame(np.arange(15).reshape(5, 3).astype(np.float64),
                               columns=['A', 'B', 'A']),
                 cat_onecol=DataFrame(dict(A=Categorical(['foo', 'bar']))),
                 cat_and_float=DataFrame(
                     dict(A=Categorical(['foo', 'bar', 'baz']),
                          B=np.arange(3).astype(np.int64))),
                 )

    panel = dict(float=Panel(dict(ItemA=frame['float'],
                                  ItemB=frame['float'] + 1)),
                 dup=Panel(np.arange(30).reshape(3, 5, 2).astype(np.float64),
                           items=['A', 'B', 'A']))

    if LooseVersion(pandas.__version__) >= '0.14.1':
        # Pre-0.14.1 versions generated non-unpicklable mixed-type frames
        # and panels if their columns/items were non-unique.
        mixed_dup_df = DataFrame(data)
        mixed_dup_df.columns = list("ABCDA")

        mixed_dup_panel = Panel(dict(ItemA=frame['float'],
                                     ItemB=frame['int']))
        mixed_dup_panel.items = ['ItemA', 'ItemA']

        frame['mixed_dup'] = mixed_dup_df
        panel['mixed_dup'] = mixed_dup_panel

    return dict(series=series,
                frame=frame,
                panel=panel,
                index=index,
                mi=mi,
                sp_series=dict(float=_create_sp_series(),
                               ts=_create_sp_tsseries()),
                sp_frame=dict(float=_create_sp_frame()))
def WQXtoPandas(xmlLocation,
                charDict,
                outputPath='.',
                fromFile=False,
                outputDirName='Processed-Sites',
                RUN_PHREEQC=False,
                PHREEQC_PATH='/home/mcoving/phreeqc-2.18.0/bin/',
                DATABASE_FILE='/home/mcoving/phreeqc-2.18.0/database/phreeqc.dat',
                LOG_FILE='Result.log',
                START_FILE=None,
                splittag='',
                bracket_charge_balance=False):
    """
    Processes a WQX xml data file and loads data for each site in the WQX
    file into Pandas data objects that are stored in directories for each
    site.

    Parameters
    ----------
    xmlLocation : string
        Content depends on the mode in which WQXtoPandas is run. When
        fromFile is set to False (input methods 2 or 3 in the excel file)
        this string contains the html for a query to the USGS NWIS database
        to obtain an xml file of the desired data. Alternatively, if
        fromFile is True (input method 1 in the excel file) then this
        string contains the name of the xml file from which to read the
        data.
    charDict : dict
        A dictionary containing information about the characteristics to be
        processed. Keys are EPA SRS characteristic names. Each entry in the
        dictionary is a second dictionary that contains keys IsRequired,
        pcode, fraction, and quality. These entries tell WQXtoPandas whether
        a given characteristic is required in order to process a sample, and
        whether a specific pcode, fraction, or quality should be required.
        See the excel example file for more details.
    outputPath : string
        path to the directory that will contain the output directory
    fromFile : boolean
        True if data will be read from an xml file already present on the
        computer. False if the xml file should be queried from NWIS.
        (Default=False)
    outputDirName : string
        Name of the output directory where all site data will be written
        out. (Default='Processed-Sites')
    RUN_PHREEQC : boolean
        Set to True if samples should be processed through PHREEQC.
        (Default=False)
    PHREEQC_PATH : string
        Path to the PHREEQC executable (folder only, not the executable
        file name)
    DATABASE_FILE : string
        Path to the database file that PHREEQC should use, including the
        database file name.
    LOG_FILE : string
        Name of the log file that WQXtoPandas will create.
        (Default='Result.log')
    START_FILE : string
        Name of the xls start file that was used to run this instance of
        WQXtoPandas. The name will be written out in the log file.
    bracket_charge_balance : bool
        If set to True, WQXtoPandas will alternately force charge balance
        on calcium and alkalinity. While the latter is not physically
        meaningful, this provides a useful estimate of uncertainty for
        cases with high charge balance errors. This is most useful for
        water that is very dilute or has high organic content, such that
        titrated alkalinity values are artificially high.

    Returns
    -------
    Returns 0 if execution is successful. Returns -1 in case of error.

    Notes
    -----
    Designed to be run through the convenience function runWQXtoPandas().
    """
    try:
        # Check to see if output directory exists
        absOutputDirPath = os.path.abspath(outputPath)
        sitesdir = os.path.join(absOutputDirPath, outputDirName)
        print "sitesdir", sitesdir
        if not os.path.exists(sitesdir):
            try:
                os.makedirs(sitesdir)
            except os.error:
                print("Problem creating output directory. "
                      "Check output path name: " + outputPath)
                return -1

        # create xml tree
        if fromFile:
            # read from file
            wqxtree = etree.ElementTree(file=xmlLocation)
        else:
            # check whether we already have a matching xml file
            xmlSaveFile = LOG_FILE + splittag + '.xml'
            if os.path.isfile(xmlSaveFile):
                goodAnswer = False
                while not goodAnswer:
                    answer = raw_input(
                        "An xml file (" + xmlSaveFile + ") already exists."
                        "\n Use this instead of html query (y or n)?")
                    if answer.startswith('y'):
                        # read from file
                        wqxtree = etree.ElementTree(file=xmlSaveFile)
                        goodAnswer = True
                        queryXML = False
                    elif answer.startswith('n'):
                        goodAnswer = True
                        queryXML = True
            else:
                queryXML = True
            # If we don't have a matching xml file, or we want to obtain a
            # new one, then get the new xml
            if queryXML:
                print "Obtaining xml file from USGS NWIS using html query..."
                # parse from html query
                r = requests.get(xmlLocation)
                # write to xml file
                try:
                    xmlFile = open(xmlSaveFile, 'w')
                    print >> xmlFile, r.text
                    xmlFile.close()
                    wqxtree = etree.ElementTree(file=xmlSaveFile)
                except IOError:
                    print("Problem writing to xml file to store "
                          "html query: " + xmlSaveFile)
                    return -1

        # begin parsing XML tree
        root = wqxtree.getroot()
        # get namespace map
        NSMAP = root.nsmap
        WQX = "{%s}" % NSMAP[None]

        # iterate over all <Activity> tags within file and process
        # each sample
        samples_processed = []
        samples_not_processed = []
        sitesDict = {}
        sitesMetaDict = {}
        for activity in wqxtree.getiterator(tag=WQX + "Activity"):
            processThisSample = True
            reason = ''
            description = activity.find(WQX + "ActivityDescription")
            if description is not None:
                datetext = description.findtext(WQX + "ActivityStartDate")
                starttime = description.find(WQX + "ActivityStartTime")
                if starttime is not None:
                    timetext = starttime.findtext(WQX + "Time")
                    timezone = starttime.findtext(WQX + "TimeZoneCode")
                else:
                    timetext = ''
                    timezone = ''
                location = description.findtext(
                    WQX + "MonitoringLocationIdentifier")
                descriptionDict = {'location': location,
                                   'date': datetext,
                                   'time': timetext,
                                   'timezone': timezone}
            else:
                descriptionDict = None
                processThisSample = False
                reason = 'No description'
            print('Processing sample from ' + location + ' on ' + datetext)

            # create null sample dict
            sampleDict = {}
            sampleMetaDict = {}
            # iterate though all results for this activity
            for result in activity.getiterator(tag=WQX + 'Result'):
                if processThisSample:
                    try:
                        resultdesc = result.find(WQX + "ResultDescription")
                        characteristic = resultdesc.findtext(
                            WQX + "CharacteristicName")
                        if characteristic in charDict:
                            samplefraction = resultdesc.findtext(
                                WQX + "ResultSampleFractionText")
                            pcode = resultdesc.findtext(WQX + "USGSPCode")
                            quality = resultdesc.findtext(
                                WQX + "ResultStatusIdentifier")
                            measure = resultdesc.find(WQX + "ResultMeasure")
                            count = 1.0
                            if not (measure is None):
                                value = measure.findtext(
                                    WQX + "ResultMeasureValue")
                                units = measure.findtext(
                                    WQX + "MeasureUnitCode")
                            # split pcode into list
                            tempPcodeList = charDict[characteristic][
                                'pcode'].split(';')
                            pcodeDict = {}
                            for codePriority, code in enumerate(
                                    tempPcodeList):
                                code = code.strip()
                                if code != '':
                                    pcodeDict[code] = codePriority
                            # Check whether characteristic meets criteria
                            # for inclusion; otherwise don't add to
                            # sampleDict
                            addCharacteristic = True
                            if charDict[characteristic]['fraction'] != '0':
                                # test for correct fraction
                                if (charDict[characteristic]['fraction'] !=
                                        samplefraction):
                                    addCharacteristic = False
                            if addCharacteristic:
                                if charDict[characteristic]['pcode'] != '0':
                                    # test for correct pcode
                                    if not (pcode in pcodeDict):
                                        addCharacteristic = False
                            if addCharacteristic:
                                if charDict[characteristic]['quality'] != '0':
                                    # test for correct data quality
                                    if (charDict[characteristic]['quality'] !=
                                            quality):
                                        addCharacteristic = False
                            # end of characteristic criteria check
                            # Process duplicate characteristics
                            if addCharacteristic:
                                if characteristic in sampleDict:
                                    priorPcode = sampleMetaDict[
                                        characteristic]['pcode']
                                    # if there are already multiple pcodes,
                                    # get only the first one
                                    priorPcode = priorPcode.split(';')[0]
                                    averageValue = False
                                    if len(pcodeDict) > 1:
                                        thisPcodePriority = pcodeDict[pcode]
                                        priorPcodePriority = \
                                            pcodeDict[priorPcode]
                                        if (thisPcodePriority >
                                                priorPcodePriority):
                                            # previous characteristic remains
                                            addCharacteristic = False
                                        elif (thisPcodePriority ==
                                              priorPcodePriority):
                                            averageValue = True
                                    else:
                                        averageValue = True
                                    if averageValue:
                                        # average this value with
                                        # existing values
                                        count = sampleMetaDict[
                                            characteristic]['count']
                                        count += 1.
                                        oldvalue = float(
                                            sampleDict[characteristic])
                                        newvalue = (oldvalue * (count - 1.) +
                                                    float(value)) / count
                                        value = str(newvalue)
                                        pcode = priorPcode + '; ' + pcode
                                        priorUnits = sampleMetaDict[
                                            characteristic]['units']
                                        units = priorUnits + '; ' + units
                            if addCharacteristic:
                                sampleDict[characteristic] = value
                                sampleMetaDict[characteristic] = {
                                    'samplefraction': samplefraction,
                                    'units': units,
                                    'pcode': pcode,
                                    'quality': quality,
                                    'count': count}
                    except etree.XMLSyntaxError as detail:
                        print "File contains invalid XML syntax: ", detail
                        processThisSample = False
                        reason = "Entry contains invalid XML syntax."
            # end results loop

            # check whether sample has all the required constituents
            if processThisSample:
                for characteristic in charDict.iterkeys():
                    if charDict[characteristic]['IsRequired'] != '0':
                        if not (characteristic in sampleDict):
                            processThisSample = False
                            reason += characteristic + ' not available. '

            if processThisSample:
                # check to see whether site directory exists;
                # if not, create it
                sampledir = os.path.join(sitesdir, location)
                if not os.path.exists(sampledir):
                    try:
                        os.makedirs(sampledir)
                    except os.error:
                        print("Problem creating location directory: " +
                              sampledir)
                        processThisSample = False
                        reason = ("Problem creating location directory: " +
                                  sampledir)

            if processThisSample:
                # Pull daily discharge data from USGS website;
                # currently hard-wired to pcode 00060 (daily discharge, cfs)
                dischargeDict = GetDailyDischarge(location, datetext)
                if dischargeDict is not None:
                    sampleDict['Stream flow, mean. daily'] = dischargeDict[
                        'discharge']
                    sampleMetaDict['Stream flow, mean. daily'] = {
                        'units': 'cfs',
                        'pcode': '00060',
                        'quality': dischargeDict['quality'],
                        'count': 1,
                        'samplefraction': None}
                    descriptionDict['name'] = dischargeDict['name']
                else:
                    # Possibly allow this sample to be thrown out if no mean
                    # daily discharge, and/or similar for instantaneous
                    # discharge
                    sampleDict['Stream flow, mean. daily'] = None
                    sampleMetaDict['Stream flow, mean. daily'] = {
                        'units': 'cfs',
                        'pcode': '00060',
                        'quality': None,
                        'count': 1,
                        'samplefraction': None}

                # Create data frame row for this sample date
                if descriptionDict['time'] != '':
                    rowdate = to_datetime(datetext + ' ' +
                                          descriptionDict['time'])
                else:
                    rowdate = to_datetime(datetext)
                #sampleRow = DataFrame(sampleDict, index=[rowdate], dtype='float')
                # Create Panel to contain sample meta data
                samplePanelRow = Panel({
                    'data': DataFrame(sampleDict, index=[rowdate],
                                      dtype='float'),
                    'time': DataFrame(descriptionDict['time'],
                                      index=[rowdate],
                                      columns=sampleMetaDict.keys()),
                    'timezone': DataFrame(descriptionDict['timezone'],
                                          index=[rowdate],
                                          columns=sampleMetaDict.keys()),
                    'pcode': DataFrame(
                        [extractValues(sampleMetaDict, ['pcode'])['values']],
                        index=[rowdate], columns=sampleMetaDict.keys()),
                    'quality': DataFrame(
                        [extractValues(sampleMetaDict,
                                       ['quality'])['values']],
                        index=[rowdate], columns=sampleMetaDict.keys()),
                    'fraction': DataFrame(
                        [extractValues(sampleMetaDict,
                                       ['samplefraction'])['values']],
                        index=[rowdate], columns=sampleMetaDict.keys()),
                    'units': DataFrame(
                        [extractValues(sampleMetaDict, ['units'])['values']],
                        index=[rowdate], columns=sampleMetaDict.keys()),
                    'count': DataFrame(
                        [extractValues(sampleMetaDict, ['count'])['values']],
                        index=[rowdate], columns=sampleMetaDict.keys()),
                })
                #sampleMetaRow = Series(sampleMetaDict, index=[to_datetime(datetext)], dtype='object')

                # Previous solution was reading/writing from pickle files.
                # New solution will keep all data in memory until the end.
                # This could cause memory problems with large data sets.
                # Test whether a df for this location already exists
                if location in sitesDict:
                    # tempDF = sitesDict[location]
                    # sitesDict[location] = tempDF.append(sampleRow)
                    tempPanel = sitesDict[location]
                    sitesDict[location] = concat([tempPanel, samplePanelRow],
                                                 axis=1)
                else:
                    sitesDict[location] = samplePanelRow

            # add one to number of samples processed
            if processThisSample:
                samples_processed.append(location + ' ' + datetext)
            else:
                samples_not_processed.append(location + ' ' + datetext +
                                             ' - ' + reason)

        print('Number of Samples Processed = ' + str(len(samples_processed)))
        print('Number of Samples Not Processed = ' +
              str(len(samples_not_processed)))

        # Write out individual site data pickle and csv files in each
        # site directory
        print('Writing out site data files...')
        for location, pnl in sitesDict.iteritems():
            print(location)
            pickleFile = os.path.join(sitesdir, location,
                                      location + '-Panel.pkl')
            pickle.dump(pnl, open(pickleFile, 'wb'))
            pnl.to_excel(pickleFile[:-3] + 'xls')
            # Retrieve and store site description metadata
            siteDescriptionDataDF = GetSiteData(location)
            siteDescriptionDataFileName = os.path.join(
                sitesdir, location, location + '-Site-Description.pkl')
            pickle.dump(siteDescriptionDataDF,
                        open(siteDescriptionDataFileName, 'wb'))
            siteDescriptionDataDF.to_csv(siteDescriptionDataFileName[:-3] +
                                         'csv')

        # Process sites through PHREEQC
        if RUN_PHREEQC:
            print("Processing site water chemistry data in PHREEQC...")
            for location, pnl in sitesDict.iteritems():
                phreeqc_df = processPanel(pnl,
                                          os.path.join(sitesdir, location),
                                          PHREEQC_PATH, DATABASE_FILE)
                phreeqc_site_file = os.path.join(sitesdir, location,
                                                 location + '-PHREEQC.pkl')
                try:
                    pickle.dump(phreeqc_df, open(phreeqc_site_file, 'wb'))
                    phreeqc_df.to_csv(phreeqc_site_file[:-3] + 'csv')
                except IOError:
                    print('Problem writing out PHREEQC data file.')
            if bracket_charge_balance:
                for location, pnl in sitesDict.iteritems():
                    # Force balance on Calcium
                    phreeqc_df_ca = processPanel(
                        pnl, os.path.join(sitesdir, location),
                        PHREEQC_PATH, DATABASE_FILE, force_balance='Ca')
                    phreeqc_site_file_ca = os.path.join(
                        sitesdir, location, location + '-PHREEQC-Ca.pkl')
                    try:
                        pickle.dump(phreeqc_df_ca,
                                    open(phreeqc_site_file_ca, 'wb'))
                        phreeqc_df_ca.to_csv(phreeqc_site_file_ca[:-3] +
                                             'csv')
                    except IOError:
                        print('Problem writing out PHREEQC Ca data file.')
                    # Force balance on Alkalinity
                    phreeqc_df_alk = processPanel(
                        pnl, os.path.join(sitesdir, location),
                        PHREEQC_PATH, DATABASE_FILE, force_balance='Alk')
                    phreeqc_site_file_alk = os.path.join(
                        sitesdir, location, location + '-PHREEQC-Alk.pkl')
                    try:
                        pickle.dump(phreeqc_df_alk,
                                    open(phreeqc_site_file_alk, 'wb'))
                        phreeqc_df_alk.to_csv(phreeqc_site_file_alk[:-3] +
                                              'csv')
                    except IOError:
                        print('Problem writing out PHREEQC Alk data file.')

        # Create log file
        print('Writing log file: ' + LOG_FILE + splittag)
        try:
            log_file = open(LOG_FILE + splittag, 'w')
            print >> log_file, 'Start file = ' + START_FILE
            print >> log_file, ('Number of Samples Processed = ' +
                                str(len(samples_processed)))
            print >> log_file, ('Number of Samples Not Processed = ' +
                                str(len(samples_not_processed)))
            print >> log_file, "###############"
            print >> log_file, "Characteristics"
            print >> log_file, "###############"
            printColumnNames = True
            for key, flags in charDict.iteritems():
                if printColumnNames:
                    names = ['characteristic']  # + '\t'
                    for column in flags.iterkeys():
                        names.append(str(column))
                    print >> log_file, str("\t".join(names))
                    printColumnNames = False
                columns = [key]
                for column in flags.iterkeys():
                    if isinstance(flags[column], basestring):
                        columns.append(flags[column])
                print >> log_file, str("\t".join(columns))
            print >> log_file, "###############"
            print >> log_file, "Samples processed"
            print >> log_file, "###############"
            for line in samples_processed:
                print >> log_file, line
            print >> log_file, "###############"
            print >> log_file, "Samples not processed"
            print >> log_file, "###############"
            for line in samples_not_processed:
                print >> log_file, line
        except IOError:
            print("Problem opening log file: " + LOG_FILE)
            return -1

    # exceptions for parsing of xml file
    except IOError:
        print("Error opening xml file. Does it exist?")
    # Note: can throw this error when discharge values are not read
    # correctly; I should fix this, 6/16/2014
    except etree.XMLSyntaxError as detail:
        print "File contains invalid XML syntax: ", detail
    except requests.exceptions.RequestException as detail:
        print "Error retrieving data by xml query: ", detail
    return 0
def test_iloc_getitem_panel(self):
    with catch_warnings(record=True):
        # GH 7189
        p = Panel(np.arange(4 * 3 * 2).reshape(4, 3, 2),
                  items=['A', 'B', 'C', 'D'],
                  major_axis=['a', 'b', 'c'],
                  minor_axis=['one', 'two'])

        result = p.iloc[1]
        expected = p.loc['B']
        tm.assert_frame_equal(result, expected)

        result = p.iloc[1, 1]
        expected = p.loc['B', 'b']
        tm.assert_series_equal(result, expected)

        result = p.iloc[1, 1, 1]
        expected = p.loc['B', 'b', 'two']
        assert result == expected

        # slice
        result = p.iloc[1:3]
        expected = p.loc[['B', 'C']]
        tm.assert_panel_equal(result, expected)

        result = p.iloc[:, 0:2]
        expected = p.loc[:, ['a', 'b']]
        tm.assert_panel_equal(result, expected)

        # list of integers
        result = p.iloc[[0, 2]]
        expected = p.loc[['A', 'C']]
        tm.assert_panel_equal(result, expected)

        # neg indices
        result = p.iloc[[-1, 1], [-1, 1]]
        expected = p.loc[['D', 'B'], ['c', 'b']]
        tm.assert_panel_equal(result, expected)

        # dups indices
        result = p.iloc[[-1, -1, 1], [-1, 1]]
        expected = p.loc[['D', 'D', 'B'], ['c', 'b']]
        tm.assert_panel_equal(result, expected)

        # combined
        result = p.iloc[0, [True, True], [0, 1]]
        expected = p.loc['A', ['a', 'b'], ['one', 'two']]
        tm.assert_frame_equal(result, expected)

        # out-of-bounds exception
        with pytest.raises(IndexError):
            p.iloc[tuple([10, 5])]

        with pytest.raises(IndexError):
            p.iloc[0, [True, True], [0, 1, 2]]

        # trying to use a label
        with pytest.raises(ValueError):
            p.iloc[tuple(['j', 'D'])]

        # GH
        p = Panel(np.random.rand(4, 3, 2),
                  items=['A', 'B', 'C', 'D'],
                  major_axis=['U', 'V', 'W'],
                  minor_axis=['X', 'Y'])
        expected = p['A']

        result = p.iloc[0, :, :]
        tm.assert_frame_equal(result, expected)

        result = p.iloc[0, [True, True, True], :]
        tm.assert_frame_equal(result, expected)

        result = p.iloc[0, [True, True, True], [0, 1]]
        tm.assert_frame_equal(result, expected)

        with pytest.raises(IndexError):
            p.iloc[0, [True, True, True], [0, 1, 2]]

        with pytest.raises(IndexError):
            p.iloc[0, [True, True, True], [2]]
def create_data():
    """ create the pickle/msgpack data """
    data = {
        'A': [0., 1., 2., 3., np.nan],
        'B': [0, 1, 0, 1, 0],
        'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
        'D': date_range('1/1/2009', periods=5),
        'E': [0., 1, Timestamp('20100101'), 'foo', 2.]
    }

    scalars = dict(timestamp=Timestamp('20130101'))
    if LooseVersion(pandas.__version__) >= '0.17.0':
        scalars['period'] = Period('2012', 'M')

    index = dict(int=Index(np.arange(10)),
                 date=date_range('20130101', periods=10),
                 period=period_range('2013-01-01', freq='M', periods=10))

    mi = dict(reg2=MultiIndex.from_tuples(
        tuple(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
                    ['one', 'two', 'one', 'two', 'one', 'two', 'one',
                     'two']])),
        names=['first', 'second']))

    series = dict(float=Series(data['A']),
                  int=Series(data['B']),
                  mixed=Series(data['E']),
                  ts=TimeSeries(np.arange(10).astype(np.int64),
                                index=date_range('20130101', periods=10)),
                  mi=Series(np.arange(5).astype(np.float64),
                            index=MultiIndex.from_tuples(
                                tuple(zip(*[[1, 1, 2, 2, 2],
                                            [3, 4, 3, 4, 5]])),
                                names=['one', 'two'])),
                  dup=Series(np.arange(5).astype(np.float64),
                             index=['A', 'B', 'C', 'D', 'A']),
                  cat=Series(Categorical(['foo', 'bar', 'baz'])))
    if LooseVersion(pandas.__version__) >= '0.17.0':
        series['period'] = Series([Period('2000Q1')] * 5)

    mixed_dup_df = DataFrame(data)
    mixed_dup_df.columns = list("ABCDA")

    frame = dict(
        float=DataFrame(dict(A=series['float'], B=series['float'] + 1)),
        int=DataFrame(dict(A=series['int'], B=series['int'] + 1)),
        mixed=DataFrame(dict([(k, data[k]) for k in ['A', 'B', 'C', 'D']])),
        mi=DataFrame(dict(A=np.arange(5).astype(np.float64),
                          B=np.arange(5).astype(np.int64)),
                     index=MultiIndex.from_tuples(
                         tuple(zip(*[['bar', 'bar', 'baz', 'baz', 'baz'],
                                     ['one', 'two', 'one', 'two',
                                      'three']])),
                         names=['first', 'second'])),
        dup=DataFrame(np.arange(15).reshape(5, 3).astype(np.float64),
                      columns=['A', 'B', 'A']),
        cat_onecol=DataFrame(dict(A=Categorical(['foo', 'bar']))),
        cat_and_float=DataFrame(dict(A=Categorical(['foo', 'bar', 'baz']),
                                     B=np.arange(3).astype(np.int64))),
        mixed_dup=mixed_dup_df)

    mixed_dup_panel = Panel(dict(ItemA=frame['float'], ItemB=frame['int']))
    mixed_dup_panel.items = ['ItemA', 'ItemA']
    panel = dict(float=Panel(dict(ItemA=frame['float'],
                                  ItemB=frame['float'] + 1)),
                 dup=Panel(np.arange(30).reshape(3, 5, 2).astype(np.float64),
                           items=['A', 'B', 'A']),
                 mixed_dup=mixed_dup_panel)

    return dict(series=series,
                frame=frame,
                panel=panel,
                index=index,
                scalars=scalars,
                mi=mi,
                sp_series=dict(float=_create_sp_series(),
                               ts=_create_sp_tsseries()),
                sp_frame=dict(float=_create_sp_frame()))
# -------------------- S1. Prepare Dataset --------------------
# Form a panel dataset comprising all historical data
# from the Shanghai and Shenzhen exchanges
data_panel = None
for root, dirs, files in os.walk(folder_stock_data):
    if debug:
        # Limit the number of files to load
        files = files[:4]
    data_dict = {}
    for file in files:
        if file.endswith('.xlsx') and file != "index.xlsx":
            logger.debug("Now loading " + root + '/' + file)
            stock_id, dump = file.split('.')
            data_temp = pd.read_excel(root + '/' + file)
            # Declare timeseries
            data_temp = data_temp.set_index('Date').tz_localize(
                'Asia/Shanghai')
            data_dict[stock_id] = data_temp
    data_panel = Panel(data_dict)

# Daily series
##
# Calculate return and cumulative return
# Non-trading stocks - the days on which a stock is not traded are
# excluded from the sample.
for item in data_panel.items:
    df_temp = data_panel.ix[item]
    ##
def _read_wide(self, group, where=None):
    return Panel(self._read_block_manager(group))
def _read_panel_table(self, group, where=None):
    table = getattr(group, 'table')
    fields = table._v_attrs.fields

    # create the selection
    sel = Selection(table, where, table._v_attrs.index_kind)
    sel.select()

    columns = _maybe_convert(sel.values['column'],
                             table._v_attrs.columns_kind)
    index = _maybe_convert(sel.values['index'],
                           table._v_attrs.index_kind)
    values = sel.values['values']

    major = Factor.from_array(index)
    minor = Factor.from_array(columns)

    J, K = len(major.levels), len(minor.levels)
    key = major.labels * K + minor.labels

    if len(unique(key)) == len(key):
        sorter, _ = lib.groupsort_indexer(com._ensure_int64(key), J * K)
        sorter = com._ensure_platform_int(sorter)

        # the data need to be sorted
        sorted_values = values.take(sorter, axis=0)
        major_labels = major.labels.take(sorter)
        minor_labels = minor.labels.take(sorter)

        block = block2d_to_block3d(sorted_values, fields, (J, K),
                                   major_labels, minor_labels)

        mgr = BlockManager([block],
                           [block.ref_items, major.levels, minor.levels])
        wp = Panel(mgr)
    else:
        if not self._quiet:  # pragma: no cover
            print('Duplicate entries in table, taking most recently '
                  'appended')

        # reconstruct
        long_index = MultiIndex.from_arrays([index, columns])
        lp = DataFrame(values, index=long_index, columns=fields)

        # need a better algorithm
        tuple_index = long_index._tuple_index
        unique_tuples = lib.fast_unique(tuple_index)
        unique_tuples = _asarray_tuplesafe(unique_tuples)

        indexer = match(unique_tuples, tuple_index)
        indexer = com._ensure_platform_int(indexer)

        new_index = long_index.take(indexer)
        new_values = lp.values.take(indexer, axis=0)

        lp = DataFrame(new_values, index=new_index, columns=lp.columns)
        wp = lp.to_panel()

    if sel.column_filter:
        new_minor = sorted(set(wp.minor_axis) & sel.column_filter)
        wp = wp.reindex(minor=new_minor)

    return wp
def __init__(self, x, var_name='x', convert_dummies=True, drop_first=True):
    self._var_name = var_name
    self._convert_dummies = convert_dummies
    self._drop_first = drop_first
    if isinstance(x, PanelData):
        x = x.dataframe
    self._original = x

    if isinstance(x, DataArray):
        if x.ndim not in (2, 3):
            raise ValueError('Only 2-d or 3-d DataArrays are supported')
        x = x.to_pandas()

    if isinstance(x, Series) and isinstance(x.index, pd.MultiIndex):
        x = DataFrame(x)
    elif isinstance(x, Series):
        raise ValueError('Series can only be used with a 2-level MultiIndex')

    if isinstance(x, (Panel, DataFrame)):
        if isinstance(x, DataFrame):
            if isinstance(x.index, pd.MultiIndex):
                if len(x.index.levels) != 2:
                    raise ValueError('DataFrame input must have a '
                                     'MultiIndex with 2 levels')
                self._frame = x.copy()
            else:
                self._frame = DataFrame({var_name: x.T.stack(dropna=False)})
        else:
            self._frame = x.swapaxes(1, 2).to_frame(
                filter_observations=False)
    elif isinstance(x, ndarray):
        if not 2 <= x.ndim <= 3:
            raise ValueError('2 or 3-d array required for numpy input')
        if x.ndim == 2:
            x = x[None, :, :]

        k, t, n = x.shape
        variables = [var_name] if k == 1 else [
            var_name + '.{0}'.format(i) for i in range(k)]
        entities = ['entity.{0}'.format(i) for i in range(n)]
        time = list(range(t))
        x = x.astype(np.float64)

        panel = Panel(x, items=variables, major_axis=time,
                      minor_axis=entities)
        self._frame = panel.swapaxes(1, 2).to_frame(
            filter_observations=False)
    else:
        raise TypeError('Only ndarrays, DataFrames, Panels or DataArrays '
                        'supported.')

    if convert_dummies:
        self._frame = expand_categoricals(self._frame, drop_first)
        self._frame = self._frame.astype(np.float64)

    time_index = Series(self._frame.index.levels[1])
    if not (is_numeric_dtype(time_index.dtype) or
            is_datetime64_any_dtype(time_index.dtype)):
        raise ValueError('The index on the time dimension must be either '
                         'numeric or date-like')

    self._k, self._t, self._n = self.panel.shape
    self._frame.index.levels[0].name = 'entity'
    self._frame.index.levels[1].name = 'time'
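# A minimal sketch of the ndarray branch above, assuming this __init__
# belongs to a PanelData-like wrapper that exposes a `dataframe` property
# (the code itself reads x.dataframe on such instances); shapes and names
# are illustrative.
import numpy as np

raw = np.random.randn(2, 4, 3)        # (k, t, n): 2 variables, 4 periods, 3 entities
pdata = PanelData(raw, var_name='x')  # items become 'x.0' and 'x.1'
print(pdata.dataframe.head())         # rows: MultiIndex of (entity, time)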
def test_sample(self):
    # Fixes issue: 2419
    # additional specific object based tests

    # A few dataframe test with degenerate weights.
    easy_weight_list = [0] * 10
    easy_weight_list[5] = 1

    df = pd.DataFrame({'col1': range(10, 20),
                       'col2': range(20, 30),
                       'colString': ['a'] * 10,
                       'easyweights': easy_weight_list})
    sample1 = df.sample(n=1, weights='easyweights')
    assert_frame_equal(sample1, df.iloc[5:6])

    # Ensure proper error if string given as weight for Series, panel, or
    # DataFrame with axis = 1.
    s = Series(range(10))
    with pytest.raises(ValueError):
        s.sample(n=3, weights='weight_column')

    with catch_warnings(record=True):
        panel = Panel(items=[0, 1, 2], major_axis=[2, 3, 4],
                      minor_axis=[3, 4, 5])
        with pytest.raises(ValueError):
            panel.sample(n=1, weights='weight_column')

    with pytest.raises(ValueError):
        df.sample(n=1, weights='weight_column', axis=1)

    # Check weighting key error
    with pytest.raises(KeyError):
        df.sample(n=3, weights='not_a_real_column_name')

    # Check that re-normalizes weights that don't sum to one.
    weights_less_than_1 = [0] * 10
    weights_less_than_1[0] = 0.5
    tm.assert_frame_equal(df.sample(n=1, weights=weights_less_than_1),
                          df.iloc[:1])

    # Test axis argument
    df = pd.DataFrame({'col1': range(10), 'col2': ['a'] * 10})
    second_column_weight = [0, 1]
    assert_frame_equal(
        df.sample(n=1, axis=1, weights=second_column_weight), df[['col2']])

    # Different axis arg types
    assert_frame_equal(
        df.sample(n=1, axis='columns', weights=second_column_weight),
        df[['col2']])

    weight = [0] * 10
    weight[5] = 0.5
    assert_frame_equal(df.sample(n=1, axis='rows', weights=weight),
                       df.iloc[5:6])
    assert_frame_equal(df.sample(n=1, axis='index', weights=weight),
                       df.iloc[5:6])

    # Check out of range axis values
    with pytest.raises(ValueError):
        df.sample(n=1, axis=2)

    with pytest.raises(ValueError):
        df.sample(n=1, axis='not_a_name')

    with pytest.raises(ValueError):
        s = pd.Series(range(10))
        s.sample(n=1, axis=1)

    # Test weight length compared to correct axis
    with pytest.raises(ValueError):
        df.sample(n=1, axis=1, weights=[0.5] * 10)

    # Check weights with axis = 1
    easy_weight_list = [0] * 3
    easy_weight_list[2] = 1

    df = pd.DataFrame({'col1': range(10, 20),
                       'col2': range(20, 30),
                       'colString': ['a'] * 10})
    sample1 = df.sample(n=1, axis=1, weights=easy_weight_list)
    assert_frame_equal(sample1, df[['colString']])

    # Test default axes
    with catch_warnings(record=True):
        p = Panel(items=['a', 'b', 'c'], major_axis=[2, 4, 6],
                  minor_axis=[1, 3, 5])
        assert_panel_equal(p.sample(n=3, random_state=42),
                           p.sample(n=3, axis=1, random_state=42))

    assert_frame_equal(df.sample(n=3, random_state=42),
                       df.sample(n=3, axis=0, random_state=42))

    # Test that function aligns weights with frame
    df = DataFrame({'col1': [5, 6, 7],
                    'col2': ['a', 'b', 'c'], }, index=[9, 5, 3])
    s = Series([1, 0, 0], index=[3, 5, 9])
    assert_frame_equal(df.loc[[3]], df.sample(1, weights=s))

    # Weights have index values to be dropped because not in
    # sampled DataFrame
    s2 = Series([0.001, 0, 10000], index=[3, 5, 10])
    assert_frame_equal(df.loc[[3]], df.sample(1, weights=s2))

    # Weights have empty values to be filled with zeros
    s3 = Series([0.01, 0], index=[3, 5])
    assert_frame_equal(df.loc[[3]], df.sample(1, weights=s3))

    # No overlap in weight and sampled DataFrame indices
    s4 = Series([1, 0], index=[1, 2])
    with pytest.raises(ValueError):
        df.sample(1, weights=s4)
def create_data():
    """ create the pickle/msgpack data """
    data = {
        u'A': [0., 1., 2., 3., np.nan],
        u'B': [0, 1, 0, 1, 0],
        u'C': [u'foo1', u'foo2', u'foo3', u'foo4', u'foo5'],
        u'D': date_range('1/1/2009', periods=5),
        u'E': [0., 1, Timestamp('20100101'), u'foo', 2.]
    }

    scalars = dict(timestamp=Timestamp('20130101'),
                   period=Period('2012', 'M'))

    index = dict(int=Index(np.arange(10)),
                 date=date_range('20130101', periods=10),
                 period=period_range('2013-01-01', freq='M', periods=10))

    mi = dict(reg2=MultiIndex.from_tuples(
        tuple(zip(*[[u'bar', u'bar', u'baz', u'baz', u'foo', u'foo',
                     u'qux', u'qux'],
                    [u'one', u'two', u'one', u'two', u'one', u'two',
                     u'one', u'two']])),
        names=[u'first', u'second']))

    series = dict(
        float=Series(data[u'A']),
        int=Series(data[u'B']),
        mixed=Series(data[u'E']),
        ts=Series(np.arange(10).astype(np.int64),
                  index=date_range('20130101', periods=10)),
        mi=Series(np.arange(5).astype(np.float64),
                  index=MultiIndex.from_tuples(
                      tuple(zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])),
                      names=[u'one', u'two'])),
        dup=Series(np.arange(5).astype(np.float64),
                   index=[u'A', u'B', u'C', u'D', u'A']),
        cat=Series(Categorical([u'foo', u'bar', u'baz'])),
        dt=Series(date_range('20130101', periods=5)),
        dt_tz=Series(date_range('20130101', periods=5, tz='US/Eastern')),
        period=Series([Period('2000Q1')] * 5))

    mixed_dup_df = DataFrame(data)
    mixed_dup_df.columns = list(u"ABCDA")

    frame = dict(
        float=DataFrame({u'A': series[u'float'],
                         u'B': series[u'float'] + 1}),
        int=DataFrame({u'A': series[u'int'], u'B': series[u'int'] + 1}),
        mixed=DataFrame({k: data[k] for k in [u'A', u'B', u'C', u'D']}),
        mi=DataFrame({u'A': np.arange(5).astype(np.float64),
                      u'B': np.arange(5).astype(np.int64)},
                     index=MultiIndex.from_tuples(
                         tuple(zip(*[[u'bar', u'bar', u'baz', u'baz',
                                      u'baz'],
                                     [u'one', u'two', u'one', u'two',
                                      u'three']])),
                         names=[u'first', u'second'])),
        dup=DataFrame(np.arange(15).reshape(5, 3).astype(np.float64),
                      columns=[u'A', u'B', u'A']),
        cat_onecol=DataFrame({u'A': Categorical([u'foo', u'bar'])}),
        cat_and_float=DataFrame({
            u'A': Categorical([u'foo', u'bar', u'baz']),
            u'B': np.arange(3).astype(np.int64)}),
        mixed_dup=mixed_dup_df,
        dt_mixed_tzs=DataFrame({
            u'A': Timestamp('20130102', tz='US/Eastern'),
            u'B': Timestamp('20130603', tz='CET')}, index=range(5)),
        dt_mixed2_tzs=DataFrame({
            u'A': Timestamp('20130102', tz='US/Eastern'),
            u'B': Timestamp('20130603', tz='CET'),
            u'C': Timestamp('20130603', tz='UTC')}, index=range(5)))

    with catch_warnings(record=True):
        mixed_dup_panel = Panel({u'ItemA': frame[u'float'],
                                 u'ItemB': frame[u'int']})
        mixed_dup_panel.items = [u'ItemA', u'ItemA']
        panel = dict(
            float=Panel({u'ItemA': frame[u'float'],
                         u'ItemB': frame[u'float'] + 1}),
            dup=Panel(np.arange(30).reshape(3, 5, 2).astype(np.float64),
                      items=[u'A', u'B', u'A']),
            mixed_dup=mixed_dup_panel)

    cat = dict(int8=Categorical(list('abcdefg')),
               int16=Categorical(np.arange(1000)),
               int32=Categorical(np.arange(10000)))

    timestamp = dict(normal=Timestamp('2011-01-01'),
                     nat=NaT,
                     tz=Timestamp('2011-01-01', tz='US/Eastern'))

    if _loose_version < '0.19.2':
        timestamp['freq'] = Timestamp('2011-01-01', offset='D')
        timestamp['both'] = Timestamp('2011-01-01', tz='Asia/Tokyo',
                                      offset='M')
    else:
        timestamp['freq'] = Timestamp('2011-01-01', freq='D')
        timestamp['both'] = Timestamp('2011-01-01', tz='Asia/Tokyo',
                                      freq='M')

    off = {'DateOffset': DateOffset(years=1),
           'DateOffset_h_ns': DateOffset(hour=6, nanoseconds=5824),
           'BusinessDay': BusinessDay(offset=timedelta(seconds=9)),
           'BusinessHour': BusinessHour(normalize=True, n=6, end='15:14'),
           'CustomBusinessDay': CustomBusinessDay(weekmask='Mon Fri'),
           'SemiMonthBegin': SemiMonthBegin(day_of_month=9),
           'SemiMonthEnd': SemiMonthEnd(day_of_month=24),
           'MonthBegin': MonthBegin(1),
           'MonthEnd': MonthEnd(1),
           'QuarterBegin': QuarterBegin(1),
           'QuarterEnd': QuarterEnd(1),
           'Day': Day(1),
           'YearBegin': YearBegin(1),
           'YearEnd': YearEnd(1),
           'Week': Week(1),
           'Week_Tues': Week(2, normalize=False, weekday=1),
           'WeekOfMonth': WeekOfMonth(week=3, weekday=4),
           'LastWeekOfMonth': LastWeekOfMonth(n=1, weekday=3),
           'FY5253': FY5253(n=2, weekday=6, startingMonth=7,
                            variation="last"),
           'Easter': Easter(),
           'Hour': Hour(1),
           'Minute': Minute(1)}

    return dict(series=series,
                frame=frame,
                panel=panel,
                index=index,
                scalars=scalars,
                mi=mi,
                sp_series=dict(float=_create_sp_series(),
                               ts=_create_sp_tsseries()),
                sp_frame=dict(float=_create_sp_frame()),
                cat=cat,
                timestamp=timestamp,
                offsets=off)
def p_apply(panel, f):
    result = {}
    for item in panel.items:
        result[item] = f(panel[item])
    return Panel(result, items=panel.items)
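# A hypothetical usage sketch for p_apply: map a per-DataFrame operation
# over every item of a Panel. The shapes and the resample rule here are
# illustrative.
import numpy as np
from pandas import Panel, date_range

rng = date_range('2000-01-01', periods=90)
panel = Panel(np.random.randn(2, 90, 3), items=['one', 'two'],
              major_axis=rng, minor_axis=['a', 'b', 'c'])

# Downsample every item's frame to month-end means, item by item.
monthly = p_apply(panel, lambda df: df.resample('M').mean())
print(monthly['one'].shape)  # three month-ends x three columns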
def create_data():
    """ create the pickle/msgpack data """
    data = {
        u'A': [0., 1., 2., 3., np.nan],
        u'B': [0, 1, 0, 1, 0],
        u'C': [u'foo1', u'foo2', u'foo3', u'foo4', u'foo5'],
        u'D': date_range('1/1/2009', periods=5),
        u'E': [0., 1, Timestamp('20100101'), u'foo', 2.]
    }

    scalars = dict(timestamp=Timestamp('20130101'),
                   period=Period('2012', 'M'))

    index = dict(int=Index(np.arange(10)),
                 date=date_range('20130101', periods=10),
                 period=period_range('2013-01-01', freq='M', periods=10))

    mi = dict(reg2=MultiIndex.from_tuples(
        tuple(zip(*[[u'bar', u'bar', u'baz', u'baz', u'foo', u'foo',
                     u'qux', u'qux'],
                    [u'one', u'two', u'one', u'two', u'one', u'two',
                     u'one', u'two']])),
        names=[u'first', u'second']))

    series = dict(
        float=Series(data[u'A']),
        int=Series(data[u'B']),
        mixed=Series(data[u'E']),
        ts=Series(np.arange(10).astype(np.int64),
                  index=date_range('20130101', periods=10)),
        mi=Series(np.arange(5).astype(np.float64),
                  index=MultiIndex.from_tuples(
                      tuple(zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])),
                      names=[u'one', u'two'])),
        dup=Series(np.arange(5).astype(np.float64),
                   index=[u'A', u'B', u'C', u'D', u'A']),
        cat=Series(Categorical([u'foo', u'bar', u'baz'])),
        dt=Series(date_range('20130101', periods=5)),
        dt_tz=Series(date_range('20130101', periods=5, tz='US/Eastern')),
        period=Series([Period('2000Q1')] * 5))

    mixed_dup_df = DataFrame(data)
    mixed_dup_df.columns = list(u"ABCDA")

    frame = dict(
        float=DataFrame({u'A': series[u'float'],
                         u'B': series[u'float'] + 1}),
        int=DataFrame({u'A': series[u'int'], u'B': series[u'int'] + 1}),
        mixed=DataFrame({k: data[k] for k in [u'A', u'B', u'C', u'D']}),
        mi=DataFrame({u'A': np.arange(5).astype(np.float64),
                      u'B': np.arange(5).astype(np.int64)},
                     index=MultiIndex.from_tuples(
                         tuple(zip(*[[u'bar', u'bar', u'baz', u'baz',
                                      u'baz'],
                                     [u'one', u'two', u'one', u'two',
                                      u'three']])),
                         names=[u'first', u'second'])),
        dup=DataFrame(np.arange(15).reshape(5, 3).astype(np.float64),
                      columns=[u'A', u'B', u'A']),
        cat_onecol=DataFrame({u'A': Categorical([u'foo', u'bar'])}),
        cat_and_float=DataFrame({
            u'A': Categorical([u'foo', u'bar', u'baz']),
            u'B': np.arange(3).astype(np.int64)}),
        mixed_dup=mixed_dup_df,
        dt_mixed_tzs=DataFrame({
            u'A': Timestamp('20130102', tz='US/Eastern'),
            u'B': Timestamp('20130603', tz='CET')}, index=range(5)))

    mixed_dup_panel = Panel({u'ItemA': frame[u'float'],
                             u'ItemB': frame[u'int']})
    mixed_dup_panel.items = [u'ItemA', u'ItemA']
    panel = dict(float=Panel({u'ItemA': frame[u'float'],
                              u'ItemB': frame[u'float'] + 1}),
                 dup=Panel(np.arange(30).reshape(3, 5, 2).astype(np.float64),
                           items=[u'A', u'B', u'A']),
                 mixed_dup=mixed_dup_panel)

    return dict(series=series,
                frame=frame,
                panel=panel,
                index=index,
                scalars=scalars,
                mi=mi,
                sp_series=dict(float=_create_sp_series(),
                               ts=_create_sp_tsseries()),
                sp_frame=dict(float=_create_sp_frame()))
def response_as_panel(self, swap=False):
    panel = Panel(self.response)
    if swap:
        panel = panel.swapaxes('items', 'minor')
    return panel
# coding: utf-8
import numpy as np
import pandas as pd
from pandas_datareader import *
from pandas import Series, DataFrame, Index, Panel

# p1 = pd.Panel(np.arange(27).reshape((3, 3, 3)))
# print(p1)
# print(p1.values)

# da = get_data_yahoo('AAPL')
# print(da)

data = dict((stk, get_data_yahoo(stk, '1/1/2016', '1/15/2016'))
            for stk in ['AAPL', 'GOOG', 'BIDU', 'MSFT'])
print(data)

pdata = Panel(data)
print(pdata)

# pdata = pdata.swapaxes('items', 'minor')
print(pdata)

# access order: Item -> Major -> Minor
print(pdata['AAPL'])
print(pdata[:, '1/5/2016', :])
print(pdata['AAPL', '1/6/2016', :])

# conversion between Panel and DataFrame
stacked = pdata.ix[:, '1/7/2016':, :].to_frame()
print(stacked)
print(stacked.to_panel())
def setup_method(self, method):
    self.series_ints = Series(np.random.rand(4), index=lrange(0, 8, 2))
    self.frame_ints = DataFrame(np.random.randn(4, 4),
                                index=lrange(0, 8, 2),
                                columns=lrange(0, 12, 3))
    with catch_warnings(record=True):
        self.panel_ints = Panel(np.random.rand(4, 4, 4),
                                items=lrange(0, 8, 2),
                                major_axis=lrange(0, 12, 3),
                                minor_axis=lrange(0, 16, 4))

    self.series_uints = Series(np.random.rand(4),
                               index=UInt64Index(lrange(0, 8, 2)))
    self.frame_uints = DataFrame(np.random.randn(4, 4),
                                 index=UInt64Index(lrange(0, 8, 2)),
                                 columns=UInt64Index(lrange(0, 12, 3)))
    with catch_warnings(record=True):
        self.panel_uints = Panel(np.random.rand(4, 4, 4),
                                 items=UInt64Index(lrange(0, 8, 2)),
                                 major_axis=UInt64Index(lrange(0, 12, 3)),
                                 minor_axis=UInt64Index(lrange(0, 16, 4)))

    self.series_labels = Series(np.random.randn(4), index=list('abcd'))
    self.frame_labels = DataFrame(np.random.randn(4, 4),
                                  index=list('abcd'), columns=list('ABCD'))
    with catch_warnings(record=True):
        self.panel_labels = Panel(np.random.randn(4, 4, 4),
                                  items=list('abcd'),
                                  major_axis=list('ABCD'),
                                  minor_axis=list('ZYXW'))

    self.series_mixed = Series(np.random.randn(4), index=[2, 4, 'null', 8])
    self.frame_mixed = DataFrame(np.random.randn(4, 4),
                                 index=[2, 4, 'null', 8])
    with catch_warnings(record=True):
        self.panel_mixed = Panel(np.random.randn(4, 4, 4),
                                 items=[2, 4, 'null', 8])

    self.series_ts = Series(np.random.randn(4),
                            index=date_range('20130101', periods=4))
    self.frame_ts = DataFrame(np.random.randn(4, 4),
                              index=date_range('20130101', periods=4))
    with catch_warnings(record=True):
        self.panel_ts = Panel(np.random.randn(4, 4, 4),
                              items=date_range('20130101', periods=4))

    dates_rev = (date_range('20130101', periods=4)
                 .sort_values(ascending=False))
    self.series_ts_rev = Series(np.random.randn(4), index=dates_rev)
    self.frame_ts_rev = DataFrame(np.random.randn(4, 4), index=dates_rev)
    with catch_warnings(record=True):
        self.panel_ts_rev = Panel(np.random.randn(4, 4, 4),
                                  items=dates_rev)

    self.frame_empty = DataFrame({})
    self.series_empty = Series({})
    with catch_warnings(record=True):
        self.panel_empty = Panel({})

    # form agglomerates
    for o in self._objs:
        d = dict()
        for t in self._typs:
            d[t] = getattr(self, '%s_%s' % (o, t), None)
        setattr(self, o, d)
def setup(self):
    with warnings.catch_warnings(record=True):
        self.p = Panel(np.random.randn(100, 100, 100))
        self.inds = range(0, 100, 10)
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import pandas_datareader.data as web
from pandas import Series, DataFrame, Index, Panel

pdata = Panel(dict((stk, web.get_data_yahoo(stk, '1/1/2016', '1/15/2016'))
                   for stk in ['AAPL', 'GOOG', 'BIDU', 'MSFT']))
print(pdata)

pdata = pdata.swapaxes('items', 'minor')
print(pdata)

print()
print("## access order: Item -> Major -> Minor:")
print(pdata['Adj Close'])
print(pdata[:, '1/5/2016', :])  # items as columns
print(pdata['Adj Close', '1/6/2016', :])

print()
print("## conversion between Panel and DataFrame:")
print("""### items as columns; major and minor as hierarchical index:""")
stacked1 = pdata.ix[:, '1/7/2016':, :].to_frame()
print("stacked1 ==>")
print(stacked1)

print()
stacked2 = pdata.ix[:, '1/7/2016':, 0:1].to_frame()
# stacked2 = pdata.ix[:, '1/7/2016':, 0].to_frame()  # Error: scalar, not an interval
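# A minimal sketch, assuming a recent pandas, of the same item/major/minor
# access on the MultiIndex DataFrame that to_frame() already produces
# (Panel was deprecated in pandas 0.20 and removed in 1.0); the data here
# is a random stand-in, not Yahoo output.
import numpy as np
import pandas as pd

idx = pd.MultiIndex.from_product(
    [pd.date_range('2016-01-04', periods=3), ['AAPL', 'MSFT']],
    names=['major', 'minor'])
stacked = pd.DataFrame(np.random.randn(6, 2), index=idx,
                       columns=['Adj Close', 'Volume'])

print(stacked['Adj Close'])                     # one item -> Series
print(stacked.xs('2016-01-05', level='major'))  # one major slice
print(stacked.xs('AAPL', level='minor'))        # one minor slice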
def Regroup(groupinfo, labels, *args):
    """
    Modify the 3D numpy arrays in *args* so that data is grouped according
    to user specifications.

    For example, presume that the following scenarios are given:

        Fast_Down_01    Slow_Down_01
        Fast_Down_02    Slow_Down_02
        Fast_Down_04    Slow_Down_04
        Fast_Down_08    Slow_Down_08

    And only a single track run is specified: SCIT. In this example, the
    third item, skill scores, can be arbitrary.

    Now, suppose that we want to display the result data such that the
    x-axis is for the Down* and there are two plots: one for Fast and one
    for Slow. So, we group the scenarios data by some key (discussed later)
    _into_ the trackruns dimension. For this reason, the data dimension
    being grouped into (in this case, trackruns) must originally be
    singleton.

    *groupinfo* - dict with keys "group", "into", and "by". The "group"
    element states which dimension the grouping will occur on. The "into"
    element states along which dimension the groups will be stacked. These
    two elements can have values of "scenarios", "skills", or "trackruns".

    The "by" element is rudimentary for now, but it controls the key value
    function used for grouping. The key function is applied to the list of
    default labels for the dimension stated for "group". The unique set of
    keys generated by the function on these labels becomes the new default
    labels for the "into" dimension.

    Currently, the key function is hard-coded to split the label by
    underscores and search for the string given in "by" in the resulting
    list. It then returns the list's next value. So, in the above example,
    the new labels for the "trackruns" dimension would be "01", "02", "04",
    and "08".
    """
    if groupinfo is None:
        return args

    if len(args) == 0:
        return args

    if len(labels[groupinfo['into']]) != 1:
        raise ValueError("Dim %s is not singleton!" % groupinfo['into'])

    if groupinfo['group'] == groupinfo['into']:
        raise ValueError("Can not group %s dimension into itself!" %
                         groupinfo['group'])

    from pandas import Panel

    grpAxis = dataAxes[groupinfo['group']]
    intoAxis = dataAxes[groupinfo['into']]
    otherAxis = dataAxes[list(
        set(['scenarios', 'trackruns', 'skills']) -
        set([groupinfo['group'], groupinfo['into']]))[0]]

    # !!Temporary!! restricted functionality for just trackrun variables
    keyfunc = lambda x: _varval(x, groupinfo['by'])

    g_args = []
    for a in args:
        wp = Panel(a, items=labels['scenarios'],
                   major_axis=labels['skills'],
                   minor_axis=labels['trackruns'])
        grouped = wp.groupby(keyfunc, axis=grpAxis)
        if len(grouped) == 0:
            raise ValueError("Grouping didn't result in anything!")
        intolabs, g_a = zip(*grouped)
        # Get a list of numpy arrays from the list of Panels
        g_a = np.concatenate(map(lambda x: x.values, g_a), axis=intoAxis)
        g_args.append(g_a)

    labels[groupinfo['into']] = intolabs

    # Do the full set for error-checking purposes
    trunclabs = None
    for intolab in intolabs:
        # TODO: Generalize this!
        # Take some original labels and remove the variable and its value
        # that were used to make *intolabs*
        tmp = ['_'.join(_remove_varval(lab.split('_'), groupinfo['by']))
               for lab in grouped.groups[intolab]]
        if trunclabs is not None:
            if tmp != trunclabs:
                raise ValueError("The labels do not match! %s\n%s" %
                                 (trunclabs, tmp))
        else:
            trunclabs = tmp

    labels[groupinfo['group']] = trunclabs
    return g_args
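# A small illustration of the key function the Regroup docstring describes.
# _varval is the module's own helper; this re-implementation of its
# documented behavior is an assumption for demonstration only.
def _varval_sketch(label, var):
    parts = label.split('_')            # split the label on underscores
    return parts[parts.index(var) + 1]  # return the token following `var`

for lab in ['Fast_Down_01', 'Slow_Down_08']:
    print(lab, '->', _varval_sketch(lab, 'Down'))
# Fast_Down_01 -> 01
# Slow_Down_08 -> 08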