def parse_lutkepohl_data(path):  # pragma: no cover
    """
    Parse data files from Lutkepohl (2005) book

    Source for data files: www.jmulti.de

    The file is scanned in binary mode. Every line up to and including the
    first one containing ``*/`` is skipped, then every line up to and
    including the first one matching ``<(.*) (\\w)([\\d]+)>`` (start year,
    one-letter frequency code, starting period). The remaining lines are
    read with ``np.genfromtxt``.

    Parameters
    ----------
    path : str
        Location of the data file.

    Returns
    -------
    data : ndarray
        Result of ``np.genfromtxt(path, names=True, ...)``.
    date_range : pandas.DatetimeIndex
        One business-period-end date per row of ``data``.

    Raises
    ------
    IndexError
        If the file ends before the ``*/`` marker or the header line is found.
    KeyError
        If the frequency code is not one of ``Q``, ``M`` or ``A``.
    """
    from collections import deque
    from datetime import datetime
    import pandas
    import re

    # Raw string: ``\w`` and ``\d`` are regex escapes, not string escapes.
    regex = re.compile(asbytes(r'<(.*) (\w)([\d]+)>.*'))
    with open(path, 'rb') as f:
        lines = deque(f)

    # Count the lines before the one that closes the leading comment block.
    to_skip = 0
    while asbytes('*/') not in lines.popleft():
        to_skip += 1

    # Keep consuming lines until the "<year freq start>" header is found.
    while True:
        to_skip += 1
        line = lines.popleft()
        m = regex.match(line)
        if m:
            year, freq, start_point = m.groups()
            break

    data = np.genfromtxt(path, names=True, skip_header=to_skip + 1)
    n = len(data)

    # generate the corresponding date range (using pandas for now)
    start_point = int(start_point)
    year = int(year)

    # The file is read as bytes, so the frequency code is a bytes key.
    offsets = {
        asbytes('Q'): frequencies.BQuarterEnd(),
        asbytes('M'): frequencies.BMonthEnd(),
        asbytes('A'): frequencies.BYearEnd()
    }

    offset = offsets[freq]

    # Advance from the first period end of ``year`` to period ``start_point``.
    inc = offset * (start_point - 1)
    start_date = offset.rollforward(datetime(year, 1, 1)) + inc

    # ``DatetimeIndex(start=..., freq=..., periods=...)`` is no longer
    # accepted by pandas; ``date_range`` is the supported constructor.
    date_range = pandas.date_range(start=start_date, freq=offset, periods=n)

    return data, date_range
def parse_lutkepohl_data(path):  # pragma: no cover
    """
    Parse data files from Lutkepohl (2005) book

    Source for data files: www.jmulti.de

    The file is scanned in binary mode. Every line up to and including the
    first one containing ``*/`` is skipped, then every line up to and
    including the first one matching ``<(.*) (\\w)([\\d]+)>`` (start year,
    one-letter frequency code, starting period). The remaining lines are
    read with ``np.genfromtxt``.

    Parameters
    ----------
    path : str
        Location of the data file.

    Returns
    -------
    data : ndarray
        Result of ``np.genfromtxt(path, names=True, ...)``.
    date_range : pandas.DatetimeIndex
        One business-period-end date per row of ``data``.

    Raises
    ------
    IndexError
        If the file ends before the ``*/`` marker or the header line is found.
    KeyError
        If the frequency code is not one of ``Q``, ``M`` or ``A``.
    """
    from collections import deque
    from datetime import datetime
    import pandas
    import re

    # Raw string: ``\w`` and ``\d`` are regex escapes, not string escapes.
    regex = re.compile(asbytes(r'<(.*) (\w)([\d]+)>.*'))
    with open(path, 'rb') as f:
        lines = deque(f)

    # Count the lines before the one that closes the leading comment block.
    to_skip = 0
    while asbytes('*/') not in lines.popleft():
        to_skip += 1

    # Keep consuming lines until the "<year freq start>" header is found.
    while True:
        to_skip += 1
        line = lines.popleft()
        m = regex.match(line)
        if m:
            year, freq, start_point = m.groups()
            break

    data = np.genfromtxt(path, names=True, skip_header=to_skip + 1)
    n = len(data)

    # generate the corresponding date range (using pandas for now)
    start_point = int(start_point)
    year = int(year)

    # The file is read as bytes, so the frequency code is a bytes key.
    offsets = {
        asbytes('Q'): frequencies.BQuarterEnd(),
        asbytes('M'): frequencies.BMonthEnd(),
        asbytes('A'): frequencies.BYearEnd()
    }

    offset = offsets[freq]

    # Advance from the first period end of ``year`` to period ``start_point``.
    inc = offset * (start_point - 1)
    start_date = offset.rollforward(datetime(year, 1, 1)) + inc

    # ``DatetimeIndex(start=..., freq=..., periods=...)`` is no longer
    # accepted by pandas; ``date_range`` is the supported constructor.
    date_range = pandas.date_range(start=start_date, freq=offset, periods=n)

    return data, date_range
def parse_lutkepohl_data(path):  # pragma: no cover
    """
    Parse data files from Lütkepohl (2005) book

    Source for data files: www.jmulti.de

    Parameters
    ----------
    path : str
        Location of the data file.

    Returns
    -------
    data : recarray
        Records obtained from ``pd.read_csv(...).to_records(index=False)``.
    date_range : DatetimeIndex
        ``pd.date_range`` with one entry per record in ``data``.
    """
    from collections import deque
    from datetime import datetime
    import re

    header_pattern = re.compile(asbytes(r'<(.*) (\w)([\d]+)>.*'))
    with open(path, 'rb') as handle:
        remaining = deque(handle)

    # Consume lines through the one holding the '*/' marker; only the lines
    # before the marker are counted here.
    comment_end = asbytes('*/')
    to_skip = 0
    while True:
        if comment_end in remaining.popleft():
            break
        to_skip += 1

    # Consume (and count) lines until the "<year freq start>" header matches.
    header = None
    while not header:
        to_skip += 1
        header = header_pattern.match(remaining.popleft())
    year, freq, start_point = header.groups()

    frame = pd.read_csv(path, delimiter=r"\s+", header=to_skip + 1)
    data = frame.to_records(index=False)
    n = len(data)

    # generate the corresponding date range (using pandas for now)
    year = int(year)
    start_point = int(start_point)

    # Keys are bytes because the file was read in binary mode.
    offsets = {
        asbytes('Q'): frequencies.BQuarterEnd(),
        asbytes('M'): frequencies.BMonthEnd(),
        asbytes('A'): frequencies.BYearEnd()
    }
    offset = offsets[freq]

    # First period end of ``year``, shifted forward to period ``start_point``.
    first_period_end = offset.rollforward(datetime(year, 1, 1))
    start_date = first_period_end + offset * (start_point - 1)

    date_range = pd.date_range(start=start_date, freq=offset, periods=n)
    return data, date_range
def _null_terminate(self, s, encoding):
    """
    Truncate ``s`` at its first null byte.

    Parameters
    ----------
    s : bytes
        Raw field contents, possibly padded with ``\\x00``.
    encoding : str
        Encoding used to decode the result when running under ``PY3``.

    Returns
    -------
    The part of ``s`` before the first null byte (all of ``s`` when it has
    none); decoded with ``encoding`` when ``PY3`` is true, otherwise returned
    as is.
    """
    # ``partition`` yields everything before the first separator, or the whole
    # input when the separator is absent.  This is what the former
    # ``s.lstrip(null)[:s.index(null)]`` computed for byte strings (the slice
    # index came from the unstripped input, so the ``lstrip`` had no effect),
    # without needing a bare ``except`` for the "no null byte" case.
    terminated = s.partition(b'\x00')[0]
    if PY3:  # have bytes not strings, so must decode
        return terminated.decode(encoding)
    return terminated
def _null_terminate(self, s, encoding): null_byte = asbytes('\x00') try: s = s.lstrip(null_byte)[:s.index(null_byte)] except Exception: pass return s.decode(encoding)
def setup_class(cls):
    """Set up the SAS comparison case.

    Stores the inputs taken from ``dta3`` on the class, runs the shared
    ``setup_class_`` hook, then stores the reference values taken from
    ``sas_``.
    """
    # SAS case: inputs must be in place before the shared setup hook runs
    cls.alpha = 0.05
    cls.groups = dta3['Brand']
    cls.endog = dta3['Relief']
    cls.setup_class_()

    # reference results to compare against
    bounds = sas_[['lower', 'upper']]
    cls.meandiff2 = sas_['mean']
    cls.confint2 = bounds.astype(float).values.reshape((3, 2))
    cls.reject2 = sas_['sig'] == asbytes('***')
def setup_class(self):
    """Set up the SAS comparison case.

    Stores the inputs taken from ``dta3``, runs the shared ``setup_class_``
    hook, then stores the reference values taken from ``sas_`` (mean
    differences, a 3x2 array of lower/upper confidence bounds, and the
    rejection flags).
    """
    import numpy as np

    # SAS case
    self.endog = dta3['Relief']
    self.groups = dta3['Brand']
    self.alpha = 0.05
    self.setup_class_()

    self.meandiff2 = sas_['mean']
    # Selecting several fields of a structured array keeps the original
    # (padded) record layout in NumPy >= 1.16, so ``.view(float)`` on that
    # selection is not valid.  Stack the two columns explicitly instead.
    self.confint2 = np.column_stack(
        (sas_['lower'], sas_['upper'])).astype(float).reshape((3, 2))
    self.reject2 = sas_['sig'] == asbytes('***')
def setup_class(self):
    """Set up the SAS comparison case.

    Stores the inputs taken from ``dta3``, runs the shared ``setup_class_``
    hook, then stores the reference values taken from ``sas_`` (mean
    differences, a 3x2 array of lower/upper confidence bounds, and the
    rejection flags).
    """
    import numpy as np

    # SAS case
    self.endog = dta3['Relief']
    self.groups = dta3['Brand']
    self.alpha = 0.05
    self.setup_class_()

    self.meandiff2 = sas_['mean']
    # Selecting several fields of a structured array keeps the original
    # (padded) record layout in NumPy >= 1.16, so ``.view(float)`` on that
    # selection is not valid.  Stack the two columns explicitly instead.
    self.confint2 = np.column_stack(
        (sas_['lower'], sas_['upper'])).astype(float).reshape((3, 2))
    self.reject2 = sas_['sig'] == asbytes('***')
def test_missing_roundtrip():
    """Check missing values written by StataWriter and read by genfromdta.

    A single record holding NaN, inf and an empty string is written to an
    in-memory buffer and read back; the two numeric fields must come back as
    null and the string field as empty bytes.  A stored file is then read
    with ``missing_flt=-999`` and the first five fields of its first record
    must equal -999.
    """
    record_dtype = [("double_miss", float),
                    ("float_miss", np.float32),
                    ("string_miss", "a1")]
    original = np.array([(np.nan, np.inf, "")], dtype=record_dtype)

    # write to memory, then rewind so the reader starts at the beginning
    stream = BytesIO()
    StataWriter(stream, original).write_file()
    stream.seek(0)

    first_row = genfromdta(stream, missing_flt=np.nan)[0]
    assert_(isnull(first_row[0]))
    assert_(isnull(first_row[1]))
    assert_(first_row[2] == asbytes(""))

    missing_path = os.path.join(curdir, "results/data_missing.dta")
    stored_row = genfromdta(missing_path, missing_flt=-999)[0]
    assert_(np.all([stored_row[i] == -999 for i in range(5)]))
def _write(self, to_write):
    """
    Pass ``to_write`` through ``asbytes`` and write the result to
    ``self._file`` (Python 3 compatibility helper).
    """
    payload = asbytes(to_write)
    self._file.write(payload)
cyl_labels = np.array([ 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'France', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'Japan', 'USA', 'USA', 'USA', 'Japan', 'Germany', 'France', 'Germany', 'Sweden', 'Germany', 'USA', 'USA', 'USA', 'USA', 'USA', 'Germany', 'USA', 'USA', 'France', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'Germany', 'Japan', 'USA', 'USA', 'USA', 'USA', 'Germany', 'Japan', 'Japan', 'USA', 'Sweden', 'USA', 'France', 'Japan', 'Germany', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'Germany', 'Japan', 'Japan', 'USA', 'USA', 'Japan', 'Japan', 'Japan', 'Japan', 'Japan', 'Japan', 'USA', 'USA', 'USA', 'USA', 'Japan', 'USA', 'USA', 'USA', 'Germany', 'USA', 'USA', 'USA' ]) #accommodate recfromtxt for python 3.2, requires bytes ss = asbytes(ss) ss2 = asbytes(ss2) ss3 = asbytes(ss3) ss5 = asbytes(ss5) dta = pd.read_csv(BytesIO(ss), sep=r'\s+', header=None, engine='python') dta.columns = "Rust", "Brand", "Replication" dta2 = pd.read_csv(BytesIO(ss2), sep=r'\s+', header=None, engine='python') dta2.columns = "idx", "Treatment", "StressReduction" dta2["Treatment"] = dta2["Treatment"].map(lambda v: v.encode('utf-8')) dta3 = pd.read_csv(BytesIO(ss3), sep=r'\s+', header=None, engine='python') dta3.columns = ["Brand", "Relief"] dta5 = pd.read_csv(BytesIO(ss5), sep=r'\t', header=None, engine='python') dta5.columns = ['pair', 'mean', 'lower', 'upper', 'sig'] for col in ('pair', 'sig'): dta5[col] = dta5[col].map(lambda v: v.encode('utf-8'))
# Integer value (4, 6 or 8) for each observation.
cylinders = np.array([8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
                      4, 8, 8, 8, 8, 8, 8, 8, 8, 8,
                      4, 6, 6, 6, 4, 4, 4, 4, 4, 4,
                      6, 8, 8, 8, 8, 4, 4, 4, 4, 8,
                      8, 8, 8, 6, 6, 6, 6, 4, 4, 4,
                      4, 6, 6, 6, 6, 4, 4, 4, 4, 4,
                      8, 4, 6, 6, 8, 8, 8, 8, 4, 4,
                      4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
                      4, 4, 4, 4, 4, 4, 4, 6, 6, 4,
                      6, 4, 4, 4, 4, 4, 4, 4, 4])
# Country label for each observation (one string per entry).
cyl_labels = np.array([
    'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA',
    'France', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA',
    'Japan', 'USA', 'USA', 'USA', 'Japan', 'Germany', 'France', 'Germany',
    'Sweden', 'Germany',
    'USA', 'USA', 'USA', 'USA', 'USA', 'Germany', 'USA', 'USA', 'France', 'USA',
    'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'Germany',
    'Japan', 'USA', 'USA', 'USA', 'USA', 'Germany', 'Japan', 'Japan', 'USA',
    'Sweden',
    'USA', 'France', 'Japan', 'Germany', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA',
    'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'Germany', 'Japan', 'Japan', 'USA',
    'USA', 'Japan', 'Japan', 'Japan', 'Japan', 'Japan', 'Japan', 'USA', 'USA',
    'USA',
    'USA', 'Japan', 'USA', 'USA', 'USA', 'Germany', 'USA', 'USA', 'USA'])

# accommodate recfromtxt for python 3.2, requires bytes
ss = asbytes(ss)
ss2 = asbytes(ss2)
ss3 = asbytes(ss3)
ss5 = asbytes(ss5)

# Parse each raw text table into a record array with named fields.
dta = np.recfromtxt(BytesIO(ss), names=("Rust","Brand","Replication"))
dta2 = np.recfromtxt(BytesIO(ss2), names = ("idx", "Treatment", "StressReduction"))
dta3 = np.recfromtxt(BytesIO(ss3), names = ("Brand", "Relief"))
# This table is tab-delimited, unlike the ones above.
dta5 = np.recfromtxt(BytesIO(ss5), names = ('pair', 'mean', 'lower', 'upper', 'sig'), delimiter='\t')
# Pick the records at positions 1, 3 and 2 of dta5, in that order.
sas_ = dta5[[1,3,2]]

from statsmodels.stats.multicomp import (tukeyhsd, pairwise_tukeyhsd,
                                         MultiComparison)
# Disabled alternative import and example call, kept for reference:
#import statsmodels.sandbox.stats.multicomp as multi
#print tukeyhsd(dta['Brand'], dta['Rust'])