Example #1
0
File: hdf.py Project: Itay4/pandas
class HDF(BaseIO):
    """Benchmark reading and writing one DataFrame through HDF5.

    Each benchmark runs once per entry in ``params``; the entry is passed
    to the methods as ``format`` and forwarded to ``DataFrame.to_hdf``.
    """

    params = ['table', 'fixed']
    param_names = ['format']

    def setup(self, format):
        """Build the frame and write it once so the read benchmark has a file."""
        n_rows, n_cols = 100000, 5
        self.fname = '__test__.h5'

        values = np.random.randn(n_rows, n_cols)
        labels = ['float{}'.format(col) for col in range(n_cols)]
        stamps = date_range('20000101', periods=n_rows, freq='H')

        frame = DataFrame(values, columns=labels, index=stamps)
        # Extra column named 'object', filled from tm.makeStringIndex.
        frame['object'] = tm.makeStringIndex(n_rows)
        self.df = frame

        self.df.to_hdf(self.fname, 'df', format=format)

    def time_read_hdf(self, format):
        """Time loading key 'df' back from the file written in setup."""
        read_hdf(self.fname, 'df')

    def time_write_hdf(self, format):
        """Time writing the frame under key 'df' in the given format."""
        self.df.to_hdf(self.fname, 'df', format=format)
Example #2
0
# Percentage change of gdp, first over one period and then over four.
gdp.pct_change().tail()
gdp.pct_change(periods=4).tail() # Quarterly data, annual difference

# Write state_gdp out in several formats.
state_gdp.to_excel('state_gdp_from_dataframe.xls')
state_gdp.to_excel('state_gdp_from_dataframe_sheetname.xls', sheet_name='State GDP')
state_gdp.to_excel('state_gdp_from_dataframe.xlsx')
state_gdp.to_csv('state_gdp_from_dataframe.csv')
# JSON goes to an in-memory buffer; peek at the first 50 characters.
sio = StringIO.StringIO()
state_gdp.to_json(sio)
sio.seek(0)
# getvalue() is the public accessor; .buf is an implementation attribute.
sio.getvalue()[:50]

# Write one all-zero frame as CSV, HDF5, compressed HDF5 and gzipped CSV.
df = DataFrame(zeros((1000,1000)))
df.to_csv('size_test.csv')
df.to_hdf('size_test.h5','df') # h5 is the usual extension for HDF5
df.to_hdf('size_test_compressed.h5','df',complib='zlib',complevel=6)
# The with-block closes the gzip stream even if to_csv raises.
with gzip.open('size_test.csvz','w') as f:
    df.to_csv(f)
df_from_csvz = read_csv('size_test.csvz',compression='gzip')

# Dump a bare NumPy array via DataFrame, without header row or index column.
x = randn(100,100)
DataFrame(x).to_csv('numpy_array.csv',header=False,index=False)

codes = ['GDPC1','INDPRO','CPILFESL','UNRATE','GS10','GS1','BAA','AAA']
# No backslash needed: the open bracket already continues the line.
names = ['Real GDP','Industrial Production','Core CPI','Unemployment Rate',
   '10 Year Yield','1 Year Yield','Baa Yield','Aaa Yield']
# r to disable escape
base_url = r'http://research.stlouisfed.org/fred2/data/'
Example #3
0
def readRinexNav(fn,odir=None):
    """
    Read a RINEX 2 GPS navigation file into a DataFrame, one row per record.

    Michael Hirsch
    It may actually be faster to read the entire file via f.read() and then .split()
    and asarray().reshape() to the final result, but I did it frame by frame.
    http://gage14.upc.es/gLAB/HTML/GPS_Navigation_Rinex_v2.11.html

    Parameters
    ----------
    fn : str or Path
        navigation file to read; a leading ``~`` is expanded.
    odir : str or Path, optional
        if given, the result is also written to an HDF5 file in this
        directory, named after `fn` with its suffix replaced by ``.h5``,
        under key 'NAV'.

    Returns
    -------
    nav : DataFrame
        indexed by record epoch, with column 'sv' followed by the 29
        numeric fields of each record.

    Raises
    ------
    ValueError
        if the file ends before an 'END OF HEADER' line is found.
    """
    fn = Path(fn).expanduser()
    if odir:
        odir = Path(odir).expanduser()

    startcol = 3  # column where numerical data starts
    nfloat = 19   # number of text elements per float data number
    nline = 7     # number of lines per record (after the epoch line)

    sv = []
    epoch = []
    rows = []  # one concatenated fixed-width text row per record
    with fn.open('r') as f:
        # find end of header, which has non-constant length
        while True:
            line = f.readline()
            if not line:
                # EOF reached: without this check the scan would never end
                raise ValueError('END OF HEADER not found in {}'.format(fn))
            if 'END OF HEADER' in line:
                break
        # handle frame by frame
        while True:
            headln = f.readline()
            if not headln:
                break
            if not headln.strip():
                # tolerate blank lines between or after records
                continue
            # handle the header
            sv.append(int(headln[:2]))
            year = int(headln[2:5])
            if 80 <= year <= 99:
                year += 1900
            elif year < 80:  # two-digit year rule, good till year 2079
                year += 2000
            epoch.append(datetime(year=year,
                                  month=int(headln[5:8]),
                                  day=int(headln[8:11]),
                                  hour=int(headln[11:14]),
                                  minute=int(headln[14:17]),
                                  second=int(headln[17:20]),
                                  # single digit after the decimal point = tenths of a second
                                  microsecond=int(headln[21])*100000))
            # now get the data.
            # Use rstrip() to chomp newlines consistently on Windows and Python 2 & Python 3
            # Specifically [:-1] doesn't work consistently on multi-platform line endings
            # NOTE(review): rstrip() also drops trailing blank fields, so this
            # assumes every line carries all of its fields -- confirm.
            rows.append(headln[22:].rstrip() +
                        ''.join(f.readline()[startcol:].rstrip() for _ in range(nline)))

    # Fortran 'D' exponents -> 'E' so the numbers parse as floats
    raws = ('\n'.join(rows) + '\n').replace('D','E')

    strio = BytesIO(raws.encode())
    # an integer delimiter makes genfromtxt split fixed-width fields;
    # atleast_2d keeps a single-record file as one row instead of a 1-D array
    darr = np.atleast_2d(np.genfromtxt(strio,delimiter=nfloat))

    nav = DataFrame(np.hstack((np.asarray(sv,int)[:,None],darr)),
                    index=epoch,
                    columns=['sv','SVclockBias','SVclockDrift','SVclockDriftRate','IODE',
                             'Crs','DeltaN','M0','Cuc','Eccentricity','Cus','sqrtA','TimeEph',
                             'Cic','OMEGA','CIS','Io','Crc','omega','OMEGA DOT','IDOT',
                             'CodesL2','GPSWeek','L2Pflag','SVacc','SVhealth','TGD','IODC',
                             'TransTime','FitIntvl'])

    if odir:
        # with_suffix is a Path method; fn.name is a plain str and has none
        h5fn = odir / fn.with_suffix('.h5').name
        print('saving NAV data to {}'.format(h5fn))
        nav.to_hdf(str(h5fn),key='NAV',mode='a',complevel=6,append=False)

    return nav
# Percentage change of gdp, first over one period and then over four.
gdp.pct_change().tail()
gdp.pct_change(periods=4).tail()  # Quarterly data, annual difference

# Write state_gdp out in several formats.
state_gdp.to_excel("state_gdp_from_dataframe.xls")
state_gdp.to_excel("state_gdp_from_dataframe_sheetname.xls", sheet_name="State GDP")
state_gdp.to_excel("state_gdp_from_dataframe.xlsx")
state_gdp.to_csv("state_gdp_from_dataframe.csv")
# JSON goes to an in-memory buffer; peek at the first 50 characters.
sio = StringIO.StringIO()
state_gdp.to_json(sio)
sio.seek(0)
# getvalue() is the public accessor; .buf is an implementation attribute.
sio.getvalue()[:50]

# Write one all-zero frame as CSV, HDF5, compressed HDF5 and gzipped CSV.
df = DataFrame(zeros((1000, 1000)))
df.to_csv("size_test.csv")
df.to_hdf("size_test.h5", "df")  # h5 is the usual extension for HDF5
df.to_hdf("size_test_compressed.h5", "df", complib="zlib", complevel=6)
# The with-block closes the gzip stream even if to_csv raises.
with gzip.open("size_test.csvz", "w") as f:
    df.to_csv(f)
df_from_csvz = read_csv("size_test.csvz", compression="gzip")

# Dump a bare NumPy array via DataFrame, without header row or index column.
x = randn(100, 100)
DataFrame(x).to_csv("numpy_array.csv", header=False, index=False)

codes = ["GDPC1", "INDPRO", "CPILFESL", "UNRATE", "GS10", "GS1", "BAA", "AAA"]
names = [
    "Real GDP",
    "Industrial Production",
    "Core CPI",
    "Unemployment Rate",
Example #5
0
    # Part 1 - split the long-format input into one HDF5 table per sample.
    # NOTE(review): f, h5f, current_sample and filename are set before this
    # fragment; their initial values are not visible here.
    samples = []
    rows = []

    # This initial value is overwritten by enumerate() on the first iteration.
    i = 0
    # NOTE(review): the handle returned by open(f) is never closed explicitly.
    for i, line in enumerate(open(f)):
        # Skip the first ten lines (presumably a file header -- confirm).
        if i < 10:
            continue
        # First four tab-separated fields; the last two are joined into one
        # genotype string.
        snp, sample, g1, g2 = line.split('\t')[:4]
        genotype = ''.join([g1, g2])
        if sample == current_sample:
            rows.append((snp, genotype))
        else:
            # Sample changed: store the rows gathered so far under the
            # previous sample's name, then start collecting the new sample.
            print current_sample, i
            samples.append(current_sample)
            df = DataFrame(rows, columns=['SNP', current_sample])
            df.to_hdf(h5f, current_sample)
            rows = [(snp, genotype)]
            current_sample = sample
    # NOTE(review): the rows of the last sample are still in `rows` when the
    # loop ends and are not written anywhere in the visible code -- confirm
    # whether that is intended.

    # Part 2 - assemble wide format
    print 'Assembling wide format'
    df = pd.read_hdf(h5f, samples[0])
    for sample in samples[1:]:
        print sample
        # NOTE(review): reads from filename.replace('txt', 'h5') rather than
        # h5f; presumably both name the same file -- verify.
        df1 = pd.read_hdf(filename.replace('txt', 'h5'),sample)
        # merge on 'SNP' with the default join type
        df = df.merge(df1, on='SNP')

    df.to_hdf(h5f, 'wide_format')
    print 'Wide format assembled'