def extract():
    fields = [
        'Date', 'Total Equity', 'Domestic Equity', 'World Equity', 'Hybrid',
        'Total Bond', 'Taxable Bond', 'Municipal Bond', 'Total'
    ]
    fin = open(archive)
    records, metadata = xls.parse(fin)
    records = [[r[f['id']] for f in metadata['fields']] for r in records]
    # discard first 2 rows as they are blank / table title
    records = records[2:]
    for r in records:
        if isinstance(r[0], datetime.datetime):
            # get rid of the time part
            r[0] = r[0].isoformat()[:10]
        for idx in range(1, len(r)):
            if isinstance(r[idx], float):
                r[idx] = int(r[idx])
    # split out weekly from monthly
    # somewhere down the sheet there is a blank line and then a heading for
    # the weekly data
    for count, r in enumerate(records):
        if r[0] == 'Estimated Weekly Net New Cash Flow':
            weekly = records[count + 1:count + 7]
            # count - 1 because of the blank line above the heading
            monthly = records[:count - 1]
    # just overwrite monthly: they seem to append, so the earliest data is
    # still there
    fo = open('data/monthly.csv', 'w')
    writer = csv.writer(fo, lineterminator='\n')
    writer.writerow(fields)
    writer.writerows(monthly)
    # now do the weekly data - we have to merge with existing data as they
    # only give us the latest few weeks
    weeklyfp = 'data/weekly.csv'
    existing = []
    if os.path.exists(weeklyfp):
        existing = csv.reader(open(weeklyfp))
        existing.next()  # skip the header row
        existing = [x for x in existing]
    # combine existing and weekly
    # the weekly data should be newer, so the algorithm is: iterate through
    # existing and add to weekly until we hit an existing row whose date
    # equals the oldest new row, then break
    oldest_new_week = list(weekly[0])
    overlap = False
    for count, row in enumerate(existing):
        if row[0] == oldest_new_week[0]:
            overlap = True
            weekly = existing[:count] + weekly
            break
    # default weekly to everything in case we do not have an overlap
    if not overlap:
        weekly = existing + weekly
    fo = open(weeklyfp, 'w')
    writer = csv.writer(fo, lineterminator='\n')
    writer.writerow(fields)
    writer.writerows(weekly)
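# --- A hedged sketch, not from the original module: the module-level setup
# extract() above appears to assume. The dataconverters import and the
# `archive` path are assumptions; any module exposing a compatible parse()
# returning (row-iterator, metadata) would do.
import csv
import datetime
import os

from dataconverters import xls  # assumed source of the xls.parse used above

archive = 'cache/ici-flows.xls'  # hypothetical path to the cached download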
def extract(fp=cachepath):
    rows, metadata = xls.parse(open(fp))
    # convert from iterator to list and rows of dictionaries to rows of lists
    rows = [[row[f['id']] for f in metadata['fields']] for row in rows]
    # headings spread across rows 2-8
    header = [
        'Date', 'SP500', 'Dividend', 'Earnings', 'Consumer Price Index',
        'Long Interest Rate', 'Real Price', 'Real Dividend', 'Real Earnings',
        'PE10'
    ]
    # first row is the header, last row is footnotes
    rows = rows[1:-1]
    # usually there is one final footnote row, but some months the sheet is
    # malformed and we get extra rows followed by something random - e.g. in
    # April 2016 there is a stray number 20+ rows down
    rows = [r for r in rows if r[0] != '']
    transposed = zip(*rows)
    # delete the "date fraction" column
    del transposed[5]
    # there seem to be 6 random extra blank columns at the end
    del transposed[-6:]
    # fix dates
    transposed[0] = [_fixdates(val) for val in transposed[0]]
    for idx, row in enumerate(transposed[1:]):
        row = [_fixup(val) for val in row]
        transposed[idx + 1] = row
    data = zip(*transposed)
    fout = open(out_filepath, 'w')
    writer = csv.writer(fout, lineterminator='\n')
    writer.writerow(header)
    writer.writerows(data)
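# --- A hedged sketch, an assumption rather than the project's own helpers:
# plausible implementations of _fixdates and _fixup based on how they are
# used above. Shiller-style sheets store dates as year.month floats (1871.01,
# with 1871.1 meaning October 1871), and the value columns mix floats,
# strings and blanks.
def _fixdates(val):
    # format with 2 decimal places so 1871.1 reads as month 10, not month 1
    text = '%.2f' % float(val)
    year, month = text.split('.')
    return '%s-%s' % (year, month)

def _fixup(val):
    # pass blanks through, round floats to a sane precision, leave the rest
    if val == '':
        return ''
    if isinstance(val, float):
        return round(val, 2)
    return val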
def extract(fp=cachepath):
    rows, metadata = xls.parse(open(fp))
    # convert from iterator to list and rows of dictionaries to rows of lists
    rows = [[row[f['id']] for f in metadata['fields']] for row in rows]
    # headings spread across rows 2-8
    header = [
        'Date', 'SP500', 'Dividend', 'Earnings', 'Consumer Price Index',
        'Long Interest Rate', 'Real Price', 'Real Dividend', 'Real Earnings',
        'P/E10'
    ]
    # first row is the header, last row is footnotes
    data = rows[1:-1]
    transposed = zip(*data)
    # delete the "date fraction" column
    del transposed[5]
    # fix dates
    transposed[0] = [_fixdates(val) for val in transposed[0]]
    for idx, row in enumerate(transposed[1:]):
        row = [_fixup(val) for val in row]
        transposed[idx + 1] = row
    data = zip(*transposed)
    fout = open(out_filepath, 'w')
    writer = csv.writer(fout, lineterminator='\n')
    writer.writerow(header)
    writer.writerows(data)
def extract(fp=CACHE_PATH):
    rows, metadata = xls.parse(open(fp))
    # convert from iterator to list and rows of dictionaries to rows of lists
    rows = [[row[f['id']] for f in metadata['fields']] for row in rows]
    # headings spread across rows 2-8
    header = [
        'Date', 'SP500', 'Dividend', 'Earnings', 'Consumer Price Index',
        'Long Interest Rate', 'Real Price', 'Real Dividend', 'Real Earnings',
        'PE10'
    ]
    # first row is the header, last row is footnotes; keep only the data rows
    data = filter(is_valid_raw, rows)
    transposed = zip(*data)
    # remove the empty columns at the end
    del transposed[11:]
    # delete the "date fraction" column
    del transposed[5]
    # fix dates
    transposed[0] = [_fixdates(val) for val in transposed[0]]
    for idx, row in enumerate(transposed[1:]):
        row = [_fixup(val) for val in row]
        transposed[idx + 1] = row
    data = zip(*transposed)
    fout = open(OUT_FILEPATH, 'w')
    writer = csv.writer(fout, lineterminator='\n')
    writer.writerow(header)
    writer.writerows(data)
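# --- A hedged sketch, an assumption rather than the project's helper: one
# way is_valid_raw() could implement the blank-row filtering the earlier
# variant does inline - a row counts as data only when its date cell holds
# a number.
def is_valid_raw(row):
    # raw date cells come through as year.month floats (e.g. 1871.01)
    return row[0] != '' and isinstance(row[0], float)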
def extract(url):
    fo = urllib.urlopen(url)
    records, metadata = xls.parse(fo)

    def rerowify(dict_):
        return [dict_[f['id']] for f in metadata['fields']]

    rows = [rerowify(r) for r in records]
    # discard the heading/preamble rows
    del rows[:8]
    transposed = zip(*rows)
    # columns 0-2 hold the annual series
    annual = zip(*transposed[:3])
    annual = [[int(r[0])] + list(r[1:]) for r in annual if r[0]]
    # columns 4-6 hold the quarterly series
    quarterly = zip(*transposed[4:7])

    # quarterly dates look like 1947Q1 etc
    def fixquarters(date):
        mapping = [
            ['Q1', '-01-01'],
            ['Q2', '-04-01'],
            ['Q3', '-07-01'],
            ['Q4', '-10-01']
        ]
        for x in mapping:
            date = date.replace(x[0], x[1])
        return str(date)

    quarterly = [[fixquarters(r[0])] + list(r[1:]) for r in quarterly]
    return (annual, quarterly)
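# --- A usage sketch under assumptions: the URL and the column names below
# are illustrative placeholders, not from the source. extract() returns two
# tables ready for csv.writer.writerows().
if __name__ == '__main__':
    annual, quarterly = extract('http://example.com/series.xls')  # hypothetical URL
    fo = open('data/quarterly.csv', 'w')
    writer = csv.writer(fo, lineterminator='\n')
    writer.writerow(['Date', 'Value', 'Change'])  # hypothetical header
    writer.writerows(quarterly)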
def test_2_header_type(self):
    """Test guessing header type"""
    xlsfo = open(os.path.join(self.testdata_path, 'simple.xls'))
    iterator, metadata = xls.parse(xlsfo)
    assert_equal(
        [{'id': u'date', 'type': 'DateTime'},
         {'id': u'temperature', 'type': 'Integer'},
         {'id': u'place', 'type': 'String'}],
        metadata['fields'])
def test_1_convert_xls(self):
    """Test converting an XLS to JSON"""
    xlsfo = open(os.path.join(self.testdata_path, 'simple.xls'))
    iterator, metadata = xls.parse(xlsfo, guess_types=False)
    assert_equal([{"id": u"date"}, {"id": u"temperature"}, {"id": u"place"}],
                 metadata['fields'])
    content = [row for row in iterator]
    assert ({u'date': datetime.datetime(2011, 1, 1, 0, 0),
             u'place': u'Galway',
             u'temperature': 1.0} in content)
def test_3_convert_xlsx(self):
    """Test converting an XLSX to JSON"""
    xlsfo = open(os.path.join(self.testdata_path, 'simple.xlsx'))
    iterator, metadata = xls.parse(xlsfo, excel_type='xlsx')
    assert_equal([{'type': 'DateTime', 'id': u'date'},
                  {'id': u'temperature', 'type': 'Integer'},
                  {'id': u'place', 'type': 'String'}],
                 metadata['fields'])
    content = [row for row in iterator]
    assert ({u'date': datetime.datetime(2011, 1, 1, 0, 0),
             u'place': u'Galway',
             u'temperature': 1} in content)
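# --- A hedged sketch of the harness the three test methods above could sit
# in; the class name and testdata layout are assumptions, as is the
# nose-style assert_equal they rely on.
import datetime
import os
import unittest

from nose.tools import assert_equal

from dataconverters import xls  # assumed, matching the snippets above


class TestXlsParse(unittest.TestCase):  # hypothetical class name

    def setUp(self):
        # directory holding the simple.xls / simple.xlsx fixtures (assumed)
        self.testdata_path = os.path.join(os.path.dirname(__file__),
                                          'testdata')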
def parse(*args, **kwargs):
    # thin wrapper that forces the xlsx code path in xls.parse
    kwargs['excel_type'] = 'xlsx'
    return xls.parse(*args, **kwargs)
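# --- A usage sketch under assumptions (the filename is illustrative): the
# wrapper lets .xlsx files go through the same call shape as .xls ones.
fo = open('simple.xlsx')
iterator, metadata = parse(fo, guess_types=False)
for row in iterator:
    print row  # each row is a dict keyed by the field ids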