def parse(self, sheetname=0, header=0, skiprows=None, skip_footer=0,
          names=None, index_col=None, parse_cols=None, parse_dates=False,
          date_parser=None, na_values=None, thousands=None,
          convert_float=True, has_index_names=None, true_values=None,
          false_values=None, squeeze=False, **kwds):
    data = self.__get_sheet(sheetname)

    parser = TextParser(data, header=header, index_col=index_col,
                        has_index_names=has_index_names,
                        na_values=na_values, thousands=thousands,
                        parse_dates=parse_dates, date_parser=date_parser,
                        true_values=true_values, false_values=false_values,
                        skiprows=skiprows, skipfooter=skip_footer,
                        squeeze=squeeze, **kwds)
    return parser.read()
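# A hedged usage sketch (not from the source) for the parse() method above.
# It assumes the method lives on a pandas ExcelFile-style class; the workbook
# and sheet names are hypothetical placeholders.
xls = ExcelFile('workbook.xls')
frame = xls.parse('Sheet1', index_col=0, parse_dates=True)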
def parse_options_data(table):
    rows = table.findAll('tr')  # was `body.findAll`, a NameError
    header = _unpack(rows[0], kind='th')
    print(header)
    data = [_unpack(r) for r in rows[:]]
    # keep only rows that look like data rows, dropping the last two cells
    # (the original if/else branches were identical, so they are collapsed)
    L1 = []
    for i in data:
        if len(i) > 4:
            L1.append(i[0:-2])
    # strip surrounding whitespace from every cell
    L1 = [[cell.strip() for cell in row] for row in L1]
    res = TextParser(L1, names=header).get_chunk()
    res = res.applymap(f)  # `f` is a cleaning function assumed defined at module scope
    # not needed if downloading:
    # res.index = res[0]
    # res = res.reindex(columns=res.columns[1:])
    # res = res.T
    res.columns = ['name', 'ticker', 'supprice', 'reported_eps', 'estimat']
    return res
def _data_to_frame(data, header, index_col, skiprows, infer_types,
                   parse_dates, tupleize_cols, thousands):
    head, body, _ = data  # _ is footer which is rarely used: ignore for now

    if head:
        body = [head] + body

        if header is None:  # special case when a table has <th> elements
            header = 0

    # fill out elements of body that are "ragged"
    _expand_elements(body)

    tp = TextParser(body, header=header, index_col=index_col,
                    skiprows=_get_skiprows(skiprows),
                    parse_dates=parse_dates, tupleize_cols=tupleize_cols,
                    thousands=thousands)
    df = tp.read()

    if infer_types:  # TODO: rm this code so infer_types has no effect in 0.14
        df = df.convert_objects(convert_dates='coerce')
    else:
        df = df.applymap(text_type)
    return df
def _data_to_frame(data, header, index_col, skiprows, infer_types,
                   parse_dates, tupleize_cols, thousands):
    head, body, foot = data

    if head:
        body = [head] + body

        if header is None:  # special case when a table has <th> elements
            header = 0

    if foot:
        body += [foot]

    # fill out elements of body that are "ragged"
    _expand_elements(body)

    tp = TextParser(
        body,
        header=header,
        index_col=index_col,
        skiprows=_get_skiprows(skiprows),
        parse_dates=parse_dates,
        tupleize_cols=tupleize_cols,
        thousands=thousands,
    )
    df = tp.read()
    return df
def _data_to_frame(data, header, index_col, skiprows, parse_dates,
                   tupleize_cols, thousands):
    head, body, foot = data

    if head:
        body = [head] + body

        if header is None:  # special case when a table has <th> elements
            header = 0

    if foot:
        body += [foot]

    # fill out elements of body that are "ragged"
    _expand_elements(body)

    tp = TextParser(body, header=header, index_col=index_col,
                    skiprows=_get_skiprows(skiprows),
                    parse_dates=parse_dates, tupleize_cols=tupleize_cols,
                    thousands=thousands)
    df = tp.read()
    return df
def test_iterator(self):
    reader = read_csv(StringIO(self.data1), index_col=0, iterator=True)
    df = read_csv(StringIO(self.data1), index_col=0)

    chunk = reader.get_chunk(3)
    assert_frame_equal(chunk, df[:3])

    last_chunk = reader.get_chunk(5)
    assert_frame_equal(last_chunk, df[3:])

    # pass list
    lines = list(csv.reader(StringIO(self.data1)))
    parser = TextParser(lines, index_col=0, chunksize=2)

    df = read_csv(StringIO(self.data1), index_col=0)

    chunks = list(parser)
    assert_frame_equal(chunks[0], df[:2])
    assert_frame_equal(chunks[1], df[2:4])
    assert_frame_equal(chunks[2], df[4:])

    # pass skiprows
    parser = TextParser(lines, index_col=0, chunksize=2, skiprows=[1])
    chunks = list(parser)
    assert_frame_equal(chunks[0], df[1:3])

    # test bad parameter (skip_footer)
    reader = read_csv(StringIO(self.data1), index_col=0, iterator=True,
                      skip_footer=True)
    self.assertRaises(ValueError, reader.get_chunk, 3)

    treader = read_table(StringIO(self.data1), sep=',', index_col=0,
                         iterator=True)
    self.assert_(isinstance(treader, TextParser))
def _parse_excel(self, sheetname, header=0, skiprows=None, skip_footer=0,
                 index_col=None, has_index_names=None, parse_cols=None,
                 parse_dates=False, date_parser=None, na_values=None,
                 thousands=None, chunksize=None):
    from xlrd import (xldate_as_tuple, XL_CELL_DATE,
                      XL_CELL_ERROR, XL_CELL_BOOLEAN)

    datemode = self.book.datemode
    sheet = self.book.sheet_by_name(sheetname)

    data = []
    should_parse = {}
    for i in range(sheet.nrows):
        row = []
        for j, (value, typ) in enumerate(izip(sheet.row_values(i),
                                              sheet.row_types(i))):
            if parse_cols is not None and j not in should_parse:
                should_parse[j] = self._should_parse(j, parse_cols)

            if parse_cols is None or should_parse[j]:
                if typ == XL_CELL_DATE:
                    dt = xldate_as_tuple(value, datemode)
                    # how to produce this first case?
                    if dt[0] < datetime.MINYEAR:  # pragma: no cover
                        value = datetime.time(*dt[3:])
                    else:
                        value = datetime.datetime(*dt)
                elif typ == XL_CELL_ERROR:
                    value = np.nan
                elif typ == XL_CELL_BOOLEAN:
                    value = bool(value)
                row.append(value)

        data.append(row)

    if header is not None:
        data[header] = _trim_excel_header(data[header])

    parser = TextParser(data, header=header, index_col=index_col,
                        has_index_names=has_index_names,
                        na_values=na_values, thousands=thousands,
                        parse_dates=parse_dates, date_parser=date_parser,
                        skiprows=skiprows, skip_footer=skip_footer,
                        chunksize=chunksize)

    return parser.read()
def _data_to_frame(**kwargs):
    head, body, foot = kwargs.pop("data")
    header = kwargs.pop("header")
    kwargs["skiprows"] = _get_skiprows(kwargs["skiprows"])
    if head:
        body = head + body

        # Infer header when there is a <thead> or top <th>-only rows
        if header is None:
            if len(head) == 1:
                header = 0
            else:
                # ignore all-empty-text rows
                header = [i for i, row in enumerate(head)
                          if any(text for text in row)]

    if foot:
        body += foot

    # fill out elements of body that are "ragged"
    _expand_elements(body)
    tp = TextParser(body, header=header, **kwargs)
    df = tp.read()
    return df
def test_iterator(self):
    # See gh-6607
    reader = self.read_csv(StringIO(self.data1), index_col=0,
                           iterator=True)
    df = self.read_csv(StringIO(self.data1), index_col=0)

    chunk = reader.read(3)
    tm.assert_frame_equal(chunk, df[:3])

    last_chunk = reader.read(5)
    tm.assert_frame_equal(last_chunk, df[3:])

    # pass list
    lines = list(csv.reader(StringIO(self.data1)))
    parser = TextParser(lines, index_col=0, chunksize=2)

    df = self.read_csv(StringIO(self.data1), index_col=0)

    chunks = list(parser)
    tm.assert_frame_equal(chunks[0], df[:2])
    tm.assert_frame_equal(chunks[1], df[2:4])
    tm.assert_frame_equal(chunks[2], df[4:])

    # pass skiprows
    parser = TextParser(lines, index_col=0, chunksize=2, skiprows=[1])
    chunks = list(parser)
    tm.assert_frame_equal(chunks[0], df[1:3])

    treader = self.read_table(StringIO(self.data1), sep=',', index_col=0,
                              iterator=True)
    tm.assertIsInstance(treader, TextFileReader)

    # gh-3967: stopping iteration when chunksize is specified
    data = """A,B,C
foo,1,2,3
bar,4,5,6
baz,7,8,9
"""
    reader = self.read_csv(StringIO(data), iterator=True)
    result = list(reader)
    expected = DataFrame(dict(A=[1, 4, 7], B=[2, 5, 8], C=[3, 6, 9]),
                         index=['foo', 'bar', 'baz'])
    tm.assert_frame_equal(result[0], expected)

    # chunksize = 1
    reader = self.read_csv(StringIO(data), chunksize=1)
    result = list(reader)
    expected = DataFrame(dict(A=[1, 4, 7], B=[2, 5, 8], C=[3, 6, 9]),
                         index=['foo', 'bar', 'baz'])
    self.assertEqual(len(result), 3)
    tm.assert_frame_equal(pd.concat(result), expected)

    # skip_footer is not supported with the C parser yet
    if self.engine == 'python':
        # test bad parameter (skip_footer)
        reader = self.read_csv(StringIO(self.data1), index_col=0,
                               iterator=True, skip_footer=True)
        self.assertRaises(ValueError, reader.read, 3)
def test_read_text_list(self):
    data = """A,B,C\nfoo,1,2,3\nbar,4,5,6"""
    as_list = [["A", "B", "C"], ["foo", "1", "2", "3"],
               ["bar", "4", "5", "6"]]
    df = read_csv(StringIO(data), index_col=0)

    parser = TextParser(as_list, index_col=0, chunksize=2)
    chunk = parser.get_chunk(None)

    assert_frame_equal(chunk, df)
def test_read_text_list(self):
    data = """A,B,C\nfoo,1,2,3\nbar,4,5,6"""
    as_list = [['A', 'B', 'C'], ['foo', '1', '2', '3'],
               ['bar', '4', '5', '6']]
    df = read_csv(StringIO(data), index_col=0)

    parser = TextParser(as_list, index_col=0, chunksize=2)
    chunk = parser.get_chunk(None)

    assert_frame_equal(chunk, df)
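# The two tests above exercise the core pattern shared by most snippets in
# this collection: TextParser accepts rows that are already split into lists
# of strings and applies read_csv-style type inference. A minimal standalone
# sketch (the values are illustrative, not from the source):
from pandas.io.parsers import TextParser

rows = [['A', 'B'], ['1', '2'], ['3', '4']]  # header row plus two data rows
df = TextParser(rows).read()  # the first row is inferred as the header
print(df.dtypes)  # the string cells come back as inferred numeric columns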
def _parse_excel(self, sheetname, header=0, skiprows=None, skip_footer=0,
                 index_col=None, has_index_names=None, parse_cols=None,
                 parse_dates=False, date_parser=None, na_values=None,
                 thousands=None, chunksize=None, **kwds):
    from xlrd import (xldate_as_tuple, XL_CELL_DATE,
                      XL_CELL_ERROR, XL_CELL_BOOLEAN)

    datemode = self.book.datemode
    if isinstance(sheetname, compat.string_types):
        sheet = self.book.sheet_by_name(sheetname)
    else:  # assume an integer if not a string
        sheet = self.book.sheet_by_index(sheetname)

    data = []
    should_parse = {}
    for i in range(sheet.nrows):
        row = []
        for j, (value, typ) in enumerate(zip(sheet.row_values(i),
                                             sheet.row_types(i))):
            if parse_cols is not None and j not in should_parse:
                should_parse[j] = self._should_parse(j, parse_cols)

            if parse_cols is None or should_parse[j]:
                if typ == XL_CELL_DATE:
                    dt = xldate_as_tuple(value, datemode)
                    # how to produce this first case?
                    if dt[0] < datetime.MINYEAR:  # pragma: no cover
                        value = datetime.time(*dt[3:])
                    else:
                        value = datetime.datetime(*dt)
                elif typ == XL_CELL_ERROR:
                    value = np.nan
                elif typ == XL_CELL_BOOLEAN:
                    value = bool(value)
                row.append(value)

        data.append(row)

    if header is not None:
        data[header] = _trim_excel_header(data[header])

    parser = TextParser(data, header=header, index_col=index_col,
                        has_index_names=has_index_names,
                        na_values=na_values, thousands=thousands,
                        parse_dates=parse_dates, date_parser=date_parser,
                        skiprows=skiprows, skip_footer=skip_footer,
                        chunksize=chunksize, **kwds)

    return parser.read()
def _parse_options_data(table):
    rows = table.findAll('tr')
    header = _unpack(rows[0], kind='th')
    data = [_unpack(r) for r in rows[1:]]
    # Use ',' as a thousands separator as we're pulling from the US site.
    return TextParser(data, names=header, na_values=['N/A'],
                      thousands=',').get_chunk()
def _parse_html_table(table, offset=0):
    """
    Applies the _unpack function to an entire table. It will pull out the
    header row and then unpack all others as data.

    Parameters
    ----------
    table : lxml.html.HtmlElement
        The html table as represented by lxml.
    offset : int
        An integer specifying the offset for the headers in the table.
        This means, starting with 0, which row contains the headers. The
        function will then assume that all rows below are data rows.

    Returns
    -------
    df : pd.DataFrame
        A pandas DataFrame containing the information from the html table.
    """
    rows = table.findall('.//tr')
    header = _unpack(rows[0 + offset], kind='th')
    data = [_unpack(r) for r in rows[1 + offset:]]
    return TextParser(data, names=header, na_values=['N/A'],
                      thousands=',').get_chunk()
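# A hedged usage sketch for _parse_html_table above; it assumes an
# lxml-parsed page is already in hand (variable names are placeholders,
# not from the source):
from lxml import html

doc = html.fromstring(page_source)       # page_source: HTML fetched elsewhere
table = doc.findall('.//table')[0]       # pick the table of interest
df = _parse_html_table(table, offset=0)  # header sits in the first row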
def test_iterator(self):
    reader = read_csv(StringIO(self.data1), index_col=0, iterator=True)
    df = read_csv(StringIO(self.data1), index_col=0)

    chunk = reader.get_chunk(3)
    assert_frame_equal(chunk, df[:3])

    last_chunk = reader.get_chunk(5)
    assert_frame_equal(last_chunk, df[3:])

    # pass list
    lines = list(csv.reader(StringIO(self.data1)))
    parser = TextParser(lines, index_col=0, chunksize=2)

    df = read_csv(StringIO(self.data1), index_col=0)

    chunks = list(parser)
    assert_frame_equal(chunks[0], df[:2])
    assert_frame_equal(chunks[1], df[2:4])
    assert_frame_equal(chunks[2], df[4:])

    treader = read_table(StringIO(self.data1), sep=',', index_col=0,
                         iterator=True)
    self.assert_(isinstance(treader, TextParser))
def parse_options_data(self, table):
    rows = table.findall('.//tr')
    header = _unpack(rows[0], kind='th')
    data = [_unpack(r) for r in rows[1:]]
    print(len(data))
    print(type(data))
    return TextParser(data, names=header).get_chunk()
def extract_votes_data(tab):
    rows = tab.findall('.//tr')
    header = unpack(rows[0], 'th')
    # Each unpacked row is converted into an array to allow for explicit
    # indexing; the row is then converted back into a list.
    values = [list(np.array(unpack(row))[[0, 2, 3]]) for row in rows[1:]]
    # TextParser is an iterable object; the get_chunk method returns all
    # rows by default.
    return TextParser(values, names=['Borough'] + header[3:],
                      thousands=',').get_chunk()
def parse_options_data(table):
    rows = table.xpath('.//tr')  # find all tr tags under the table
    header = _unpack(rows[0], kind='td')  # use the first row's td cells as the header
    data = [_unpack(r) for r in rows[1:]]  # the remaining rows become the data
    return TextParser(data, names=header).get_chunk()
def parse_options_data(table):
    # get all rows
    rows = table.findall('.//tr')  # was `calls.findall`, a NameError
    # get the header row
    header = unpack(rows[0], kind='th')
    # get the data rows
    data = [unpack(r) for r in rows[1:]]
    return TextParser(data, names=header).get_chunk()
def _data_to_frame(**kwargs):
    head, body, foot = kwargs.pop('data')
    header = kwargs.pop('header')
    kwargs['skiprows'] = _get_skiprows(kwargs['skiprows'])
    if head:
        rows = lrange(len(head))
        body = head + body
        if header is None:  # special case when a table has <th> elements
            header = 0 if rows == [0] else rows

    if foot:
        body += [foot]

    # fill out elements of body that are "ragged"
    _expand_elements(body)
    tp = TextParser(body, header=header, **kwargs)
    df = tp.read()
    return df
def _parse_options_data(table):
    header = table.findall('thead/tr')
    header = _unpack(header[0], kind='th')
    rows = table.findall('tbody/tr')
    data = [_unpack(r) for r in rows]
    if len(data) > 0:
        return TextParser(data, names=header, thousands=',').get_chunk()
    else:  # Empty table
        return DataFrame(columns=header)
def _data_to_frame(data, header, index_col, skiprows, infer_types,
                   parse_dates, tupleize_cols, thousands):
    head, body, _ = data  # _ is footer which is rarely used: ignore for now

    if head:
        body = [head] + body

        if header is None:  # special case when a table has <th> elements
            header = 0

    # fill out elements of body that are "ragged"
    _expand_elements(body)

    tp = TextParser(body, header=header, index_col=index_col,
                    skiprows=_get_skiprows(skiprows),
                    parse_dates=parse_dates, tupleize_cols=tupleize_cols,
                    thousands=thousands)
    df = tp.read()
    return df
def parse_table(table):
    rows = table.xpath('.//tr')
    header = parse_row(rows[0])
    data = [parse_row(row) for row in rows[1:]]
    return TextParser(data, names=header).get_chunk()
def test_read_data_list(all_parsers):
    parser = all_parsers
    kwargs = {"index_col": 0}
    data = "A,B,C\nfoo,1,2,3\nbar,4,5,6"

    data_list = [["A", "B", "C"], ["foo", "1", "2", "3"],
                 ["bar", "4", "5", "6"]]
    expected = parser.read_csv(StringIO(data), **kwargs)

    with TextParser(data_list, chunksize=2, **kwargs) as parser:
        result = parser.read()

    tm.assert_frame_equal(result, expected)
def parse_options_data(table):
    '''
    Unpack an html element and turn it into a DataFrame.

    :param table: html element
    :return: DataFrame built from the table
    '''
    rows = table.findall('.//tr')
    # print(rows)
    header = _unpack(rows[0], kind='td')
    # print(header)
    data = [_unpack(r) for r in rows[1:]]
    # print(data)
    return TextParser(data, names=header).get_chunk()
def _data_to_frame(**kwargs):
    head, body, foot = kwargs.pop('data')
    header = kwargs.pop('header')
    kwargs['skiprows'] = _get_skiprows(kwargs['skiprows'])
    if head:
        body = head + body

        # Infer header when there is a <thead> or top <th>-only rows
        if header is None:
            if len(head) == 1:
                header = 0
            else:
                # ignore all-empty-text rows
                header = [i for i, row in enumerate(head)
                          if any(text for text in row)]

    if foot:
        body += foot

    # fill out elements of body that are "ragged"
    _expand_elements(body)
    tp = TextParser(body, header=header, **kwargs)
    df = tp.read()
    return df
def get_as_dataframe(worksheet, evaluate_formulas=False, **options):
    r"""
    Returns the worksheet contents as a DataFrame.

    :param worksheet: the worksheet.
    :param evaluate_formulas: if True, get the value of a cell after formula
        evaluation; otherwise get the formula itself if present. Defaults to
        False.
    :param \*\*options: all the options for pandas.io.parsers.TextParser,
        according to the version of pandas that is installed. (Note:
        TextParser supports only the 'python' parser engine.)
    :returns: pandas.DataFrame
    """
    all_values = _get_all_values(worksheet, evaluate_formulas)
    return TextParser(all_values, **options).read()
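# A hedged example of calling get_as_dataframe above; it assumes an already
# authorized gspread client, and the spreadsheet name is a placeholder:
import gspread

gc = gspread.service_account()         # credentials resolved externally
ws = gc.open('my-spreadsheet').sheet1  # hypothetical spreadsheet
df = get_as_dataframe(ws, evaluate_formulas=True, header=0)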
def parse_options_data(table):
    rows = table.findAll('tr')
    header = _unpack(rows[0], kind='th')
    data = [_unpack(r) for r in rows[1:]]
    # keep only rows that look like data rows
    L1 = []
    for i in data:
        if len(i) > 4:
            L1.append(i)
            # if len(i) == 6:
            #     L1.append(i[1:])
            # else:
            #     L1.append(i)
    # strip surrounding whitespace from every cell
    for i in range(0, len(L1)):
        for ii in range(0, len(L1[i])):
            L1[i][ii] = L1[i][ii].strip()
    res = TextParser(L1, names=header).get_chunk()
    res = res.applymap(f)  # `f` is a cleaning function assumed defined at module scope
    # not needed if downloading:
    # res.index = res[0]
    # res = res.reindex(columns=res.columns[1:])
    # res = res.T
    return res
def test_reader_list_skiprows(all_parsers):
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    parser = all_parsers
    kwargs = {"index_col": 0}

    lines = list(csv.reader(StringIO(data)))
    with TextParser(lines, chunksize=2, skiprows=[1], **kwargs) as reader:
        chunks = list(reader)

    expected = parser.read_csv(StringIO(data), **kwargs)
    tm.assert_frame_equal(chunks[0], expected[1:3])
def _data_to_frame(data, **kwargs) -> DataFrame:
    """
    Convert parsed data to a DataFrame.

    This method binds the keys and values of the parsed XML dictionaries
    into named columns of a DataFrame using the built-in TextParser class,
    which builds the DataFrame and infers specific dtypes.
    """
    tags = next(iter(data))
    nodes = [list(d.values()) for d in data]

    try:
        with TextParser(nodes, names=tags, **kwargs) as tp:
            return tp.read()
    except ParserError:
        raise ParserError("XML document may be too complex for import. "
                          "Try to flatten document and use distinct "
                          "element and attribute names.")
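# A sketch of the input shape _data_to_frame above expects: a list of dicts
# sharing one key set, as produced by pandas' internal XML handling (the
# sample records are illustrative, not from the source):
records = [
    {'shape': 'square', 'sides': '4'},
    {'shape': 'circle', 'sides': '0'},
]
frame = _data_to_frame(records)  # keys become columns, values become rows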
def _parse_excel(self, sheetname=0, header=0, skiprows=None,
                 skip_footer=0, index_col=None, has_index_names=None,
                 parse_cols=None, parse_dates=False, date_parser=None,
                 na_values=None, thousands=None, chunksize=None,
                 convert_float=True, verbose=False, **kwds):
    import xlrd
    from xlrd import (xldate, XL_CELL_DATE, XL_CELL_ERROR,
                      XL_CELL_BOOLEAN, XL_CELL_NUMBER)

    epoch1904 = self.book.datemode

    def _parse_cell(cell_contents, cell_typ):
        """converts the contents of the cell into a pandas
        appropriate object"""
        if cell_typ == XL_CELL_DATE:
            if xlrd_0_9_3:
                # Use the newer xlrd datetime handling.
                cell_contents = xldate.xldate_as_datetime(cell_contents,
                                                          epoch1904)

                # Excel doesn't distinguish between dates and time,
                # so we treat dates on the epoch as times only.
                # Also, Excel supports 1900 and 1904 epochs.
                year = (cell_contents.timetuple())[0:3]
                if ((not epoch1904 and year == (1899, 12, 31))
                        or (epoch1904 and year == (1904, 1, 1))):
                    cell_contents = datetime.time(cell_contents.hour,
                                                  cell_contents.minute,
                                                  cell_contents.second,
                                                  cell_contents.microsecond)
            else:
                # Use the xlrd <= 0.9.2 date handling.
                dt = xldate.xldate_as_tuple(cell_contents, epoch1904)

                if dt[0] < datetime.MINYEAR:
                    cell_contents = datetime.time(*dt[3:])
                else:
                    cell_contents = datetime.datetime(*dt)
        elif cell_typ == XL_CELL_ERROR:
            cell_contents = np.nan
        elif cell_typ == XL_CELL_BOOLEAN:
            cell_contents = bool(cell_contents)
        elif convert_float and cell_typ == XL_CELL_NUMBER:
            # GH5394 - Excel 'numbers' are always floats
            # it's a minimal perf hit and less surprising
            val = int(cell_contents)
            if val == cell_contents:
                cell_contents = val
        return cell_contents

    # xlrd >= 0.9.3 can return datetime objects directly.
    if LooseVersion(xlrd.__VERSION__) >= LooseVersion("0.9.3"):
        xlrd_0_9_3 = True
    else:
        xlrd_0_9_3 = False

    ret_dict = False

    # Keep sheetname to maintain backwards compatibility.
    if isinstance(sheetname, list):
        sheets = sheetname
        ret_dict = True
    elif sheetname is None:
        sheets = self.sheet_names
        ret_dict = True
    else:
        sheets = [sheetname]

    # handle same-type duplicates.
    sheets = list(set(sheets))

    output = {}

    for asheetname in sheets:
        if verbose:
            print("Reading sheet %s" % asheetname)

        if isinstance(asheetname, compat.string_types):
            sheet = self.book.sheet_by_name(asheetname)
        else:  # assume an integer if not a string
            sheet = self.book.sheet_by_index(asheetname)

        data = []
        should_parse = {}
        for i in range(sheet.nrows):
            row = []
            for j, (value, typ) in enumerate(zip(sheet.row_values(i),
                                                 sheet.row_types(i))):
                if parse_cols is not None and j not in should_parse:
                    should_parse[j] = self._should_parse(j, parse_cols)

                if parse_cols is None or should_parse[j]:
                    row.append(_parse_cell(value, typ))
            data.append(row)

        if header is not None:
            data[header] = _trim_excel_header(data[header])

        parser = TextParser(data, header=header, index_col=index_col,
                            has_index_names=has_index_names,
                            na_values=na_values, thousands=thousands,
                            parse_dates=parse_dates,
                            date_parser=date_parser, skiprows=skiprows,
                            skip_footer=skip_footer, chunksize=chunksize,
                            **kwds)

        output[asheetname] = parser.read()

    if ret_dict:
        return output
    else:
        return output[asheetname]
def _parse_excel(self, sheetname=0, header=0, skiprows=None,
                 skip_footer=0, index_col=None, has_index_names=None,
                 parse_cols=None, parse_dates=False, date_parser=None,
                 na_values=None, thousands=None, chunksize=None,
                 convert_float=True, **kwds):
    import xlrd
    from xlrd import (xldate, XL_CELL_DATE, XL_CELL_ERROR,
                      XL_CELL_BOOLEAN, XL_CELL_NUMBER)

    epoch1904 = self.book.datemode

    # xlrd >= 0.9.3 can return datetime objects directly.
    if LooseVersion(xlrd.__VERSION__) >= LooseVersion("0.9.3"):
        xlrd_0_9_3 = True
    else:
        xlrd_0_9_3 = False

    if isinstance(sheetname, compat.string_types):
        sheet = self.book.sheet_by_name(sheetname)
    else:  # assume an integer if not a string
        sheet = self.book.sheet_by_index(sheetname)

    data = []
    should_parse = {}
    for i in range(sheet.nrows):
        row = []
        for j, (value, typ) in enumerate(zip(sheet.row_values(i),
                                             sheet.row_types(i))):
            if parse_cols is not None and j not in should_parse:
                should_parse[j] = self._should_parse(j, parse_cols)

            if parse_cols is None or should_parse[j]:
                if typ == XL_CELL_DATE:
                    if xlrd_0_9_3:
                        # Use the newer xlrd datetime handling.
                        value = xldate.xldate_as_datetime(value, epoch1904)

                        # Excel doesn't distinguish between dates and time,
                        # so we treat dates on the epoch as times only.
                        # Also, Excel supports 1900 and 1904 epochs.
                        year = (value.timetuple())[0:3]
                        if ((not epoch1904 and year == (1899, 12, 31))
                                or (epoch1904 and year == (1904, 1, 1))):
                            value = datetime.time(value.hour, value.minute,
                                                  value.second,
                                                  value.microsecond)
                    else:
                        # Use the xlrd <= 0.9.2 date handling.
                        dt = xldate.xldate_as_tuple(value, epoch1904)
                        if dt[0] < datetime.MINYEAR:
                            value = datetime.time(*dt[3:])
                        else:
                            value = datetime.datetime(*dt)
                elif typ == XL_CELL_ERROR:
                    value = np.nan
                elif typ == XL_CELL_BOOLEAN:
                    value = bool(value)
                elif convert_float and typ == XL_CELL_NUMBER:
                    # GH5394 - Excel 'numbers' are always floats
                    # it's a minimal perf hit and less surprising
                    val = int(value)
                    if val == value:
                        value = val
                row.append(value)

        data.append(row)

    if header is not None:
        data[header] = _trim_excel_header(data[header])

    parser = TextParser(data, header=header, index_col=index_col,
                        has_index_names=has_index_names,
                        na_values=na_values, thousands=thousands,
                        parse_dates=parse_dates, date_parser=date_parser,
                        skiprows=skiprows, skip_footer=skip_footer,
                        chunksize=chunksize, **kwds)

    return parser.read()
def parse(
    self,
    sheet_name=0,
    header=0,
    names=None,
    index_col=None,
    usecols=None,
    squeeze=False,
    dtype: DtypeArg | None = None,
    true_values=None,
    false_values=None,
    skiprows=None,
    nrows=None,
    na_values=None,
    verbose=False,
    parse_dates=False,
    date_parser=None,
    thousands=None,
    comment=None,
    skipfooter=0,
    convert_float=None,
    mangle_dupe_cols=True,
    **kwds,
):
    if convert_float is None:
        convert_float = True
    else:
        stacklevel = find_stack_level()
        warnings.warn(
            "convert_float is deprecated and will be removed in a "
            "future version.",
            FutureWarning,
            stacklevel=stacklevel,
        )

    validate_header_arg(header)

    ret_dict = False

    # Keep sheetname to maintain backwards compatibility.
    if isinstance(sheet_name, list):
        sheets = sheet_name
        ret_dict = True
    elif sheet_name is None:
        sheets = self.sheet_names
        ret_dict = True
    else:
        sheets = [sheet_name]

    # handle same-type duplicates.
    sheets = list(dict.fromkeys(sheets).keys())

    output = {}

    for asheetname in sheets:
        if verbose:
            print(f"Reading sheet {asheetname}")

        if isinstance(asheetname, str):
            sheet = self.get_sheet_by_name(asheetname)
        else:  # assume an integer if not a string
            sheet = self.get_sheet_by_index(asheetname)

        data = self.get_sheet_data(sheet, convert_float)
        if hasattr(sheet, "close"):
            # pyxlsb opens two TemporaryFiles
            sheet.close()
        usecols = maybe_convert_usecols(usecols)

        if not data:
            output[asheetname] = DataFrame()
            continue

        if is_list_like(header) and len(header) == 1:
            header = header[0]

        # forward fill and pull out names for MultiIndex column
        header_names = None
        if header is not None and is_list_like(header):
            header_names = []
            control_row = [True] * len(data[0])

            for row in header:
                if is_integer(skiprows):
                    row += skiprows

                data[row], control_row = fill_mi_header(data[row],
                                                        control_row)

                if index_col is not None:
                    header_name, _ = pop_header_name(data[row], index_col)
                    header_names.append(header_name)

        # If there is a MultiIndex header and an index then there is also
        # a row containing just the index name(s)
        has_index_names = (is_list_like(header) and len(header) > 1
                           and index_col is not None)

        if is_list_like(index_col):
            # Forward fill values for MultiIndex index.
            if header is None:
                offset = 0
            elif not is_list_like(header):
                offset = 1 + header
            else:
                offset = 1 + max(header)

            # GH34673: if MultiIndex names present and not defined in the
            # header, offset needs to be incremented so that forward
            # filling starts from the first MI value instead of the name
            if has_index_names:
                offset += 1

            # Check if we have an empty dataset
            # before trying to collect data.
            if offset < len(data):
                for col in index_col:
                    last = data[offset][col]

                    for row in range(offset + 1, len(data)):
                        if data[row][col] == "" or data[row][col] is None:
                            data[row][col] = last
                        else:
                            last = data[row][col]

        # GH 12292 : error when read one empty column from excel file
        try:
            parser = TextParser(
                data,
                names=names,
                header=header,
                index_col=index_col,
                has_index_names=has_index_names,
                squeeze=squeeze,
                dtype=dtype,
                true_values=true_values,
                false_values=false_values,
                skiprows=skiprows,
                nrows=nrows,
                na_values=na_values,
                skip_blank_lines=False,  # GH 39808
                parse_dates=parse_dates,
                date_parser=date_parser,
                thousands=thousands,
                comment=comment,
                skipfooter=skipfooter,
                usecols=usecols,
                mangle_dupe_cols=mangle_dupe_cols,
                **kwds,
            )

            output[asheetname] = parser.read(nrows=nrows)

            if not squeeze or isinstance(output[asheetname], DataFrame):
                if header_names:
                    output[asheetname].columns = output[
                        asheetname].columns.set_names(header_names)

        except EmptyDataError:
            # No Data, return an empty DataFrame
            output[asheetname] = DataFrame()

    if ret_dict:
        return output
    else:
        return output[asheetname]
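# A hedged call sketch for the parse() method above, reached through the
# public pandas API (the workbook name is a hypothetical placeholder):
import pandas as pd

xl = pd.ExcelFile('report.xlsx')
df = xl.parse(sheet_name=0, header=0, index_col=0)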
def parse(self, sheet_name=0, header=0, names=None, index_col=None,
          usecols=None, squeeze=False, dtype=None, true_values=None,
          false_values=None, skiprows=None, nrows=None, na_values=None,
          verbose=False, parse_dates=False, date_parser=None,
          thousands=None, comment=None, skipfooter=0, convert_float=True,
          mangle_dupe_cols=True, **kwds):

    _validate_header_arg(header)

    ret_dict = False

    # Keep sheetname to maintain backwards compatibility.
    if isinstance(sheet_name, list):
        sheets = sheet_name
        ret_dict = True
    elif sheet_name is None:
        sheets = self.sheet_names
        ret_dict = True
    else:
        sheets = [sheet_name]

    # handle same-type duplicates.
    sheets = list(OrderedDict.fromkeys(sheets).keys())

    output = OrderedDict()

    for asheetname in sheets:
        if verbose:
            print("Reading sheet {sheet}".format(sheet=asheetname))

        if isinstance(asheetname, compat.string_types):
            sheet = self.get_sheet_by_name(asheetname)
        else:  # assume an integer if not a string
            sheet = self.get_sheet_by_index(asheetname)

        data = self.get_sheet_data(sheet, convert_float)
        usecols = _maybe_convert_usecols(usecols)

        if sheet.nrows == 0:
            output[asheetname] = DataFrame()
            continue

        if is_list_like(header) and len(header) == 1:
            header = header[0]

        # forward fill and pull out names for MultiIndex column
        header_names = None
        if header is not None and is_list_like(header):
            header_names = []
            control_row = [True] * len(data[0])

            for row in header:
                if is_integer(skiprows):
                    row += skiprows

                data[row], control_row = _fill_mi_header(data[row],
                                                         control_row)

                if index_col is not None:
                    header_name, _ = _pop_header_name(data[row], index_col)
                    header_names.append(header_name)

        if is_list_like(index_col):
            # Forward fill values for MultiIndex index.
            if not is_list_like(header):
                offset = 1 + header
            else:
                offset = 1 + max(header)

            # Check if we have an empty dataset
            # before trying to collect data.
            if offset < len(data):
                for col in index_col:
                    last = data[offset][col]

                    for row in range(offset + 1, len(data)):
                        if data[row][col] == '' or data[row][col] is None:
                            data[row][col] = last
                        else:
                            last = data[row][col]

        has_index_names = is_list_like(header) and len(header) > 1

        # GH 12292 : error when read one empty column from excel file
        try:
            parser = TextParser(data,
                                names=names,
                                header=header,
                                index_col=index_col,
                                has_index_names=has_index_names,
                                squeeze=squeeze,
                                dtype=dtype,
                                true_values=true_values,
                                false_values=false_values,
                                skiprows=skiprows,
                                nrows=nrows,
                                na_values=na_values,
                                parse_dates=parse_dates,
                                date_parser=date_parser,
                                thousands=thousands,
                                comment=comment,
                                skipfooter=skipfooter,
                                usecols=usecols,
                                mangle_dupe_cols=mangle_dupe_cols,
                                **kwds)

            output[asheetname] = parser.read(nrows=nrows)

            if not squeeze or isinstance(output[asheetname], DataFrame):
                if header_names:
                    output[asheetname].columns = output[
                        asheetname].columns.set_names(header_names)
                elif compat.PY2:
                    output[asheetname].columns = _maybe_convert_to_string(
                        output[asheetname].columns)

        except EmptyDataError:
            # No Data, return an empty DataFrame
            output[asheetname] = DataFrame()

    if ret_dict:
        return output
    else:
        return output[asheetname]
def parse_options_data(table):
    rows = table.findAll('tr')
    header = _unpack(rows[0], kind='th')
    data = [_unpack(r) for r in rows[2:]]  # a list of row strings
    return TextParser(data, names=header).get_chunk()
def parse(
    self,
    sheet_name=0,
    header=0,
    names=None,
    index_col=None,
    usecols=None,
    squeeze=False,
    dtype=None,
    true_values=None,
    false_values=None,
    skiprows=None,
    nrows=None,
    na_values=None,
    verbose=False,
    parse_dates=False,
    date_parser=None,
    thousands=None,
    comment=None,
    skipfooter=0,
    convert_float=True,
    mangle_dupe_cols=True,
    **kwds,
):
    validate_header_arg(header)

    ret_dict = False

    # Keep sheetname to maintain backwards compatibility.
    if isinstance(sheet_name, list):
        sheets = sheet_name
        ret_dict = True
    elif sheet_name is None:
        sheets = self.sheet_names
        ret_dict = True
    else:
        sheets = [sheet_name]

    # handle same-type duplicates.
    sheets = list(dict.fromkeys(sheets).keys())

    output = {}

    for asheetname in sheets:
        if verbose:
            print(f"Reading sheet {asheetname}")

        if isinstance(asheetname, str):
            sheet = self.get_sheet_by_name(asheetname)
        else:  # assume an integer if not a string
            sheet = self.get_sheet_by_index(asheetname)

        data = self.get_sheet_data(sheet, convert_float)
        usecols = maybe_convert_usecols(usecols)

        if not data:
            output[asheetname] = DataFrame()
            continue

        if is_list_like(header) and len(header) == 1:
            header = header[0]

        # forward fill and pull out names for MultiIndex column
        header_names = None
        if header is not None and is_list_like(header):
            header_names = []
            control_row = [True] * len(data[0])

            for row in header:
                if is_integer(skiprows):
                    row += skiprows

                data[row], control_row = fill_mi_header(data[row],
                                                        control_row)

                if index_col is not None:
                    header_name, _ = pop_header_name(data[row], index_col)
                    header_names.append(header_name)

        if is_list_like(index_col):
            # Forward fill values for MultiIndex index.
            if header is None:
                offset = 0
            elif not is_list_like(header):
                offset = 1 + header
            else:
                offset = 1 + max(header)

            # Check if we have an empty dataset
            # before trying to collect data.
            if offset < len(data):
                for col in index_col:
                    last = data[offset][col]

                    for row in range(offset + 1, len(data)):
                        if data[row][col] == "" or data[row][col] is None:
                            data[row][col] = last
                        else:
                            last = data[row][col]

        has_index_names = is_list_like(header) and len(header) > 1

        # GH 12292 : error when read one empty column from excel file
        try:
            parser = TextParser(
                data,
                names=names,
                header=header,
                index_col=index_col,
                has_index_names=has_index_names,
                squeeze=squeeze,
                dtype=dtype,
                true_values=true_values,
                false_values=false_values,
                skiprows=skiprows,
                nrows=nrows,
                na_values=na_values,
                parse_dates=parse_dates,
                date_parser=date_parser,
                thousands=thousands,
                comment=comment,
                skipfooter=skipfooter,
                usecols=usecols,
                mangle_dupe_cols=mangle_dupe_cols,
                **kwds,
            )

            output[asheetname] = parser.read(nrows=nrows)

            if not squeeze or isinstance(output[asheetname], DataFrame):
                if header_names:
                    output[asheetname].columns = output[
                        asheetname].columns.set_names(header_names)

        except EmptyDataError:
            # No Data, return an empty DataFrame
            output[asheetname] = DataFrame()

    if ret_dict:
        return output
    else:
        return output[asheetname]
def parse(fname, **kwargs):
    num_splits = kwargs.pop("num_splits", None)
    start = kwargs.pop("start", None)
    end = kwargs.pop("end", None)
    _skiprows = kwargs.pop("skiprows")
    excel_header = kwargs.get("_header")
    sheet_name = kwargs.get("sheet_name", 0)
    footer = b"</sheetData></worksheet>"

    # Default to pandas case, where we are not splitting or partitioning
    if start is None or end is None:
        return pandas.read_excel(fname, **kwargs)

    from zipfile import ZipFile
    from openpyxl import load_workbook
    from openpyxl.worksheet._reader import WorksheetReader
    from openpyxl.reader.excel import ExcelReader
    from openpyxl.worksheet.worksheet import Worksheet
    from pandas.core.dtypes.common import is_list_like
    from pandas.io.excel._util import (
        _fill_mi_header,
        _maybe_convert_usecols,
    )
    from pandas.io.parsers import TextParser
    import re

    wb = load_workbook(filename=fname, read_only=True)
    # Get shared strings
    ex = ExcelReader(fname, read_only=True)
    ex.read_manifest()
    ex.read_strings()
    # Convert the default sheet index 0 to its string name
    if sheet_name == 0:
        sheet_name = wb.sheetnames[sheet_name]
    # get the worksheet to use with the worksheet reader
    ws = Worksheet(wb)
    # Read the raw data
    with ZipFile(fname) as z:
        with z.open(
            "xl/worksheets/{}.xml".format(sheet_name.lower())
        ) as file:
            file.seek(start)
            bytes_data = file.read(end - start)

    def update_row_nums(match):
        """Update the row numbers to start at 1.

        Note: This is needed because the parser we are using does not
        scale well if the row numbers remain because empty rows are
        inserted for all "missing" rows.

        Parameters
        ----------
        match
            The match from the origin `re.sub` looking for row number tags.

        Returns
        -------
        string
            The updated string with new row numbers.
        """
        b = match.group(0)
        return re.sub(
            b"\d+",  # noqa: W605
            lambda c: str(int(c.group(0).decode("utf-8")) - _skiprows)
            .encode("utf-8"),
            b,
        )

    bytes_data = re.sub(
        b'r="[A-Z]*\d+"', update_row_nums, bytes_data  # noqa: W605
    )
    bytesio = BytesIO(excel_header + bytes_data + footer)
    # Use openpyxl to read/parse sheet data
    reader = WorksheetReader(ws, bytesio, ex.shared_strings, False)
    # Attach cells to worksheet object
    reader.bind_cells()
    data = PandasExcelParser.get_sheet_data(
        ws, kwargs.pop("convert_float", True)
    )
    usecols = _maybe_convert_usecols(kwargs.pop("usecols", None))
    header = kwargs.pop("header", 0)
    index_col = kwargs.pop("index_col", None)
    # skiprows is handled externally
    skiprows = None

    # Handle header and create MultiIndex for columns if necessary
    if is_list_like(header) and len(header) == 1:
        header = header[0]
    if header is not None and is_list_like(header):
        control_row = [True] * len(data[0])

        for row in header:
            data[row], control_row = _fill_mi_header(data[row], control_row)
    # Handle MultiIndex for row Index if necessary
    if is_list_like(index_col):
        # Forward fill values for MultiIndex index.
        if not is_list_like(header):
            offset = 1 + header
        else:
            offset = 1 + max(header)

        # Check if dataset is empty
        if offset < len(data):
            for col in index_col:
                last = data[offset][col]
                for row in range(offset + 1, len(data)):
                    if data[row][col] == "" or data[row][col] is None:
                        data[row][col] = last
                    else:
                        last = data[row][col]

    parser = TextParser(
        data,
        header=header,
        index_col=index_col,
        has_index_names=is_list_like(header) and len(header) > 1,
        skiprows=skiprows,
        usecols=usecols,
        **kwargs,
    )
    # In excel if you create a row with only a border (no values), this
    # parser will interpret that as a row of NaN values. Pandas discards
    # these values, so we also must discard these values.
    pandas_df = parser.read().dropna(how="all")
    # Since we know the number of rows that occur before this partition, we
    # can correctly assign the index in cases of RangeIndex. If it is not a
    # RangeIndex, the index is already correct because it came from the data.
    if isinstance(pandas_df.index, pandas.RangeIndex):
        pandas_df.index = pandas.RangeIndex(
            start=_skiprows, stop=len(pandas_df.index) + _skiprows
        )
    # We return the length if it is a RangeIndex (common case) to reduce
    # serialization cost.
    if index_col is not None:
        index = pandas_df.index
    else:
        # The lengths will become the RangeIndex
        index = len(pandas_df)
    return _split_result_for_readers(1, num_splits, pandas_df) + [
        index,
        pandas_df.dtypes,
    ]
def _parse_options_data(table):
    rows = table.findall('.//tr')
    header = _unpack(rows[0], kind='th')
    data = [_unpack(r) for r in rows[1:]]
    return TextParser(data, names=header, na_values=['N/A'],
                      thousands=',').get_chunk()
url = 'https://gist.github.com/paulmillr/2657075'
parsed = parse(urlopen(url))
doc = parsed.getroot()
tables = doc.findall('.//table')
rows = tables[0].findall('.//tr')


def _unpack(row, kind='td'):
    elts = row.findall('.//%s' % kind)
    return [val.text_content() for val in elts]


header = _unpack(rows[0], kind='th')
data = [_unpack(r) for r in rows[1:]]
data_ = TextParser(data, names=header[1:]).get_chunk()
clean_data = pd.DataFrame(data_, columns=['User', 'Contribs', 'Location'])
clean_data.to_csv('256_Top_Contributors.csv')
print('First part: List of 256 Top contributors saved to '
      '256_Top_Contributors.csv')

###############
# Second part #
###############
token = '4efc542dfb9e2fb1756529dfe242b46d92455362'
API = 'https://api.github.com'
myHeaders = {'Authorization': 'token 4efc542dfb9e2fb1756529dfe242b46d92455362'}
listUsers = []
for i in range(clean_data['User'].size):