Example #1
    def parse(self,
              sheetname=0,
              header=0,
              skiprows=None,
              skip_footer=0,
              names=None,
              index_col=None,
              parse_cols=None,
              parse_dates=False,
              date_parser=None,
              na_values=None,
              thousands=None,
              convert_float=True,
              has_index_names=None,
              true_values=None,
              false_values=None,
              squeeze=False,
              **kwds):

        data = self.__get_sheet(sheetname)
        parser = TextParser(data,
                            header=header,
                            index_col=index_col,
                            has_index_names=has_index_names,
                            na_values=na_values,
                            thousands=thousands,
                            parse_dates=parse_dates,
                            date_parser=date_parser,
                            true_values=true_values,
                            false_values=false_values,
                            skiprows=skiprows,
                            skipfooter=skip_footer,
                            squeeze=squeeze,
                            **kwds)
        return parser.read()
Example #2
def parse_options_data(table):
    rows = table.findAll('tr')
    header = _unpack(rows[0], kind='th')
    print(header)
    data = [_unpack(r) for r in rows[:]]
    L1 = []
    for i in data:
        # keep only rows with more than 4 cells, dropping the last two columns
        if len(i) > 4:
            L1.append(i[0:-2])
    # strip whitespace from every cell
    for i in range(len(L1)):
        for ii in range(len(L1[i])):
            L1[i][ii] = L1[i][ii].strip()
    res = TextParser(L1, names=header).get_chunk()
    res = res.applymap(f)  # f: cell-cleaning function defined elsewhere
    res.columns = ['name', 'ticker', 'supprice', 'reported_eps', 'estimat']
    return res
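For quick reference, the pattern the scrapers above rely on can be reduced to a minimal, self-contained sketch: TextParser accepts a list of row lists and type-infers the columns. This assumes a pandas version where TextParser is importable from pandas.io.parsers, as in the snippets on this page; the rows are made up:

from pandas.io.parsers import TextParser

rows = [['name', 'ticker', 'eps'],
        ['Apple', 'AAPL', '6.05'],
        ['Microsoft', 'MSFT', '9.65']]
# The first row supplies the column names; the remaining rows are
# type-inferred, so the 'eps' strings come back as floats.
df = TextParser(rows[1:], names=rows[0]).get_chunk()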
Example #3
def _data_to_frame(data, header, index_col, skiprows, infer_types, parse_dates,
                   tupleize_cols, thousands):
    head, body, _ = data  # _ is footer which is rarely used: ignore for now

    if head:
        body = [head] + body

        if header is None:  # special case when a table has <th> elements
            header = 0

    # fill out elements of body that are "ragged"
    _expand_elements(body)

    tp = TextParser(body,
                    header=header,
                    index_col=index_col,
                    skiprows=_get_skiprows(skiprows),
                    parse_dates=parse_dates,
                    tupleize_cols=tupleize_cols,
                    thousands=thousands)
    df = tp.read()

    if infer_types:  # TODO: rm this code so infer_types has no effect in 0.14
        df = df.convert_objects(convert_dates='coerce')
    else:
        df = df.applymap(text_type)
    return df
Example #4
def _data_to_frame(data, header, index_col, skiprows, infer_types, parse_dates, tupleize_cols, thousands):
    head, body, foot = data

    if head:
        body = [head] + body

        if header is None:  # special case when a table has <th> elements
            header = 0

    if foot:
        body += [foot]

    # fill out elements of body that are "ragged"
    _expand_elements(body)

    tp = TextParser(
        body,
        header=header,
        index_col=index_col,
        skiprows=_get_skiprows(skiprows),
        parse_dates=parse_dates,
        tupleize_cols=tupleize_cols,
        thousands=thousands,
    )
    df = tp.read()
    return df
Example #5
def _data_to_frame(data, header, index_col, skiprows, parse_dates,
                   tupleize_cols, thousands):
    head, body, foot = data

    if head:
        body = [head] + body

        if header is None:  # special case when a table has <th> elements
            header = 0

    if foot:
        body += [foot]

    # fill out elements of body that are "ragged"
    _expand_elements(body)

    tp = TextParser(body,
                    header=header,
                    index_col=index_col,
                    skiprows=_get_skiprows(skiprows),
                    parse_dates=parse_dates,
                    tupleize_cols=tupleize_cols,
                    thousands=thousands)
    df = tp.read()
    return df
Example #6
    def test_iterator(self):
        reader = read_csv(StringIO(self.data1), index_col=0, iterator=True)
        df = read_csv(StringIO(self.data1), index_col=0)

        chunk = reader.get_chunk(3)
        assert_frame_equal(chunk, df[:3])

        last_chunk = reader.get_chunk(5)
        assert_frame_equal(last_chunk, df[3:])

        # pass list
        lines = list(csv.reader(StringIO(self.data1)))
        parser = TextParser(lines, index_col=0, chunksize=2)

        df = read_csv(StringIO(self.data1), index_col=0)

        chunks = list(parser)
        assert_frame_equal(chunks[0], df[:2])
        assert_frame_equal(chunks[1], df[2:4])
        assert_frame_equal(chunks[2], df[4:])

        # pass skiprows
        parser = TextParser(lines, index_col=0, chunksize=2, skiprows=[1])
        chunks = list(parser)
        assert_frame_equal(chunks[0], df[1:3])

        # test bad parameter (skip_footer)
        reader = read_csv(StringIO(self.data1), index_col=0, iterator=True,
                          skip_footer=True)
        self.assertRaises(ValueError, reader.get_chunk, 3)

        treader = read_table(StringIO(self.data1), sep=',', index_col=0,
                             iterator=True)
        self.assert_(isinstance(treader, TextParser))
Example #7
    def _parse_excel(self,
                     sheetname,
                     header=0,
                     skiprows=None,
                     skip_footer=0,
                     index_col=None,
                     has_index_names=None,
                     parse_cols=None,
                     parse_dates=False,
                     date_parser=None,
                     na_values=None,
                     thousands=None,
                     chunksize=None):
        from xlrd import (xldate_as_tuple, XL_CELL_DATE, XL_CELL_ERROR,
                          XL_CELL_BOOLEAN)

        datemode = self.book.datemode
        sheet = self.book.sheet_by_name(sheetname)

        data = []
        should_parse = {}
        for i in range(sheet.nrows):
            row = []
            for j, (value, typ) in enumerate(
                    izip(sheet.row_values(i), sheet.row_types(i))):
                if parse_cols is not None and j not in should_parse:
                    should_parse[j] = self._should_parse(j, parse_cols)

                if parse_cols is None or should_parse[j]:
                    if typ == XL_CELL_DATE:
                        dt = xldate_as_tuple(value, datemode)
                        # how to produce this first case?
                        if dt[0] < datetime.MINYEAR:  # pragma: no cover
                            value = datetime.time(*dt[3:])
                        else:
                            value = datetime.datetime(*dt)
                    elif typ == XL_CELL_ERROR:
                        value = np.nan
                    elif typ == XL_CELL_BOOLEAN:
                        value = bool(value)
                    row.append(value)

            data.append(row)

        if header is not None:
            data[header] = _trim_excel_header(data[header])

        parser = TextParser(data,
                            header=header,
                            index_col=index_col,
                            has_index_names=has_index_names,
                            na_values=na_values,
                            thousands=thousands,
                            parse_dates=parse_dates,
                            date_parser=date_parser,
                            skiprows=skiprows,
                            skip_footer=skip_footer,
                            chunksize=chunksize)

        return parser.read()
Example #8
def _data_to_frame(**kwargs):
    head, body, foot = kwargs.pop("data")
    header = kwargs.pop("header")
    kwargs["skiprows"] = _get_skiprows(kwargs["skiprows"])
    if head:
        body = head + body

        # Infer header when there is a <thead> or top <th>-only rows
        if header is None:
            if len(head) == 1:
                header = 0
            else:
                # ignore all-empty-text rows
                header = [
                    i for i, row in enumerate(head)
                    if any(text for text in row)
                ]

    if foot:
        body += foot

    # fill out elements of body that are "ragged"
    _expand_elements(body)
    tp = TextParser(body, header=header, **kwargs)
    df = tp.read()
    return df
Example #9
    def test_iterator(self):
        # See gh-6607
        reader = self.read_csv(StringIO(self.data1), index_col=0,
                               iterator=True)
        df = self.read_csv(StringIO(self.data1), index_col=0)

        chunk = reader.read(3)
        tm.assert_frame_equal(chunk, df[:3])

        last_chunk = reader.read(5)
        tm.assert_frame_equal(last_chunk, df[3:])

        # pass list
        lines = list(csv.reader(StringIO(self.data1)))
        parser = TextParser(lines, index_col=0, chunksize=2)

        df = self.read_csv(StringIO(self.data1), index_col=0)

        chunks = list(parser)
        tm.assert_frame_equal(chunks[0], df[:2])
        tm.assert_frame_equal(chunks[1], df[2:4])
        tm.assert_frame_equal(chunks[2], df[4:])

        # pass skiprows
        parser = TextParser(lines, index_col=0, chunksize=2, skiprows=[1])
        chunks = list(parser)
        tm.assert_frame_equal(chunks[0], df[1:3])

        treader = self.read_table(StringIO(self.data1), sep=',', index_col=0,
                                  iterator=True)
        tm.assertIsInstance(treader, TextFileReader)

        # gh-3967: stopping iteration when chunksize is specified
        data = """A,B,C
foo,1,2,3
bar,4,5,6
baz,7,8,9
"""
        reader = self.read_csv(StringIO(data), iterator=True)
        result = list(reader)
        expected = DataFrame(dict(A=[1, 4, 7], B=[2, 5, 8], C=[
            3, 6, 9]), index=['foo', 'bar', 'baz'])
        tm.assert_frame_equal(result[0], expected)

        # chunksize = 1
        reader = self.read_csv(StringIO(data), chunksize=1)
        result = list(reader)
        expected = DataFrame(dict(A=[1, 4, 7], B=[2, 5, 8], C=[
            3, 6, 9]), index=['foo', 'bar', 'baz'])
        self.assertEqual(len(result), 3)
        tm.assert_frame_equal(pd.concat(result), expected)

        # skip_footer is not supported with the C parser yet
        if self.engine == 'python':
            # test bad parameter (skip_footer)
            reader = self.read_csv(StringIO(self.data1), index_col=0,
                                   iterator=True, skip_footer=True)
            self.assertRaises(ValueError, reader.read, 3)
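The chunked-iteration protocol these tests exercise fits in a few lines; a minimal sketch with made-up data, assuming TextParser is importable from pandas.io.parsers:

import csv
from io import StringIO
from pandas.io.parsers import TextParser

lines = list(csv.reader(StringIO("A,B\n1,2\n3,4\n5,6\n")))
# One header row plus three data rows; chunksize=2 yields a 2-row
# DataFrame followed by a 1-row DataFrame.
for chunk in TextParser(lines, chunksize=2):
    print(chunk.shape)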
Example #10
    def test_read_text_list(self):
        data = """A,B,C\nfoo,1,2,3\nbar,4,5,6"""
        as_list = [["A", "B", "C"], ["foo", "1", "2", "3"], ["bar", "4", "5", "6"]]
        df = read_csv(StringIO(data), index_col=0)

        parser = TextParser(as_list, index_col=0, chunksize=2)
        chunk = parser.get_chunk(None)

        assert_frame_equal(chunk, df)
Example #11
    def test_read_text_list(self):
        data = """A,B,C\nfoo,1,2,3\nbar,4,5,6"""
        as_list = [['A','B','C'],['foo','1','2','3'],['bar','4','5','6']]
        df = read_csv(StringIO(data), index_col=0)

        parser = TextParser(as_list, index_col=0, chunksize=2)
        chunk  = parser.get_chunk(None)

        assert_frame_equal(chunk, df)
Example #13
    def _parse_excel(self, sheetname, header=0, skiprows=None, skip_footer=0,
                     index_col=None, has_index_names=None, parse_cols=None,
                     parse_dates=False, date_parser=None, na_values=None,
                     thousands=None, chunksize=None, **kwds):
        from xlrd import (xldate_as_tuple, XL_CELL_DATE,
                          XL_CELL_ERROR, XL_CELL_BOOLEAN)

        datemode = self.book.datemode
        if isinstance(sheetname, compat.string_types):
            sheet = self.book.sheet_by_name(sheetname)
        else:  # assume an integer if not a string
            sheet = self.book.sheet_by_index(sheetname)

        data = []
        should_parse = {}
        for i in range(sheet.nrows):
            row = []
            for j, (value, typ) in enumerate(zip(sheet.row_values(i),
                                                 sheet.row_types(i))):
                if parse_cols is not None and j not in should_parse:
                    should_parse[j] = self._should_parse(j, parse_cols)

                if parse_cols is None or should_parse[j]:
                    if typ == XL_CELL_DATE:
                        dt = xldate_as_tuple(value, datemode)
                        # how to produce this first case?
                        if dt[0] < datetime.MINYEAR:  # pragma: no cover
                            value = datetime.time(*dt[3:])
                        else:
                            value = datetime.datetime(*dt)
                    elif typ == XL_CELL_ERROR:
                        value = np.nan
                    elif typ == XL_CELL_BOOLEAN:
                        value = bool(value)
                    row.append(value)

            data.append(row)

        if header is not None:
            data[header] = _trim_excel_header(data[header])

        parser = TextParser(data, header=header, index_col=index_col,
                            has_index_names=has_index_names,
                            na_values=na_values,
                            thousands=thousands,
                            parse_dates=parse_dates,
                            date_parser=date_parser,
                            skiprows=skiprows,
                            skip_footer=skip_footer,
                            chunksize=chunksize,
                            **kwds)

        return parser.read()
Example #14
def _parse_options_data(table):
    rows = table.findAll('tr')
    header = _unpack(rows[0], kind='th')
    data = [_unpack(r) for r in rows[1:]]
    # Use ',' as a thousands separator as we're pulling from the US site.
    return TextParser(data, names=header, na_values=['N/A'],
                      thousands=',').get_chunk()
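The effect of the thousands and na_values options is easy to see in isolation; a small sketch with invented rows:

from pandas.io.parsers import TextParser

rows = [['1,234', 'N/A'], ['56,789', '0.5']]
df = TextParser(rows, names=['volume', 'ratio'],
                na_values=['N/A'], thousands=',').get_chunk()
# 'volume' parses to the integers 1234 and 56789; 'N/A' becomes NaN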
Example #15
def _parse_html_table(table, offset=0):
    """
    Applies the _unpack function to an entire table. It will pull
    out the header row and then unpack all others as data

    Parameters
    ----------
    table : lxml.html.HtmlElement
        The html table as represented by lxml

    offset : int
        An integer specifying the offset for the headers in the table.
        This means, starting with 0, which row contains the headers.
        The function will then assume that all rows below are data rows.

    Returns
    -------
    df : pd.DataFrame
        A pandas DataFrame containing the information from the html
        table
    """
    rows = table.findall('.//tr')
    header = _unpack(rows[0 + offset], kind='th')
    data = [_unpack(r) for r in rows[1 + offset:]]
    return TextParser(data, names=header, na_values=['N/A'],
                      thousands=',').get_chunk()
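A hypothetical call, assuming page_source holds raw HTML whose first table keeps its header in the second row:

from lxml import html

doc = html.fromstring(page_source)  # page_source: raw HTML, assumed in scope
table = doc.findall('.//table')[0]
df = _parse_html_table(table, offset=1)  # row 1 is the header; rows below are data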
Example #16
    def test_iterator(self):
        reader = read_csv(StringIO(self.data1), index_col=0, iterator=True)

        df = read_csv(StringIO(self.data1), index_col=0)

        chunk = reader.get_chunk(3)
        assert_frame_equal(chunk, df[:3])

        last_chunk = reader.get_chunk(5)
        assert_frame_equal(last_chunk, df[3:])

        # pass list
        lines = list(csv.reader(StringIO(self.data1)))
        parser = TextParser(lines, index_col=0, chunksize=2)

        df = read_csv(StringIO(self.data1), index_col=0)

        chunks = list(parser)
        assert_frame_equal(chunks[0], df[:2])
        assert_frame_equal(chunks[1], df[2:4])
        assert_frame_equal(chunks[2], df[4:])

        treader = read_table(StringIO(self.data1),
                             sep=',',
                             index_col=0,
                             iterator=True)
        self.assert_(isinstance(treader, TextParser))
Example #17
    def parse_options_data(self, table):
        rows = table.findall('.//tr')
        header = _unpack(rows[0], kind='th')
        data = [_unpack(r) for r in rows[1:]]
        print(len(data))
        print(type(data))
        return TextParser(data, names=header).get_chunk()
Example #18
def extract_votes_data(tab):
    rows = tab.findall('.//tr')
    header = unpack(rows[0], 'th')
    # Each unpacked row is converted to a numpy array to allow explicit
    # indexing, then converted back into a list.
    values = [list(np.array(unpack(row))[[0, 2, 3]]) for row in rows[1:]]
    # TextParser is an iterable object; get_chunk() returns all rows by default.
    return TextParser(values, names=['Borough'] + header[3:],
                      thousands=',').get_chunk()
Example #19
def parse_options_data(table):
    rows = table.xpath('.//tr')
    # find the tr tags, using the table as the search root
    header = _unpack(rows[0], kind='td')
    # take the first row as the header
    data = [_unpack(r) for r in rows[1:]]
    # the remaining rows are the data
    return TextParser(data, names=header).get_chunk()
Example #20
def parse_options_data(table):
    # get all rows
    rows = table.findall('.//tr')
    # get the header row
    header = unpack(rows[0], kind='th')
    # get the data rows
    data = [unpack(r) for r in rows[1:]]
    return TextParser(data, names=header).get_chunk()
Example #21
def _data_to_frame(**kwargs):
    head, body, foot = kwargs.pop('data')
    header = kwargs.pop('header')
    kwargs['skiprows'] = _get_skiprows(kwargs['skiprows'])
    if head:
        rows = lrange(len(head))
        body = head + body
        if header is None:  # special case when a table has <th> elements
            header = 0 if rows == [0] else rows

    if foot:
        body += [foot]

    # fill out elements of body that are "ragged"
    _expand_elements(body)
    tp = TextParser(body, header=header, **kwargs)
    df = tp.read()
    return df
Example #23
def _parse_options_data(table):
    header = table.findall('thead/tr')
    header = _unpack(header[0], kind='th')
    rows = table.findall('tbody/tr')
    data = [_unpack(r) for r in rows]
    if len(data) > 0:
        return TextParser(data, names=header, thousands=',').get_chunk()
    else:  # Empty table
        return DataFrame(columns=header)
Example #24
def _data_to_frame(data, header, index_col, skiprows, infer_types,
                   parse_dates, tupleize_cols, thousands):
    head, body, _ = data  # _ is footer which is rarely used: ignore for now

    if head:
        body = [head] + body

        if header is None:  # special case when a table has <th> elements
            header = 0

    # fill out elements of body that are "ragged"
    _expand_elements(body)

    tp = TextParser(body, header=header, index_col=index_col,
                    skiprows=_get_skiprows(skiprows),
                    parse_dates=parse_dates, tupleize_cols=tupleize_cols,
                    thousands=thousands)
    df = tp.read()
    return df
Example #25
def parse_table(table):
    rows = table.xpath('.//tr')
    header = parse_row(rows[0])
    data = [parse_row(row) for row in rows[1:]]
    return TextParser(data, names=header).get_chunk()
Example #26
def test_read_data_list(all_parsers):
    parser = all_parsers
    kwargs = {"index_col": 0}
    data = "A,B,C\nfoo,1,2,3\nbar,4,5,6"

    data_list = [["A", "B", "C"], ["foo", "1", "2", "3"], ["bar", "4", "5", "6"]]
    expected = parser.read_csv(StringIO(data), **kwargs)

    with TextParser(data_list, chunksize=2, **kwargs) as parser:
        result = parser.read()

    tm.assert_frame_equal(result, expected)
Example #27
def parse_options_data(table):
    '''
    Unpack an html element and turn it into a dataframe
    :param table: html element
    :return: dataframe from the table
    '''
    rows = table.findall('.//tr')
    header = _unpack(rows[0], kind='td')
    data = [_unpack(r) for r in rows[1:]]
    return TextParser(data, names=header).get_chunk()
Example #28
def _data_to_frame(data, header, index_col, skiprows, infer_types, parse_dates,
                   tupleize_cols, thousands):
    head, body, _ = data  # _ is footer which is rarely used: ignore for now

    if head:
        body = [head] + body

        if header is None:  # special case when a table has <th> elements
            header = 0

    # fill out elements of body that are "ragged"
    _expand_elements(body)

    tp = TextParser(body,
                    header=header,
                    index_col=index_col,
                    skiprows=_get_skiprows(skiprows),
                    parse_dates=parse_dates,
                    tupleize_cols=tupleize_cols,
                    thousands=thousands)
    df = tp.read()
    return df
Example #29
def _data_to_frame(**kwargs):
    head, body, foot = kwargs.pop('data')
    header = kwargs.pop('header')
    kwargs['skiprows'] = _get_skiprows(kwargs['skiprows'])
    if head:
        body = head + body

        # Infer header when there is a <thead> or top <th>-only rows
        if header is None:
            if len(head) == 1:
                header = 0
            else:
                # ignore all-empty-text rows
                header = [i for i, row in enumerate(head)
                          if any(text for text in row)]

    if foot:
        body += foot

    # fill out elements of body that are "ragged"
    _expand_elements(body)
    tp = TextParser(body, header=header, **kwargs)
    df = tp.read()
    return df
Example #30
def _data_to_frame(data, header, index_col, skiprows, infer_types,
                   parse_dates, tupleize_cols, thousands):
    head, body, _ = data  # _ is footer which is rarely used: ignore for now

    if head:
        body = [head] + body

        if header is None:  # special case when a table has <th> elements
            header = 0

    # fill out elements of body that are "ragged"
    _expand_elements(body)

    tp = TextParser(body, header=header, index_col=index_col,
                    skiprows=_get_skiprows(skiprows),
                    parse_dates=parse_dates, tupleize_cols=tupleize_cols,
                    thousands=thousands)
    df = tp.read()

    if infer_types:  # TODO: rm this code so infer_types has no effect in 0.14
        df = df.convert_objects(convert_dates='coerce')
    else:
        df = df.applymap(text_type)
    return df
Example #31
def get_as_dataframe(worksheet, evaluate_formulas=False, **options):
    """
    Returns the worksheet contents as a DataFrame.

    :param worksheet: the worksheet.
    :param evaluate_formulas: if True, get the value of a cell after
            formula evaluation; otherwise get the formula itself if present.
            Defaults to False.
    :param \*\*options: all the options for pandas.io.parsers.TextParser,
            according to the version of pandas that is installed.
            (Note: TextParser supports only the 'python' parser engine.)
    :returns: pandas.DataFrame
    """
    all_values = _get_all_values(worksheet, evaluate_formulas)
    return TextParser(all_values, **options).read()
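A hedged usage sketch: ws is an assumed gspread worksheet object, and the keyword arguments are ordinary TextParser options that flow through **options:

df = get_as_dataframe(ws, evaluate_formulas=True, header=0, skiprows=[1])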
Example #32
def parse_options_data(table):
    rows = table.findAll('tr')
    header = _unpack(rows[0], kind='th')
    data = [_unpack(r) for r in rows[1:]]
    L1 = []
    for i in data:
        # keep only rows with more than 4 cells
        if len(i) > 4:
            L1.append(i)
    # strip whitespace from every cell
    for i in range(len(L1)):
        for ii in range(len(L1[i])):
            L1[i][ii] = L1[i][ii].strip()
    res = TextParser(L1, names=header).get_chunk()
    res = res.applymap(f)  # f: cell-cleaning function defined elsewhere
    return res
Example #33
def test_reader_list_skiprows(all_parsers):
    data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""
    parser = all_parsers
    kwargs = {"index_col": 0}

    lines = list(csv.reader(StringIO(data)))
    with TextParser(lines, chunksize=2, skiprows=[1], **kwargs) as reader:
        chunks = list(reader)

    expected = parser.read_csv(StringIO(data), **kwargs)

    tm.assert_frame_equal(chunks[0], expected[1:3])
Example #34
def _data_to_frame(data, **kwargs) -> DataFrame:
    """
    Convert parsed data to Data Frame.

    This method will bind xml dictionary data of keys and values
    into named columns of Data Frame using the built-in TextParser
    class that build Data Frame and infers specific dtypes.
    """

    tags = next(iter(data))
    nodes = [list(d.values()) for d in data]

    try:
        with TextParser(nodes, names=tags, **kwargs) as tp:
            return tp.read()
    except ParserError:
        raise ParserError("XML document may be too complex for import. "
                          "Try to flatten document and use distinct "
                          "element and attribute names.")
Example #35
    def _parse_excel(self, sheetname=0, header=0, skiprows=None, skip_footer=0,
                     index_col=None, has_index_names=None, parse_cols=None,
                     parse_dates=False, date_parser=None, na_values=None,
                     thousands=None, chunksize=None, convert_float=True,
                     verbose=False, **kwds):
        import xlrd
        from xlrd import (xldate, XL_CELL_DATE,
                          XL_CELL_ERROR, XL_CELL_BOOLEAN,
                          XL_CELL_NUMBER)

        epoch1904 = self.book.datemode

        def _parse_cell(cell_contents, cell_typ):
            """converts the contents of the cell into a pandas
               appropriate object"""
               
            if cell_typ == XL_CELL_DATE:
                if xlrd_0_9_3:
                    # Use the newer xlrd datetime handling.
                    cell_contents = xldate.xldate_as_datetime(cell_contents,
                                                              epoch1904)

                    # Excel doesn't distinguish between dates and time,
                    # so we treat dates on the epoch as times only.
                    # Also, Excel supports 1900 and 1904 epochs.
                    year = (cell_contents.timetuple())[0:3]
                    if ((not epoch1904 and year == (1899, 12, 31))
                            or (epoch1904 and year == (1904, 1, 1))):
                        cell_contents = datetime.time(cell_contents.hour,
                                              cell_contents.minute,
                                              cell_contents.second,
                                              cell_contents.microsecond)
                else:
                    # Use the xlrd <= 0.9.2 date handling.
                    dt = xldate.xldate_as_tuple(cell_contents, epoch1904)

                    if dt[0] < datetime.MINYEAR:
                        cell_contents = datetime.time(*dt[3:])
                    else:
                        cell_contents = datetime.datetime(*dt)

            elif cell_typ == XL_CELL_ERROR:
                cell_contents = np.nan
            elif cell_typ == XL_CELL_BOOLEAN:
                cell_contents = bool(cell_contents)
            elif convert_float and cell_typ == XL_CELL_NUMBER:
                # GH5394 - Excel 'numbers' are always floats
                # it's a minimal perf hit and less surprising
                val = int(cell_contents)
                if val == cell_contents:
                    cell_contents = val
            return cell_contents

        # xlrd >= 0.9.3 can return datetime objects directly.
        if LooseVersion(xlrd.__VERSION__) >= LooseVersion("0.9.3"):
            xlrd_0_9_3 = True
        else:
            xlrd_0_9_3 = False
        
        ret_dict = False
        
        # Keep sheetname to maintain backwards compatibility.
        if isinstance(sheetname, list):
            sheets = sheetname
            ret_dict = True
        elif sheetname is None:
            sheets = self.sheet_names
            ret_dict = True
        else:
            sheets = [sheetname]
        
        # handle same-type duplicates.
        sheets = list(set(sheets))
        
        output = {}
        
        for asheetname in sheets:
            if verbose:
                print("Reading sheet %s" % asheetname)
            
            if isinstance(asheetname, compat.string_types):
                sheet = self.book.sheet_by_name(asheetname)
            else:  # assume an integer if not a string    
                sheet = self.book.sheet_by_index(asheetname)   
            
            data = []
            should_parse = {}
            
            for i in range(sheet.nrows):
                row = []
                for j, (value, typ) in enumerate(zip(sheet.row_values(i),
                                                     sheet.row_types(i))):
                    if parse_cols is not None and j not in should_parse:
                        should_parse[j] = self._should_parse(j, parse_cols)
    
                    if parse_cols is None or should_parse[j]:
                        row.append(_parse_cell(value, typ))
                data.append(row)
    
            if header is not None:
                data[header] = _trim_excel_header(data[header])
    
            parser = TextParser(data, header=header, index_col=index_col,
                                has_index_names=has_index_names,
                                na_values=na_values,
                                thousands=thousands,
                                parse_dates=parse_dates,
                                date_parser=date_parser,
                                skiprows=skiprows,
                                skip_footer=skip_footer,
                                chunksize=chunksize,
                                **kwds)
            
            output[asheetname] = parser.read()
            
        if ret_dict:
            return output
        else:
            return output[asheetname]
Example #36
    def _parse_excel(self, sheetname=0, header=0, skiprows=None, skip_footer=0,
                     index_col=None, has_index_names=None, parse_cols=None,
                     parse_dates=False, date_parser=None, na_values=None,
                     thousands=None, chunksize=None, convert_float=True,
                     **kwds):
        import xlrd
        from xlrd import (xldate, XL_CELL_DATE,
                          XL_CELL_ERROR, XL_CELL_BOOLEAN,
                          XL_CELL_NUMBER)

        epoch1904 = self.book.datemode

        # xlrd >= 0.9.3 can return datetime objects directly.
        if LooseVersion(xlrd.__VERSION__) >= LooseVersion("0.9.3"):
            xlrd_0_9_3 = True
        else:
            xlrd_0_9_3 = False

        if isinstance(sheetname, compat.string_types):
            sheet = self.book.sheet_by_name(sheetname)
        else:  # assume an integer if not a string
            sheet = self.book.sheet_by_index(sheetname)

        data = []
        should_parse = {}
        for i in range(sheet.nrows):
            row = []
            for j, (value, typ) in enumerate(zip(sheet.row_values(i),
                                                 sheet.row_types(i))):
                if parse_cols is not None and j not in should_parse:
                    should_parse[j] = self._should_parse(j, parse_cols)

                if parse_cols is None or should_parse[j]:
                    if typ == XL_CELL_DATE:
                        if xlrd_0_9_3:
                            # Use the newer xlrd datetime handling.
                            value = xldate.xldate_as_datetime(value, epoch1904)

                            # Excel doesn't distinguish between dates and time,
                            # so we treat dates on the epoch as times only.
                            # Also, Excel supports 1900 and 1904 epochs.
                            year = (value.timetuple())[0:3]
                            if ((not epoch1904 and year == (1899, 12, 31))
                                    or (epoch1904 and year == (1904, 1, 1))):
                                    value = datetime.time(value.hour,
                                                          value.minute,
                                                          value.second,
                                                          value.microsecond)
                        else:
                            # Use the xlrd <= 0.9.2 date handling.
                            dt = xldate.xldate_as_tuple(value, epoch1904)

                            if dt[0] < datetime.MINYEAR:
                                value = datetime.time(*dt[3:])
                            else:
                                value = datetime.datetime(*dt)

                    elif typ == XL_CELL_ERROR:
                        value = np.nan
                    elif typ == XL_CELL_BOOLEAN:
                        value = bool(value)
                    elif convert_float and typ == XL_CELL_NUMBER:
                        # GH5394 - Excel 'numbers' are always floats
                        # it's a minimal perf hit and less surprising
                        val = int(value)
                        if val == value:
                            value = val

                    row.append(value)

            data.append(row)

        if header is not None:
            data[header] = _trim_excel_header(data[header])

        parser = TextParser(data, header=header, index_col=index_col,
                            has_index_names=has_index_names,
                            na_values=na_values,
                            thousands=thousands,
                            parse_dates=parse_dates,
                            date_parser=date_parser,
                            skiprows=skiprows,
                            skip_footer=skip_footer,
                            chunksize=chunksize,
                            **kwds)

        return parser.read()
Example #37
    def parse(
        self,
        sheet_name=0,
        header=0,
        names=None,
        index_col=None,
        usecols=None,
        squeeze=False,
        dtype: DtypeArg | None = None,
        true_values=None,
        false_values=None,
        skiprows=None,
        nrows=None,
        na_values=None,
        verbose=False,
        parse_dates=False,
        date_parser=None,
        thousands=None,
        comment=None,
        skipfooter=0,
        convert_float=None,
        mangle_dupe_cols=True,
        **kwds,
    ):

        if convert_float is None:
            convert_float = True
        else:
            stacklevel = find_stack_level()
            warnings.warn(
                "convert_float is deprecated and will be removed in a future version.",
                FutureWarning,
                stacklevel=stacklevel,
            )

        validate_header_arg(header)

        ret_dict = False

        # Keep sheetname to maintain backwards compatibility.
        if isinstance(sheet_name, list):
            sheets = sheet_name
            ret_dict = True
        elif sheet_name is None:
            sheets = self.sheet_names
            ret_dict = True
        else:
            sheets = [sheet_name]

        # handle same-type duplicates.
        sheets = list(dict.fromkeys(sheets).keys())

        output = {}

        for asheetname in sheets:
            if verbose:
                print(f"Reading sheet {asheetname}")

            if isinstance(asheetname, str):
                sheet = self.get_sheet_by_name(asheetname)
            else:  # assume an integer if not a string
                sheet = self.get_sheet_by_index(asheetname)

            data = self.get_sheet_data(sheet, convert_float)
            if hasattr(sheet, "close"):
                # pyxlsb opens two TemporaryFiles
                sheet.close()
            usecols = maybe_convert_usecols(usecols)

            if not data:
                output[asheetname] = DataFrame()
                continue

            if is_list_like(header) and len(header) == 1:
                header = header[0]

            # forward fill and pull out names for MultiIndex column
            header_names = None
            if header is not None and is_list_like(header):
                header_names = []
                control_row = [True] * len(data[0])

                for row in header:
                    if is_integer(skiprows):
                        row += skiprows

                    data[row], control_row = fill_mi_header(
                        data[row], control_row)

                    if index_col is not None:
                        header_name, _ = pop_header_name(data[row], index_col)
                        header_names.append(header_name)

            # If there is a MultiIndex header and an index then there is also
            # a row containing just the index name(s)
            has_index_names = (is_list_like(header) and len(header) > 1
                               and index_col is not None)

            if is_list_like(index_col):
                # Forward fill values for MultiIndex index.
                if header is None:
                    offset = 0
                elif not is_list_like(header):
                    offset = 1 + header
                else:
                    offset = 1 + max(header)

                # GH34673: if MultiIndex names present and not defined in the header,
                # offset needs to be incremented so that forward filling starts
                # from the first MI value instead of the name
                if has_index_names:
                    offset += 1

                # Check if we have an empty dataset
                # before trying to collect data.
                if offset < len(data):
                    for col in index_col:
                        last = data[offset][col]

                        for row in range(offset + 1, len(data)):
                            if data[row][col] == "" or data[row][col] is None:
                                data[row][col] = last
                            else:
                                last = data[row][col]

            # GH 12292 : error when read one empty column from excel file
            try:
                parser = TextParser(
                    data,
                    names=names,
                    header=header,
                    index_col=index_col,
                    has_index_names=has_index_names,
                    squeeze=squeeze,
                    dtype=dtype,
                    true_values=true_values,
                    false_values=false_values,
                    skiprows=skiprows,
                    nrows=nrows,
                    na_values=na_values,
                    skip_blank_lines=False,  # GH 39808
                    parse_dates=parse_dates,
                    date_parser=date_parser,
                    thousands=thousands,
                    comment=comment,
                    skipfooter=skipfooter,
                    usecols=usecols,
                    mangle_dupe_cols=mangle_dupe_cols,
                    **kwds,
                )

                output[asheetname] = parser.read(nrows=nrows)

                if not squeeze or isinstance(output[asheetname], DataFrame):
                    if header_names:
                        output[asheetname].columns = output[
                            asheetname].columns.set_names(header_names)

            except EmptyDataError:
                # No Data, return an empty DataFrame
                output[asheetname] = DataFrame()

        if ret_dict:
            return output
        else:
            return output[asheetname]
Example #38
    def parse(self,
              sheet_name=0,
              header=0,
              names=None,
              index_col=None,
              usecols=None,
              squeeze=False,
              dtype=None,
              true_values=None,
              false_values=None,
              skiprows=None,
              nrows=None,
              na_values=None,
              verbose=False,
              parse_dates=False,
              date_parser=None,
              thousands=None,
              comment=None,
              skipfooter=0,
              convert_float=True,
              mangle_dupe_cols=True,
              **kwds):

        _validate_header_arg(header)

        ret_dict = False

        # Keep sheetname to maintain backwards compatibility.
        if isinstance(sheet_name, list):
            sheets = sheet_name
            ret_dict = True
        elif sheet_name is None:
            sheets = self.sheet_names
            ret_dict = True
        else:
            sheets = [sheet_name]

        # handle same-type duplicates.
        sheets = list(OrderedDict.fromkeys(sheets).keys())

        output = OrderedDict()

        for asheetname in sheets:
            if verbose:
                print("Reading sheet {sheet}".format(sheet=asheetname))

            if isinstance(asheetname, compat.string_types):
                sheet = self.get_sheet_by_name(asheetname)
            else:  # assume an integer if not a string
                sheet = self.get_sheet_by_index(asheetname)

            data = self.get_sheet_data(sheet, convert_float)
            usecols = _maybe_convert_usecols(usecols)

            if sheet.nrows == 0:
                output[asheetname] = DataFrame()
                continue

            if is_list_like(header) and len(header) == 1:
                header = header[0]

            # forward fill and pull out names for MultiIndex column
            header_names = None
            if header is not None and is_list_like(header):
                header_names = []
                control_row = [True] * len(data[0])

                for row in header:
                    if is_integer(skiprows):
                        row += skiprows

                    data[row], control_row = _fill_mi_header(data[row],
                                                             control_row)

                    if index_col is not None:
                        header_name, _ = _pop_header_name(data[row], index_col)
                        header_names.append(header_name)

            if is_list_like(index_col):
                # Forward fill values for MultiIndex index.
                if not is_list_like(header):
                    offset = 1 + header
                else:
                    offset = 1 + max(header)

                # Check if we have an empty dataset
                # before trying to collect data.
                if offset < len(data):
                    for col in index_col:
                        last = data[offset][col]

                        for row in range(offset + 1, len(data)):
                            if data[row][col] == '' or data[row][col] is None:
                                data[row][col] = last
                            else:
                                last = data[row][col]

            has_index_names = is_list_like(header) and len(header) > 1

            # GH 12292 : error when read one empty column from excel file
            try:
                parser = TextParser(data,
                                    names=names,
                                    header=header,
                                    index_col=index_col,
                                    has_index_names=has_index_names,
                                    squeeze=squeeze,
                                    dtype=dtype,
                                    true_values=true_values,
                                    false_values=false_values,
                                    skiprows=skiprows,
                                    nrows=nrows,
                                    na_values=na_values,
                                    parse_dates=parse_dates,
                                    date_parser=date_parser,
                                    thousands=thousands,
                                    comment=comment,
                                    skipfooter=skipfooter,
                                    usecols=usecols,
                                    mangle_dupe_cols=mangle_dupe_cols,
                                    **kwds)

                output[asheetname] = parser.read(nrows=nrows)

                if not squeeze or isinstance(output[asheetname], DataFrame):
                    if header_names:
                        output[asheetname].columns = output[
                            asheetname].columns.set_names(header_names)
                    elif compat.PY2:
                        output[asheetname].columns = _maybe_convert_to_string(
                            output[asheetname].columns)

            except EmptyDataError:
                # No Data, return an empty DataFrame
                output[asheetname] = DataFrame()

        if ret_dict:
            return output
        else:
            return output[asheetname]
Example #39
def parse_options_data(table):
    rows = table.findAll('tr')
    header = _unpack(rows[0], kind='th')
    data = [_unpack(r) for r in rows[2:]]  # a list of row strings
    return TextParser(data, names=header).get_chunk()
Example #40
    def parse(
        self,
        sheet_name=0,
        header=0,
        names=None,
        index_col=None,
        usecols=None,
        squeeze=False,
        dtype=None,
        true_values=None,
        false_values=None,
        skiprows=None,
        nrows=None,
        na_values=None,
        verbose=False,
        parse_dates=False,
        date_parser=None,
        thousands=None,
        comment=None,
        skipfooter=0,
        convert_float=True,
        mangle_dupe_cols=True,
        **kwds,
    ):

        validate_header_arg(header)

        ret_dict = False

        # Keep sheetname to maintain backwards compatibility.
        if isinstance(sheet_name, list):
            sheets = sheet_name
            ret_dict = True
        elif sheet_name is None:
            sheets = self.sheet_names
            ret_dict = True
        else:
            sheets = [sheet_name]

        # handle same-type duplicates.
        sheets = list(dict.fromkeys(sheets).keys())

        output = {}

        for asheetname in sheets:
            if verbose:
                print(f"Reading sheet {asheetname}")

            if isinstance(asheetname, str):
                sheet = self.get_sheet_by_name(asheetname)
            else:  # assume an integer if not a string
                sheet = self.get_sheet_by_index(asheetname)

            data = self.get_sheet_data(sheet, convert_float)
            usecols = maybe_convert_usecols(usecols)

            if not data:
                output[asheetname] = DataFrame()
                continue

            if is_list_like(header) and len(header) == 1:
                header = header[0]

            # forward fill and pull out names for MultiIndex column
            header_names = None
            if header is not None and is_list_like(header):
                header_names = []
                control_row = [True] * len(data[0])

                for row in header:
                    if is_integer(skiprows):
                        row += skiprows

                    data[row], control_row = fill_mi_header(
                        data[row], control_row)

                    if index_col is not None:
                        header_name, _ = pop_header_name(data[row], index_col)
                        header_names.append(header_name)

            if is_list_like(index_col):
                # Forward fill values for MultiIndex index.
                if header is None:
                    offset = 0
                elif not is_list_like(header):
                    offset = 1 + header
                else:
                    offset = 1 + max(header)

                # Check if we have an empty dataset
                # before trying to collect data.
                if offset < len(data):
                    for col in index_col:
                        last = data[offset][col]

                        for row in range(offset + 1, len(data)):
                            if data[row][col] == "" or data[row][col] is None:
                                data[row][col] = last
                            else:
                                last = data[row][col]

            has_index_names = is_list_like(header) and len(header) > 1

            # GH 12292 : error when read one empty column from excel file
            try:
                parser = TextParser(
                    data,
                    names=names,
                    header=header,
                    index_col=index_col,
                    has_index_names=has_index_names,
                    squeeze=squeeze,
                    dtype=dtype,
                    true_values=true_values,
                    false_values=false_values,
                    skiprows=skiprows,
                    nrows=nrows,
                    na_values=na_values,
                    parse_dates=parse_dates,
                    date_parser=date_parser,
                    thousands=thousands,
                    comment=comment,
                    skipfooter=skipfooter,
                    usecols=usecols,
                    mangle_dupe_cols=mangle_dupe_cols,
                    **kwds,
                )

                output[asheetname] = parser.read(nrows=nrows)

                if not squeeze or isinstance(output[asheetname], DataFrame):
                    if header_names:
                        output[asheetname].columns = output[
                            asheetname].columns.set_names(header_names)

            except EmptyDataError:
                # No Data, return an empty DataFrame
                output[asheetname] = DataFrame()

        if ret_dict:
            return output
        else:
            return output[asheetname]
Example #41
    def parse(fname, **kwargs):
        num_splits = kwargs.pop("num_splits", None)
        start = kwargs.pop("start", None)
        end = kwargs.pop("end", None)
        _skiprows = kwargs.pop("skiprows")
        excel_header = kwargs.get("_header")
        sheet_name = kwargs.get("sheet_name", 0)
        footer = b"</sheetData></worksheet>"

        # Default to pandas case, where we are not splitting or partitioning
        if start is None or end is None:
            return pandas.read_excel(fname, **kwargs)

        from zipfile import ZipFile
        from openpyxl import load_workbook
        from openpyxl.worksheet._reader import WorksheetReader
        from openpyxl.reader.excel import ExcelReader
        from openpyxl.worksheet.worksheet import Worksheet
        from pandas.core.dtypes.common import is_list_like
        from pandas.io.excel._util import (
            _fill_mi_header,
            _maybe_convert_usecols,
        )
        from pandas.io.parsers import TextParser
        import re

        wb = load_workbook(filename=fname, read_only=True)
        # Get shared strings
        ex = ExcelReader(fname, read_only=True)
        ex.read_manifest()
        ex.read_strings()
        # Convert string name 0 to string
        if sheet_name == 0:
            sheet_name = wb.sheetnames[sheet_name]
        # get the worksheet to use with the worksheet reader
        ws = Worksheet(wb)
        # Read the raw data
        with ZipFile(fname) as z:
            with z.open("xl/worksheets/{}.xml".format(
                    sheet_name.lower())) as file:
                file.seek(start)
                bytes_data = file.read(end - start)

        def update_row_nums(match):
            """Update the row numbers to start at 1.

            Note: This is needed because the parser we are using does not scale well if
            the row numbers remain because empty rows are inserted for all "missing"
            rows.

            Parameters
            ----------
            match
                The match from the origin `re.sub` looking for row number tags.

            Returns
            -------
            string
                The updated string with new row numbers.
            """
            b = match.group(0)
            return re.sub(
                b"\d+",  # noqa: W605
                lambda c: str(int(c.group(0).decode("utf-8")) - _skiprows).
                encode("utf-8"),
                b,
            )

        bytes_data = re.sub(b'r="[A-Z]*\d+"', update_row_nums,
                            bytes_data)  # noqa: W605
        bytesio = BytesIO(excel_header + bytes_data + footer)
        # Use openpyxl to read/parse sheet data
        reader = WorksheetReader(ws, bytesio, ex.shared_strings, False)
        # Attach cells to worksheet object
        reader.bind_cells()
        data = PandasExcelParser.get_sheet_data(
            ws, kwargs.pop("convert_float", True))
        usecols = _maybe_convert_usecols(kwargs.pop("usecols", None))
        header = kwargs.pop("header", 0)
        index_col = kwargs.pop("index_col", None)
        # skiprows is handled externally
        skiprows = None

        # Handle header and create MultiIndex for columns if necessary
        if is_list_like(header) and len(header) == 1:
            header = header[0]
        if header is not None and is_list_like(header):
            control_row = [True] * len(data[0])

            for row in header:
                data[row], control_row = _fill_mi_header(
                    data[row], control_row)
        # Handle MultiIndex for row Index if necessary
        if is_list_like(index_col):
            # Forward fill values for MultiIndex index.
            if not is_list_like(header):
                offset = 1 + header
            else:
                offset = 1 + max(header)

            # Check if dataset is empty
            if offset < len(data):
                for col in index_col:
                    last = data[offset][col]
                    for row in range(offset + 1, len(data)):
                        if data[row][col] == "" or data[row][col] is None:
                            data[row][col] = last
                        else:
                            last = data[row][col]

        parser = TextParser(data,
                            header=header,
                            index_col=index_col,
                            has_index_names=is_list_like(header)
                            and len(header) > 1,
                            skiprows=skiprows,
                            usecols=usecols,
                            **kwargs)
        # In excel if you create a row with only a border (no values), this parser will
        # interpret that as a row of NaN values. Pandas discards these values, so we
        # also must discard these values.
        pandas_df = parser.read().dropna(how="all")
        # Since we know the number of rows that occur before this partition, we can
        # correctly assign the index in cases of RangeIndex. If it is not a RangeIndex,
        # the index is already correct because it came from the data.
        if isinstance(pandas_df.index, pandas.RangeIndex):
            pandas_df.index = pandas.RangeIndex(start=_skiprows,
                                                stop=len(pandas_df.index) +
                                                _skiprows)
        # We return the length if it is a RangeIndex (common case) to reduce
        # serialization cost.
        if index_col is not None:
            index = pandas_df.index
        else:
            # The lengths will become the RangeIndex
            index = len(pandas_df)
        return _split_result_for_readers(1, num_splits, pandas_df) + [
            index,
            pandas_df.dtypes,
        ]
Example #42
def _parse_options_data(table):
    rows = table.findall('.//tr')
    header = _unpack(rows[0], kind='th')
    data = [_unpack(r) for r in rows[1:]]
    return TextParser(data, names=header, na_values=['N/A'],
                      thousands=',').get_chunk()
url = 'https://gist.github.com/paulmillr/2657075'
parsed = parse(urlopen(url))
doc = parsed.getroot()
tables = doc.findall('.//table')
rows = tables[0].findall('.//tr')


def _unpack(row, kind='td'):
    elts = row.findall('.//%s' % kind)
    return [val.text_content() for val in elts]


header = _unpack(rows[0], kind='th')
data = [_unpack(r) for r in rows[1:]]
data_ = TextParser(data, names=header[1:]).get_chunk()
clean_data = pd.DataFrame(data_, columns=['User', 'Contribs', 'Location'])
clean_data.to_csv('256_Top_Contributors.csv')
print(
    'First part: List of 256 Top contributors saved to 256_Top_Contributors.csv'
)

###############
# Second part #
###############

token = 'YOUR_GITHUB_TOKEN'  # personal access token redacted from the original
API = 'https://api.github.com'
myHeaders = {'Authorization': 'token ' + token}
listUsers = []
for i in range(clean_data['User'].size):