Beispiel #1
0
    def get_cols(self, lines):
        """Initialize Column objects from a multi-line ASCII header.

        Header lines start with ``%``. Every header line of the form
        ``% <name>`` contributes one column name (the first run of word
        characters after the marker). Parsing stops at the first non-empty
        line that does not start with ``%``.

        Sets ``self.names`` to the list of column names and ``self.cols``
        to one ``core.Column`` per name, in header order.

        Parameters
        ----------
        lines : `list`
            List of table lines

        Raises
        ------
        core.InconsistentTableError
            If no column name is found in the header.
        """
        # Raw string: '\s' and '\w' are invalid escapes in a plain literal
        # and emit a warning on modern Python versions.
        re_name_def = re.compile(r'^\s*%\s+(?P<colname>\w+)')
        self.names = []
        for line in lines:
            if not line:  # tolerate empty lines inside the header
                continue
            if not line.startswith('%'):  # end of header lines
                break
            match = re_name_def.search(line)
            if match:
                self.names.append(match.group('colname'))

        if not self.names:
            raise core.InconsistentTableError(
                'No column names found in Omega header')

        self.cols = [core.Column(name=name) for name in self.names]
    def process_lines(self, lines):
        """Parse the input as HTML and return the rows of the wanted table.

        The lines are joined with newlines and handed to BeautifulSoup,
        using ``self.html['parser']`` as the backend parser when that key
        is present. The first ``<table>`` accepted by
        ``html.identify_table`` is selected, its attributes are stored in
        ``self.html['attrs']``, and each of its ``<tr>`` elements is
        wrapped in an ``html.SoupString``.

        Parameters
        ----------
        lines : `list`
            List of table lines

        Returns
        -------
        list
            One ``html.SoupString`` per ``<tr>`` of the selected table.

        Raises
        ------
        core.OptionalTableImportError
            If BeautifulSoup cannot be imported.
        core.InconsistentTableError
            If no table is accepted by ``html.identify_table``.
        """
        try:
            from bs4 import BeautifulSoup
        except ImportError:
            raise core.OptionalTableImportError(
                'BeautifulSoup must be '
                'installed to read HTML tables')

        # Only pass a backend parser explicitly when one was configured
        parser_args = (self.html['parser'],) if 'parser' in self.html else ()
        soup = BeautifulSoup('\n'.join(lines), *parser_args)

        # Tables are numbered from 1 when offered to identify_table
        table = None
        for position, candidate in enumerate(soup.find_all('table'), start=1):
            if html.identify_table(candidate, self.html, position):
                table = candidate
                break

        if table is None:
            wanted = self.html['table_id']
            if isinstance(wanted, int):
                err_descr = 'number {0}'.format(wanted)
            else:
                err_descr = "id '{0}'".format(wanted)
            raise core.InconsistentTableError(
                'ERROR: HTML table {0} not found'.format(err_descr))

        self.html['attrs'] = table.attrs
        # Wrap every row of the selected table
        return [html.SoupString(row) for row in table.find_all('tr')]
    def get_cols(self, lines):
        """Build the column list from the ``%``-prefixed header lines.

        Empty lines are skipped; the header ends at the first non-empty
        line that does not start with ``%``. Each header line matching
        ``% <name>`` adds ``<name>`` to ``self.names``, and ``self.cols``
        receives one ``core.Column`` per name.

        Parameters
        ----------
        lines : `list`
            List of table lines

        Raises
        ------
        core.InconsistentTableError
            If the header yields no column names.
        """
        name_pattern = re.compile(r'^\s*%\s+(?P<colname>\w+)')
        self.names = []
        for row in lines:
            if row:  # empty lines in header are ignored (windows)
                if not row.startswith('%'):
                    break  # first non-header line: stop scanning
                found = name_pattern.search(row)
                if found is not None:
                    self.names.append(found.group('colname'))

        if not self.names:
            raise core.InconsistentTableError(
                'No column names found in Omega header')

        self.cols = []  # pylint: disable=attribute-defined-outside-init
        self.cols.extend(core.Column(name=label) for label in self.names)
 def __call__(self, lines):
     """Yield the data cells of each HTML row in ``lines``.

     Rows containing any ``<th>`` element are skipped, so a repeated
     header is never returned as data. For every other row that has
     ``<td>`` elements, a list is yielded with one entry per cell: the
     stripped cell text, or a ``(text, rowspan)`` tuple when the cell
     carries a ``rowspan`` attribute.

     Raises
     ------
     TypeError
         If an item of ``lines`` is not an ``html.SoupString``.
     core.InconsistentTableError
         If ``lines`` is empty (checked once iteration has finished).
     """
     for row in lines:
         if not isinstance(row, html.SoupString):
             raise TypeError('HTML lines should be of type SoupString')
         row_soup = row.soup
         # A row with header cells is a (possibly duplicated) header
         if row_soup.find_all('th'):
             continue
         cells = row_soup.find_all('td')
         if not cells:
             continue
         values = []
         for cell in cells:
             if cell.has_attr('rowspan'):
                 # Multirow cells are returned as a pair for
                 # HTMLWithGroupsData handling
                 entry = (cell.text.strip(), int(cell['rowspan']))
             else:
                 entry = cell.text.strip()
             values.append(entry)
         yield values
     if not len(lines):
         raise core.InconsistentTableError('HTML tables must contain data '
                                           'in a <table> tag')
    def get_cols(self, lines):
        """Build the column list from the ``# ``-prefixed header lines.

        Empty lines are skipped; the header ends at the first non-empty
        line that does not start with ``'# '``. A header line starting
        with ``'# -/+'`` requests two extra leading columns,
        ``'selection cut 1'`` and ``'selection cut 2'``. Every other
        header line of the form ``# <number> - <name>`` adds ``<name>``
        (trailing whitespace removed) to ``self.names``.

        Parameters
        ----------
        lines : `list`
            List of table lines

        Raises
        ------
        core.InconsistentTableError
            If the header yields no column names.
        """
        header_pattern = re.compile(
            r'^\s*#\s+'  # whitespace and comment marker
            r'(?P<colnumber>[0-9]+)\s+-\s+'  # number of column
            r'(?P<colname>(.*))'
        )
        self.names = []
        has_cuts = False
        for row in lines:
            if not row:  # empty lines in header are ignored (windows)
                continue
            if not row.startswith('# '):
                break  # first non-header line: stop scanning
            if row.startswith('# -/+'):
                has_cuts = True
                continue
            found = header_pattern.search(row)
            if found is not None:
                self.names.append(found.group('colname').rstrip())

        if not self.names:
            raise core.InconsistentTableError(
                'No column names found in cWB header')

        # The selection-cut columns, when present, come before the named ones
        cut_labels = ('selection cut 1', 'selection cut 2') if has_cuts else ()
        leading = [core.Column(name=label) for label in cut_labels]
        self.cols = leading  # pylint: disable=attribute-defined-outside-init
        self.cols.extend(core.Column(name=label) for label in self.names)
Beispiel #6
0
    def get_cols(self, lines):
        """Initialize Column objects from a multi-line ASCII header.

        Header lines start with ``'# '``. A header line starting with
        ``'# -/+'`` requests two extra leading columns,
        ``'selection cut 1'`` and ``'selection cut 2'``. Every other
        header line of the form ``# <number> - <name>`` contributes
        ``<name>`` (trailing whitespace removed) as a column name.
        Parsing stops at the first non-empty line that does not start
        with ``'# '``.

        Sets ``self.names`` to the parsed column names and ``self.cols``
        to the matching ``core.Column`` objects, preceded by the two
        selection-cut columns when requested.

        Parameters
        ----------
        lines : `list`
            List of table lines

        Raises
        ------
        core.InconsistentTableError
            If no column name is found in the header.
        """
        # Raw strings: '\s' is an invalid escape in a plain literal and
        # emits a warning on modern Python versions.
        re_name_def = re.compile(
            r'^\s*#\s+'  # whitespace and comment marker
            r'(?P<colnumber>[0-9]+)\s+-\s+'  # number of column
            r'(?P<colname>(.*))'
        )
        self.names = []
        # Must be initialised: headers without a '# -/+' line would
        # otherwise hit an UnboundLocalError below.
        include_cuts = False
        for line in lines:
            if not line:  # tolerate empty lines inside the header
                continue
            if not line.startswith('# '):  # end of header lines
                break
            if line.startswith('# -/+'):
                include_cuts = True
            else:
                match = re_name_def.search(line)
                if match:
                    self.names.append(match.group('colname').rstrip())

        if not self.names:
            raise core.InconsistentTableError(
                'No column names found in cWB header')

        if include_cuts:
            self.cols = [
                core.Column(name='selection cut 1'),
                core.Column(name='selection cut 2'),
            ]
        else:
            self.cols = []
        self.cols.extend(core.Column(name=name) for name in self.names)