Example #1
0
 def test_find_separator(self):
     """Check ``utils.find_separator`` with and without a declared separator.

     A header containing ``#COLUMNSEPARATOR = ;`` must yield ``";"``; a text
     without that declaration must yield the fallback regular expression.
     """
     # Header that declares the separator explicitly.
     header_with_separator = r"#COLUMNSEPARATOR = ;"
     self.assertEqual(utils.find_separator(header_with_separator), ";")

     # Text with no separator declaration: the fallback pattern is expected.
     header_without_separator = (
         r"I'm sorry the column separator is not in this gef file, "
         r"even if he wanted to be there."
     )
     fallback = utils.find_separator(header_without_separator)
     self.assertEqual(fallback, r";|\s+|,|\|\s*")
Example #2
0
    def parse_data(headers, data_s, column_names=None):
        """Normalise the raw data text and load it with ``pl.read_csv``.

        The column separator is looked up from ``headers`` via
        ``utils.find_separator``. Every ``"!"`` character is removed from
        ``data_s``, horizontal whitespace around the separator is collapsed,
        each line is stripped, and trailing whitespace of the whole text is
        dropped before the result is handed to ``pl.read_csv``.

        :param headers: Header text passed to ``utils.find_separator``.
        :param data_s: Raw data text (string) to be parsed.
        :param column_names: Optional column names, forwarded as
            ``new_columns``.
        :return: The object returned by ``pl.read_csv`` (presumably a polars
            DataFrame -- confirm against the polars version in use).
        """
        separator = utils.find_separator(headers)

        # Remove multiple whitespaces
        # TODO: find a way for polars to handle columns with variable amounts of whitespace
        if separator == " ":
            # Space separated: squeeze every run of spaces/tabs into one space.
            unmarked = data_s.replace("!", "")
            normalised = re.sub("[ \t]+", " ", unmarked)
        else:
            # Any other separator: drop the spaces/tabs that surround it.
            padded_separator = f"[\t ]*{re.escape(separator)}[\t ]*"
            # A separator directly followed by "!" is removed as a unit first,
            # then any remaining "!" characters are removed.
            unmarked = data_s.replace(separator + "!", "").replace("!", "")
            normalised = re.sub(padded_separator, separator, unmarked)

        # Strip every line, then drop trailing whitespace (including empty
        # trailing lines) from the joined text.
        stripped_lines = (row.strip() for row in normalised.splitlines())
        csv_text = "\n".join(stripped_lines).rstrip()

        return pl.read_csv(
            csv_text.encode(),
            sep=separator,
            new_columns=column_names,
            has_headers=False,
        )
Example #3
0
 def parse_data(header_s, data_s, columns_number=None, columns_info=None):
     """Read the data text into a pandas DataFrame.

     When neither ``columns_number`` nor ``columns_info`` is supplied, the
     number of columns is obtained with ``utils.parse_columns_number`` and,
     if that returns a value, one entry per column (numbered from 1) is built
     with ``utils.parse_column_info``. All ``'!'`` characters are removed from
     ``data_s`` and the result is parsed with ``pd.read_csv`` using the
     separator returned by ``utils.find_separator`` and the python engine.

     :param header_s: Header text used to look up the column count, the
         column info and the separator.
     :param data_s: Raw data text (string).
     :param columns_number: Optional number of columns.
     :param columns_info: Optional column names, forwarded as ``names``.
     :return: The DataFrame produced by ``pd.read_csv``.
     """
     # NOTE(review): if only columns_number is passed, columns_info stays None
     # and read_csv receives names=None -- confirm this is intended.
     if columns_number is None and columns_info is None:
         columns_number = utils.parse_columns_number(header_s)
         if columns_number is not None:
             columns_info = [
                 utils.parse_column_info(
                     header_s, position,
                     MAP_QUANTITY_NUMBER_COLUMN_NAME_CPT)
                 for position in range(1, columns_number + 1)
             ]
     cleaned_text = data_s.replace('!', '')
     separator = utils.find_separator(header_s)
     text_buffer = io.StringIO(cleaned_text)
     return pd.read_csv(text_buffer,
                        sep=separator,
                        names=columns_info,
                        index_col=False,
                        engine='python')