def test_find_separator(self):
    """Check ``utils.find_separator`` on a header with and without a separator entry.

    The first header declares ``;`` as the column separator and that exact
    character is expected back.  The second header contains no separator
    declaration, and the catch-all regular expression is expected instead.
    """
    header_with_separator = r"#COLUMNSEPARATOR = ;"
    self.assertEqual(utils.find_separator(header_with_separator), ";")

    header_without_separator = r"I'm sorry the column separator is not in this gef file, even if he wanted to be there."
    expected_fallback = r";|\s+|,|\|\s*"
    self.assertEqual(
        utils.find_separator(header_without_separator),
        expected_fallback,
    )
def parse_data(headers, data_s, column_names=None):
    """Clean the raw data section and load it with ``pl.read_csv``.

    Args:
        headers: Header text; passed to ``utils.find_separator`` to determine
            the column separator.
        data_s: Raw text of the data section.
        column_names: Optional column names, forwarded to ``pl.read_csv`` as
            ``new_columns``.

    Returns:
        The object returned by ``pl.read_csv`` for the cleaned data, read
        without a header row.
    """
    separator = utils.find_separator(headers)

    # TODO: find a way for polars to handle columns with variable amounts of whitespace
    if separator == " ":
        # Drop every "!" and collapse each run of spaces/tabs to one space so
        # that a single space can act as the delimiter.
        new_data = re.sub("[ \t]+", " ", data_s.replace("!", ""))
    else:
        # Drop a "!" that directly follows a separator together with that
        # separator, then any remaining "!", and finally strip the spaces and
        # tabs surrounding each separator.
        #
        # The replacement is a callable on purpose: a plain string would be
        # treated as a replacement template, so a separator containing a
        # backslash would be read as an escape sequence and either raise
        # ``re.error`` or be altered.  A callable's result is inserted as-is.
        new_data = re.sub(
            f"[\t ]*{re.escape(separator)}[\t ]*",
            lambda _match: separator,
            data_s.replace(separator + "!", "").replace("!", ""),
        )

    # Remove whitespace at the beginning and end of every line, and remove
    # trailing whitespace (including empty trailing lines) from the whole text.
    new_data = "\n".join(line.strip() for line in new_data.splitlines()).rstrip()

    return pl.read_csv(
        new_data.encode(),
        sep=separator,
        new_columns=column_names,
        has_headers=False,
    )
def parse_data(header_s, data_s, columns_number=None, columns_info=None):
    """Read the data section into a pandas DataFrame.

    When neither ``columns_number`` nor ``columns_info`` is supplied, the
    number of columns is looked up in the header with
    ``utils.parse_columns_number`` and, if one is found, a name is resolved
    for each column (numbered from 1) with ``utils.parse_column_info``.

    Args:
        header_s: Header text, used for the column lookup and to determine
            the separator via ``utils.find_separator``.
        data_s: Raw text of the data section; every ``!`` is removed before
            parsing.
        columns_number: Optional number of columns.
        columns_info: Optional column names, passed to ``pd.read_csv`` as
            ``names``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    # NOTE(review): this assumes the per-column lookup only happens inside the
    # "nothing supplied" branch, i.e. a caller-supplied columns_number alone
    # does not trigger it -- confirm against the intended behaviour.
    caller_gave_nothing = columns_number is None and columns_info is None
    if caller_gave_nothing:
        columns_number = utils.parse_columns_number(header_s)
        if columns_number is not None:
            columns_info = [
                utils.parse_column_info(
                    header_s, position, MAP_QUANTITY_NUMBER_COLUMN_NAME_CPT
                )
                for position in range(1, columns_number + 1)
            ]

    cleaned_text = data_s.replace('!', '')
    delimiter = utils.find_separator(header_s)
    # The python engine is kept because the separator may be a regular
    # expression rather than a single character.
    return pd.read_csv(
        io.StringIO(cleaned_text),
        sep=delimiter,
        names=columns_info,
        index_col=False,
        engine='python',
    )