def format_authors(authors: bs4.element.Tag) -> List[str]:
    """
    Split the raw authors value into a list of individual author names.

    Input:
        authors = object whose ``str()`` form is a comma-separated list of
                  authors
    Output:
        list of names, one entry per comma-separated piece, each stripped
        of surrounding whitespace (empty pieces are kept as empty strings)
    """
    raw = str(authors).replace('\n', '')
    # NOTE(review): as written this swaps a single space for a single space,
    # i.e. it does nothing. It may have been meant to collapse double spaces
    # or to replace a non-breaking space -- confirm against the history.
    raw = raw.replace(' ', ' ')
    raw = raw.replace(', ', ',')
    return [piece.strip() for piece in raw.split(',')]
def parse_manually(self, parse_object: bs4.element.Tag) -> dict:
    """
    Manually parse broken html by splitting its markup on separator strings.

    The separators are the literal ``'</a>'`` followed by the string form of
    every ``<b>`` element found in ``parse_object``. Each separator that
    occurs in the markup is replaced by ``self.rand`` and the markup is then
    split on ``self.rand``; the separators become column labels and the
    pieces after the first one become their values. A leading ``'</a>'``
    separator is labelled ``sp.status_iasa``.

    Input:
        parse_object = object which we would parse
    Output:
        dict merged (via ``dict.update``) from the results of
        ``self.make_further_check`` for every (label, value) pair; empty
        when the markup contains none of the separators
    Raises:
        KeyError when a cleaned label is not a key of ``sp.rechange_iasa``
    """
    markup = str(parse_object)
    separators = ['</a>'] + [str(tag) for tag in parse_object.find_all('b')]

    found = []
    for separator in separators:
        # Only a separator that is still present in the markup produces a
        # split below. The previous check tested membership in the list being
        # iterated, which is always true, so an absent separator still got a
        # label and shifted every later label onto the wrong value.
        if separator in markup:
            found.append(separator)
            markup = markup.replace(separator, self.rand)
    parse_split = markup.split(self.rand)

    # '</a>' is always the first candidate, so when it was found it sits at
    # index 0; the text after it is reported under the status label.
    if found and found[0] == '</a>':
        found[0] = sp.status_iasa

    # Clean the labels in three whole-list passes, in this order.
    labels = found
    for clean in (self.remove_tags, self.remove_special, self.remove_spaces):
        labels = [clean(label) for label in labels]
    # NOTE(review): dropping labels that clean down to an empty value shifts
    # the remaining labels relative to parse_split -- confirm that an empty
    # label cannot occur in real input.
    labels = [label for label in labels if label]

    value_dict = {}
    # parse_split[0] is the text before the first separator and has no label.
    for label, value in zip(labels, parse_split[1:]):
        value_dict.update(
            self.make_further_check(
                sp.rechange_iasa[label], value, sp.rechange_phrase))
    return value_dict