def format_authors(authors: bs4.element.Tag) -> List[str]:
    """Retrieve the list of author names from a ``<dd>`` tag.

    The tag is converted to its string form, the literal ``<dd>`` and
    ``</dd>`` markers are removed, and the remaining text is split on
    commas.

    :param authors: tag (or any object whose ``str()`` is the markup)
        holding a comma-separated list of authors
    :return: author names with surrounding whitespace removed; empty
        entries are dropped, so a tag without authors gives ``[]``
    """
    text = str(authors).replace('<dd>', '').replace('</dd>', '')
    # Strip every name individually so that line breaks or indentation
    # inside the tag, and spacing such as "A , B", do not leak into names.
    stripped = (part.strip() for part in text.split(','))
    return [name for name in stripped if name]
Esempio n. 2
0
def format_authors(authors: bs4.element.Tag) -> List[str]:
    """Transform the raw authors string into a list of authors.

    The input is converted with ``str()``, newlines are deleted, each
    pair of spaces is collapsed to one space (a single pass), and the
    text is split on commas.

    :param authors: object whose string form is a comma-separated list
    :return: one entry per comma-separated field, each stripped of
        surrounding whitespace (empty fields are kept as ``''``)
    """
    text = str(authors)
    # Order matters: newlines go first, then double spaces, then the
    # space that follows a comma.
    for old, new in (('\n', ''), ('  ', ' '), (', ', ',')):
        text = text.replace(old, new)
    return [name.strip() for name in text.split(',')]
Esempio n. 3
0
 def parse_manually(self, parse_object: bs4.element.Tag) -> dict:
     """
     Parse broken html by hand, cutting it at its bold column headers.

     The string form of every `<b>` element found in `parse_object`,
     preceded by a literal `</a>`, is used as a cut marker: each marker
     is replaced with `self.rand` in the stringified html, which is then
     split on `self.rand`. The markers, cleaned up, become the column
     names, and the pieces after the first one become their values.

     Input:  parse_object = object which we would parse
     Output: dict merged from the `make_further_check` results
     """
     bold_tags = [str(bold) for bold in parse_object.find_all('b')]
     html = str(parse_object)
     markers = ['</a>'] + bold_tags

     # Swap every marker for the separator, then cut on the separator.
     for marker in markers:
         html = html.replace(marker, self.rand)
     chunks = html.split(self.rand)

     # The slot of the leading '</a>' marker is named by sp.status_iasa.
     headers = [sp.status_iasa] + markers[1:]
     # Apply each cleaner to the whole list before moving to the next one.
     for clean in (self.remove_tags, self.remove_special,
                   self.remove_spaces):
         headers = [clean(header) for header in headers]
     headers = [header for header in headers if header]

     # chunks[0] is the text before the first marker and is skipped; when
     # nothing was cut the slice is empty and the loop does not run.
     value_dict = {}
     for header, raw_value in zip(headers, chunks[1:]):
         checked = self.make_further_check(sp.rechange_iasa[header],
                                           raw_value, sp.rechange_phrase)
         value_dict.update(checked)
     return value_dict
Esempio n. 4
0
def format_datetime(date_str: bs4.element.Tag, time: str) -> datetime:
    """
    Build a datetime from a date text and a time text.

    :param date_str: date whose single-space-separated fields are, in
        order, day, month and year; the month field is resolved through
        ``get_month``
    :param time: time (format: XX:XX); an empty string selects 20:00
    :return: datetime assembled from the parsed year, month, day, hour
        and minutes
    """
    fields = date_str.split(' ')
    year = int(fields[2])
    month = get_month(fields[1])
    day = int(fields[0])

    # No time given: fall back to the fixed 20:00 default.
    if time == '':
        return datetime(year, month, day, 20, 0)

    clock = time.split(':')
    return datetime(year, month, day, int(clock[0]), int(clock[1]))