def extract_list_of_tokens(self, node: bs4.element.Tag):
    """Return the non-blank text tokens found under ``node``.

    Walks every descendant of ``node`` and keeps the text of each plain
    ``NavigableString``. Each text is stripped of surrounding whitespace and
    strings that are empty after stripping are dropped, so whitespace-only
    strings never become tokens.

    NOTE(review): a second ``extract_list_of_tokens`` taking ``keep_spaces``
    is defined right after this one; if both live in the same class the
    later definition replaces this one -- confirm which is intended.

    :param node: the bs4 tag whose descendants are scanned.
    :return: list of stripped, non-empty strings in document order.
    """
    tokens = []
    for child in node.descendants:
        # Exact type match on purpose: NavigableString subclasses are
        # skipped, exactly as the previous repr-string comparison did.
        if type(child) is not bs4.element.NavigableString:
            continue
        text = str(child).strip()
        if text:
            tokens.append(text)
    return tokens
def extract_list_of_tokens(self, node: bs4.element.Tag, keep_spaces: bool = True):
    """Extract the list of text tokens found under ``node``.

    Walks every descendant of ``node``, collects the text of each plain
    ``NavigableString`` in document order, and hands the raw list to
    ``self.post_process_token`` together with ``keep_spaces``. The value
    returned by that helper is returned unchanged.

    NOTE(review): an earlier ``extract_list_of_tokens`` without
    ``keep_spaces`` appears just before this one; if both live in the same
    class this definition replaces it -- confirm which is intended.

    :param node: the bs4 tag whose descendants are scanned.
    :param keep_spaces: forwarded to ``self.post_process_token``. According
        to the original documentation, ``True`` keeps spaces as tokens and
        ``False`` removes them; the actual handling lives in that helper.
    :return: whatever ``self.post_process_token`` returns for the collected
        strings.
    """
    # Exact type match on purpose: NavigableString subclasses are skipped,
    # exactly as the previous repr-string comparison did.
    raw_tokens = [
        str(child)
        for child in node.descendants
        if type(child) is bs4.element.NavigableString
    ]
    return self.post_process_token(raw_tokens, keep_spaces)