コード例 #1
0
    def extract_list_of_tokens(self, node: bs4.element.Tag):
        '''
        Extract the list of text tokens contained in a bs4 node.

        Every descendant of @node whose type is exactly
        bs4.element.NavigableString is converted to a plain string and
        stripped of surrounding whitespace. Strings that are empty after
        stripping are dropped, i.e. whitespace is not considered a token.

        :param node: the bs4 element whose descendants are scanned
        :return: list of non-empty, stripped strings in traversal order
        '''
        # The exact-type test is deliberate: it mirrors the previous comparison
        # against the class repr, so subclasses of NavigableString are still
        # skipped (isinstance() would start collecting them).
        texts = (
            str(child)
            for child in node.recursiveChildGenerator()
            if type(child) is bs4.element.NavigableString
        )
        # Strip each string once, then keep only the non-empty results.
        stripped = (text.strip() for text in texts)
        return [token for token in stripped if token]
コード例 #2
0
ファイル: method.py プロジェクト: mciniselli/github_manager
 def extract_list_of_tokens(self,
                            node: bs4.element.Tag,
                            keep_spaces: bool = True):
     '''
     Extract the list of all text tokens contained in a bs4 node.

     Every descendant of @node whose type is exactly
     bs4.element.NavigableString is converted to a plain string; the raw
     list is then handed to self.post_process_token together with
     @keep_spaces, and whatever that method returns is returned unchanged.

     :param node: the bs4 element whose descendants are scanned
     :param keep_spaces: forwarded to self.post_process_token; if True
         spaces are considered tokens, otherwise they are removed
         (the filtering itself is implemented in post_process_token)
     :return: the result of self.post_process_token
     '''
     # The exact-type test is deliberate: it mirrors the previous comparison
     # against the class repr, so subclasses of NavigableString are still
     # skipped (isinstance() would start collecting them).
     tokens = [
         str(child)
         for child in node.recursiveChildGenerator()
         if type(child) is bs4.element.NavigableString
     ]
     return self.post_process_token(tokens, keep_spaces)