# Fetch each report URL, strip HTML markup with pyparsing, and store the
# plain text plus keyword-presence flags back onto the DataFrame.
scriptOpen, scriptClose = makeHTMLTags("script")
scriptBody = scriptOpen + SkipTo(scriptClose) + scriptClose  # whole <script>...</script> span
commonHTMLEntity.setParseAction(replaceHTMLEntity)  # &amp; -> &, &lt; -> <, etc.

# Build the parser expressions ONCE, outside the loop (the original rebuilt
# them per iteration, which is pure overhead).
stripper = (htmlComment | scriptBody | commonHTMLEntity
            | anyOpenTag | anyCloseTag).suppress()
repeatedNewlines = LineEnd() + OneOrMore(LineEnd())
repeatedNewlines.setParseAction(replaceWith("\n\n"))

allsoup = []  # plain-text version of every report page
for i in range(len(df)):
    # BUG FIX: df.ix was deprecated and removed in pandas 1.0 -> use df.loc
    r = requests.get(df.loc[i, 'Report URL'])  # request the report page
    soup = BeautifulSoup(r.text, "lxml")       # parse to soup
    targetHTML = str(soup)                     # convert to string

    # first pass: drop comments, <script> blocks, and tags; translate entities
    firstPass = stripper.transformString(targetHTML)
    # second pass: collapse the runs of blank lines left by tag removal
    secondPass = repeatedNewlines.transformString(firstPass)

    allsoup.append(str(secondPass.lower()))
    print('Soup number', i)

df['Soup'] = allsoup

# Create boolean keyword flags.
# The soup text is lowercased above, so the search strings must be lowercase
# too -- BUG FIX: 'statements about Kroger' (capital K) could never match.
# regex=False: these are literal substrings, not regular expressions
# (behavior-identical here since the patterns contain no regex metacharacters).
df['RiskFactors'] = df['Soup'].str.contains('risk factors', regex=False)
df['unregistered'] = df['Soup'].str.contains('unregistered sales of equity', regex=False)
df['Cautionary1'] = df['Soup'].str.contains('statements about kroger', regex=False)
# Strip HTML markup (tags, scripts, comments) from a web page and print the
# remaining plain text.
# BUG FIX: the original called urlopen() without ever importing it, which
# raises NameError at runtime -- import it from urllib.request.
from urllib.request import urlopen

from pyparsing import (
    makeHTMLTags,
    commonHTMLEntity,
    replaceHTMLEntity,
    htmlComment,
    anyOpenTag,
    anyCloseTag,
    LineEnd,
    replaceWith,
)

scriptOpen, scriptClose = makeHTMLTags("script")
# tag_body skips ahead to the matching close tag, so the script's contents
# are removed as one unit
scriptBody = scriptOpen + scriptOpen.tag_body + scriptClose
commonHTMLEntity.setParseAction(replaceHTMLEntity)  # translate &amp; etc.

# get some HTML
targetURL = "https://wiki.python.org/moin/PythonDecoratorLibrary"
with urlopen(targetURL) as targetPage:
    targetHTML = targetPage.read().decode("UTF-8")

# first pass, strip out tags and translate entities
firstPass = (
    (htmlComment | scriptBody | commonHTMLEntity | anyOpenTag | anyCloseTag)
    .suppress()
    .transformString(targetHTML)
)

# first pass leaves many blank lines, collapse these down
repeatedNewlines = LineEnd() * (2,)  # two or more consecutive line ends
repeatedNewlines.setParseAction(replaceWith("\n\n"))
secondPass = repeatedNewlines.transformString(firstPass)

print(secondPass)
# htmlStripper.py
#
# Sample code for stripping HTML markup tags and scripts from
# HTML source files.
#
# Copyright (c) 2006, 2016, Paul McGuire
#
from contextlib import closing
import urllib.request, urllib.parse, urllib.error

from pyparsing import (makeHTMLTags, SkipTo, commonHTMLEntity,
                       replaceHTMLEntity, htmlComment, anyOpenTag,
                       anyCloseTag, LineEnd, OneOrMore, replaceWith)

# A complete <script> element: open tag, everything up to the close tag,
# then the close tag itself.
script_open, script_close = makeHTMLTags("script")
script_element = script_open + SkipTo(script_close) + script_close

# Translate character entities (&amp;, &lt;, ...) back into their characters.
commonHTMLEntity.setParseAction(replaceHTMLEntity)

# fetch the page to be stripped
targetURL = "http://wiki.python.org/moin/PythonDecoratorLibrary"
with closing(urllib.request.urlopen(targetURL)) as page:
    page_html = page.read().decode("UTF-8")

# First pass: suppress comments, scripts, and tags; expand entities.
markup = htmlComment | script_element | commonHTMLEntity | anyOpenTag | anyCloseTag
text_only = markup.suppress().transformString(page_html)

# Second pass: squeeze the blank-line runs left behind into single breaks.
blank_runs = LineEnd() + OneOrMore(LineEnd())
blank_runs.setParseAction(replaceWith("\n\n"))
result = blank_runs.transformString(text_only)

print(result)