def setIndex():
    from MyHTMLParser import MyHTMLParser
    import os

    HTMLlist = []
    nobody = [
        "index.html", "index1.html", "index2.html", "index3.html",
        "index4.html", "ru3.html", "ru2.html", "ru1.html", "rut.html",
        "rus.html", "ru.html"
    ]
    # collect every HTML file in the mirror, skipping the excluded names
    for (parent, d, f) in os.walk(NorroenDyrd.mirror):
        for fn in f:
            if fn in nobody:
                continue
            elif fn.find(".html") == -1:
                continue
            elif os.path.join(parent, fn) in HTMLlist:
                continue
            else:
                HTMLlist.append(os.path.join(parent, fn))

    html = []
    for h in HTMLlist:
        entry = {}
        with open(h, "r", encoding="utf-8") as f:
            html = f.readlines()
        parser = MyHTMLParser()
        for i in html:
            parser.feed(i)
        entry["path"] = h.replace(NorroenDyrd.mirror, NorroenDyrd.base)
        entry["text"] = parser.plaintext
        entry["title"] = parser.title
        NorroenDyrd.index.append(entry)
        del parser
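
# setIndex() assumes a MyHTMLParser that accumulates visible text into a
# `plaintext` attribute and captures the <title> element into `title`.
# The class below is a hypothetical minimal sketch of such a parser, written
# against the standard html.parser API; it is not the project's actual
# MyHTMLParser.
from html.parser import HTMLParser


class MyHTMLParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.plaintext = ""   # concatenated text content of the page
        self.title = ""       # contents of the <title> tag
        self._in_title = False

    def handle_starttag(self, tag, attrs):
        if tag == "title":
            self._in_title = True

    def handle_endtag(self, tag):
        if tag == "title":
            self._in_title = False

    def handle_data(self, data):
        if self._in_title:
            self.title += data
        self.plaintext += data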
def __GetEmailContent__(self, filePath):
    self._myHtmlParserObj = MyHTMLParser()
    emailContent = ""
    with open(filePath, 'r') as handle:
        emailMessage = email.message_from_file(handle)
    emailBody = ""
    if emailMessage.is_multipart():
        for part in emailMessage.walk():
            if part.get_content_type() == "text/html" or \
                    part.get_content_type() == "text/plain":
                partPayload = part.get_payload()
                emailBody = emailBody + ' ' + partPayload
    else:
        if emailMessage.get_content_type() == "text/html" or \
                emailMessage.get_content_type() == "text/plain":
            emailBody = emailMessage.get_payload()
    # Cleaning email content
    emailSubject = ''
    if 'subject' in emailMessage:
        emailSubject = self.__CleanEmailContent__(emailMessage['subject'])
    emailContent = self._myHtmlParserObj.GetParsedContentFromHtml(emailBody)
    emailContent = str(emailSubject) + " " + str(emailContent)
    emailContent = self.__CleanEmailContent__(emailContent)
    return emailContent
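
# The method above concatenates part.get_payload() directly, which returns the
# payload with its transfer encoding (e.g. base64 or quoted-printable) still in
# place. A hedged alternative sketch, not the original project's code: decode
# each part first via the standard email API before joining the text.
def extract_body(emailMessage):
    body = ""
    parts = emailMessage.walk() if emailMessage.is_multipart() else [emailMessage]
    for part in parts:
        if part.get_content_type() in ("text/plain", "text/html"):
            payload = part.get_payload(decode=True)  # bytes, transfer encoding removed
            if payload is not None:
                charset = part.get_content_charset() or "utf-8"
                body += " " + payload.decode(charset, errors="replace")
    return body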
def create_journey_instructions(steps):
    parser = MyHTMLParser()  # HTML parser for directions API data
    instruct = ""
    for step in steps:
        parser.feed(step['html_instructions'])
        instruct += parser.get_data() + ">>>>>"
    print(instruct)
    return instruct
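
# Hypothetical usage of create_journey_instructions() with Directions-API-style
# step dicts. The 'html_instructions' key comes from the function above; the
# sample data and the exact output are illustrative only and depend on how
# MyHTMLParser.get_data() manages its internal buffer between feeds.
sample_steps = [
    {'html_instructions': 'Head <b>north</b> on Main St'},
    {'html_instructions': 'Turn <b>left</b> onto 2nd Ave'},
]
directions = create_journey_instructions(sample_steps)
# e.g. "Head north on Main St>>>>>Turn left onto 2nd Ave>>>>>"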
def __init__(self, config):
    self.config = config
    self.ht = myhashtable(config)
    self.htmlparser = MyHTMLParser(self.config, self.ht)
    # build the index, then flush its artifacts to disk
    self.start_batch_processing()
    self.write_file_map()
    self.ht.write_posting_file(term_count)  # term_count is presumably defined at module level
    self.ht.write_hash_table()
def table_maker(pd_row):
    # read data from html-like file
    h = MyHTMLParser()
    h.feed(pd_row['data'])
    soup = soupparser()
    p = soup.handle_data(pd_row)
    dividendpershare = [h.type_dividendpershare, h.asofdate_dividendpershare,
                        h.reporttype_dividendpershare, h.period_dividendpershare,
                        h.currency_dividendpershare, p.data_dividendpershare,
                        h.exdate_dividendpershare, h.recorddate_dividendpershare,
                        h.paydate_dividendpershare, h.declarationdate_dividendpershare]
    totalrevenue = [h.type_totalrevenue, h.asofdate_totalrevenue,
                    h.reporttype_totalrevenue, h.period_totalrevenue,
                    h.currency_totalrevenue, p.data_totalrevenue,
                    h.exdate_totalrevenue, h.recorddate_totalrevenue,
                    h.paydate_totalrevenue, h.declarationdate_totalrevenue]
    dividend = [h.type_dividend, h.asofdate_dividend, h.reporttype_dividend,
                h.period_dividend, h.currency_dividend, p.data_dividend,
                h.exdate_dividend, h.recorddate_dividend, h.paydate_dividend,
                h.declarationdate_dividend]
    eps = [h.type_eps, h.asofdate_eps, h.reporttype_eps, h.period_eps,
           h.currency_eps, p.data_eps, h.exdate_eps, h.recorddate_eps,
           h.paydate_eps, h.declarationdate_eps]

    # sort data and make it into a dataframe
    names = ['type', 'asofdate', 'reporttype', 'period', 'currency', 'data',
             'exdate', 'recorddate', 'paydate', 'declarationdate']

    def make_dataframe(list1):
        dict1 = {names[i]: list1[i] for i in range(10)}
        dataframe1 = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in dict1.items()]))
        dataframe1 = dataframe1.fillna(method='ffill')
        return dataframe1

    dividendpershare_dataframe = make_dataframe(dividendpershare)
    totalrevenue_dataframe = make_dataframe(totalrevenue)
    dividend_dataframe = make_dataframe(dividend)
    eps_dataframe = make_dataframe(eps)
    table1 = pd.concat([dividendpershare_dataframe, totalrevenue_dataframe,
                        dividend_dataframe, eps_dataframe],
                       axis=0, ignore_index=True)
    reqId1 = [pd_row['reqId']] * len(table1['type'])
    table1['reqId'] = pd.Series(np.array(reqId1), index=table1.index)

    # format each column to put into sql
    table1['type'] = table1['type'].astype(str)
    table1['reporttype'] = table1['reporttype'].astype(str)
    table1['period'] = table1['period'].astype(str)
    table1['asofdate'] = pd.to_datetime(table1['asofdate'])
    table1['exdate'] = pd.to_datetime(table1['exdate'])
    table1['recorddate'] = pd.to_datetime(table1['recorddate'])
    table1['paydate'] = pd.to_datetime(table1['paydate'])
    table1['declarationdate'] = pd.to_datetime(table1['declarationdate'])

    # drop duplicate rows
    table1 = table1.drop_duplicates()
    return table1
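
# Hypothetical call: table_maker() expects a mapping with the raw report payload
# under 'data' and a request id under 'reqId' (key names taken from the function
# above; the payload format itself is specific to the upstream data feed, so the
# value shown here is only a placeholder).
sample_row = {'data': '<FinancialStatements>...</FinancialStatements>', 'reqId': 42}
# table = table_maker(sample_row)  # one tidy DataFrame covering all four series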
def dataFetcher(self, jobname=None):
    global masterList
    global masterDictionary
    global jobName
    global prePost
    # method overloading: fall back to the instance's job name if none is given
    if jobname is None:
        mydatafetcher = URLCreator(jobname=self.jobName)
    else:
        mydatafetcher = URLCreator(jobname=jobname)
    contents = mydatafetcher.loadUrl()
    parser = MyHTMLParser()
    parser.feed(contents)
    postdependencyList = list(set(parser.postList))
    predependencyList = list(set(parser.preList))
    # print('pre:', predependencyList)
    return predependencyList, postdependencyList
def __init__(self, emailDirPath, spamMappingFilePath):
    self._emailDirPath = emailDirPath
    self._spamIdentifierMapFilePath = spamMappingFilePath
    # {'inmail.1': 'Spam', 'inmail.10': 'Ham', ..., 'inmail.200': 'Spam'}
    self._emailFileNameToSpamOrHamMap = {}
    self._emailFileNameToSpamOrHamMap = self.__LoadFileNameToSpamOrHamMapping__(
        self._spamIdentifierMapFilePath)
    self._myHtmlParserObj = MyHTMLParser()
    self.fp = "test.txt"
    if os.path.isfile(self.fp):
        os.remove(self.fp)
    with open(self.fp, 'w') as handle:
        handle.write(str(datetime.datetime.now()))
    self.bulkList = []
    self._ESMgrObject = ElasticSearchManager(Resource.INDEX_NAME, Resource.TYPE_NAME)
def crawl(self, depth, frontier):
    if depth > self.maxdepth:
        return
    nextLevelFrontier = list()
    for url in frontier:
        # only parse while the number of crawled pages does not exceed the maximum
        if len(self.crawledlist) < self.numPages and url not in self.crawledlist:
            # pass in the URL and create the request
            request = req.Request(
                url,
                headers={
                    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
                })
            try:
                time.sleep(1)
                # send the request to the url and get the response
                data = req.urlopen(request).read().decode("utf-8")
                parser = MyHTMLParser()
                parser.feed(data)
                # handling NoneType
                if self.record(url, depth):
                    self.create_web_file(data, len(self.crawledlist))
                    print(url)
                    print("Finished:", len(self.crawledlist), "files")
                    print("current depth: ", depth)
                nextLevelFrontier += parser.urls
            # try to catch errors when encountered
            except urllib.error.HTTPError as err:
                # handling page-not-found errors
                if err.code == 404:
                    continue
                else:
                    raise
    self.crawl(depth + 1, nextLevelFrontier)
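
# crawl() assumes MyHTMLParser exposes the links found on a page through a
# `urls` attribute. The class below is a hypothetical minimal sketch that
# collects absolute hrefs from anchor tags; it is not the original class.
from html.parser import HTMLParser


class MyHTMLParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.urls = []  # absolute links discovered while feeding the page

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for name, value in attrs:
                if name == "href" and value and value.startswith("http"):
                    self.urls.append(value)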
# MyHTMLParser
from html.parser import HTMLParser


class MyHTMLParser(HTMLParser):
    def handle_starttag(self, tag, attrs):
        print("encountered a start tag: ", tag)

    def handle_endtag(self, tag):
        print("encountered an end tag : ", tag)

    def handle_data(self, data):
        print("encountered some data: ", data)


parser = MyHTMLParser()
parser.feed('<html><head><title>test</title></head>'
            '<body><h1>parse me!</h1></body></html>')
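
# Feeding that document calls the handlers in document order, printing e.g.:
#   encountered a start tag:  html
#   encountered a start tag:  head
#   encountered a start tag:  title
#   encountered some data:  test
#   encountered an end tag :  title
#   ...
#   encountered some data:  parse me!
#   encountered an end tag :  h1
#   encountered an end tag :  body
#   encountered an end tag :  html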
req.add_header(
    'User-Agent',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
)
req.add_header('Referer', 'http://gupiao.jd.com/find/12195')
req.add_header('Host', 'gupiao.jd.com')
req.add_header('Origin', 'http://gupiao.jd.com')
req.add_header(
    'Cookie',
    'TrackID=1zjctpUkfXiPPpd2-FlJw52fq9gkx9v0WGqH_4sECdaGDpJ8D_58Bqx-Bx4HQsVMYTsT5X4AEec9ZtKVXPzJEMA; pinId=EX7C17pLL2_bXrUjzBWQTQ; __jdv=204210054|direct|-|none|-|1531620946230; _jrda=3; sec_flag=e125e94ccd30d095203da363b24adad3; sec_addr=c0a8006c; wlfstk_smdl=uj4fvqhhhqq66p2ddnrgf4vw8a2cggkb; 3AB9D23F7A4B3C9B=XG5I3N4FBWQZLN7HPAC56MKB755NV4K4D6CA6ICAOGCMBJBKMFJPJFYCRFOUFX7YP4IHFLD3YJJESRXWWTFXSHEVFM; __jda=204210054.1495960752486274042302.NaN.1525092662.1531620946.23; __jdb=204210054.10.1495960752486274042302|23.1531620946; __jdc=204210054; __jdu=1495960752486274042302; _jrdb=1531621024187'
)
req.add_header('Content-Type', 'application/x-www-form-urlencoded; charset=UTF-8')

response = urllib2.urlopen(req)
string = response.read().replace("\n", "").replace("\t", "").replace(" ", "").replace("%", "")
# print string

htmlParser = MyHTMLParser()
htmlParser.feed(string)

# convert the parser object to JSON via its __dict__, dropping HTMLParser internals
parserDict = htmlParser.__dict__
try:
    parserDict.pop('interesting')
    parserDict.pop('lasttag')
    parserDict.pop('lineno')
    parserDict.pop('offset')
    parserDict.pop('cdata_elem')
    parserDict.pop('rawdata')
    parserDict.pop('_HTMLParser__starttag_text')
    parserDict.pop('index')
    # parserDict['buyNumStart'] = "100.00"
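
# Popping HTMLParser internals ('rawdata', 'lasttag', ...) out of __dict__ before
# serializing is brittle, because the set of internal attributes differs between
# Python versions. A hedged alternative sketch: whitelist only the fields the
# custom parser itself defines (the field names below are placeholders, not the
# real attributes of this script's MyHTMLParser).
import json

CUSTOM_FIELDS = ('buyNumStart', 'buyNumEnd')  # hypothetical parser attributes
parserJson = json.dumps({k: v for k, v in htmlParser.__dict__.items()
                         if k in CUSTOM_FIELDS})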
def parseItem(self, htmlText):
    parser = MyHTMLParser(htmlText, self.requestHandler)
    return parser.getJson()