async def read_book(self, book_file):
    p = re.compile(r'\.\s+')
    p2 = re.compile(r"\\'")
    self._book = epub.read_epub(book_file)
    for item in self._book.get_items():
        if item.get_type() == ebooklib.ITEM_DOCUMENT:
            name = str(item.get_name())
            self._parsed_book[name] = list()
            logger.debug('==================================')
            logger.debug('NAME : ' + name)
            logger.debug('----------------------------------')
            content = str(item.get_content())
            logger.debug(content)
            parser = MyHTMLParser()
            parser.feed(content)
            result = parser.get_result()
            for string in result:
                string = p.sub('.\n', string)
                string = p2.sub("'", string)
                lines = string.split("\n")
                for line in lines:
                    new_line = str(line.lstrip("\\n")).rstrip()
                    self._parsed_book[name].append(new_line)
                    logger.debug(new_line)
                    #translated_string = await self.translate_text(new_line, 'en')
                    #content = content.replace(new_line, translated_string)
                    #logger.debug(":" + str(translated_string) + ":")
            logger.debug('==================================')
    logger.debug("Book:")
    logger.debug(str(self._parsed_book))
    logger.debug('==================================')
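# A minimal sketch of what the MyHTMLParser used by read_book() might look
# like, assuming it is an html.parser.HTMLParser subclass that accumulates
# text nodes and hands them back via get_result(). Only the class name and
# get_result() come from the usage above; the body is an illustration.
from html.parser import HTMLParser

class MyHTMLParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self._texts = []

    def handle_data(self, data):
        # Collect every non-blank text node seen while feeding HTML.
        if data.strip():
            self._texts.append(data)

    def get_result(self):
        return self._texts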
def get(self):
    parse = MyHTMLParser()
    findlist = parse.getFromS()
    url = 'http://www.genkotsu-hb.com/'
    for find in findlist:
        text = find.string
        self.response.out.write(text)
        if u'創業価格フェア' in text:  # campaign keyword: "founding-price fair"
            # Tweet: "@eibiisii They say '<text>'! - Charcoal-grill restaurant Sawayaka <url>"
            text = '%s%s%s%s' % (u'@eibiisii 「', text, u'」だって!―炭焼きレストランさわやか ', url)
            self.response.out.write(text)
            twitter = TwitterAuth()
            twitterapi = twitter.getAuth()
            twitter.update(twitterapi, text)
import re

from pyquery import PyQuery as pq


def fetch(furl):
    print 'fetch ' + furl
    wwwp = re.compile(r'(http://)?([^/]*)(/?.*)(\d{6})')
    #html = get(url)
    html = pq(url=furl)('.maxPicList').html()
    if html:
        hp = MyHTMLParser()
        hp.feed(html)
        hp.close()
        for link in hp.links:
            m = wwwp.match(link)
            if m:
                myurls.append(m.group(1) + m.group(2) + '/detail/apply/' + m.group(4) + '/?callback=?')
import re
import time

from pyquery import PyQuery as pq


def fetch(furl):
    print 'fetch ' + furl + str(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
    wwwp = re.compile(r'(http://)?([^/]*)(/?.*)(\d{6})')
    #html = get(url)
    html = pq(url=furl)('.maxPicList').html()
    #md5.update(html.encode('utf8'))
    #keytxt = md5.hexdigest()
    #global lastdigest
    #print lastdigest
    #if lastdigest == keytxt:
    #    return
    #lastdigest = keytxt
    if html:
        hp = MyHTMLParser()
        hp.feed(html)
        hp.close()
        for link in hp.links:
            m = wwwp.match(link)
            if m:
                myurls.append(m.group(1) + m.group(2) + '/detail/apply/' + m.group(4) + '/?callback=?')
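# A worked example of the wwwp pattern shared by the fetch() variants above
# (the URL is made up for illustration): group(1) captures the scheme,
# group(2) the host, and group(4) the trailing six-digit id that is spliced
# into the /detail/apply/ URL.
import re

wwwp = re.compile(r'(http://)?([^/]*)(/?.*)(\d{6})')
m = wwwp.match('http://example.com/pic/123456')
if m:
    print(m.groups())
    # ('http://', 'example.com', '/pic/', '123456')
    print(m.group(1) + m.group(2) + '/detail/apply/' + m.group(4) + '/?callback=?')
    # http://example.com/detail/apply/123456/?callback=?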
import requests

url_list = [
    'https://www.seattletimes.com/seattle-news/politics/how-amazon-gets-whatever-it-wants/',
    'https://www.seattletimes.com/seattle-news/homeless/homeless-man-dies-from-exposure-service-providers-prepare-for-more-cold-weather/',
    'https://www.seattletimes.com/seattle-news/transportation/more-snow-is-headed-toward-seattle-and-road-clearing-crews-are-getting-ready/',
    'https://www.seattletimes.com/seattle-news/politics/should-seattle-make-trims-to-neighborhood-upzones-plan-city-council-wades-into-debate/',
    'https://www.seattletimes.com/business/tensions-over-political-resistance-to-amazon-boil-over-in-new-york/',
    'https://www.seattletimes.com/seattle-news/health/washington-lawmakers-weigh-stricter-vaccine-bill-amid-outbreak/',
]

count = 0
for url2 in url_list:
    res = requests.get(url2)
    with open('file' + str(count) + '.txt', 'w') as raw_file:
        raw_file.write(url2)
        raw_file.write('\n')
        raw_file.write(str(res.status_code))
        raw_file.write('\n')
        raw_file.write(res.text)
        raw_file.write('\n')
    count += 1

res = requests.get(url_list[0])
parser = MyHTMLParser()
parser.feed(res.text)
with open('sampledata.txt', 'w') as raw_file:
    for stuff in parser.p_data:
        raw_file.write(stuff)
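# A hypothetical sketch of the MyHTMLParser assumed above: a parser whose
# p_data attribute collects the text found inside <p> tags. Only the class
# name and the p_data attribute come from the usage; the handler logic here
# is a guess at one plausible implementation.
from html.parser import HTMLParser

class MyHTMLParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.p_data = []
        self._in_p = False

    def handle_starttag(self, tag, attrs):
        if tag == 'p':
            self._in_p = True

    def handle_endtag(self, tag):
        if tag == 'p':
            self._in_p = False

    def handle_data(self, data):
        # Only keep text that appears between <p> and </p>.
        if self._in_p:
            self.p_data.append(data)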
def doParse(self):
    parse = MyHTMLParser()
    findlist = parse.getFromC()
    return findlist