def __init__(self, db, filename):
    'Harvest articles from the list of feeds in filename.'
    self.db = db
    self.filename = filename
    self.htmlparser = HtmlParser()
    feedlist = self.read_feed_list(filename)
    self.articles = self.parse_feedlist(feedlist)
def load(html, encode, xpaths):
    parser = HtmlParser(html, encode)
    parser.parse()
    for key in xpaths:
        xpath = xpaths.get(key)
        elements = parser.get_element_by_xpath(xpath, encode)
        value = elements[0][2].encode('utf-8')
def load(id, tm, url, html, encode, xpaths):
    parser = HtmlParser(html, encode)
    parser.parse()
    db_sql = "insert into job_detail(url,src_desc,type,title,\
        keywords,department,job_require,job_duty,\
        job_welfare,label,company,company_desc,\
        logo,salary,work_experience,\
        edu, field,location,head_count,pub_time) values("
    jd = page_pb2.JobDescription()
    js = "{\"pub_tm\":\"" + tm + "\","
    js = js + "\"url\":\"" + url + "\","
    for key in xpaths:
        # print "[ON]handle " + key
        xpath = xpaths.get(key)
        elements = parser.get_element_by_xpath(xpath, encode)
        if (len(elements) == 0):
            print "[ERR] " + key
            continue
        value = elements[0][2].encode('utf-8')
        js += "\"" + key + "\":\"" + value + "\","
        # set_pb(jd,key,value)
    fp = open("./data/" + id + ".dat", 'w')
    fp.write(js.rstrip(',') + "}")
    fp.close()
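# The load() above assembles its JSON record by concatenating strings, which
# silently produces invalid JSON whenever a scraped value contains a double
# quote or a backslash.  A minimal, self-contained sketch of the same write
# step using the standard json module; the function name, field names and
# values below are placeholders for illustration, not taken from the real
# xpaths configuration.
import json

def write_record_sketch(path, fields):
    # json.dumps escapes quotes, backslashes and non-ASCII text for us.
    with open(path, "w") as fp:
        fp.write(json.dumps(fields))

write_record_sketch("example.dat", {
    "url": "http://example.com/job/1",
    "pub_tm": "2020-01-01",
    "title": 'Backend "Python" Engineer',
})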
def __init__(self, url, number_of_threads=20, allowed_urls=[], blocked_urls=[],
             basic_auth=(), depth=-1):
    self.url = url
    self.number_of_threads = number_of_threads
    self.allowed_urls = allowed_urls
    # self.blocked_urls = blocked_urls
    self.lost_url = set()
    self.basic_auth = basic_auth
    self.depth = depth
    self.crawl = True
    self.visited = {}
    self.general_visited = set()
    self.unvisited = set()
    self.general_unvisited = {self.url}
    self.fetched_url_record = dict()
    self.csv_table = CsvFormat([
        "url", "status code", "title", "keyword", "description",
        "h1", "h2", "h3", "h4", "h5", "h6",
        "index", "open tags", "external links", "h_tag_format"
    ])
    self.downloaded_pages = {}
    self.record = []
    self.url_parser = UrlParser(url)
    self.parser = HtmlParser()
    self.filemanager = FileManager()
def _toString(self):
    htmlParser = HtmlParser('https://www.worldometers.info/coronavirus/')
    htmlParser.parse()
    timeStr = time.strftime("%d %b %Y %H:%M:%S", time.gmtime())
    text = ("Infection statistics as of " + timeStr +
            "\nInfected: " + htmlParser.getContent()[0] +
            "\nDeaths: " + htmlParser.getContent()[1] +
            "\nRecovered: " + htmlParser.getContent()[2])
    return text
def crawl(init_url):
    url_pool = UrlManager()
    downloader = Downloader()
    parser = HtmlParser()
    outputer = Outputer()
    temp_url = init_url
    while temp_url:
        driver = downloader.download(temp_url)
        content, temp_url = parser.parse(driver)
        outputer.write(content)
    outputer.close()
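# crawl() above drives a webdriver-based Downloader, but the overall pattern
# is just "fetch page, extract content and the next URL, repeat until there is
# no next URL".  A self-contained sketch of that loop using requests and a
# rel="next" regex; the start URL, page limit and link pattern are
# illustrative assumptions, not the project's real Downloader/HtmlParser
# behaviour.
import re
import requests

def crawl_sketch(start_url, max_pages=3):
    url = start_url
    pages = []
    while url and len(pages) < max_pages:
        resp = requests.get(url, timeout=10)
        pages.append(resp.text)
        # Follow a <link rel="next" href="..."> if the page advertises one.
        match = re.search(r'<link[^>]+rel="next"[^>]+href="([^"]+)"', resp.text)
        url = match.group(1) if match else None
    return pages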
def parse_feed(self, feed):
    'Extract list of articles from the feed.'
    articles = []
    htmlparser = HtmlParser()
    for e in feed.entries[:1]:  # read just the first entry while debugging
        article = Article(source=e.author, title=e.title, link=e.link)
        content = htmlparser.parse(e.link)
        article.content = re.sub(r' -.*$', '', content)
        article.save()  # and associated word frequencies
        articles.append(article)
    return articles
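# parse_feed() above expects an already-parsed feed object with .entries whose
# items expose .author, .title and .link; that shape matches the feedparser
# library.  A small sketch of producing such an object, assuming feedparser is
# what the harvester uses upstream (the feed URL is a placeholder).
import feedparser

def fetch_feed_sketch(feed_url="https://example.com/rss.xml"):
    feed = feedparser.parse(feed_url)
    for entry in feed.entries[:1]:  # first entry only, as above
        print(entry.title, entry.link)
    return feed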
def diff_html_from_file(cls, fileName1, fileName2, encode):
    '''Get the elements that differ between two HTML files.'''
    if fileName1 == "" or fileName2 == "":
        print "class differ : function diff_html_from_file() : fileName1 or fileName2 is empty"
        return []
    html_str1 = file(fileName1, "rb").read()
    html_Parser1 = HtmlParser(html_str1, encode)
    elements1 = html_Parser1.parse()
    html_Parser1.saveElementsToFile(elements1, "./tmp1.txt")
    html_str2 = file(fileName2, "rb").read()
    html_Parser2 = HtmlParser(html_str2, encode)
    elements2 = html_Parser2.parse()
    html_Parser2.saveElementsToFile(elements2, "./tmp2.txt")
    diffs = cls.diff_txt_from_file("tmp1.txt", "tmp2.txt")
    return diffs
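# diff_html_from_file() above dumps both element lists to temporary files and
# delegates the comparison to diff_txt_from_file().  A self-contained sketch
# of that text-diff step using the standard difflib module; the function name
# and file paths are placeholders, not the class's real implementation.
import difflib

def diff_txt_sketch(path1, path2):
    with open(path1) as f1, open(path2) as f2:
        lines1, lines2 = f1.readlines(), f2.readlines()
    # unified_diff yields only the changed lines plus a little context.
    return list(difflib.unified_diff(lines1, lines2,
                                     fromfile=path1, tofile=path2))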
def load(id, html, encode, xpaths):
    parser = HtmlParser(html, encode)
    parser.parse()
    jd = page_pb2.JobDescription()
    js = "{"
    for key in xpaths:
        # print "[ON]handle " + key
        xpath = xpaths.get(key)
        elements = parser.get_element_by_xpath(xpath, encode)
        if (len(elements) == 0):
            print "[ERR] " + key
            continue
        value = elements[0][2].encode('utf-8')
        js += "\"" + key + "\":\"" + value + "\","
        # set_pb(jd,key,value)
    fp = open("./data/" + id + ".dat", 'w')
    fp.write(js.rstrip(',') + "}")
    fp.close()
def main():
    # initialize argument parser
    parser = argparse.ArgumentParser()
    # add arguments
    parser.add_argument('url')
    parser.add_argument('keyword')
    # get arguments
    args = parser.parse_args()
    # set keyword and url from arguments
    keyword = args.keyword
    url = args.url
    # do a get request and get html from url
    response = do_request(url)
    # check if keyword is in response
    if keyword in response:
        print(Fore.BLUE + '==>' + Fore.RESET + ' {}'.format(url))
        results = process_source(response, keyword)
    # initialize html parser
    parser = HtmlParser()
    # parse links from parser
    links = parser.feed(response)
    # iterate through collected links
    for link in links:
        # get the css or js file behind the links
        response = do_request(link)
        # check if keyword is in css or js file
        if keyword in response:
            print(Fore.BLUE + '==>' + Fore.RESET + ' {}'.format(link))
            results = process_source(response, keyword)
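# main() above relies on a project-specific HtmlParser whose feed() returns
# the collected links.  A self-contained sketch of that extraction step with
# the standard library's html.parser, gathering stylesheet and script URLs;
# the class name and the choice of tags/attributes are assumptions about what
# "links" means here, not the project's actual parser.
from html.parser import HTMLParser

class AssetLinkParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if tag == "link" and "href" in attrs:
            self.links.append(attrs["href"])
        elif tag == "script" and "src" in attrs:
            self.links.append(attrs["src"])

# Usage: parser = AssetLinkParser(); parser.feed(html_text); parser.links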
def __init__(self, root_url):
    self.parser = HtmlParser()
    self.storage = DataStore()
    self._get_root_urls(root_url)
def __init__(self):
    self.manage = UrlManager()
    self.parser = HtmlParser()
    self.downloader = Htmldownloader()
    self.output = DataOutput()
def __init__(self):
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
    self.output = DataOutput()
import requests
from bs4 import BeautifulSoup
from htmlparser import HtmlParser
from urlparser import UrlParser
from time import sleep
import codecs
import json
import pandas as pd

visited = set()
unvisited = set()
domain = 'www.motoji.co.jp'
siteUrl = f"https://{domain}/"
praser_url = UrlParser(siteUrl)
parser_html = HtmlParser()
DATA = []


def get_res(url):
    headers_pc = {'User-Agent': 'robot wpmake'}
    try:
        res = requests.get(url, headers=headers_pc, timeout=5.0, allow_redirects=False)
        return res
    except requests.exceptions.RequestException as e:
        print(e)
        return False


def update_data(url, status_code):
    DATA.append({"url": url, "status_code": status_code})
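# get_res() above disables redirects, so a 301/302 response carries its target
# in the Location header rather than being followed automatically.  A small
# sketch of how get_res/update_data might be combined to record each URL's
# status and queue the redirect target; the check_url name and the queueing
# policy are assumptions, not code from this module.
def check_url(url):
    res = get_res(url)
    if res is False:
        update_data(url, "request failed")
        return
    update_data(url, res.status_code)
    if res.status_code in (301, 302):
        target = res.headers.get("Location")
        if target and target not in visited:
            unvisited.add(target)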