Example #1
def __init__(self, db, filename):
    'Harvest articles from the list of feeds in filename.'
    self.db = db
    self.filename = filename
    self.htmlparser = HtmlParser()
    feedlist = self.read_feed_list(filename)
    self.articles = self.parse_feedlist(feedlist)
Example #2
def load(html, encode, xpaths):
    parser = HtmlParser(html, encode)
    parser.parse()
    # Extract the text of the first element matching each configured xpath.
    values = {}
    for key in xpaths:
        xpath = xpaths.get(key)
        elements = parser.get_element_by_xpath(xpath, encode)
        if len(elements) == 0:  # skip keys whose xpath matched nothing
            continue
        value = elements[0][2].encode('utf-8')
        values[key] = value  # keep the extracted text keyed by field name
    return values
Example #3
def load(id, tm, url, html, encode, xpaths):
    parser = HtmlParser(html, encode)
    parser.parse()

    # INSERT statement for the job_detail table (assembled here but not used further in this snippet).
    db_sql = "insert into job_detail(url,src_desc,type,title,\
    keywords,department,job_require,job_duty,\
    job_welfare,label,company,company_desc,\
    logo,salary,work_experience,\
    edu, field,location,head_count,pub_time) values("

    jd = page_pb2.JobDescription()  # protobuf job record (population via set_pb is commented out below)
    # Assemble a JSON object by string concatenation, starting with the publish time and URL.
    js = "{\"pub_tm\":\"" + tm + "\","
    js = js + "\"url\":\"" + url + "\","
    for key in xpaths:
#        print "[ON]handle " + key
        xpath = xpaths.get(key)
        elements = parser.get_element_by_xpath(xpath, encode)
        if len(elements) == 0:
            print("[ERR] " + key)
            continue
        value = elements[0][2].encode('utf-8')
        js += "\"" + key + "\":\"" + value + "\","
#        set_pb(jd,key,value)
    # Write the record to ./data/<id>.dat, dropping the trailing comma.
    fp = open("./data/" + id + ".dat", 'w')
    fp.write(js.rstrip(',') + "}")
    fp.close()
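
This example (and the near-identical Example #9 below) assembles the JSON record by string concatenation, which produces invalid output as soon as an extracted value contains a quote, backslash, or newline. Below is a minimal Python 3 sketch of the same extraction loop using the standard json module; it assumes the HtmlParser interface shown above (constructor taking the HTML and an encoding, parse(), and get_element_by_xpath() returning tuples whose third field is the element text), and the function and file names are illustrative:

import json
from htmlparser import HtmlParser  # module path as imported in Example #14

def load_record(id, tm, url, html, encode, xpaths):
    parser = HtmlParser(html, encode)
    parser.parse()
    record = {"pub_tm": tm, "url": url}
    for key, xpath in xpaths.items():
        elements = parser.get_element_by_xpath(xpath, encode)
        if not elements:
            print("[ERR] " + key)
            continue
        record[key] = elements[0][2]  # assumed: third tuple field holds the element text, as used above
    # json.dumps handles the quoting and escaping that manual concatenation does not
    with open("./data/" + id + ".dat", "w") as fp:
        fp.write(json.dumps(record, ensure_ascii=False))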
Example #4
def __init__(self,
             url,
             number_of_threads=20,
             allowed_urls=[],
             blocked_urls=[],
             basic_auth=(),
             depth=-1):
    self.url = url
    self.number_of_threads = number_of_threads
    self.allowed_urls = allowed_urls
    # self.blocked_urls = blocked_urls
    self.lost_url = set()
    self.basic_auth = basic_auth
    self.depth = depth
    self.crawl = True
    self.visited = {}
    self.general_visited = set()
    self.unvisited = set()
    self.general_unvisited = {self.url}
    self.fetched_url_record = dict()
    self.csv_table = CsvFormat([
        "url", "status code", "title", "keyword", "description", "h1",
        "h2", "h3", "h4", "h5", "h6", "index", "open tags",
        "external links", "h_tag_format"
    ])
    self.downloaded_pages = {}
    self.record = []
    self.url_parser = UrlParser(url)
    self.parser = HtmlParser()
    self.filemanager = FileManager()
Example #5
    def _toString(self):
        htmlParser = HtmlParser('https://www.worldometers.info/coronavirus/')
        htmlParser.parse()

        timeStr = time.strftime("%d %b %Y %H:%M:%S", time.gmtime())
        # Russian status message: "Statistics of infections as of <time>", then "Infected:", "Deaths:", "Recovered:".
        text = ("Статистика зараженных на " + timeStr +
                "\nЗараженных: " + htmlParser.getContent()[0] +
                "\nУмерших: " + htmlParser.getContent()[1] +
                "\nВыздоровевших: " + htmlParser.getContent()[2])

        return text
Example #6
def crawl(init_url):
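    # Download each page, parse out its content and the next URL to visit, and write the content until no next URL is returned.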
    url_pool = UrlManager()
    downloader = Downloader()
    parser = HtmlParser()
    outputer = Outputer()
    temp_url = init_url
    while temp_url:
        driver = downloader.download(temp_url)
        content, temp_url = parser.parse(driver)
        outputer.write(content)
    outputer.close()
Example #7
def parse_feed(self, feed):
    'Extract list of articles from the feed.'
    articles = []
    htmlparser = HtmlParser()
    for e in feed.entries[:1]:  # read just the first entry while debugging
        article = Article(source=e.author, title=e.title, link=e.link)
        content = htmlparser.parse(e.link)
        article.content = re.sub(r' -.*$', '', content)
        article.save()  # and associated word frequencies
        articles.append(article)
    return articles
Example #8
    def diff_html_from_file(cls, fileName1, fileName2, encode):
        '''Get the differing elements between two HTML files.
        '''

        if fileName1 == "" or fileName2 == "":
            print("class differ : function diff_html_from_file() fileName1 or fileName2 is null")
            return []

        # Parse the first file and dump its elements to a temporary text file.
        html_str1 = open(fileName1, "rb").read()
        html_Parser1 = HtmlParser(html_str1, encode)
        elements1 = html_Parser1.parse()
        html_Parser1.saveElementsToFile(elements1, "./tmp1.txt")

        # Parse the second file the same way.
        html_str2 = open(fileName2, "rb").read()
        html_Parser2 = HtmlParser(html_str2, encode)
        elements2 = html_Parser2.parse()
        html_Parser2.saveElementsToFile(elements2, "./tmp2.txt")

        # Diff the two element dumps line by line.
        diffs = cls.diff_txt_from_file("tmp1.txt", "tmp2.txt")
        return diffs
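
A hypothetical invocation, assuming the classmethod above is defined on a class named Differ (the enclosing class is not shown in the snippet):

diffs = Differ.diff_html_from_file("old.html", "new.html", "utf-8")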
Example #9
def load(id, html, encode, xpaths):
    parser = HtmlParser(html, encode)
    parser.parse()
    jd = page_pb2.JobDescription()  # protobuf job record (population via set_pb is commented out below)
    # Assemble a JSON object by string concatenation.
    js = "{"
    for key in xpaths:
        #        print "[ON]handle " + key
        xpath = xpaths.get(key)
        elements = parser.get_element_by_xpath(xpath, encode)
        if len(elements) == 0:
            print("[ERR] " + key)
            continue
        value = elements[0][2].encode('utf-8')
        js += "\"" + key + "\":\"" + value + "\","
#        set_pb(jd,key,value)
    # Write the record to ./data/<id>.dat, dropping the trailing comma.
    fp = open("./data/" + id + ".dat", 'w')
    fp.write(js.rstrip(',') + "}")
    fp.close()
Example #10
def main():
    # initialize argument parser
    parser = argparse.ArgumentParser()

    # add arguments
    parser.add_argument('url')
    parser.add_argument('keyword')

    # get arguments
    args = parser.parse_args()

    # set keyword and url from arguments
    keyword = args.keyword
    url = args.url

    # do a get request and get html from url
    response = do_request(url)

    # check if keyword is in response
    if keyword in response:
        print(Fore.BLUE + '==>' + Fore.RESET + ' {}'.format(url))
        results = process_source(response, keyword)

    # initialize html parser
    parser = HtmlParser()

    # parse links from parser
    links = parser.feed(response)

    # iterate through collected links
    for link in links:
        # get the css or js file behind the links
        response = do_request(link)

        # check if keyword is in css or js file
        if keyword in response:
            print(Fore.BLUE + '==>' + Fore.RESET + ' {}'.format(link))
            results = process_source(response, keyword)
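
Assuming the script above is saved as, say, search.py (the actual file name is not shown), the two positional arguments map to an invocation such as:

python search.py https://example.com analytics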
Example #11
def __init__(self, root_url):
    self.parser = HtmlParser()
    self.storage = DataStore()
    self._get_root_urls(root_url)
Example #12
def __init__(self):
    self.manage = UrlManager()
    self.parser = HtmlParser()
    self.downloader = Htmldownloader()
    self.output = DataOutput()
Example #13
def __init__(self):
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
    self.output = DataOutput()
Example #14
import requests
from bs4 import BeautifulSoup
from htmlparser import HtmlParser
from urlparser import UrlParser
from time import sleep
import codecs
import json
import pandas as pd

# Shared crawl state: visited/unvisited URL sets, the target domain and start URL, parser helpers, and collected result rows.
visited = set()
unvisited = set()
domain = 'www.motoji.co.jp'
siteUrl = f"https://{domain}/"
praser_url = UrlParser(siteUrl)
parser_html = HtmlParser()
DATA = []

def get_res(url):
    '''Fetch url with a custom User-Agent, without following redirects; return False on any request error.'''
    headers_pc = {'User-Agent': 'robot wpmake'}
    try:
        res = requests.get(url, headers=headers_pc, timeout=5.0, allow_redirects=False)
        return res
    except requests.exceptions.RequestException as e:
        print(e)
        return False

def update_data(url, status_code):
    '''Record the URL and its HTTP status code.'''
    DATA.append({"url": url, "status_code": status_code})