Code Example #1
 def __init__(self, db, filename):
     'Harvest articles from the list of feeds in filename.'
     self.db = db
     self.filename = filename
     self.htmlparser = HtmlParser()
     feedlist = self.read_feed_list(filename)
     self.articles = self.parse_feedlist(feedlist)
Code Example #2
File: spidermanager.py Project: oujx28/Spider_study
class SpiderManager(object):
    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        content = self.downloader.download(root_url)
        #with open("content.html", 'wb') as f:
        #    f.write(content.encode('utf-8'))
        urls = self.parser.parser_url(root_url, content)
        print(urls)
        for url in urls:
            try:
                t = time.strftime("%Y%m%d%H%M%S3282", time.localtime())
                rank_url = 'http://service.library.mtime.com/Movie.api' \
                    '?Ajax_CallBack=true' \
                    '&Ajax_CallBackType=Mtime.Library.Services' \
                    '&Ajax_CallBackMethod=GetMovieOverviewRating' \
                    '&Ajax_CrossDomain=1' \
                    '&Ajax_RequestUrl=%s' \
                    '&t=%s' \
                    '&Ajax_CallBackArgument0=%s' % (url[0], t, url[1])
                rank_content = self.downloader.download(rank_url)
                data = self.parser.parser_json(rank_url, rank_content)
                self.output.store_data(data)
            except Exception as e:
                print('Crawl failed:', e)
        self.output.output_end()
        print('Crawl Finish!')
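
A minimal driver for the SpiderManager above is sketched here; the start URL is an assumption, not something shown in this snippet.

# Hypothetical entry point; the start URL is a placeholder.
if __name__ == '__main__':
    spider_manager = SpiderManager()
    spider_manager.crawl('http://movie.mtime.com/boxoffice/')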
Code Example #3
def load(id,tm,url,html,encode, xpaths):
    parser = HtmlParser(html,encode)
    parser.parse()

    db_sql =  "insert into job_detail(url,src_desc,type,title,\
    keywords,department,job_require,job_duty,\
    job_welfare,label,company,company_desc,\
    logo,salary,work_experience,\
    edu, field,location,head_count,pub_time) values("

    jd = page_pb2.JobDescription()
    js ="{\"pub_tm\":\"" + tm + "\","
    js = js + "\"url\":\"" + url + "\","
    for key in xpaths:
#        print "[ON]handle " + key
        xpath=xpaths.get(key)
        elements = parser.get_element_by_xpath(xpath,encode)
        if (len(elements) == 0):
            print "[ERR] " + key
            continue
        value = elements[0][2].encode('utf-8')
        js += "\"" + key + "\":\"" + value + "\","
#        set_pb(jd,key,value)
    fp=open("./data/"+id+".dat",'w')
    fp.write(js.rstrip(',') + "}")
    fp.close()
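
A hedged sketch of calling the load() function above; the xpath expressions, timestamp, URL, encoding, and input file are all placeholder assumptions, since only the function body is shown.

# Hypothetical call; every argument value below is a placeholder,
# and the ./data/ output directory is assumed to exist.
sample_xpaths = {'title': '//h1/text()', 'salary': '//span[@class="salary"]/text()'}
with open('job_page.html') as f:
    load('1001', '2015-06-01', 'http://example.com/job/1001',
         f.read(), 'utf-8', sample_xpaths)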
Code Example #4
File: spider.py Project: Honda-a/seotool
 def __init__(self,
              url,
              number_of_threads=20,
              allowed_urls=[],
              blocked_urls=[],
              basic_auth=(),
              depth=-1):
     self.url = url
     self.number_of_threads = number_of_threads
     self.allowed_urls = allowed_urls
     # self.blocked_urls = blocked_urls
     self.lost_url = set()
     self.basic_auth = basic_auth
     self.depth = depth
     self.crawl = True
     self.visited = {}
     self.general_visited = set()
     self.unvisited = set()
     self.general_unvisited = {self.url}
     self.fetched_url_record = dict()
     self.csv_table = CsvFormat([
         "url", "status code", "title", "keyword", "description", "h1",
         "h2", "h3", "h4", "h5", "h6", "index", "open tags",
         "external links", "h_tag_format"
     ])
     self.downloaded_pages = {}
     self.record = []
     self.url_parser = UrlParser(url)
     self.parser = HtmlParser()
     self.filemanager = FileManager()
Code Example #5
File: runner.py Project: zzszmyf/common
def load(html, encode, xpaths):
    parser = HtmlParser(html, encode)
    parser.parse()
    for key in xpaths:
        xpath = xpaths.get(key)
        elements = parser.get_element_by_xpath(xpath, encode)
        value = elements[0][2].encode('utf-8')
Code Example #6
    def _toString(self):
        htmlParser = HtmlParser('https://www.worldometers.info/coronavirus/')
        htmlParser.parse()

        timeStr = time.strftime("%d %b %Y %H:%M:%S", time.gmtime())
        text = ("Статистика зараженных на " + timeStr +
                "\nЗараженных: " + htmlParser.getContent()[0] +
                "\nУмерших: " + htmlParser.getContent()[1] +
                "\nВыздоровевших: " + htmlParser.getContent()[2])

        return text
Code Example #7
File: main.py Project: gloomyline/ML
def crawl(init_url):
    url_pool = UrlManager()
    downloader = Downloader()
    parser = HtmlParser()
    outputer = Outputer()
    temp_url = init_url
    while temp_url:
        driver = downloader.download(temp_url)
        content, temp_url = parser.parse(driver)
        outputer.write(content)
    outputer.close()
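
A minimal sketch of invoking the crawl() function above; the start URL is a placeholder assumption.

# Hypothetical entry point; the start URL is a placeholder.
if __name__ == '__main__':
    crawl('https://example.com/list?page=1')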
Code Example #8
 def parse_feed(self, feed):
     'Extract list of articles from the feed.'
     articles = []
     htmlparser = HtmlParser()
     for e in feed.entries[:1]: # read just the first entry while debugging
         article = Article(source=e.author, title=e.title, link=e.link)
         content = htmlparser.parse(e.link)
         article.content = re.sub(r' -.*$', '', content)
         article.save() # and associated word frequencies
         articles.append(article)
     return articles
Code Example #9
File: epubparser.py Project: dpr10/whooshle-insight
    def content(self):
        content = u''

        epub_reader = EpubReader(self._filename)
        epub = epub_reader.load()

        for item in epub.items:
            if isinstance(item, EpubHtml):
                html_parser = HtmlParser(html=item.get_body_content())
                content += html_parser.content() + '\n'

        return content
Code Example #10
File: runner.py Project: eJon/common
def load(id,html,encode, xpaths):
    parser = HtmlParser(html,encode)
    parser.parse()
    jd = page_pb2.JobDescription()
    js ="{";
    for key in xpaths:
#        print "[ON]handle " + key
        xpath=xpaths.get(key)
        elements = parser.get_element_by_xpath(xpath,encode)
        if (len(elements) == 0):
            print "[ERR] " + key
            continue
        value = elements[0][2].encode('utf-8')
        js += "\"" + key + "\":\"" + value + "\","
#        set_pb(jd,key,value)
    fp=open("./data/"+id+".dat",'w')
    fp.write(js.rstrip(',') + "}")
    fp.close()
Code Example #11
File: _htmlparser.py Project: Superjom/swin-2
class debug_HtmlParser:
    @dec
    def __init__(self):
        self.html='''
            <html>
                <head>
                    <title>hello world</title>
                </head>
                <body>
                    你好<b>世界</b>
                    <h1>h1这是</h1>
                    <a href="http://www.cau.edu.cn">link哈啊 1</a>
                    <a href="http://www.cau.edu.cn/hello">link 2</a>
                    <a href="http://www.cau.edu.cn/index">link 3</a>
                </body>
            </html>
        '''
        self.homeUrls = [
            'http://www.cau.edu.cn',
            'http://www.google.com.hk',
            'http://www.baidu.com',
        ]
        self.urlparser = UrlParser(self.homeUrls)
        self.htmlparser = HtmlParser(self.urlparser)

    @dec
    def init(self):
        self.htmlparser.init(self.html)

    @dec
    def transcode(self):
        self.htmlparser.transcode(self.html)

    @dec
    def getLinks(self):
        print self.htmlparser.getLinks()

    @dec
    def getSrcs(self):
        print self.htmlparser.getSrcs()

    def transXML(self):
        print self.htmlparser.d.text()
        strr = self.htmlparser.transXML("http://www.cau.edu.cn")
        f = open('text.txt', 'w')
        f.write(strr)
        f.close()
        print chardet.detect(strr)
        print strr
Code Example #12
    def diff_html_from_file(cls, fileName1, fileName2, encode):
        '''get different elements btw. two html files
        '''

        if fileName1 == "" or fileName2 == "":
            print "class differ : function :diff_html_from_file() fileName1 or fileName2 is null"
            return []

        html_str1 = file(fileName1, "rb").read()
        html_Parser1 = HtmlParser(html_str1, encode)
        elements1 = html_Parser1.parse()
        html_Parser1.saveElementsToFile(elements1, "./tmp1.txt")

        html_str2 = file(fileName2, "rb").read()
        html_Parser2 = HtmlParser(html_str2, encode)
        elements2 = html_Parser2.parse()
        html_Parser2.saveElementsToFile(elements2, "./tmp2.txt")

        diffs = cls.diff_txt_from_file("tmp1.txt", "tmp2.txt")
        return diffs
Code Example #13
File: runner.py Project: zzszmyf/common
def load(id, html, encode, xpaths):
    parser = HtmlParser(html, encode)
    parser.parse()
    jd = page_pb2.JobDescription()
    js = "{"
    for key in xpaths:
        #        print "[ON]handle " + key
        xpath = xpaths.get(key)
        elements = parser.get_element_by_xpath(xpath, encode)
        if (len(elements) == 0):
            print "[ERR] " + key
            continue
        value = elements[0][2].encode('utf-8')
        js += "\"" + key + "\":\"" + value + "\","


#        set_pb(jd,key,value)
    fp = open("./data/" + id + ".dat", 'w')
    fp.write(js.rstrip(',') + "}")
    fp.close()
Code Example #14
def iterate_folder(fp, word_to_tail_map, document_len_map, file_path):

    document_list = [document_name for document_name in listdir(file_path) if isfile(join(file_path, document_name)) and document_name.isdigit()]
    word_info = {}
    
    for document in sorted(document_list, key=lambda x:int(x)):
        
        print "doing for ", document
        if getsize(file_path+document) > 3000000:
            continue
       
        signal.alarm(3)   # three-second parsing timeout
        try:
            html_parser = HtmlParser(file_path,document,True,True)
            word_list = html_parser.get_all_words()
        except Exception as e:
            log_file_to_check = open("log_file_done_tillYY","a")
            log_file_to_check.write("Time out for %s\n"%(document))
            log_file_to_check.close()
            continue

        signal.alarm(0)

        word_to_position_map = {}
        
        current_position = 1
        for word in word_list:
            if word not in word_to_position_map:
                word_to_position_map[word] = []

            word_to_position_map[word].append(current_position)
            current_position += 1
        document_len_map[int(document)] = current_position - 1 

        for word in word_to_position_map:
            if word not in word_info:
                word_info[word] = []
            word_info[word].append((int(document),word_to_position_map[word]))            
Code Example #15
def main():
    # initialize argument parser
    parser = argparse.ArgumentParser()

    # add arguments
    parser.add_argument('url')
    parser.add_argument('keyword')

    # get arguments
    args = parser.parse_args()

    # set keyword and url from arguments
    keyword = args.keyword
    url = args.url

    # do a get request and get html from url
    response = do_request(url)

    # check if keyword is in response
    if keyword in response:
        print(Fore.BLUE + '==>' + Fore.RESET + ' {}'.format(url))
        results = process_source(response, keyword)

    # initialize html parser
    parser = HtmlParser()

    # parse links from parser
    links = parser.feed(response)

    # iterate through collected links
    for link in links:
        # get the css or js file behind the links
        response = do_request(link)

        # check if keyword is in css or js file
        if keyword in response:
            print(Fore.BLUE + '==>' + Fore.RESET + ' {}'.format(link))
            results = process_source(response, keyword)
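
A hedged invocation of the main() above, following the two positional arguments it registers; the script name is an assumption.

# Hypothetical command line; the script name is a placeholder:
#   python keyword_scan.py https://example.com/ analytics
if __name__ == '__main__':
    main()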
Code Example #16
class MySpider(object):
    def __init__(self, root_url):
        self.parser = HtmlParser()
        self.storage = DataStore()
        self._get_root_urls(root_url)

    def _get_root_urls(self, root_url):
        if os.path.exists('job_class.json'):
            pass
        else:
            new_urls = self.parser.get_url(root_url)
            self.storage.local_store(new_urls,
                                     'job_class.json')  # store the industry-category URLs to crawl

    def joburl_init(self, pagenum, path='job_class.json'):
        root_urls = self.storage.load_data(path)
        jobs_dict = {}
        for i in pagenum:
            for list in root_urls:
                jobs_dict[list +
                          str(i)] = root_urls[list] + str(i)  # build the page URLs to crawl
        self.storage.local_store(jobs_dict, 'job_page_url.json')  # store the constructed page URLs

    def company_url(self, path='job_page_url.json'):
        company_urls = self.storage.load_data(path)
        company_dicts = {}
        url_get = 0  # number of URLs fetched so far
        for company_info_url in company_urls:
            print("待爬取的行业网址总数:", len(company_urls) - url_get)
            url_get += 1
            url = company_urls[company_info_url]
            company_dicts.update(self.parser.getcompany_url(url))
            self.storage.local_store(url, 'job_page_url_old.json')  # store URLs already crawled
        self.storage.local_store(company_dicts,
                                 'company_info_url_new.json')  # store the company-info URLs

    def company_info(self, path='company_info_url_new.json'):
        company_info_urls = self.storage.load_data(path)
        url_get = 0  # number of company-info URLs fetched so far
        for company_name in company_info_urls:
            print("待爬取的公司信息网址总数:", len(company_info_urls) - url_get)
            url_get += 1
            url = company_info_urls[company_name]
            self.parser.getcompany_info(company_name, url)
            self.storage.local_store(
                url, 'compang_info_url_old.json')  # store company-info URLs already crawled

    # resume fetching company info from the last checkpoint
    def grab_increment(self):
        new_urls = self.storage.load_data('company_info_url_new.json')
        old_urls = self.storage.load_data('compang_info_url_old.json')
        for company_name in new_urls:
            new_url = new_urls[company_name]
            if new_url not in old_urls:
                self.parser.getcompany_info(company_name, new_url)
                self.storage.local_store(
                    new_url, 'compang_info_url_old.json')  # store company-info URLs already crawled
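
A hedged driver for the MySpider class above, following the method order it implies; the root URL and page range are assumptions.

# Hypothetical pipeline; the root URL and page range are placeholders.
if __name__ == '__main__':
    spider = MySpider('https://example-jobs.com/categories')
    spider.joburl_init(range(1, 6))   # build page URLs for pages 1-5
    spider.company_url()              # collect company-info URLs
    spider.company_info()             # fetch and store company details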
Code Example #17
File: differ.py Project: eJon/common
 def diff_html_from_file(cls, fileName1, fileName2,encode):
     '''get different elements btw. two html files
     '''
     
     if fileName1=="" or fileName2=="":
         print "class differ : function :diff_html_from_file() fileName1 or fileName2 is null"
         return []
         
     html_str1 = file(fileName1,"rb").read()
     html_Parser1 = HtmlParser(html_str1,encode)
     elements1 = html_Parser1.parse() 
     html_Parser1.saveElementsToFile(elements1,"./tmp1.txt")
     
     html_str2 = file(fileName2,"rb").read()
     html_Parser2 = HtmlParser(html_str2,encode)
     elements2 = html_Parser2.parse()
     html_Parser2.saveElementsToFile(elements2, "./tmp2.txt")

     diffs = cls.diff_txt_from_file("tmp1.txt", "tmp2.txt")
     return diffs
Code Example #18
class Spiderman(object):

    def __init__(self):
        self.manage = UrlManager()
        self.parser = HtmlParser()
        self.downloader = Htmldownloader()
        self.output = DataOutput()

    def crawl(self,root_url):
        self.manage.add_new_url(root_url)
        print(len(self.manage.new_urls))
        while(self.manage.has_new_url() and self.manage.old_url_size() < 100):
            try:
                new_url = self.manage.get_new_url()
                html = self.downloader.download(new_url)
                new_urls,data = self.parser.parser(new_url,html)
                self.manage.add_new_urls(new_urls)
                self.output.store_data(data=data)
                print('已经抓取%s个链接' % self.manage.old_url_size())
            except Exception as e:
                print('Crawl failed:', e)
        self.output.output_html()
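
A minimal sketch of running the Spiderman crawler above; the root URL is a placeholder assumption.

# Hypothetical entry point; the root URL is a placeholder.
if __name__ == '__main__':
    spider = Spiderman()
    spider.crawl('https://example.com/')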
Code Example #19
File: _htmlparser.py Project: Superjom/swin-2
 def __init__(self):
     self.html='''
         <html>
             <head>
                 <title>hello world</title>
             </head>
             <body>
                 你好<b>世界</b>
                 <h1>h1这是</h1>
                 <a href="http://www.cau.edu.cn">link哈啊 1</a>
                 <a href="http://www.cau.edu.cn/hello">link 2</a>
                 <a href="http://www.cau.edu.cn/index">link 3</a>
             </body>
         </html>
     '''
     self.homeUrls = [
         'http://www.cau.edu.cn',
         'http://www.google.com.hk',
         'http://www.baidu.com',
     ]
     self.urlparser = UrlParser(self.homeUrls)
     self.htmlparser = HtmlParser(self.urlparser)
Code Example #20
class Harvester():
    
    def __init__(self, db, filename):
        'Harvest articles from the list of feeds in filename.'
        self.db = db
        self.filename = filename
        self.htmlparser = HtmlParser()
        feedlist = self.read_feed_list(filename)
        self.articles = self.parse_feedlist(feedlist)

    def read_feed_list(self, filename):
        '''
        Read the feed list from a CSV file. The first item of each line
        is the URL to an RSS feed.
        '''
        feedlist = []
        reader = csv.reader(open(filename, 'rb'))
        for line in reader:
            feedlist.append(line)
        return feedlist

    def parse_feed(self, entry):
        'Extract list of articles from the feed.'
        articles = []
        (url, publisher, publisher_location) = entry
        try:
            c = urlopen(url)
        except URLError:
            print 'Failed to fetch ' + url
            return articles
        feed = feedparser.parse(c)
        # for e in feed.entries[:1]: # read just the first entry while debugging
        for e in feed.entries:
            image_link = None
            image_type = None
            for link in e.links:
                if link['rel'] == 'enclosure':
                    image_link = link['href']
                    image_type = link['type']
            article = Article(
                publisher=publisher,
                publisher_location=publisher_location,
                published_date=e.updated_parsed,
                title=e.title,
                link=e.link,
                image_link=image_link,
                image_type=image_type)
            content = self.htmlparser.parse(e.link)
            m = re.search(r'-\s*([a-zA-Z]+(,?\s+[a-zA-Z]+){0,6})$', content)
            if m:
                article.source = m.group(1)
            article.content = re.sub(r'(\\n)?\s*-\s*([a-zA-Z]+(,?\s+[a-zA-Z]+){0,6})$', '', content)
            article.store(self.db) # put article and word frequencies into couchdb
            articles.append(article)
        return articles

    def parse_feedlist(self, feedlist):
        'Parse the RSS feeds.'
        articles = []
        for entry in feedlist:
            articles += self.parse_feed(entry)
        return articles

    def __str__(self):
        return self.filename
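
A hedged sketch of constructing the Harvester above; the store() comment only says articles go into CouchDB, so the python-couchdb handle, server URL, database name, and feed-list filename are all assumptions.

# Hypothetical usage; server URL, database name, and CSV filename are placeholders.
import couchdb
db = couchdb.Server('http://localhost:5984/')['articles']
harvester = Harvester(db, 'feeds.csv')
print harvester.articles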
Code Example #21
 def __init__(self, root_url):
     self.parser = HtmlParser()
     self.storage = DataStore()
     self._get_root_urls(root_url)
Code Example #22
 def __init__(self):
     self.manage = UrlManager()
     self.parser = HtmlParser()
     self.downloader = Htmldownloader()
     self.output = DataOutput()
Code Example #23
File: spidermanager.py Project: oujx28/Spider_study
 def __init__(self):
     self.downloader = HtmlDownloader()
     self.parser = HtmlParser()
     self.output = DataOutput()
Code Example #24
File: spider.py Project: Honda-a/seotool
class Spider:
    def __init__(self,
                 url,
                 number_of_threads=20,
                 allowed_urls=[],
                 blocked_urls=[],
                 basic_auth=(),
                 depth=-1):
        self.url = url
        self.number_of_threads = number_of_threads
        self.allowed_urls = allowed_urls
        # self.blocked_urls = blocked_urls
        self.lost_url = set()
        self.basic_auth = basic_auth
        self.depth = depth
        self.crawl = True
        self.visited = {}
        self.general_visited = set()
        self.unvisited = set()
        self.general_unvisited = {self.url}
        self.fetched_url_record = dict()
        self.csv_table = CsvFormat([
            "url", "status code", "title", "keyword", "description", "h1",
            "h2", "h3", "h4", "h5", "h6", "index", "open tags",
            "external links", "h_tag_format"
        ])
        self.downloaded_pages = {}
        self.record = []
        self.url_parser = UrlParser(url)
        self.parser = HtmlParser()
        self.filemanager = FileManager()

    def start(self):
        self.fetch_html()
        while len(self.general_visited) < len(
                self.general_unvisited) and self.crawl:
            self.fetch_html()

    def fetch_html(self):
        url = self.get_url()
        if url in self.general_visited or not url:
            return
        res = self.get_html(url)
        if res.status_code >= 500:
            self.add_to_visited(url, 500)
            return False
        elif res.status_code >= 400:
            self.save_formated_data(res, url)
            self.add_to_visited(url, 400)
        elif res.status_code >= 300:
            if res.history:
                if self.url_parser.domain not in res.url:
                    return False
        elif res.status_code >= 200:
            self.save_formated_data(res, url)
            self.add_to_visited(url, 200)

    def save_formated_data(self, response, current_url):
        html = BeautifulSoup(response.content, "lxml")
        self.csv_table.create_row('data')
        h_tags = self.parser.get_all_h(html)
        update = {
            "url":
            current_url,
            "status code":
            response.status_code,
            "title":
            self.parser.get_title(html),
            "keyword":
            self.parser.get_meta_keyword(html),
            "description":
            self.parser.get_meta_description(html),
            "h1":
            self.parser.get_htag("h1", h_tags),
            "h2":
            self.parser.get_htag("h2", h_tags),
            "h3":
            self.parser.get_htag("h3", h_tags),
            "h4":
            self.parser.get_htag("h4", h_tags),
            "h5":
            self.parser.get_htag("h5", h_tags),
            "h6":
            self.parser.get_htag("h6", h_tags),
            "index":
            self.parser.get_meta_index(html),
            "open tags":
            self.find_open_tags(response.text),
            "external links":
            self.parser.get_broken_a_tags(response.text,
                                          self.url_parser.domain, current_url),
            "h_tag_format":
            self.parser.tag_structure(response.text),
        }
        if response.status_code >= 400:
            update["status code"] = str(update["status code"])
            for fetched_page_url, fetched_url_list in self.fetched_url_record.items(
            ):
                if current_url in fetched_url_list:
                    update["status code"] += f" {fetched_page_url}にあります、\n"
        self.csv_table.update_row('data', update)
        self.csv_table.add_row_to_table('data')
        fetched_urls = self.parser.get_url(html, self.url_parser.domain,
                                           current_url)
        self.add_to_unvisited(current_url, fetched_urls)

    def get_url(self):
        if not self.unvisited:
            self.unvisited = self.general_unvisited - self.general_visited
            return self.unvisited.pop()

        return self.unvisited.pop()

    def add_to_visited(self, key, *args):
        if key not in self.visited and args:
            self.visited[key] = list(args)
        self.general_visited.add(key)

    def add_to_unvisited(self, url, fetched_urls):
        self.fetched_url_record[url] = fetched_urls
        self.general_unvisited.update(fetched_urls)

    def find_open_tags(self, html):
        open_tag_finder = OpenTagFinder()
        open_tag_finder.feed(html)
        open_tag_finder.reset()
        open_tags = open_tag_finder.get_open_tags()
        return open_tags

    def get_html(self, url):
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95'
        }
        try:
            if self.basic_auth:
                return requests.get(url,
                                    headers=headers,
                                    auth=HTTPBasicAuth(self.basic_auth[0],
                                                       self.basic_auth[1]),
                                    timeout=5.0)
            else:
                return requests.get(url, headers=headers, timeout=80.0)
        except requests.exceptions.RequestException as e:
            print(e)
            self.filemanager.save_to_log(f"{e} in url {url}")
            return
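
A hedged sketch of driving the Spider class above, based only on the constructor signature and start() method shown; all argument values are assumptions.

# Hypothetical usage; the URL, thread count, and depth are placeholders.
if __name__ == '__main__':
    spider = Spider('https://example.com/', number_of_threads=5, depth=2)
    spider.start()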
Code Example #25
File: test.py Project: Honda-a/seotool
import requests
from bs4 import BeautifulSoup
from htmlparser import HtmlParser
from urlparser import UrlParser
from time import sleep
import codecs
import json
import pandas as pd

visited = set()
unvisited = set()
domain = 'www.motoji.co.jp'
siteUrl = f"https://{domain}/"
praser_url = UrlParser(siteUrl)
parser_html = HtmlParser()
DATA = []

def get_res(url):

    headers_pc = {'User-Agent': 'robot wpmake'}
    try:
        res = requests.get(url, headers=headers_pc, timeout=5.0, allow_redirects=False)
        return res
    except requests.exceptions.RequestException as e:
        print(e)
        return False

def update_data(url, status_code):

    DATA.append({"url": url, "status_code": status_code})