def __init__(self, seeds, logger, thread_num=5, max_depth=9, ranks=None, index=None):
    self.init_seeds_num = len(seeds)
    self.tocrawl = {}
    for seed in seeds:
        self.tocrawl[seed] = 0  # {url: current_depth, ...}
    self.crawled = {}           # {url1: None, url2: None, ...}
    self.max_depth = max_depth  # traversal depth limit
    self.logger = logger
    self.ranks = ranks
    self.down_url = get_url.get_url(logger)
    self.indexing = indexing.indexing()
    if index:
        self.indexing.index.update(index)
    self.threadpool = thread_pool.thread_pool(thread_num)
    self.lock = threading.Lock()
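The tocrawl and crawled dicts above track per-URL depth under a shared lock; a minimal standalone sketch of that frontier bookkeeping (the seed URL and the schedule helper are made up for illustration):

import threading

# Standalone sketch of the frontier bookkeeping: tocrawl maps url -> depth,
# crawled records finished URLs, and the lock lets worker threads share both.
tocrawl = {"http://example.com/": 0}   # hypothetical seed
crawled = {}
max_depth = 9
lock = threading.Lock()

def schedule(url, depth):
    # Queue url at the given depth unless it is already known or too deep.
    if depth > max_depth:
        return
    with lock:
        if url not in crawled and url not in tocrawl:
            tocrawl[url] = depth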
Example #2
        pat = re.compile(r"^#EXT-X-STREAM-INF.+BANDWIDTH=(?P<bandwidth>\d+).*(?:\n|\r\n?)(?P<stream>.+)", re.MULTILINE)
        dst_fname = {
            "m3u8": "vlc.m3u8",
            "html": "tv_bl.html",
            "xspf": "vlc.xspf"
        }[fmt]

        req_clns = ["ts_port"]
        for cnxt in rewrite_channels(dst_fname, req_clns, fmt=fmt):
            # :TRICKY: no dedicated column of our own; the HLS URL sits right after "ts_port"
            hls_idx = cnxt.clns["ts_port"] + 1
            url = cnxt.row[hls_idx]
            
            if url.startswith("http://"):
                print(name, url)
                try:
                    with contextlib.closing(get_url.get_url(url)) as pf:
                        txt = pf.read()
                except get_url.URLError:
                    pass
                else:
                    max_bw, max_url = 0, None
                    for m in pat.finditer(txt):
                        bw = int(m.group('bandwidth'))
                        if not max_bw or max_bw < bw:
                            max_bw = bw
                            max_url = m.group('stream')
                    assert max_url
                    max_url = o_p.join(os.path.dirname(url), max_url)
                    write_channel(cnxt, max_url)
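The regex above picks the variant with the highest BANDWIDTH from an HLS master playlist; a minimal standalone sketch of that selection step, using a made-up playlist and urllib's urljoin in place of the o_p.join helper:

import re
from urllib.parse import urljoin

pat = re.compile(r"^#EXT-X-STREAM-INF.+BANDWIDTH=(?P<bandwidth>\d+).*(?:\n|\r\n?)(?P<stream>.+)",
                 re.MULTILINE)

# Made-up master playlist with two variant streams.
txt = """#EXTM3U
#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=1280000
low/index.m3u8
#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=2560000
high/index.m3u8
"""

base_url = "http://example.com/tv/master.m3u8"  # hypothetical

max_bw, max_url = 0, None
for m in pat.finditer(txt):
    bw = int(m.group("bandwidth"))
    if bw > max_bw:
        max_bw, max_url = bw, m.group("stream")

print(urljoin(base_url, max_url))  # http://example.com/tv/high/index.m3u8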
Example #3
# -*-coding:utf-8-*-
'''
Created on Aug 30, 2015

@author: yx
'''
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from scrapy.crawler import CrawlerProcess
import multiprocessing
from get_url import get_url
import time

if __name__ == '__main__':
    runner = CrawlerRunner(get_project_settings())
    d = runner.crawl('comment_scrapy')
    d.addBoth(lambda _: reactor.stop())
    reactor.run()  # the script will block here until the crawling is finished
    print "get once!!"
    get_url()
    print "update url!!" + ' at ' + time.strftime("%Y-%m-%d %H:%M", time.localtime())