def process_item(self, item, spider):
    """Launch a youtube-dl subprocess to download the item's video.

    Skips items with no URL or an explicit 'skip_video' flag.  When
    duplicate suppression is enabled, a video already downloaded (or
    in progress) is symlinked into the item's directory instead of
    being downloaded again.  Always returns the item so the pipeline
    chain continues; errors are logged via format_exc, not raised.
    """
    if NO_VIDEO or ('skip_video' in item and item['skip_video']) or not item[self.download_url_field]:
        log("StreamPipeline skipping {}".format(item['raw_url']))
        return item
    spider.video_processor = self
    try:
        # Check if already downloaded (or tried to) and link to previously saved path.
        if self._no_duplicates:
            video_fname = self.get_video_filename(item)
            if video_fname:
                if video_fname in self.__downloaded_files:
                    ln_to = os.path.join(self.__downloaded_files[video_fname], video_fname)
                    ln_from = os.path.join(item['path'], video_fname)
                    self._call(["ln", "-s", "-f", "--no-dereference", ln_to, ln_from])
                    url = item["raw_url"]
                    log("Linking {0} to {1} for {2}".format(ln_from, ln_to, url), DEBUG)
                    spider.start_state(url, self.STATE_ID)
                    spider.finalize_state(url, self.STATE_ID)
                    return item  # do not download
                else:
                    # Remember the filename immediately rather than waiting for
                    # results, so duplicates of not-yet-finished videos are
                    # excluded as well.
                    self.__downloaded_files[video_fname] = item['path']
        logfile_path = item['vlog'].file_path
        logfile = open(logfile_path, "w", 0)  # unbuffered (Python 2) so progress is visible live
        timeout = get_project_settings().get('DOWNLOAD_TIMEOUT', 30)
        data_dir = item['path']
        # NOTE(review): the command line is run through the shell with an
        # externally supplied URL appended -- a crafted URL could inject
        # shell commands.  Consider shell=False with an argument list.
        cmdline = "youtube-dl --no-warnings "
        if int(Config.value(mirror0.SECTION_COMMON, "hls_prefer_native")):
            cmdline += "--hls-prefer-native "
        cmdline += "--no-part --socket-timeout {0} ".format(timeout)
        cmdline += "-o '%s" % data_dir
        cmdline += "/%(title)s-{0}.%(ext)s' ".format(self.__vcounter)
        cmdline += item[self.download_url_field]
        logfile.write(cmdline + "\n")
        self.__vcounter += 1
        log("Starting {0} for {1}".format(item[self.download_url_field], item["raw_url"]), DEBUG)
        self._sub_proc.append(
            # was Popen([cmdline], shell=True): with shell=True the command
            # must be the string itself, not a one-element list
            (subprocess.Popen(cmdline,
                              stdout=logfile.fileno(),
                              stderr=logfile.fileno(),
                              shell=True),
             logfile,
             logfile_path,
             item["raw_url"],),
            )
    except Exception as e:
        format_exc(self, "process_item", e)  # fixed typo: was "porcess_item"
    return item
def phantom_login(self):
    """Log into the site with PhantomJS, reusing one webdriver per run."""
    global _webdriver
    if not _webdriver:
        # First call of the run: build the driver (images disabled,
        # generous resource timeout) and perform the actual login.
        caps = dict(DesiredCapabilities.PHANTOMJS)
        caps["phantomjs.page.settings.loadImages"] = "false"
        caps["phantomjs.page.settings.resourceTimeout"] = "120000"
        _webdriver = webdriver.PhantomJS(
            executable_path=Config.value(CONFIG_SECTION, "phantomjs_path"),
            desired_capabilities=caps)
        self.driver = _webdriver
        log("Starting PhantomJS login")
        do_login(self, self.driver)
    else:
        # Login happens once per application run; later calls just reuse it.
        self.driver = _webdriver
        log("Reusing logged in webdriver", DEBUG)
    self.driver.implicitly_wait(PAGE_TIMEOUT)
    self.driver.set_page_load_timeout(PAGE_TIMEOUT)
    getLogger("selenium.webdriver").setLevel(INFO)
def __init__(self, domain):
    """Loads data to memory.

    Creates index directory if needed and raises DistutilsFileError if failed.
    Raises IndexFingerprintException if the stored fingerprint file is corrupt.

    domain - index storage ID
    """
    try:
        POLYNOMIAL = 0x1AABBCCDDFFEEDDCC  # must be 65 bit long
        self._debug = 0
        self._hashes = set()
        self._crc_fun = crcmod.mkCrcFun(POLYNOMIAL, initCrc=0)
        # When run from the unit test, index directory path will be tweaked in Config
        file_path = Config.value(mirror0.SECTION_COMMON, "index_directory")
        dir_util.mkpath(file_path)
        file_name = domain + ".crc64"
        self._file_path = os.path.join(file_path, file_name)
        with open(self._file_path, "a+b") as f:
            # "a+" read position after open is implementation-defined;
            # rewind explicitly so the whole file is loaded.
            f.seek(0)
            data = f.read()
            if len(data) % CRC_LEN:
                raise IndexFingerprintException("%s is corrupt!" % file_name)
            count = len(data) // CRC_LEN  # '//' keeps the count integral on py2 and py3
            for i in range(0, count):
                string_val = data[i*CRC_LEN : (i + 1)*CRC_LEN]
                int_val = Index._string_long(string_val)
                self._hashes.add(int_val)
            log("Read %i hashes from %s" % (count, file_name))
        file_name = domain + ".log"
        self._log_file_path = os.path.join(file_path, file_name)
        # Rewrite through centralized logging
        with open(self._log_file_path, "a") as f:
            f.write("\n\nSTARTED %s\n" % time.strftime("%d %b %Y %H:%M:%S"))
    except IndexFingerprintException as e:
        format_exc(self, "__init__", e)
        log(self._file_path, ERROR)
        raise
def process_item(self, item, spider):
    """Create the item's output directory and attach a video-log helper.

    Sets item['path'] to the created article directory and item['vlog']
    to a small callable object that appends to the per-item video log.
    Delegates further work to self._create_more().  DropItem is
    re-raised; any other exception is logged via format_exc.
    """
    try:
        log("FSCreator start %s" % item['title'], DEBUG)
        item_dir = os.path.join(self._top_dir, self.__class__.getItemDir(item, spider))
        if os.path.isdir(item_dir):
            log("Article path exists, overwriting: %s" % item_dir, DEBUG)
        try:
            dir_util.mkpath(item_dir)
        except Exception as e:
            log("Can't create article directory %s : %s" % (item_dir, str(e)), ERROR)
        item['path'] = item_dir
        if not self.__vlog_dir:
            # First item: (re)create the per-spider streaming log directory.
            self.__vlog_dir = os.path.join(
                Config.value(SECTION_COMMON, "log_directory"),
                spider.name + "_streaming")
            shutil.rmtree(self.__vlog_dir, True)
            try:
                os.mkdir(self.__vlog_dir)
            except OSError as e:
                pass  # best effort: directory may already exist
            self.__need_clean = False
        logfile_path = os.path.join(self.__vlog_dir, item['title'] + ".log")
        class VideoLog:
            def __init__(self):
                # fixed: was 'logfile_path' -- consumers read '.file_path'
                self.file_path = None
        vlog = VideoLog()
        vlog.file_path = logfile_path
        vlog.__call__ = functools.partial(FSCreatorPipeline.append_file, logfile_path)
        item['vlog'] = vlog
        return self._create_more(item, spider)
    except Exception as e:
        # isinstance (was type(e) == DropItem) also honours DropItem subclasses
        if isinstance(e, DropItem):
            raise
        else:
            format_exc(self, "process_item", e)
def init_chrome_driver(timeout=30):
    """Build a Chrome webdriver with Flash plugins, images and audio disabled.

    timeout - seconds used for the attached WebDriverWait helper.
    Returns the driver with a `.wait` attribute set.
    """
    opts = Options()
    for flag in ("--disable-bundled-ppapi-flash",
                 "--disable-plugins-discovery",
                 "--disable-webaudio",
                 "--mute-audio"):
        opts.add_argument(flag)
    prefs = {
        "plugins.plugins_disabled": ["Adobe Flash Player", "Shockwave Flash"],
        "profile.managed_default_content_settings.images": 2,
    }
    opts.add_experimental_option("prefs", prefs)
    path = Config.value(SECTION_COMMON, "chromedriver_path")
    if path:
        log("Chromedriver path: %s" % path, INFO)
        driver = webdriver.Chrome(executable_path=path, chrome_options=opts)
    else:
        driver = webdriver.Chrome(chrome_options=opts)
    driver.wait = WebDriverWait(driver, timeout)
    return driver
def _run_item(self, response):
    """Yield items for a page.

    NORMAL mode: delegate to the base spider and tag title-page items.
    VIDEO mode: emit one item per video tile found on the page,
    optionally filtered by the debug_link_regex config value.
    Errors are logged via format_exc and never propagate (generator).
    """
    try:
        if self.mode == self.NORMAL:
            item = super(FoxsportsSpider, self)._run_item(response)
            if self.TITLE_PAGE == self.start_url:
                item['out_dir'] = "title_page"
            yield item
        elif self.mode == self.VIDEO:
            response = self._prepare_response(response)
            for sel_item in response.selector.xpath("//li[re:test(@class,'fiso-video-mosaic')]"):
                url = sel_item.xpath("./descendant::meta[@itemprop='contentURL']/@content").extract_first()
                if not url:
                    continue
                # Optional debug filter: keep only links matching the regex.
                debug_link_regex = ""
                try:
                    debug_link_regex = Config.value(mirror0.SECTION_COMMON, "debug_link_regex")
                except Exception:
                    pass
                if debug_link_regex and not re.search(debug_link_regex, url):
                    continue
                title = sel_item.xpath("./descendant::meta[@itemprop='headline name']/@content").extract_first()
                # renamed from 'time' to avoid shadowing the time module
                upload_time = sel_item.xpath("./descendant::meta[@itemprop='uploadDate']/@content").extract_first()
                item = self._item_class(self)
                item['video_urls'] = [url]
                item['title'] = RawExtractorPipeline.encode_strip(title)
                item['raw_url'] = url
                item['time'] = upload_time
                self._links[url] = "?"
                yield item
        else:
            # was `assert "Wrong mode value"`: a non-empty string literal is
            # always truthy, so the assert could never fire -- raise instead
            raise AssertionError("Wrong mode value: %r" % self.mode)
    except Exception as e:
        format_exc(self, "_run_item", e)
def create_start_urls(cls):
    """Return the non-empty lines of the configured start_urls value."""
    raw = Config.value(CONFIG_SECTION, "start_urls")
    return [line for line in str.splitlines(raw) if line]
def __init__(self):
    """Initialize the pipeline's subprocess list and duplicate policy."""
    # Pending download subprocess records (populated by process_item).
    self._sub_proc = []
    # Non-zero config value => link duplicate videos instead of re-downloading.
    self._no_duplicates = int(Config.value(mirror0.SECTION_COMMON, "no_duplicate_videos"))
    # Item field that holds the URL handed to the downloader.
    self.download_url_field = "raw_url"
import unittest sys.path.append("../") from mirror0 import Config Config.enable_debug_mode() import mirror0.index.index from mirror0 import Config, Index class InitOut: def __init__(self): self.count = -1 DOMAIN = "test_domain" TEST_DIR = Config.value(mirror0.SECTION_COMMON, "index_directory") PATH_NAME = join(TEST_DIR, DOMAIN) print(TEST_DIR) logging.basicConfig(format="%(message)s", level=logging.DEBUG) class TestIndex(unittest.TestCase): def test_open_empty(self): mir_idx = Index(domain=DOMAIN) print(len(mir_idx)) self.assertEqual(len(mir_idx), 0) def test_open_ok(self): with open(PATH_NAME + ".crc64", "wb") as f:
def init_idx_log(cls):
    """Create/truncate the spider's index log file and write a start header."""
    Spider._idx_file = os.path.join(
        Config.value(mirror0.SECTION_COMMON, "log_directory"),
        cls._index_file_name)
    header = "Log for %s initially started %s\n" % (
        cls.name, time.strftime("%b %d %H:%M:%S %Y"))
    with open(Spider._idx_file, "w") as f:
        f.write(header)
def _collect_next_page_links(self, response):
    """Collect article links from a listing page and schedule the next one.

    Link sources, in priority order: the debug_url config override, the
    per-url extractor, then the generic extractor.  Links already seen on
    this run or present in the index are skipped; the rest are stored in
    self._links with a "?" (pending) state.  Returns the next scrapy
    request (next listing page, or the per-link requests).
    """
    try:
        links = ""
        webdriver = ""
        # Debug override: take links straight from config instead of the page.
        try:
            self._debug_url = Config.value(mirror0.SECTION_COMMON, "debug_url")
            if self._debug_url:
                links = [url for url in str.splitlines(self._debug_url) if url]
                webdriver = "do_use"
        except Exception:
            pass
        if not links:
            links, webdriver = self._links_from_response_per_url(response)
        if not links:
            links = self._links_from_response(response)
            webdriver = ""
        if not links:
            msg = "NO LINKS %s" % response.request.url
            log(msg, WARNING)
            self.logidx(msg, response.body)
        else:
            log("Raw links: {}".format(len(links)), DEBUG)
            # Make root-relative links absolute.
            links = [(self.BASE_URL + lnk if "/" == lnk[0] else lnk) for lnk in links]
            # Optional debug truncation to the first N links.
            try:
                first_n = int(Config.value(mirror0.SECTION_COMMON, "debug_first_n"))
                links[:] = links[:first_n]
                log("ONLY FIRST {}".format(first_n))
            except NoOptionError:
                pass
        next_url = self._extract_next_url(response)
        if next_url:
            log("Next page: %s" % next_url, WARNING)
        else:
            log("FINISHED at %s" % response.request.url, WARNING)
            self.logidx("NO SHOW MORE %s" % response.request.url, response.body)
        # Optional debug filter: keep only links matching the regex.
        try:
            debug_link_regex = Config.value(mirror0.SECTION_COMMON, "debug_link_regex")
            # removed stray debug statement: print(debug_link_regex)
            if debug_link_regex:
                links = [lnk for lnk in links if re.search(debug_link_regex, lnk)]
        except Exception:
            pass
        # links duplicated within page
        duplicate = 0
        # being stored in index
        for lnk in links:
            if lnk in self._links:
                duplicate += 1
            elif self._index and self._index.has(lnk):
                log("Article link is in index, skipping: %s" % lnk, INFO)
            else:
                self._links[lnk] = "?"
        self._total_count += len(links)
        log("Links collected total: %i this page: %i to process: %i duplicate within page: %i"
            % (self._total_count, len(links), len(self._links), duplicate), WARNING)
        if INDEX_ONLY and next_url:
            return Spider._request(url_=next_url, callback_=self._collect_next_page_links)
        else:
            return self._request_next_page_links(next_url, webdriver)
    except Exception as e:
        format_exc(self, "collect_next_page_links", e)
def create_start_urls(cls):
    """Return the configured start URL list; exactly one entry is expected."""
    lines = str.splitlines(Config.value(CONFIG_SECTION, "start_urls"))
    urls = list(filter(None, lines))
    assert 1 == len(urls)
    return urls
def create_start_urls(cls):
    """Return every non-empty line of the configured start_urls value."""
    raw_value = Config.value(CONFIG_SECTION, "start_urls")
    return [line for line in str.splitlines(raw_value) if line]
""" from distutils import dir_util import functools from logging import ERROR, WARNING, DEBUG import os import os.path import shutil from scrapy.exceptions import DropItem from mirror0 import * from mirror0.sscommon.aux import log, format_exc from mirror0 import Config config_out = Config.value(SECTION_COMMON, "output_directory") class FSCreatorPipeline(object): __vlog_dir = "" def __init__(self): self._top_dir = "" def _create_more(self, item, spider): return item def process_item(self, item, spider): try: log("FSCreator start %s" % item['title'], DEBUG) #log("fs for %s" % item['title'])
def process_item(self, item, spider):
    """Launch a youtube-dl subprocess to download the item's video.

    Skips items with no URL or an explicit 'skip_video' flag.  When
    duplicate suppression is enabled, a video already downloaded (or
    in progress) is symlinked into the item's directory instead of
    being downloaded again.  Always returns the item so the pipeline
    chain continues; errors are logged via format_exc, not raised.
    """
    if NO_VIDEO or ('skip_video' in item and item['skip_video']) or not item[self.download_url_field]:
        log("StreamPipeline skipping {}".format(item['raw_url']))
        return item
    spider.video_processor = self
    try:
        # Check if already downloaded (or tried to) and link to previously saved path.
        if self._no_duplicates:
            video_fname = self.get_video_filename(item)
            if video_fname:
                if video_fname in self.__downloaded_files:
                    ln_to = os.path.join(self.__downloaded_files[video_fname], video_fname)
                    ln_from = os.path.join(item['path'], video_fname)
                    self._call(["ln", "-s", "-f", "--no-dereference", ln_to, ln_from])
                    url = item["raw_url"]
                    log("Linking {0} to {1} for {2}".format(ln_from, ln_to, url), DEBUG)
                    spider.start_state(url, self.STATE_ID)
                    spider.finalize_state(url, self.STATE_ID)
                    return item  # do not download
                else:
                    # Remember the filename immediately rather than waiting for
                    # results, so duplicates of not-yet-finished videos are
                    # excluded as well.
                    self.__downloaded_files[video_fname] = item['path']
        logfile_path = item['vlog'].file_path
        logfile = open(logfile_path, "w", 0)  # unbuffered (Python 2) so progress is visible live
        timeout = get_project_settings().get('DOWNLOAD_TIMEOUT', 30)
        data_dir = item['path']
        # NOTE(review): the command line is run through the shell with an
        # externally supplied URL appended -- a crafted URL could inject
        # shell commands.  Consider shell=False with an argument list.
        cmdline = "youtube-dl --no-warnings "
        if int(Config.value(mirror0.SECTION_COMMON, "hls_prefer_native")):
            cmdline += "--hls-prefer-native "
        cmdline += "--no-part --socket-timeout {0} ".format(timeout)
        cmdline += "-o '%s" % data_dir
        cmdline += "/%(title)s-{0}.%(ext)s' ".format(self.__vcounter)
        cmdline += item[self.download_url_field]
        logfile.write(cmdline + "\n")
        self.__vcounter += 1
        log("Starting {0} for {1}".format(item[self.download_url_field], item["raw_url"]), DEBUG)
        self._sub_proc.append(
            # was Popen([cmdline], shell=True): with shell=True the command
            # must be the string itself, not a one-element list
            (subprocess.Popen(cmdline,
                              stdout=logfile.fileno(),
                              stderr=logfile.fileno(),
                              shell=True),
             logfile,
             logfile_path,
             item["raw_url"],),
            )
    except Exception as e:
        format_exc(self, "process_item", e)  # fixed typo: was "porcess_item"
    return item
def create_start_urls(cls):
    """Return the non-empty start_urls lines from the AFL spider config section."""
    section = mirror0.afl.afl_spider.CONFIG_SECTION
    lines = str.splitlines(Config.value(section, "start_urls"))
    return [u for u in lines if u]