def media_downloaded(self, response, request, info):
    """Extract the HLS playlist URL for a video from the downloaded page body.

    Finds the newsvidhd archive date path and a video name tagged with a
    known bitrate, then composes the akamaihd master.m3u8 URL into
    item['playlist_url'].  Pages without the expected markers are marked
    STATE_TYPE2 for alternative handling.
    """
    try:
        match = re.search(r'http://newsvidhd[^\|]+(\d{4}/\d{2}/\d{2}/)', response.body)
        try:
            item = response.meta['item']
            item['vlog'](response.body)
            if not match:
                # Unknown page layout; hand off to the TYPE2 flow.
                item.start(self.STATE_TYPE2)
                log("NIU TYPE2 for {}".format(item['raw_url']))
                return
            date = match.group(1)
            mend = response.body[match.end():]
            # The dot before "mp4" is escaped so it matches literally
            # (the original pattern used an unescaped "any char").
            match = re.search(r"([\w]+)\.mp4,200000", mend)
            if not match:
                match = re.search(r"([\w]+)\.mp4,500000", mend)
            if not match:
                # Neither bitrate variant present: report through the
                # extraction-failure handler instead of an AttributeError.
                raise ValueError("no mp4 bitrate entry found")
            vid_name = match.group(1)
            item['playlist_url'] = (
                "http://newsvidhd-vh.akamaihd.net/i/foxsports/prod/archive/"
                + date + "," + vid_name + ",.mp4.csmil/master.m3u8")
            item.finish(self.STATE_ID)
            return item
        except Exception as e:
            log("Y-dl playlist extraction failed %s: %s" % (item['raw_url'], str(e)), ERROR)
            item['vlog'](response.body)
            return item
    except Exception as e:
        format_exc(self, "media_downloaded", e)
def _spider_idle(self, spider):
    """Collect more links, starting from the place previously stopped"""
    try:
        log("Spider {0} idle start".format(self.name), DEBUG)
        if self.video_processor:
            # Reap any still-running video download processes first.
            self.video_processor.wait_all_finished(self)
        if self._links or self._existent:
            #should complete all requests before going further
            self._index_successful()
            for link, states in self._links.viewitems():
                self.logidx("%s %s" % (str(states), link))
            # "?" marks a link that never produced a final result.
            lost = sum(1 for lnk, result in self._links.viewitems() if "?" == result)
            # Non-string values are state containers; count the successful ones.
            ok = sum(1 for lnk, result in self._links.viewitems() if not type(result) is str and self._is_successful(result))
            log("Lost links: %i, OK: %i" % (lost, ok), WARNING)
            self._links.clear()
        if self.__first_page:
            return
        if self._next_page_url_interrupted:
            # Resume paging from where the previous batch was interrupted.
            log("Idle, start collecting links")
            self.logidx("Requesting {0}".format(self._next_page_url_interrupted))
            req = Spider._request(self._next_page_url_interrupted, self._collect_next_page_links)
            self._next_page_url_interrupted = ""
            self.crawler.engine.crawl(req, spider)
    except Exception as e:
        format_exc(self, "_spider_idle", e)
def handle_failed(inst, failure, request):
    """Record a failed Ooyala media download in the item's video log."""
    try:
        format_exc(inst, "media_failed", failure)
        vlog = request.meta['item']['vlog']
        vlog("Ooyala1: " + str(failure))
    except Exception as e:
        format_exc(inst, "handle_failed", e)
def __init__(self, **kw):
    """Initialise the base spider; 'less_vid' keyword limits video downloads."""
    try:
        mirror0.generic_spider.Spider.__init__(self, **kw)
        self.less_vid = kw.get('less_vid', False)
    except Exception as e:
        format_exc(self, "__init__", e)
def __init__(self, **kw):
    """Plain pass-through construction of the generic base spider."""
    try:
        mirror0.generic_spider.Spider.__init__(self, **kw)
    except Exception as e:
        format_exc(self, "__init__", e)
def media_downloaded(self, response, request, info):
    """Extract the HLS playlist URL for a video from the downloaded page body.

    Finds the newsvidhd archive date path and a video name tagged with a
    known bitrate, then composes the akamaihd master.m3u8 URL into
    item['playlist_url'].  Pages without the expected markers are marked
    STATE_TYPE2 for alternative handling.
    """
    try:
        match = re.search(r'http://newsvidhd[^\|]+(\d{4}/\d{2}/\d{2}/)', response.body)
        try:
            item = response.meta['item']
            item['vlog'](response.body)
            if not match:
                # Unknown page layout; hand off to the TYPE2 flow.
                item.start(self.STATE_TYPE2)
                log("NIU TYPE2 for {}".format(item['raw_url']))
                return
            date = match.group(1)
            mend = response.body[match.end():]
            # The dot before "mp4" is escaped so it matches literally
            # (the original pattern used an unescaped "any char").
            match = re.search(r"([\w]+)\.mp4,200000", mend)
            if not match:
                match = re.search(r"([\w]+)\.mp4,500000", mend)
            if not match:
                # Neither bitrate variant present: report through the
                # extraction-failure handler instead of an AttributeError.
                raise ValueError("no mp4 bitrate entry found")
            vid_name = match.group(1)
            item['playlist_url'] = (
                "http://newsvidhd-vh.akamaihd.net/i/foxsports/prod/archive/"
                + date + "," + vid_name + ",.mp4.csmil/master.m3u8")
            item.finish(self.STATE_ID)
            return item
        except Exception as e:
            log("Y-dl playlist extraction failed %s: %s" % (item['raw_url'], str(e)), ERROR)
            item['vlog'](response.body)
            return item
    except Exception as e:
        format_exc(self, "media_downloaded", e)
def do_login(inst, driver):
    """Log in to heraldsun.com.au through the embedded identity iframe.

    Re-raises webdriver timeouts/errors so the caller can abort the session.
    """
    try:
        driver.implicitly_wait(60)
        driver.set_page_load_timeout(60)
        log("Opening login page", DEBUG)
        try:
            driver.get("http://www.heraldsun.com.au/login")
        except TimeoutException as we:
            # Page often keeps loading extras; the login form is usable anyway.
            log("Login page timeout, continuing", INFO)
        cls_name = 'ncenvoy-identity ncenvoy-identity-login'
        xpath = "//iframe[@class='{}']".format(cls_name)
        el = driver.find_element_by_xpath(xpath)
        fname = el.get_attribute("name")
        log("Switching to frame {}".format(fname))
        driver.switch_to.frame(fname)
        # SECURITY: credentials are hard-coded; consider moving them to config.
        driver.find_element_by_id("cam_password").send_keys("infillpaper01")
        driver.find_element_by_id("cam_username").send_keys("*****@*****.**")
        driver.find_element_by_class_name("button-submit").click()
        # Disabled confirmation wait, kept for reference:
        #   xpath = '//p[contains(text(), "Thank you")]'
        #   driver.wait.until(EC.presence_of_element_located((By.XPATH, xpath)))
        log("Login submitted")
    except TimeoutException as we:
        log("Web driver: timeout at login", ERROR)
        raise
    except WebDriverException as we:
        log("Web driver: %s" % we.msg, ERROR)
        raise
    except Exception as e:
        # Fixed label: was reported as "__init__".
        format_exc(inst, "do_login", e)
        raise
def do_login(inst, driver):
    """Log in to heraldsun.com.au through the embedded identity iframe.

    Re-raises webdriver timeouts/errors so the caller can abort the session.
    """
    try:
        driver.implicitly_wait(60)
        driver.set_page_load_timeout(60)
        log("Opening login page", DEBUG)
        try:
            driver.get("http://www.heraldsun.com.au/login")
        except TimeoutException as we:
            # Page often keeps loading extras; the login form is usable anyway.
            log("Login page timeout, continuing", INFO)
        cls_name = 'ncenvoy-identity ncenvoy-identity-login'
        xpath = "//iframe[@class='{}']".format(cls_name)
        el = driver.find_element_by_xpath(xpath)
        fname = el.get_attribute("name")
        log("Switching to frame {}".format(fname))
        driver.switch_to.frame(fname)
        # SECURITY: credentials are hard-coded; consider moving them to config.
        driver.find_element_by_id("cam_password").send_keys("infillpaper01")
        driver.find_element_by_id("cam_username").send_keys("*****@*****.**")
        driver.find_element_by_class_name("button-submit").click()
        # Disabled confirmation wait, kept for reference:
        #   xpath = '//p[contains(text(), "Thank you")]'
        #   driver.wait.until(EC.presence_of_element_located((By.XPATH, xpath)))
        log("Login submitted")
    except TimeoutException as we:
        log("Web driver: timeout at login", ERROR)
        raise
    except WebDriverException as we:
        log("Web driver: %s" % we.msg, ERROR)
        raise
    except Exception as e:
        # Fixed label: was reported as "__init__".
        format_exc(inst, "do_login", e)
        raise
def get_media_requests(self, item, info):
    """Yield one download request per direct video URL attached to the item."""
    if NO_VIDEO:
        return
    try:
        for video_url in item['video_urls']:
            self._spider.start_state(item['raw_url'], self.STATE_ID)
            log("VideoDirect downloading %s " % (video_url), DEBUG)
            yield scrapy.Request(
                url=video_url,
                method="GET",
                headers={"Accept": "*/*", "User-Agent": "Mozilla"},
                meta={"item": item, "video_url": video_url},
                dont_filter=True,
            )
    except Exception as e:
        format_exc(self, "get_media_requests", e)
def media_downloaded(self, response, request, info):
    """Store the raw twitter response under its numbered slot on the item."""
    item = response.meta['item']
    try:
        slot = response.meta['data_num']
        item['twitter_data'][slot] = response
    except Exception as e:
        format_exc(self, "media_downloaded %s" % item['raw_url'], e)
def media_downloaded(self, response, request, info):
    """Record a twitter media download and finalise its per-slot state."""
    try:
        num = response.meta['data_num']
        state = TwitterPipeline1.STATE_ID % num
        Ooyala1Pipeline.handle_downloaded(self, response, state)
        response.meta['item'].finish(state)
    except Exception as e:
        format_exc(self, "media_downloaded", e)
def start_requests(self):
    """Kick off crawling with a single request for the configured start URL."""
    try:
        first = self._request(
            url_=self.start_url,
            callback_=self._collect_next_page_links,
        )
        yield first
    except Exception as e:
        format_exc(self, "start_requests", e)
def process_item(self, item, spider):
    """Launch a youtube-dl subprocess for the item's video URL.

    With duplicate suppression on, a video file name seen before is symlinked
    to the earlier download instead of being fetched again.  Spawned processes
    are tracked in self._sub_proc and reaped later by wait_all_finished().
    Returns the item unchanged.
    """
    if NO_VIDEO or ('skip_video' in item and item['skip_video']) or not item[self.download_url_field]:
        log("StreamPipeline skipping {}".format(item['raw_url']))
        return item
    spider.video_processor = self
    try:
        #check if already downloaded (or tried to) and link to previously saved path
        if self._no_duplicates:
            video_fname = self.get_video_filename(item)
            if video_fname:
                if video_fname in self.__downloaded_files:
                    ln_to = os.path.join(self.__downloaded_files[video_fname], video_fname)
                    ln_from = os.path.join(item['path'], video_fname)
                    rln = self._call(["ln", "-s", "-f", "--no-dereference", ln_to, ln_from])
                    url = item["raw_url"]
                    log("Linking {0} to {1} for {2}".format(ln_from, ln_to, url), DEBUG)
                    spider.start_state(url, self.STATE_ID)
                    spider.finalize_state(url, self.STATE_ID)
                    return item  #do not download
                else:
                    # Remember the name immediately (not when the download
                    # finishes) so parallel items of the same video are
                    # treated as duplicates of this not-yet-finished one.
                    self.__downloaded_files[video_fname] = item['path']
        logfile_path = item['vlog'].file_path
        logfile = open(logfile_path, "w", 0)  # unbuffered (Python 2)
        timeout = get_project_settings().get('DOWNLOAD_TIMEOUT', 30)
        data_dir = item['path']
        cmdline = "youtube-dl --no-warnings "
        if int(Config.value(mirror0.SECTION_COMMON, "hls_prefer_native")):
            cmdline += "--hls-prefer-native "
        cmdline += "--no-part --socket-timeout {0} ".format(timeout)
        cmdline += "-o '%s" % data_dir
        cmdline += "/%(title)s-{0}.%(ext)s' ".format(self.__vcounter)
        cmdline += item[self.download_url_field]
        logfile.write(cmdline + "\n")
        self.__vcounter += 1
        log("Starting {0} for {1}".format(item[self.download_url_field], item["raw_url"]), DEBUG)
        # SECURITY: shell=True with an interpolated URL from crawled pages;
        # quoting should be reviewed if sources become less trusted.
        self._sub_proc.append(
            (subprocess.Popen([cmdline], stdout=logfile.fileno(), stderr=logfile.fileno(), shell=True),
             logfile, logfile_path, item["raw_url"],),
        )
    except Exception as e:
        # Fixed label typo: was "porcess_item".
        format_exc(self, "process_item", e)
    return item
def start_requests(self):
    """Issue the initial request.

    When scraping a videos page, video links come straight from the start
    page, so the response goes directly to _run_item; in normal mode the
    usual link collection callback is used.
    """
    try:
        if self.NORMAL == self.mode:
            first_callback = self._collect_next_page_links
        else:
            first_callback = self._run_item
        yield self._request(url_=self.start_url, callback_=first_callback)
    except Exception as e:
        format_exc(self, "start_requests", e)
def _links_from_response(self, response): try: links = [] return links except Exception as e: format_exc(self, "_links_from_response", e) return None
def process_item(self, item, spider):
    """Extract title, body text, pictures and publish time from the item's raw HTML."""
    try:
        if self.NAME in spider.disabled_pipelines:
            return item
        selector = item['raw_html']
        url = item['raw_url']
        if not url or not selector:
            # Logged but not dropped: extraction below may still raise naturally.
            msg = "Invalid item %s" % str(item)
            log(msg, ERROR)
        #abstr = response.xpath("//p[@class='article-abstract']/text()").extract_first()
        item['title'] = selector.xpath(self._title_x).extract_first()
        if not item['title']:
            # Fall back to the document <title>.
            item['title'] = selector.xpath(
                "//head/title/text()").extract_first()
        if not item['title']:
            log("No title %s" % url, ERROR)
            raise DropItem()
        item['title'] = item['title'].strip()
        log("RawExtractor got title %s" % item['title'], DEBUG)
        item['title'] = self.encode_strip(item['title'])
        body = ""
        for p in selector.xpath(self._text_paragraph_x).extract():
            body += " " + p
        if body:
            body = body.strip()
        elif self._abstract_paragraph_x:
            # No paragraphs matched; try the abstract block instead.
            body = selector.xpath(
                self._abstract_paragraph_x).extract_first()
        if not body:
            log("No article text %s" % url, DEBUG)
            body = ""
        item['text'] = body.encode("ascii", "replace").strip(" -\n")
        item['pictures'] = selector.xpath(self._picture_x).extract()
        try:
            if self._time_format_in:
                dt_obj_localized = extract_dt_obj(selector, self._time_x, self._time_format_in)
            else:
                """Assuming ISO time with timezone on empty format list"""
                iso_s = selector.xpath(self._time_x).extract_first()
                dt_obj_localized = time_utils.dt_obj_from_iso(iso_s)
            item['time'] = time_utils.format_utc_from_localized(
                dt_obj_localized, self._time_format_out)
        except Exception as e:
            # A missing or unparsable timestamp is not fatal.
            log("No time for %s %s" % (url, str(e)), DEBUG)
            item['time'] = ""
        return self._extract_more(item, spider)
    except Exception as e:
        if type(e) == DropItem:
            raise
        format_exc(self, "process_item", e)
def get_video_filename(self, item):
    """Ask youtube-dl for the output file name of the item's video URL."""
    try:
        cmd = ("youtube-dl --no-warnings --get-filename -o '%(title)s.%(ext)s' "
               + item[self.download_url_field])
        proc = subprocess.Popen([cmd], stdout=subprocess.PIPE, stderr=None, shell=True)
        stdout_and_err = proc.communicate()
        return stdout_and_err[0].strip()
    except Exception as e:
        format_exc(self, "get_video_filename", e)
def save(self):
    """ Writes all data to disc """
    try:
        with open(self._file_path, "wb") as out_f:
            for hsh in self._hashes:
                out_f.write(_long_to_bytes(hsh))
    except (IOError, OSError) as e:
        # On Python 2 file errors raise IOError, which the previous
        # OSError-only clause missed; both are caught for logging, then
        # re-raised for the caller.
        format_exc(self, "Saving {} failed".format(self._file_path), e)
        raise
def index_log(self, message):
    """ URLs added in text form are saved to separate file for human reference
    message: string appended verbatim to the index log file
    """
    try:
        with open(self._log_file_path, "a") as f:
            f.write(message)
    except (IOError, OSError) as e:
        # Fixed: report the log file's own path (previously reported
        # self._file_path, a different file) and also catch IOError,
        # which is what Python 2 file operations raise.
        format_exc(self, "Index log write failed {}".format(self._log_file_path), e)
        raise
def _links_from_response(self, response):
    """Pull article links from list-item blocks, preferring single-column ones."""
    response = self._prepare_response(response)
    try:
        primary_x = ("//div[re:test(@class, 'list-item')]"
                     "[not(ancestor::*[re:test(@class, 'double-col')])]"
                     "/div[re:test(@class, 'inner')]/h4/a/@href")
        fallback_x = ("//div[re:test(@class, 'list-item')]"
                      "/div[re:test(@class, 'inner')]/h4/a/@href")
        links = response.xpath(primary_x).extract()
        if not links:
            # No single-column matches; accept any list-item link.
            links = response.xpath(fallback_x).extract()
        return links
    except Exception as e:
        format_exc(self, "_links_from_response", e)
        return None
def __init__(self, **kw):
    """Configure the per-URL xpath rules used to extract AFL section links."""
    try:
        mirror0.generic_spider.Spider.__init__(self, **kw)
        video_rule = (r"video/sport/afl", "//a[@class='vms-list-item module']/@href")
        #main page
        main_rule = ("sport/afl^", '//div[@class="story-block "]/a[@class="thumb-link"]/@href')
        #more-stories, clubs
        default_rule = ("", '//div[@class="story-block "]/h4[@class="heading"]/a/@href')
        self._per_url_regex_xpath = (video_rule, main_rule, default_rule)
    except Exception as e:
        format_exc(self, "__init__", e)
def media_downloaded(self, response, request, info):
    """Write the downloaded video body into the item's directory and finalise state."""
    try:
        item = response.meta['item']
        vname = os.path.split(response.meta['video_url'])[1]
        with open(os.path.join(item['path'], vname), "wb") as f:
            f.write(response.body)
        self._spider.finalize_state(item['raw_url'], self.STATE_ID)
        log("VideoDirect download complete %s for %s" % (request.url, item['raw_url']), WARNING)
    except Exception as e:
        format_exc(self, "media_downloaded", e)
def _index_successful(self):
    """Merge pending links and add every successfully finished one to the index."""
    try:
        self._links.update(self._existent)
        self._existent.clear()
        for link, state in self._links.viewitems():
            # String values are markers, not state containers; skip them.
            if type(state) is str:
                continue
            if self._is_successful(state) and self._index:
                self._index.add(link)
        if self._index:
            self._index.save()
    except Exception as e:
        format_exc(self, "_index_successful", e)
def process_item(self, item, spider):
    """Extract title, body text, pictures and publish time from the item's raw HTML."""
    try:
        if self.NAME in spider.disabled_pipelines:
            return item
        selector = item['raw_html']
        url = item['raw_url']
        if not url or not selector:
            # Logged but not dropped: extraction below may still raise naturally.
            msg = "Invalid item %s" % str(item)
            log(msg, ERROR)
        #abstr = response.xpath("//p[@class='article-abstract']/text()").extract_first()
        item['title'] = selector.xpath(self._title_x).extract_first()
        if not item['title']:
            # Fall back to the document <title>.
            item['title'] = selector.xpath("//head/title/text()").extract_first()
        if not item['title']:
            log("No title %s" % url, ERROR)
            raise DropItem()
        item['title'] = item['title'].strip()
        log("RawExtractor got title %s" % item['title'], DEBUG)
        item['title'] = self.encode_strip(item['title'])
        body = ""
        for p in selector.xpath(self._text_paragraph_x).extract():
            body += " " + p
        if body:
            body = body.strip()
        elif self._abstract_paragraph_x:
            # No paragraphs matched; try the abstract block instead.
            body = selector.xpath(self._abstract_paragraph_x).extract_first()
        if not body:
            log("No article text %s" % url, DEBUG)
            body = ""
        item['text'] = body.encode("ascii", "replace").strip(" -\n")
        item['pictures'] = selector.xpath(self._picture_x).extract()
        try:
            if self._time_format_in:
                dt_obj_localized = extract_dt_obj(selector, self._time_x, self._time_format_in)
            else:
                """Assuming ISO time with timezone on empty format list"""
                iso_s = selector.xpath(self._time_x).extract_first()
                dt_obj_localized = time_utils.dt_obj_from_iso(iso_s)
            item['time'] = time_utils.format_utc_from_localized(dt_obj_localized, self._time_format_out)
        except Exception as e:
            # A missing or unparsable timestamp is not fatal.
            log("No time for %s %s" % (url, str(e)), DEBUG)
            item['time'] = ""
        return self._extract_more(item, spider)
    except Exception as e:
        if type(e) == DropItem:
            raise
        format_exc(self, "process_item", e)
def _links_from_response(self, response):
    """Select article links; the title page uses a curated multi-section xpath."""
    try:
        on_title_page = (self.TITLE_PAGE == self.start_url)
        if on_title_page:
            xpath_s = "//section[@class='breaking-news']/header/h1[re:test(text(), 'fantasy')]/../../div/ol/li/article/header/div/h1[@itemprop='name headline']/a/@href | //section[@class='breaking-news']/header/h1[re:test(text(), 'women')]/../../div/ol/li/article/header/div/h1[@itemprop='name headline']/a/@href | //section/header/h1/a[text()='More AFL News']/../../../div/div/div/div/div/article/header/div/h1[@itemprop='name headline']/a/@href"
        else:
            xpath_s = "//article/header[re:test(@class, 'article')]/div/h1/a/@href"
        return response.xpath(xpath_s).extract()
    except Exception as e:
        format_exc(self, "_links_from_response", e)
        return None
def media_downloaded(self, response, request, info):
    """Write the downloaded video body into the item's directory and finalise state."""
    try:
        item = response.meta['item']
        vname = os.path.split(response.meta['video_url'])[1]
        with open(os.path.join(item['path'], vname), "wb") as f:
            f.write(response.body)
        self._spider.finalize_state(item['raw_url'], self.STATE_ID)
        log("VideoDirect download complete %s for %s" % (request.url, item['raw_url']), WARNING)
    except Exception as e:
        format_exc(self, "media_downloaded", e)
def __init__(self, **kw):
    """Pick VIDEO vs NORMAL mode from the start URL; video mode disables text pipelines."""
    try:
        mirror0.generic_spider.Spider.__init__(self, **kw)
        if not self.start_url.startswith(self.VIDEO_PATH):
            self.mode = self.NORMAL
        else:
            self.mode = self.VIDEO
            self.disabled_pipelines = [
                mirror0.generic_spider.text_image_pipeline.TextImagePipeline.NAME,
                mirror0.generic_spider.raw_extractor_pipeline.RawExtractorPipeline.NAME,
            ]
    except Exception as e:
        format_exc(self, "__init__", e)
def _run_item(self, response):
    """Build a raw item (url + parsed selector) from a page response."""
    try:
        url = response.request.url
        if url not in self._links:
            # Informational only; the item is still produced.
            log("Response url doesn't match: %s" % url, INFO)
        item = self._item_class(self)
        item['raw_url'] = url
        prepared = self._prepare_response(response)
        item['raw_html'] = prepared.selector
        return item
    except Exception as e:
        format_exc(self, "_run_item", e)
def _run_item(self, response):
    """Extend the base item with an output sub-directory when on the title page."""
    try:
        item = super(WatodaySpider, self)._run_item(response)
        if self.start_url.endswith(self.TITLE_PAGE):
            is_video = url_path(response.request.url).startswith("video")
            item['out_dir'] = "video" if is_video else "title-page"
        return item
    except Exception as e:
        format_exc(self, "_run_item", e)
def media_failed(self, failure, request, info):
    """Record a failed direct video download and leave a FAIL marker file.

    Fixes: the exception label said "media_downloaded"; the log line now
    reports the failure object rather than the pipeline info object.
    """
    try:
        item = request.meta['item']
        log("VideoDirect download failed %s for %s: %s" % (request.url, item['raw_url'], str(failure)), ERROR)
        # DEBUG: marker file holding the URL that failed, for post-mortem checks.
        video_url = request.meta['video_url']
        (vpath, vname) = os.path.split(video_url)
        vname = "FAIL" + vname
        with open(os.path.join(item['path'], vname), "wb") as f:
            f.write(video_url)
    except Exception as e:
        format_exc(self, "media_failed", e)
def _extract_more(self, item, spider):
    """Pick up the Ooyala embed code, if the article page carries one."""
    try:
        log("NewsExtractor start %s" % item['title'], DEBUG)
        embed_x = "//div[re:test(@class, 'vms module')]/@vms-embedcode"
        item['ooyala_id'] = item['raw_html'].xpath(embed_x).extract_first()
        if not item['ooyala_id']:
            item['ooyala_id'] = ""
            log("Not matched %s" % item['raw_url'], DEBUG)
        else:
            log("Matched %s" % item['raw_url'], DEBUG)
        return item
    except Exception as e:
        format_exc(self, "_extract_more", e)
def wait_all_finished(self, spider):
    """Block until every spawned youtube-dl process exits, then classify results.

    Return code 0: success. Return code 1 is ambiguous, so the process log is
    grepped to distinguish unsupported pages, content-ID errors and timeouts.
    Return code -2: interrupted by the user. A state started but never
    finalized marks the download as failed.
    """
    try:
        log("Waiting for video processes to complete...")
        i = len(self._sub_proc)
        for process, logfile, logfile_path, url in self._sub_proc:
            print "Left %i" % (i)
            if self._no_duplicates:
                # Mirror the download log to the console while waiting.
                tail_proc = subprocess.Popen(["tail", "-f", "-n", "1", logfile_path], stdout=None, stderr=None)
            process.communicate()
            #process.wait()
            if self._no_duplicates:
                tail_proc.terminate()
            logfile.close()
            # downloaded successfully
            if 0 == process.returncode:
                spider.start_state(url, self.STATE_ID)
                spider.finalize_state(url, self.STATE_ID)
            # return code 1 can indicate both no video on page or not supported page
            elif 1 == process.returncode:
                # check_call accepts a list of arguments only
                grepr = StreamPipeline._call(["grep", "ERROR.*Unsupported", logfile_path])
                if 0 == grepr:
                    # Unsupported page: no video, finalize as such.
                    spider.start_state(url, self.STATE_NOVID)
                    spider.finalize_state(url, self.STATE_NOVID)
                else:
                    grepr = StreamPipeline._call(["grep", "ERROR.*content ID", logfile_path])
                    spider.start_state(url, self.STATE_ID)
                    if 0 == grepr:
                        log("Content id error %s" % url, DEBUG)
                    else:
                        grepr = StreamPipeline._call(["grep", "ERROR.*timed out", logfile_path])
                        if 0 == grepr:
                            log("Ydl video timed out {0}".format(url), WARNING)
                        elif 1 == grepr:
                            log("Log state is not known %s" % (url), WARNING)
                        else:
                            log("Grep return code %i %s" % (grepr, url), WARNING)
            elif -2 == process.returncode:
                #interruped by user
                spider.start_state(url, self.STATE_ID)
                pass
            else:
                #started state without later finalizing means fail
                spider.start_state(url, self.STATE_ID)
                log("Youtube-dl return code %i %s" % (process.returncode, url), WARNING)
            i -= 1
        self._sub_proc[:] = []
    except Exception as e:
        format_exc(self, "wait_all_finished", e)
        raise
def media_downloaded(self, response, request, info):
    """Extract a direct mp4 URL from an Ooyala player response."""
    try:
        mp4_match = re.search(r"x200[^\w]+(http[^\"\,]+\.mp4)", response.body)
        item = response.meta['item']
        if mp4_match is None:
            # Second player type: this payload carries no direct URL.
            item.start(OOYALA_JS_ID)
            log("Ooyala0: type 2 %s" % item['raw_url'], WARNING)
            item['vlog'](response.body)
            return item
        item['ooyala_urls'].append(mp4_match.group(1))
        return item
    except Exception as e:
        format_exc(self, "media_downloaded", e)
def media_failed(self, failure, request, info):
    """Record a failed direct video download and leave a FAIL marker file.

    Fixes: the exception label said "media_downloaded"; the log line now
    reports the failure object rather than the pipeline info object.
    """
    try:
        item = request.meta['item']
        log("VideoDirect download failed %s for %s: %s" % (request.url, item['raw_url'], str(failure)), ERROR)
        # DEBUG: marker file holding the URL that failed, for post-mortem checks.
        video_url = request.meta['video_url']
        (vpath, vname) = os.path.split(video_url)
        vname = "FAIL" + vname
        with open(os.path.join(item['path'], vname), "wb") as f:
            f.write(video_url)
    except Exception as e:
        format_exc(self, "media_failed", e)
def _extract_more(self, item, spider): try: selector = item['raw_html'] if spider.NORMAL == spider.mode: item['video_urls'] = selector.xpath("//video/source[@type='video/mp4']/@src").extract() elif spider.VIDEO == spider.mode: pass else: assert "Wrong mode value" #['https://snappytv-a.akamaihd.net/video/928000/603p603/2016-06-04T12-05-17.467Z--35.797.mp4?token=1467913795_5faf17e8319b2988a149bfba6a686f40'] return item except Exception as e: format_exc(self, "_extract_more", e)
def _links_from_response(self, response):
    """Extract story links; title-page results are filtered to known sections."""
    try:
        if not self.start_url.endswith(self.TITLE_PAGE):
            return response.xpath(
                "//article[@class='story has-wof']/div[@class='story__wof']/h3[@class='story__headline']/a/@href | //article[re:test(@class, 'has-wof')]/h3[@class='story__headline']/a/@href"
            ).extract()
        candidates = response.xpath(
            "//article[re:test(@class, 'story')]/descendant::h3[@class='story__headline']/a/@href"
        ).extract()
        return [lnk for lnk in candidates if self.TITLE_PAGE in lnk or self.VIDEO_PATH in lnk]
    except Exception as e:
        format_exc(self, "_links_from_response", e)
        return None
def media_downloaded(self, response, request, info):
    """Extract a direct mp4 URL from an Ooyala player response."""
    try:
        mp4_match = re.search(r"x200[^\w]+(http[^\"\,]+\.mp4)", response.body)
        item = response.meta['item']
        if mp4_match is None:
            # Second player type: this payload carries no direct URL.
            item.start(OOYALA_JS_ID)
            log("Ooyala0: type 2 %s" % item['raw_url'], WARNING)
            item['vlog'](response.body)
            return item
        item['ooyala_urls'].append(mp4_match.group(1))
        return item
    except Exception as e:
        format_exc(self, "media_downloaded", e)
def __init__(self, **kw):
    """Configure the per-URL xpath rules used to extract AFL section links."""
    try:
        mirror0.generic_spider.Spider.__init__(self, **kw)
        video_rule = (r"video/sport/afl", "//a[@class='vms-list-item module']/@href")
        #main page
        main_rule = ("sport/afl^", '//div[@class="story-block "]/a[@class="thumb-link"]/@href')
        #more-stories, clubs
        default_rule = ("", '//div[@class="story-block "]/h4[@class="heading"]/a/@href')
        self._per_url_regex_xpath = (video_rule, main_rule, default_rule)
    except Exception as e:
        format_exc(self, "__init__", e)
def __init__(self, **kw):
    """Spider init: videos are fetched in-framework, so no external processes to wait on."""
    try:
        mirror0.generic_spider.Spider.__init__(self, **kw)
        self.less_vid = kw.get('less_vid', False)
        #in this spider videos are downloaded with the framework so no need to wait for additional processes
        self.video_processor = self
        nab_xpath = ("//h4[@class='partial--finals-video__caption']/a/@href"
                     " | //h3[re:test(text(), 'News')]/parent::div/parent::div/following::div/div/div[re:test(@class, 'list-item')]/div[re:test(@class, 'inner')]/h4/a/@href")
        self._per_url_regex_xpath = (
            ("nabchallenge", nab_xpath),
        )
    except Exception as e:
        format_exc(self, "__init__", e)
def _extract_more(self, item, spider):
    """Pick up the Ooyala embed code, if the article page carries one."""
    try:
        log("NewsExtractor start %s" % item['title'], DEBUG)
        embed_x = "//div[re:test(@class, 'vms module')]/@vms-embedcode"
        item['ooyala_id'] = item['raw_html'].xpath(embed_x).extract_first()
        if not item['ooyala_id']:
            item['ooyala_id'] = ""
            log("Not matched %s" % item['raw_url'], DEBUG)
        else:
            log("Matched %s" % item['raw_url'], DEBUG)
        return item
    except Exception as e:
        format_exc(self, "_extract_more", e)
def __init__(self, **kw):
    """Pick VIDEO vs NORMAL mode from the start URL; video mode disables text pipelines."""
    try:
        mirror0.generic_spider.Spider.__init__(self, **kw)
        if not self.start_url.startswith(self.VIDEO_PATH):
            self.mode = self.NORMAL
        else:
            self.mode = self.VIDEO
            self.disabled_pipelines = [
                mirror0.generic_spider.text_image_pipeline.TextImagePipeline.NAME,
                mirror0.generic_spider.raw_extractor_pipeline.RawExtractorPipeline.NAME,
            ]
    except Exception as e:
        format_exc(self, "__init__", e)
def process_item(self, item, spider):
    """Write per-item metadata (url, publish time, optional Ooyala ids) to meta.dat.

    Returns the item unchanged so the pipeline chain continues.
    """
    META_FILE = "meta.dat"
    try:
        file_path = os.path.join(item['path'], META_FILE)
        # `with` guarantees the file is closed; the previous explicit close in
        # a finally block raised NameError (masking the real error) whenever
        # open() itself failed.
        with open(file_path, "w") as f:
            f.write("url=%s\n" % item['raw_url'])
            f.write("publishedUTC=%s\n" % item['time'])
            if "ooyala_video_ids" in item and item['ooyala_video_ids']:
                f.write("data-content-id=%s" % json.dumps(
                    item['ooyala_video_ids'], separators=(", ", " ")))
    except Exception as e:
        format_exc(self, "process_item", e)
    return item
def _run_item(self, response):
    """Yield items: one base article item in NORMAL mode, or one item per
    video tile in VIDEO mode.

    Fixes the dead `assert "Wrong mode value"` (a truthy string never fires)
    by raising AssertionError, which the outer handler logs.
    """
    try:
        if self.mode == self.NORMAL:
            item = super(FoxsportsSpider, self)._run_item(response)
            if self.TITLE_PAGE == self.start_url:
                item['out_dir'] = "title_page"
            yield item
        elif self.mode == self.VIDEO:
            response = self._prepare_response(response)
            for sel_item in response.selector.xpath("//li[re:test(@class,'fiso-video-mosaic')]"):
                url = sel_item.xpath(
                    "./descendant::meta[@itemprop='contentURL']/@content").extract_first()
                if not url:
                    continue
                # Optional debug filter from config: only keep matching URLs.
                debug_link_regex = ""
                try:
                    debug_link_regex = Config.value(mirror0.SECTION_COMMON, "debug_link_regex")
                except Exception:
                    pass
                if debug_link_regex and not re.search(debug_link_regex, url):
                    continue
                title = sel_item.xpath(
                    "./descendant::meta[@itemprop='headline name']/@content").extract_first()
                time = sel_item.xpath(
                    "./descendant::meta[@itemprop='uploadDate']/@content").extract_first()
                item = self._item_class(self)
                item['video_urls'] = [url]
                item['title'] = RawExtractorPipeline.encode_strip(title)
                item['raw_url'] = url
                item['time'] = time
                # "?" marks the link as pending until a result is recorded.
                self._links[url] = "?"
                yield item
        else:
            # Was `assert "Wrong mode value"` — a no-op on a truthy string.
            raise AssertionError("Wrong mode value")
    except Exception as e:
        format_exc(self, "_run_item", e)