def handle_failure(self, failure):
    """Errback for failed requests: log the URL and optionally re-queue it.

    The failed URL is read from ``failure.request.url`` and always logged.

    Re-queuing is opt-in. Previously a bare ``return`` placed before the
    retry code made it unreachable, so the effective behaviour was
    "log only"; that remains the default. Set ``retry_failed = True`` on
    the spider to re-issue the request.

    NOTE(review): retries use ``dont_filter=True`` and have no attempt
    limit, so a permanently failing URL would be retried indefinitely
    when the flag is on -- add a counter before enabling in production.

    Args:
        failure: object exposing the failed request as ``failure.request``.

    Yields:
        scrapy.Request: a new request for the same URL (only when
        ``self.retry_failed`` is truthy).
    """
    url = failure.request.url
    log.notice("fail url={}".format(url))

    # Default keeps the historical behaviour: log and yield nothing.
    if not getattr(self, "retry_failed", False):
        return

    # URLs containing 'page' are routed back to `parse`; everything else
    # goes to `parse_post`.
    callback = self.parse if 'page' in url else self.parse_post
    yield scrapy.Request(url,
                         callback=callback,
                         errback=self.handle_failure,
                         dont_filter=True)
def parse_post(self, response):
    """Callback for a post page: extract the title and emit an item.

    Args:
        response: the downloaded page; must support ``.xpath(...)`` and
            expose ``.url``.

    Yields:
        dict: ``{'title': ..., 'url': ...}`` for the post. Nothing is
        yielded when no title node matches.
    """
    log.notice("get res")
    log.notice("resp={}".format(response))

    titles = response.xpath(
        u'//*[@class="tr1 do_not_catch"]//h4/text()').extract()
    if not titles:
        # Indexing [0] on an empty result used to raise IndexError; log the
        # miss and skip the page instead.
        log.notice("title not found url={}".format(response.url))
        return
    title = titles[0]
    log.notice("title={} url={} ".format(title, response.url))

    # TODO(review): the full CaoliuItem population (content, publish_time,
    # cat/cat_name, url_md5) had been commented out while `yield item` was
    # left live, so `item` was undefined and the callback raised NameError.
    # Until that code is restored, emit the fields that are actually
    # extracted, using the same keys the item used ('title', 'url').
    # Disabled selectors, kept for reference:
    #   content:      (//div[@class="tpc_content do_not_catch"])[1]
    #   publish time: text under (//*[@class="tr1"])[1]//*[@class="tipad"],
    #                 matched with (\d{4}-\d{2}-\d{2}\s*\d{2}:\d{2})
    yield {
        'title': title,
        'url': response.url,
    }
def parse(self, response):
    """Callback for a listing page: schedule its posts, then the next page.

    Every href found under the thread rows is printed and logged. Hrefs
    containing 'htm_data' are joined onto ``self.host`` and requested with
    ``parse_post`` as the callback; the rest are ignored. Afterwards the
    next listing page (as returned by ``self.get_next_page``) is requested
    with this method as the callback, if there is one.

    Args:
        response: the downloaded listing page.

    Yields:
        scrapy.Request: one per matching post, then at most one for the
        next listing page.
    """
    hrefs = response.xpath(
        u'//*[contains(@class, "tr3 t_one")]//h3//@href').extract()
    print("urls={}".format(hrefs))
    log.notice("urls={}".format(hrefs))

    for href in hrefs:
        if 'htm_data' in href:
            post_url = urlparse.urljoin(self.host, href)
            log.notice("url={}".format(post_url))
            yield scrapy.Request(post_url,
                                 callback=self.parse_post,
                                 errback=self.handle_failure,
                                 dont_filter=True)

    next_page = self.get_next_page(response)
    # NOTE(review): this sleep runs inside the callback and presumably
    # stalls the whole crawler for a second -- consider Scrapy's
    # DOWNLOAD_DELAY setting instead; kept here to preserve pacing.
    time.sleep(1)
    if next_page:
        yield scrapy.Request(next_page,
                             callback=self.parse,
                             errback=self.handle_failure,
                             dont_filter=True)
def download_one_video(self, vid, url):
    """Download one video with wget and convert it to a 16 kHz WAV.

    Files are written under ``<self.root_dir>/<vid>/``:
    ``<vid>.mp4`` (download), ``<vid>.wget.log`` (wget's log) and
    ``<vid>.wav`` (ffmpeg output). An existing WAV is removed before
    conversion.

    External commands are run from argument lists (no shell), so URLs
    containing ``&``, ``;``, spaces or quotes are passed to wget verbatim
    instead of being interpreted by the shell. A non-zero exit status from
    wget or ffmpeg is treated as a failure rather than being reported as
    success.

    Args:
        vid: video identifier; used as directory name and file stem.
        url: source URL handed to wget.

    Returns:
        None. Failures are printed and logged via ``log.fatal``; no
        exception propagates to the caller.
    """
    # Function-scope import so the block does not rely on the module header.
    import subprocess

    def _run(args):
        """Log the command line and return the process exit status."""
        log.notice("vid={} cmd={}".format(vid, " ".join(args)))
        return subprocess.call(args)

    try:
        print("vid={} url={}".format(vid, url))
        vdir = "{}/{}".format(self.root_dir, vid)
        mp4file = "{}/{}.mp4".format(vdir, vid)
        wavfile = "{}/{}.wav".format(vdir, vid)
        wgetlog = "{}/{}.wget.log".format(vdir, vid)

        # Equivalent of `mkdir -p`: tolerate the directory already existing
        # (including when another worker creates it concurrently).
        log.notice("vid={} cmd={}".format(vid, "mkdir -p {}".format(vdir)))
        try:
            os.makedirs(vdir)
        except OSError:
            if not os.path.isdir(vdir):
                raise

        # -c resumes a partial download; -o sends wget's own log to a file.
        status = _run(["wget", "-c", url, "-O", mp4file, "-o", wgetlog])
        if status != 0:
            raise RuntimeError("wget exited with code {}".format(status))

        # ffmpeg is invoked without -y, so clear any stale output first.
        if os.path.exists(wavfile):
            os.remove(wavfile)

        status = _run(["ffmpeg", "-i", mp4file,
                       "-f", "wav", "-ar", "16000", wavfile])
        if status != 0:
            raise RuntimeError("ffmpeg exited with code {}".format(status))

        print("vid={} url={} success".format(vid, url))
    except Exception as e:
        # Best-effort worker: report the failure and return.
        print("vid={} e={}".format(vid, e))
        log.fatal("vid={} e={}".format(vid, e))
from lib import log from lib import conf import sys process_name = os.path.basename(sys.argv[0]).split(".")[0] print("file=%s" % os.path.abspath(__file__)) DIR_APP = os.path.dirname(os.path.abspath(__file__)) print("dir app=%s" % DIR_APP) DIR_BASE = DIR_APP print("base=%s" % DIR_BASE) #sys.path.insert(0, DIR_BASE) print("base=%s" % DIR_BASE) DIR_LOG = DIR_BASE + "/log/" DIR_CONF = DIR_BASE + "/conf/" log.notice("conf=%s" % DIR_CONF) print("conf=%s" % DIR_CONF) DIR_DATA = DIR_BASE + "/data/" if not os.path.exists(DIR_DATA): os.makedirs(DIR_DATA) DIR_DATA_TMP = DIR_DATA + "/tmp/" if not os.path.exists(DIR_DATA_TMP): os.makedirs(DIR_DATA_TMP) CONF_IDC = conf.Conf(infile=DIR_CONF + "idc.conf") CURRENT_IDC = CONF_IDC["idc"] print("current idc=%s" % CURRENT_IDC) CONF_APP = conf.Conf(infile=DIR_CONF + "app.conf", _idc=CURRENT_IDC) CONF_APP["common"]["app_name"] = process_name log.init(DIR_LOG, CONF_APP["common"]["app_name"], CONF_APP["common"]["debug"]) log.notice(CONF_APP["common"]["app_name"] + ".init",
def process_item(self, item, spider):
    """Item-pipeline hook: log the item and pass it on unchanged.

    Args:
        item: the scraped item.
        spider: the spider that produced the item (unused here).

    Returns:
        The same ``item``, so that later pipeline stages and exporters
        receive it. The visible body previously fell off the end and
        returned ``None``.
    """
    print("item={}".format("1"))
    log.notice("item={}".format(item))
    return item