    def handle_failure(self, failure):
        # Log the failed request; the retry logic below is disabled by the
        # early return and kept only for reference.
        url = failure.request.url
        log.notice("fail url={}".format(url))
        return
        if 'page' in url:
            yield scrapy.Request(url,
                                 callback=self.parse,
                                 errback=self.handle_failure,
                                 dont_filter=True)
        else:
            yield scrapy.Request(url,
                                 callback=self.parse_post,
                                 errback=self.handle_failure,
                                 dont_filter=True)
    def parse_post(self, response):
        log.notice("get res")
        log.notice("resp={}".format(response))

        # Extract the post title from the thread page; the remaining item
        # fields below are currently commented out.
        title = response.xpath(
            u'//*[@class="tr1 do_not_catch"]//h4/text()').extract()[0]
        log.notice("title={} url={}".format(title, response.url))
        # content = response.xpath(u'(//div[@class="tpc_content do_not_catch"])[1]').extract()[0]
        # ptime = response.xpath(u'(//*[@class="tr1"])[1]//*[@class="tipad"]//text()').extract()
        # ptime = ''.join(ptime)
        # ptime = re.findall(r'(\d{4}-\d{2}-\d{2}\s*\d{2}:\d{2})', ptime)
        # if ptime:
        #     ptime = ptime[0]
        #
        item = CaoliuItem()
        # item['title'] = title
        # item['cat'] = '7'
        # item['cat_name'] = u'技术讨论区'
        # item['url'] = response.url
        # item['url_md5'] = hashlib.md5(response.url).hexdigest()
        # item['publish_time'] = ptime
        # item['content'] = content
        # log.notice("item= title={} url={}".format(title, response.url))

        yield item
    def parse(self, response):
        # Collect links to individual posts from the board listing page.
        urls = response.xpath(
            u'//*[contains(@class, "tr3 t_one")]//h3//@href').extract()
        print("urls={}".format(urls))
        log.notice("urls={}".format(urls))
        # Iterate over a copy so non-post links can be dropped from the
        # original list while looping.
        for url in urls[:]:
            if 'htm_data' not in url:
                urls.remove(url)
                continue
            url = urlparse.urljoin(self.host, url)
            log.notice("url={}".format(url))
            yield scrapy.Request(url,
                                 callback=self.parse_post,
                                 errback=self.handle_failure,
                                 dont_filter=True)

        next_page = self.get_next_page(response)
        # Crude throttle; time.sleep() blocks the Scrapy reactor, so the
        # DOWNLOAD_DELAY setting is the usual way to rate-limit instead.
        time.sleep(1)
        if next_page:
            yield scrapy.Request(next_page,
                                 callback=self.parse,
                                 errback=self.handle_failure,
                                 dont_filter=True)
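
The callbacks above come from a Scrapy spider class whose definition is not shown in this excerpt. A minimal sketch of running such a spider from a script instead of the scrapy crawl command line (the import path and the class name CaoliuSpider are assumptions) could look like this:

# Sketch only: CaoliuSpider and its module path are assumed, not shown above.
from scrapy.crawler import CrawlerProcess

from myproject.spiders.caoliu import CaoliuSpider  # hypothetical import path

if __name__ == "__main__":
    process = CrawlerProcess(settings={
        "DOWNLOAD_DELAY": 1,   # polite per-request delay instead of time.sleep()
        "LOG_LEVEL": "INFO",
    })
    process.crawl(CaoliuSpider)
    process.start()            # blocks until the crawl finishes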
Example #4
    def download_one_video(self, vid, url):
        # Download one video with wget and extract a 16 kHz WAV track with
        # ffmpeg; note that vid and url are interpolated into shell commands
        # without quoting.
        try:
            print("vid={} url={}".format(vid, url))
            vdir = "{}/{}".format(self.root_dir, vid)
            cmd = "mkdir -p {}".format(vdir)
            log.notice("vid={} cmd={}".format(vid, cmd))
            os.system(cmd)
            cmd = "wget -c {} -O {}/{}.mp4 -o {}/{}.wget.log".format(
                url, vdir, vid, vdir, vid)
            log.notice("vid={} cmd={}".format(vid, cmd))
            os.system(cmd)

            # Remove any stale WAV file before extracting the audio track.
            wavfile = "{}/{}.wav".format(vdir, vid)
            if os.path.exists(wavfile):
                os.system("rm -f {}".format(wavfile))
            cmd = "ffmpeg -i {}/{}.mp4 -f wav -ar 16000 {}/{}.wav".format(
                vdir, vid, vdir, vid)
            log.notice("vid={} cmd={}".format(vid, cmd))
            os.system(cmd)
            print("vid={} url={} success".format(vid, url))
        except Exception as e:
            print("vid={} e={}".format(vid, e))
            log.fatal("vid={} e={}".format(vid, e))
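
The same download-and-convert steps can also be written with subprocess.run instead of os.system, which avoids hand-built shell strings. This is an alternative sketch, assuming wget and ffmpeg are available on PATH, not the code used above:

import os
import subprocess

def download_one_video(root_dir, vid, url):
    # Create a per-video directory and download the MP4 with wget.
    vdir = os.path.join(root_dir, vid)
    os.makedirs(vdir, exist_ok=True)
    mp4 = os.path.join(vdir, "{}.mp4".format(vid))
    wav = os.path.join(vdir, "{}.wav".format(vid))
    # -c resumes partial downloads; -O and -o set the output and log files.
    subprocess.run(["wget", "-c", url, "-O", mp4,
                    "-o", os.path.join(vdir, "{}.wget.log".format(vid))],
                   check=True)
    # -ar 16000 resamples the audio to 16 kHz; -y overwrites any stale WAV.
    subprocess.run(["ffmpeg", "-y", "-i", mp4, "-f", "wav", "-ar", "16000", wav],
                   check=True)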
Example #5
import os
import sys

from lib import log
from lib import conf

# Derive the process name and directory layout from the script's location.
process_name = os.path.basename(sys.argv[0]).split(".")[0]
print("file=%s" % os.path.abspath(__file__))
DIR_APP = os.path.dirname(os.path.abspath(__file__))
print("dir app=%s" % DIR_APP)
DIR_BASE = DIR_APP
print("base=%s" % DIR_BASE)
#sys.path.insert(0, DIR_BASE)
print("base=%s" % DIR_BASE)
DIR_LOG = DIR_BASE + "/log/"
DIR_CONF = DIR_BASE + "/conf/"
log.notice("conf=%s" % DIR_CONF)
print("conf=%s" % DIR_CONF)
DIR_DATA = DIR_BASE + "/data/"
if not os.path.exists(DIR_DATA):
    os.makedirs(DIR_DATA)
DIR_DATA_TMP = DIR_DATA + "/tmp/"
if not os.path.exists(DIR_DATA_TMP):
    os.makedirs(DIR_DATA_TMP)

CONF_IDC = conf.Conf(infile=DIR_CONF + "idc.conf")
CURRENT_IDC = CONF_IDC["idc"]
print("current idc=%s" % CURRENT_IDC)
CONF_APP = conf.Conf(infile=DIR_CONF + "app.conf", _idc=CURRENT_IDC)
CONF_APP["common"]["app_name"] = process_name
log.init(DIR_LOG, CONF_APP["common"]["app_name"], CONF_APP["common"]["debug"])
log.notice(CONF_APP["common"]["app_name"] + ".init")
Example #6
    def process_item(self, item, spider):
        print("item={}".format("1"))
        log.notice("item={}".format(item))
        # A pipeline must return the item (or raise DropItem) so that later
        # pipelines receive it.
        return item
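
For the pipeline above to receive items, it has to be enabled in the Scrapy project settings. A minimal sketch, assuming a hypothetical module path and class name for the pipeline shown here:

# settings.py sketch; "myproject.pipelines.CaoliuPipeline" is an assumed path.
ITEM_PIPELINES = {
    "myproject.pipelines.CaoliuPipeline": 300,  # lower value = earlier in the chain
}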