def video_null(self, response=None, children=None, url=None):
    """Collect the configured video fields from a crawled page.

    Iterates ``self.settings.video_children`` as ``(key, value)`` pairs.
    Only the first entry of each ``value`` is consulted; its ``'way'``
    decides how the field is obtained:

    * ``'html'`` -- ``get_from_html`` is run on the page body with the
      entry's ``'location'``. The first match's ``text`` is stored when
      the key is ``'title'`` or the entry has no ``'get'`` item; entries
      with a ``'get'`` item are currently ignored.
    * ``'json'`` -- a request is built with ``build_request``, fetched
      with ``Crawler``, optionally trimmed to the text between the two
      ``'split'`` markers (``"start:end"``) and parsed as JSON. For a dict
      the first ``find_dict`` hit for ``'key'`` is stored; an int is
      stored as is.

    Args:
        response: Crawl result whose ``response`` attribute holds the
            page body. A falsy value makes the method return at once.
        children: Not used by this method.
        url: Queue entry of the page. Only touched when an html lookup
            finds nothing: it is logged via ``url.url`` and put back on
            ``self.urls``.

    Returns:
        Always ``None``.
    """
    if not response:
        return None

    # NOTE(review): the collected fields are neither returned nor stored
    # anywhere in this method -- confirm whether a consumer is missing.
    result = {}
    for key, value in self.settings.video_children:
        if not value:
            continue
        rule = value[0]
        way = rule.get('way')

        if way == 'html':
            nodes = get_from_html(response.response, rule.get('location'))
            if not nodes:
                # ``url`` defaults to None, so guard before dereferencing.
                self.logger.error(
                    'Error response, url:'
                    + (url.url if url is not None else 'unknown'))
                if url is not None:
                    # Hand the url back to the queue for another attempt.
                    self.urls.put(url)
                return None
            if key == 'title' or 'get' not in rule:
                result[key] = nodes[0].text

        elif way == 'json':
            markers = rule.get('split', '')
            json_key = rule.get('key')
            request = build_request(template=rule.get('request'),
                                    param=rule.get('param'),
                                    isre=rule.get('isre', False),
                                    html=response.response)
            if not request:
                continue
            json_response = Crawler(request[0]).crawling()
            try:
                if markers:
                    # Keep only the text between the two markers. A
                    # missing marker makes str.index raise ValueError,
                    # handled below exactly like malformed JSON.
                    parts = markers.split(':')
                    body = json_response.response
                    start = body.index(parts[0])
                    end = body.index(parts[1])
                    json_response.response = body[start + 1:end]
                data = json.loads(json_response.response)
                if isinstance(data, dict):
                    matches = find_dict(data, json_key)
                    if matches:
                        result[key] = matches[0].get(json_key)
                elif isinstance(data, int):
                    result[key] = data
            except ValueError as err:
                # Best effort: one bad field must not abort the others,
                # but leave a trace instead of failing silently.
                self.logger.error(
                    'Invalid json response for key %s: %s' % (key, err))
def run(self):
    """Poll ``self.urls`` and crawl each entry until told to stop.

    The loop ends when ``self.stat.ncompare(status.STATUS_RUNNING)`` is
    truthy (presumably "status is no longer RUNNING" -- confirm against
    the status helper) or once the queue has come back empty more than
    30 times. Every url gets fresh ``Settings`` built from its
    ``url_tip``, is fetched with ``Crawler`` and is then dispatched on
    ``url.url_type``:

    * ``'host'``  -> ``host_only_url`` / ``host_not_only_url``
    * ``'video'`` -> ``video_null`` / ``video_not_null``

    Other url types are fetched but not processed further.
    """
    import re
    import time

    # NOTE(review): this counter is never reset after a url is obtained,
    # so the limit of 30 empty polls is cumulative over the whole run,
    # not per idle period -- confirm that this is intended.
    idle_polls = 0
    while True:
        if self.stat.ncompare(status.STATUS_RUNNING):
            break

        url = self.urls.get()
        if not url:
            if idle_polls >= 30:
                break
            idle_polls += 1
            if idle_polls % 10 == 0:
                self.logger.info('Job(%d) waiting for url...(%d)'
                                 % (self.thread_num, idle_polls))
            time.sleep(1)
            continue

        self.settings = Settings(url.url_tip)
        # The snapshot flag is only ever passed on for host pages.
        want_snapshot = self.settings.snapshot and url.url_type == 'host'
        response = Crawler(m_url=url.url, snapshot=want_snapshot).crawling()
        # Truthiness instead of len(): a body of None must be skipped
        # like an empty one rather than raising TypeError.
        if not response.response:
            self.logger.error('None response')
            continue

        if url.url_type == 'host':
            host_children = self.settings.host.get('children')
            # Rewrite whatever charset the page declares to UTF-8.
            response.response = re.sub(r'charset=(\w*)', 'charset=UTF-8',
                                       response.response)
            if self.settings.host.get('onlyurl'):
                self.host_only_url(response=response,
                                   children=host_children)
            else:
                self.host_not_only_url(children=host_children)
        elif url.url_type == 'video':
            # Video page: pick the handler from the 'null' setting.
            video_children = self.settings.video.get('children')
            if self.settings.video.get('null'):
                self.video_null(response=response,
                                children=video_children, url=url)
            else:
                self.video_not_null(response=response,
                                    children=video_children, url=url)