def video_not_null(self, response=None, children=None, url=None):
    """Re-collect the fields of one video and write them to its ``Videos`` row.

    Nothing happens (``None`` is returned) unless both ``response`` and
    ``children`` are truthy. Otherwise a request is built from
    ``self.settings.video`` with ``response.response`` passed as ``html``,
    the first request produced is crawled and its body is decoded as JSON.
    Every ``(name, rules)`` pair of ``self.settings.video_children`` whose
    ``rules`` is non-empty is then resolved through ``rules[0]``:

    * ``way == 'html'``: the first node ``get_from_html`` returns for
      ``location`` in ``response.response`` is used. ``title`` is stored
      as the UTF-8 encoded node text, falling back to the node's ``title``
      attribute when the text is empty; any other name stores the raw text.
    * ``way == 'json'``: the decoded JSON is indexed with ``key`` or,
      when there is no ``key``, with ``index`` converted to ``int``.

    The collected fields plus ``status='A'`` are applied to the ``Videos``
    rows whose ``id`` equals ``url.id_indb``.

    :param response: crawl result whose ``response`` attribute is searched.
    :param children: only tested for truthiness here.
    :param url: object whose ``id_indb`` attribute selects the row to update.
    :returns: ``None``.
    """
    if not (response and children):
        return

    video_conf = self.settings.video
    built = build_request(template=video_conf.get('request'),
                          param=video_conf.get('param'),
                          isre=video_conf.get('isre'),
                          html=response.response)
    detail = Crawler(built[0]).crawling()
    parsed = json.loads(detail.response)

    fields = {}
    for name, rules in self.settings.video_children:
        if not len(rules):
            continue
        rule = rules[0]
        source = rule.get('way')

        if source == 'html':
            nodes = get_from_html(response.response, rule.get('location'))
            node = nodes[0]
            if name != 'title':
                # Only the title is encoded; every other field keeps raw text.
                fields[name] = node.text
                continue
            text = node.text
            if text:
                fields[name] = text.encode('utf8')
            else:
                # Empty node text: use the element's title attribute instead.
                fields[name] = node.attrib.get('title', '').encode('utf8')

        elif source == 'json':
            if 'key' in rule:
                lookup = rule.get('key')
            elif 'index' in rule:
                lookup = int(rule.get('index'))
            else:
                # Neither selector present: the empty string is the lookup.
                lookup = ''
            fields[name] = parsed[lookup]

    fields['status'] = 'A'
    Videos.objects.filter(id=url.id_indb).update(**fields)
def host_not_only_url(self, children=None):
    """Crawl the host listing and assemble one record per item found.

    Returns ``None`` immediately when ``children`` is falsy. Otherwise every
    request built from ``self.settings.host`` is crawled, its body decoded
    as JSON, and each item of ``find_dict(m_json, u'aid')`` is turned into a
    record by applying the ``(key, value)`` pairs of
    ``self.settings.host_children`` (the item is read with
    ``value[0]['key']``):

    * a ``url`` that does not start with ``http://`` is prefixed with
      ``self.settings.url_format[0]``;
    * ``title`` is UTF-8 encoded;
    * the counter fields (``playcount``, ``favorite``, ``community``,
      ``upcount``, ``downcount``) are coerced to ``int``; a value that
      cannot be converted is logged and stored as ``0``.

    Each record also receives the ``id`` of the ``Sites`` row whose name is
    ``self.settings.site`` and whose status is ``u'A'``.

    NOTE(review): the records are accumulated in ``all_data`` but this body
    neither returns nor stores that list -- confirm whether that is intended.

    :param children: only tested for truthiness here.
    :returns: ``None``.
    """
    if not children:
        return None

    host = self.settings.host
    requests = build_request(host.get('request'), host.get('param'))
    counter_keys = ('playcount', 'favorite', 'community', 'upcount', 'downcount')
    # Resolved on first use so the lookup runs at most once per call, and
    # not at all when no item is found.
    site_id = None
    all_data = []
    for request in requests:
        response = Crawler(request).crawling()
        m_json = json.loads(response.response)
        for item in find_dict(m_json, u'aid'):
            save = dict()
            for key, value in self.settings.host_children:
                v = item.get(value[0].get('key'))
                if key == 'url' and not str(v).startswith('http://'):
                    v = self.settings.url_format[0] + str(v)
                if key == 'title':
                    # NOTE(review): raises AttributeError when the item has
                    # no value for the title key (v is None).
                    v = v.encode('utf8')
                if key in counter_keys and not isinstance(v, int):
                    try:
                        v = int(v)
                    except Exception:
                        # Deliberately broad: any unconvertible counter
                        # becomes 0. The message is %-formatted because v
                        # may be None or another non-string, which made the
                        # previous string concatenation raise TypeError
                        # inside this handler.
                        self.logger.error('Unknow value %s %s' % (v, key))
                        v = 0
                save[key] = v
            if site_id is None:
                site_id = Sites.objects.get(name=self.settings.site, status=u'A').id
            save.update({'site_id': site_id})
            all_data.append(save)
def video_null(self, response=None, children=None, url=None):
    """Collect the configured fields of one video from its crawled page.

    Returns ``None`` immediately when ``response`` is falsy. Otherwise each
    ``(key, value)`` pair of ``self.settings.video_children`` with a
    non-empty ``value`` is resolved through ``value[0]``:

    * ``way == 'html'``: ``get_from_html`` is run on ``response.response``
      with ``location``. When it finds nothing, the error is logged, ``url``
      is put back on ``self.urls`` and the method returns ``None``.
      Otherwise the first node's text is stored for ``title`` and for any
      rule that has no ``get`` entry.
    * ``way == 'json'``: a request is built from the rule and the first one
      is crawled (the key is skipped when no request is produced). If the
      rule has a ``split`` of the form ``start:end``, the body is cut down
      to the text between the first occurrence of each marker. The body is
      then decoded as JSON: for a dict, the first match of
      ``find_dict(d, key)`` supplies the value; an int is stored as is.
      A body that lacks a marker or is not valid JSON is logged and the
      key is skipped.

    NOTE(review): ``result`` is filled but this body neither returns nor
    stores it -- confirm whether that is intended.

    :param response: crawl result whose ``response`` attribute is searched.
    :param children: not used in this body.
    :param url: object with a ``url`` attribute; re-queued on an empty match.
    :returns: ``None``.
    """
    if not response:
        return None

    result = dict()
    for key, value in self.settings.video_children:
        if not len(value):
            continue
        rule = value[0]
        way = rule.get('way')

        if way == 'html':
            get = get_from_html(response.response, rule.get('location'))
            if not len(get):
                self.logger.error('Error response, url:' + url.url)
                self.urls.put(url)
                return None
            # Rules carrying a 'get' entry are skipped, except for the title.
            if key == 'title' or 'get' not in rule:
                result[key] = get[0].text

        elif way == 'json':
            m_split = rule.get('split', '')
            json_key = rule.get('key')
            request = build_request(template=rule.get('request'),
                                    param=rule.get('param'),
                                    isre=rule.get('isre', False),
                                    html=response.response)
            if not len(request):
                continue
            json_response = Crawler(request[0]).crawling()
            try:
                if m_split:
                    # str.index raises ValueError when a marker is absent;
                    # keeping it inside the try treats such a body like any
                    # other undecodable response instead of aborting.
                    markers = m_split.split(':')
                    m_start = json_response.response.index(markers[0])
                    m_end = json_response.response.index(markers[1])
                    json_response.response = json_response.response[m_start + 1:m_end]
                d = json.loads(json_response.response)
                if isinstance(d, dict):
                    res = find_dict(d, json_key)
                    if len(res):
                        result[key] = res[0].get(json_key)
                elif isinstance(d, int):
                    result[key] = d
            except ValueError:
                # Best effort: this key is skipped, but the failure is
                # recorded instead of being dropped silently.
                self.logger.error('Error json response, key:%s url:%s'
                                  % (key, getattr(url, 'url', None)))