コード例 #1
0
ファイル: job.py プロジェクト: hengheng0haha/spiderx
 def video_not_null(self, response=None, children=None, url=None):
     """Collect the configured video fields and write them to the video row.

     One extra request is built from ``self.settings.video`` (using the HTML
     of ``response``), crawled, and its body parsed as JSON. Then every
     ``(key, value)`` pair of ``self.settings.video_children`` is resolved
     through the first rule in ``value``:

     * ``way == 'html'``: the element found by ``get_from_html`` at the
       rule's ``location``; its text is stored. For ``title`` the element's
       ``title`` attribute is used when the text is empty, and the value is
       UTF-8 encoded.
     * ``way == 'json'``: ``json_response[rule['key']]`` or, when only
       ``index`` is configured, ``json_response[int(rule['index'])]``.

     Finally ``status`` is set to ``'A'`` and the ``Videos`` row(s) whose id
     equals ``url.id_indb`` are updated with the collected values.

     :param response: crawl result of the video page; its ``.response``
         attribute holds the page body.
     :param children: only checked for truthiness here.
     :param url: queue entry providing ``.url`` and ``.id_indb``.
     :return: always ``None``. Nothing is done when ``response`` or
         ``children`` is falsy. When an HTML rule matches nothing, the error
         is logged, ``url`` is put back on ``self.urls`` and no row is
         updated (same handling as ``video_null``).
     """
     if not (response and children):
         return None

     video = self.settings.video
     request = build_request(template=video.get('request'),
                             param=video.get('param'),
                             isre=video.get('isre'),
                             html=response.response)[0]
     resp = Crawler(request).crawling()
     json_response = json.loads(resp.response)

     result = dict()
     for key, value in self.settings.video_children:
         if not len(value):
             continue
         rule = value[0]  # only the first rule of each field is consulted
         way = rule.get('way')
         if way == 'html':
             found = get_from_html(response.response, rule.get('location'))
             if not len(found):
                 # Previously an IndexError on found[0]; handle a page that
                 # lacks the element the same way video_null does.
                 self.logger.error('Error response, url:' + url.url)
                 self.urls.put(url)
                 return None
             text = found[0].text
             if key == 'title':
                 if not text:
                     # Empty element text: fall back to its title attribute.
                     text = found[0].attrib.get('title', '')
                 result[key] = text.encode('utf8')
             else:
                 result[key] = text
         elif way == 'json':
             if 'key' in rule:
                 json_key = rule.get('key')
             elif 'index' in rule:
                 json_key = int(rule.get('index'))
             else:
                 # NOTE(review): with neither 'key' nor 'index' configured
                 # the lookup below uses '' and will normally raise.
                 json_key = ''
             result[key] = json_response[json_key]

     result.update({'status': 'A'})
     Videos.objects.filter(id=url.id_indb).update(**result)
コード例 #2
0
ファイル: job.py プロジェクト: hengheng0haha/spiderx
 def host_not_only_url(self, children=None):
     """Crawl the host listing requests and build one record per found item.

     Requests are built from ``self.settings.host`` (``request`` template and
     ``param``). Each crawled body is parsed as JSON and searched with
     ``find_dict(..., u'aid')``. For every item returned, each
     ``(key, value)`` pair of ``self.settings.host_children`` reads
     ``item.get(value[0]['key'])`` and normalises it:

     * ``url``: prefixed with ``self.settings.url_format[0]`` unless it
       already starts with ``http://``;
     * ``title``: UTF-8 encoded;
     * counter fields (``playcount``, ``favorite``, ``community``,
       ``upcount``, ``downcount``): coerced to ``int``; values that cannot
       be converted are logged and replaced by ``0``.

     Every non-empty record also receives ``site_id``, the id of the
     ``Sites`` row matching ``self.settings.site`` with status ``u'A'``.

     :param children: only checked for truthiness; a falsy value makes the
         method return ``None`` immediately.
     :return: ``None``.

     NOTE(review): ``all_data`` is accumulated but neither returned nor
     stored in the code visible here -- confirm against the full file.
     """
     if not children:
         return None
     requests = build_request(self.settings.host.get('request'), self.settings.host.get('param'))
     counters = ('playcount', 'favorite', 'community', 'upcount', 'downcount')
     # The site id does not depend on the item or key, so it is fetched at
     # most once (lazily, so no query is issued when nothing was found)
     # instead of once per key of every item.
     site_id = None
     all_data = []
     for request in requests:
         response = Crawler(request).crawling()
         m_json = json.loads(response.response)
         for item in find_dict(m_json, u'aid'):
             save = dict()
             for key, value in self.settings.host_children:
                 v = item.get(value[0].get('key'))
                 if key == 'url' and not str(v).startswith('http://'):
                     v = self.settings.url_format[0] + str(v)
                 if key == 'title':
                     v = v.encode('utf8')
                 if key in counters and not isinstance(v, int):
                     try:
                         v = int(v)
                     except (TypeError, ValueError, OverflowError):
                         # %-formatting instead of '+': v may be None or
                         # another non-string, which made the old message
                         # construction itself raise TypeError.
                         self.logger.error('Unknow value %s %s' % (v, key))
                         v = 0
                 save[key] = v
             if save:
                 if site_id is None:
                     site_id = Sites.objects.get(name=self.settings.site, status=u'A').id
                 save['site_id'] = site_id
             all_data.append(save)
コード例 #3
0
ファイル: job.py プロジェクト: hengheng0haha/spiderx
 def video_null(self, response=None, children=None, url=None):
     # Gather the fields configured in self.settings.video_children for one
     # video page into `result`.
     #
     # response: crawl result of the page; its `.response` body is parsed.
     # children: not used in the lines visible here.
     # url:      queue entry; `.url` is logged and the entry is re-queued
     #           when an HTML rule matches nothing.
     #
     # Returns None when `response` is falsy or an HTML rule matches nothing.
     # NOTE(review): `result` is neither returned nor stored in the visible
     # lines -- the end of this method may lie outside this excerpt.
     if not response:
         return None
     result = dict()
     for key, value in self.settings.video_children:
         # Fields with an empty rule list are skipped; otherwise only the
         # first rule (value[0]) is consulted.
         if len(value):
             way = value[0].get('way')
             if way == 'html':
                 location = value[0].get('location')
                 get = get_from_html(response.response, location)
                 if not len(get):
                     # Nothing matched: log, put the url back on the queue
                     # and abandon this page.
                     self.logger.error('Error response, url:' + url.url)
                     self.urls.put(url)
                     return None
                 if key == 'title':
                     result[key] = get[0].text
                 else:
                     # Rules carrying a 'get' entry are deliberately left
                     # unset here; every other rule stores the element text.
                     if 'get' not in value[0].keys():
                         result[key] = get[0].text
                     else:
                         pass
             elif way == 'json':
                 # 'split' is read as "start:end" markers used to cut the
                 # JSON out of the response body; empty means no cutting.
                 m_split = value[0].get('split', '')
                 json_key = value[0].get('key')
                 request = build_request(template=value[0].get('request'),
                                         param=value[0].get('param'),
                                         isre=value[0].get('isre', False),
                                         html=response.response)
                 # No request could be built for this field: skip it.
                 if not len(request):
                     continue
                 json_response = Crawler(request[0]).crawling()
                 if len(m_split):
                     # Keep the text between the first occurrence of each
                     # marker. `m_start + 1` skips exactly one character, so
                     # the start marker is presumably a single character --
                     # TODO confirm. str.index raises ValueError when a
                     # marker is absent, and that is NOT caught by the try
                     # block below.
                     m_start = json_response.response.index(m_split.split(':')[0])
                     m_end = json_response.response.index(m_split.split(':')[1])
                     json_response.response = json_response.response[m_start + 1:m_end]
                 try:
                     d = json.loads(json_response.response)
                     if isinstance(d, dict):
                         # Take json_key from the first dict that find_dict
                         # returns; leave the field unset if there is none.
                         res = find_dict(d, json_key)
                         if len(res):
                             result[key] = res[0].get(json_key)
                     elif isinstance(d, int):
                         # A bare integer body is stored as the value itself.
                         result[key] = d
                 except ValueError, ve:
                     # Best effort: a ValueError raised in the try block
                     # (e.g. an unparsable body) leaves this field unset.
                     pass