Example #1
0
 def video_not_null(self, response=None, children=None, url=None):
     """Collect a video's fields and store them with status ``'A'``.

     Does nothing unless both ``response`` and ``children`` are truthy.
     Otherwise a request is built from ``self.settings.video`` (using the
     page held in ``response.response``), crawled, and its reply parsed as
     JSON. Each ``(key, value)`` pair of ``self.settings.video_children``
     whose ``value`` is non-empty is then evaluated using ``value[0]``:

     * ``way == 'html'``: the first node returned by ``get_from_html`` for
       the rule's ``location`` supplies the value through its ``text``.
       For the ``'title'`` key an empty text falls back to the node's
       ``title`` attribute, and the value is UTF-8 encoded.
     * ``way == 'json'``: the parsed JSON is indexed with the rule's
       ``key``, or with ``int(index)`` when only ``index`` is given.

     The collected fields are written to the ``Videos`` row whose id is
     ``url.id_indb``.

     :param response: object whose ``response`` attribute holds the page.
     :param children: only checked for truthiness.
     :param url: object providing ``id_indb``; must not be ``None`` when
         the update is reached.
     """
     if not (response and children):
         return
     video = self.settings.video
     # build_request yields a sequence; only its first request is crawled.
     request = build_request(template=video.get('request'),
                             param=video.get('param'),
                             isre=video.get('isre'),
                             html=response.response)[0]
     resp = Crawler(request).crawling()
     json_response = json.loads(resp.response)
     result = {}
     for key, value in self.settings.video_children:
         if not len(value):
             continue
         rule = value[0]
         way = rule.get('way')
         if way == 'html':
             nodes = get_from_html(response.response, rule.get('location'))
             # NOTE(review): nodes[0] fails if the location matched nothing;
             # left as-is so an incomplete page is not stored as status 'A'.
             text = nodes[0].text
             if key == 'title':
                 if not text:
                     text = nodes[0].attrib.get('title', '')
                 result[key] = text.encode('utf8')
             else:
                 # The former "encode if key == 'title'" conditional here
                 # could never take its title arm, so it was dropped.
                 result[key] = text
         elif way == 'json':
             if 'key' in rule:
                 json_key = rule.get('key')
             elif 'index' in rule:
                 json_key = int(rule.get('index'))
             else:
                 # Same fallback as before: an empty-string lookup key.
                 json_key = ''
             result[key] = json_response[json_key]
     result['status'] = 'A'
     Videos.objects.filter(id=url.id_indb).update(**result)
Example #2
0
 def video_null(self, response=None, children=None, url=None):
     """Extract video fields from a page according to the configured rules.

     Returns ``None`` immediately when ``response`` is falsy. Otherwise
     every ``(key, value)`` pair of ``self.settings.video_children`` with a
     non-empty ``value`` is evaluated using ``value[0]``:

     * ``way == 'html'``: nodes are looked up with ``get_from_html``. If
       none are found the error is logged, ``url`` is put back on
       ``self.urls`` and the method returns ``None``. Otherwise the first
       node's ``text`` is stored for the ``'title'`` key and for any rule
       that has no ``'get'`` entry.
     * ``way == 'json'``: a request built from the rule is crawled. When
       the rule has a ``split`` of the form ``start:end`` the reply is cut
       down to the text between the two markers before being parsed as
       JSON. A dict reply is searched with ``find_dict`` for the rule's
       ``key``; an int reply is stored directly. A ``ValueError`` raised
       while parsing or searching is logged and the field is skipped.

     NOTE(review): ``result`` is filled but not consumed in the visible
     part of this method, and ``children`` is unused here.

     :param response: object whose ``response`` attribute holds the page.
     :param children: unused in the visible code.
     :param url: object providing ``url``; re-queued when an HTML rule
         finds nothing.
     :returns: ``None``.
     """
     if not response:
         return None
     result = dict()
     for key, value in self.settings.video_children:
         if not len(value):
             continue
         rule = value[0]
         way = rule.get('way')
         if way == 'html':
             nodes = get_from_html(response.response, rule.get('location'))
             if not len(nodes):
                 self.logger.error('Error response, url:' + url.url)
                 self.urls.put(url)
                 return None
             # Non-title rules carrying a 'get' entry are skipped.
             if key == 'title' or 'get' not in rule:
                 result[key] = nodes[0].text
         elif way == 'json':
             m_split = rule.get('split', '')
             json_key = rule.get('key')
             request = build_request(template=rule.get('request'),
                                     param=rule.get('param'),
                                     isre=rule.get('isre', False),
                                     html=response.response)
             if not len(request):
                 continue
             json_response = Crawler(request[0]).crawling()
             if len(m_split):
                 markers = m_split.split(':')
                 body = json_response.response
                 m_start = body.index(markers[0])
                 # Look for the end marker after the start marker, so an
                 # earlier stray occurrence cannot yield a bogus slice.
                 m_end = body.index(markers[1], m_start + 1)
                 # The slice starts one character past the start marker's
                 # first character, as before.
                 json_response.response = body[m_start + 1:m_end]
             try:
                 d = json.loads(json_response.response)
                 if isinstance(d, dict):
                     res = find_dict(d, json_key)
                     if len(res):
                         result[key] = res[0].get(json_key)
                 elif isinstance(d, int):
                     result[key] = d
             except ValueError:
                 # Still best-effort (the field is skipped), but the
                 # failure is now logged instead of vanishing silently.
                 self.logger.error('Invalid JSON response, key:' + str(key))
Example #3
0
    def host_only_url(self, response=None, children=None):
        """Create a video record and queue a crawl URL for each listed link.

        Does nothing unless both ``response`` and ``children`` are truthy.
        Steps:

        1. For every ``(key, child)`` in ``self.settings.host_children``,
           collect the nodes found by ``get_from_html`` at
           ``child[0]['location']``.
        2. For each node under ``'url'``: read the link and its id from the
           node attributes (``'|'``-separated candidate names), drop links
           that match none of the prefixes in ``self.settings.url_format``
           (when any are configured), prefix ``self.settings.host_url`` to
           links that are not absolute, and apply the ``filter`` entries of
           ``self.settings.host``. A filter with a non-empty ``type``
           requires its ``value`` to occur in the link; one without a
           ``type`` requires it not to.
        3. For each node under ``'thumbnail'``: attach the thumbnail to the
           first collected link with the same id.
        4. De-duplicate with ``delete_same``, create a ``Videos`` row per
           entry and put a matching ``Url`` on ``self.urls``.

        :param response: object whose ``response`` attribute holds the page.
        :param children: only checked for truthiness.
        """
        if not (response and children):
            return
        html = response.response
        infos = {}
        for key, child in self.settings.host_children:
            infos[key] = get_from_html(html, child[0].get('location'))

        links = []
        for item in infos.get('url'):
            get = self.settings.url[0].get('get')
            identity = self.settings.url[0].get('id')
            res_url = get_from_dict(item.attrib, get.split('|'))

            # The prefix check applies only when prefixes are configured.
            # Previously its flag was read even when unset (or left over
            # from an earlier link), which raised or dropped links.
            url_formats = self.settings.url_format
            if len(url_formats) and not any(
                    res_url.startswith(prefix) for prefix in url_formats):
                continue

            # https links are absolute too and must not get the host prefix.
            if not res_url.startswith(('http://', 'https://')):
                res_url = self.settings.host_url + res_url

            filters = self.settings.host.get('filter')
            if filters is not None:
                accepted = True
                for fil in filters:
                    must_contain = bool(len(fil.get('type', '')))
                    if (fil.get('value') in res_url) != must_contain:
                        accepted = False
                        break
                if not accepted:
                    continue

            links.append({
                'url': res_url,
                'id': get_from_dict(item.attrib, identity.split('|')),
            })

        result = []
        for item in infos.get('thumbnail'):
            get = self.settings.thumbnail[0].get('get')
            identity = self.settings.thumbnail[0].get('id')
            # Prefer the underscore-prefixed attribute when the node has it.
            if '_' + get in item.attrib.keys():
                get = '_' + get
            res_thumb = get_from_dict(item.attrib, get.split('|'))
            if identity.startswith('parent:'):
                # The id lives on the parent node for 'parent:<attr>' rules.
                id_thumb = item.getparent().attrib.get(
                    identity.split('parent:')[1])
            else:
                id_thumb = get_from_dict(item.attrib, identity.split('|'))
            for link in links:
                if link.get('id') == id_thumb:
                    link.update({'thumbnail': res_thumb})
                    result.append(link)
                    break

        result = delete_same(result)
        site_id = None
        for item in result:
            if site_id is None:
                # Same site for every entry: look it up once, and only
                # when there is at least one entry to store.
                site_id = Sites.objects.get(name=self.settings.site).id
            video = Videos.objects.create(
                url=item.get('url') + self.settings.video_url_end,
                thumbnail=item.get('thumbnail'),
                site_id=site_id)
            next_url = Url(url=video.url, url_type='video',
                           url_tip=self.settings.site, id_indb=video.id)
            self.urls.put(next_url, block=True)