Example #1
0
def _urlencode(seq, enc='utf-8'):
    if seq:
        values = [(to_bytes(k, enc), to_bytes(v, enc)) for k, vs in seq
                  for v in (vs if is_listlike(vs) else [vs])]
        return urlencode(values, doseq=1)
    else:
        return ''
 def batch_make_requests(self,
                         spider,
                         depth=0,
                         link_or_url_list=None,
                         meta=None):
     """Lazily build one request per entry of ``link_or_url_list``.

     This is a generator: nothing happens until it is iterated.

     :param spider: forwarded unchanged to ``make_request``.
     :param depth: forwarded unchanged to ``make_request``.
     :param link_or_url_list: list-like collection of links/URLs. Anything
         that is empty or not list-like (per ``is_listlike``) yields nothing.
     :param meta: mapping forwarded to ``make_request`` for every entry.
         ``None`` (the default) means a fresh empty dict for this call.
     :return: generator of whatever ``make_request`` returns.
     """
     # The defaults used to be ``[]`` / ``{}``. Those objects are created once
     # at definition time, so the default ``meta`` dict was shared by every
     # call (and every request) that relied on it.
     if meta is None:
         meta = {}
     if is_listlike(link_or_url_list) and link_or_url_list:
         for link_or_url in link_or_url_list:
             yield self.make_request(spider, depth, link_or_url, meta)
Example #3
0
 def _serialize_value(self, value):
     """Return the exported form of ``value``, recursing into containers.

     * ``BaseItem`` instances are delegated to ``export_item``.
     * ``dict`` instances become a new ``dict`` built from
       ``_serialize_dict``.
     * List-like values (per ``is_listlike``) become a list whose elements
       were serialized by this same method.
     * Text and ``bytes`` are passed through ``to_bytes`` when
       ``self.binary`` is truthy, otherwise through ``to_unicode``, using
       ``self.encoding``.
     * Every other value is returned unchanged.
     """
     if isinstance(value, BaseItem):
         result = self.export_item(value)
     elif isinstance(value, dict):
         result = dict(self._serialize_dict(value))
     elif is_listlike(value):
         result = [self._serialize_value(element) for element in value]
     else:
         # The converter is selected before the type test so that
         # ``self.binary`` is read for every scalar value.
         if self.binary:
             convert = to_bytes
         else:
             convert = to_unicode
         if isinstance(value, (six.text_type, bytes)):
             result = convert(value, encoding=self.encoding)
         else:
             result = value
     return result
Example #4
0
 def _serialize_value(self, value):
     """Return the exported form of ``value``, recursing into containers.

     * ``Item`` instances are delegated to ``export_item``.
     * Other objects accepted by ``is_item`` become a ``dict`` built from
       ``_serialize_item``.
     * List-like values (per ``is_listlike``) become a list of serialized
       elements.
     * ``str`` and ``bytes`` are passed through ``to_bytes`` when
       ``self.binary`` is truthy, otherwise through ``to_unicode``, using
       ``self.encoding``.
     * Every other value is returned unchanged.
     """
     if isinstance(value, Item):
         return self.export_item(value)
     if is_item(value):
         return dict(self._serialize_item(value))
     if is_listlike(value):
         return [self._serialize_value(member) for member in value]
     # ``self.binary`` is consulted before the type test, for every scalar.
     convert = to_bytes if self.binary else to_unicode
     if not isinstance(value, (str, bytes)):
         return value
     return convert(value, encoding=self.encoding)
Example #5
0
 def _export_xml_field(self, name, serialized_value):
     self.xg.startElement(name, {})
     if hasattr(serialized_value, 'items'):
         for subname, value in serialized_value.items():
             self._export_xml_field(subname, value)
     elif is_listlike(serialized_value):
         for value in serialized_value:
             self._export_xml_field('value', value)
     else:
         self._xg_characters(serialized_value)
     self.xg.endElement(name)
Example #6
0
 def _serialize_value(self, value):
     """Serialize ``value`` for export, descending into nested containers.

     ``BaseItem`` instances go through ``export_item``; ``dict`` instances
     are rebuilt from ``_serialize_dict``; list-like values (per
     ``is_listlike``) are serialized element by element. Text and ``bytes``
     are converted with ``to_bytes`` (when ``self.binary`` is truthy) or
     ``to_unicode`` (otherwise) using ``self.encoding``. Any other value is
     returned as-is.
     """
     if isinstance(value, BaseItem):
         return self.export_item(value)
     if isinstance(value, dict):
         return dict(self._serialize_dict(value))
     if is_listlike(value):
         serialized = []
         for element in value:
             serialized.append(self._serialize_value(element))
         return serialized
     # Pick the converter first; ``self.binary`` is read for every scalar.
     if self.binary:
         converter = to_bytes
     else:
         converter = to_unicode
     is_textual = isinstance(value, (six.text_type, bytes))
     return converter(value, encoding=self.encoding) if is_textual else value
Example #7
0
 def _export_xml_field(self, name, serialized_value):
     self.xg.startElement(name, {})
     if hasattr(serialized_value, "items"):
         for subname, value in serialized_value.items():
             self._export_xml_field(subname, value)
     elif is_listlike(serialized_value):
         for value in serialized_value:
             self._export_xml_field("value", value)
     else:
         self._xg_characters(serialized_value)
     self.xg.endElement(name)
Example #8
0
 def _serialize_value(self, value):
     """Return the exported form of ``value``, recursing into containers.

     * ``BaseItem`` instances are delegated to ``export_item``.
     * ``dict`` instances become a new ``dict`` built from
       ``_serialize_dict``.
     * List-like values (per ``is_listlike``) become a list of serialized
       elements.
     * Text and ``bytes`` are passed through ``to_bytes`` when
       ``self.binary`` is truthy, otherwise through ``to_unicode``, using
       ``self.encoding``.
     * Every other value (numbers, booleans, ``None``, ...) is returned
       unchanged.
     """
     if isinstance(value, BaseItem):
         return self.export_item(value)
     if isinstance(value, dict):
         return dict(self._serialize_dict(value))
     if is_listlike(value):
         return [self._serialize_value(v) for v in value]
     # Previously every remaining value was handed to the text converters,
     # including numbers and None.
     # NOTE(review): assumes to_bytes/to_unicode accept only text or bytes
     # (as Scrapy's helpers of that name do) - confirm.
     # ``type(u'')`` is ``unicode`` on Python 2 and ``str`` on Python 3, so
     # this check does not depend on ``six`` being imported in this module.
     if not isinstance(value, (type(u''), bytes)):
         return value
     if self.binary:
         return to_bytes(value, encoding=self.encoding)
     return to_unicode(value, encoding=self.encoding)
Example #9
0
def _check_field_len_validity(item, field_name, length=1):
    """Tell whether ``item[field_name]`` holds at least ``length`` characters.

    The field must first pass ``_check_field_in_item`` and be truthy. A
    ``str`` is measured after stripping surrounding whitespace. A list-like
    value (per ``is_listlike``) is passed through ``flatten``, joined without
    a separator, stripped, and then measured. Any other kind of value is
    rejected.

    :param item: object supporting ``item[field_name]`` lookup.
    :param field_name: key of the field to inspect.
    :param length: minimum number of characters required after stripping.
    :return: ``True`` when the stripped text is long enough, else ``False``.
    """
    if not _check_field_in_item(item, field_name):
        return False
    field_value = item[field_name]
    if not field_value:
        return False
    if isinstance(field_value, str):
        text = field_value
    elif is_listlike(field_value):
        text = ''.join(flatten(field_value))
    else:
        return False
    return len(text.strip()) >= length
Example #10
0
 def _export_xml_field(self, name, serialized_value):
     self.xg.startElement(name, {})
     if hasattr(serialized_value, 'items'):
         for subname, value in serialized_value.items():
             self._export_xml_field(subname, value)
     elif is_listlike(serialized_value):
         for value in serialized_value:
             self._export_xml_field('value', value)
     elif isinstance(serialized_value, six.text_type):
         self._xg_characters(serialized_value)
     else:
         self._xg_characters(str(serialized_value))
     self.xg.endElement(name)
Example #11
0
 def _export_xml_field(self, name, serialized_value, depth):
     self._beautify_indent(depth=depth)
     self.xg.startElement(name, {})
     if hasattr(serialized_value, 'items'):
         self._beautify_newline()
         for subname, value in serialized_value.items():
             self._export_xml_field(subname, value, depth=depth + 1)
         self._beautify_indent(depth=depth)
     elif is_listlike(serialized_value):
         self._beautify_newline()
         for value in serialized_value:
             self._export_xml_field('value', value, depth=depth + 1)
         self._beautify_indent(depth=depth)
     elif isinstance(serialized_value, six.text_type):
         self._xg_characters(serialized_value)
     else:
         self._xg_characters(str(serialized_value))
     self.xg.endElement(name)
     self._beautify_newline()
Example #12
0
 def upload_item(self,
                 item,
                 file_key,
                 force_override_flag=False,
                 content_type="application/json"):
     """Upload ``item`` to Ks3 (PUT); ``file_key`` must be unique.

     :param item: a list-like value (per ``is_listlike``), which is dumped to
         JSON, or a ``str``, which is uploaded as-is.
     :param file_key: key passed to ``get_path``, ``is_exist_key`` and
         ``_upload_body``.
     :param force_override_flag: when falsy (the default) an existing key is
         not uploaded again; a warning is logged and ``True`` is returned.
     :param content_type: forwarded to ``_upload_body``.
     :return: ``True`` when the key already exists and overriding is off,
         ``False`` when there is nothing to upload (unsupported type or
         empty text), otherwise the result of ``_upload_body``.
     """
     path = self.get_path(file_key)
     # Forced overwrite is off by default: skip keys that already exist.
     if not force_override_flag and self.is_exist_key(file_key):
         log.warning("文件:{} 已经存在于ks3中,不进行重复上传".format(path))
         return True
     payload = None
     if is_listlike(item):
         payload = json.dumps(item, ensure_ascii=False)
     if isinstance(item, str):
         payload = item
     if not payload:
         return False
     encoded = payload.encode(encoding='utf-8')
     return self._upload_body(file_key, encoded, content_type)
Example #13
0
    def get_file_key_by_category(self, category, file_id):
        """Build the URL-encoded storage key ``<category path>/<file_id>``.

        :param category: list-like or str. List-like values (per
            ``is_listlike``) are stripped element by element, empty results
            are dropped and the rest joined with ``/``.
        :param file_id: corresponds to the MongoDB database ``_id``.
        :return: file_key str
        """
        if not is_listlike(category):
            dir_path = category.strip()
        else:
            stripped_parts = (part.strip() for part in category)
            dir_path = '/'.join([p for p in stripped_parts if p]).strip()
        # Fall back to a placeholder directory when no category text is left.
        dir_path = dir_path or '未知类别'

        # URL-encode last: Chinese characters need to be encoded.
        # (URL encoding leaves '/' untouched.)
        file_key = url_encode("{}/{}".format(dir_path, file_id))
        # Replace special characters; without this the request fails with 403.
        return file_key.replace('//', '/%2F').replace('%7E', '~')
Example #14
0
def _urlencode(seq: Iterable, enc: str) -> str:
    values = [(to_bytes(k, enc), to_bytes(v, enc)) for k, vs in seq
              for v in (vs if is_listlike(vs) else [vs])]
    return urlencode(values, doseq=True)
Example #15
0
    def _export_xml_field(self,
                          name,
                          serialized_value,
                          depth,
                          attrs=None,
                          out_value=None):
        self._beautify_indent(depth=depth)

        if attrs is None:
            attrs = {}
        if out_value is not None:
            serialized_value = out_value

        if serialized_value is None:
            self.xg.ignorableWhitespace("<%s/>" % name)
            self._beautify_newline()
            return

        self.xg.startElement(name, attrs)
        if hasattr(serialized_value, 'items'):
            self._beautify_newline()
            for subname, value in serialized_value.items():
                if subname.startswith("_"):
                    continue
                _attrs = {}

                if hasattr(value, 'items'):
                    for key in value.keys():
                        if key.startswith("_"):
                            _attrs[key[1:]] = value[key]

                _tag = subname
                if "tag" in _attrs:
                    _tag = _attrs.pop("tag")

                _out_value = None
                if "value" in _attrs:
                    _out_value = _attrs.pop("value")

                self._export_xml_field(_tag,
                                       value,
                                       depth=depth + 1,
                                       attrs=_attrs,
                                       out_value=_out_value)
            self._beautify_indent(depth=depth)
        elif is_listlike(serialized_value):
            self._beautify_newline()
            _is_dict_inside = True
            for value in serialized_value:
                if not hasattr(value, 'items'):
                    _is_dict_inside = False
                    break
            if _is_dict_inside:
                for value in serialized_value:
                    _sub_attrs = {}
                    for key in value.keys():
                        if key.startswith("_"):
                            _sub_attrs[key[1:]] = value[key]
                    _sub_tag = 'value'
                    if "tag" in _sub_attrs:
                        _sub_tag = _sub_attrs.pop("tag")

                    _sub_out_value = None
                    if "value" in _sub_attrs:
                        _sub_out_value = _sub_attrs.pop("value")

                    self._export_xml_field(_sub_tag,
                                           value,
                                           depth=depth + 1,
                                           out_value=_sub_out_value)
            else:
                for value in serialized_value:
                    self._export_xml_field('value', value, depth=depth + 1)
            self._beautify_indent(depth=depth)
        elif isinstance(serialized_value, six.text_type):
            self._xg_characters(serialized_value)
        else:
            self._xg_characters(str(serialized_value))
        self.xg.endElement(name)
        self._beautify_newline()
Example #16
0
def _urlencode(seq, enc):
    if isinstance(seq, str):
        return bytes(seq, enc)
    values = [(to_bytes(k, enc), to_bytes(v, enc)) for k, vs in seq
              for v in (vs if is_listlike(vs) else [vs])]
    return urlencode(values, doseq=1)
Example #17
0
    def start_requests(self):
        """Yield the initial requests described by ``start_seeds_rules``.

        By default the responses are handled by CrawlSpider's ``parse``
        method.

        Keys read from ``self.config['start_seeds_rules']``:

        * ``request_factory_class`` - loaded with ``load_object`` and
          instantiated without arguments; its ``batch_make_requests`` turns
          URLs into requests.
        * ``depth`` - converted with ``int()`` and forwarded to the factory.
        * ``check_before_request_flag`` - when truthy, entries for which
          ``check_url_in_redis_set`` returns a truthy value are dropped.
        * ``category_urls`` - ``{category: [url, ...]}``.
        * ``callback_urls`` - ``{category: [{'callback': ..., 'params':
          {...}}, ...]}``. The callback is a callable or a string loadable
          by ``load_object``; it is called with ``**params`` and returns
          URLs and/or ``Request`` objects.

        Every request built from a URL gets ``meta={'category': category}``.
        """
        rules = self.config.get('start_seeds_rules', None)
        request_factory_cls = load_object(rules.get("request_factory_class"))
        req_factory_obj = request_factory_cls()
        depth = int(rules.get("depth"))
        check_before_request_flag = rules.get("check_before_request_flag",
                                              False)

        # Case 1: URL lists configured directly, grouped by category.
        category_urls = rules.get("category_urls")
        if isinstance(category_urls, dict) and category_urls:
            for category, urls in category_urls.items():
                if not is_listlike(urls):
                    continue
                if check_before_request_flag:
                    urls = [
                        url for url in urls
                        if not self.check_url_in_redis_set(url)
                    ]
                if not urls:
                    continue
                for request in req_factory_obj.batch_make_requests(
                        spider=self,
                        depth=depth,
                        link_or_url_list=urls,
                        meta={'category': category}):
                    yield request

        # Case 2: URLs/requests generated by configured callbacks.
        callback_urls = rules.get("callback_urls")
        if isinstance(callback_urls, dict) and callback_urls:
            for category, callback_url_infos in callback_urls.items():
                for infos in callback_url_infos:
                    callback = infos.get('callback')
                    # Only strings are stripped and loaded. ``.strip()`` used
                    # to run before the type check, so a missing or already
                    # callable ``callback`` raised AttributeError.
                    if isinstance(callback, str):
                        callback = callback.strip()
                        if not callback:
                            continue
                        callback = load_object(callback)
                    params = infos.get('params')
                    if not (callable(callback) and isinstance(params, dict)):
                        continue
                    produced = callback(**params)
                    # Materialize once: the result is scanned twice below
                    # (URLs, then Request objects), and a generator would be
                    # exhausted by the first scan.
                    urls_or_requests = [] if produced is None else list(
                        produced)
                    if check_before_request_flag:
                        # NOTE(review): entries may be Request objects here,
                        # not only URL strings - confirm that
                        # check_url_in_redis_set accepts both.
                        urls_or_requests = [
                            e for e in urls_or_requests
                            if not self.check_url_in_redis_set(e)
                        ]
                    if not urls_or_requests:
                        continue
                    urls = [
                        e for e in urls_or_requests
                        if isinstance(e, str) and e.startswith('http')
                    ]
                    for request in req_factory_obj.batch_make_requests(
                            spider=self,
                            depth=depth,
                            link_or_url_list=urls,
                            meta={'category': category}):
                        yield request
                    # Ready-made Request objects are passed through as-is.
                    for entry in urls_or_requests:
                        if isinstance(entry, Request):
                            yield entry
Example #18
0
def _urlencode(seq, enc):
    values = [(to_bytes(k, enc), to_bytes(v, enc)) for k, vs in seq
              for v in (vs if is_listlike(vs) else [vs]) if v != '+']
    return urlencode(values, doseq=1)
Example #19
0
def _urlencode(seq, enc):
    values = [(to_bytes(k, enc), to_bytes(v, enc))
              for k, vs in seq
              for v in (vs if is_listlike(vs) else [vs])]
    return urlencode(values, doseq=1)