Example #1
    def __init__(self,
                 status,
                 headers=None,
                 content=None,
                 message=None,
                 request=None):
        self.status = status
        self.message = message
        self.redirects = list()

        # Avoid a shared mutable default: build a fresh Headers per instance.
        if headers is None:
            headers = Headers()
        elif not isinstance(headers, Headers):
            headers = Headers(headers)

        self._headers = headers

        methods_from_headers = [
            'last_modified', 'date', 'expires', 'content_length',
            'content_is_text', 'content_is_xml', 'content_is_xhtml'
        ]

        # Note: this binds the delegating accessors on the *class*, so the
        # most recently constructed instance's headers win for every instance.
        for m in methods_from_headers:
            setattr(self.__class__, m, getattr(headers, m))

        self._content = content
        self._request = request
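The class-level setattr in Example #1 is fragile, as noted in the comment above. A per-instance alternative, as a minimal sketch (the mixin is hypothetical; only the attribute names are taken from the snippet):

class HeaderDelegateMixin:
    # Sketch: delegate header accessors per instance instead of per class,
    # so each Response reads from its own Headers object.
    _delegated = ('last_modified', 'date', 'expires', 'content_length',
                  'content_is_text', 'content_is_xml', 'content_is_xhtml')

    def __getattr__(self, name):
        # Called only when normal attribute lookup fails; forward to headers.
        if name in self._delegated:
            return getattr(self._headers, name)
        raise AttributeError(name)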
Example #2
    def __init__(self, status, headers=None, content=None,
                 message=None, request=None):

        """
        Construct a new ``Response`` object.

        :param status: HTTP status code for the response
        :type status: integer
        :param headers: HTTP headers
        :type headers: a list of tuples or a :class:`Headers` object
        :param content: content
        :param message: HTTP message for the response
        :param request: originating :class:`Request` object
        :type request: :class:`Request`

        .. attribute:: redirects

           List of redirections

        """

        self._status = status
        self.message = message
        self.redirects = list()

        # Avoid a shared mutable default Headers instance.
        if headers is None:
            headers = Headers()
        elif not isinstance(headers, Headers):
            headers = Headers(headers)

        self._headers = headers

        self._content = content
        self._request = request
Example #3
    def __init__(self, method, url, headers=None, content=None):
        """
        Construct a new ``Request`` object

        :param method: HTTP method
        :type method: string
        :param url: URL of the request
        :type url: string or :class:`Url`
        :param headers: Headers for the request
        :type headers: list of tuples or :class:`Headers`
        :param content: body
        """
        self.method = method
        self.content = content

        if not isinstance(url, Url):
            url = Url(url)
        self.url = url

        if headers is None:
            headers = Headers()
        elif not isinstance(headers, Headers):
            headers = Headers(headers)

        self._headers = headers
Example #4
    @classmethod
    def from_socket(cls, sock: socket.socket) -> "Request":
        """Read and parse the request from a socket object.

        Raises:
          ValueError: When the request cannot be parsed.
        """
        lines = iter_lines(sock)
        try:
            request_line = next(lines).decode("ascii")
        except StopIteration:
            raise ValueError("Request line missing")
        try:
            method, path, __ = request_line.split(" ")
        except ValueError:
            raise ValueError(f"Malformed request line {request_line!r}.")

        headers = Headers()
        while True:
            try:
                line = next(lines)
            except StopIteration as e:
                buff = e.value
                break
            try:
                name, __, value = line.decode("ascii").partition(":")
                headers.add(name.lower(), value.lstrip())
            except ValueError:
                raise ValueError(f"Malformed header line {line!r}.")

        body = BodyReader(sock, buff=buff)
        return cls(method=method.upper(),
                   path=path,
                   headers=headers,
                   body=body)
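Example #4 relies on an iter_lines helper that yields header lines and hands the unread remainder of the buffer back through StopIteration.value, which the while loop above captures as e.value. A minimal sketch of that contract (the helper is not shown in the example, so the buffer size and details here are assumptions):

import socket
import typing


def iter_lines(sock: socket.socket,
               bufsize: int = 16_384) -> typing.Generator[bytes, None, bytes]:
    """Yield CRLF-terminated lines from sock; return any leftover bytes.

    The leftover becomes StopIteration.value, which Example #4 passes
    on to BodyReader.
    """
    buff = b""
    while True:
        data = sock.recv(bufsize)
        if not data:
            return b""
        buff += data
        while True:
            try:
                i = buff.index(b"\r\n")
            except ValueError:
                break  # need more data to complete a line
            line, buff = buff[:i], buff[i + 2:]
            if not line:
                # Blank line ends the header section; hand back the rest.
                return buff
            yield line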
Example #5
def Topics_Data(type, key):

    # Build the topics request URL
    topic_base_url = 'https://www.zhihu.com/topic'
    topics_url = topic_base_url + 's'

    # Issue the topics request
    topics_data = requests.get(topics_url, headers=Headers('zhihu'))

    # Extract the parent-topic page text
    topics_text = topics_data.text

    # Regex-extract parent-topic names and IDs
    re_topic_id = r'data-id="(.*?)"'
    re_topic_name = r'<li.*?">.*?">(.*?)<'
    topics_id = re.findall(re_topic_id, topics_text)
    topics_name = re.findall(re_topic_name, topics_text)

    # Return the parent-topic names, IDs, or an ID -> name dict, as requested
    if type == 'name':
        return topics_name
    elif type == 'id':
        return topics_id
    elif type == dict:  # note: compared against the builtin dict, not 'dict'
        topicsDict = dict(zip(topics_id, topics_name))
        return topicsDict
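A hedged usage sketch for Example #5: the key parameter is unused in the snippet, so None is passed as a placeholder, and Headers('zhihu') is assumed to return suitable request headers for the module this came from.

names = Topics_Data('name', None)    # list of parent-topic names
ids = Topics_Data('id', None)        # list of parent-topic IDs
mapping = Topics_Data(dict, None)    # {topic_id: topic_name}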
Example #6
    def __init__(self):
        """Instantiate the Coordinator object.  Automatically creates & links
        the required modules."""

        if CONFIG['settings']['debug']:
            print("coordinator.__init__: starting instantiation")

        self.overall_config = CONFIG
        self.settings = copy.deepcopy(CONFIG['settings'])
        self.contents = copy.deepcopy(CONFIG['contents'])
        self.callbacks = {}
        self.register_callbacks()
        self.active_guis = {}
        self.metrics = {
            'sending-rate': None,
            'sending-time': None,
            'remaining': None,
            'sent': None,
            'etc': None,
            'etr': None,
            'no-active-connections': None,
        }

        self.email = Email(self, None)
        self.sender = EmailSendHandler(self)
        self.gui = EmailGUI(self)
        self.headers = Headers(self, self.email)
        self.email.headers = self.headers

        self.last_exc = None
        self.ready_to_send = True

        if self.settings['debug']:
            print("coordinator.__init__: instantiation complete")
Example #7
    def __init__(self,
                 agent=None,
                 timeout=10,
                 keep_alive=1,
                 default_headers=None,
                 max_redirect=7):

        # Honor the caller's timeout (the original hard-coded 60 here).
        self.timeout = timeout
        self.max_redirect = max_redirect
        self._handlers = Handlers()

        if agent is None:
            self.agent = 'python-fluffyhttp'
        else:
            self.agent = agent

        # Avoid a shared mutable default dict, which the User-Agent
        # assignment below would otherwise mutate across instances.
        if not default_headers:
            default_headers = {
                'Connection': 'keep-alive',
            }

        if 'User-Agent' not in default_headers:
            default_headers['User-Agent'] = self.agent

        self._default_headers = Headers(default_headers)

        self._poolmanager = PoolManager(maxsize=keep_alive)
Example #8
    def __init__(self, method, url, headers=None, content=None):
        # XXX no content on GET / DELETE ?
        self.method = method
        self.content = content

        if not isinstance(url, Url):
            url = Url(url)
        self.url = url

        # Avoid a shared mutable default: build a fresh Headers per instance.
        if headers is None:
            headers = Headers()
        elif not isinstance(headers, Headers):
            headers = Headers(headers)
        self._headers = headers

        methods_from_headers = ['if_modified_since', 'if_unmodified_since']
        for m in methods_from_headers:
            setattr(self.__class__, m, getattr(headers, m))
Example #9
def Questions_ID(word, topicName):

    topic_base_url = 'https://www.zhihu.com/topic'

    topic_url = []
    question_id = []
    for i in range(1):
        # I_want() supplies the child-topic ID
        topic_url.append(topic_base_url + '/' + I_want(word, topicName)[i])
        get_questiontitle_data = requests.get(topic_url[i],
                                              headers=Headers('zhihu'))
        questiontitle_data_text = get_questiontitle_data.text
        re_question_id = r'content="https://www.zhihu.com/question/(.*?)"'
        # re_question_name = r'content="(.*?)"'
        question_id_list = re.findall(re_question_id, questiontitle_data_text)
        # question_name_list = re.findall(re_question_name, questiontitle_data_text)
        # print(question_name_list)

        # Keep only bare question IDs; drop links that point at answers.
        # (A distinct loop variable avoids shadowing the outer `i`.)
        for qid in question_id_list:
            if 'answer' not in qid:
                question_id.append(qid)

    return question_id
Example #10
    def __init__(self, request):
        self.request = request
        self.code = 200
        self.code_text = self.CODES[self.code]
        self.headers = Headers()

        if self.has_body():
            self.body = ""
Example #11
 def getDetails(self, details):
     ret = []
     if len(details) > 0:
         detail = Config.url_main + details[0]
         headers = Headers().getHeaders()
         req = requests.get(detail, headers=headers)
         selector = etree.HTML(req.content)
         js = selector.xpath('//script/text()')
         if len(js) > 0:
             urls = js[0].split('"')
             if len(urls) > 0:
                 ret.append(urls[1])
     return ret
Example #12
 def run(self):
     while True:
         print(self.getName())
         if self._queue.empty():
             break
         ret_json = self._queue.get()
         ret = simplejson.loads(ret_json)
         id = ret[0]
         category_id = ret[1]
         content_url = ret[2]
         headers = Headers().getHeaders()
         print(content_url)
         # Initialize so the status update below cannot hit a NameError
         # when the request fails before `img` is assigned.
         img = ''
         try:
             req = requests.get(content_url, headers=headers, timeout=60)
             if req.status_code == 200:
                 html = req.content
                 selector = etree.HTML(html)
                 titles = selector.xpath('//*[@class="fl"]/a[1]/text()')
                 nicks = selector.xpath('//*[@class="fl"]/span[1]/a[1]/text()')
                 play_urls = selector.xpath('//*[@class="redBtn"]/a[1]/@href')
                 contents = []
                 contents_li = selector.xpath('//*[@class="movStaff line_BSld"]/li')
                 for c_li in contents_li:
                     c_temp = c_li.xpath('descendant::text()')
                     if len(c_temp) > 0:
                         contents.append(c_temp)
                 imgs = selector.xpath('//*[@class="imgBAyy db"]/descendant::img[1]/@src')
                 title = play_url = content = nick = ''
                 if len(titles) > 0:
                     title = titles[0]
                 if len(nicks) > 0:
                     nick = nicks[0]
                 title = title + ',' + nick
                 if len(play_urls) > 0:
                     play_url = play_urls[0]
                 if len(imgs) > 0:
                     img = imgs[0]
                 content = simplejson.dumps(contents)
                 created_at = time.strftime('%Y-%m-%d %H:%M:%S')
                 if img != '':
                     # Insert the scraped record into the content table
                     sql = 'insert ignore into m1905_content (`category_id`,`title`,`content`,`play_url`,`img`,`url`,`created_at`) VALUES (%s,%s,%s,%s,%s,%s,%s)'
                     values = (category_id, title, content, play_url, img, content_url, created_at)
                     print(title)
                     self._mysqlDao.executeValues(sql, values)
         except Exception:
             # Assume the connection broke; rebuild the DAO and move on.
             self._mysqlDao = MysqlDao()
         if img != '':
             # Mark the URL row as processed (status = 1)
             sql = 'update m1905_url set `status`=1 where `id`=' + str(id)
             self._mysqlDao.execute(sql)
Example #13
 def __init__(self,
              status: str,
              headers: typing.Optional[Headers] = None,
              body: typing.Optional[typing.IO] = None,
              content: typing.Optional[str] = None,
              encoding: str = "utf-8") -> None:
     self.status = status.encode()
     self.headers = headers or Headers()
     if content is not None:
         self.body = io.BytesIO(content.encode(encoding))
     elif body is None:
         self.body = io.BytesIO()
     else:
         self.body = body
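For context, a hedged usage sketch of Example #13's constructor: passing content wraps an encoded string in a BytesIO, while body passes a file-like object through untouched (names as in the snippet; everything else is assumed):

import io

resp = Response("200 OK", content="hello")            # body is BytesIO(b"hello")

resp = Response("200 OK",
                body=io.BytesIO(b"streamed bytes"))   # file-like body kept as-is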
Example #14
    def getHeaders(self, info):
        """Extract the headers from an IRequestInfo or IResponseInfo object.
        Returns a Headers object with the headers.

        Args:

        * info (IRequestInfo or IResponseInfo): Request info. Use the output
            from getInfo or getInfoFromBytes.
        """
        from headers import Headers
        hdr = Headers()
        # this is IRequestInfo.getHeaders() or IResponseInfo.getHeaders() from Burp
        rawHdr = info.getHeaders()
        hdr.importRaw(rawHdr)
        return hdr
Example #15
    def clear(self):
        self.done = False
        self.close = False

        if self.request.server:
            server_version = self.request.server.version
        else:
            server_version = SERVER_VERSION

        self.headers = Headers([("Server", server_version),
                                ("Date", strftime("%a, %d %b %Y %H:%M:%S %Z")),
                                ("X-Powered-By", server_version)])

        self.cookie = self.request.cookie

        self.stream = False
        self.body = None
        self.time = time()
        self.status = "200 OK"
Example #16
def Topics_Child_Data(Topic_ID, index, type):

    post_url = 'https://www.zhihu.com/node/TopicsPlazzaListV2'

    data = {
        "method":
        "next",
        "params":
        '{"topic_id": ' + str(Topic_ID) + ', "offset": ' + str(index) +
        ', "hash_id": "40b294bd32442bd6a654497741c02b17"}'
    }

    # Issue the child-topics request
    topics_data_child = requests.post(url=post_url,
                                      headers=Headers('zhihu'),
                                      data=data)

    # Extract the child-topics response text
    topics_text_child = topics_data_child.text

    # Regex-extract child-topic names and IDs
    re_topic_id_child = r'href=.*?".*?topic\\\/(.*?)\\"'
    re_topic_name_child = r'<strong>(.*?)<.*?/strong>'
    topics_id_child = re.findall(re_topic_id_child, topics_text_child)
    topics_name_child = re.findall(re_topic_name_child, topics_text_child)

    # Decode the literal \uXXXX escapes in the names.
    # (unicode_escape replaces the original eval(), which would have
    # executed arbitrary scraped text.)
    topics_name_child_eval = []
    for name in topics_name_child:
        topics_name_child_eval.append(
            name.encode('latin-1').decode('unicode_escape'))

    if type == 'name':
        return topics_name_child_eval
    elif type == 'id':
        return topics_id_child
    elif type == dict:  # note: compared against the builtin dict, not 'dict'
        # Build the child-topic ID -> name dict
        topicDict_child = dict(zip(topics_id_child, topics_name_child_eval))

        return topicDict_child
Example #17
    def _build_response(self, r, request):
        status = r.status
        headers = Headers(r.headers)
        content = r.data

        resp = Response(status=status,
                        headers=headers,
                        content=content,
                        message=r.reason,
                        request=request)

        new_resp = self._handlers.dispatch('response_done', resp)
        if new_resp is not None:
            resp = new_resp

        req = self._handlers.dispatch('response_redirect', resp)

        if req is not None and isinstance(req, Request):
            return self.request(req)

        return resp
Example #18
    def getRequestResponse(self, environ):
        env = environ.get

        headers = Headers(list(self.translateHeaders(environ)))

        protocol = tuple(map(int, env("SERVER_PROTOCOL")[5:].split(".")))
        request = wrappers.Request(None, env("REQUEST_METHOD"),
                                   env("wsgi.url_scheme"), env("PATH_INFO"),
                                   protocol, env("QUERY_STRING"))

        request.remote = wrappers.Host(env("REMOTE_ADDR"), env("REMOTE_PORT"))

        request.headers = headers
        request.script_name = env("SCRIPT_NAME")
        request.wsgi_environ = environ
        request.body = env("wsgi.input")

        response = wrappers.Response(None, request)
        response.gzip = "gzip" in request.headers.get("Accept-Encoding", "")

        return request, response
Example #19
    def start_response(self, status, headers, exc_info=None):
        """'start_response()' callable as specified by PEP 333"""

        if exc_info:
            try:
                if self.headers_sent:
                    # Re-raise original exception if headers sent
                    raise exc_info[0](exc_info[1]).with_traceback(exc_info[2])
            finally:
                exc_info = None  # avoid dangling circular ref
        elif self.headers is not None:
            raise AssertionError("Headers already set!")

        assert type(status) is str, "Status must be a string"
        assert len(status) >= 4, "Status must be at least 4 characters"
        assert int(status[:3]), "Status message must begin w/3-digit code"
        assert status[3] == " ", "Status message must have a space after code"

        self.status = int(status[:3])
        self.headers = Headers(headers)

        return self.write
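The start_response above implements the PEP 333 callable that a WSGI server hands to an application. A minimal sketch of the calling side, under the usual WSGI conventions (the app itself is hypothetical, not part of the example):

def app(environ, start_response):
    # The server's start_response returns its write() callable, though
    # returning an iterable body, as here, is the preferred style.
    start_response("200 OK", [("Content-Type", "text/plain")])
    return [b"hello, world"]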
Example #20
def Questions_ID(word, topicName):

    topic_base_url = 'https://www.zhihu.com/topic'

    topic_url = []
    question_id_list = []
    for i in range(1):
        # word = ['时尚', '生活方式', '咖啡', '凯恩斯主义', '', '']
        # topicName = '生活时尚'
        # I_want() supplies the child-topic ID
        topic_url.append(topic_base_url + '/' + I_want(word, topicName)[i])
        # print(I_want()[i])
        get_questiontitle_data = requests.get(topic_url[i],
                                              headers=Headers('zhihu'))
        questiontitle_data_text = get_questiontitle_data.text
        re_questiontitle_id = r'content="https://www.zhihu.com/question/(.*?)"'
        questiontitle_id_list = re.findall(re_questiontitle_id,
                                           questiontitle_data_text)

        # extend() avoids shadowing the outer loop variable `i`
        question_id_list.extend(questiontitle_id_list)
    return question_id_list
Example #21
    def clear(self):
        self.done = False
        self.close = False

        if self.request.server:
            server_version = self.request.server.version
        else:
            server_version = SERVER_VERSION

        self.headers = Headers([("Date", strftime("%a, %d %b %Y %H:%M:%S %Z")),
                                ("X-Powered-By", server_version)])

        if self.request.server is not None:
            self.headers.add_header("Server", server_version)

        self.cookie = self.request.cookie

        self.stream = False
        self._body = []
        self.time = time()
        self.status = "200 OK"
        self.protocol = "HTTP/%d.%d" % self.request.server_protocol
Example #22
def Topics_DICT():

    # Build the topics request URL
    topic_base_url = 'https://www.zhihu.com/topic'
    topics_url = topic_base_url + 's'

    # Issue the topics request
    topics_data = requests.get(topics_url, headers=Headers('zhihu'))

    # Extract the parent-topic page text
    topics_text = topics_data.text

    # Regex-extract parent-topic names and IDs
    re_topic_id = r'data-id="(.*?)"'
    re_topic_name = r'<li.*?">.*?">(.*?)<'
    topics_id = re.findall(re_topic_id, topics_text)
    topics_name = re.findall(re_topic_name, topics_text)

    # Build the parent-topic ID -> name dict
    topicsDict = dict(zip(topics_id, topics_name))

    return topicsDict
Example #23
 def __init__(self, max_workers=10):
     self.headers = Headers()
     self.timeout = None
     self.proxies = None
     self.__thread_pool_executor = futures.ThreadPoolExecutor(
         max_workers=max_workers)
Example #24
class Spider(object):
    header = Headers()
    headers = header.headers()  # fetch an initial random request header
    normalize = Normalize()  # URL normalizer
    items_fans = {}  # dict holding follower-list records
    items_self = {}  # dict holding profile records
    redis = Redis()
    mongo = Mongo()
    s_time = 0  # start time
    e_time = 0  # end-of-run time

    flag = 0  # request-header rotation flag
    default_time = 20

    def start_url(self):
        # seed URLs
        start_urls = [
            'https://weibo.com/p/1004061537790411?is_hot=1',
        ]
        for start_url in start_urls:
            yield start_url

    def downloader(self, url_item, referer, retries_num=4):
        """
            Fetch a page and return its source.
        """
        print("starting download")
        self.e_time = time.time()  # current time
        time_dif = self.e_time - self.s_time
        if self.flag == 1:
            time_dif = 400
        self.flag = 0  # reset the rotation flag (was a dead local `flag = 0`)
        if time_dif > 300:
            self.headers = self.header.headers()  # rotate to a random header
            self.s_time = self.e_time

        time.sleep(random.random() * 5 +
                   random.random() * 5)  #+ random.randint(1,5))

        url = url_item[0]  # hoisted so the no-referer branch below can use it

        if referer:  # a Referer is needed for hotlink protection

            self.headers['Referer'] = referer  # add the Referer

            print("to fetch:", url)
            try:
                response = requests.get(url, headers=self.headers, timeout=30)
                #print(self.headers)
                print("status code:", response.status_code)

                #print(response.text)
                if response.status_code == 200:
                    if len(response.text) > 50000:
                        return response.text
                    else:
                        return None
                else:
                    self.flag = 1  # force a header rotation on the next call
                    if retries_num > 0:
                        print("download attempt", 4 - retries_num)
                        # return the retry's result instead of dropping it
                        return self.downloader(url_item, referer,
                                               retries_num - 1)
                    else:
                        self.redis.push(url_item)  # failed; requeue for later
                        return None
            except requests.exceptions.ConnectionError as e:
                print("downloader error", url)
                print("error detail:", str(e))
        else:
            response = requests.get(url, headers=self.headers)
            return response.text

    def parse_follow_page(self, html, referer):
        """
            Extract the page_id from a profile page (used to build the
            followees links) plus the followee and follower counts.
        """
        print("parse function 1")
        p1 = r'<title>(.*?[\u4e00-\u9fa5]{0,})的微博_微博</title>'  # whose Weibo page this is
        p3 = r"\$CONFIG\['page_id'\]='(\d.*?)';"  # matches the page_id
        p4 = r"(\d{6})"  # matches the pid inside the page_id

        p5 = r'<strong\sclass=\\"W_f12\\">(\d*?)<\\/strong><span\sclass=\\"S_txt2\\">关注<\\/span>'  # followee count
        p6 = r'<strong\sclass=\\"W_f12\\">(\d*?)<\\/strong><span\sclass=\\"S_txt2\\">粉丝<\\/span>'  # follower count

        self.items_self = {}

        # whose profile this is; used as the collection name
        self.items_self['collection'] = re.search(p1, html).group(1)
        self.items_self['page_id'] = re.search(p3, html).group(1)
        self.items_self['pid'] = re.search(
            p4, self.items_self['page_id']).group(1)

        try:
            self.items_self['idol'] = int(re.search(p5, html).group(1))
        except Exception:
            self.items_self['idol'] = '__'  # followees hidden; skip the idol list
            print("followee count not accessible")

        try:
            self.items_self['fans'] = int(re.search(p6, html).group(1))
        except Exception:
            self.items_self['fans'] = 0
            print("follower count not accessible")

        if self.items_self['fans'] > 50000:  # damping factor
            self.items_self['damp'] = 1
        else:
            self.items_self['damp'] = 0.5

        print(self.items_self)
        #self.mongo.save(self.items_self)   # persist
        yield self.items_self  # yield the record for storage
        if isinstance(self.items_self['idol'], int):
            for url in self.normalize.nor_follow(
                    self.items_self['page_id']):  # followee pages
                url_item = [url, self.parse_detail, referer]
                # only the followee-page links need queueing; records are
                # stored directly
                yield url_item
        else:
            yield None

    def parse_detail(self, html, referer):
        """
            Extract each followee's follow page and home-page link.
        """

        print("parse function 2")
        self.items_fans = {}

        p1 = r'<title>(.*?[\u4e00-\u9fa5]{0,})的微博_微博</title>'
        p2 = r'<a\starget=\\"_blank\\"\stitle=\\"(.*?[\u4e00-\u9fa5]{0,})\\"\shref=\\"(.*?)\\"\s>'  # matches the followers list

        try:
            results = re.findall(p2, html)

            for result in results:
                if result:

                    collection = re.search(p1, html).group(1)  # control table
                    idol_name = result[0]  # followee's name
                    link = self.normalize.nor_home(result[1].replace(
                        '\\', ''))  # followee's home-page link

                    if re.search(r'\?', link):  # store only links containing '?'
                        self.items_fans = {
                            'collection': collection,
                            'idol_name': idol_name,
                            'link': link,
                        }

                        print(self.items_fans)
                        #self.mongo.save(self.items_fans)  # save to the database
                        yield self.items_fans  # yield the record for storage
                        url_item = [
                            self.items_fans['link'], self.parse_follow_page,
                            referer
                        ]
                        yield url_item  # queue the home page for crawling
                    else:
                        print("link does not match the expected form:", link)
                        yield None
        except Exception:
            print("followers list not accessible")

    def scheduler(self):
        # initialization
        #self.redis.delete()  # controls whether crawling resumes after shutdown

        if self.redis.llen() == 0:
            for url in self.start_url():
                callback = self.parse_follow_page
                referer = "https://weibo.com"
                url_item = [url, callback, referer]
                self.redis.push(url_item)

        while True:
            print("run loop")
            # keep popping while the queue is non-empty; the original
            # `if not self.redis.llen()` had this test inverted
            if self.redis.llen():

                url_item = self.redis.pop()

                url = url_item[0]
                callback = url_item[1]
                referer = url_item[2]

                html = self.downloader(url_item, referer=referer)
                if html is not None:
                    print("html length:", len(html))

                    for items in callback(html, url):
                        if isinstance(items, list):
                            print("callback returned a list")
                            self.redis.push(items)
                        if isinstance(items, dict):
                            print("callback returned a dict")
                            self.mongo.save(items)

                        if items is None:
                            pass  # skip profiles whose followers list is hidden
                else:
                    print("html value:", html)
            else:
                break

    def run(self):
        self.scheduler()
Example #25
 def _merge_headers(self, headers):
     # list() the items so this also works on Python 3, where dict views
     # do not support `+`; per-request entries are appended after defaults.
     final_headers = Headers(list(self.default_headers.items()) +
                             list(headers.items()))
     return final_headers
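An alternative body for the same method, as a sketch. It assumes Headers accepts a mapping and that override semantics are wanted (the snippet does not confirm either): with a dict, later keys replace earlier ones instead of being appended.

 def _merge_headers(self, headers):
     merged = dict(self.default_headers.items())
     merged.update(headers.items())  # per-request entries override defaults
     return Headers(merged)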
Example #26
 def __init__(self, *args, **kwargs):
     self.protocol = kwargs.get('protocol')
     self.path = kwargs.get('path')
     self.method = kwargs.get('method')
     self.body = kwargs.get('body')
     self.headers = Headers(**kwargs.get('headers'))
Example #27
 def run(self):
     while True:
         print(self.getName())
         ret_json = self._redisDao.lpop('kansogou')
         if ret_json is None:
             break
         ret = simplejson.loads(ret_json)
         id = ret[0]
         category_id = ret[1]
         content_url = ret[2]
         img = ret[3]
         headers = Headers().getHeaders()
         print(content_url)
         try:
             req = requests.get(content_url, headers=headers, timeout=30)
             if req.status_code == 200:
                 html = req.content
                 selector = etree.HTML(html)
                 # Default the extraction lists so an unknown category_id
                 # cannot leave them undefined below.
                 titles = play_urls = contents = []
                 # movies
                 if category_id == 1:
                     titles = selector.xpath(
                         '//*[@class="title txt-overflow"]/a[1]/text()')
                     play_urls = selector.xpath(
                         '//*[@class="title txt-overflow"]/a[1]/@href')
                     contents = selector.xpath(
                         '//*[@class="video-info"]/descendant::text()')
                 # TV series
                 if category_id == 2:
                     titles = selector.xpath('//*[@class="tt-mnc"]/text()')
                     play_urls = selector.xpath(
                         '//*[@class="tt-mnc"]/@href')
                     contents = selector.xpath(
                         '//*[@class="lines"]/descendant::text()')
                 # variety shows
                 if category_id == 3:
                     titles = selector.xpath(
                         '//*[@class="info"]/h1[1]/a[1]/text()')
                     play_urls = selector.xpath(
                         '//*[@class="info"]/h1[1]/a[1]/@href')
                     contents = selector.xpath(
                         '//*[@class="info"]/descendant::span/descendant::text()'
                     )
                 # anime
                 if category_id == 4:
                     titles = selector.xpath(
                         '//*[@class="title"]/a[1]/text()')
                     play_urls = selector.xpath(
                         '//*[@class="title"]/a[1]/@href')
                     contents = selector.xpath(
                         '//*[@class="video-info"]/descendant::text()')
                 title = play_url = content = ''
                 if len(titles) > 0:
                     title = titles[0]
                 if len(play_urls) > 0:
                     play_url = Config.url_main + play_urls[0]
                 content = simplejson.dumps(contents)
                 created_at = time.strftime('%Y-%m-%d %H:%M:%S')
                 # Insert the scraped record into the content table
                 sql = 'insert ignore into kansogou_content (`category_id`,`title`,`content`,`play_url`,`img`,`url`,`created_at`) VALUES (%s,%s,%s,%s,%s,%s,%s)'
                 values = (category_id, title, content, play_url, img,
                           content_url, created_at)
                 print(title)
                 self._mysqlDao.executeValues(sql, values)
         except Exception:
             # Assume the connection broke; rebuild the DAO and move on.
             self._mysqlDao = MysqlDao()
         # Mark the URL row as processed (status = 1)
         sql = 'update kansogou_url set `status`=1 where `id`=' + str(id)
         self._mysqlDao.execute(sql)
Example #28
# Build the answers API URL
api_url = 'https://www.zhihu.com/api/v4/questions/'
req_url = '/answers?include=data%5B%2A%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cattachment%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Cis_labeled%2Cpaid_info%2Cpaid_info_content%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%2A%5D.topics%3Bdata%5B%2A%5D.settings.table_of_content.enabled&limit='
limit = 15
question_id = 34243513

word = ['时尚', '', '', '', '', '']
topicName = '生活方式'
print(I_want(word, topicName)[0])
# list = ['咖啡', '时尚']
url_ = api_url + str(Questions_ID(word, topicName)[0]) + req_url + str(limit)
print(Questions_ID(word, topicName)[0])

# Request the data
get_pic_data = requests.get(url_, headers=Headers('zhihu'))
pic_data_text = get_pic_data.text

# Regexes for image URLs and answer content
re_pic_src = r'src=."(https:.*?)"'
re_content = r'content":"(.*?)\\u003'

# Find the images
pic_src_list = re.findall(re_pic_src, pic_data_text)
# print(len(pic_src_list))
for i in range(0, 15):
    print('(', i + 1, ')', pic_src_list[i], '\n')

    # Save the images (defaults to C:\Users\lfeng)
    # img_src = pic_src_list[i]
    # r = requests.get(img_src,stream=True)
Example #29
'''
    Dakun (pro.lagou.com) crawler
'''
import requests
from headers import Headers
from bs4 import BeautifulSoup
import time
import re
import numpy as np
from storage import Storage

headers = Headers()
storage = Storage()


class DakunSpider(object):

    # Fetch the list-page data
    def fetch_list(self, _type, page):
        url = 'https://pro.lagou.com/project/%s/%d' % (_type, page)

        res = requests.get(url, headers=headers.getHeader())

        _result = []

        soup = BeautifulSoup(res.text,
                             'html5lib').find(id='project_list').find_all('li')

        for item in soup:
            link = item.find('a')
Example #30
# (truncated commented-out amazon_headers dict omitted)

# Build the search URL
start_url = 'https://s.taobao.com/search?q='
goods = 'lego 21309'
url = start_url + goods + '&s=' + str(0)

# Fetch the page content
get_contents = requests.get(url, headers=Headers('taobao'))
get_text = get_contents.text
# print(get_text)

# Regexes for the fields embedded in the page's JSON
raw_ = r'"title":"([^"]+)"'
title_co = r'"raw_title":"([^"]+)"'
price_co = r'"view_price":"([^"]+)"'
nick_co = r'"nick":"([^"]+)"'
innerText_co = r'"innerText":"([^"]+)"'
view_sales_co = r'"view_sales":"([^"]+)"'
comment_count_co = r'"comment_count":"([^"]+)"'
item_loc_co = r'"item_loc":"([^"]+)"'

tb_sku_title_ = re.findall(title_co, get_text)
tb_sku_price_ = re.findall(price_co, get_text)
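The snippet ends after the first two findall calls; a hedged continuation sketch that pairs titles with prices (variable names follow the example, the pairing itself is an assumption):

# Pair each raw title with its listed price; the lists line up because
# both regexes scan the same embedded JSON in document order.
for title, price in zip(tb_sku_title_, tb_sku_price_):
    print(title, price)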