Example #1
def main():
    proxy = Proxy()
    proxy = proxy.get_proxy()
    db_build()
    url = 'https://www.rusprofile.ru/codes/89220'
    url_2 = 'https://www.rusprofile.ru/codes/429110'
    list_url = [url, url_2]
    # collect the individual page URLs from the listings, then scrape each page
    address = get_page_address(list_url, proxy)
    print(address)
    for i in address:
        html_company = get_html(i, proxy)
        get_page_data(html_company)

    con.commit()
    con.close()
class Scraper:
    def __init__(self, url_base, custom_headers=None):
        self.url_base = url_base
        self.custom_headers = custom_headers
        self.proxy = Proxy()
        self.user_agent = UserAgent()

    @staticmethod
    def make_url(url, *res, **params):
        for r in res:
            url = '{}/{}'.format(url, r)
        if params:
            url = '{}?{}'.format(url, urlencode(params))
        return url

    def set_proxy(self, session):
        """
        Configure the session to use one of the proxy_candidates.  If verify is
        True, then the proxy will have been verified to work.
        """
        proxy = self.proxy.get_proxy()
        while True:
            session.proxies = {
                'https': 'https://{}:{}'.format(proxy['IP Address'],
                                                proxy['Port'])
            }
            try:
                return session.get('https://httpbin.org/ip').json()
            except Exception:
                proxy = self.proxy.get_proxy()

    def crawl(self, *url_path, **url_params):
        session = requests.Session()
        if self.custom_headers:
            session.headers = self.custom_headers
        url_crawl = self.make_url(self.url_base, *url_path, **url_params)
        while True:
            try:
                session.headers.update({'User-Agent': self.user_agent.random})
                self.set_proxy(session)
                response = session.get(url_crawl)
                response.raise_for_status()
                return response.text
            except (requests.exceptions.HTTPError,
                    requests.exceptions.ProxyError,
                    requests.exceptions.SSLError):
                pass
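
A minimal usage sketch for the Scraper class above (not from the original source); the base URL, path parts, and query parameters are illustrative placeholders:

# Usage sketch: crawl a hypothetical listing page through rotating proxies
# and a random User-Agent, relying on the Proxy and UserAgent helpers the
# class already depends on.
scraper = Scraper('https://example.com',
                  custom_headers={'Accept-Language': 'en-US'})
# make_url('https://example.com', 'codes', '89220', page=2)
# -> 'https://example.com/codes/89220?page=2'
html = scraper.crawl('codes', '89220', page=2)
print(len(html))
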
Example #3
def readweb():
    print u'get jobdict.....'
    configmap = {}
    jobnamesfile = 'dict/jobnames.pkl'
    # return the cached job-name dict if it has already been pickled
    if os.path.isfile(jobnamesfile):
        configmap = pickle.load(open(jobnamesfile, 'rb'))
        return configmap
    p = Proxy()
    while (True):
        proxies = p.getproxies()
        try:
            r = requests.get(url='http://www.lagou.com/',
                             proxies=proxies, timeout=60)
            break
        except Exception, e:
            p.nextip()
            logging.debug(str(e))
Example #4
def get_company_description(fetchallist):
    db = MySQLdb.connect(dbadd, user, password, database,
                         use_unicode=True, charset="utf8")
    cursor = db.cursor()
    p = Proxy()
    for id in fetchallist:
        while (True):
            try:
                values = get_company_info_byid(id[0], p)
                values.append(id[0])
                cursor.execute(
                    'update company set companyUrl = %s, description = %s, '
                    'fullName = %s, shortName = %s, detailPosition = %s, '
                    'industryField = %s, companySize = %s, city = %s, '
                    'financeStage = %s, profile = %s '
                    'where companyId = %s',
                    values)
                db.commit()
                print u"update:", id[0]
                break
            except Exception, e:
                logging.debug(str(e))
                p.nextip()
Example #5
def scrapy(jobname):
    # print 'crawling ' + jobname + '.....'
    p = Proxy()
    db = MySQLdb.connect(dbadd, user, password, database,
                         use_unicode=True, charset="utf8")
    cursor = db.cursor()
    req_url = 'http://www.lagou.com/jobs/positionAjax.json?'
    headers = {'content-type': 'application/json;charset=UTF-8'}
    while (True):
        proxies = p.getproxies()
        try:
            req = requests.post(req_url, params={'first': 'false', 'pn': 1, 'kd': jobname}, headers=headers, timeout=60,
                                proxies=proxies, allow_redirects=False)
            totalCount = req.json()['content']['positionResult']['totalCount']
            pageSize = req.json()['content']['positionResult']['pageSize']
            maxpagenum = totalCount / pageSize

            break
        except Exception, e:
            p.nextip()
            logging.debug(str(e))
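
A hedged sketch of how the maxpagenum computed above could drive pagination; iter_positions is a hypothetical helper, and the 'result' key inside positionResult is an assumption about the JSON layout:

def iter_positions(jobname, proxies, headers, maxpagenum):
    # request pages 1..maxpagenum+1; the +1 covers the partial last page lost
    # to the integer division totalCount / pageSize above
    req_url = 'http://www.lagou.com/jobs/positionAjax.json?'
    for pn in range(1, maxpagenum + 2):
        req = requests.post(req_url,
                            params={'first': 'false', 'pn': pn, 'kd': jobname},
                            headers=headers, timeout=60,
                            proxies=proxies, allow_redirects=False)
        # assumption: each position dict sits under content.positionResult.result
        for position in req.json()['content']['positionResult']['result']:
            yield position
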
Example #6
    def start(self):
        self.is_lang = False
        self.delete_msg()
        self.startButton.config(state=DISABLED)
        site = self.siteBox.get()
        asin = self.asinEntry.get()
        page = self.pageEntry.get()
        if not asin:
            self.write_msg('asin is empty, please enter an ASIN')
            self.startButton.config(state=NORMAL)
            return
        if not page:
            self.write_msg('page number is empty, please enter a page number')
            self.startButton.config(state=NORMAL)
            return
        try:
            page = int(page)
        except Exception as e:
            print(e)
            self.write_msg('An error occurred: the page number is not numeric')
            self.startButton.config(state=NORMAL)
            return
        self.write_msg('Starting task... site--{}, Asin--{}'.format(site, asin))
        if not self.is_proxies.get():
            self.write_msg('Not using a proxy')
            proxies = None
            session = None
        else:
            self.write_msg('Using a proxy; preparing proxies')
            try:
                session, proxies = Proxy(self).get_proxies(site)
                if not proxies or not isinstance(proxies, dict):
                    self.write_msg(
                        'Failed to obtain a proxy, reason: {}'.format(
                            proxies['msg'] if proxies and 'msg' in proxies else 'none'))
                    self.startButton.config(state=NORMAL)
                    return
            except Exception as e:
                print(e)
                self.write_msg('An error occurred, reason: {}'.format(e))
                self.startButton.config(state=NORMAL)
                return

        # initialize the request class
        self.requests = AmazonRequests(site, asin, page, session, proxies)
        self.csv = JsonCsv(asin)
        t = threading.Thread(target=self.start_download)
        t.daemon = True
        t.start()
Example #7
def get_catalog_info(url):
    """
    :param url: the started url
    :return: the next url
    """
    while True:
        # get a proxy address
        proxy = rd.lpop("catalog_proxies")
        if proxy is None:
            Proxy("catalog_proxies")
            proxy = rd.lpop("catalog_proxies")
        print proxy

        # if the request fails, keep retrying until it succeeds
        try:
            r = requests.get(url, headers=headers, timeout=5, 
                             proxies={'http': proxy})
            r.encoding = 'gb2312'
            html = etree.HTML(r.text)
            # get the link to the next page
            next_page_link = html.xpath("//div[@class='controlbar']/span[2]/a")[0].get('href')
        # this may be a ConnectionError, ReadTimeout, etc.
        except Exception:
            catalog_logger.info("requests error: %s", url)
        else:
            break

    for tr in html.xpath("//table[@class='cytable']//tr[position()>1]"):
        try:
            # info fields, in order: author, title, tags, style, progress,
            # word count, points, publish time
            #
            # split on whitespace; the string starts and ends with whitespace,
            # so the first and last items after the split are '' and are dropped
            info = re.split(r'\s{4,}', tr.xpath("string(.)"), flags=re.UNICODE)[1:][:-1]

            # during testing some rows did not line up; discard them here
            # take the rel attribute value of the <a> tag as abstract and tag
            [abstract, tag] = tr.xpath(".//a[@rel]")[0].get("rel").strip().split(u"<br />标签:")

            # during testing some rows turned out to be empty; discard them here
            # xpath indexes start at 1
            author_link = tr.xpath(".//td[1]/a")[0].get("href")
            author_link = urlparse.urljoin(r.url, author_link)
            author_link_query = urlparse.urlparse(author_link).query
            author_id = urlparse.parse_qs(author_link_query)['authorid'][0]

            novel_link = tr.xpath(".//td[2]/a")[0].get("href")
            novel_link = urlparse.urljoin(r.url, novel_link)
            novel_link_query = urlparse.urlparse(novel_link).query
            novel_id = urlparse.parse_qs(novel_link_query)['novelid'][0]
        except Exception:
            pass
        else:
            #print info[1]
            catalog_logger.info(info[1])
            # TODO: some fields may still not match; discard the row here
            try:
                catalog = OrderedDict([
                    ("novel", info[1]),
                    ("novel_id", int(novel_id)),
                    ("novel_link", novel_link),
                    ("author", info[0]),
                    ("author_id", int(author_id)),
                    ("author_link", author_link),
                    ("tag", tag.strip() or u'无'), # 再次 strip() 避免 whitespace
                    ("abstract", abstract or u'无'),
                    ("style", info[3]),
                    ("process", info[4]),
                    ("word_count", int(info[5])),
                    ("point", int(info[6])),
                    ("publish_time", info[7]),
                    ("status", 'WAITING'),   # 爬取状态
                    ("create_time", datetime.datetime.now()),
                ])
            except Exception:
                pass
            else:
                insert_catalog(catalog)


    if next_page_link is not None:
        next_page_link = urlparse.urljoin(r.url, next_page_link)
    print next_page_link
    catalog_logger.info(next_page_link)

    return next_page_link
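
A short, hypothetical driver (not shown in the original module) that follows the next-page links returned by get_catalog_info, assuming it eventually returns a falsy value when there is no next page:

def crawl_catalog(start_url):
    # get_catalog_info returns the URL of the next page (assumed None/falsy
    # once the last catalog page has been processed)
    url = start_url
    while url:
        url = get_catalog_info(url)
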
Example #9
def get_target(queue, log):
    # MongoDB
    client = MongoClient()
    db = client.jingjiang
    catalog_col = db.catalog

    while True:
        catalog = catalog_col.find_one({"status": "WAITING"},
                                       sort=[("create_time", 1)])

        # stop the producer and pass the stop signal on to the customers
        if catalog is None:
            for i in xrange(3):
                queue.put("target:-1")
            queue.close()
            queue.join_thread()
            os._exit(1)

        novel_id = catalog["novel_id"]
        print 'producer start', os.getpid(), novel_id

        while True:
            # get a proxy address
            proxy = rd.lpop("producer_proxies")
            if proxy is None:
                Proxy("producer_proxies")
                proxy = rd.lpop("producer_proxies")

            # if the request fails, keep retrying until it succeeds
            try:
                r = requests.get(catalog["novel_link"],
                                 headers=headers,
                                 timeout=5,
                                 proxies={'http': proxy})
                r.encoding = 'gb2312'
                html = etree.HTML(r.text)
                trs = html.xpath("//tr[contains(@itemprop, 'chapter')]")
            except Exception:
                log.info("requests error: %s", catalog["novel_link"])
            else:
                break

        for tr in trs:
            try:
                # info fields, in order: chapter, title, abstract, word count, update date
                # split on whitespace; the string starts and ends with whitespace,
                # so the first and last items after the split are '' and are dropped
                info = re.split(r'\s{4,}', tr.xpath("string(.)"),
                                flags=re.UNICODE)[1:][:-1]
                # drop the trailing "*latest update" marker
                if u'\xa0*\u6700\u65b0\u66f4\u65b0' in info:
                    info = info[:-1]
                # the abstract may be split across line breaks; merge it
                if len(info) > 5:
                    info[2:len(info) - 2] = [''.join(info[2:len(info) - 2])]
                # the abstract may be missing
                elif len(info) == 4:
                    info.insert(2, '无')
                # click counts are not needed for now; the endpoint is as
                # follows (xxx is the novelid value):
                # r = requests.get('http://s8.static.jjwxc.net/getnovelclick.php?novelid=xxx')

                # chapter_link may be blocked and therefore missing
                chapter_link = tr.xpath(".//a[@itemprop='url']")[0].get('href')
                #print chapter_link
                log.info(chapter_link)
            except Exception:
                if 'target' in locals():
                    del target
                # delete the message queue in redis
                rd.delete('target:%s' % novel_id)
                # change the task status to SUSPENDED
                suspend_task(catalog_col, novel_id)
                break
            else:
                target = {
                    "chapter_id": info[0],
                    "title": info[1],
                    "abstract": info[2],
                    "word_count": int(info[3]),
                    "publish_time": info[4],
                    "chapter_link": chapter_link,
                }
                # write to redis
                target = pickle.dumps(target)
                rpush_to_redis(target, novel_id)

        # put the key on the queue
        if 'target' in locals():
            key = 'target:%s' % novel_id
            queue.put(key)
            # change the task status to QUEUEING
            queue_task(catalog_col, novel_id)

        # delete duplicate documents in the catalog collection (dedupe)
        delete_same_catalog(catalog_col, novel_id)
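
The rpush_to_redis helper called above is not shown; one plausible shape, assuming rd is the module-level redis client used elsewhere in this example:

def rpush_to_redis(ptarget, novel_id):
    # append the pickled chapter target to the per-novel list that
    # parse_target later drains with rd.lpop(task)
    rd.rpush('target:%s' % novel_id, ptarget)
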
Example #10
def parse_target(queue, lock, log):
    # MongoDB
    novel_col, catalog_col = connect_to_MongoDB()

    while True:
        chapters = []
        task = queue.get()
        print 'customer start', os.getpid()
        novel_id = int(task.split(':')[1])

        # stop the customer process
        if novel_id == -1:
            os._exit(-1)

        # change the task status to PROCESSING
        process_task(catalog_col, novel_id)        

        while True:
            ptarget = rd.lpop(task)

            if ptarget is None:
                break

            target = pickle.loads(ptarget)
            log.info(target["chapter_link"])

            while True:
                # get a proxy address
                proxy = rd.lpop("customer_proxies")
                # only allow one customer at a time to add proxies
                if rd.llen("customer_proxies") <= 10:
                    if lock.acquire(block=False):
                        Proxy("customer_proxies")
                        lock.release()
                if proxy is None:
                    time.sleep(0.5)
                    continue

                # if the request fails, keep retrying until it succeeds
                try:
                    r = requests.get(
                            target["chapter_link"], headers=headers,
                            timeout=5, proxies={'http': proxy})
                except Exception:
                    log.info('requests error: %s', target["chapter_link"])
                else:
                    break

            r.encoding = 'gb2312'
            html = etree.HTML(r.text)

            # some pages may require login or may be locked
            try:
                novel_text = html.xpath("//div[@class='noveltext']")[0]
            except IndexError:
                # suspend the task
                suspend_task(catalog_col, novel_id)
                del chapters
                # delete the data in redis
                rd.delete(task)
                break
            else:
                novel_text = etree.tostring(
                    novel_text, encoding="unicode", method="html")
                # strip the irrelevant leading content
                novel_text = re.split(
                    r'<div style="clear:both;"></div>(\s*<div class="readsmall".*?</div>)?', 
                    novel_text)[2]
                # strip the irrelevant trailing content
                novel_text = re.split(
                    r'<div id="favoriteshow_3".*</div>', novel_text)[0]
                # strip interfering <font>...</font><br> fragments
                paras = re.split(r'<font.*?<br>', novel_text)
                paras = [para.strip().replace("<br>", "\r\n") for para in paras if para]
                content = '\r\n'.join(paras)
                chapters.append(OrderedDict([
                    ("chapter_id", target["chapter_id"]),
                    ("chapter_link", target["chapter_link"]),
                    ("title", target["title"]),
                    ("abstract", target["abstract"]),
                    ("word_count", target["word_count"]),
                    ("publish_time", target["publish_time"]),
                    ("content", content),
                ]))
        
        # change the task status to FINISHED
        #if chapters:
        if 'chapters' in locals():
            novel_title = finish_task(catalog_col, novel_id)
            novel = OrderedDict([
                ("novel", novel_title),
                ("novel_id", novel_id),
                ("chapters", chapters),
                ("create_time", datetime.datetime.now()),
            ])
            insert_novel(novel_col, novel)
        print 'customer end %d, pid: %d' % (novel_id, os.getpid())
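
A hedged sketch of how the get_target producer from Example #9 and the parse_target customer above might be wired together; the consumer count matches the three "target:-1" stop sentinels the producer emits, and the logger handling is an assumption:

import logging
import multiprocessing

def run_pipeline():
    # one producer fills the queue with target keys; three customers drain it
    log = logging.getLogger('jingjiang')
    queue = multiprocessing.Queue()
    lock = multiprocessing.Lock()
    producer = multiprocessing.Process(target=get_target, args=(queue, log))
    consumers = [multiprocessing.Process(target=parse_target,
                                         args=(queue, lock, log))
                 for _ in range(3)]
    producer.start()
    for c in consumers:
        c.start()
    producer.join()
    for c in consumers:
        c.join()
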
def test_proxy_10_attempts():
    proxy = Proxy()
    proxy.set_number_of_attempts(10)
    assert proxy.get()