Code Example #1
File: base_downloader.py  Project: zthbruce/spider
 def get_proxy(self, proxy_type):
     ret = {}
     if 'http' in proxy_type:
         proxy = get_proxy()
         if proxy:
             ret['http'] = "http://" + proxy
     if 'https' in proxy_type:
         proxy = get_proxy()
         if proxy:
             ret['https'] = "https://" + proxy
     return ret
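
Most of the snippets on this page assume a module-level get_proxy() that returns a plain "ip:port" string drawn from some proxy pool, and several also call a matching delete_proxy(). A minimal sketch of such a pair, assuming a hypothetical pool service at http://127.0.0.1:5010 with /get/ and /delete/ endpoints (the endpoint names and response shape are illustrative, not taken from any of the projects listed here):

import requests

POOL_API = 'http://127.0.0.1:5010'  # hypothetical local proxy-pool service


def get_proxy():
    """Return an 'ip:port' string from the pool, or None if the pool is empty."""
    try:
        return requests.get(POOL_API + '/get/').json().get('proxy')
    except (requests.RequestException, ValueError):
        return None


def delete_proxy(proxy):
    """Ask the pool to drop a proxy that stopped working."""
    requests.get(POOL_API + '/delete/', params={'proxy': proxy})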
Code Example #2
File: baidu.py  Project: huichen90/Search
def get_result(keyword, page):
    data = {
        'wd': keyword,
        'rn': 50,
        'pn': (page - 1) * 50
    }
    proxy = get_proxy()
    proxies = {
        'https': 'https://' + proxy,
        'http': 'http://' + proxy
    }
    url = 'https://www.baidu.com/s?' + urlencode(data)
    print(url)
    response = requests.get(url, headers=headers, proxies=proxies)
    if response.status_code == 200:
        html = response.text
        doc = pq(html)
        results = doc('.result.c-container').items()
        for result in results:
            title = result.find('h3.t').text()
            href = result.find('h3.t a').attr('href')
            abstract = result.find('.c-abstract').text()
            url = result.find('.c-showurl').text().replace(' ', '')
            snapshot = result.find('.m').attr('href')
            yield dict({
                'title': title,
                'href': href,
                'abstract': abstract,
                'url': url,
                'snapshot': snapshot
            })
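
A hypothetical driver loop for the generator above, assuming headers, urlencode, pyquery's pq and the proxy pool are configured as in the original module:

for item in get_result('python crawler', 1):
    print(item)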
Code Example #3
 def proxy(self):
     ip = get_proxy()
     proxies = {
         'http': ip,
         'https': ip,
     }
     return proxies
Code Example #4
File: ins.py  Project: qiuzhihui/ins-account
		def first_get(self):
			global _session
			_session=requests.session()
			main_url='https://www.instagram.com'
			try:

				_session.get(main_url,proxies=self.use_proxy,verify=True)
				self.save_cookies()
				if os.path.exists('cookiefile'):#print('have cookies')
					self.csrf=self.read_cookies()
					self.data=self.create_ajax()
					print(self.data)
					self.ins()
					time.sleep(5)#wait for 5 seconds
					login_client = login(self.u_name, self.passwd)
					if login_client.do_first() is True:
						print("[*]Save account to file, congrats!")
						print(self.data)
						self.save_account_info(self.u_name, self.passwd)
				else:
					pass

			except:
				print("[x]Invalid proxy ip! Updating proxy now \n")
				self.use_proxy=proxy.get_proxy("US") if USE_PROXY else None
				pass
Code Example #5
def get_result(keyword, page):
    data = {
        'q': keyword,
        'first': (page - 1) * 50,
    }
    proxy = get_proxy()
    proxies = {
        'https': 'https://' + proxy,
        'http': 'http://' + proxy
    }
    url = 'http://cn.bing.com/search?' + urlencode(data)
    print(url)
    response = requests.get(url, headers=headers, proxies=proxies)
    if response.status_code == 200:
        html = response.text
        doc = pq(html)
        results = doc('li.b_algo').items()
        for result in results:
            title = result.find('h2').text()
            href = result.find('h2 a').attr('href')
            abstract = result.find('.b_caption').text()
            url = result.find('cite').text().replace(' ', '')
            yield dict({
                'title': title,
                'href': href,
                'abstract': abstract,
                'url': url,
            })
Code Example #6
def download(url,
             proxy=proxy_tool.get_proxy(),
             num_retries=config.NUM_RETRIES):
    """
    :param url: 网址链接
    :param user_agent: 用户代理
    :param proxy: 设置代理
    :param num_retries: 下载错误重新下载次数
    :return:
    """
    time.sleep(config.DELAY)
    log.info('Downloading:{}'.format(url))
    headers = {
        'User-agent': choice(ua_list.UA_LIST),
    }
    request = urllib2.Request(url, headers=headers)
    opener = urllib2.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        html = urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        log.error('Download error:{}'.format(e))
        if proxy:
            proxy_tool.delete_proxy(proxy)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 400 <= e.code < 600:
                # retry with a fresh proxy (the second positional argument is the proxy)
                return download(url, proxy_tool.get_proxy(), num_retries - 1)
    return html
Code Example #7
def get_page_data(url):
    p = get_proxy()
    proxy = {p['schema']: p['address']}
    useragents = open('useragent.txt').read().split('\n')
    useragent = {'User-Agent': choice(useragents)}
    print(useragent)
    print(proxy)
    url = get_json(url, useragent, proxy)
    print(url)
    items = (find_element(url, 'items'))
    print(items)
    sleep(0.5)
    for keys in items:
        value = find_element(keys, 'value')
        print(value)
        id = find_element(keys, 'id')
        if id is not None:
            uri = find_element(keys, 'uri_mweb')
            url = 'https://www.avito.ru' + str(uri)
            print(url)
            location = find_element(keys, 'location')
            user_type = find_element(keys, 'userType')
            try:
                number = get_number(id, useragent, proxy)
            except:
                number = ""
            if not check_number(number):
                data = {
                    'id': id,
                    'number': number,
                    'url': url,
                    'location': location,
                    'userType': user_type
                }
                write_csv(data)

        if find_element(value, 'list'):
            list = find_element(value, 'list')
            for k in list:
                id = find_element(k, 'id')
                uri = find_element(k, 'uri_mweb')
                url = 'https://www.avito.ru' + str(uri)
                location = find_element(k, 'location')
                user_type = find_element(k, 'userType')
                print(url)
                try:
                    number = get_number(id, useragent, proxy)
                except:
                    number = ""
                if not check_number(number):
                    data = {
                        'id': id,
                        'number': number,
                        'url': url,
                        'location': location,
                        'userType': user_type
                    }
                    write_csv(data)
                continue
Code Example #8
def get_urls():
    # TODO -
    nmdproxy = proxy_lib.get_proxy()
    proxy = recompute_proxy_data(nmdproxy, 'production')
    proxy.update(recompute_proxy_data(nmdproxy, 'staging'))
    for url, app in proxy.items():
        protocol = app['protocol']
        full_url = "{protocol}://{url}".format(protocol=protocol, url=url)
        yield full_url, len(proxy)
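
A hypothetical consumer of the generator above, assuming proxy_lib and recompute_proxy_data come from the original module:

for full_url, total in get_urls():
    print(full_url, total)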
Code Example #9
File: main_old.py  Project: karidon/Nnm_club_parser
def main():
	value_proxy = proxy.get_proxy(proxy.parse(proxy.get_html(proxy.BASE_URL)))
	print(value_proxy)

	topic_count = parse(get_html(FOREIGN_URL, proxies={'http': value_proxy}))
	print('Результатов поиска: %d (max: 50)' % len(topic_count))

	foreign_films = 'Зарубежные фильмы'
	save_html(foreign_films, topic_count, 'project.html')

	value_proxy = proxy.get_proxy(proxy.parse(proxy.get_html(proxy.BASE_URL)))
	print(value_proxy)

	topic_count = parse(get_html(OUR_URL, proxies={'http': value_proxy}))
	print('Результатов поиска: %d (max: 50)' % len(topic_count))

	our_films = 'Наши фильмы'
	save_html(our_films, topic_count, 'project.html', mode='a')  # appended to the file
Code Example #10
File: xitekv2.py  Project: fred-chen/python
    def switch_proxy(self):
        if self.protocol and self.addr and self.port:
            proxy.proxy_die(self.protocol, self.addr, self.port)

        protocol, addr, port = proxy.get_proxy()
        if self.protocol and self.addr and self.port:
            self.log(u"切换Proxy: %s:%s => %s:%s" % (self.addr, self.port, addr, port))
        self.protocol, self.addr, self.port = protocol, addr, port
        prxhandler = urllib2.ProxyHandler( { self.protocol:"%s:%s" % (self.addr, self.port) } )
        self.opener.add_handler(prxhandler)
Code Example #11
File: main_old.py  Project: karidon/Nnm_club_parser
def main():
    value_proxy = proxy.get_proxy(proxy.parse(proxy.get_html(proxy.BASE_URL)))
    print(value_proxy)

    topic_count = parse(get_html(FOREIGN_URL, proxies={'http': value_proxy}))
    print('Результатов поиска: %d (max: 50)' % len(topic_count))

    foreign_films = 'Зарубежные фильмы'
    save_html(foreign_films, topic_count, 'project.html')

    value_proxy = proxy.get_proxy(proxy.parse(proxy.get_html(proxy.BASE_URL)))
    print(value_proxy)

    topic_count = parse(get_html(OUR_URL, proxies={'http': value_proxy}))
    print('Результатов поиска: %d (max: 50)' % len(topic_count))

    our_films = 'Наши фильмы'
    save_html(our_films, topic_count, 'project.html',
              mode='a')  # appended to the file
Code Example #12
def get_books_per_page(page_url):
    try:
        px = proxy.get_proxy()
        book_list_html = requests.get(page_url, headers=header)
        # book_list_html = requests.get(page_url, proxies=px, headers=header)
        book_list_content = BeautifulSoup(book_list_html.text, 'html.parser')
        book_list = book_list_content.find_all('div', class_='info')
        books = []
        for bl in book_list:
            books.append(bl.find('a'))
        return books
    except:
        return get_books_per_page(page_url)
Code Example #13
 def first_step(self):
     global _session
     use_proxy = proxy.get_proxy(1)  # get proxy IPs and ports; this function returns an array of 1*10 proxies
     test = emailget.email()  # instantiate the mailbox-fetching script
     new_email = test.get_emailaddress()  # get a new email address
     print(new_email)  # show it; it is also filled in automatically as the registration email
     _session = requests.session()  # prepare the registration session
     self.register_new()  # run the registration script; as of 4.30 Zhihu has not reopened email registration, so the test stops here
     test.get_content()  # log in to the disposable mailbox and fetch Zhihu's mail; for now this is the welcome email, to be updated once Zhihu reopens registration
     test.vertify_email()  # find the URL inside the verification email yourself and paste it at the prompt to complete verification
     account_info.savedata(
         new_email, password
     )  # save the new bot account's email and password to a txt file in this directory; the password argument does not exist yet and the email string needs splitting with re
Code Example #14
File: middlewares.py  Project: weichentc/news
 def process_request(self, request, spider):
     # print(time.strftime('%H:%M:%S'))
     proxy_config = get_proxy(keep_ip=False)['proxy']
     # kwargs['proxies'] = {'http': 'http://%(user)d:%(pwd)s@%(proxy)s' % proxy_config,
     # print(proxy_config)
     # print(time.strftime('%H:%M:%S'))
     # print('proxy_config: ',proxy_config)
     # request.meta['proxies'] = {'http': 'http://%(user)d:%(pwd)s@%(proxy)s' % proxy_config,
     #                      'https': 'https://%(user)d:%(pwd)s@%(proxy)s' % proxy_config}
     # set the proxy host and port
     # request.meta['proxy'] = 'http://%s:%d/get-proxy-api' % ('118.190.114.196',8080)
     # print('proxy_config: ',proxy_config)
     if proxy_config:
         request.meta['proxy'] = 'http://%s' % proxy_config
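
For a downloader middleware like the one above to take effect, it must be enabled in the Scrapy project's settings. A minimal sketch, assuming the class is named ProxyMiddleware and lives in the project's middlewares.py (both names are assumptions; 543 is just the conventional example priority):

# settings.py (sketch)
DOWNLOADER_MIDDLEWARES = {
    'news.middlewares.ProxyMiddleware': 543,
}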
Code Example #15
def get(url,
        retry=3,
        fpfirst=False,
        use_proxy=False,
        render_js=False,
        headers=None,
        content_length_limit=None):
    """
    Download a web page via proxy and return as unicode string.

    A proxy server is automatically retrieved from server
    and used, unless use_proxy is set to False, where the page
    will be fetched directly. The proxy status is reported back
    to server after each successful or failed use.

    Note: JavaScript renderer is not currently supported.
    """
    schema = url.split('://')[0]
    if schema == url:
        LOG.warning('URL schema missing. Assuming HTTP.')
        url = 'http://' + url
    elif schema not in (
            'http',
            'https',
    ):  # 'ftp'):
        LOG.error('URL schema "%s" not supported. Returning nothing.' % schema)
        return None

    if render_js and (fpfirst or headers != None):
        LOG.error(
            'fpfirst and headers are not supported when render_js is specified. Ignoring.'
        )

    for i in range(retry):
        if use_proxy:
            proxy = get_proxy()
            if not proxy:
                LOG.warning('No valid proxy to get page. Continuing.')
                continue
        else:
            wait_b4_try(i)
            proxy = ''
        if render_js:
            p = _get_page_phantomjs(url, proxy)
        else:
            p = _get_page_requests(url, proxy, fpfirst, headers,
                                   content_length_limit)
        if p:
            return p
    return None
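
A hypothetical call of the helper above, assuming LOG and the underlying _get_page_* functions are wired up as in the original module:

html = get('http://example.com/', retry=3, use_proxy=True)
if html is None:
    LOG.error('All retries failed.')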
Code Example #16
def download_from_url(url, output_directory, filename=None, use_cache=True):
    """Download file from a url and put it under output_directory.

    :param url: Url that gives response.
    :type url: str

    :param output_directory: Directory to put the diagram.
    :type output_directory: str

    :param filename: Optional filename for downloaded file.
    :type filename: str

    :param use_cache: If there is a cached copy of the file already in the
        output directory, do not refetch it (True) or force refetch it (False).
    :type use_cache: bool

    :returns: File path if the download succeeds, else None
    :rtype: str
    """
    if filename is None:
        filename = get_filename(url)
    LOGGER.info('Download file %s from %s' % (filename, url))
    file_path = os.path.join(output_directory, filename)
    if os.path.exists(file_path) and use_cache:
        LOGGER.info('File %s exists, not downloading' % file_path)
        return file_path

    # Set Proxy in webpage
    proxy = get_proxy()
    network_manager = QNetworkAccessManager()
    if not proxy is None:
        network_manager.setProxy(proxy)

    # Download Process
    # noinspection PyTypeChecker
    downloader = FileDownloader(network_manager, url, file_path)
    try:
        result = downloader.download()
    except IOError as ex:
        raise DownloadException(ex)

    if result[0] is not True:
        _, error_message = result
        raise DownloadException(error_message)

    if os.path.exists(file_path):
        return file_path
    else:
        return None
Code Example #17
def download_from_url(url, output_directory, filename=None, use_cache=True):
    """Download file from a url and put it under output_directory.

    :param url: Url that gives response.
    :type url: str

    :param output_directory: Directory to put the diagram.
    :type output_directory: str

    :param filename: Optional filename for downloaded file.
    :type filename: str

    :param use_cache: If there is a cached copy of the file already in the
        output directory, do not refetch it (True) or force refetch it (False).
    :type use_cache: bool

    :returns: File path if the download succeeds, else None
    :rtype: str
    """
    if filename is None:
        filename = get_filename(url)
    LOGGER.info('Download file %s from %s' % (filename, url))
    file_path = os.path.join(output_directory, filename)
    if os.path.exists(file_path) and use_cache:
        LOGGER.info('File %s exists, not downloading' % file_path)
        return file_path

    # Set Proxy in webpage
    proxy = get_proxy()
    network_manager = QNetworkAccessManager()
    if not proxy is None:
        network_manager.setProxy(proxy)

    # Download Process
    # noinspection PyTypeChecker
    downloader = FileDownloader(network_manager, url, file_path)
    try:
        result = downloader.download()
    except IOError as ex:
        raise DownloadException(ex)

    if result[0] is not True:
        _, error_message = result
        raise DownloadException(error_message)

    if os.path.exists(file_path):
        return file_path
    else:
        return None
Code Example #18
def get_book_info(book_url):
    try:
        px = proxy.get_proxy()
        book_html = requests.get(book_url, headers=header)
        # book_html = requests.get(book_url, proxies=px, headers=header)
        book_content = BeautifulSoup(book_html.text, 'html.parser')
        book_info = book_content.find('div', class_='subject')
        if book_info is None:
            # usually one of the IPs in the proxy pool has been banned and we happened to pick it
            logger.info('没有爬到这本书的信息,重试:' + book_url)
            return get_book_info(book_url)
        return book_info
    except:
        logger.info('异常,重试:' + book_url)
        return get_book_info(book_url)
Code Example #19
def get_all_tag_url():
    try:
        px = proxy.get_proxy()
        # tag_page_html = requests.get(home_url + '/tag/', proxies=px, headers=header)
        tag_page_html = requests.get(home_url + '/tag/', headers=header)
        tag_page_content = BeautifulSoup(tag_page_html.text, 'html.parser')
        tags = tag_page_content.find('div', class_='article').find_all('a')
        tag_urls = []
        for a in tags:
            if not a.get('href') is None:
                tag_urls.append(home_url + a.get('href'))
        del tag_urls[0]
        return tag_urls
    except:
        return get_all_tag_url()
Code Example #20
def main():
    geturl.main()
    time.sleep(5)
    link = random_link()
    name = raw_input('Enter name: ')
    hashtag = raw_input('Enter hashtag: ')
    proxy_choice = raw_input('Do you want to use proxies? (yes/no)')
    print('Logging in using\nUsername=' + '******' + '\nPassword=' + '******')
    if proxy_choice == 'yes':
        proxies = proxy.get_proxy()
        i = random.randint(0, len(proxies) - 1)
        pr = proxies[i]
        profile = webdriver.FirefoxProfile()
        profile.set_preference("network.proxy.type", 1)
        profile.set_preference("network.proxy.http", pr['ip'])
        profile.set_preference("network.proxy.http_port", int(pr['port']))
        profile.set_preference("network.proxy.ssl", pr['ip'])
        profile.set_preference("network.proxy.ssl_port", int(pr['port']))
        options = setup_headless()
        browser = webdriver.Firefox(firefox_options=options)
        login(browser)
    else:
        options = setup_headless()
        browser = webdriver.Firefox(firefox_options=options)
        login(browser)

    while True:
        print('Visiting group URL')
        browser.get(groupURL)
        tb = browser.find_element_by_name('xhpc_message_text')
        message = get_message()
        msg = prep_message(message, link, name, hashtag)
        print('Posting')
        time.sleep(5)
        print(msg)
        tb.send_keys(msg)
        time.sleep(5)
        try:
            post_btn = browser.find_element_by_xpath(
                '/html/body/div[1]/div[3]/div[1]/div/div[2]/div[2]/div[2]/div[2]/div[2]/div[2]/div/div/div[2]/div[1]/div/div/div/div[2]/div/div[2]/div[3]/div/div[2]/div/div[2]/button'
            )
            post_btn.click()
            break
        except Exception as e:
            print(e)
    browser.close()
Code Example #21
def find_max_page(tag_url):
    try:
        px = proxy.get_proxy()
        home_html = requests.get(tag_url, headers=header)
        # home_html = requests.get(tag_url, proxies=px, headers=header)
        page_content = BeautifulSoup(home_html.text, 'html.parser')
        # the second-to-last <a> tag in the pagination div is the max page number
        paginator = page_content.find('div', class_='paginator').find_all('a')
        if paginator is not None:
            size = int(paginator[len(paginator) - 2].text)
        else:
            size = 0
        # Douban quirk: even though many pages are listed, everything after page 50 is empty
        if size > 50:
            size = 50
        return size
    except:
        return find_max_page(tag_url)
Code Example #22
def ttscache_get_proxy():
    """Get the proxy.


    :returns: the get_proxy content.

    """
    if request.method == 'GET':
        logging.debug("GET request")
        header, body = get_proxy()
        if 'filename' in header:
            logging.debug("Send certificate file")
            return send_file(BytesIO(body),
                             attachment_filename=header.get('filename'),
                             mimetype=header.get('Content-Type'))
        else:
            logging.debug("Send response")
            response = make_response(body)
            response.headers['Content-Type'] = header.get('Content-Type')
            return response
Code Example #23
File: main.py  Project: amineKammah/web-bot
def create_accounts():
    while True:
        logging.info("Getting proxy")
        proxy = get_proxy()
        logging.info(f"Got proxy, {proxy}")
        for _ in range(10):
            try:
                driver = get_driver(proxy)
                register_link = "https://login.aliexpress.com/"
                driver.get(register_link)
                set_location_cookie(driver)
                email, password = create_new_account(driver)

                with open("accounts.txt", "a") as myfile:
                    myfile.write(f"{email}:{password}\n")

                driver.close()
            except Exception as e:
                logging.warning(e)
                break
Code Example #24
def load_data(url, data, headers):
    while True:
        # get a proxy from the pool; if it fails 5 times in a row, delete it from the pool
        pro = proxy.get_proxy()
        proxies = {"http": "http://{}".format(pro)}
        print proxies["http"]
        retry_count = 5
        while retry_count > 0:
            try:
                t = random.randint(1, 5)
                time.sleep(t)
                # send the request
                response = requests.post(url,
                                         data=data,
                                         headers=headers,
                                         proxies=proxies)
                text = response.text
                result = json.loads(text)
                return result
            except Exception:
                retry_count -= 1
        proxy.delete_proxy(pro)
Code Example #25
File: requester.py  Project: wangdiaodiao520/tsspider
 def get(self):
     proxy = Redis().get()
     while not proxy:
         get_proxy()
         proxy = Redis().get()
     proxy_request = {'http': proxy}
     try:
         response = requests.get(self.url,headers=self.head,proxies=proxy_request,timeout=self.timeout)
         if response.status_code == 200:
             return response
         elif response.status_code == 404:
             return '页面资源无法请求到或不存在'
         else:
             print('请求失败,更换代理重新请求')
             #Redis().remove(proxy)
             get_proxy()
             return self.get()
     except:
         print('请求失败,更换代理重新请求')
         #Redis().remove(proxy)
         get_proxy()
         return self.get()
Code Example #26
    'Accept-Encoding':
    'gzip, deflate, sdch, br',
    'Accept-Language':
    'zh-CN,zh;q=0.8',
    'Connection':
    'keep-alive',
    'Cache-Control':
    'max-age=0',
    'Host':
    'www.zhihu.com',
    'Upgrade-Insecure-Requests':
    '1',
    'User-Agent':
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36'
}
use_proxy = proxy.get_proxy(1)  # get proxy IPs and ports; this function returns an array of 1*10 proxies, so be patient
proxy_dic = {}
for i in range(1):
    proxy_dic["http"] = 'http://' + use_proxy[0][i] + ':' + use_proxy[1][
        i]  # builds a proxy dict that can be passed directly to requests


class Register():
    _session = None

    def __init__(self):
        self.first_step()

    def first_step(self):
        global _session
        use_proxy = proxy.get_proxy(1)  # get proxy IPs and ports; this function returns an array of 1*10 proxies
Code Example #27
File: tasks.py  Project: Augustles/scrapydemo
def check_remove_proxy_ip(self, proxy_name, ipstr):
    from proxy import get_proxy
    consumer = get_proxy(proxy_name)
    if not consumer.valid_proxy(ipstr):
        consumer.remove_proxy(ipstr)
        return "removed"
Code Example #28
File: download.py  Project: PtahX/Kitsune
def download_file(ddir, url, name=None, **kwargs):
    temp_name = str(uuid.uuid4()) + '.temp'
    tries = 10
    makedirs(ddir, exist_ok=True)
    for i in range(tries):
        try:
            r = requests.get(url, stream=True, proxies=get_proxy(), **kwargs)
            r.raw.read = functools.partial(r.raw.read, decode_content=True)
            r.raise_for_status()
            # Should retry on connection error
            with open(join(ddir, temp_name), 'wb+') as file:
                shutil.copyfileobj(r.raw, file)
                # filename guessing
                mimetype, _ = cgi.parse_header(r.headers['content-type'])
                extension = mimetypes.guess_extension(
                    mimetype,
                    strict=False) if r.headers.get('content-type') else None
                extension = extension or '.txt'
                filename = name or r.headers.get(
                    'x-amz-meta-original-filename')
                if filename is None:
                    filename = get_filename_from_cd(
                        r.headers.get(
                            'content-disposition')) or 'Untitled' + extension
                filename = slugify(filename)
                # ensure unique filename
                filename = uniquify(join(ddir, filename))
                # content integrity
                is_image = r.headers.get(
                    'content-type') == 'image/png' or r.headers.get(
                        'content-type') == 'image/jpeg'
                if r.headers.get('content-length') and getsize(
                        join(ddir, temp_name)) < int(
                            r.headers.get('content-length')):
                    downloaded_size = getsize(join(ddir, temp_name))
                    reported_size = r.headers.get('content-length')
                    raise DownloaderException(
                        f'Downloaded size is less than reported; {downloaded_size} < {reported_size}'
                    )
                elif r.headers.get('content-length') is None and is_image:
                    try:
                        im = Image.open(join(ddir, temp_name))
                        im.verify()
                        im.close()
                        im = Image.open(join(ddir, temp_name))
                        im.transpose(Image.FLIP_LEFT_RIGHT)
                        im.close()
                    except:
                        raise DownloaderException(
                            'Image integrity check failed')
                file.close()
                rename(join(ddir, temp_name), join(ddir, filename))
                return filename, r
        except requests.HTTPError as e:
            raise e
        except:
            if i < tries - 1:  # i is zero indexed
                continue
            else:
                raise
        break
Code Example #29
 def __init__(self):
     self.use_proxy = proxy.get_proxy("US")
     print("==Instagram-robots-account-generate==\n[*] start")  #可以删除
Code Example #30
File: tasks.py  Project: Augustles/scrapydemo
def check_add_proxy_ip(self, proxy_name, ipstr):
    from proxy import get_proxy
    consumer = get_proxy(proxy_name)
    consumer.on_producer_add(ipstr)
Code Example #31
def import_posts(key,
                 url='https://api.fanbox.cc/post.listSupporting?limit=50'):
    conn = psycopg2.connect(host=config.database_host,
                            dbname=config.database_dbname,
                            user=config.database_user,
                            password=config.database_password,
                            cursor_factory=RealDictCursor)

    scraper_data = requests.get(url,
                                cookies={
                                    'FANBOXSESSID': key
                                },
                                headers={
                                    'origin': 'https://fanbox.cc'
                                },
                                proxies=get_proxy()).json()

    if scraper_data.get('body'):
        for post in scraper_data['body']['items']:
            parsed_post = FanboxPost(post['id'], None, post)
            if parsed_post.is_restricted:
                continue
            try:
                file_directory = f"files/fanbox/{post['user']['userId']}/{post['id']}"
                attachments_directory = f"attachments/fanbox/{post['user']['userId']}/{post['id']}"

                cursor1 = conn.cursor()
                cursor1.execute(
                    "SELECT * FROM dnp WHERE id = %s AND service = 'fanbox'",
                    (post['user']['userId'], ))
                bans = cursor1.fetchall()
                if len(bans) > 0:
                    continue

                check_for_flags('fanbox', post['user']['userId'], post['id'])

                cursor2 = conn.cursor()
                cursor2.execute(
                    "SELECT * FROM booru_posts WHERE id = %s AND service = 'fanbox'",
                    (post['id'], ))
                existing_posts = cursor2.fetchall()
                if len(existing_posts) > 0:
                    continue

                post_model = {
                    'id': post['id'],
                    '"user"': post['user']['userId'],
                    'service': 'fanbox',
                    'title': post['title'],
                    'content': parsed_post.body_text,
                    'embed': {},
                    'shared_file': False,
                    'added': datetime.datetime.now(),
                    'published': post['publishedDatetime'],
                    'edited': post['updatedDatetime'],
                    'file': {},
                    'attachments': []
                }

                for i in range(len(parsed_post.embeddedFiles)):
                    if i == 0:
                        filename, _ = download_file(
                            join(config.download_path, file_directory),
                            parsed_post.embeddedFiles[i],
                            cookies={'FANBOXSESSID': key},
                            headers={'origin': 'https://fanbox.cc'})
                        post_model['file']['name'] = filename
                        post_model['file'][
                            'path'] = f'/{file_directory}/{filename}'
                    else:
                        filename, _ = download_file(
                            join(config.download_path, attachments_directory),
                            parsed_post.embeddedFiles[i],
                            cookies={'FANBOXSESSID': key},
                            headers={'origin': 'https://fanbox.cc'})
                        post_model['attachments'].append({
                            'name':
                            filename,
                            'path':
                            f'/{attachments_directory}/{filename}'
                        })

                post_model['embed'] = json.dumps(post_model['embed'])
                post_model['file'] = json.dumps(post_model['file'])
                for i in range(len(post_model['attachments'])):
                    post_model['attachments'][i] = json.dumps(
                        post_model['attachments'][i])

                columns = post_model.keys()
                data = ['%s'] * len(post_model.values())
                data[-1] = '%s::jsonb[]'  # attachments
                query = "INSERT INTO booru_posts ({fields}) VALUES ({values})".format(
                    fields=','.join(columns), values=','.join(data))
                cursor3 = conn.cursor()
                cursor3.execute(query, list(post_model.values()))
                conn.commit()
            except DownloaderException:
                continue

    conn.close()
    if scraper_data.get('body') and scraper_data['body'].get('nextUrl'):
        import_posts(key, scraper_data['body']['nextUrl'])
Code Example #32
File: leagues.py  Project: sunnywalden/leagues
class leaguesSpider(scrapy.Spider):
    logger = logging.getLogger(__name__)
    logger.info('start generate proxy ip')
    proxy.get_proxy()
    name = 'leagues'
    allowed_domains = ['sodasoccer.com']
    start_urls = [
        'http://www.sodasoccer.com/dasai/index.html',
    ]

    # get the URLs of the five major leagues' pages
    def parse(self, response):
        # only take the list elements for Europe's five major leagues
        leagues = response.xpath('//div[@class="league_box1"][2]/ul/li')[0:5]
        # get each league's detail-page relative URL
        leagues_urls = leagues.xpath('div[@class="l_box"]/a/@href').extract()
        # join to build each league's absolute detail-page url
        for league_url in leagues_urls:
            url = 'http://www.sodasoccer.com' + league_url
            self.logger.info(url)
            # call the function that crawls league details
            yield scrapy.Request(url, callback=self.parse_league)

    # crawl league details
    def parse_league(self, response):
        clubs = []
        league = LeagueItem()
        # league logo
        league['img_urls'] = [
            response.xpath('//div[@class="limg"]/img/@src').extract()[0].split(
                '?')[0]
        ]
        # league name (Chinese)
        league['name'] = response.xpath(
            '//h1[@class="lh1"]/text()').extract()[0]
        # league name (English)
        league['league_uname'] = response.xpath(
            '//h2[@class="lh2"]/text()').extract()[0]
        # list of clubs in the league
        league_clubs = response.xpath(
            '//div[@class="l_zwq"]/ul/li/p/a/text()').extract()
        # convert list items to plain strings so Chinese is not stored in MySQL as unicode escapes
        for club in league_clubs:
            tmp = club.strip('\r\n\t\t\t').strip()
            clubs.append(tmp)
        league['league_clubs'] = clubs
        # get the detail page of each club in the current league
        clubs_details = response.xpath(
            '//div[@class="l_zwq"]/ul/li/div[@class="qiuduitu_wb"]/a/@href'
        ).extract()
        # recursively call the club crawler
        for club_details in clubs_details:
            yield scrapy.Request('http://www.sodasoccer.com' + club_details,
                                 callback=self.parse_club)
        self.logger.info(league)
        yield league

    # crawl club details
    def parse_club(self, response):
        if response.status == 200:
            club = ClubItem()
            # Chinese name of the club's league
            club['club_league'] = response.xpath(
                '//div[@class="leida"]/ul/li[@class="world_fu1_li world_fu1_li_frist"]/span/a/text()'
            ).extract()[0]
            # club logo
            club['img_urls'] = [
                response.xpath('//div[@class="photo"]/img/@src').extract()
                [0].split('?')[0]
            ]
            # parent element holding the club details
            club_info = response.xpath(
                '//div[@class="jiben"]/ul[@class="xin"]')
            # club name (Chinese)
            club['name'] = club_info.xpath('li/text()').extract()[0]
            # club name (English)
            club['club_uname'] = club_info.xpath('li/text()').extract()[1]
            # club manager
            club_manager = club_info.xpath('li/a/text()').extract()[0]
            if club_manager:
                club['club_manager'] = club_manager
            else:
                club['club_manager'] = '-'
            # club stadium
            soccerfield = club_info.xpath('li/text()').extract()[2]
            if soccerfield:
                club['club_soccerfield'] = soccerfield
            else:
                club['club_soccerfield'] = '-'
            try:
                new_info = response.xpath('//div[@id="lineup_0"]/table')
                old_info = response.xpath('//div[@id="lineup_1"]/table')
                # players signed for the new season
                new_players = new_info.xpath('tr/td/a/text()').extract()[::2]
                # last season's squad
                old_players = old_info.xpath('tr/td/a/text()').extract()[::2]
                # combine to get all players
                club['club_players'] = new_players + old_players
                new_players_details = new_info.xpath(
                    'tr/td/a/@href').extract()[::2]
                old_players_details = old_info.xpath(
                    'tr/td/@href').extract()[::2]
                players_details = new_players_details + old_players_details
                self.logger.info('All players of club %s is %s',
                                 club['club_uname'], players_details)
            except:
                club['club_players'] = '-'
            if players_details:
                # recursively call the player crawler
                for player_details in players_details:
                    yield scrapy.Request('http://www.sodasoccer.com' +
                                         player_details,
                                         callback=self.parse_player)
            self.logger.info(club)
            yield club
        else:
            self.logger.info('get league failed, Try again')
            yield scrapy.Request(response.url, callback=self.parse_club)

    # crawl player details

    def parse_player(self, response):
        player = PlayerItem()
        # player name (Chinese)
        player['name'] = response.xpath(
            '//div[@class="detailhead"]/h1/text()').extract()[0]
        info = response.xpath('//div[@class="jiben"]/ul[@class="xin"]')
        # player name (English)
        player['player_uname'] = info.xpath('li/text()').extract()[0].strip(
            ':').strip()
        try:
            birth_tmp = info.xpath('li/text()').extract()[1].strip().split(
                '-')[0]
            # player birth year
            birth = int(birth_tmp.strip())
            this_year = int(datetime.datetime.now().year)
            # player age
            player['player_age'] = this_year - birth
        except Exception as error:
            self.logger.info(error)
            player['player_age'] = 'unknown'
        # position on the pitch
        player['player_position'] = info.xpath(
            'li/span/strong/text()').extract()[0].strip()
        # nationality
        player['player_nationality'] = info.xpath(
            'li/span/strong/text()').extract()[3].strip()
        # height
        player['player_high'] = info.xpath('li/text()').extract()[3].strip()
        # weight
        player['player_weight'] = info.xpath(
            'li/span/strong/text()').extract()[2].strip()
        # market value
        player['player_networth'] = info.xpath(
            'li/text()').extract()[2].strip()
        # current club
        player['player_club'] = response.xpath(
            '//div[@class="leida"]/ul/li[@class="world_fu1_li world_fu1_li_frist"]/span/a/text()'
        ).extract()[0].strip()
        # shirt number
        player_number = response.xpath(
            '//div[@class="leida"]/ul/li[@class="world_fu1_li world_fu1_li_sec"]/span[@class="world_hao_con world_hao_con1"]/text()'
        ).extract()
        if not player_number:
            player['player_number'] = 'unknown'
        else:
            player['player_number'] = player_number[0].strip()

        # photo
        player['img_urls'] = [
            response.xpath('//div[@class="photo"]/img/@src').extract()
            [0].strip().split('?')[0]
        ]
        # league
        player['player_league'] = response.xpath(
            '//div[@id="career_stat_0"]/table/tr/td/text()').extract(
            )[0].strip()

        self.logger.info(player)
        yield player
Code Example #33
        freqs=", ".join(str(x) for x in valid_frequencies))
    print "So we will use the closest acceptable number of {best_freq}".format(
        best_freq=best_frequency)

    return best_frequency


if __name__ == '__main__':
    #fix_monitor_frequency(frequency_threshold=15, new_frequency=60)
    #fix_monitor_type("SIMPLE", "BROWSER")

    # Get all the synthetics monitors
    monitors = {x['uri']: x for x in newrelic.get_synthetics_monitors()}

    # Pull the proxy databag and parse out the staging and production sites
    proxy_layer = proxy.get_proxy()

    monitor_default_frequency = calculate_synthetics_timing(site_entries)
    fix_monitor_frequency(frequency_threshold=monitor_default_frequency - 1,
                          new_frequency=monitor_default_frequency)

    for site_url, entry in proxy['production'].items():
        # Determine protocol from proxy databag
        if "ssl" in entry or "ssl_force" in entry:
            protocol = "https"
        else:
            protocol = "http"
        full_url = "{protocol}://{site_url}".format(protocol=protocol,
                                                    site_url=site_url)
        print "Working on {url}...".format(url=full_url)
        if full_url in monitors.keys():
Code Example #34
    if request.method == 'GET':
        logging.debug("GET request")
        header, body = get_proxy()
        if 'filename' in header:
            logging.debug("Send certificate file")
            return send_file(BytesIO(body),
                             attachment_filename=header.get('filename'),
                             mimetype=header.get('Content-Type'))
        else:
            logging.debug("Send response")
            response = make_response(body)
            response.headers['Content-Type'] = header.get('Content-Type')
            return response


@APP.route('/health', methods=['GET'])
def health():
    """Check app health."""
    return "OK", 200


if __name__ == '__main__':
    logging.basicConfig(
        filename='/var/log/ttscache/app.log',
        format=
        '[%(asctime)s][%(levelname)s][%(filename)s@%(lineno)d]->[%(message)s]',
        level=logging.DEBUG)
    APP.logger.setLevel(logging.DEBUG)
    get_proxy()
    APP.run(host="0.0.0.0", port=80)
Code Example #35
File: man.py  Project: psysungeng/siteSEO
logger.addHandler(handler)           # attach the handler to the logger
logger.setLevel(logging.DEBUG)    # set the level for saved logs

# console output settings
console = logging.StreamHandler()
console.setLevel(logging.INFO)
console.setFormatter(formatter)
logger.addHandler(console)
logging.debug("程序启动,初始化完成")

logging.info("开始百度优化访问")
p_dic = {'keyword': 'IT运维经验', 'url': 'www.qnjslm.com'}
baidu_spider = Baidu_Spider.GetKeyWordUrl(p_dic)
ua = Get_UA.get_user_agent()
proxy = proxy.GetProxy()
while 1:
    (proxy_code, proxy_ip) = proxy.get_proxy()
    if proxy_code:
        if "no proxy" in proxy_ip:
            logging.warning("IP地址池没有可用代理IP地址,暂停等待")
            time.sleep(120)

        (baidu_code, baidu_message) = baidu_spider.man(proxy_ip, ua)
        if baidu_code == 10:
            sys.exit(1)
        else:
            proxy.delete_proxy(proxy_ip)
    else:
        proxy.delete_proxy(proxy_ip)