Code Example #1
def verify_code():
    cop = ChromeOperate('https://www.jsdati.com/login',
                        executable_path=executable_path)
    ele = cop.find_element_by_id('login-captcha-img')
    code = get_auth_code(cop.driver, ele)
    name_input = cop.find_element_by_name('username')
    psd_input = cop.find_element_by_name('password')
    captcha_input = cop.find_element_by_name('captcha')
    cop.input_words(name_input, 'vobile123')
    cop.input_words(psd_input, 'vobile@123')
    cop.input_words(captcha_input, code)
    time.sleep(3)
    captcha_input.send_keys(Keys.ENTER)
    # no exception handling here; if captcha recognition fails, it should be retried a few times

    time.sleep(200)  # keep the browser open so the logged-in session can be inspected
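
The closing comment flags the missing retry logic. A minimal sketch of what it could look like, assuming get_auth_code raises (or returns an empty string) on a failed recognition and that clicking the captcha image redraws it (both are assumptions, not part of the original):

def get_code_with_retry(cop, max_tries=3):
    # retry captcha recognition a few times before giving up
    for attempt in range(max_tries):
        ele = cop.find_element_by_id('login-captcha-img')
        try:
            code = get_auth_code(cop.driver, ele)
            if code:
                return code
        except Exception as e:
            print('captcha recognition failed (attempt %d): %s' % (attempt + 1, e))
        ele.click()  # assumed to refresh the captcha image
        time.sleep(2)
    raise RuntimeError('captcha recognition failed %d times' % max_tries)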
Code Example #2
def main(needLogin=False):
    arguments = []
    if not needLogin:
        # no interactive login required, so the browser can run headless
        arguments.append('headless')
    so = ChromeOperate(
        executable_path=r'C:\Users\Administrator\Desktop\chromedriver.exe',
        User_data_dir=r'D:\User Data',
        arguments=arguments)
    driver = so.driver
    driver.get('https://www.bilibili.com/')
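
ChromeOperate is a project-local wrapper, so for orientation, here is a rough plain-Selenium equivalent of the call above (Selenium 3 style; mapping User_data_dir to Chrome's user-data-dir switch is an assumption):

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('user-data-dir=D:\\User Data')  # reuse an existing profile (assumed mapping)
# options.add_argument('headless')                   # enable when no login UI is needed
driver = webdriver.Chrome(
    executable_path=r'C:\Users\Administrator\Desktop\chromedriver.exe',
    options=options)
driver.get('https://www.bilibili.com/')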
Code Example #3
def main():
    cop = ChromeOperate(
        executable_path=r'F:\work\py3_pj\amazon_craw\chromedriver.exe')
    url = 'https://m.weibo.cn/status/4470730772817031'
    cop.open(url)
    while True:
        time.sleep(1)
        cop.down_page()  # scroll down to trigger lazy-loaded comments
        page_buf = cop.open_source()
        comments = ct.getXpath('//h3/text()', page_buf)
        comments = [[text] for text in comments]  # one comment per CSV row
        ct.writer_to_csv(comments, 'comments.csv')
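
Because the loop re-reads the whole page after every scroll, the same comments are appended to comments.csv over and over. A small de-duplication sketch (the seen set and the helper are additions, not in the original):

def collect_new_comments(cop, seen):
    # return only the comments that were not seen on earlier scrolls
    page_buf = cop.open_source()
    comments = ct.getXpath('//h3/text()', page_buf)
    fresh = [[text] for text in comments if text not in seen]
    seen.update(row[0] for row in fresh)
    return fresh

# usage inside the scroll loop:
#     seen = set()
#     rows = collect_new_comments(cop, seen)
#     if rows:
#         ct.writer_to_csv(rows, 'comments.csv')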
Code Example #4
# -*- coding: utf-8 -*-
# @Time    : 2019/7/23 10:09
# @Author  : meng_zhihao
# @Email   : [email protected]
# @File    : twitter.py

from selenium_operate import ChromeOperate

import re
import time
cop = ChromeOperate(executable_path=r'chromedriver.exe',
                    arguments=['--proxy-server=http://%s' % '127.0.0.1:1080'])

cop.open('https://twitter.com/1984to1776')

page = cop.open_source()
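
The snippet imports re up top but never uses it; a guessed continuation that puts it to work as a simple sanity check (the pattern and follow-up are assumptions, not from the original file):

title = re.search(r'<title>(.*?)</title>', page, re.S)
if title:
    print(title.group(1))  # confirm the profile page actually loaded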
Code Example #5
from selenium.webdriver.common.keys import Keys
import getpass
import time
from tools.db_util import DBUtil
from selenium_operate import ChromeOperate
from tools.md5_convert import *
username = getpass.getuser()
#default_path = r'C:\Users\%s\AppData\Local\Google\Chrome\User Data' % username
default_path = r'D:\User Data'
arguments = []
arguments.append('headless')
so = ChromeOperate(
    executable_path=r'C:\Users\Administrator\Desktop\chromedriver.exe',
    User_data_dir=r'D:\User Data',
    arguments=arguments)

driver = so.driver
# driver = webdriver.Chrome()
# driver = webdriver.PhantomJS()
time.sleep(5)

time_range = 60 * 60 * 14  # run window: 14 hours
start_time = time.time()
print('opening the link')
driver.get('https://www.baidu.com/s?wd=python')
print('browser opened')
print(driver.title)

conn, cursor = DBUtil.get_cursor('web_detect')
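
The snippet ends right after obtaining a database cursor; a sketch of the kind of follow-up it implies, assuming a DB-API style connection (the detect_task table and its columns are hypothetical):

cursor.execute('SELECT url FROM detect_task WHERE status = 0 LIMIT 10')  # hypothetical table
for (url,) in cursor.fetchall():
    driver.get(url)          # visit each pending URL in the logged-in browser
    print(driver.title)
conn.close()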
Code Example #6
def jingdong():
    # note: despite the function name, this opens a Taobao item page
    cop = ChromeOperate(
        'http://miao.item.taobao.com/564003208573.htm?spm=5070.7116889.1996665613.3.iQXafH',
        executable_path=executable_path)
Code Example #7
def taobao():
    print(executable_path)
    cop = ChromeOperate(
        'https://fuwu.taobao.com/ser/detail.htm?spm=a1z13.8114203.1234-fwlb.5.66c25acaQRsoGC&service_code=ts-1796606&tracelog=category&scm=1215.1.1.51052018%E3%80%81',
        executable_path=executable_path)
Code Example #8
def weibo_search():
    start_date = '2019-12-01'
    end_date = '2020-01-01'
    cop = ChromeOperate(
        executable_path=r'F:\work\py3_pj\amazon_craw\chromedriver.exe')
    date_list = dateRange(start_date, end_date)
    print(date_list)
    search_url_template = 'https://s.weibo.com/weibo?q=%E5%8D%8E%E4%B8%BA%20-%E5%96%9C%E9%A9%AC%E6%8B%89%E9%9B%85&scope=ori&typeall=1&suball=1&timescope=custom:{0}-{1}:{2}-{3}&Refer=SWeibo_box'
    line = ['time', 'post id', 'user', 'user id', 'comments', 'reposts', 'likes', 'post text']
    hours = [[0, 2], [2, 4], [4, 6], [6, 8], [8, 10], [10, 12], [12, 14],
             [14, 16], [16, 18], [18, 20], [20, 22], [22, 23]]
    yield line
    for i in range(len(date_list) - 1):
        for hour in hours:
            start_date = date_list[i]
            end_date = date_list[i + 1]
            print(start_date)
            shour, ehour = hour[0], hour[1]
            search_url = search_url_template.format(start_date, shour,
                                                    start_date, ehour)
            cop.open(search_url)
            for page_num in range(5):
                try:
                    page_buf = cop.open_source()
                    posts = ct.getXpath('//div[@class="card-wrap"]', page_buf)
                    for post in posts:
                        texts = ct.getXpath(
                            '//p[@node-type="feed_list_content_full"]//text()',
                            post)
                        if not texts:
                            texts = ct.getXpath(
                                '//p[@node-type="feed_list_content"]//text()',
                                post)
                        texts = ''.join(texts)
                        if not texts:
                            continue
                        date = ""
                        from_source = ct.getXpath('//p[@class="from"]', post)
                        if from_source:
                            date = ct.getXpath1('//a/text()', from_source[-1])
                            date = date.strip()
                        nick = ct.getXpath1('//a/@nick-name', post)
                        user_id = ct.getXpath1('//a[@class="name"]/@href',
                                               post)
                        user_id = ct.getRegex(r'weibo.com/(\d+)', user_id)
                        mid = ct.getXpath1('//div/@mid', post)
                        # comment count
                        comments_button = ct.getXpath1(
                            '//a[@action-type="feed_list_comment"]/text()',
                            post)
                        comments_count = ct.getRegex(r'评论 (\d+.*)',
                                                     comments_button)
                        if not comments_count:
                            comments_count = 0
                        # repost count
                        feed_list_forward_button = ct.getXpath1(
                            '//a[@action-type="feed_list_forward"]/text()',
                            post)
                        forward_count = ct.getRegex(r'转发 (\d+.*)',
                                                    feed_list_forward_button)
                        if not forward_count:
                            forward_count = 0

                        # like count
                        like_button = ct.getXpath(
                            '//a[@action-type="feed_list_like"]', post)
                        if like_button:
                            like_button = like_button[-1]
                            like_button = ct.getXpath1("//em/text()",
                                                       like_button)
                            like_count = ct.getRegex(r'(\d+.*)', like_button)
                            if not like_count:
                                like_count = 0
                        else:
                            like_count = 0

                        # Fetching full comment text via
                        # https://m.weibo.cn/api/comments/show?id=<mid> is
                        # disabled here: it gets the IP banned quickly, and
                        # many comments stay hidden (sensitive words, user
                        # settings) even when the comment count is non-zero.

                        line = [
                            date, mid, nick, user_id, comments_count,
                            forward_count, like_count, texts
                        ]
                        yield line

                    next_button = cop.find_elements_by_xpath(
                        '//a[@class="next"]')
                    if next_button:
                        time.sleep(random.randint(2, 4))
                        next_button[0].click()
                    else:
                        break
                except Exception as e:
                    print(e)
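
Both this example and the next call dateRange without defining it; a minimal sketch of what it presumably does (an assumption), returning every day from start to end inclusive as 'YYYY-MM-DD' strings:

from datetime import datetime, timedelta

def dateRange(start_date, end_date):
    # list every calendar day from start_date through end_date, inclusive
    start = datetime.strptime(start_date, '%Y-%m-%d')
    end = datetime.strptime(end_date, '%Y-%m-%d')
    return [(start + timedelta(days=i)).strftime('%Y-%m-%d')
            for i in range((end - start).days + 1)]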
Code Example #9
def weibo_search():
    start_date = '2020-01-10'
    end_date = '2020-02-12'
    cop = ChromeOperate(
        executable_path=r'F:\work\py3_pj\amazon_craw\chromedriver.exe')
    date_list = dateRange(start_date, end_date)
    print(date_list)
    search_url_template = 'https://s.weibo.com/weibo/%25E6%2594%25BF%25E5%25BA%259C?q=whzf&scope=ori&suball=1&timescope=custom:{0}-0:{1}-0&Refer=g'

    for i in range(len(date_list) - 1):
        start_date = date_list[i]
        end_date = date_list[i + 1]
        search_url = search_url_template.format(start_date, end_date)
        cop.open(search_url)
        for page_num in range(50):
            try:
                page_buf = cop.open_source()
                posts = ct.getXpath('//div[@class="card-wrap"]', page_buf)
                for post in posts:
                    texts = ct.getXpath(
                        '//p[@node-type="feed_list_content_full"]//text()',
                        post)
                    if not texts:
                        texts = ct.getXpath(
                            '//p[@node-type="feed_list_content"]//text()',
                            post)
                    texts = ''.join(texts)
                    if not texts:
                        continue
                    date = ""
                    from_source = ct.getXpath('//p[@class="from"]', post)
                    if from_source:
                        date = ct.getXpath1('//a/text()', from_source[-1])
                        date = date.strip()
                    nick = ct.getXpath1('//a/@nick-name', post)
                    mid = ct.getXpath1('//div/@mid', post)
                    # comment count
                    comments_button = ct.getXpath1(
                        '//a[@action-type="feed_list_comment"]/text()', post)
                    get_comments = []
                    if ct.getRegex(r'评论 (\d+.*)', comments_button):
                        try:
                            # note: this endpoint bans the caller's IP
                            # quickly, and many comments stay hidden
                            # (sensitive words, user settings) even when
                            # the comment count is non-zero
                            comments_page = ct.get(
                                'https://m.weibo.cn/api/comments/show?id=' +
                                mid)
                            json_data = json.loads(comments_page)
                            comments = json_data['data']['data']
                            for comment in comments:
                                comment_text = comment['text']
                                get_comments.append(comment_text)
                        except Exception as e:
                            print(e, mid)

                    line = [date, mid, nick, texts] + get_comments
                    yield line

                next_button = cop.find_elements_by_xpath('//a[@class="next"]')
                if next_button:
                    time.sleep(random.randint(1, 2) * 0.6)
                    next_button[0].click()
                else:
                    break
            except Exception as e:
                print(e)
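
Since the note above says the comments endpoint bans IPs quickly, the inline fetch could be pulled into a helper with a pause-and-retry; a sketch under that assumption (ct.get and the response shape come from the code above, the backoff policy is invented):

def fetch_comments(mid, max_tries=2, pause=2):
    # best-effort fetch of the first page of comments, backing off on failure
    for attempt in range(max_tries):
        try:
            page = ct.get('https://m.weibo.cn/api/comments/show?id=' + mid)
            json_data = json.loads(page)
            return [c['text'] for c in json_data['data']['data']]
        except Exception as e:
            print(e, mid)
            time.sleep(pause * (attempt + 1))  # wait longer after each failure
    return []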
Code Example #10
#         new_url = get_Complete_url(line)
#         if not new_url:
#             url_info =re.search('table=([\w-]+)',line)
#             if not url_info:
#                 url_info =re.search('ProjectToken=([\w-]+)',line)
#             if url_info:
#                 key_id = url_info.group(1)
#                 new_url = 'https://s.cint.com/Survey/Complete?ProjectToken=' + key_id
#         if new_url:
#             print(new_url)
#             cop = ChromeOperate(executable_path=r'chromedriver.exe')
#             cop.open('https://www.lifepointspanel.com/')
#             cop_list.append(cop)
#             new_url_list.append(new_url)

cop = ChromeOperate(executable_path=r'chromedriver.exe')
cop.open('https://www.lifepointspanel.com/')
while True:
    time.sleep(1)
    page_source = cop.driver.page_source
    if '很高兴见到您' in page_source:  # the "nice to see you" greeting marks a completed login
        break

page_source = cop.driver.page_source
urls = re.findall('survey-link="(https:.*?)"', page_source)
links = list(urls)  # all extracted survey links

print(links)

for link in links:
    # e.g. 'https://router.cint.com/survey/AQAAAL7nugAAAAAAmG-5FA@AQAAAHtb29WspZP0tE42GLad1uZV0jb-oDFne7DJ21JKNo_8'
    cop.open(link)  # loop body was missing; visiting each link is an assumed completion
    time.sleep(1)