Example no. 1
def test_saysth():
    browser = Browser()
    # The user opens the home page
    browser.get(index_url)
    left_bar = browser.find_element_by_class_name('layout-aside-left')
    # The user sees the login button
    login_button = left_bar.find_element_by_class_name('q-card-main')
    assert '登录' in login_button.text
    # The user clicks login
    login_button.click()
    sleep(1)
    # The user logs into the site
    assert try_login_with(browser, TESTING_CARD_ID, TESTING_PASSWORD)
    # The user clicks "广场" (the Plaza tab)
    browser.find_element_by_xpath(
        '//*[@id="q-app"]/div/aside/div[1]/div[2]/div[3]/div[2]/div').click()
    # The user clicks the "+" button
    browser.find_element_by_css_selector(
        'div.layout-page-container.transition-generic > main > div:nth-child(1) > div.z-fixed.fixed-bottom-right > button'
    ).click()
    sleep(1)
    # The user types some text
    browser.find_element_by_xpath(
        '/html/body/div[3]/div/div/div[2]/div[1]/div/div/div/div/a/div[2]/div[2]/textarea'
    ).send_keys('测试')
    # The user clicks publish
    browser.find_element_by_xpath(
        '/html/body/div[3]/div/div/div[1]/div/button[2]/span').click()
    sleep(1)
    # The user sees their own post
    the_post = browser.find_element_by_css_selector(
        '#q-app > div > div.layout-page-container.transition-generic > main > div:nth-child(1) > div.q-infinite-scroll > div.q-infinite-scroll-content > div:nth-child(3)'
    )
    post_text = the_post.text
    assert '测试' in post_text
    # The user clicks delete
    the_post.find_element_by_tag_name('button').click()
    sleep(1)
    browser.find_element_by_class_name(
        'modal-buttons').find_elements_by_tag_name('button')[1].click()
    sleep(1)
    # The user can no longer see their post
    try:
        the_post = browser.find_element_by_css_selector(
            '#q-app > div > div.layout-page-container.transition-generic > main > div:nth-child(1) > div.q-infinite-scroll > div.q-infinite-scroll-content > div:nth-child(3)'
        )
    except NoSuchElementException:  # from selenium.common.exceptions
        pass
    else:
        post_text = the_post.text
        assert '测试' not in post_text
    browser.close()
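The try_login_with helper used here and in Example no. 5 is not part of the snippet. Below is a minimal sketch of what it might look like; the form locators (card-id, password, the submit button) are hypothetical, and only the success check reuses selectors that actually appear in the tests:

def try_login_with(browser, card_id, password):
    # hypothetical locators: the real login form's markup is not shown in these examples
    browser.find_element_by_id('card-id').send_keys(card_id)
    browser.find_element_by_id('password').send_keys(password)
    browser.find_element_by_css_selector('button[type="submit"]').click()
    sleep(1)
    try:
        # the tests treat the toolbar title as the post-login landmark
        return '首页' in browser.find_element_by_class_name('q-toolbar-title').text
    except NoSuchElementException:
        return False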
Example no. 2
def extract_citation_for_publication(link):
    """
    this function craws the list of articles from a given link. If it has next page, it will continue to it until there is none
    @param[in]      profile_url     the link of google scholar profile you want to crawl
    @return         the list of articles as a list where each entry is dictionary
    """
    browser = Browser('chromedriver.exe')
    citation = {}
    # go to the citation view
    # as the page is rendered with JavaScript, we cannot get its content via urllib2;
    # instead we use Selenium to drive a real browser that renders the page
    # req = urllib2.Request(publication[k]['link'], headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'})
    # p = urllib2.urlopen(req)
    # sub_soup = BeautifulSoup(p.readlines()[0], 'html.parser')
    # s = sub_soup.find(id='gs_ccl')
    browser.get(link)
    while True:
        citation_root = browser.find_element_by_id('gs_ccl')
        citation_list = citation_root.find_elements_by_class_name('gs_r')
        for citation_item in citation_list:
            # title
            title = citation_item.find_element_by_class_name('gs_rt').text
            # try to get the download link, if there is one
            try:
                link = citation_item.find_element_by_id('gs_ggsW2')
                link = link.find_element_by_link_text(
                    link.text).get_attribute('href')
            except NoSuchElementException:
                link = None
            # author
            author_line = citation_item.find_element_by_class_name('gs_a')
            author_name = author_line.text.split(', ')
            author = {}
            # for each author, find their profile link, if it exists
            for a in author_name:
                try:
                    print('.', end='')
                    # there is a Google Scholar profile for this author
                    item = author_line.find_element_by_link_text(a)
                    author[a] = item.get_attribute('href')
                except NoSuchElementException:
                    # there is no such profile
                    author[a] = None
            # we could also press the Cite button to get detailed citation information; skipped here
            citation[title] = {'link': link, 'author': author}
        # go to next page, if there is one
        if not next_page(browser):
            break
    # close
    browser.close()
    return citation
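The next_page helper is not included in the snippet. A minimal sketch of the idea, assuming the results pager exposes a link whose text contains "Next"; the locator is an assumption, since the pager's actual markup is not captured here:

def next_page(browser):
    # return True if we navigated to the next results page, False otherwise
    try:
        browser.find_element_by_partial_link_text('Next').click()
        return True
    except NoSuchElementException:
        return False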
Example no. 3
def extract_movies(max_page_num=5):
    browser = Browser()
    browser.get(URL)
    movies = {}
    while True:
        # collect every movie on the current page
        movie_list = browser.find_elements_by_class_name('item')
        for movie in movie_list:
            title = movie.find_element_by_tag_name("p").text.strip()
            rating = movie.find_element_by_tag_name("strong").text.strip()
            movies[title] = rating
        # stop after max_page_num additional pages or when no more pages remain
        if max_page_num > 0:
            max_page_num -= 1
            if not have_more(browser):
                break
        else:
            break
    browser.close()
    return movies
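have_more is likewise left undefined. A sketch under the assumption that the movie chart paginates through an a[rel="next"] link; a different chart layout would need a different locator:

def have_more(browser):
    # follow the next-page link if one exists
    links = browser.find_elements_by_css_selector('a[rel="next"]')
    if not links:
        return False
    links[0].click()
    return True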
Example no. 4
def extract_hongren(max_page_num=5):
    suffix = "hongren"
    # normally, adding the directory that contains the driver binary to the PATH
    # environment variable is enough, but for some reason that did not work here,
    # so the driver simply sits in the same directory as the code
    browser = Browser('chromedriver.exe')
    browser.get(BASE_URL + suffix)
    items = {}
    while True:
        item_list = browser.find_elements_by_class_name('wall_item')
        for item in item_list:
            # By comes from selenium.webdriver.common.by
            href = item.find_element(By.CSS_SELECTOR, ".pic_box.pic").get_attribute("href")
            desc = item.find_elements_by_class_name("desc")[0].text.strip()
            items[href] = desc
        if max_page_num > 0:
            max_page_num -= 1
            if not scroll_to_next(browser):
                break
        else:
            break
    browser.close()
    return items
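As the comment in extract_hongren notes, the usual alternative to hard-coding the driver location is putting its directory on PATH. One way to do that from the script itself; the directory below is a placeholder, not a path from the original code:

import os

# prepend a (placeholder) directory containing chromedriver.exe to PATH,
# after which Browser() can be constructed without an explicit driver path
os.environ["PATH"] = r"C:\webdrivers" + os.pathsep + os.environ["PATH"]
browser = Browser()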
Example no. 5
def test_login():
    browser = Browser()
    # The user opens the home page
    browser.get(index_url)
    left_bar = browser.find_element_by_class_name('layout-aside-left')
    # The user sees the login button
    login_button = left_bar.find_element_by_class_name('q-card-main')
    assert '登录' in login_button.text
    # The user clicks login
    login_button.click()
    sleep(1)
    # The user tries to log in with an incorrect password
    success = try_login_with(browser, TESTING_CARD_ID, '000000')
    # The login does not succeed
    assert not success
    # The user then enters the correct password
    browser.refresh()
    sleep(1)
    success = try_login_with(browser, TESTING_CARD_ID, TESTING_PASSWORD)
    assert success
    # The login succeeds
    assert '首页' in browser.find_element_by_class_name('q-toolbar-title').text
    browser.close()
def thread(queue: Queue):
    while True:
        lp = queue.get()
        br = Browser()
        matches = LOGIN_PASSWORD_FORMAT.match(lp)
        login, password = matches.group('login'), matches.group('password')
        br.get(LOGIN_URL)
        while not (tag := br.find_element_by_id("email")).is_displayed():
            time.sleep(0.1)
        tag.send_keys(login)
        completed = False
        while not completed:  # there may be a case where the login goes first and then a Continue button before the password
            if (tag := br.find_element_by_id("password")).is_displayed():
                tag.send_keys(password)
                try:
                    br.find_element_by_id("btnLogin").click()
                    completed = True
                except WebDriverException:  # from selenium.common.exceptions; the click can fail mid-transition
                    pass
            elif (tag := br.find_element_by_id("btnNext")).is_displayed():
                try:
                    tag.click()
                except WebDriverException:
                    pass
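LOGIN_PASSWORD_FORMAT is referenced but never defined in this snippet. A plausible definition, assuming the queue delivers login:password strings; the separator is an assumption:

import re

# named groups match the matches.group('login') / matches.group('password') calls above
LOGIN_PASSWORD_FORMAT = re.compile(r"^(?P<login>[^:]+):(?P<password>.+)$")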
from selenium.webdriver import PhantomJS as Browser
import json
import time
import re

proxy_list_url = "http://spys.one/socks/"
proxies = []
br = Browser()
br.get(proxy_list_url)
sizes = [25, 50, 100, 200, 300, 500]
pattern = re.compile(r"[.\s]+\((\d+)\)")
for country_id in range(1, 171):
    try_counter = 0
    count = 0
    while (elm := br.find_element_by_id('tldc')).find_element_by_xpath(
            "./option[@selected]").get_attribute("value") != str(country_id):
        elm = elm.find_element_by_xpath(f'./option[@value="{country_id}"]')
        elm.click()
        try_counter += 1
        if try_counter >= 2:
            break
    if try_counter >= 2:
        continue
    count = int(pattern.findall(elm.text)[0])
    key = 0
    for key, size in enumerate(sizes):
        if int(size) > count:
            break
    try_counter = 0
    while (elm := br.find_element_by_id("xpp")).find_element_by_xpath(
            "./option[@selected]").get_attribute("value") != str(key):
from selenium.webdriver import Firefox as Browser

driver = Browser()
driver.get("www.google.com")
Example no. 9
def extract_publication(profile_url, verbose=verbose_citation_list):
    """
    this function crawl the publication list from the google scholar profile
    @param[in]      profile_url     the link of google scholar profile you want to crawl
    @param[in]      verbose         the level of information you want to scrawl. By default, we will scraw the detailed citation list for each of your publicaiton
    @return         the list of pulication as a list, where each entry is a dictionary
    """
    # the scholar's article list
    browser = Browser()
    browser.get(profile_url)
    publication = {}
    while True:
        publication_list = browser.find_elements_by_class_name('gsc_a_tr')
        for publication_item in publication_list:
            title = publication_item.find_element_by_class_name(
                'gsc_a_at').text
            print(title)
            author = publication_item.find_elements_by_class_name(
                'gs_gray')[0].text.split(', ')
            vendor = publication_item.find_elements_by_class_name(
                'gs_gray')[1].text
            try:
                citation = int(
                    publication_item.find_element_by_class_name(
                        'gsc_a_ac').text)
                link = publication_item.find_element_by_class_name(
                    'gsc_a_ac').get_attribute('href')
            except (NoSuchElementException, ValueError):
                citation = 0
                link = None
            try:
                year = int(
                    publication_item.find_element_by_class_name(
                        'gsc_a_h').text)
            except (NoSuchElementException, ValueError):
                year = None
            """
            # to get citation for every paper, but will be detected as robot
            if citation>0 and verbose>=verbose_citation_list:
                print 'and its citation list',
                # to solve anti-crawl, but not work
                # time.sleep(2)
                cited_by=extract_citation_for_publication(link)
            else:
                cited_by=None

            print 'finished'
            publication[title]={'link':link,'author':author,'vendor':vendor,'citation':citation, 'cited by': cited_by, 'year':year}
            """
            publication[title] = {
                'link': link,
                'author': author,
                'vendor': vendor,
                'citation': citation,
                'cited by': None,  # the detailed citation crawl above is skipped
                'year': year
            }
        if not next_page_new(browser):
            break
    browser.close()
    return publication
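next_page_new is not shown either. On a Scholar profile the publication table grows via a "Show more" button; a minimal sketch, assuming the button keeps the id gsc_bpf_more it has historically used (an assumption worth checking against the live page):

def next_page_new(browser):
    # click 'Show more' to load the next batch of publications
    try:
        button = browser.find_element_by_id('gsc_bpf_more')
    except NoSuchElementException:
        return False
    if not button.is_enabled():
        # Scholar disables the button once everything is loaded
        return False
    button.click()
    sleep(2)  # give the next batch time to render
    return True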