Example #1
def main_func(search_term):
    search_term = url_encode(search_term)
    browser = webdriver.Chrome("%AZ_BATCH_NODE_WORKING_DIR%\\chromedriver 2")
    scrapeCNN(browser, search_term)
    scrapeBBC(browser, search_term)
    scrapeFOX(browser, search_term)
    export_json()

    # Set the limit for the number of articles to download
    LIMIT = 30
    data = {}
    data['newspapers'] = {}

    documents = {"documents": []}

    count = 1

    # Iterate through each news company
    for company, value in all_data.items():
        if 'rss' in value:
            d = fp.parse(value['rss'])
            print("Downloading articles from ", company)

            newsPaper = {
                "rss": value['rss'],
                "link": value['link'],
                "articles": []
            }

            for entry in d.entries:
                # Check whether a publish date is provided; if not, the article is skipped.
                # This keeps the data consistent and prevents the script from crashing.
                if hasattr(entry, 'published'):
                    if count > LIMIT:
                        break
                    article = {}
                    article['link'] = entry.link
                    date = entry.published_parsed
                    article['published'] = datetime.fromtimestamp(
                        mktime(date)).isoformat()
                    try:
                        content = Article(entry.link)
                        content.download()
                        content.parse()
                    except Exception as e:
                        # If the download for some reason fails (ex. 404) the script will continue downloading
                        # the next article.
                        print(e)
                        print("continuing...")
                        continue
                    article['title'] = content.title
                    article['text'] = content.text
                    newsPaper['articles'].append(article)
                    print(count, "articles downloaded from", company,
                          ", url: ", entry.link)
                    count += 1

        else:
            # Fallback when an RSS feed link is not provided.
            # Uses the Python newspaper library to extract articles.
            print("Building site for ", company)

            for link in value['link']:
                content = Article(link)

                newsPaper = {"link": link, "articles": []}

                noneTypeCount = 0

                if count > LIMIT:
                    break
                try:
                    content.download()
                    content.parse()
                except Exception as e:
                    print(e)
                    print("continuing...")
                    continue
                # Again, for consistency, the article is skipped if no publish date is found.
                # After 10 articles from the same newspaper without a publish date, the company is skipped.

                article = {}
                article['title'] = content.title
                article['text'] = content.text
                article['link'] = content.url
                if content.publish_date is not None:
                    article['published'] = content.publish_date.isoformat()
                newsPaper['articles'].append(article)

                info = {}

                if len(content.text) < 5100:
                    info["id"] = company + str(count)
                    info["title"] = content.title
                    info['link'] = content.url
                    info['source'] = company
                    info["language"] = "en"
                    info["text"] = content.text

                    documents["documents"].append(info)

                    print(count, "articles downloaded from", company,
                          " using newspaper, url: ", content.url)
                    count += 1
                    noneTypeCount = 0

                    data['newspapers'][company] = newsPaper

    run_sample()
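
The %AZ_BATCH_NODE_WORKING_DIR% token passed to webdriver.Chrome above is Windows shell syntax; Python does not expand it, so ChromeDriver receives the literal string. A minimal sketch of resolving the variable first, assuming it is set in the environment:

import os

from selenium import webdriver

# Expand the Azure Batch working directory ourselves instead of
# passing the unexpanded %...% token through to ChromeDriver.
driver_dir = os.environ["AZ_BATCH_NODE_WORKING_DIR"]  # KeyError if unset
browser = webdriver.Chrome(os.path.join(driver_dir, "chromedriver 2"))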
Example #2
from selenium import webdriver

driver = webdriver.Chrome(
    '/home/alejandro/Desarrollo de Software/Laboratorio4/chromedriver')
driver.get("https://www.python.org")
boton = driver.find_element_by_xpath('//*[@id="news"]/a')
boton.click()
search_field = driver.find_element_by_xpath('//*[@id="id-search-field"]')
search_field.send_keys('disc')
search_field.clear()
Example #3
# You should run retrain.py before using this
import datetime, json, math, re
from html import unescape
from time import sleep
import tweepy
from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
)
from selenium.webdriver.common.keys import Keys
import Markov

# Configurable options
delay = 10  # time to wait on each page load before reading the page
driver = webdriver.Chrome()  # options are Chrome() Firefox() Safari()

try:
    with open("data.json") as f:
        data = json.load(f)
except IOError as e:
    print(e)
    exit()

print("Connecting to Twitter API")
# connect to twitter api
auth = tweepy.OAuthHandler(
    data["keys"]["consumer_token"],
    data["keys"]["consumer_secret"],
    "https://auth.r0uge.org",
)
Example #4

class runTestCase(object):
    log = Logger(level="debug").logger
    toFileLog = Logger(level="debug")
    module_path = path.pathutil().rootPath
    imagepath = ''
    datatypemenu = ['ElementsInfo', 'InputData', 'OutputData']
    browser = webdriver.Chrome()
    urlReg = re.compile(
        r'^(?:http|ftp)s?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$',
        re.IGNORECASE)

    def __init__(self):
        # Keys are the Chinese action names used in the test data:
        # 打开 = open, 点击 = click, 输入 = input, 清除 = clear,
        # 等待 = wait, 断言 = assert, 引入 = introduce, 最大化 = maximize
        self.switch = {
            "打开": self.test_open,
            "点击": self.test_click,
            "输入": self.test_input,
            "清除": self.test_clear,
            "等待": self.test_wait,
            "断言": self.test_assertion,
            "引入": self.test_introduce,
            "最大化": self.test_maximize,
        }

    # Helper that saves a screenshot
    def save_img(self, img_name):
        self.imagepath = self.module_path + '/Output/Resources/' + img_name + '.png'
        self.browser.get_screenshot_as_file(self.imagepath)

    # Setup function, executed before each test case
    def setUp(self, url):
        self.starttime = parse(
            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        self.browser.set_window_size(1920, 1080)
        print("开始测试时间:", self.starttime)
        self.browser.get(url)
        #time.sleep(3)

    # Teardown function, executed after each test case
    def tearDown(self):
        #time.sleep(3)
        self.browser.quit()
        self.endtime = parse(
            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        print("测试结束时间:", self.endtime)
        totaltime = (self.endtime - self.starttime).total_seconds()
        print("总时长:", totaltime, "秒")

    def switchRun(self, casedealinfoDic):
        ifrerun = False
        if casedealinfoDic[1][0] is not None:
            if self.urlReg.match(DPD.getAllData().checkIfExists(
                    self.datatypemenu[1], casedealinfoDic[1][0])):
                ifrerun = True
        try:
            self.switch[casedealinfoDic[1][2]](casedealinfoDic, ifrerun)
        except KeyError as e:
            self.log.error(e)

    def test_open(self, casedealinfoDic, ifrerun):
        if ifrerun:
            self.setUp(DPD.getAllData().checkIfExists(self.datatypemenu[1],
                                                      casedealinfoDic[1][0]))
            self.save_img(casedealinfoDic[0])
        self.tearDown()

    def test_click(self, casedealinfoDic, ifrerun):
        pass

    def test_input(self, casedealinfoDic, ifrerun):
        pass

    def test_clear(self, casedealinfoDic, ifrerun):
        pass

    def test_wait(self, casedealinfoDic, ifrerun):
        pass

    def test_assertion(self, casedealinfoDic, ifrerun):
        pass

    def test_introduce(self, casedealinfoDic, ifrerun):
        pass

    def test_maximize(self, casedealinfoDic, ifrerun):
        pass
Example #5

 def __init__(self):
     super().__init__()
     self.driver = webdriver.Chrome(
         os.path.join(settings.BASE_DIR, "drivers/chromedriver"))
Example #6
 def ChromeDriverBrowser(self):
     driverChrome = webdriver.Chrome()
     return driverChrome
Example #7
 def setUp(self):
     self.browser = webdriver.Chrome()
     self.browser.implicitly_wait(3)  # Selenium will wait up to 3 seconds
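
implicitly_wait applies that 3-second polling budget to every element lookup on the driver. When only one element needs the grace period, an explicit wait expresses the same idea locally; a minimal sketch against python.org (the locator is only for illustration):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Chrome()
browser.get("https://www.python.org")
# Poll up to 3 seconds for this one element instead of all lookups
search_field = WebDriverWait(browser, 3).until(
    EC.presence_of_element_located((By.ID, "id-search-field")))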
Example #8

import re

import xlsxwriter
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.alert import Alert
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
# import config
# import config_request

workbook = xlsxwriter.Workbook('kdx_3001~3500.xlsx')
worksheet = workbook.add_worksheet()

chrome_options = Options()
chrome_options.add_argument('headless')
chrome_options.add_argument('--log-level=3')
driver = webdriver.Chrome('./chromedriver/chromedriver', chrome_options=chrome_options)

data = []

for num in range(3001, 3182):
    driver.get(f'https://kdx.kr/data/view?product_id={num}')

    try:
        WebDriverWait(driver, 1).until(EC.alert_is_present())
        Alert(driver).accept()
        print(f'id:{num} error')
        continue
    except TimeoutException:
        print(f'id:{num} crawling')
        # print("Alert not found. Move on...")
Example #9
# go and slowly add all the 1m who follow magic_fox? or selected? or Samsoe et Samsoe?
# how about you graph the number of followers the people who follow magic_fox have.
# bet there is a bunch of bots at the high end, but the vast majority have < 2000 followers.
# These are the real people to target.


# Look at the people who like the #menswear posts --- follow them and like a photo
# Look at the people who follow the menswear bloggers --- follow them and like a photo



from random import randint
from time import sleep

from selenium import webdriver

TagList = ['mensfashion', 'menswear', 'menstyle', 'mensstyle', 'menwithclass', 'menwithstyle']

Posts = []

driver = webdriver.Chrome('/Users/brianb/Dropbox/Instagram/chromedriver')

def RandomSleep(low,high):
    sleep(randint(low,high))

def RandomLike(percent):
    # Return True roughly `percent` percent of the time
    return randint(0, 100) > 100 - percent

def Login():
    driver.get('https://www.instagram.com/')
    User = Secrets.get('User')
Example #10
 def set_driver(self):
     self.driver = webdriver.Chrome()
     print('[+] Driver started...')
Example #11
 def __init__(self):
     self.driver = webdriver.Chrome()
     self.url = "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput="
     self.positions = []
Example #12
 def __init__(self):
     self.driver = webdriver.Chrome(
         executable_path="E:\chromedriver\chromedriver.exe")
Example #13
            for alist in span.find_all('a'):
                each_authors.append(alist.string)
            all_authors.append(each_authors)           
'''



if __name__ == "__main__":
    # A professor's Baidu Scholar homepage
    url = "http://xueshu.baidu.com/scholarID/CN-B8748R1J"

    chrome_options = Options()
    # specify headless mode
    chrome_options.add_argument("--headless")
    browser = webdriver.Chrome(options=chrome_options)
    browser.set_page_load_timeout(600)
    browser.set_script_timeout(600)
    browser.get(url)

    wait = WebDriverWait(browser, 10)
    wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="articlelist_container"]/div[2]/div[1]/div[20]')))  # index="19"
    #browser.implicitly_wait(10)

    title = browser.find_elements_by_xpath('//*[@id="articlelist_container"]/div[2]/div[1]')
    content = title[0].get_attribute('innerHTML')
    soup = BeautifulSoup(content, 'lxml')

    Title, ArticleUrl = HomePageInfo(browser)

    # Continue with the content from page two onward
Example #14

import time
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import os
import re

# Enter the path of the driver
# browser = webdriver.Chrome()
browser = webdriver.Chrome(executable_path=r"C:\Users\Parth\Documents\540\android_review_crawler\ios_Xamarin_and_React_apps_reviews\chromedriver.exe")
## change path depending on whether it's a native or framework app
# path = "./Native_ios_apps_reviews"
path = "C:\\Users\\Parth\\Documents\\540\\android_review_crawler\\ios_Xamarin_and_React_apps_reviews"
# Tell Selenium to get the URL you're interested in.

data = pd.read_csv("apps data.csv", index_col=False)
main_info = []
for d in range(len(data)):
    # app_name = data.iloc[d][0]
    url = data.iloc[d][1]

    # url = "https://play.google.com/store/apps/details?id=com.facebook.katana&showAllReviews=true"
    browser.get(url)
    SCROLL_PAUSE_TIME = 0.5

    time.sleep(5)  # wait for the DOM to be ready
    page = browser.page_source
    soup_expatistan = BeautifulSoup(page, "html.parser")

    app_name = soup_expatistan.find("h1", class_="product-header__title app-header__title").text.split()[0]
    overall_and_number = soup_expatistan.find("li",
Example #15
def activateAndSubscribe(name):
    goToServiceTemplates()
    clickOnGlobalListName(name)
    # Activate subscription
    if driver.find_elements_by_class_name("s-btn")[3].text == 'Activate':
        driver.find_elements_by_class_name("s-btn")[3].click()
    # go to subscriptions tab
    driver.find_element_by_id('subscriptions').click()
    # create new subscription
    driver.find_element_by_id('subscriptions_create').click()
    # give the first customer a brand new subscription!
    clickOnGlobalList(1)
    submitForm()
    driver.find_element_by_class_name("action").click()

driver = webdriver.Chrome('/chromedriver/chromedriver')
#def deleteAllResources():
if True:

    try:
        driver.get("http://z.pyer.apsdemo.org:8080/")
        instance="cdn"
        #assert 'Parallels® Automation' in driver.title
        login('admin','1qazXSW@')

        services=['VDN Embratel globals', 'VDN Embratel Management',   'VDN Live Channels', 
                  'VDN Content',          'VDN Job',                   'Content Delivery Network',
                  'VDN_HTTP_Traffic',     'VDN_HTTPS_Traffic',         'VDN_VOD_Encoding_Minutes', 
                  'VDN_VOD_Storage_MbH',  'VDN_Live_Encoding_Minutes', 'VDN_Live_DVR_Minutes']

        createAppReference(services[0])
    except Exception as e:
        print("Error in GetAll " + str(e))

Example #16

    def Paginacion(self):
        try:
            buton = driver.find_element_by_class_name("next")
            buton.click()
            return

        except:
            print("An exception occurred Paginacion")


if "__main__" == __name__:
    PATH = "C:\Pentaho\chromedriver.exe"
    driver = webdriver.Chrome(PATH)
    driver.implicitly_wait(2)
    Url = "https://www.nationalcrimeagency.gov.uk/most-wanted-search"
    driver.get(Url)
    table = Table(driver)
    Lista = []
    rows = []
    ##Lista=table.GetAll()
    #total = len(Lista)
    for i in range(22):
        elemento = table.GetAll()
        if elemento is not None:
            if len(elemento) > 0:
                elemento[i].click()
                data = table.get_rows()
                if data is not None:
Example #17
 def ChromeDriverNOBrowser(self):
     chrome_options = Options()
     chrome_options.add_argument('--headless')
     chrome_options.add_argument('--disable-gpu')
     driverChrome = webdriver.Chrome(chrome_options=chrome_options)
     return driverChrome
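
For reference, newer stacks spell this slightly differently: Selenium 4 deprecates the chrome_options= keyword in favor of options=, and recent Chrome builds ship a rewritten headless mode. A hedged equivalent sketch under those versions:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument('--headless=new')  # new headless mode (Chrome 109+)
chrome_options.add_argument('--disable-gpu')
driver = webdriver.Chrome(options=chrome_options)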
Example #18

from selenium import webdriver

driver = webdriver.Chrome()
Example #19

 def setUp(self):
     self.driver = webdriver.Chrome()
     self.driver.maximize_window()
Example #20
def browser(request):
    wd = webdriver.Chrome()
    request.addfinalizer(wd.quit)
    return wd
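
This is the pytest fixture pattern (the @pytest.fixture decorator presumably sits on browser in the original file): addfinalizer registers wd.quit so the browser closes even when a test fails. A minimal usage sketch with a hypothetical test:

import pytest
from selenium import webdriver

@pytest.fixture
def browser(request):
    wd = webdriver.Chrome()
    request.addfinalizer(wd.quit)  # runs after the test, pass or fail
    return wd

def test_homepage_title(browser):  # hypothetical test consuming the fixture
    browser.get("https://www.python.org")
    assert "Python" in browser.title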
Example #21
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

driver = webdriver.Chrome()
driver.get("http://localhost/progetto/")
driver.maximize_window()

time.sleep(2)

link = driver.find_element_by_name("nome").send_keys("admin")
link = driver.find_element_by_name("password").send_keys("admin")
link = driver.find_element_by_name("login").click()

time.sleep(2)

driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")

time.sleep(2)

################################
# insert an animal species
driver.find_element_by_link_text("Amministrazione").click()
time.sleep(2)
driver.find_element_by_xpath(
    '//*[@id="navbarSupportedContent"]/ul/li[9]/div/a[1]').click()
time.sleep(2)
driver.find_element_by_name("Classe").send_keys("REPTILIA")
driver.find_element_by_name("NomeLatino").send_keys("Nuova Viperw")
driver.find_element_by_name("NomeItaliano").send_keys("Vipera Velenosa")
driver.find_element_by_name("AnnoClassificazione").send_keys("1999")
Example #22

 def setup_method(self, method):
     self.driver = webdriver.Chrome()
     self.vars = {}
Example #23
 def selenium_test_run(self,url):
     driver = webdriver.Chrome("C:\chromedriver4.exe")
     driver.get(url)
     driver.quit()
Example #24
from selenium import webdriver
browser = "ie"
if browser == "chrome":
    driver = webdriver.Chrome(
        executable_path=
        "C:/Users/Heggade/PycharmProjects/s_class/drivers/chromedriver.exe")
elif browser == "firefox":
    driver = webdriver.Firefox(
        executable_path=
        "C:/Users/Heggade/PycharmProjects/s_class/drivers/geckodriver.exe")
elif browser == "ie":
    driver = webdriver.Ie(
        executable_path=
        "C:/Users/Heggade/PycharmProjects/s_class/drivers/IEDriverServer.exe")
else:
    print("provide appropriate browser name")
driver.get("http://facebook.com")
driver.maximize_window()
driver.find_element_by_id("email").send_keys("test")
driver.find_element_by_id("pass").send_keys("pass")
driver.find_element_by_id("u_0_2").click()

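The if/elif chain above grows linearly with each new browser; a sketch of the same dispatch through a lookup table, reusing the example's paths:

from selenium import webdriver

DRIVERS = {
    "chrome": (webdriver.Chrome, "C:/Users/Heggade/PycharmProjects/s_class/drivers/chromedriver.exe"),
    "firefox": (webdriver.Firefox, "C:/Users/Heggade/PycharmProjects/s_class/drivers/geckodriver.exe"),
    "ie": (webdriver.Ie, "C:/Users/Heggade/PycharmProjects/s_class/drivers/IEDriverServer.exe"),
}

browser = "ie"
try:
    cls, path = DRIVERS[browser]
except KeyError:
    raise SystemExit("provide appropriate browser name")
driver = cls(executable_path=path)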
Example #25
        body = json.dumps({'cmd': cmd, 'params': params})
        response = driver.command_executor._request('POST', url, body)
        if response['status']:
            raise Exception(response.get('value'))
        return response.get('value')

    def add_script(driver, script):
        '''Run JS before each page loads'''
        send(driver, "Page.addScriptToEvaluateOnNewDocument", {"source": script})

    # Attach a method named add_script to webdriver.Chrome
    webdriver.Chrome.add_script = add_script  # webdriver.Chrome here may need changing when driving a different browser
    # *************** spoofing ###################

    browser = webdriver.Chrome(
        executable_path=driver_path,
        chrome_options=options
    )

    # ################## debugging aid *********************
    existed = {
        'executor_url': browser.command_executor._url,  # address a remote client can use to attach to this browser
        'session_id': browser.session_id  # browser session ID
    }
    pprint(existed)
    with open('existed.json', 'wt', encoding='utf-8') as f:
        json.dump(existed, f, ensure_ascii=False, indent=4)
    # ********************* debugging aid ##################

    # ############### spoofing ***************************
    browser.add_script("""
    Object.defineProperty(navigator, 'webdriver', {
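
The snippet is cut off mid-call above, but the intent is clear: inject JS before every page load so navigator.webdriver reads as undefined. Recent Selenium Chrome bindings expose the same CDP command directly via execute_cdp_cmd, so the hand-rolled send/add_script helpers can be skipped; a hedged equivalent:

# Same effect using the built-in CDP helper on Chromium drivers
browser.execute_cdp_cmd(
    "Page.addScriptToEvaluateOnNewDocument",
    {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"},
)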
Example #26

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
import io

#save the abstracts
driver = webdriver.Chrome("C:/Users/Supreme Ruler/Desktop/chromedriver.exe")


def goto_page(url):
    """Navigates to given url, url must be string"""
    driver.get(url)


def enter_query(query):
    """Enters the wanted query into the pubmed search bar and presses RETURN"""
    text_bar = driver.find_element_by_id("term")
    text_bar.send_keys(query)
    text_bar.send_keys(Keys.RETURN)


def get_href():
    """Ïterates through all results, adding href and title to list. Returns that list."""
    href_list = []
    try:
        total_pages = driver.find_element_by_name(
            "EntrezSystem2.PEntrez.PubMed.Pubmed_ResultsPanel.Pubmed_Pager.cPage"
        )
        page_end = total_pages.get_attribute("last")
    except NoSuchElementException:
        page_end = 1
Example #27
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import json,time,random


with open('data/config.json') as config_file:
    data = json.load(config_file)


url_login = r'https://www.linkedin.com/login'
url_job_search = r'https://www.linkedin.com/jobs/search/?currentJobId=2514734378&f_AL=true&f_E=2&f_WRA=true&geoId=103644278&keywords=software%20engineer&location=United%20States'

driver = webdriver.Chrome(data['driver_path'])  # avoid shadowing the webdriver module

driver.get(url_login)
username_element = driver.find_element_by_css_selector('#username')
username_element.clear()
username_element.send_keys(data['email'])
password_element = driver.find_element_by_css_selector('#password')
password_element.clear()
password_element.send_keys(data['password'])
password_element.send_keys(Keys.RETURN)

driver.get(url_job_search)
random_time = random.uniform(3.5, 4.9)
time.sleep(random_time)

amount_of_results = driver.find_element_by_css_selector('body > div.application-outlet > div.authentication-outlet > div.job-search-ext > div > div > section.jobs-search__left-rail > div > header > div.jobs-search-results-list__title-heading > small')

print(f'amount_of_results: {amount_of_results.text}')
Example #28
import json
import os
import traceback
from tkinter import filedialog
from tkinter import *
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

options = webdriver.ChromeOptions()
options.add_argument("--user-data-dir=/Users/Vanze/Library/Application Support/Google/Chrome/Profile 6")
browser = webdriver.Chrome('./chromedriver', options = options)

delay = 30

# Google Sheets stuff
# You should change these to match your own spreadsheet
if os.path.exists('gsheet_id.txt'):
    with open('gsheet_id.txt', 'r') as file:
        json_repr = file.readline()
        data = json.loads(json_repr)
        GSHEET_ID = data["GSHEET_ID"]
        RANGE_NAME = data["RANGE_NAME"]
else:
    GSHEET_ID = '10PmGsjxMXvIMDIig1QiS-YVYxqOClZEvEu8B9Z69MeA'
    RANGE_NAME = 'Transactions!A:I'
    
Example #29
# -*- coding: utf-8 -*-

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import sys
import time
#fo = open('111.log','w')
#sys.stdout = fo

browser = webdriver.Chrome()
#browser=webdriver.PhantomJS(service_args=['--ssl-protocol=any'])#PhantomJS
browser.get('https://rate.taobao.com/user-rate-95cf5ce4398a3bd471b08cbe9bf6e0fb.htm?spm=2013.1.1000126.10.68ab26c2ws4Vvj')
#browser.get(url)
time.sleep(5)
describe_score = browser.find_element_by_xpath('//*[@id="dsr"]/li[1]/div[1]/em[1]')
print('describe_score:', describe_score.text)
attitude_score = browser.find_element_by_xpath('//*[@id="dsr"]/li[2]/div[1]/em[1]')
print('attitude_score:', attitude_score.text)
logistics_score = browser.find_element_by_xpath('//*[@id="dsr"]/li[3]/div[1]/em[1]')
print('logistics_score:', logistics_score.text)
store_rating = browser.find_element_by_xpath('//*[@id="dsr"]/li[2]/div[2]/div/div[1]/em')
print('store_rating:', store_rating.text)
browser.quit()

Example #30
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome(
    executable_path=
    r'C:\Users\Akumar4\Downloads\chromedriver_win32\chromedriver.exe')

driver.get('https://www.expedia.com/')
driver.maximize_window()

try:
    # path = '//*[@id="reasons-to-believe-banner"]/li[1]/span[2]' #
    wait = WebDriverWait(driver, 10)
    # ele = wait.until(EC.presence_of_element_located((By.XPATH, path)))#
    # ele = wait.until(EC.title_is('Expedia Travel: Search Hotels, Cheap Flights, Car Rentals & Vacations')) #
    ele = wait.until(
        EC.element_to_be_clickable(
            (By.XPATH, '//*[@id="header-account-menu"]')))

finally:
    driver.quit()