Example #1
0
 def __init__(self, *args, **kwargs):
     """Start a headless Firefox session and keep it on ``self.driver``.

     Positional and keyword arguments are accepted but not used here.
     """
     headless_options = Options()
     headless_options.headless = True
     self.driver = webdriver.Firefox(options=headless_options)
Example #2
0
def driver():
    """Yield a Firefox driver, then quit it when the generator is resumed.

    The driver is created with default Firefox options and the
    ``projectname`` / ``jobname`` keyword arguments shown below.
    """
    firefox = webdriver.Firefox(firefox_options=Options(),
                                projectname="Examples",
                                jobname=None)
    yield firefox
    # Teardown: runs once the consumer of the generator is done with it.
    firefox.quit()
Example #3
0
"""
For now this scraper is standalone
"""
import json

from bs4 import BeautifulSoup
from selenium import webdriver
import requests
from selenium.webdriver.firefox.options import Options

# Start Firefox in headless mode (no visible browser window).
options = Options()
options.headless = True
driver = webdriver.Firefox(options=options)
# Starts empty; presumably filled per course by the loop below -- TODO confirm.
courses_data = {}

# NOTE(review): opened without a `with` block; no matching close() is visible
# in this section, so the handle may stay open until the process exits.
course_code_file = open("course_codes.txt", "r")

for line in course_code_file.readlines():
    course = line.split(" ")[0]
    print(course)
    url = "https://www.handbook.unsw.edu.au/undergraduate/courses/2021/{}".format(
        course)
    try:
        driver.get(url)

        inner = driver.find_element_by_class_name("OverviewInner")
        print("Getting course:", course)

        buttons = inner.find_elements_by_tag_name("button")
        buttons[0].click()
Example #4
0
    def __init__(self,
                 client="firefox",
                 username="******",
                 proxy=None,
                 command_executor=None,
                 loadstyles=True,
                 profile=None,
                 headless=False,
                 autoconnect=True,
                 logger=None,
                 extra_params=None,
                 chrome_options=None):
        """Initialise the webdriver for the requested client.

        Args:
            client: "firefox", "chrome" or "remote"; matched
                case-insensitively.
            username: Stored on ``self.username``.
            proxy: Passed to ``self.set_proxy`` for Firefox, or used as
                Chrome's ``--proxy-server`` argument.
            command_executor: Forwarded to ``webdriver.Remote``.
            loadstyles: When false, Firefox preferences that disable CSS,
                images and Flash are set on the profile.
            profile: Path to an existing browser profile, or ``None``.
            headless: Run the browser in headless mode.
            autoconnect: Call ``self.connect()`` once the driver is ready.
            logger: Logger to use instead of the existing ``self.logger``.
            extra_params: Extra keyword arguments for the webdriver
                constructor.
            chrome_options: Iterable of additional Chrome arguments.

        Raises:
            WhatsAPIException: If ``profile`` is given but the path does not
                exist.
        """

        self.logger = logger or self.logger
        extra_params = extra_params or {}

        if profile is not None:
            self._profile_path = profile
            self.logger.info("Checking for profile at %s" % self._profile_path)
            if not os.path.exists(self._profile_path):
                self.logger.critical("Could not find profile at %s" % profile)
                raise WhatsAPIException("Could not find profile at %s" %
                                        profile)
        else:
            self._profile_path = None

        self.client = client.lower()
        if self.client == "firefox":
            if self._profile_path is not None:
                self._profile = webdriver.FirefoxProfile(self._profile_path)
            else:
                self._profile = webdriver.FirefoxProfile()
            if not loadstyles:
                # Disable CSS
                self._profile.set_preference('permissions.default.stylesheet',
                                             2)
                # Disable images
                self._profile.set_preference('permissions.default.image', 2)
                # Disable Flash
                self._profile.set_preference(
                    'dom.ipc.plugins.enabled.libflashplayer.so', 'false')
            if proxy is not None:
                self.set_proxy(proxy)

            options = Options()

            if headless:
                options.headless = True

            options.profile = self._profile

            capabilities = DesiredCapabilities.FIREFOX.copy()
            capabilities['webStorageEnabled'] = True

            self.logger.info("Starting webdriver")

            # The bundled geckodriver binary has a platform-specific suffix.
            self.executable_path = './WebWhatsApi/driver/geckodriver'
            if platform.system().lower() == "windows":
                self.executable_path += ".exe"
            elif platform.system().lower() == "linux":
                self.executable_path += "-linux"
            self.executable_path = os.path.abspath(self.executable_path)

            self.driver = webdriver.Firefox(
                executable_path=self.executable_path,
                capabilities=capabilities,
                options=options,
                **extra_params)

        elif self.client == "chrome":

            # For Chrome the "profile" is the ChromeOptions object itself.
            self._profile = webdriver.ChromeOptions()
            if self._profile_path is not None:
                self._profile.add_argument("user-data-dir=%s" %
                                           self._profile_path)
            if proxy is not None:
                self._profile.add_argument('--proxy-server=%s' % proxy)
            if headless:
                self._profile.add_argument('headless')
            if chrome_options is not None:
                for option in chrome_options:
                    self._profile.add_argument(option)

            self.logger.info("Starting webdriver")

            # The bundled chromedriver binary has a platform-specific suffix.
            self.executable_path = './WebWhatsApi/driver/chromedriver'
            if platform.system().lower() == "windows":
                self.executable_path += ".exe"
            elif platform.system().lower() == "linux":
                self.executable_path += "-linux"
            self.executable_path = os.path.abspath(self.executable_path)

            self.driver = webdriver.Chrome(
                executable_path=self.executable_path,
                chrome_options=self._profile,
                **extra_params)

        # Compare the normalised name so "Remote"/"REMOTE" behave like the
        # other clients, which are already matched case-insensitively.
        elif self.client == 'remote':
            if self._profile_path is not None:
                self._profile = webdriver.FirefoxProfile(self._profile_path)
            else:
                self._profile = webdriver.FirefoxProfile()
            capabilities = DesiredCapabilities.FIREFOX.copy()
            self.driver = webdriver.Remote(command_executor=command_executor,
                                           desired_capabilities=capabilities,
                                           **extra_params)

        else:
            # NOTE(review): no driver is assigned in this branch, so the
            # lines below rely on ``self.driver`` existing already -- confirm
            # whether this should raise instead of only logging.
            self.logger.error("Invalid client: %s" % client)
        self.username = username
        self.wapi_functions = WapiJsWrapper(self.driver, self)

        self.driver.set_script_timeout(500)
        self.driver.implicitly_wait(10)

        if autoconnect:
            self.connect()
def scrap(request):
    """Scrape the room-stock table from the Rakuten management portal.

    Logs in, opens the stock page, walks the rows of the calendar table and
    builds one record per non-empty stock cell.

    Args:
        request: Incoming request object; it is not read by this function.

    Returns:
        An ``HttpResponse`` whose body is a JSON list of records with the
        keys ``date``, ``hotel_id``, ``room_type_id``, ``room_type_name``,
        ``room_stock`` and ``reservations``.
    """
    # 1-based row numbers that are skipped entirely.
    skipped_rows = (1, 2, 3, 13, 14, 24, 25, 35, 36, 46, 47, 57)
    # 1-based row numbers whose first non-empty cell gives the starting day
    # for the stock rows that follow.
    date_rows = (4, 15, 26, 37, 48)
    blank_cells = ('', ' ')

    options = Options()
    options.headless = True
    driver = webdriver.Firefox(options=options,
                               executable_path='/usr/local/bin/geckodriver')

    my_list = []
    # Quit the browser even when a lookup or conversion below fails.
    try:
        driver.get(
            "https://manage.travel.rakuten.co.jp/portal/inn/mp_kanri_image_up.main"
        )
        time.sleep(3)

        # NOTE(review): credentials are hard-coded here; consider loading
        # them from configuration instead.
        driver.find_element_by_name("f_id").send_keys("first-t")
        driver.find_element_by_name("f_pass").send_keys("first-75")

        driver.find_element_by_xpath(
            "/html/body/table/tbody/tr/td/table[1]/tbody/tr/td/table/tbody/tr/td/table/tbody/tr[1]/td/table/tbody/tr[2]/td/form/table/tbody/tr[2]/td[3]/input"
        ).click()

        driver.find_element_by_xpath(
            "/html/body/table[2]/tbody/tr/td[3]/table[3]/tbody/tr[2]/td[1]/table/tbody/tr[3]/td/table/tbody/tr[1]/td[4]/input[1]"
        ).click()

        jp_date = driver.find_element_by_xpath(
            "/html/body/table[11]/tbody/tr/td/table/tbody/tr[1]/td[2]").text
        # Strip the year ('年') and month ('月') markers. The year marker
        # used to be a mis-encoded literal that could never match page text.
        yearmonth = str(jp_date).replace('年', '').replace('月', '')

        table = driver.find_element_by_xpath(
            "/html/body/table[11]/tbody/tr/td/table")
        rows = table.find_elements_by_tag_name("tr")

        base = 1
        for rowcount, row in enumerate(rows, start=1):
            print("####################  ROW " + str(rowcount) +
                  " #################")

            cols = row.find_elements_by_tag_name("td")
            # Rows without <td> cells have nothing to read.
            if not cols or rowcount in skipped_rows:
                continue

            # First column looks like "<room type id>:<room type name>".
            # The name is reset for every row so it can no longer be unbound
            # or carried over from an earlier row.
            name_parts = str(cols[0].text).split(':')
            first_col_first = name_parts[0]
            first_col_second = name_parts[1] if len(name_parts) > 1 else ''

            if rowcount in date_rows:
                print("Hello")
                for col in cols[1:]:
                    col_text = col.text
                    if col_text not in blank_cells:
                        base = col_text
                        break
                continue

            base = str(base)
            print("Base: " + base)
            # Strip the day marker ('日'); this literal was mis-encoded too.
            base = int(base.replace('日', ''))
            print("Base" + str(base) + "BASE")

            day = 0
            colcount = 1
            for col in cols[1:]:
                col_text = col.text  # read once; each access asks the driver
                # '済' marks a cell as "already" done; such cells are skipped.
                if col_text in blank_cells or col_text == '済':
                    continue

                print("######  COLUMN " + str(colcount) + " ######")

                print("Room Type ID")
                print(first_col_first)

                print("Room Type Name")
                print(first_col_second)

                print("Stock")
                print(col_text)

                # Cell text is "<stock>/<reservations>"; the reservations
                # value used to be read from the stock part by mistake.
                stock_parts = col_text.split('/')
                col_text_first = stock_parts[0]
                col_text_second = (stock_parts[1]
                                   if len(stock_parts) > 1 else '')

                print("Date")
                fulldate = yearmonth + str(base + day).zfill(2)
                print(fulldate)

                my_list.append({
                    "date": fulldate,
                    "hotel_id": 4304,
                    "room_type_id": first_col_first,
                    "room_type_name": first_col_second,
                    "room_stock": col_text_first,
                    "reservations": col_text_second,
                })

                day += 1
                colcount += 1
    finally:
        driver.quit()

    my_json = json.dumps(my_list, ensure_ascii=False)
    return HttpResponse(my_json)
Example #6
0
    # parsear conteudo HTML
    soup = BeautifulSoup(html_content, 'html.parser')
    table = soup.find(name='table')

    # Estruturar conteudo em um dataframe
    df_full = pd.read_html(str(table))[0].head(10)
    df = df_full[['Unnamed: 0', 'PLAYER', 'TEAM', label]]
    df.columns = ['pos', 'player', 'team', 'total']

    # transformar os dados em um dicionário de dados prórpio
    return df.to_dict('records')


# Instantiate Firefox.
option = Options()
option.headless = True
# NOTE(review): `option` is built but the call that would use it is commented
# out, so the browser below is started without the headless setting.
# driver = webdriver.Firefox(options=option)
driver = webdriver.Firefox()

driver.get(url)
# Fixed two-second pause after the navigation above.
time.sleep(2)

# Build one ranking per key and store it under that key.
for k in rankings:
    top10ranking[k] = buildrank(k)

driver.quit()

# Convert to JSON and save it.
js = json.dumps(top10ranking)
fp = open('ranking.json', 'w')
Example #7
0
# Emit INFO and above through a stream handler.
logger.setLevel(logging.INFO)
fh = logging.StreamHandler()
fh.setLevel(logging.INFO)
# create formatter and add it to the handlers
formatter = logging.Formatter(
    "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
fh.setFormatter(formatter)
# add the handlers to the logger
logger.addHandler(fh)

# Two newline-separated codes (presumably EAN barcodes -- TODO confirm).
eans = "0600753339886\n5050466796820"

logger.info(eans)
logger.info(inputpath)

# Headless Firefox; the log message below is German for "Starting Firefox".
opts = Options()
opts.headless = True
logger.info("Starte FireFox")
browser = Firefox(options=opts)

try:
    logger.info("Öffne Startseite")

    # Startseite
    browser.get('https://www.bonavendi.de/verkaufen/sammeleingabe.html')
    time.sleep(10)
    try:
        browser.get_screenshot_as_file(inputpath + "/1.png")
    except WebDriverException:
        logger.warning("Bild 1 konnte nicht gespeichert werden.")
Example #8
0
 def __init__(self):
     """Prepare headless Firefox options and empty bookkeeping state."""
     self.opts = headless_opts = Options()
     # Headless keeps the Firefox window invisible.
     headless_opts.add_argument("--headless")
     # Every browser created later is stored here so it can be controlled
     # and reused.
     self.browsers = {}
     self.useragent = ""
     sessions_path = os.path.join("core", "sessions.json")
     self.sessions_file = sessions_path
Example #9
0
def get_driver():
    """Return a headless Firefox driver using a managed geckodriver binary."""
    headless_options = Options()
    headless_options.headless = True
    # GeckoDriverManager().install() supplies the executable path.
    gecko_path = GeckoDriverManager().install()
    return webdriver.Firefox(executable_path=gecko_path,
                             options=headless_options)
Example #10
0
    def setUpClass(cls):
        """Create the Firefox driver stored on the class as ``cls.driver``."""
        # Default options: headless mode is deliberately left switched off.
        firefox_options = Options()
        cls.driver = webdriver.Firefox(options=firefox_options)

        print("in setUpClass")
Example #11
0
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options
import time

WINDOW_SIZE = "1920,1080"

# Shared Firefox options: headless, with downloads saved without a prompt.
firefox_options = Options()

firefox_options.add_argument("--headless")
firefox_options.add_argument("--window-size=%s" % WINDOW_SIZE)
# folderList=2 selects the custom directory given in browser.download.dir.
# (This preference used to be set twice; the redundant call is removed.)
firefox_options.set_preference("browser.download.folderList", 2)
firefox_options.set_preference("browser.download.dir", r"C:\Users\tians4")
firefox_options.set_preference("browser.download.useDownloadDir", True)
firefox_options.set_preference("browser.download.manager.showWhenStarting",
                               False)
# Save CSV responses straight to disk instead of asking what to do.
firefox_options.set_preference("browser.helperApps.neverAsk.saveToDisk",
                               "text/csv")


def test_download_csv():
    """Open the Jira login page and click the username field."""
    gecko_path = r'C:\Users\tians4\geckodriver.exe'
    session = webdriver.Firefox(executable_path=gecko_path,
                                firefox_options=firefox_options)
    session.get("https://jira.cec.lab.emc.com:8443/login.jsp")
    # Implicit wait of up to 10 seconds for element lookups.
    session.implicitly_wait(10)
    username_field = session.find_element_by_id("login-form-username")
    username_field.click()
Example #12
0
def build_options():
    """Return a fresh ``Options`` object with headless mode enabled."""
    headless_options = Options()
    headless_options.headless = True
    return headless_options
Example #13
0
def start_web_driver():
    """Create and return a headless Firefox webdriver."""
    from selenium.webdriver.firefox.options import Options

    firefox_options = Options()
    firefox_options.headless = True
    return webdriver.Firefox(options=firefox_options)
Example #14
0
    def browser(self, userlist, index, channel, command_list):
        """Send chat commands for every name in *userlist* through Firefox.

        The names are read from *userlist*, stripped, sorted and split into
        chunks of ``self.chunk_size``. For each chunk a Firefox instance opens
        the popout chat of *channel* and sends every command in
        *command_list* followed by the name. Processed names are written to
        ``banned_part{index}.txt`` and then appended to
        ``banned_lists/{channel}.txt``.

        Args:
            userlist: Path of the text file holding one name per line.
            index: Number of this browser instance; used for status and
                counter bookkeeping and for the part-file name.
            channel: Channel whose popout chat is opened.
            command_list: Chat commands sent for each name.

        ``self.browser_status[index]`` is set to "Done" when the method
        ends, whether or not it succeeded. An empty *userlist* returns early
        without starting a browser.
        """
        def chunks(lst, n):
            """Yield successive n-sized chunks from lst."""
            for i in range(0, len(lst), n):
                yield lst[i:i + n]

        def dismiss_rules(wait_rules):
            """Click the chat-rules popup away if it appears in time."""
            try:
                rules_button = wait_rules.until(
                    presence_of_element_located(
                        (By.CSS_SELECTOR, ".jQtUJo")))
                if rules_button.is_displayed():
                    rules_button.click()
            except (NoSuchElementException, TimeoutException):
                # Best effort: the popup is not always shown.
                pass

        try:
            with open(userlist, "r") as _namelist:
                raw_names = _namelist.readlines()
            if not raw_names:
                # Nothing to process. This used to fall through to an unbound
                # variable error that the handler below does not catch.
                return
            self.browser_status[index] = "Starting"
            _namelist_stripped = sorted(map(str.strip, raw_names))

            chat_input = (By.CSS_SELECTOR, ".ScInputBase-sc-1wz0osy-0")
            for chunk in chunks(_namelist_stripped, self.chunk_size):
                # Browser start-up is serialised across threads. Release the
                # lock even when start-up fails, so the other threads are not
                # blocked forever.
                self.thread_lock.acquire()
                try:
                    profile = webdriver.FirefoxProfile(
                        self.config["Firefox_profile"])
                    profile.set_preference(
                        "security.insecure_field_warning.contextual.enabled",
                        False)
                    profile.set_preference(
                        "security.enterprise_roots.enabled", True)
                    options = Options()
                    if index != 0 and self.headless_mode:
                        options.add_argument('--headless')
                    driver = webdriver.Firefox(
                        options=options,
                        executable_path=
                        "FirefoxPortable/App/Firefox64/geckodriver.exe",
                        firefox_profile=profile,
                        firefox_binary=
                        "FirefoxPortable/App/Firefox64/firefox.exe")
                finally:
                    self.thread_lock.release()

                with driver:
                    driver.set_window_size(1000, 1000)
                    wait = WebDriverWait(driver, 120)
                    wait_rules = WebDriverWait(driver, 5)
                    driver.get(
                        "https://www.twitch.tv/popout/{channel}/chat".format(
                            channel=channel))
                    chat_field = wait.until(
                        presence_of_element_located(chat_input))
                    # Wait for the chat status line before typing; the
                    # element itself is not needed.
                    wait.until(
                        presence_of_element_located(
                            (By.CSS_SELECTOR, ".chat-line__status")))
                    time.sleep(1)
                    if chat_field.is_displayed():
                        chat_field.click()
                    dismiss_rules(wait_rules)
                    if chat_field.is_displayed():
                        chat_field.click()
                        chat_field = wait.until(
                            presence_of_element_located(chat_input))
                        chat_field.send_keys(
                            f"{self.greeting_emote} {index} {self.greeting_emote}",
                            Keys.ENTER)
                        self.browser_status[index] = "Ready"
                        # Hold until every browser instance reports ready.
                        while not self.all_browsers_ready:
                            time.sleep(0.1)
                        with open("banned_part{index}.txt".format(index=index),
                                  "w") as banned_names:
                            for _name in chunk:
                                try:
                                    for command in command_list:
                                        chat_field = wait.until(
                                            presence_of_element_located(
                                                chat_input))
                                        chat_field.send_keys(
                                            "{cmd} {name}".format(cmd=command,
                                                                  name=_name),
                                            Keys.ENTER)
                                    banned_names.write(f"{_name}\n")
                                    self.counter[index] += 1
                                except (ElementNotInteractableException,
                                        ElementClickInterceptedException):
                                    # The rules popup may have come back.
                                    dismiss_rules(wait_rules)
                # Append this chunk's names to the per-channel list under the
                # lock, since several threads share that file.
                with self.thread_lock:
                    with open(
                            "banned_lists/{streamer}.txt".format(
                                streamer=channel),
                            "a") as banlist, open(
                                "banned_part{index}.txt".format(index=index),
                                "r") as banned_names:
                        _names = banned_names.readlines()
                        banlist.writelines(_names)
        except LookupError:
            print("couldn't start instance {}".format(index))
        finally:
            self.browser_status[index] = "Done"
Example #15
0
# Module logger, set to DEBUG.
_LOGGER = logging.getLogger(__name__)
_LOGGER.setLevel(logging.DEBUG)
# NOTE(review): this goes through the root logger rather than _LOGGER, and
# looks like a leftover test message -- confirm whether it is still wanted.
logging.debug("test")
HTML_PARSER = 'html.parser'
ATTRIBUTION = 'Information provided by Aesop'
LOGIN_URL = 'https://sub.aesoponline.com/Substitute/Home'
LOGIN_TIMEOUT = 10
# Relative paths; they resolve against the current working directory.
COOKIE_PATH = './aesop_cookies.pickle'
CACHE_PATH = './aesop_cache'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
# Chrome command-line switches, including the user agent defined above.
CHROME_WEBDRIVER_ARGS = [
    '--headless', '--user-agent={}'.format(USER_AGENT), '--disable-extensions',
    '--disable-gpu', '--no-sandbox'
]
# NOTE(review): machine-specific absolute path.
CHROMEDRIVER_PATH = 'C:/Users/asaboo/Downloads/chromedriver_76/chromedriver'
# Shared Firefox options with the headless argument added.
FIREFOXOPTIONS = Options()
FIREFOXOPTIONS.add_argument("--headless")


class AESOPError(Exception):
    """Exception type for AESOP errors."""


def _save_cookies(requests_cookiejar, filename):
    """Pickle *requests_cookiejar* into the file at *filename*."""
    with open(filename, 'wb') as cookie_file:
        pickle.dump(requests_cookiejar, cookie_file)

Example #16
0
 def setup(self):
     """Open ``self.url`` in a headless Firefox and return the browser.

     The options object is also kept on ``self.opts``.
     """
     self.opts = headless_opts = Options()
     headless_opts.set_headless()
     firefox = Firefox(options=headless_opts)
     firefox.get(self.url)
     return firefox
Example #17
0
def main():
    """Load one page in headless Firefox under a chosen DNS setup.

    Command-line arguments select the website, the DNS type, the TRR
    resolver IP and URI, the dnscrypt-proxy model and a timeout in seconds.
    The system resolver configuration is replaced, Firefox is launched with
    the matching ``network.trr.*`` preferences plus the HAR export add-on,
    and the page is loaded. If the HAR file is reported ready before the
    timeout, its bytes are written to stdout. The browser is always quit.
    """
    # Parse the command line arguments
    models = [
        'hash', 'rr', 'random', 'cloudflare', 'google', 'quad9', 'nextdns'
    ]
    parser = argparse.ArgumentParser()
    parser.add_argument('website')
    parser.add_argument('dns_type',
                        choices=['dns', 'doh', 'dot', 'dnscrypt-proxy_doh'])
    parser.add_argument('trr_resolver_ip')
    parser.add_argument('trr_resolver_uri')
    parser.add_argument('model', choices=models)
    parser.add_argument('--timeout', type=int, default=45)
    args = parser.parse_args()

    dnscrypt_config_file = '/dnscrypt-proxy/dnscrypt-proxy/dnscrypt-proxy-{0}.toml'.format(
        args.model)

    # Enable devtools in Firefox
    options = Options()
    options.headless = True
    options.add_argument('-devtools')

    # Enable the netmonitor toolbox in devtools so we can save HARs
    profile = webdriver.FirefoxProfile()
    profile.set_preference('devtools.toolbox.selectedTool', 'netmonitor')

    # Set up DNS configuration
    subprocess.run(
        ["sudo", "cp", "/etc/resolv.conf", "/etc/resolv.upstream.conf"])
    subprocess.run(["sudo", "cp", "resolv.conf", "/etc/resolv.conf"])
    if args.dns_type == 'dnscrypt-proxy_doh':
        # shell=True runs /bin/sh, where the bash-only "&>" is not a
        # redirection. Use the portable form so the proxy's output cannot
        # end up on stdout next to the HAR. The config path is built from
        # args.model, which argparse restricts to the choices above.
        subprocess.run(
            "sudo /dnscrypt-proxy/dnscrypt-proxy/dnscrypt-proxy -config {0} > /dev/null 2>&1 &"
            .format(dnscrypt_config_file),
            shell=True)
        subprocess.run(["sudo", "sleep", "5s"])

    # Configure the DNS settings in Firefox
    if args.dns_type in ('dns', 'dot', 'dnscrypt-proxy_doh'):
        options.set_preference('network.trr.mode', 0)
    elif args.dns_type == 'doh':
        options.set_preference('network.trr.mode', 3)
        options.set_preference('network.trr.request-timeout', 1500)
        options.set_preference('network.trr.max-fails', 5)
        trr_resolver_ip = args.trr_resolver_ip
        trr_resolver_uri = args.trr_resolver_uri
        if trr_resolver_ip:
            options.set_preference('network.trr.bootstrapAddress',
                                   trr_resolver_ip)
        if trr_resolver_uri:
            options.set_preference('network.trr.uri', trr_resolver_uri)

    # Launch Firefox and install our extension for getting HARs
    driver = webdriver.Firefox(options=options,
                               firefox_profile=profile,
                               firefox_binary="/opt/firefox/firefox-bin")
    # Quit the browser even if the add-on install or page load raises.
    try:
        driver.install_addon(
            "/home/seluser/measure/harexporttrigger-0.6.2-fx.xpi")
        driver.set_page_load_timeout(args.timeout)

        # Make a page load
        started = datetime.now()
        driver.get(args.website)

        # Once the HAR is on disk in the container, write it to stdout so
        # the host machine can get it
        har_file = "/home/seluser/measure/har.json"

        def har_file_ready():
            return os.path.exists(har_file + ".ready")

        while (datetime.now() - started).total_seconds() < args.timeout \
                and not har_file_ready():
            time.sleep(1)

        if har_file_ready():
            with open(har_file, 'rb') as f:
                sys.stdout.buffer.write(f.read())
    finally:
        driver.quit()
Example #18
0
def _local_to_utc_str(local, date_str, time_str):
    """Convert a local "YYYY-MM-DD" date and "HH:MM" time to a UTC string."""
    naive = datetime.datetime.strptime(f"{date_str} {time_str}",
                                       "%Y-%m-%d %H:%M")
    # is_dst=None makes pytz raise on ambiguous or non-existent local times.
    local_dt = local.localize(naive, is_dst=None)
    return local_dt.astimezone(pytz.utc).strftime("%Y-%m-%d %H:%M")


def _fill_calendar(browser, base_url, local, study_calendar):
    """Scrape every study week under *base_url* into *study_calendar*."""
    browser.get(base_url)

    print(Fore.WHITE,
          Style.BRIGHT,
          "Number of university weeks:",
          Style.RESET_ALL,
          end="")

    # get all weeks
    number_study_weeks = len(
        browser.find_elements_by_css_selector(".table tr a"))

    print(number_study_weeks)

    for i in range(1, number_study_weeks + 1):  # iterates over weeks
        print(Fore.WHITE, Style.BRIGHT, f"\nGetting {i} week schedule...",
              Style.RESET_ALL)

        browser.get(f"{base_url}&week={i}")

        # get year of current week (last four characters of the link text)
        stud_weeks = browser.find_elements_by_css_selector(".table tr a")
        year = stud_weeks[i - 1].text[-4:]

        stud_days = browser.find_elements_by_class_name("sc-container")
        for stud_day in stud_days:
            day, month = stud_day.find_element_by_class_name(
                "sc-day-header").text[:5].split(".")

            start_date = f"{year}-{month}-{day}"  # YYYY-MM-DD

            items = stud_day.find_elements_by_css_selector(
                ".sc-table-detail > .sc-table-row")

            for item in items:
                event = Event()
                start_time, end_time = item.find_element_by_class_name(
                    "sc-item-time").text.split(" – ")

                event.name = item.find_element_by_class_name(
                    "sc-title").text  # get title of study item

                # convert local end time to utc (end is assigned before
                # begin, as in the original statement order)
                event.end = _local_to_utc_str(local, start_date, end_time)

                # convert local begin time to utc
                event.begin = _local_to_utc_str(local, start_date,
                                                start_time)

                type_lesson = item.find_element_by_class_name(
                    "sc-item-type").text

                location = item.find_element_by_class_name(
                    "sc-item-location").text  # get audience in MAI

                # handle case when lecturer field is empty
                try:
                    lecturer = item.find_element_by_class_name(
                        "sc-lecturer").text
                except NoSuchElementException:
                    lecturer = ''

                event.description = f"Type: {type_lesson}\nLocation: " \
                                    f"{location}\nLecturer: {lecturer}\n"

                study_calendar.events.add(event)

            print(Fore.WHITE, Style.BRIGHT, f'\t{start_date} -',
                  Fore.GREEN, '\u2713', Style.RESET_ALL)


def main(cmd_args):
    """Export the schedule of a study group to an ICS file.

    Args:
        cmd_args: Object with a ``group`` attribute (group id) and an
            ``out`` attribute (path of the ICS file to write).

    If the group does not exist, the ``NoSuchGroupID`` error is printed and
    nothing is written. The headless Firefox session is quit once scraping
    ends, whether it succeeded or raised.
    """
    group_id = cmd_args.group
    result_filename = cmd_args.out
    local = pytz.timezone("Europe/Moscow")
    study_calendar = Calendar()

    try:
        print(Fore.WHITE,
              Style.BRIGHT, "Checking for the existence of a "
              "group...",
              Style.RESET_ALL,
              end="")

        if not exist_group(group_id):
            raise NoSuchGroupID(group_id)

        print(Fore.GREEN, Style.BRIGHT, " Ok", Style.RESET_ALL)

        opts = Options()
        opts.headless = True
        browser = Firefox(options=opts)
        base_url = f"{MAI_SCHEDULE_DETAIL}?group={group_id}"

        # The browser was never closed before; quit it on every path.
        try:
            _fill_calendar(browser, base_url, local, study_calendar)
        finally:
            browser.quit()

        # save ics file
        with open(result_filename, "w") as ics_file:
            ics_file.writelines(study_calendar)

        print(Fore.GREEN, Style.BRIGHT,
              f"\n  Done! Created {result_filename}\n", Style.RESET_ALL)

    except NoSuchGroupID as e:
        print(Fore.RED, Style.BRIGHT, e, Style.RESET_ALL)
Example #19
0
def download_gisaid_EpiCoV(
        uname,  # username typed into the GISAID login form
        upass,  # password typed into the GISAID login form
        normal,  # truthy: show the browser window; falsy: run Firefox headless
        wd,  # output dir (created if missing; also Firefox's download dir)
        loc,  # location filter text
        host,  # host filter text
        cs,  # collection start date
        ce,  # collection end date
        ss,  # submission start date
        se,  # submission end date
        cg,  # complete genome only
        hc,  # high coverage only
        le,  # low coverage excluding
        to,  # timeout in sec (passed to WebDriverWait)
        rt,  # num of retry
        iv,  # interval in sec (sleep between retries)
        meta_dl  # also download meta
):
    """Download sequences and metadata from EpiCoV GISAID.

    Drives a Firefox session: logs in, opens the EpiCoV tab and then follows
    one of two paths.

    * No date/location filter given (``cs``, ``ce``, ``ss``, ``se`` and ``loc``
      all falsy): downloads the "nextfasta" and "nextmeta" files from the
      downloads section.
    * Otherwise: opens the "Browse" view, applies the filters, selects all
      genomes, downloads the sequences and the acknowledgement table and, when
      ``meta_dl`` is truthy, clicks through every row of every result page and
      writes the collected records to ``<wd>/gisaid_detail_metadata.json``.

    NOTE(review): ``host`` is only applied inside the filtered path, and that
    path is not entered when ``host`` is the only filter supplied.

    NOTE(review): every failure path calls ``sys.exit(1)`` without calling
    ``driver.quit()``, so the browser is only closed on the success path.
    """

    # output directory
    if not os.path.exists(wd):
        os.makedirs(wd, exist_ok=True)

    wd = os.path.abspath(wd)
    # GISAID_FASTA = f'{wd}/sequences.fasta.bz2'
    # GISAID_TABLE = f'{wd}/gisaid_cov2020_acknowledgement_table.xls'
    GISAID_DTL_JASON = f'{wd}/gisaid_detail_metadata.json'
    # GISAID_TSV   = f'{wd}/metadata.tsv.bz2'
    metadata = []  # one entry per genome row, filled only when meta_dl is set

    # MIME types that Firefox should save to disk without asking
    mime_types = "application/octet-stream"
    mime_types += ",application/excel,application/vnd.ms-excel"
    mime_types += ",application/pdf,application/x-pdf"
    mime_types += ",application/x-bzip2"
    mime_types += ",application/x-gzip,application/gzip"

    # start fresh: drop the metadata JSON of a previous run, if any
    try:
        os.remove(GISAID_DTL_JASON)
    except OSError:
        pass

    print("Opening browser...")
    # profile that downloads silently into wd
    profile = webdriver.FirefoxProfile()
    profile.set_preference("browser.download.folderList", 2)
    profile.set_preference("browser.download.manager.showWhenStarting", False)
    profile.set_preference("browser.download.dir", wd)
    profile.set_preference("browser.helperApps.neverAsk.saveToDisk",
                           mime_types)
    profile.set_preference("plugin.disable_full_page_plugin_for_types",
                           mime_types)
    profile.set_preference("pdfjs.disabled", True)

    options = Options()
    if not normal:
        options.add_argument("--headless")
    driver = webdriver.Firefox(firefox_profile=profile, options=options)

    # driver waits: 20 s implicit wait for element lookups, plus an explicit
    # wait object using the caller-supplied timeout
    driver.implicitly_wait(20)
    wait = WebDriverWait(driver, to)

    # open GISAID
    print("Opening website GISAID...")
    driver.get('https://platform.gisaid.org/epi3/frontend')
    waiting_sys_timer(wait)
    print(driver.title)
    # NOTE(review): assert is stripped under `python -O`; this sanity check
    # then disappears.
    assert 'GISAID' in driver.title

    # login
    print("Logining to GISAID...")
    username = driver.find_element_by_name('login')
    username.send_keys(uname)
    password = driver.find_element_by_name('password')
    password.send_keys(upass)
    # submit through the page's own doLogin() JavaScript function
    driver.execute_script("return doLogin();")

    waiting_sys_timer(wait)

    # navigate to EpiCoV (third entry of the main navigation bar)
    print("Navigating to EpiCoV...")
    epicov_tab = driver.find_element_by_xpath("//div[@id='main_nav']//li[3]/a")
    epicov_tab.click()

    waiting_sys_timer(wait)

    # when user doesn't enter time/location, download nextstrain sequences and metadata
    if not (cs or ce or ss or se or loc):
        # download from downloads section
        print("Clicking downloads...")
        pd_button = wait.until(
            EC.element_to_be_clickable(
                (By.XPATH, "//div[@class='sys-actionbar-bar']//div[3]")))
        pd_button.click()
        waiting_sys_timer(wait)

        # the download dialog lives in an iframe; switch into it
        iframe = waiting_for_iframe(wait, driver, rt, iv)
        driver.switch_to.frame(iframe)
        waiting_sys_timer(wait)

        print("Downloading Nextstrain sequences...")
        dl_button = wait.until(
            EC.element_to_be_clickable(
                (By.XPATH, '//div[contains(text(), "nextfasta")]')))
        dl_button.click()
        waiting_sys_timer(wait)

        # third argument is presumably a timeout in seconds — helper is
        # defined elsewhere, confirm there
        fn = wait_downloaded_filename(wait, driver, 3600)
        print(f"Downloaded to {fn}.                     ")

        waiting_sys_timer(wait)

        print("Downloading Nextstrain metadata...")
        dl_button = wait.until(
            EC.element_to_be_clickable(
                (By.XPATH, '//div[contains(text(), "nextmeta")]')))
        dl_button.click()

        fn = wait_downloaded_filename(wait, driver, 1800)
        print(f"Downloaded to {fn}.                     ")

        waiting_sys_timer(wait)

        # go back to main frame
        back_button = wait.until(
            EC.element_to_be_clickable(
                (By.XPATH, '//button[contains(text(), "Back")]')))
        back_button.click()

        driver.switch_to.default_content()
        waiting_sys_timer(wait)

    # have to reduce the range of genomes
    # (exact complement of the condition above; `host` is not part of it)
    if cs or ce or ss or se or loc:
        print("Browsing EpiCoV...")
        browse_tab = wait.until(
            EC.element_to_be_clickable(
                (By.XPATH, '//*[contains(text(), "Browse")]')))
        browse_tab.click()
        waiting_sys_timer(wait)
        waiting_table_to_get_ready(wait)

        # set location
        if loc:
            print("Setting location...")
            loc_input = driver.find_element_by_xpath(
                "//td/div[contains(text(), 'Location')]/../following-sibling::td/div/div/input"
            )
            loc_input.send_keys(loc)
            waiting_sys_timer(wait, 7)

        # set host
        if host:
            print("Setting host...")
            host_input = driver.find_element_by_xpath(
                "//td/div[contains(text(), 'Host')]/../following-sibling::td/div/div/input"
            )
            host_input.send_keys(host)
            waiting_sys_timer(wait, 7)

        # set dates: the date inputs are paired positionally with
        # (cs, ce, ss, se), so this relies on the page listing them in that
        # order — verify against the live page
        date_inputs = driver.find_elements_by_css_selector(
            "div.sys-form-fi-date input")
        dates = (cs, ce, ss, se)
        for dinput, date in zip(date_inputs, dates):
            if date:
                print("Setting date...")
                dinput.send_keys(date)

        # send ESC to the page after typing the dates (presumably dismisses
        # an open date picker — confirm)
        ActionChains(driver).send_keys(Keys.ESCAPE).perform()
        waiting_sys_timer(wait, 7)

        # complete genome only
        if cg:
            print("complete genome only...")
            checkbox = driver.find_element_by_xpath(
                '//input[@value="complete"]')
            checkbox.click()
            waiting_sys_timer(wait)

        # high coverage only
        if hc:
            print("high coverage only...")
            checkbox = driver.find_element_by_xpath('//input[@value="highq"]')
            checkbox.click()
            waiting_sys_timer(wait)

        # excluding low coverage
        if le:
            print("low coverage excluding...")
            checkbox = driver.find_element_by_xpath('//input[@value="lowco"]')
            checkbox.click()
            waiting_sys_timer(wait)

        # check if any genomes pass filters; a failed lookup (no warning on
        # the page) is the normal case and is ignored by the bare except
        warning_message = None
        try:
            warning_message = driver.find_element_by_xpath(
                "//div[contains(text(), 'No data found.')]")
        except:
            pass
        if warning_message:
            print("No data found.")
            sys.exit(1)

        # select all genomes
        print("Selecting all genomes...")
        button_sa = driver.find_element_by_css_selector(
            "span.yui-dt-label input")
        button_sa.click()
        waiting_sys_timer(wait)

        # downloading sequence
        # retry counts from 0, so up to rt + 1 attempts are made
        retry = 0
        while retry <= rt:
            try:
                print("Downloading sequences for selected genomes...")
                button = driver.find_element_by_xpath(
                    "//td[@class='sys-datatable-info']/button[contains(text(), 'Download')]"
                )
                button.click()
                waiting_sys_timer(wait)

                # switch to iframe
                iframe = waiting_for_iframe(wait, driver, rt, iv)
                driver.switch_to.frame(iframe)
                waiting_sys_timer(wait)

                # confirm the download inside the dialog
                button = driver.find_element_by_xpath(
                    "//button[contains(text(), 'Download')]")
                button.click()
                waiting_sys_timer(wait)
                driver.switch_to.default_content()

                fn = wait_downloaded_filename(wait, driver, 1800)
                print(f"Downloaded to {fn}.")

                break
            except:
                # any failure (bare except) triggers a retry; the last
                # failed attempt terminates the program
                print(f"retrying...#{retry} in {iv} sec(s)")
                if retry == rt:
                    print("Unexpected error:", sys.exc_info())
                    sys.exit(1)
                else:
                    time.sleep(iv)
                    retry += 1

        # downloading the acknowledgement table (same dialog as above, with
        # the "Acknowledgement Table" option selected first)
        retry = 0
        while retry <= rt:
            try:
                print(
                    "Downloading acknowledgement table for selected genomes..."
                )
                button = driver.find_element_by_xpath(
                    "//td[@class='sys-datatable-info']/button[contains(text(), 'Download')]"
                )
                button.click()
                waiting_sys_timer(wait)

                # switch to iframe
                iframe = waiting_for_iframe(wait, driver, rt, iv)
                driver.switch_to.frame(iframe)
                waiting_sys_timer(wait)

                label = driver.find_element_by_xpath(
                    "//label[contains(text(), 'Acknowledgement Table')]")
                label.click()

                button = driver.find_element_by_xpath(
                    "//button[contains(text(), 'Download')]")
                button.click()

                waiting_sys_timer(wait)
                driver.switch_to.default_content()

                fn = wait_downloaded_filename(wait, driver, 180)
                print(f"Downloaded to {fn}.")

                break
            except:
                print(f"retrying...#{retry} in {iv} sec(s)")
                if retry == rt:
                    print("Unexpected error:", sys.exc_info())
                    sys.exit(1)
                else:
                    time.sleep(iv)
                    retry += 1

        # iterate over each page of the result table
        if meta_dl:
            page_num = 1
            print("Retrieving metadata...")
            while True:
                print(f"Starting processing page# {page_num}...")
                # retrieve tables
                tbody = wait.until(
                    EC.presence_of_element_located(
                        (By.XPATH, "//tbody[@class='yui-dt-data']")))

                waiting_table_to_get_ready(wait)

                # iterate over each row
                for tr in tbody.find_elements_by_tag_name("tr"):
                    td = tr.find_element_by_tag_name("td")
                    driver.execute_script("arguments[0].scrollIntoView();", td)

                    # have to click the first row twice to start the iframe
                    # NOTE(review): retry starts at 1 here, so at most rt
                    # attempts are made; with rt == 0 the loop body never
                    # runs and iframe stays None.
                    iframe = None
                    record_elem = None
                    retry = 1
                    while retry <= rt:
                        try:
                            td.click()
                            waiting_sys_timer(wait)
                            iframe = driver.find_element_by_xpath("//iframe")
                            if iframe:
                                break
                            else:
                                # bare raise with no active exception raises
                                # RuntimeError, which the bare except below
                                # catches to force a retry
                                raise
                        except:
                            print(f"retrying...#{retry} in {iv} sec(s)")
                            if retry == rt:
                                print("Failed")
                                sys.exit(1)
                            else:
                                time.sleep(iv)
                                retry += 1

                    driver.switch_to.frame(iframe)

                    # detect error: "An internal server error occurred."
                    # and "error-token: DYX47"
                    # NOTE(review): find_element_* raises rather than
                    # returning None when nothing matches, so this lookup
                    # appears to require a <b> element in every record
                    # dialog — confirm.
                    error_token = driver.find_element_by_xpath("//b")
                    if error_token:
                        error_token_text = error_token.text
                        if "error-token" in error_token.text:
                            print(
                                "[FATAL ERROR] A website internal server error occurred."
                            )
                            print(error_token_text)
                            sys.exit(1)

                    # get the element of table with metadata
                    record_elem = wait.until(
                        EC.presence_of_element_located(
                            (By.XPATH, "//div[@class='packer']")))

                    # parse metadata (getMetadata is defined elsewhere; the
                    # result is indexed below by 'Accession ID' and
                    # 'Virus name')
                    m = getMetadata(record_elem)
                    metadata.append(m)
                    print(f"{m['Accession ID']}\t{m['Virus name']}")

                    # get back: ESC, then return to the top-level document
                    ActionChains(driver).send_keys(Keys.ESCAPE).perform()
                    time.sleep(1)
                    driver.switch_to.default_content()

                print(f"Compeleted page# {page_num}.")
                page_num += 1

                # go to the next page; a missing link for page_num means the
                # last page was just processed, which ends the loop
                retry = 1
                button_next_page = None
                try:
                    button_next_page = driver.find_element_by_xpath(
                        f'//a[@page="{page_num}"]')
                except:
                    break

                if button_next_page:
                    print(f"Entering page# {page_num}...")
                    while retry <= rt:
                        try:
                            button_next_page.click()
                            time.sleep(10)
                            # verify the paginator now shows the page we
                            # asked for
                            current_page = driver.find_element_by_xpath(
                                '//span[@class="yui-pg-current-page yui-pg-page"]'
                            ).text
                            if current_page != str(page_num):
                                # bare raise -> RuntimeError -> handled as a
                                # retry by the except below
                                raise
                            else:
                                break
                        except:
                            print(f"retrying...#{retry} in {iv} sec(s)")
                            if retry == rt:
                                print("Failed")
                                sys.exit(1)
                            else:
                                time.sleep(iv)
                                retry += 1

            # writing metadata to JSON file
            print("Writing detail metadata...")
            with open(GISAID_DTL_JASON, 'w') as outfile:
                json.dump(metadata, outfile)

    # close driver
    driver.quit()
Example #20
0
def webdriver_init():
    """Create a Firefox WebDriver whose headless mode follows ``hide_firefox``.

    ``hide_firefox`` is a module-level flag defined elsewhere in the file.
    """
    browser_settings = Options()
    browser_settings.headless = hide_firefox
    return webdriver.Firefox(options=browser_settings)
Example #21
0
            city_list = region_dict[regione_name]
    else:
        print('specifica: Parametro Regione giorno mese anno giorno mese anno')
        sys.exit()
        regione_name = 'Lombardia'
        parametro = 'Precipitazioni'
        gi = '1'
        mi = '1'
        ai = '2010'
        gf = '1'
        mf = '1'
        af = '2011'
        city_list = region_dict['Lombardia']

    url = "http://clima.meteoam.it/RichiestaDatiGenerica.php"
    p = Path(os.path.realpath(__file__))
    parent = p.parent.parent.parent
    driver_path = os.path.join(parent, "geckodriver")
    optionsFire = Options()
    optionsFire.add_argument('--headless')
    html_list = []

    for c in city_list:
        print('controllo ' + c)
        aeronatutica(parametro, c, gi, mi, ai, gf, mf, af, html_list)

    if (len(html_list) != 0):
        filename = regione_name + ai + af + '.csv'
        finalParsing(html_list, filename)
        print('ci sono risultati')
# Imports always needed by Selenium programs written in Python
import time
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

# Configure the driver
opciones = Options()
# Run without physically showing the browser window. (Original author's
# note: with this option the sleeps below serve little purpose.)
opciones.headless = True
navegador = webdriver.Firefox(
    executable_path="./drivers/geckodriver", options=opciones
)  # Use geckodriver to open a Firefox browser.

# Make sure the browser is closed even if one of the steps below raises.
try:
    navegador.set_window_position(0, 0)
    navegador.set_window_size(800, 500)

    # Open the browser on a URL
    navegador.get("http://google.es")

    # Locate the elements and act on them.
    # send_keys types text; the selected element is Google's search box.
    navegador.find_element_by_xpath("//input[@type='text']").send_keys(
        "Sevilla"
    )
    time.sleep(2)
    navegador.find_element_by_xpath("//input[@name='btnK']").click()
    time.sleep(3)
    # The attribute test needs "=": "@id'result-stats'" is not valid XPath.
    estadisticas = navegador.find_element_by_xpath(
        "//div[@id='result-stats']").text
    print(estadisticas)
finally:
    # Close the browser
    navegador.quit()
def options():
    """Build and hand back a fresh, default-configured ``Options`` object."""
    default_options = Options()
    return default_options
Example #24
0
def publicstorage_scrap(zip_code_lst, unique_url_lst):
    """Scrape Public Storage unit prices for each zip code into the database.

    For every zip code a headless Firefox loads the search page, collects the
    property links that are not yet in ``unique_url_lst``, visits each of them
    and inserts every (address, price, size, link) combination that is not
    already present in ``tbl_publicstorage``.

    Args:
        zip_code_lst: Iterable of zip codes to search for.
        unique_url_lst: List of property URLs that were already handled.
            Newly discovered URLs are appended to it in place.

    The scrape is best-effort: a failing zip code, property page or unit row
    is logged and skipped instead of aborting the run. Relies on the
    module-level ``mycursor`` / ``mydb`` database handles.
    """
    # Function-scope import: the file's import block is not visible here.
    import logging

    logger = logging.getLogger(__name__)
    base_url = "https://www.publicstorage.com"

    for zip_code in zip_code_lst:
        driver = None
        try:
            search_url = (base_url + "/self-storage-search?location=" +
                          str(zip_code))
            # Random pause between searches to throttle requests.
            time.sleep(random.randint(25, 30))

            options = Options()
            options.headless = True
            root_dir = os.path.dirname(os.path.abspath(__file__))
            driver = webdriver.Firefox(options=options,
                                       executable_path=root_dir +
                                       '/geckodriver')

            driver.get(search_url)
            time.sleep(3)
            content = soup(driver.page_source, 'html.parser')

            # Keep only the property links that have not been seen before.
            new_links = []
            for anchor in content.findAll(
                    "a", {"class": "ps-property-v2__view-plp"}):
                try:
                    link = base_url + str(anchor['href'])
                except KeyError:
                    continue  # anchor without an href
                if link in unique_url_lst:
                    continue
                unique_url_lst.append(link)
                new_links.append(link)

            for storage_link in new_links:
                try:
                    _store_property_units(driver, storage_link, logger)
                except Exception as exc:
                    logger.warning("Skipping property %s: %s", storage_link,
                                   exc)
        except Exception:
            logger.exception("Scraping zip code %s failed", zip_code)
        finally:
            # Only quit the driver created for this zip code, never a stale
            # one left over from a previous iteration.
            if driver is not None:
                try:
                    driver.quit()
                except Exception as exc:
                    logger.warning("Could not close the browser: %s", exc)


def _store_property_units(driver, storage_link, logger):
    """Load one property page and insert the unit prices not yet stored."""
    time.sleep(random.randint(10, 15))
    driver.get(storage_link)
    time.sleep(3)

    content = soup(driver.page_source, 'html.parser')

    # The address is embedded in the page's JSON; collapse repeated blanks.
    raw_address = re.findall(r'"FormattedAddress":"(.*?)","',
                             content.text)[0]
    address = " ".join(part.strip() for part in raw_address.split(" ")
                       if part.strip())
    addr_zip_code = address.split(" ")[-1]

    unit_rows = content.findAll(
        "div", {"class": "row ps-properties-propertyV2__units__summary"})
    for unit in unit_rows:
        try:
            size_type = unit.find(
                "h4", {
                    "class": "ps-properties-propertyV2__units__header"
                }).text.strip()
            price_tag = unit.find(
                "span", {
                    "class":
                    "ps-properties-propertyV2__units__prices__wrapper"
                })
            price = price_tag.text.strip().split('/')[0]

            print(size_type + ",  " + price + ",  " + address + ", " +
                  addr_zip_code + ", " + storage_link)

            sql = "SELECT * FROM tbl_publicstorage WHERE address = %s  AND price = %s AND size = %s AND link = %s"
            mycursor.execute(sql,
                             (address, price, size_type, storage_link))
            # Insert only when no identical row exists yet.
            if not mycursor.fetchall():
                sql = "INSERT INTO tbl_publicstorage (address, price,size,zipcode,link) VALUES (%s, %s,%s, %s,%s)"
                val = (address, price, size_type, addr_zip_code,
                       storage_link)
                mycursor.execute(sql, val)
                mydb.commit()
        except Exception as exc:
            logger.warning("Skipping a unit row on %s: %s", storage_link,
                           exc)
Example #25
0
def query_storage_sync():
    """Export pomodoro counts read from Firefox's ``browser.storage.sync``.

    Reads ``profile_path``, ``url`` and ``output_path`` from the
    ``[pomodoros]`` section of ``config.ini``, opens ``url`` in a headless
    Firefox using that profile and fetches the ``timeline`` entry of the
    extension's synced storage. Entries of type ``'tomato'`` are counted per
    day and written to ``output_path``; the running per-minute count goes to
    ``all_pomodoros.bsv``.

    The files are only rewritten when ``output_path`` does not exist yet or
    when every date already recorded there is still present in the new data;
    otherwise a notice is printed and nothing is written.

    Raises:
        ValueError: if the storage query returns nothing.
    """
    config = configparser.ConfigParser()
    config.read('config.ini')

    options = Options()
    options.headless = True

    profile = webdriver.FirefoxProfile(config['pomodoros']['profile_path'])
    driver = webdriver.Firefox(firefox_profile=profile, options=options)

    # The browser is only needed for the storage query, so close it exactly
    # once, whether or not the query succeeds, before any file processing.
    try:
        driver.implicitly_wait(100)

        url = config['pomodoros']['url']
        output_path = config['pomodoros']['output_path']

        driver.get(url)

        query = """
            const getStorageData = key =>
              new Promise((resolve, reject) =>
                browser.storage.sync.get(key, result =>
                  browser.runtime.lastError
                    ? reject(Error(browser.runtime.lastError.message))
                    : resolve(result)
                )
              )

            const timeline = getStorageData('timeline')
            return timeline
        """

        output = driver.execute_script(query.strip())
    finally:
        driver.quit()

    if not output:
        raise ValueError("results are empty!")

    tomatoes = (entry for entry in output['timeline']
                if entry['type'] == 'tomato')
    days = defaultdict(int)  # date -> number of pomodoros on that day
    minutes = defaultdict(int)  # minute -> day's running count at that time

    for entry in tomatoes:
        date_obj = datetime.strptime(
            entry['date'],
            "%a %b %d %Y %H:%M:%S %Z%z (Eastern Daylight Time)")
        minute = date_obj.strftime("%Y-%m-%d %H:%M")
        day = date_obj.strftime("%Y-%m-%d")
        days[day] += 1
        minutes[minute] = days[day]

    # Previously exported rows, or None when there is no export yet.
    processed = None
    if os.path.exists(output_path):
        with open(output_path, "r") as r:
            r.readline()  # skip header
            processed = dict(line.strip().split(',') for line in r)

    # Refuse to overwrite an export that holds dates missing from the new
    # data.
    if processed is None or len(processed.keys()
                                & days.keys()) >= len(processed):
        with open(output_path, "w") as w:
            w.write("date,value\n")

            for d, v in sorted(days.items()):
                w.write(f"{d},{v}\n")

        with open("all_pomodoros.bsv", "w") as w:
            w.write("time,value\n")

            for d, v in sorted(minutes.items()):
                w.write(f"{d},{v}\n")

    else:
        print("something's up")

    return
Example #26
0
def index():
    """Render the search form, or the scraped results for a POSTed hashtag.

    A POST selects the network through the ``choices-single-default`` form
    field ('Twitter', 'Facebook' or 'Instagram') and supplies the hashtag in
    the ``query`` field. Any other request renders ``index.html``.
    """
    if request.method != 'POST':
        return render_template('index.html')

    choice = request.form.get('choices-single-default')
    if choice == 'Twitter':
        return _twitter_results(request.form['query'])
    if choice == 'Facebook':
        return _facebook_results(request.form['query'])
    if choice == 'Instagram':
        return _instagram_results(request.form['query'])
    return render_template('index.html')


def _headless_firefox():
    """Start the headless Firefox used by every scraping branch."""
    options = Options()
    options.headless = True
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    return webdriver.Firefox(options=options)


def _twitter_results(query):
    """Collect embed codes for tweets of hashtag `query` and render them."""
    page = requests.get('https://mobile.twitter.com/hashtag/' + query)
    soup = BeautifulSoup(page.text, 'html.parser')

    # Percent-encoded status links, de-duplicated.
    encoded_links = set()
    for tag in soup.find_all('a'):
        try:
            href = tag['href']
        except KeyError:
            continue  # anchor without an href
        link = 'https://twitter.com' + href[:-4]
        if 'status' in link:
            encoded_links.add(link.replace(':', '%3A').replace('/', '%2F'))

    # Keep links that end in a digit and carry no query-string characters.
    status_links = [
        link for link in encoded_links
        if re.search(".*[0-9]$", link) and "=" not in link
        and "?" not in link and "_" not in link
    ]

    embeds = []
    browser = _headless_firefox()
    try:
        for link in status_links:
            try:
                browser.get('https://publish.twitter.com/?query=' + link +
                            '&widget=Tweet')
                code = browser.find_element_by_xpath(
                    '//code[@class="EmbedCode-code"]')
                embeds.append(code.text)
            except Exception:
                # Best effort: skip tweets whose embed code cannot be read.
                continue
    finally:
        browser.quit()

    # The query and the scraped embed codes are untrusted input. They are
    # handed to the template as variables instead of being concatenated into
    # the template source, so they can neither inject Jinja syntax nor (for
    # the query) unescaped HTML.
    with open("templates/twitter.html", "w", encoding='utf-8') as f:
        f.write(
            '<!DOCTYPE html><html lang="en"><head><meta charset="UTF-8"><style>body{display: table;margin: auto;}</style><link rel="icon" href="https://cdn0.iconfinder.com/data/icons/social-flat-rounded-rects/512/twitter-512.png" type="image/icon type"><meta name="viewport" content="width=device-width, initial-scale=1.0"><title>Twitter</title></head><body>'
        )
        f.write(
            '<img src="{{ url_for("static", filename="images/twitter-logo.png")}}" alt="logo" height="60px" width="200px"><br>'
        )
        f.write("<h1>Showing results for {{ query }}</h1>")
        f.write("{% for embed in embeds %}{{ embed|safe }}<br>{% endfor %}")
        f.write("</body></html>")

    return render_template('twitter.html', query=query, embeds=embeds)


def _facebook_results(query):
    """Log in to Facebook, search posts for `query` and render their links."""
    browser = _headless_firefox()
    try:
        browser.get("http://www.facebook.com")

        username = browser.find_element_by_id("email")
        password = browser.find_element_by_id("pass")
        submit = browser.find_element_by_id("loginbutton")
        # SECURITY(review): credentials are hard-coded in source; they should
        # come from configuration or the environment.
        username.send_keys("*****@*****.**")
        password.send_keys("facebook@123")
        submit.click()

        browser.get("https://www.facebook.com/search/posts/?q=%23" + query +
                    "&epa=SERP_TAB")

        hrefs = [
            str(anchor.get_attribute("href"))
            for anchor in browser.find_elements_by_xpath('//a')
        ]
    finally:
        browser.quit()

    post_links = {
        href
        for href in hrefs
        if "posts" in href and "#" not in href and "=" not in href
    }
    # The template receives the query first, followed by the unique links.
    data = [query] + list(post_links)
    return render_template('facebook.html', data=data)


def _instagram_results(query):
    """Collect image URLs from the Instagram tag page for `query`."""
    browser = _headless_firefox()
    try:
        browser.get("https://www.instagram.com/explore/tags/" + query +
                    "/top/")
        # The template receives the query first, followed by the image URLs.
        data = [query]
        for image in browser.find_elements_by_xpath('//img[@class="FFVAD"]'):
            data.append(image.get_attribute("src"))
    finally:
        browser.quit()

    return render_template('insta.html', data=data)
Example #27
0
def scrape_dea(zips):
    '''
        Purpose: Function that will scrape the DEA Diversion Control Division
                 website in order to obtain an up-to-date listing of all
                 controlled substance public disposal locations. This will be
                 accomplished by entering a zip code and specifying a search
                 radius of 50 miles which will then return an html table of
                 all dropbox addresses within that search radius.

        Input:   zips (list): list of zip codes

        Output:  dropboxList (list): one list of cell texts per table row,
                 accumulated over all zip codes (header rows are skipped)

        Notes:   A zip code whose search fails is logged and skipped. The
                 browser is always closed before returning.
    '''
    # Function-scope import: the file's import block is not visible here.
    import logging

    logger = logging.getLogger(__name__)
    search_url = 'https://apps.deadiversion.usdoj.gov/pubdispsearch'

    # Open a Headless Firefox web browser
    options = Options()
    options.set_headless(headless=True)
    browser = webdriver.Firefox(firefox_options=options)

    # final storage container for dropbox locations
    dropboxList = []

    try:
        # Direct the browser to the DEA's dropbox search page
        browser.get(search_url)
        browser.implicitly_wait(100)

        # For every zip code, run the dropbox location search on the site
        for count, code in enumerate(zips, start=1):
            if count % 100 == 0:
                print(count)  # progress indicator

            try:
                # Input the zip code into the page
                zipElem = browser.find_element_by_id(
                    'searchForm:zipCodeInput')
                zipElem.clear()  # clear the box in case of previous data
                zipElem.send_keys(code)

                # Specify the maximum radius of 50 miles
                desired_button = browser.find_element_by_xpath(
                    '/html/body/div[1]/div[2]/div/div/div[2]/form/div[10]/table/tbody/tr/td[7]/div/div[2]/span'
                )
                desired_button.click()

                # Click the submit button
                search_button = browser.find_element_by_id(
                    'searchForm:submitSearchButton')
                search_button.click()

                # Use beautifulSoup to extract the dropbox data from the
                # generated page
                soup = BeautifulSoup(browser.page_source, 'lxml')
                dropboxTable = soup.findAll('table', role='grid')[0]

                # Every row after the header becomes a list of cell texts.
                for tr in dropboxTable.findAll('tr')[1:]:
                    dropboxList.append([td.text for td in tr.findAll('td')])

                # Move back to the search page and start over
                browser.back()
            except Exception as exc:
                logger.warning("Search for zip code %s failed: %s", code,
                               exc)
                # The failed step may have left the browser on another page;
                # reload the search form so later zip codes still work.
                try:
                    browser.get(search_url)
                except Exception as reload_exc:
                    logger.warning("Could not reload the search page: %s",
                                   reload_exc)
    finally:
        browser.quit()

    return dropboxList
Example #28
0
def update(subscribe_list):
    """Collect article listings for every account name in *subscribe_list*.

    Starts an Xvfb virtual display and a Firefox session, restores the
    cookies stored in ``cookies.json``, opens the mp.weixin.qq.com article
    editor, clicks its ``#js_text_editor_tool_link`` tool and searches that
    dialog for each requested account.

    Args:
        subscribe_list: Iterable of account names to search for.

    Returns:
        A dict mapping each account name to a list of dicts with the keys
        ``"title"``, ``"link"``, ``"author"`` and ``"date"``.  An account
        that is not matched among the first five search results gets a
        single placeholder entry titled ``"no gzh found"``.  If a
        ``ValueError`` is raised while scraping (e.g. the
        ``'cookies error!'`` check), the exception message string is
        returned instead of the dict.

    Raises:
        Any other exception from Selenium, file access or JSON decoding is
        propagated; the browser and the virtual display are still shut
        down first.
    """
    vdis = Xvfb()
    vdis.start()
    driver = None
    try:
        # Start every run with a fresh geckodriver log; a missing or
        # unremovable file is not a reason to abort.
        try:
            os.remove('geckodriver.log')
        except OSError:
            pass
        options = Options()
        options.log.level = "trace"
        driver = webdriver.Firefox(options=options)

        # Cookies can only be added for the domain that is currently loaded,
        # so visit the site once before restoring them.
        with open('cookies.json', 'r') as f:
            cookies = json.load(f)
        driver.get('https://mp.weixin.qq.com/')
        for cookie in cookies:
            driver.add_cookie(cookie)
        time.sleep(delay / 3)

        update_pool = {}
        try:
            # open editor page
            driver.get('https://mp.weixin.qq.com/')
            get_by_css(driver, '#footer.mp-foot')
            real_url = driver.current_url
            # Still sitting on the bare root URL: presumably the restored
            # cookies were not accepted and no logged-in redirect happened.
            if real_url.split('qq.com')[1] == '/':
                raise ValueError('cookies error!')
            # parse_qs expects a query string, not a whole URL, so isolate
            # the query part first (also drops any '#fragment').
            query = urllib.parse.urlsplit(real_url).query
            token = urllib.parse.parse_qs(query)['token'][0]
            editor_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?t=media/appmsg_edit_v2&action=edit&isNew=1&type=10&createType=10&token=' + token + '&lang=zh_CN'
            driver.get(editor_url)
            get_by_css(driver, '#js_text_editor_tool_link').click()

            # search for articles
            for entry in subscribe_list:
                update_pool[entry] = _scrape_account(driver, entry)
        except ValueError as msg:
            # Callers receive the error text in place of the result dict.
            update_pool = str(msg)
    finally:
        # Always release the browser and the virtual display, even when an
        # unexpected exception is propagating.
        try:
            if driver is not None:
                driver.quit()
        finally:
            vdis.stop()

    return update_pool


def _scrape_account(driver, entry):
    """Search the open link dialog for *entry* and return its article dicts."""
    # The code treats a 0 returned by get_by_css as "element not available".
    othergzh_button = get_by_css(
        driver, '.weui-desktop-btn.weui-desktop-btn_default')
    if othergzh_button != 0:
        othergzh_button.click()
    input_box = get_by_css(driver,
                           '.weui-desktop-form__input_append-in > input')
    input_box.send_keys(entry)
    input_box.send_keys(Keys.ENTER)

    # Look through at most the first five results for an exact name match.
    found = False
    for position in range(1, 6):
        gzh_entry = get_by_css(
            driver,
            'ul.inner_link_account_list > li:nth-child({})'.format(position))
        if gzh_entry == 0:
            break
        name_element = get_by_css(
            driver,
            'ul.inner_link_account_list > li:nth-child({}) strong'.format(
                position))
        if name_element.text == entry:
            found = True
            break
    if not found:
        return [{
            "title": "no gzh found",
            "link": "http://example.com",
            "author": entry,
            "date": "1970-01-01"
        }]

    gzh_entry.click()
    articles = []
    for article_entry in get_by_css(driver, '.inner_link_article_item', 1):
        link_element = get_by_css(article_entry, 'span:nth-child(3) > a')
        title_element = get_by_css(
            article_entry, 'div.inner_link_article_title > span:nth-child(2)')
        date_element = get_by_css(article_entry,
                                  'div.inner_link_article_date')
        articles.append({
            "title": title_element.get_attribute('innerHTML'),
            "link": link_element.get_attribute('href'),
            "author": entry,
            "date": date_element.get_attribute('innerHTML')
        })
    return articles
Example #29
0
def browser():
    """Yield a Firefox WebDriver with its cookies cleared, then shut it down.

    Written as a generator so it can be used as a yield-style fixture
    (presumably registered with pytest; the decorator is not visible here):
    everything before ``yield`` is setup, everything after it is teardown.

    Yields:
        The started ``webdriver.Firefox`` instance.
    """
    options = Options()
    # options.headless = True
    firefox = webdriver.Firefox(options=options)
    firefox.delete_all_cookies()
    yield firefox
    # Teardown: end the session so each use does not leave a Firefox /
    # geckodriver process behind.
    firefox.quit()
Example #30
0
 def setUpClass(cls):
     """Start the Firefox WebDriver stored on the class as ``cls.selenium``.

     The browser runs headless whenever the ``CI`` environment variable is
     set to a non-empty value; otherwise a visible window is used.
     """
     super().setUpClass()
     run_headless = bool(os.environ.get("CI"))
     firefox_options = Options()
     firefox_options.headless = run_headless
     cls.selenium = webdriver.Firefox(options=firefox_options)