class CaptchaPage():
    """Drives a local Firefox instance pointed at a reCAPTCHA test page.

    NOTE(review): Python 2 code (print statements, ``ConfigParser``,
    ``os.getcwdu``, list-returning ``filter``) -- will not run under Python 3.
    """

    def __init__(self):
        print "Captcha Page Initializing"

        # Locate the first Firefox profile directory via profiles.ini.
        parser = ConfigParser.ConfigParser()
        base_path = os.path.join(os.environ['HOME'], '.mozilla/firefox/')
        parser.read(os.path.join(base_path, "profiles.ini"))
        # 'Path' key of section [Profile0]; relies on py2 filter() returning a list.
        profile_path = os.path.join(base_path, filter(lambda x: x[0].lower() == 'path', parser.items('Profile0'))[0][1])
        try:
            profile = FirefoxProfile(profile_path)
        except OSError:
            # The profile directory is usually not readable by other users.
            raise Exception("You must execute the following command:\nsudo chmod +r -R %s" % profile_path)
        self.driver = Firefox(profile)

        # Load the captcha test page shipped next to this script.
        self.driver.get("file://%s/index.html" % os.getcwdu())

    def get_url_sound(self):
        """Switch the captcha to audio mode and return the audio download URL."""
        self.driver.find_element_by_xpath('//*[@id="recaptcha_switch_audio"]').click()
        return self.driver.find_element_by_xpath('//*[@id="recaptcha_audio_download"]').get_attribute('href')

    def get_recaptcha_challenge_field(self):
        """Return the hidden challenge token of the current captcha."""
        return self.driver.find_element_by_xpath('//*[@id="recaptcha_challenge_field"]').get_attribute('value')

    def get_captcha_textbox(self):
        """Return the captcha answer input wrapped in a Textbox helper."""
        print "Getting Captcha Textbox"
        return Textbox(self.driver.find_element_by_xpath('//*[@id="recaptcha_response_field"]'))

    def get_submit_button(self):
        """Return the form submit button wrapped in a Button helper."""
        print "Getting Submit Form Button"
        return Button(self.driver.find_element_by_xpath("/html/body/form/input"))

    def close(self):
        """Close the browser window."""
        print "Closing Captcha Page"
        self.driver.close()
def main(argv=sys.argv[1:]):
    """Open the page given by ``--url`` and print the text of every <li>."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        '--url', default='http://127.0.0.1:8000/static/index.html')
    options = arg_parser.parse_args(argv)

    driver = WebDriver()
    driver.get(options.url)
    for item in driver.find_elements_by_css_selector('li'):
        print(item.text)
    driver.close()
class FunctionalTests(LiveServerTestCase):
    """Base class for the browser-driven functional test suite."""

    def setUp(self):
        """Start a maximized Firefox with a small implicit wait."""
        self.driver = Firefox()
        self.driver.maximize_window()
        self.driver.implicitly_wait(5)

    def tearDown(self):
        """Shut the browser down after each test."""
        self.driver.close()

    def get_live_url(self, url_name):
        """Resolve *url_name* against the live test server's base URL."""
        base = self.live_server_url
        return '{}{}'.format(base, reverse(url_name))
class ContentRetrieverUsingSelenium:
    """Fetches pages with a real Firefox instance and parses them with BeautifulSoup."""

    def __init__(self, timeout):
        # Seconds to wait after navigation so dynamic content can finish loading.
        self.timeout = timeout
        self.browser = Firefox()

    def getContentOfPage(self, url):
        """Return ``(final_url, soup)`` for the page at *url*."""
        self.browser.get(url)
        time.sleep(self.timeout)

        # Re-encode to gbk, silently dropping unmappable characters.
        source = self.browser.page_source.encode('gbk', 'ignore')
        return (self.browser.current_url, BeautifulSoup(source))

    def close(self):
        """Close the underlying browser window."""
        self.browser.close()
Beispiel #5
0
def read_url(url):
    """Open *url* in Firefox and scroll to the bottom in random-sized steps.

    NOTE: relies on a module-level ``options`` (Firefox options) being defined.
    """
    driver = Firefox(options=options)
    driver.maximize_window()
    driver.get(url)
    time.sleep(4)

    page_height = driver.execute_script("return document.body.scrollHeight")
    print(page_height)

    offset = 0
    while offset < page_height:
        driver.execute_script(f"window.scrollTo(0, {offset});")
        step = random.randint(50, 500)
        offset += step
        # Pause roughly in proportion to how far we just scrolled.
        pause = step // 20
        # print(page_height, offset, step, pause)
        time.sleep(pause)

    driver.close()
## Fill in the id/pwd credentials, then log in
# NOTE(review): hard-coded credentials -- should come from config/env, not source.
id_txt.send_keys('korea7030')
pw_txt.send_keys('akachiki10!')
login_btn.submit()

chrome.implicitly_wait(50)

# Visit each article URL and append its body and replies to CSV files.
for article in article_url:
    chrome.get(article)
    # The article content lives inside the 'cafe_main' iframe.
    chrome.switch_to.frame('cafe_main')
    soup_body = BeautifulSoup(chrome.page_source, "lxml")
    try:
        with open('content.csv', 'a', encoding='utf8') as f:
            writer = csv.writer(f)
            article_body = soup_body.find('div', {"id": "tbody"})
            print(article_body.text)
            writer.writerow([article_body.text])

            reply_tag = soup_body.find_all('span', class_='comm_body')
            for reply in reply_tag:
                # NOTE(review): inner `f` shadows the outer content.csv handle.
                with open('reply.csv', 'a', encoding='utf8') as f:
                    writer_reply = csv.writer(f)
                    print("reply text : -----------" + reply.text)
                    writer_reply.writerow([reply.text])

    except AttributeError as e:
        # find() returned None: the article had no body in the expected div.
        print("Error row : " + article)

        # print(reply_tag)
chrome.close()
Beispiel #7
0
#!/usr/bin/python3
# Dump every anchor tag from a Moodle course page.
import requests
from bs4 import BeautifulSoup
from selenium.webdriver import Firefox

browser = Firefox()
browser.get("https://service.cloud.teu.ac.jp/moodle/course/view.php?id=7661")

page = BeautifulSoup(browser.page_source, "html.parser")
print(page.find_all("a"))
browser.close()
# Script fragment: depends on `button`, `browser` and `gmaps2` defined earlier.
# NOTE(review): Python 2 code (`print l_regions` statement at the bottom).
button.click()

# # Identify all states in the list, read as text using Selenium
list_item = browser.find_element_by_class_name('drop-down-list')
# NOTE(review): `states` is captured but never used below.
states = list_item.text

# sel_st = raw_input('Type in 2 letter st abbreviation: ')
find_st = browser.find_element_by_link_text('CO')
find_st.click()
sleep(1)
list_region = browser.find_element_by_xpath('//*[@id="select-region"]/div[2]') # find and click Choose Region button
list_region.click()
cur_state = browser.find_element_by_xpath('//*[@id="select-region"]/div[3]')

# text_region = cur_state.find_elements_by_tag_name('a') # find all regions by <a> tag
l_regions = cur_state.text

# creates text file with all the region
# NOTE(review): file handle is never closed -- a `with` block would be safer.
regions = open('Regions.text', 'w')
regions.write(l_regions)

print l_regions
# Place region points on map
gmaps2.gmaps_mapit(l_regions)


# url = './mymap.html'
# webbrowser.open_new_tab(url)

browser.close()
Beispiel #9
0
                           help='Number of adults: default 0',
                           default=0)

    url, is_round_trip = get_url(argparser.parse_args())

    opts = Options()
    opts.set_headless()
    driver = Firefox(options=opts)

    print("Departure   Arrival     Price       Duration   ")
    if is_round_trip:
        print("-> Return Flight Details")

    get_flights(driver, url, is_round_trip)

    driver.close()

# --------------------------------------------------------------------------------------------------------------------------
# Source: https://github.com/Shruti-Pattajoshi/Travel-Website-Scraping-for-the-Cheapest-Fares/blob/master/flights_scraping.py

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

import pandas as pd

import time
import datetime

import smtplib
from email.mime.multipart import MIMEMultipart
Beispiel #10
0
# Demo: locating elements with CSS combinators on a practice page.
from selenium.webdriver import Firefox
from time import sleep

browser = Firefox()
url = 'http://selenium.dunossauro.live/aula_06_a.html'

browser.get(url)

browser.find_elements_by_css_selector('div.form-group')

browser.find_elements_by_css_selector(
    'div.form-group + br'  # a <br> that is a sibling of a div.form-group
)[1].get_attribute('id')

# From the div.form-group tag, take the child with id "dentro-nome".
# Bugfix: the original called `b.find_element_by_css_selector(...)`, but `b`
# is undefined in this script -- the browser variable is `browser` (NameError).
browser.find_element_by_css_selector('div.form-group > #dentro-nome')

sleep(10)
browser.close()
browser.quit()
Beispiel #11
0
    def find(self):
        """Scrape finished football matches (teams, scores, odds) from the
        site at ``self.WEB_LINKS["football"]`` and store them in SQLite.

        NOTE(review): the ``win`` column is created but never populated, and
        ``DROP TABLE`` without ``IF EXISTS`` raises on the very first run.
        """
        # CONNECT THE DATABASE
        connector = sqlite3.connect('games-db')
        cursor = connector.cursor()
        cursor.execute('DROP TABLE allResults')
        cursor.execute('CREATE TABLE allResults(win TEXT,'
                       ' home_team TEXT, away_team TEXT, home_score DECIMAL,'
                       ' away_score DECIMAL, home_odd REAL,'
                       ' draw_odd REAL, away_odd REAL)')
        # SET UP THE DRIVER
        options = FirefoxOptions()
        options.headless = True
        driver = Firefox(options=options,
                         executable_path='C://Windows/geckodriver.exe')
        driver.get(self.WEB_LINKS["football"])
        time.sleep(1)
        # Click the first calendar tab to load yesterday's/today's results.
        driver.find_element_by_css_selector(
            '#live-table > div.tabs > div.calendar > div:nth-child(1) > div'
        ).click()
        time.sleep(1)
        html = driver.execute_script(
            'return document.documentElement.outerHTML;')

        # CLOSE THE BROWSER
        driver.close()

        # GET THE DATA
        soup = bs4.BeautifulSoup(html, 'html.parser')
        matches = soup.find_all(class_=re.compile('event__match'))
        # Keep only completed games; drop the first two child nodes of each.
        all_games = [
            list(game)[2:] for game in matches if 'Finished' in str(game)
        ]

        # WORK WITH THE DATA
        for game in all_games:
            items = {
                "home_team": "",
                "away_team": "",
                "home_score": "",
                "away_score": "",
                "home_odd": "",
                "draw_odd": "",
                "away_odd": ""
            }
            # Each child element is classified by markers in its raw HTML
            # (participant--home/away, event__scores, o_1/o_0/o_2 odds cells).
            for element in game:
                if 'participant--home' in str(element):
                    pattern = r'(\"\>([A-z0-9]+.+)\<[s][v][g][ ]|\"\>[A-z0-9].+\<\/[d][i])'
                    home_team = re.search(pattern, str(element))
                    home_team_token = home_team.group(1)[2:].split('<')
                    items["home_team"] = home_team_token[0]
                elif 'participant--away' in str(element):
                    pattern = r'(\"\>([A-z0-9]+.+)\<[s][v][g][ ]|\"\>[A-z0-9].+\<\/[d][i])'
                    away_team = re.search(pattern, str(element))
                    team_away_token = away_team.group(1)[2:].split('<')
                    items["away_team"] = team_away_token[0]
                elif 'event__scores' in str(element):
                    pattern = r'[n]\>(\d+)\<\/'
                    tokens = re.findall(pattern, str(element))
                    items["home_score"] = int(tokens[0])
                    items["away_score"] = int(tokens[1])
                elif 'o_1' in str(element):
                    pattern = r'\"\>(\d{1,2}\.\d{2})\<\/[s]'
                    try:
                        home_odd = re.search(pattern, str(element))
                        items["home_odd"] = home_odd.group(1)
                    except AttributeError:
                        # No odds published -- fall back to even money.
                        items["home_odd"] = "1.00"
                elif 'o_0' in str(element):
                    pattern = r'\"\>(\d{1,2}\.\d{2})\<\/[s]'
                    try:
                        draw_odd = re.search(pattern, str(element))
                        items["draw_odd"] = draw_odd.group(1)
                    except AttributeError:
                        items["draw_odd"] = "1.00"
                elif 'o_2' in str(element):
                    pattern = r'\"\>(\d{1,2}\.\d{2})\<\/[s]'
                    try:
                        away_odd = re.search(pattern, str(element))
                        items["away_odd"] = away_odd.group(1)
                    except AttributeError:
                        items["away_odd"] = "1.00"

        # INSERT THE DATA INTO THE DATABASE
        # (executed once per game, inside the loop above)
            cursor.execute(
                'INSERT INTO allResults(home_team, away_team, home_score,'
                ' away_score, home_odd, draw_odd, away_odd) VALUES'
                ' (?, ?, ?, ?, ?, ?, ?)',
                (items["home_team"], items["away_team"], items["home_score"],
                 items["away_score"], items["home_odd"], items["draw_odd"],
                 items["away_odd"]))
        connector.commit()
        connector.close()
Beispiel #12
0
class BandLeader():
    """Interactive Bandcamp Discover player with a CSV play-history database.

    Drives a headless Firefox on the Bandcamp front page, exposes play/pause
    and track navigation, and records what was played via a background thread.
    """

    def __init__(self, csvpath=None):
        """Start the browser and, if *csvpath* exists, load the play history.

        :param csvpath: optional path of the CSV history database
        """
        # Database state
        self.database_path = csvpath
        self.database = []
        self._current_track_record = None

        # Load database from disk if possible.
        # Bugfix: guard against csvpath=None -- isfile(None) raises TypeError.
        if self.database_path and isfile(self.database_path):
            with open(self.database_path, newline='') as dbfile:
                dbreader = csv.reader(dbfile)
                next(dbreader)  # To ignore the header line
                self.database = [TrackRec._make(rec) for rec in dbreader]

        # Create a headless browser
        opts = Options()
        opts.headless = True
        self.browser = Firefox(options=opts)
        self.browser.get(BANDCAMP_FRONTPAGE)

        # Track list related state
        self._current_track_number = 1
        self.track_list = []
        self.tracks()

        # Database maintenance thread
        self.thread = Thread(target=self._maintain)
        self.thread.daemon = True  # Kills the thread when the main process dies
        self.thread.start()

    def save_db(self):
        """Write the in-memory history (header + records) to the CSV file."""
        if not self.database_path:
            # No database file configured; nothing to persist.
            return
        with open(self.database_path, 'w', newline='') as dbfile:
            dbwriter = csv.writer(dbfile)
            dbwriter.writerow(list(TrackRec._fields))
            for entry in self.database:
                dbwriter.writerow(list(entry))

    def _maintain(self):
        """Background loop: flush new track records to disk periodically."""
        while True:
            self._update_db()
            sleep(20)  # Check every 20 seconds

    def _update_db(self):
        """Append the current track record to the database if it is new."""
        try:
            # Only record when something is playing and it differs from the
            # most recently stored entry.
            check = (self._current_track_record is not None
                     and (len(self.database) == 0
                          or self.database[-1] != self._current_track_record)
                     and self.is_playing())

            if check:
                self.database.append(self._current_track_record)
                self.save_db()
        except Exception as e:
            print('error while updating the db: {}'.format(e))

    def tracks(self):
        '''
        Query the page to populate a list of available tracks
        '''

        # Sleep to give the browser time to render and finish any animations
        sleep(1)

        # Get the container for the visible track list
        discover_section = self.browser.find_elements_by_class_name(
            'discover-results')
        left_x = discover_section[0].location['x']
        right_x = left_x + discover_section[0].size['width']

        # Filter the items in the list to include only those we can click
        discover_items = self.browser.find_elements_by_class_name(
            'discover-item')
        self.track_list = [
            t for t in discover_items
            if t.location['x'] >= left_x and t.location['x'] < right_x
        ]

        # Print the available tracks to the screen
        for (i, track) in enumerate(self.track_list):
            print('[{}]'.format(i + 1))
            lines = track.text.split('\n')
            print('Album : {}'.format(lines[0]))
            print('Artist : {}'.format(lines[1]))
            if len(lines) > 2:
                print('Genre : {}'.format(lines[2]))

    def catalogue_pages(self):
        '''
        Print the available pages in the catalogue that are presently accessible
        '''

        print('PAGES')
        for e in self.browser.find_elements_by_class_name('item-page'):
            print(e.text)
        print('')

    def more_tracks(self, page='next'):
        '''
        Advance the catalogue and repopulates the track list. We can pass in a number
        to advance any of the available pages
        '''

        next_btn = [
            e for e in self.browser.find_elements_by_class_name('item-page')
            if e.text.lower().strip() == str(page)
        ]

        if next_btn:
            next_btn[0].click()
            self.tracks()

    def play(self, track=None):
        '''
        Play a track. If no track number is supplied, the presently selected track will play
        '''

        if track is None:
            self.browser.find_elements_by_class_name('playbutton')[0].click()
        elif type(track) is int and track <= len(
                self.track_list) and track >= 1:
            self._current_track_number = track
            self.track_list[self._current_track_number - 1].click()

        sleep(0.5)
        if self.is_playing():
            self._current_track_record = self.currently_playing()

    def play_next(self):
        '''
        Plays the next available track
        '''

        if self._current_track_number < len(self.track_list):
            self.play(self._current_track_number + 1)
        else:
            self.more_tracks()
            self.play(1)

    def pause(self):
        '''
        Pauses the playback
        '''

        # The play button toggles, so play() with no argument pauses.
        self.play()

    def is_playing(self):
        '''
        Returns `True` if a track is presently playing
        '''

        playbtn = self.browser.find_elements_by_class_name('playbutton')
        return playbtn[0].get_attribute('class').find('playing') > -1

    def currently_playing(self):
        '''
        Returns the record for the currently playing track or None if nothing is playing
        '''

        try:
            if self.is_playing():
                title = self.browser.find_elements_by_class_name(
                    'title')[0].text
                album_detail = self.browser.find_elements_by_css_selector(
                    '.detail-album > a')
                album_title = album_detail[0].text
                album_url = album_detail[0].get_attribute('href').split('?')[0]
                artist_detail = self.browser.find_elements_by_css_selector(
                    '.detail-artist > a')
                artist = artist_detail[0].text
                artist_url = artist_detail[0].get_attribute('href').split(
                    '?')[0]
                return TrackRec(title, artist, artist_url, album_title,
                                album_url, ctime())
        except Exception as e:
            print('There was an error: {}'.format(e))

        return None

    def quit(self):
        '''
        Quit out of Selenium and close browser
        '''

        self.browser.close()
        # Calls the interpreter's builtin quit() to end the session.
        quit()
Beispiel #13
0
        }
        for x, y, w, z in zip(
            d.find_elements_by_css_selector("._3wU53n"),
            d.find_elements_by_css_selector("._2rQ-NK"),
            d.find_elements_by_xpath("//div[@class='_3BTv9X']/img"),
            d.find_elements_by_xpath("//a[@class='_31qSD5']"))
        if containsall(x.text.lower(),
                       pn.lower().split())
    })
    try:
        d.execute_script('''window.open("{}","_blank");'''.format(
            d.find_element_by_xpath(
                "//span[contains(text(),'Next')]/parent::a[@class='_3fVaIS']").
            get_attribute("href")))
        sleep(3)
        d.close()
        d.switch_to.window(d.window_handles[0])
        sleep(3)
    except:
        break
d.close()
try:
    os.remove(w + r"\Database\flipkart.json")
except:
    pass
with open(w + r"\Database\flipkart.json", "w") as p:
    json.dump(
        {
            a: b
            for i in m for a, b in i.items()
            if b["Price"] == min([j[n]["Price"] for j in m for n in j])
Beispiel #14
0
class WeixinSelenium(Base):
    """Crawls WeChat-article search results via Sogou with a Firefox driver,
    de-duplicating articles by uid and storing results through ``Article``.

    NOTE(review): relies on module-level globals (START_PAGE, END_PAGE,
    REFER_FIRST, HOST/PORT/DB/COLLECTION, storage_word, in_collection,
    in_client) defined elsewhere in this file.
    """

    def __init__(self):
        # Page range to crawl per query word.
        self.start_page = START_PAGE
        self.end_page = END_PAGE
        self.weixin_url = REFER_FIRST

        self.driver = Firefox()

        self.client = MongoClient(HOST, PORT)
        self.collection = self.client[DB][COLLECTION]
        # Snapshot of already-seen article uids, used for de-duplication.
        self.all_uids = self.uids

    def open_weixin_browser(self, word):
        """Search for *word*; return True when the attempt failed (caller
        should restart the browser)."""
        try:
            self.driver.get(self.weixin_url)
            self.driver.set_page_load_timeout(3)

            self.driver.find_element_by_id('upquery').send_keys(word)
            self.driver.find_element_by_class_name('swz').click()
            self.driver.implicitly_wait(3)

            urls_uids = self.extract_urls_uids(word=word)
            Article(urls_uids=urls_uids, word=word).extract()
        except Exception as e:
            # Record the failed word (page 0) so crawling can resume later.
            storage_word.append([word, 0])
            self.logger.info('Open weixin error: type <{}>, mag <{}>'.format(
                e.__class__, e))
            self.close_browser()
            return True
        return False

    def get_total_pages_to_word(self):
        """Return the highest page number shown in the pager, or 1 if every
        token is numeric; None when the pager is missing/unparseable."""
        pages = []
        page_id_css = 'pagebar_container'

        try:
            e = self.driver.find_element_by_id(page_id_css)
            for _p in e.text.split():
                _p = _p.strip()

                if not _p.isdigit():
                    # First non-digit token ("next" link) ends the page list.
                    return pages[-1]
                else:
                    pages.append(int(_p))
            return 1
        except (NoSuchElementException, NoSuchWindowException, TypeError,
                IndexError):
            pass

    def get_query_words(self):
        """Collect the distinct query words ('conp' plus related 'rel' terms)
        from the configuration collection, preserving first-seen order."""
        query_words = []

        for docs in self.collection.find({}, {
                'rel': 1,
                'conp': 1
        }).sort([('_id', 1)]):
            w = docs['conp']

            if w not in query_words:
                query_words.append(w)

            for item in docs['rel']:
                if item not in query_words:
                    query_words.append(item)

        self.client.close()
        return query_words

    @property
    def uids(self):
        # Set of uids already stored; in_collection is a module-level handle.
        return {
            docs['uid']
            for docs in in_collection.find({}, {'uid': 1}) if 'uid' in docs
        }

    def extract_urls_uids(self, word):
        """Pair each result URL with a uid derived from its timestamp, title
        and *word*; return only entries not seen before."""
        urls_uids = []
        timestamp = [
            _t.get_attribute('t')
            for _t in self.driver.find_elements_by_css_selector('div.s-p')
        ]
        urls_tits = [(t.get_attribute('href'), self.trim(t.text))
                     for t in self.driver.find_elements_by_css_selector('h4 a')
                     ]

        # Timestamps and result links must line up one-to-one.
        if len(urls_tits) != len(timestamp):
            return urls_uids

        for index, url_tit in enumerate(urls_tits):
            try:
                uid = self.md5(timestamp[index] + url_tit[1] + word)

                if uid not in self.all_uids:
                    self.all_uids.add(uid)
                    urls_uids.append({'url': url_tit[0], 'uid': uid})
            except (TypeError, IndexError):
                pass
        return urls_uids

    @staticmethod
    def query_index(words, cut_word):
        """Return the index of *cut_word* in *words*, or 0 when absent
        (crawl restarts from the beginning)."""
        try:
            index = words.index(cut_word)
            return index
        except ValueError:
            pass
        return 0

    @property
    def is_forbidden(self):
        """True when the site is showing its captcha form (crawler blocked)."""
        css_id = 'seccodeForm'

        try:
            if self.driver.find_element_by_id(css_id):
                return True
        except NoSuchElementException:
            pass
        return False

    def appear_element(self, by):
        """Wait up to 20s for element id *by*, click it, and report success."""
        try:
            # Have `click` function to specified element
            tag = WebDriverWait(
                self.driver,
                20).until(lambda driver: driver.find_element_by_id(by))
            tag.click()
            return True
        except (TimeoutException, NoSuchWindowException,
                NoSuchElementException):
            pass
        return False

    def crawl(self, word=None, go=0):
        """Crawl all query words starting from *word*, resuming page *go*.

        NOTE(review): if open_weixin_browser() sets is_break=True, `msg` is
        referenced below without ever being assigned (potential NameError)
        unless one of the two inner conditions also fires -- verify intent.
        """
        is_go = True
        is_break = False
        go_page = int(go)
        next_page_css = 'sogou_page_%s'
        query_words = self.get_query_words()
        ind = self.query_index(query_words, word)

        for index, word in enumerate(query_words[ind:], 1):
            next_ind = ind + index
            is_break = self.open_weixin_browser(word)
            pages = self.get_total_pages_to_word()

            for page in range(self.start_page + 1,
                              (pages or self.end_page) + 1):
                # Skip pages below the resume point on the first word only.
                if is_go and page < go_page:
                    continue
                else:
                    is_go = False

                if not self.appear_element(by=next_page_css % page):
                    is_break = True
                    msg = '\tNot appear next page element, will break, new open browser!'
                elif self.is_forbidden:
                    is_break = True
                    msg = '\tSpider was forbidden, crawling again after sleeping a moment!'

                if is_break:
                    storage_word.append([word, page])
                    self.logger.info(msg)
                    break

                urls_uids = self.extract_urls_uids(word=word)
                Article(urls_uids=urls_uids, word=word).extract()

                # self.driver.find_element_by_id(next_page_css % page).click()
                # Randomized delay; longer every 5th page to look less bot-like.
                wt = randint(10, 40) if page % 5 == 0 else randint(5, 18)
                self.logger.info(
                    'Index <{}>, Word <{}>, Page <{}> Done, sleeping {}s!'.
                    format(next_ind, word, page, wt))
                self.driver.implicitly_wait(wt)

            if is_break:
                break

        in_client.close()
        self.close_browser()

    def close_browser(self):
        """Close the driver window, ignoring an already-closed window."""
        try:
            self.driver.close()
        except (NoSuchWindowException, ):
            pass
Beispiel #15
0
def query(phone_number, debug, proxy, tor_proxy, outfile):
    """Look up *phone_number* on cid.ninja and log the details.

    :param phone_number: query this number
    :param debug: don't use headless mode (for debugging)
    :param proxy: use this proxy server ("host:port")
    :param tor_proxy: configure browser for tor proxy (SOCKS only)
    :param outfile: log to this file
    :raises ValueError: when the site reports the IP is rate-limited
    :return: --
    """
    opts = Options()
    if not debug:
        opts.set_headless()
        assert opts.headless  # Operating in headless mode

    profile = webdriver.FirefoxProfile()
    # set FF preference to socks proxy
    if proxy:
        print('Setting proxy...')
        proxy = proxy.split(':')
        proxy_host = proxy[0]
        proxy_port = proxy[1]
        proxy_port = int(proxy_port)
        profile.set_preference("network.proxy.type", 1)
        if not tor_proxy:
            profile.set_preference("network.proxy.http", proxy_host)
            profile.set_preference("network.proxy.http_port", proxy_port)
            profile.set_preference('network.proxy.https', proxy_host)
            # Bugfix: the original set 'network.proxy.https' a second time with
            # the port, clobbering the host and never setting the port pref.
            profile.set_preference('network.proxy.https_port', proxy_port)
            profile.set_preference('network.proxy.ssl', proxy_host)
            profile.set_preference('network.proxy.ssl_port', proxy_port)
        profile.set_preference("network.proxy.socks", proxy_host)
        profile.set_preference("network.proxy.socks_port", proxy_port)
        profile.set_preference("network.proxy.socks_version", 5)
        # NOTE(review): 'network.proxy_dns' looks like it was meant to be
        # 'network.proxy.socks_remote_dns' (boolean) -- confirm before changing.
        profile.set_preference('network.proxy_dns', 'true')

    profile.update_preferences()
    browser = Firefox(options=opts, firefox_profile=profile)
    get_url = 'https://www.cid.ninja/phone-numbers/?query=#' + str(
        phone_number)
    browser.get(get_url)

    # A redirect back to the home page means the lookup quota was exhausted.
    title = browser.title
    if title == 'Home - CID Ninja':
        print(Fore.RED +
              'Maximum lookups for this IP reached, use a new proxy')
        browser.close()
        raise ValueError('IP Blacklist')

    # Scrape each detail field and mirror it to the log file.
    phone_number = browser.find_element_by_id('details-phone-number').text
    printlog('Phone Number: ' + phone_number, outfile)
    details_location = browser.find_element_by_id('details-location').text
    printlog('Location: ' + details_location, outfile)
    cid_name = browser.find_element_by_id('details-cnam').text
    printlog('CID Name: ' + cid_name, outfile)
    carrier_name = browser.find_element_by_id('details-carrier').text
    printlog('Carrier Name: ' + carrier_name, outfile)
    details_sms = browser.find_element_by_id('details-sms').text
    printlog('SMS Email: ' + details_sms, outfile)
    details_old_carrier = browser.find_element_by_id('details-carrier-o').text
    printlog('Old Carrier:' + details_old_carrier, outfile)
    details_mms = browser.find_element_by_id('details-mms').text
    printlog('MMS Email: ' + details_mms, outfile)
    details_tel_num = browser.find_element_by_id('details-tel-num').text
    printlog('Carrier Help Line: ' + details_tel_num, outfile)
    details_slogan = browser.find_element_by_id('details-slogan').text
    printlog('Carrier Slogan: ' + details_slogan, outfile)
    browser.close()
Beispiel #16
0
def run_get_courses():
    """Scrape all course data, patch course 104223's prerequisites, run the
    post-processing passes, and pickle the results to disk."""
    # run browser -- retry until Firefox comes up.
    # Bugfix: the original bare `except:` also swallowed KeyboardInterrupt /
    # SystemExit, making this loop impossible to interrupt.
    started = False
    while not started:
        try:
            browser = Firefox()
            started = True
        except Exception:
            pass

    browser.set_page_load_timeout(8)
    browser.set_script_timeout(8)

    # get courses' numbers
    courses_ex = get_all_existing_courses(browser)
    with open('courses_ex.pickle', 'wb') as file:
        pickle.dump(courses_ex, file)

    courses = get_all_courses(browser, courses_ex)
    with open('courses1.pickle', 'wb') as file:
        pickle.dump(courses, file)
    browser.close()

    # with open('courses1.pickle', 'rb') as file:
    #    courses = pickle.load(file)
    print('{} courses loaded'.format(len(courses)))

    with open('faculties.pickle', 'rb') as file:
        faculties = pickle.load(file)

    # Hand-maintained prerequisite combinations for course 104223
    # (cartesian product of alternative calculus/algebra courses).
    courses[104223].requires = [[104013, 104016, 104131],
                                [104014, 104016, 104131],
                                [104020, 104016, 104131],
                                [104022, 104016, 104131],
                                [104281, 104016, 104131],
                                [104013, 104171, 104131],
                                [104014, 104171, 104131],
                                [104020, 104171, 104131],
                                [104022, 104171, 104131],
                                [104281, 104171, 104131],
                                [104013, 104016, 104135],
                                [104014, 104016, 104135],
                                [104020, 104016, 104135],
                                [104022, 104016, 104135],
                                [104281, 104016, 104135],
                                [104013, 104171, 104135],
                                [104014, 104171, 104135],
                                [104020, 104171, 104135],
                                [104022, 104171, 104135],
                                [104281, 104171, 104135],
                                [104013, 104016, 104285],
                                [104014, 104016, 104285],
                                [104020, 104016, 104285],
                                [104022, 104016, 104285],
                                [104281, 104016, 104285],
                                [104013, 104171, 104285],
                                [104014, 104171, 104285],
                                [104020, 104171, 104285],
                                [104022, 104171, 104285],
                                [104281, 104171, 104285]]
    # Derived-field post-processing passes.
    courses = course_class.fill_required(courses)
    courses = course_class.fill_all_required(courses)
    courses = course_class.fill_faculties(courses, faculties)
    courses = course_class.fill_requirement_depth(courses)

    with open('courses.pickle', 'wb') as file:
        pickle.dump(courses, file)
    print('Processing finished')
Beispiel #17
0
class BaseTestCase(unittest.TestCase):
    """Shared fixture: one Firefox window per test, closed afterwards."""

    def setUp(self):
        """Open a fresh browser for the test."""
        self.browser = Firefox()

    def tearDown(self):
        """Dispose of the browser window."""
        self.browser.close()
Beispiel #18
0
def seltabup(dirc, uname, destination):
    """Upload every ``*.zip`` table in *dirc* to Earth Engine under *destination*.

    Signs in to code.earthengine.google.com with a Selenium-driven Firefox,
    copies the authenticated cookies into a requests.Session, then for each
    zip file requests a one-time upload URL, pushes the file, and starts an
    ``earthengine upload table`` ingestion task.

    :param dirc: local directory containing the zipped shapefiles
    :param uname: Google account username/email used for sign-in
    :param destination: Earth Engine asset folder to ingest into
    """
    ee.Initialize()
    options = Options()
    options.add_argument('-headless')
    authorization_url = "https://code.earthengine.google.com"
    # BUG FIX: the original read `uname = str(username)`, but `username` was
    # assigned further down in this function, which made it a local name and
    # raised UnboundLocalError here.  The parameter itself is what we want.
    uname = str(uname)
    # NOTE(review): `password` is not a parameter; it must be a module-level
    # global set elsewhere in this file -- confirm before relying on it.
    passw = str(password)
    # BUG FIX: the original used `elif os.name == "posix"`, leaving `driver`
    # undefined (NameError) on any other platform; fall back to the posix
    # binary name for everything that is not Windows.
    if os.name == "nt":
        gecko_path = os.path.join(lp, "geckodriver.exe")
    else:
        gecko_path = os.path.join(lp, "geckodriver")
    driver = Firefox(executable_path=gecko_path, firefox_options=options)
    driver.get(authorization_url)
    time.sleep(5)
    # Google sign-in: username page first, then password page.
    username_field = driver.find_element_by_xpath('//*[@id="identifierId"]')
    username_field.send_keys(uname)
    driver.find_element_by_id("identifierNext").click()
    time.sleep(5)
    driver.find_element_by_name("password").send_keys(passw)
    driver.find_element_by_id("passwordNext").click()
    time.sleep(5)
    try:
        # OAuth consent screens (account chooser + approve access); they do
        # not always appear, so failures here are deliberately ignored.
        driver.find_element_by_xpath(
            "//div[@id='view_container']/form/div[2]/div/div/div/ul/li/div/div[2]/p"
        ).click()
        time.sleep(5)
        driver.find_element_by_xpath(
            "//div[@id='submit_approve_access']/content/span").click()
        time.sleep(5)
    except Exception:
        pass
    # Re-use the authenticated browser session for plain HTTP uploads.
    cookies = driver.get_cookies()
    s = requests.Session()
    for cookie in cookies:
        s.cookies.set(cookie['name'], cookie['value'])
    driver.close()
    try:
        i = 1
        _, _, walk_files = next(os.walk(dirc))
        file_count = len(walk_files)
        for item in os.listdir(dirc):
            if not item.endswith('.zip'):
                continue
            # Ask EE for a one-time upload URL, push the zip, then ingest.
            r = s.get(
                "https://code.earthengine.google.com/assets/upload/geturl")
            upload_url = ast.literal_eval(r.text)['url']
            file_path = os.path.join(dirc, item)
            with open(file_path, 'rb') as f:
                resp = s.post(upload_url, files={'file': f})
            gsid = resp.json()[0]
            asset_full_path = destination + '/' + item.split('.')[0]
            output = subprocess.check_output(
                'earthengine upload table --asset_id ' +
                str(asset_full_path) + ' ' + str(gsid),
                shell=True)
            print('Ingesting ' + str(i) + ' of ' + str(file_count) +
                  ' ' + str(os.path.basename(asset_full_path)) +
                  ' task ID: ' + str(output).strip())
            i = i + 1
    except Exception as e:
        print(e)
            # Which table we  are processing
            title = table.find_previous_sibling('h3')
            if title:
                title = title.string
            else:
                title = table.parent.find_previous_sibling('h3')
                if title:
                    title = title.string
                else:
                    title = 'No inmediate name'

            df['region'] = pd.Series(data=[current_region] * len(df.index))
            df['description'] = pd.Series(data=[title] * len(df.index))

            # Decide in which list to put the extracted table
            if 'SLES Premium' in df.columns:
                suse_list.append(df)
            else:
                pricing_list.append(df)

            print('{0}: {1}'.format(title, df.shape))

    if not os.path.exists('./azure/data'):
        os.mkdir('./azure/data')

    save_df_list(suse_list, './azure/data/azure_pricing_vm_suse.csv')
    save_df_list(pricing_list, './azure/data/azure_pricing_vm_common.csv')

    driver.close()
Beispiel #20
0
class Site:
    """Base class for a credential-protected site scraped via Selenium.

    Subclasses are expected to define LOGIN_PAGE, LOGIN_BUTTON_SELECTOR,
    LOGIN_USERNAME_SELECTOR and LOGIN_PASSWORD_SELECTOR, and may override
    _parse_configuration() for additional settings.
    """

    def __init__(self, args):
        # Parsed CLI args; only `show_browser` is read here (args may be None).
        self.args = args

        self.site_name = type(self).__name__
        # Colorize the displayed name only when stdout is an interactive tty.
        self.site_displayname = BashColor.HEADER + BashColor.BOLD + self.site_name + BashColor.END \
            if sys.stdout.isatty() else self.site_name

        self.config = ConfigParser()
        # Read the template first, then let the user's file override it.
        self.__read_config_file('credentials.cfg.orig')
        self.__read_config_file('credentials.cfg')
        self._parse_credentials()
        self._parse_configuration()

        self._init_browser()

    def __read_config_file(self, filename):
        """Load a config file located in this package's parent directory."""
        self.config.read(os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, filename)))

    def _parse_credentials(self):
        """Resolve USERNAME/PASSWORD, preferring the environment variables
        <SITENAME>_USERNAME / <SITENAME>_PASSWORD over the config file."""
        if os.environ.get(self.site_name.upper() + '_USERNAME'):
            self.USERNAME = os.environ.get(self.site_name.upper() + '_USERNAME')
        else:
            self.USERNAME = self.config[self.site_name]['USERNAME']
        if os.environ.get(self.site_name.upper() + '_PASSWORD'):
            self.PASSWORD = os.environ.get(self.site_name.upper() + '_PASSWORD')
        else:
            self.PASSWORD = self.config[self.site_name]['PASSWORD']

    def _parse_configuration(self):
        # this method should be overwritten by a site, if there are more configs to parse than just the credentials
        pass

    def _init_browser(self):
        """Start Firefox (inside an Xvfb virtual display unless the browser
        should be shown), configure silent downloads, and perform the login."""
        if self.args and not self.args.show_browser:
            self.display = Xvfb()
            self.display.start()

        profile = FirefoxProfile()
        # Download CSV/zip files silently into EXPORTS_FOLDER without dialogs.
        profile.set_preference("browser.download.folderList", 2)
        profile.set_preference("browser.download.manager.showWhenStarting", False)
        profile.set_preference("browser.download.dir", EXPORTS_FOLDER)
        profile.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/csv, application/zip")
        profile.set_preference("browser.helperApps.alwaysAsk.force", False)
        profile.set_preference("devtools.jsonview.enabled", False)
        profile.set_preference("media.volume_scale", "0.0")
        # https://github.com/mozilla/geckodriver/issues/858#issuecomment-322512336
        profile.set_preference("dom.file.createInChild", True)

        self.browser = Firefox(firefox_profile=profile)
        # http://stackoverflow.com/questions/42754877/cant-upload-file-using-selenium-with-python-post-post-session-b90ee4c1-ef51-4  # pylint: disable=line-too-long
        self.browser._is_remote = False  # pylint: disable=protected-access

        self.login()
        time.sleep(1)
        self._check_login_successful()

    def login(self):
        """Open LOGIN_PAGE and submit the credentials, retrying once if the
        form elements are not present yet."""
        sys.stdout.write('===== ' + self.site_displayname + ': performing login')
        sys.stdout.flush()
        self.browser.get(self.LOGIN_PAGE)
        time.sleep(1)

        try:
            self._insert_login_credentials()
            self._click_login_button()
        except NoSuchElementException:
            time.sleep(2)  # wait for page to load and try again
            self._insert_login_credentials()
            self._click_login_button()

    def _check_login_successful(self):
        """Exit the program if the login form is still present, i.e. the
        login did not navigate away from the form."""
        if len(self.browser.find_elements_by_xpath(self.LOGIN_BUTTON_SELECTOR)) > 0 \
                and len(self.browser.find_elements_by_xpath(self.LOGIN_USERNAME_SELECTOR)) > 0 \
                and len(self.browser.find_elements_by_xpath(self.LOGIN_PASSWORD_SELECTOR)) > 0:
            command_line.error("Login to %s failed." % self.site_name)
            sys.stdout.write("Please check if the credentials are correctly set in your credentials.cfg\r\n")
            sys.stdout.flush()
            self.kill_browser()
            sys.exit(1)

    def _insert_login_credentials(self):
        """Type USERNAME/PASSWORD into the login form fields."""
        login_field_user = self.browser.find_element_by_xpath(self.LOGIN_USERNAME_SELECTOR)
        login_field_user.send_keys(self.USERNAME)
        login_field_password = self.browser.find_element_by_xpath(self.LOGIN_PASSWORD_SELECTOR)
        login_field_password.send_keys(self.PASSWORD)

    def _click_login_button(self):
        """Click the login button and give the next page time to load."""
        login_button = self.browser.find_element_by_xpath(self.LOGIN_BUTTON_SELECTOR)
        login_button.click()
        time.sleep(2)  # wait for page to load

    def kill_browser(self):
        """Tear down the browser (and the virtual display, if one was started)."""
        self.browser.stop_client()
        self.browser.close()
        try:
            self.browser.quit()
        except WebDriverException:
            pass

        if self.args and not self.args.show_browser:
            self.display.stop()

    def get_json_from_html(self):
        """Parse and return the JSON body rendered in the page's <pre> element."""
        response = self.browser.find_element_by_tag_name("pre").text.strip()
        return json.loads(response)
def eereposnap(destination, mode):
    options = Options()
    if mode == "active":
        print("Trying this in live browser")
    elif mode is None:
        options.add_argument("-headless")
    authorization_url = "https://code.earthengine.google.com/"
    try:
        uname = str(raw_input("Enter your Username:  "******"Enter your Username:  "******"Enter your password: "******"nt":
        driver = Firefox(
            executable_path=os.path.join(lp, "geckodriver.exe"), options=options
        )
    else:
        driver = Firefox(
            executable_path=os.path.join(lp, "geckodriver"), options=options
        )
    driver.get(authorization_url)
    username = driver.find_element_by_xpath('//*[@id="identifierId"]')
    username.send_keys(uname)
    driver.find_element_by_id("identifierNext").click()
    time.sleep(5)
    passw = driver.find_element_by_name("password").send_keys(passw)
    driver.find_element_by_id("passwordNext").click()
    time.sleep(5)
    try:
        driver.find_element_by_xpath(
            "//div[@id='view_container']/form/div[2]/div/div/div/ul/li/div/div[2]/p"
        ).click()
        time.sleep(5)
        driver.find_element_by_xpath(
            "//div[@id='submit_approve_access']/content/span"
        ).click()
        time.sleep(10)
    except Exception as e:
        pass
    source = driver.page_source
    soup = BeautifulSoup(source, "lxml")
    source = soup.find("script", text=re.compile("window._ee_flag_initialData"))
    try:
        json_data = json.loads(
            source.string.replace("window._ee_flag_initialData = ", "")
            .replace(";", "")
            .strip()
        )
        for items in json_data["preferences"]["FAST_REPO_LISTS"]:
            if items["access"] == "owner":
                owner.append("https://earthengine.googlesource.com/" + items["name"])
            if items["access"] == "reader":
                reader.append("https://earthengine.googlesource.com/" + items["name"])
            if items["access"] == "writer":
                writer.append("https://earthengine.googlesource.com/" + items["name"])
    except Exception as e:
        print(e)
    driver.get("https://earthengine.googlesource.com/")
    driver.find_element_by_link_text("Sign in").click()
    time.sleep(3)
    driver.find_element_by_xpath(
        "//div[@id='view_container']/div/div/div[2]/div/div/div/form/span/section/div/div/div/div/ul/li/div/div/div/div[2]/div"
    ).click()
    time.sleep(10)
    cookies = driver.get_cookies()
    session = requests.Session()
    for cookie in cookies:
        session.cookies.set(cookie["name"], cookie["value"])
    driver.close()
    if not len(writer) == 0:
        for items in writer:
            base_path = os.path.join(
                destination, "writer_" + str(pendulum.now()).split("T")[0]
            )
            if not os.path.exists(base_path):
                os.makedirs(base_path)
                base_path = os.path.join(
                    destination, "writer_" + str(pendulum.now()).split("T")[0]
                )
            r = session.get(items + str("/+archive/refs/heads/master.tar.gz"))
            if r.status_code == 200:
                filename = (
                    r.headers["Content-Disposition"].split("filename=")[1].split("/")[0]
                )
                local_path = os.path.join(base_path, filename + ".tar.gz")
                if not os.path.exists(local_path):
                    try:
                        print("Downloading to: " + str(local_path))
                        f = open(local_path, "wb")
                        for chunk in r.iter_content(chunk_size=512 * 1024):
                            if chunk:
                                f.write(chunk)
                        f.close()
                        shutil.unpack_archive(
                            local_path, local_path.replace("-refs.tar.gz", "")
                        )
                        os.remove(local_path)
                    except Exception as e:
                        print(e)
                else:
                    print("File already exists: " + str(local_path))
            else:
                sys.exit("Failed with " + r.status_code)
    if not len(reader) == 0:
        for items in reader:
            base_path = os.path.join(
                destination, "reader_" + str(pendulum.now()).split("T")[0]
            )
            if not os.path.exists(base_path):
                os.makedirs(base_path)
            r = session.get(items + str("/+archive/refs/heads/master.tar.gz"))
            if r.status_code == 200:
                filename = (
                    r.headers["Content-Disposition"].split("filename=")[1].split("/")[0]
                )
                local_path = os.path.join(base_path, filename + ".tar.gz")
                if not os.path.exists(local_path):
                    try:
                        print("Downloading to: " + str(local_path))
                        f = open(local_path, "wb")
                        for chunk in r.iter_content(chunk_size=512 * 1024):
                            if chunk:
                                f.write(chunk)
                        f.close()
                        shutil.unpack_archive(
                            local_path, local_path.replace("-refs.tar.gz", "")
                        )
                        os.remove(local_path)
                    except Exception as e:
                        print(e)
                else:
                    print("File already exists: " + str(local_path))
            else:
                sys.exit("Failed with " + r.status_code)
    if not len(owner) == 0:
        for items in owner:
            base_path = os.path.join(
                destination, "owner_" + str(pendulum.now()).split("T")[0]
            )
            if not os.path.exists(base_path):
                os.makedirs(base_path)
            r = session.get(items + str("/+archive/refs/heads/master.tar.gz"))
            if r.status_code == 200:
                filename = (
                    r.headers["Content-Disposition"].split("filename=")[1].split("/")[0]
                )
                local_path = os.path.join(base_path, filename + ".tar.gz")
                if not os.path.exists(local_path):
                    try:
                        print("Downloading to: " + str(local_path))
                        f = open(local_path, "wb")
                        for chunk in r.iter_content(chunk_size=512 * 1024):
                            if chunk:
                                f.write(chunk)
                        f.close()
                        shutil.unpack_archive(
                            local_path, local_path.replace("-refs.tar.gz", "")
                        )
                        os.remove(local_path)
                    except Exception as e:
                        print(e)
                else:
                    print("File already exists: " + str(local_path))
            else:
                sys.exit("Failed with " + r.status_code)
Beispiel #22
0
def click_on(id):
    """Wait up to 10s for the element with DOM id *id*, then click it.

    Uses the module-level ``ff`` Firefox driver.  The parameter name shadows
    the builtin ``id`` but is kept for caller compatibility.
    """
    # until() returns the probe's truthy result, so the element found during
    # the wait can be clicked directly -- the original looked it up a second
    # time, which could race against the page changing in between.
    element = WebDriverWait(ff, 10).until(lambda _: ff.find_element_by_id(id))
    element.click()

def get_price_list():
    """Append the innerHTML of the #pricelist table body to the output file.

    Relies on the module-level ``ff`` (Firefox driver) and ``out`` (open file).
    """
    # The original declared `global out`, but the name is only read here,
    # so no global statement is needed; dead commented-out loop removed.
    price_list = ff.find_element_by_id("pricelist")
    tbody = price_list.find_elements_by_xpath("./tbody")[0]
    out.write(tbody.get_property("innerHTML").encode("utf-8"))

# Scrape 10 pages of the SteamAnalyst price list into a crude HTML table dump.
URL = "https://csgo.steamanalyst.com/list"
URL2 = "https://dota2.steamanalyst.com/list"
OUTFILE = "/tmp/table.xls"

out = open(OUTFILE, "w")
out.write("<table>")
ff = Firefox()
#ff = Firefox(capabilities={"marionette":True})
ff.get(URL)
for _page in xrange(10):
    # Let the user wait for the page to settle before grabbing the table.
    raw_input("Press Enter to continue")
    get_price_list()
    pager_links = ff.find_element_by_id("pricelist_next").find_elements_by_tag_name("a")
    pager_links[0].click()
out.write("</table>")
out.close()
ff.close()
Beispiel #23
0
class Browser:
    """Thin convenience wrapper around a Selenium Firefox driver.

    The last element located by find() is cached in ``self.elem`` so that
    type()/click()/enter()/screenshot() can operate on it afterwards.
    Cookies can be persisted to and restored from ``<name>.pkl``.
    """

    # Maximum number of seconds find() waits for an element.
    max_wait = 10

    def __init__(self, name, headless=False):
        self.name = name          # stem of the cookie pickle file
        self.headless = headless
        self.username = None
        self.start()

    def start(self):
        """Launch the Firefox driver, optionally headless."""
        self.log('starting')
        options = Options()
        if self.headless:
            options.add_argument('--headless')
        self.driver = Firefox(options=options)
        self.elem = None          # most recently located element
        self.log('started')

    def get(self, url):
        """Navigate to *url*."""
        self.driver.get(url)

    def maximize(self):
        """Maximize the browser window."""
        self.driver.maximize_window()
        self.log('maximize')

    def js(self, js):
        """Execute a JavaScript snippet in the page and log its result."""
        out = self.driver.execute_script(js)
        self.log('js', out=out)

    def bottom(self):
        """Scroll to the bottom of the page."""
        self.js('window.scrollTo(0, document.body.scrollHeight);')

    def size(self, width=800, height=600):
        """Resize the browser window."""
        self.driver.set_window_size(width, height)
        self.log(f'width: {width}, height: {height}')

    def user(self):
        """Prompt interactively for the credentials used by type().

        NOTE(review): the original source here was corrupted -- the two
        prompts were redacted and fused onto one line with the following
        method.  user() and save_cookies() reconstruct the apparent intent.
        """
        self.username = input('username: ')
        self.password = input('password: ')

    def save_cookies(self):
        """Persist the driver's cookies to ``<name>.pkl``."""
        cookies = self.driver.get_cookies()
        with open(f'{self.name}.pkl', 'wb') as f:
            pickle.dump(cookies, f)
        self.log('save loaded')

    def load_cookies(self):
        """Restore cookies previously written by save_cookies()."""
        with open(f'{self.name}.pkl', 'rb') as f:
            cookies = pickle.load(f)
            for cookie in cookies:
                self.driver.add_cookie(cookie)
        self.log('cookies loaded')

    def log(self, message, **kwargs):
        """Print a namespaced log line with optional key/value context."""
        print(f'browser: {message}', kwargs)

    def html(self):
        """Log the current page source."""
        html = self.driver.page_source
        self.log(html)

    def done(self):
        """Forget credentials/cached state and close the browser window."""
        self.log('closing')
        self.elem = None
        self.username = None
        self.password = None
        self.driver.close()
        self.log('done')

    def pause(self, seconds):
        """Sleep for *seconds*, logging the wait."""
        self.log('sleep', seconds=seconds)
        time.sleep(seconds)

    def find(self, selector):
        """Wait up to max_wait seconds for a clickable CSS match; cache it."""
        self.log('finding', selector=selector)
        wait = WebDriverWait(self.driver, self.max_wait)
        self.elem = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, selector)))
        self.log('found', elem=self.elem)

    def type(self, value):
        """Send *value* to the cached element, masking the password in logs."""
        self.elem.send_keys(value)
        if value == self.password:
            self.log('type password')
        else:
            self.log(f'type: {value}')

    def click(self):
        """Click the cached element."""
        self.elem.click()
        self.log('click')

    def enter(self):
        """Press Enter in the cached element."""
        self.type(Keys.ENTER)

    def screenshot(self, name, show=False):
        """Save a PNG screenshot of the cached element as ./<name>.png."""
        image = Image.open(BytesIO(self.elem.screenshot_as_png))
        fname = f'./{name}.png'
        image.save(fname)
        self.log(fname)
        if show:
            image.show()
class Scraper:
    """
    Scrapping instance, scrapes all Orders in the given year range and outputs it into FILE_NAME
    """
    def __init__(
            self,
            email: str,
            password: Optional[str],
            headless: bool,
            start: int,
            end: int,
            extensive: bool,
            progress_observer_callback: Callable[[float],
                                                 None] = None) -> None:
        assert email, "no E-Mail provided"
        assert '@' in email and '.' in email, "incorrect email layout"  # Todo replace by regex
        assert start <= end, "start year must be before end year"
        assert end >= 2010, "Amazon order history works only for years after 2009"
        assert end <= datetime.datetime.now(
        ).year, "End year can not be in the future"

        self.logger = logging.getLogger(__name__)
        self.progress_observer_callback: Callable[
            [float], None] = progress_observer_callback

        self.email = email
        self.password = password if password else file_handler.load_password()
        if not self.password:
            self.logger.error(
                colored("Password not given nor pw.txt found", 'red'))
            raise PasswordFileNotFound

        self.start_date: datetime.date = datetime.date(year=start,
                                                       month=1,
                                                       day=1)
        self.end_date: datetime.date = datetime.datetime.now().date() if end == datetime.datetime.now().year \
            else datetime.date(year=end, month=12, day=31)
        self.start_scraping_date: datetime.date = datetime.date(year=start,
                                                                month=1,
                                                                day=1)

        self.headless = headless
        self.extensive = extensive

        self.orders: List[Order] = []
        self.browser: WebDriver

        self._setup_scraping()
        self._get_orders()

        file_handler.save_file(
            FILE_NAME, json.dumps([order.to_dict() for order in self.orders]))
        self.browser.quit()

    def _notify_progress_observers(self, progress: float) -> None:
        if self.progress_observer_callback:
            self.progress_observer_callback(progress)

    def _setup_scraping(self) -> None:
        """
        prepares the WebDriver for scraping the data by:
            - setting up the WebDrive
            - log in the user with the given credentials
            - skipping the adding phone number dialog (should it appear)
        :raise LoginError if not possible to login
         """
        firefox_profile = FirefoxProfile()
        firefox_profile.set_preference("browser.tabs.remote.autostart", False)
        firefox_profile.set_preference("browser.tabs.remote.autostart.1",
                                       False)
        firefox_profile.set_preference("browser.tabs.remote.autostart.2",
                                       False)
        opts = Options()
        opts.headless = self.headless
        if opts.headless:
            self.logger.info(colored("Run in headless mode.", 'blue'))
        self.browser = Firefox(options=opts, firefox_profile=firefox_profile)
        self._navigate_to_orders_page()
        self._complete_sign_in_form()
        if not self._signed_in_successful():
            self.logger.error(
                colored(
                    "Couldn't sign in. Maybe your credentials are incorrect?",
                    'red'))
            print(
                colored(
                    "Couldn't sign in. Maybe your credentials are incorrect?",
                    'red'))
            self.browser.quit()
            raise LoginError
        self._skip_adding_phone_number()

    def _navigate_to_orders_page(self) -> None:
        """
        navigates to the orders page
        """
        self.browser.get(
            'https://www.amazon.de/gp/css/order-history?ref_=nav_orders_first')

    def _complete_sign_in_form(self) -> None:
        """ searches for the sign in form enters the credentials and confirms
            if successful amazon redirects the browser to the previous site """
        try:
            email_input = self.browser.find_element_by_id('ap_email')
            email_input.send_keys(self.email)

            password_input = self.browser.find_element_by_id('ap_password')
            password_input.send_keys(self.password)

            self.browser.find_element_by_name('rememberMe').click()

            sign_in_input = self.browser.find_element_by_id('signInSubmit')
            sign_in_input.click()
        except NoSuchElementException:
            self.logger.error(
                colored(
                    "Error while trying to sign in, couldn't find all needed form elements",
                    'red'))
            print(
                colored(
                    "Error while trying to sign in, couldn't find all needed form elements",
                    'red'))

    def _signed_in_successful(self) -> bool:
        """ simple check if we are still on the login page """
        return bool(
            self.browser.current_url != "https://www.amazon.de/ap/signin")

    def _skip_adding_phone_number(self) -> None:
        """ find and click the 'skip adding phone number' button if found on the current page """
        try:
            skip_adding_phone_link = self.browser.find_element_by_id(
                'ap-account-fixup-phone-skip-link')
            skip_adding_phone_link.click()
            self.logger.info(colored('skipped adding phone number', 'blue'))
        except NoSuchElementException:
            self.logger.info(
                colored('no need to skip adding phone number', 'blue'))

    def _is_custom_date_range(self) -> bool:
        """
        :param start: start date
        :param end: end date
        :return: whether the maximum date range is used or a custom user set range
        """
        return self.start_date.year != 2010 or self.end_date.year != datetime.datetime.now(
        ).year

    def _are_orders_for_year_available(self) -> bool:
        """
        checks if there are any orders in the current selected year
        :return: True if there were orders, False if not
        """
        return bool(
            self.browser.page_source.find('keine Bestellungen aufgegeben') ==
            -1)  # No error!

    def _is_next_page_available(self) -> bool:
        """
        as long as the next page button exists there is a next page
        :return: True if there is a next page, False if not"""
        pagination_element = self.browser.find_element_by_class_name(
            'a-pagination')
        try:
            return 'Weiter' not in pagination_element.find_element_by_class_name(
                'a-disabled').text
        except NoSuchElementException:
            return True

    @staticmethod
    def _is_digital_order(order_id: str) -> bool:
        """
        checks if the order is digital (e.g. Amazon Video or Audio Book)
        :param order_id: the id of the order to check
        :return: True if order is digital, False if not
        """
        return order_id[:3] == 'D01'

    def _is_paging_menu_available(self) -> bool:
        """
        :returns: whether there are multiple pages for the current year by searching for a paging menu
        """
        try:
            return self.browser.find_element_by_class_name(
                'a-pagination') is not None
        except NoSuchElementException:
            return False

    def _get_orders(self) -> None:
        """
        get a list of all orders in the given range (start and end year inclusive)
        to save network capacities it is checked if some orders got already fetched earlier in 'orders.json'

        """
        if self._is_custom_date_range():
            file_handler.remove_file(FILE_NAME)
        else:
            self.orders = file_handler.load_orders(FILE_NAME)

        if self.orders:
            self._scrape_partial()
        else:
            self._scrape_complete()
        self.orders = sorted(self.orders, key=lambda order: order.date)

    def _get_order_info(
            self, order_info_element: WebElement
    ) -> Tuple[str, float, datetime.date]:
        """
        :param order_info_element:
        :returns: the OrderID, price and date
        """
        order_info_list: List[str] = [
            info_field.text for info_field in
            order_info_element.find_elements_by_class_name('value')
        ]

        # value tags have only generic class names so a constant order in form of:
        # [date, price, recipient_address, order_id] or if no recipient_address is available
        # [date, recipient_address, order_id]
        # is assumed
        if len(order_info_list) < 4:
            order_id = order_info_list[2]
        else:
            order_id = order_info_list[3]

        # price is usually formatted as 'EUR x,xx' but special cases as 'Audible Guthaben' are possible as well
        order_price_str = order_info_list[1]
        if order_price_str.find('EUR') != -1:
            order_price = self._price_str_to_float(order_price_str)
        else:
            order_price = 0

        date_str = order_info_list[0]
        date = ut.str_to_date(date_str)
        return order_id, order_price, date

    def _scrape_complete(self) -> None:
        """
        scrapes all the data without checking for duplicates (when some orders already exist)
        """
        self.orders = self._scrape_orders()

    def _scrape_partial(self) -> None:
        """ scrape data until finding duplicates, at which point the scraping can be canceled since the rest
         is already there """
        self.orders = sorted(self.orders, key=lambda order: order.date)
        self.start_scraping_date = self.orders[-1].date

        scraped_orders: List[Order] = self._scrape_orders()

        # check for intersection of fetched orders
        existing_order_ids = list(
            map(lambda order: order.order_id, self.orders))
        new_orders: List[Order] = list(
            filter(lambda order: order.order_id not in existing_order_ids,
                   scraped_orders))
        self.orders.extend(new_orders)

    def _scrape_orders(self) -> List[Order]:
        """
        :returns: a list of all orders in between given start year (inclusive) and end year (inclusive)
        """
        orders: List[Order] = []
        # order filter option 0 and 1 are already contained in option 2 [3months, 6months, currYear, lastYear, ...]
        start_index = 2 + (datetime.datetime.now().year - self.end_date.year)
        end_index = 2 + (datetime.datetime.now().year -
                         self.start_scraping_date.year) + 1

        for order_filter_index in range(start_index, end_index):
            # open the dropdown
            ut.wait_for_element_by_id(self.browser, 'a-autoid-1-announce')
            self.browser.find_element_by_id('a-autoid-1-announce').click()

            # select and click on a order filter
            id_order_filter = f'orderFilter_{order_filter_index}'
            ut.wait_for_element_by_id(self.browser, id_order_filter)
            dropdown_element = self.browser.find_element_by_id(id_order_filter)
            dropdown_element.click()

            pages_remaining = self._are_orders_for_year_available()
            while pages_remaining:

                orders_on_page: List[Order] = self._scrape_page_for_orders()
                orders.extend(orders_on_page)

                current_date: datetime.date = orders_on_page[-1].date

                if orders_on_page and self.start_scraping_date > current_date:
                    break
                if self._is_paging_menu_available():
                    pagination_element = self.browser.find_element_by_class_name(
                        'a-pagination')
                else:
                    break

                pages_remaining = self._is_next_page_available()
                if pages_remaining:
                    next_page_link = pagination_element.find_element_by_class_name('a-last') \
                        .find_element_by_css_selector('a').get_attribute('href')
                    self.browser.get(next_page_link)

        return orders

    def _scrape_page_for_orders(self) -> List[Order]:
        """Scrape every order on the currently open order-overview page.

        Side effects: notifies progress observers once per scraped order.

        :returns: a list of all orders found on the currently open page
        """
        orders = []
        for order_element in self.browser.find_elements_by_class_name('order'):

            # The order summary box can render asynchronously; wait up to 3s.
            ut.wait_for_element_by_class_name(order_element,
                                              'order-info',
                                              timeout=3)
            order_info_element = order_element.find_element_by_class_name(
                'order-info')
            order_id, order_price, date = self._get_order_info(
                order_info_element)

            items = []
            # Inside an order there is one 'a-box' for the order info and one
            # 'a-box' per seller with the detailed item info — the [1:] slice
            # skips the info box.
            for items_by_seller in order_element.find_elements_by_class_name(
                    'a-box')[1:]:

                # One 'a-fixed-left-grid' per item sold by this seller.
                for index, item_element in enumerate(
                        items_by_seller.find_elements_by_class_name(
                            'a-fixed-left-grid')):
                    seller = self._get_item_seller(item_element)
                    title, link = self._get_item_title(item_element)
                    # Digital orders carry no per-item price; fall back to the
                    # order total.
                    item_price = order_price if self._is_digital_order(order_id) else \
                        self._get_item_price(item_element, index, order_element)
                    # Category lookup opens the item page in a new tab, so it
                    # only runs in extensive mode.
                    categories = self._get_item_categories(
                        link) if self.extensive else dict()

                    items.append(
                        Item(item_price, link, title, seller, categories))

            orders.append(Order(order_id, order_price, date, items))

            # Report progress based on the date of the most recent order.
            current_date: datetime.date = orders[-1].date
            progress: float = self._get_progress(current_date=current_date)
            self._notify_progress_observers(progress)

        return orders

    @staticmethod
    def _get_item_seller(item_element: WebElement) -> str:
        """
        Extract the seller name from an item div.

        :param item_element: the item div
        :return: the seller name, or 'not available' when none is shown
        """
        fragments = item_element.text.split('durch: ')
        if len(fragments) < 2:
            # No 'durch: ' marker in the text -> no seller displayed.
            return 'not available'
        # The seller name ends at the first line break after the marker.
        return fragments[1].split('\n', 1)[0]

    @staticmethod
    def _get_item_title(item_element: WebElement) -> Tuple[str, str]:
        """
        Extract the title and product link of an item.

        :param item_element: the item div
        :return: a (title, link) tuple; link is 'not available' when the
            title carries no hyperlink
        """
        right_column = item_element.find_element_by_class_name('a-col-right')
        title_row = right_column.find_elements_by_class_name('a-row')[0]
        title = title_row.text
        try:
            anchor = title_row.find_element_by_class_name('a-link-normal')
            link = anchor.get_attribute('href')
        except NoSuchElementException:
            # Some items (e.g. no longer listed) have no product link.
            link = 'not available'

        return title, link

    def _get_item_price(self, item_element: WebElement, item_index: int,
                        order_element: WebElement) -> float:
        """
        Determine the price of an item.

        Prefers the price shown inline in the item div; falls back to the
        order details page when it is missing or unparsable.

        :param item_element: the item div
        :param item_index: the index of the item in the order
        :param order_element: the order div
        :return: the price of the item
        """
        try:
            price_text = item_element.find_element_by_class_name(
                'a-color-price').text
            return self._price_str_to_float(price_text)
        except (NoSuchElementException, ValueError):
            # Inline price absent or malformed -> consult the details page.
            return self._get_item_price_through_details_page(
                order_element, item_index)

    def _get_item_price_through_details_page(self, order_element: WebElement,
                                             item_index: int) -> float:
        """
        Open the order details page in a new tab and read the item price there.

        :param order_element: the order div
        :param item_index: the index of the item in the order
        :returns: the item price found on the order details page, or 0 when it
            could not be determined
        """
        item_price: float = 0
        # Track whether the extra tab was actually opened so the cleanup in
        # `finally` never closes the MAIN window: previously, if finding
        # 'a-link-normal' raised before window.open ran, the finally block
        # closed the current (main) window.
        details_tab_opened = False

        try:
            order_details_link = order_element.find_element_by_class_name(
                'a-link-normal').get_attribute('href')

            self.browser.execute_script(
                f'''window.open("{order_details_link}","_blank");''')
            details_tab_opened = True
            self.browser.switch_to.window(self.browser.window_handles[1])
            if not ut.wait_for_element_by_class_name(self.browser,
                                                     'od-shipments'):
                return item_price

            od_shipments_element = self.browser.find_element_by_class_name(
                'od-shipments')
            price_fields: List[
                WebElement] = od_shipments_element.find_elements_by_class_name(
                    'a-color-price')
            item_price = self._price_str_to_float(
                price_fields[item_index].text)

        except (NoSuchElementException, ValueError):
            item_price = 0
            self.logger.warning(
                colored(
                    f'Could not parse price for order:\n{order_element.text}',
                    'yellow'))

        finally:
            # Only tear down the tab we opened ourselves.
            if details_tab_opened:
                self.browser.close()
                self.browser.switch_to.window(self.browser.window_handles[0])
        return item_price

    def _get_item_categories(self, item_link: str) -> Dict[int, str]:
        """
        Open the item page in a new tab and scrape its categories.

        :param item_link: the link to the item itself
        :returns: a dict with the categories and the importance (depth) as key;
            empty when no category source is found
        """
        categories: Dict[int, str] = dict()

        self.browser.execute_script(
            f'''window.open("{item_link}","_blank");''')
        self.browser.switch_to.window(self.browser.window_handles[1])

        # try/finally guarantees the extra tab is closed and focus returns to
        # the main window even if a scraping helper raises; the previous code
        # leaked the tab in that case and repeated the close/switch
        # boilerplate in every branch.
        try:
            if ut.wait_for_element_by_id(self.browser,
                                         'wayfinding-breadcrumbs_container'):
                # Regular product page: categories from the breadcrumb trail.
                categories = self._get_item_categories_from_normal()
            elif ut.wait_for_element_by_class_name(self.browser,
                                                   'dv-dp-node-meta-info'):
                # Prime Video page: genres serve as categories.
                categories = self._get_item_categories_from_video()
        finally:
            self.browser.close()
            self.browser.switch_to.window(self.browser.window_handles[0])

        return categories

    def _get_item_categories_from_normal(self) -> Dict[int, str]:
        """
        Read the breadcrumb trail of a regular product page.

        :return: the categories for a normal ordered item, keyed by depth
            (1 = top-level category)
        """
        breadcrumb_container = self.browser.find_element_by_id(
            'wayfinding-breadcrumbs_container')
        entries = breadcrumb_container.find_elements_by_class_name(
            "a-list-item")
        # Every odd-indexed entry is just a separator, so keep the
        # even-indexed ones; depth counting starts at 1.
        return {depth: entry.text
                for depth, entry in enumerate(entries[::2], start=1)}

    def _get_item_categories_from_video(self) -> Dict[int, str]:
        """
        :return: the genres of a movie, used as its categories
        """
        meta_text: str = self.browser.find_element_by_class_name(
            'dv-dp-node-meta-info').text
        # The first line holds the genres, e.g. 'Genre <g1>, <g2>, ...'.
        first_line = meta_text.split("\n")[0]
        genre_names: List[str] = first_line.split(", ")
        # Strip the leading label word from the first entry.
        genre_names[0] = genre_names[0].split(" ")[1]

        categories = {rank: name for rank, name in enumerate(genre_names)}
        # Append a synthetic 'movie' marker after the real genres.
        categories[len(genre_names)] = 'movie'
        return categories

    @staticmethod
    def _price_str_to_float(price_str: str) -> float:
        """
        Convert a scraped price string to a float.

        The scraped value starts with a 4-character currency prefix
        (e.g. 'EUR ') followed by a German-formatted number: ',' as the
        decimal separator and '.' as an optional thousands separator
        (e.g. 'EUR 1.234,56').

        :param price_str: the price in string format as it is scraped
        :return: the price as float
        :raises ValueError: if the remainder is not a parsable number
        """
        # Strip thousands separators before turning the decimal comma into a
        # dot — previously any price >= 1.000 EUR raised ValueError because
        # the thousands '.' survived ('1.234,56' -> '1.234.56').
        return float(price_str[4:].replace('.', '').replace(',', '.'))

    def _get_progress(self, current_date: datetime.date) -> float:
        """
        Estimate the scraping progress, approximating every month as 31 days.

        :param current_date: date of the most recently scraped order
        :returns: the progress as a fraction in [0, 1]
        """
        def days_until_end(earlier: datetime.date) -> int:
            # Coarse day distance to self.end_date (months counted as 31 days).
            return (self.end_date.day - earlier.day
                    + (self.end_date.month - earlier.month) * 31
                    + (self.end_date.year - earlier.year) * 12 * 31)

        total_days = days_until_end(self.start_scraping_date)
        if total_days <= 0:
            return 1.0
        scraped_days = days_until_end(current_date)
        return min(scraped_days / total_days, 1.0)
Beispiel #25
0
class BrowserEngine:
    """Convenience wrapper around a (optionally proxied, headless) Firefox.

    NOTE(review): ``options`` and ``profile`` are *class-level* attributes and
    therefore shared by every instance; ``__init__`` mutating
    ``self.options.headless`` also affects engines created afterwards. Kept
    unchanged for backward compatibility with code reading
    ``BrowserEngine.options`` / ``BrowserEngine.profile``.
    """

    options = Options()
    profile = FirefoxProfile()  # Set preferences at the class level
    profile.set_preference("permissions.default.image",
                           2)  # Supposed to help with memory issues
    profile.set_preference("dom.ipc.plugins.enabled.libflashplayer.so", False)
    profile.set_preference("browser.cache.disk.enable", False)
    profile.set_preference("browser.cache.memory.enable", False)
    profile.set_preference("browser.cache.offline.enable", False)
    profile.set_preference("network.http.use-cache", False)
    profile.accept_untrusted_certs = True

    def __init__(self, wait=5, proxy=None, headless=True):
        """
        :param wait: explicit-wait timeout in seconds
        :param proxy: optional 'host:port' string routed for http/ftp/ssl
        :param headless: run Firefox without a visible window
        """
        # NOTE(review): this assignment shadows the `proxy` *method* on the
        # instance with the resulting capabilities dict (or None).
        self.proxy = None if not proxy else self.proxy(proxy)
        self.options.headless = headless
        self.driver = Firefox(options=self.options,
                              firefox_profile=self.profile,
                              desired_capabilities=self.proxy)
        self.driver.set_window_position(
            0, 0)  # TODO: Not sure if these help or not with optimization
        self.driver.set_window_size(1024, 768)
        self.wait = WebDriverWait(self.driver, wait)

    def proxy(self, proxy):
        """Build Firefox desired capabilities routing traffic through *proxy*.

        :param proxy: 'host:port' used for http/ftp/ssl traffic
        :return: a capabilities dict with the proxy settings applied
        """
        proxy = Proxy({
            "proxyType": ProxyType.MANUAL,
            "httpProxy": proxy,
            "ftpProxy": proxy,
            "sslProxy": proxy,
            "noProxy": ""
        })
        # Copy before mutating: add_to_capabilities would otherwise modify the
        # module-global DesiredCapabilities.FIREFOX dict and leak the proxy
        # settings into every other Firefox session in this process.
        capabilities = DesiredCapabilities.FIREFOX.copy()
        proxy.add_to_capabilities(capabilities)
        return capabilities

    def quit(self):
        """Terminate the driver process entirely."""
        self.driver.quit()

    def close(self):
        """Close the current browser window only."""
        self.driver.close()

    def refresh(self):
        """Reload the current page."""
        self.driver.refresh()

    def back(self):
        """Navigate one step back in the browser history."""
        self.driver.execute_script("window.history.go(-1)")

    def clear_cookies(self):
        """Delete all cookies of the current session."""
        self.driver.delete_all_cookies()

    def get(self, url):
        """Navigate to *url*."""
        self.driver.get(url)

    def find_element(self, type_, value):
        """Wait for the element located by (By.<type_>, value).

        :return: the element, or False if it did not appear within the wait
        """
        try:
            return self.wait.until(
                lambda driver: driver.find_element(getattr(By, type_), value))
        except TimeoutException:
            return False

    def populate_element(self, element, value):
        """Type *value* into *element*."""
        element.send_keys(value)

    def is_clickable(self, type_, value):
        """Wait until the element located by (By.<type_>, value) is clickable."""
        return self.wait.until(
            EC.element_to_be_clickable((getattr(By, type_), value)))

    def click(self, button):
        """Click the given element."""
        button.click()

    def select_dropdown(self, element, value):
        """Select the option with the given value in a <select> element."""
        select = Select(element)
        select.select_by_value(value)

    def submit(self, form):
        """Submit the given form element."""
        form.submit()

    def execute_script(self, code):
        """Run arbitrary JavaScript in the page context."""
        self.driver.execute_script(code)

    def screenshot(self, filename):
        """Save a screenshot of the current page to *filename*."""
        self.driver.get_screenshot_as_file(filename)
Beispiel #26
0
def webtest():
    """Smoke-test the Flask app at localhost:5000 with a headless Firefox."""
    options = Options()  # get firefox webdriver options
    options.add_argument('-headless')  # run tests in headless mode
    firefox = Firefox(firefox_options=options)  # initialize firefox web driver
    try:
        firefox.get('http://localhost:5000')  # test against flask app
    finally:
        # Close the window even when the request fails, so the driver
        # process is not leaked on errors.
        firefox.close()
Beispiel #27
0
class DriverProperty(object):
    """
    Thin wrapper around a selenium webdriver.
    """
    def __init__(self, base_url=None, headless=False, browser_name='chrome'):
        """
        :param str base_url: URL opened right after the browser starts
        :param bool headless: run the browser without a visible window
        :param str browser_name: one of chrome / ie / safari / edge / firefox
        """
        self.driver = None
        self.base_url = base_url
        # Chrome options; only Chrome consumes them, other browsers start bare.
        self.options = Options()
        self.options.add_argument("--ignore-certificate-errors")
        self.options.add_argument("--allow-running-insecure-content")
        self.options.add_argument("--disable-web-security")
        if headless:
            self.options.add_argument("--headless")
            self.options.add_argument("--disable-gpu")
            self.options.add_argument("--disable-desktop-notifications")
            self.options.add_argument("--disable-extensions")
        self._open_browser(browser_name=browser_name.lower())

    def set_driver(self, driver=None):
        """
        Adopt an existing driver (used to hand a session between wrappers).
        :param selenium.webdriver driver: the driver to adopt
        :return: DriverProperty self
        """
        self.driver = driver
        return self

    def _open_browser(self, browser_name):
        """
        Start the requested browser and open ``base_url``.
        :return: webdriver
        :raises Exception: for an unknown browser name
        """
        if browser_name == 'chrome':
            if self.options is not None:
                self.driver = Chrome(chrome_options=self.options)
            else:
                self.driver = Chrome()
        elif browser_name == 'ie':
            self.driver = Ie()
        elif browser_name == 'safari':
            self.driver = Safari()
        elif browser_name == 'edge':
            self.driver = Edge()
        elif browser_name == 'firefox':
            self.driver = Firefox()
        else:
            raise Exception('Failed input browser name')
        self.driver.get(self.base_url)
        return self.driver

    def visit(self, url):
        """
        Navigate to *url*; quits the driver when navigation fails.
        :param str url:
        """
        if url is None:
            raise Exception('input url.')
        try:
            self.driver.get(url)
        except WebDriverException:
            print('No such a url')
            self.driver.quit()

    def current_url(self):
        """
        :return: string -- URL of the currently loaded page
        """
        # `current_url` is a *property* on selenium drivers; the previous
        # code invoked it (`self.driver.current_url()`), raising TypeError.
        return self.driver.current_url

    def close(self):
        """
        Close the current window only; the driver itself keeps running.
        """
        self.driver.close()

    def refresh(self):
        """
        Reload the current URL.
        """
        self.driver.refresh()

    def authentication(self, user_name, pass_word):
        """
        Answer a basic-auth prompt.
        :param str user_name: user name
        :param str pass_word: password
        """
        self.driver.switch_to.alert.authenticate(user_name, pass_word)

    def accept(self):
        """
        Accept an alert dialog if present; do nothing otherwise.
        """
        try:
            Alert(self.driver).accept()
        except NoAlertPresentException:
            pass
Beispiel #28
0
class BrowserHandler:
    """Owns a configured Firefox instance, optionally behind a virtual display."""

    def __init__(self, args):
        self.args = args
        # Without --show-browser, render into a virtual framebuffer.
        if self.args and not self.args.show_browser:
            self.display = Xvfb()
            self.display.start()

        level = self._define_log_level(self.args)
        caps = self._create_browser_capabilities(level)
        opts = self._create_browser_options(level)
        prof = self._create_browser_profile()

        self.browser = Firefox(
            firefox_profile=prof,
            capabilities=caps,
            firefox_options=opts,
            log_path="{timestamp}_geckodriver.log".format(timestamp=TIMESTAMP))
        # https://stackoverflow.com/questions/42754877/cant-upload-file-using-selenium-with-python-post-post-session-b90ee4c1-ef51-4  # pylint: disable=line-too-long
        self.browser._is_remote = False  # pylint: disable=protected-access
        self.browser.maximize_window()

    @staticmethod
    def _define_log_level(args):
        """Map the -v/--verbose count onto a geckodriver log level."""
        verbosity = args.verbose if args and args.verbose else 0
        if verbosity >= 3:
            return 'trace'
        return {2: 'debug', 1: 'info'}.get(verbosity, 'warn')

    @staticmethod
    def _create_browser_capabilities(level):
        """Return Firefox capabilities carrying the moz log level."""
        caps = DesiredCapabilities.FIREFOX.copy()
        caps["moz:firefoxOptions"] = {"log": {"level": level}}
        return caps

    @staticmethod
    def _create_browser_options(level):
        """Return Firefox options with the log level applied."""
        opts = Options()
        opts.log.level = level
        return opts

    @staticmethod
    def _create_browser_profile():
        """Build a profile tuned for silent, prompt-free CSV/ZIP downloads."""
        preferences = {
            "browser.download.folderList": 2,
            "browser.download.manager.showWhenStarting": False,
            "browser.download.dir": EXPORTS_FOLDER,
            "browser.helperApps.neverAsk.saveToDisk":
                "text/csv, application/zip",
            "browser.helperApps.alwaysAsk.force": False,
            "devtools.jsonview.enabled": False,
            "media.volume_scale": "0.0",
            # https://github.com/mozilla/geckodriver/issues/858#issuecomment-322512336
            "dom.file.createInChild": True,
        }
        profile = FirefoxProfile()
        for key, value in preferences.items():
            profile.set_preference(key, value)
        return profile

    def kill(self):
        """Shut the browser down, tolerating an already-dead driver."""
        self.browser.stop_client()
        self.browser.close()
        try:
            self.browser.quit()
        except WebDriverException:
            pass

        if self.args and not self.args.show_browser:
            self.display.stop()
Beispiel #29
0
def test_selenium_login():
    """Open the app at localhost:8080 in a headless Firefox to verify it serves."""
    options = Options()
    options.add_argument('-headless')
    firefox = Firefox(executable_path="./geckodriver", options=options)
    try:
        firefox.get("http://localhost:8080/app.py/")
    finally:
        # Guarantee the browser window is closed even when the GET fails,
        # so no driver process leaks from a failing test run.
        firefox.close()
Beispiel #30
0
class SasCrawler(CrawlerInterface):
    """Drives the classic SAS flight search form for a hard-coded ARN->LHR trip."""

    def __init__(self):
        """Set up logging and a headless Firefox, then run the whole search.

        NOTE(review): the constructor has heavy side effects — it reconfigures
        the root logger via basicConfig and immediately runs the full crawl.
        """
        self.logger = logging.getLogger(str(self.__class__))
        self.url = 'https://classic.flysas.com/'
        self.flight = {
            'from': 'ARN',
            'to': 'LHR',
            'date_from': '2018-11-05',
            'date_to': '2018-11-11',
        }
        logname = '{0}_{1}-{2}.log'.format(
            datetime.strftime(datetime.now(), '%Y-%m-%d_%H%M%S'),
            self.flight['from'], self.flight['to'])
        # set up logging to file
        logging.basicConfig(level=logging.DEBUG,
                            format='%(asctime)s %(levelname)-8s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S',
                            filename='log_files/' + logname,
                            filemode='w')

        opts = Options()
        opts.headless = True
        self.browser = Firefox(options=opts)

        self.run()

    def run(self):
        """Execute the full search flow and close the browser."""
        self.load_search_form()
        self.fill_search_form()
        self.submit_search_form()
        self.browser.close()

    def load_search_form(self):
        """Open the start page and pick the market/language entry."""
        self.browser.get(self.url)
        # select language
        self.browser.find_element_by_xpath(
            '//*[@id="lstMarkets"]/tbody/tr[8]/td[3]').click()

    def fill_search_form(self):
        """Fill origin, destination and both travel dates."""
        # flight from
        elem = self.browser.find_element_by_name(
            'ctl00$FullRegion$MainRegion$ContentRegion$ContentFullRegion$ContentLeftRegion$CEPGroup1$CEPActive$cepNDPRevBookingArea$predictiveSearch$txtFrom'
        )
        elem.send_keys(self.flight['from'])
        self.browser.implicitly_wait(1.5)
        self.browser.find_element_by_css_selector(
            '#resultFrom .selected').click()

        # flight to
        elem = self.browser.find_element_by_name(
            'ctl00$FullRegion$MainRegion$ContentRegion$ContentFullRegion$ContentLeftRegion$CEPGroup1$CEPActive$cepNDPRevBookingArea$predictiveSearch$txtTo'
        )
        elem.send_keys(self.flight['to'])
        self.browser.implicitly_wait(1.5)
        self.browser.find_element_by_css_selector(
            '#resultTo .selected').click()

        # from (field)
        elem = self.browser.find_element_by_class_name('flOutDate')
        elem.click()

        self.find_outward_date()

        # return (field)
        elem = self.browser.find_element_by_class_name('flInDate')
        elem.click()

        self.find_return_date()

    def find_outward_date(self):
        """Select the outward date in the currently open datepicker."""
        self._select_date_in_datepicker(
            datetime.strptime(self.flight['date_from'], '%Y-%m-%d'))

    def find_return_date(self):
        """Select the return date in the currently open datepicker."""
        self._select_date_in_datepicker(
            datetime.strptime(self.flight['date_to'], '%Y-%m-%d'))

    def _select_date_in_datepicker(self, target_date):
        """Advance the open datepicker to *target_date*'s month, click its day.

        Shared helper: find_outward_date/find_return_date previously
        duplicated this logic verbatim.

        :param target_date: datetime whose month/day should be selected
        """
        datetime_object = self.get_datepicker_date()
        while datetime_object.month != target_date.month:
            self.browser.find_element_by_class_name(
                'ui-datepicker-month-link').click()
            datetime_object = self.get_datepicker_date()
        for el in self.browser.find_elements_by_css_selector(
                '.ui-datepicker-calendar td'):
            if el.text.strip() == str(target_date.day):
                el.click()
                break

    def submit_search_form(self):
        """Click the search button."""
        # submit form
        elem = self.browser.find_element_by_id(
            'ctl00_FullRegion_MainRegion_ContentRegion_ContentFullRegion_ContentLeftRegion_CEPGroup1_CEPActive_cepNDPRevBookingArea_Searchbtn_ButtonLink'
        )
        elem.click()

    def get_datepicker_date(self):
        """Read the date currently highlighted in the datepicker widget."""
        cur_day = int(
            self.browser.find_element_by_class_name('ui-state-active').text)
        cur_month = self.browser.find_element_by_class_name(
            'ui-datepicker-month').text
        cur_year = int(
            self.browser.find_element_by_class_name('ui-datepicker-year').text)
        return datetime.strptime(
            '{year} {month} {day}'.format(year=cur_year,
                                          month=cur_month,
                                          day=cur_day), '%Y %B %d')

    def get_data(self):
        """Data extraction is not implemented yet."""
        pass
class RegistrationWebTest(TestCase):
    """
    Test all facets of the registration process
    """

    @classmethod
    def clear_database(cls):
        """
        Remove both well-known test users from the database (used before and
        after the whole test class runs)
        """
        collection = cls.mongo.collection
        for user in ['UnittestExistingTestUser', 'UnittestNonExistingTestUser']:
            test_user = collection.find_one({
                'username': user,
            })
            if test_user:
                collection.remove(test_user)

    @classmethod
    def setUpClass(cls):
        """
        Setup test data, browser and server
        """
        cls.mongo = UserDatabaseConnectivity()
        cls.clear_database()
        # Pre-create a disabled user so the "username taken" path can be
        # exercised by test_enter_existing_username.
        test_user = {
            'username': '******',
            'salt': '000',
            'password': '******',
            'enabled': False,
        }
        cls.mongo.collection.save(test_user)
        cls.config = dict()
        prepare_test(cls)
        cls.base_url = 'http://{:s}:{:d}/static/index.xhtml'.format(cls.config['bind_ip'], cls.config['bind_port'])

    @classmethod
    def tearDownClass(cls):
        """
        Disconnect from mongo and cleanup browser, server, etc.
        """
        cls.clear_database()
        del cls.mongo
        cleanup(cls)

    def setUp(self):
        """
        Start a fresh browser for every test
        """
        self.webdriver = Firefox()
        self.webdriver.implicitly_wait(10)

    def tearDown(self):
        """
        Close the browser and remove the registered test user, if any
        """
        self.webdriver.close()
        self.webdriver.quit()
        collection = self.mongo.collection
        test_user = collection.find_one({
            'username': '******',
        })
        if test_user:
            collection.remove(test_user)

    def __util_get_reg_button(self):
        """
        Get the registration form button
        """
        self.webdriver.get(self.base_url)
        self.webdriver.implicitly_wait(10)
        # Crude wait for the client-side JS to finish rendering the page.
        sleep(3)
        button = self.webdriver.find_element_by_xpath('//xhtml:button[@data-formaction="registrationForm"]')
        return button

    def __util_open_dialog(self):
        """
        Open the registration dialog
        """
        button = self.__util_get_reg_button()
        button.click()
        self.webdriver.implicitly_wait(10)
        # Give the modal's open animation time to finish.
        sleep(5)

    def test_find_button(self):
        """
        Is the button there?
        """
        self.assertIsNotNone(self.__util_get_reg_button())

    def test_open_dialog(self):
        """
        Can we open the dialog?
        """
        dialog_xpath = '//xhtml:div[contains(@class, "bootstrap-dialog")]'
        # Test that there is no dialog open at the moment
        self.assertRaises(NoSuchElementException, self.webdriver.find_element_by_xpath, dialog_xpath)
        self.__util_open_dialog()
        dialog = self.webdriver.find_element_by_xpath(dialog_xpath)
        self.assertIsNotNone(dialog)

    def __util_get_form_and_username_field(self, reopen=True):
        """
        Find the registration form and its username field; optionally opens
        the dialog first
        """
        if reopen:
            self.__util_open_dialog()
        form = self.webdriver.find_element_by_xpath(
            '//xhtml:div[contains(@class, "bootstrap-dialog") and contains(@class, "modal") and @id]'
            '//xhtml:div[@class="bootstrap-dialog-body"]'
            '//xhtml:form[@id="formlib_registration"]'
        )
        username_field = form.find_element_by_name('username')
        return form, username_field

    def test_enter_existing_username(self):
        """
        Test with an already existing username
        """
        form, username_field = self.__util_get_form_and_username_field()
        username_field.click()
        username_field.send_keys('UnittestExistingTestUser')
        username_field.send_keys(Keys.ENTER)
        self.webdriver.implicitly_wait(5)
        error_msg = form.find_element_by_xpath(
            '//xhtml:div[@data-fieldref="formlib_registration_username" and @role="alert"]'
        )
        self.assertTrue(error_msg.text.endswith('username_not_available'))

    @staticmethod
    def util_get_password_fields(form):
        """
        Find the two password fields in the form
        """
        # The fields appear only after the username was accepted.
        sleep(3)
        pwd_field_1 = form.find_element_by_name('password1')
        pwd_field_2 = form.find_element_by_name('password2')
        return pwd_field_1, pwd_field_2

    def __util_enter_non_existing_username(self, username_field):
        """
        Enter a username that works
        """
        if username_field.is_enabled():
            username_field.click()
            username_field.send_keys('UnittestNonExistingTestUser')
            username_field.send_keys(Keys.ENTER)
            self.webdriver.implicitly_wait(5)
            sleep(3)

    def __util_test_single_pwd_error_message(self, form):
        """
        Check that only the password1 error message is shown (and none for
        password2)
        """
        self.webdriver.implicitly_wait(5)
        sleep(1)
        error_message = form.find_element_by_xpath(
            '//xhtml:div[@data-fieldref="formlib_registration_password1" and @role="alert"]'
        )
        self.assertEqual('Password invalid', error_message.text)
        self.assertRaises(
            NoSuchElementException,
            form.find_element_by_xpath,
            '//xhtml:div[@data-fieldref="formlib_registration_password2" and @role="alert"]'
        )

    def test_non_exist_uname_pwd1_too_short(self):
        """
        Test with a too short password in password 1
        """
        form, username_field = self.__util_get_form_and_username_field()
        self.__util_enter_non_existing_username(username_field)
        pwd1, dummy = RegistrationWebTest.util_get_password_fields(form)
        pwd1.click()
        pwd1.send_keys('123')
        pwd1.send_keys(Keys.ENTER)
        self.__util_test_single_pwd_error_message(form)

    def test_non_exist_uname_pwd1_pwd2_too_short_but_eq(self):
        """
        Test with two passwords equal, but too short
        """
        form, username_field = self.__util_get_form_and_username_field()
        self.__util_enter_non_existing_username(username_field)
        pwd1, pwd2 = RegistrationWebTest.util_get_password_fields(form)
        for pwd in [pwd1, pwd2]:
            pwd.click()
            pwd.send_keys('123')
        pwd2.send_keys(Keys.ENTER)
        self.__util_test_single_pwd_error_message(form)

    def test_non_exist_uname_w_val_pwd1_a_inval_repeat(self):
        """
        Test with long enough passwords, but not equal
        """
        form, username_field = self.__util_get_form_and_username_field()
        self.__util_enter_non_existing_username(username_field)
        pwd1, pwd2 = RegistrationWebTest.util_get_password_fields(form)
        pwd1.click()
        pwd1.send_keys('test1234')
        pwd1.send_keys(Keys.ENTER)
        pwd2.send_keys('test1235')
        pwd2.send_keys(Keys.ENTER)
        # password1 itself is valid -> no alert for it ...
        self.assertRaises(
            NoSuchElementException,
            form.find_element_by_xpath,
            '//xhtml:div[@data-fieldref="formlib_registration_password1" and @role="alert"]'
        )
        # ... but the mismatching repeat must be flagged.
        error_message = form.find_element_by_xpath(
            '//xhtml:div[@data-fieldref="formlib_registration_password2" and @role="alert"]'
        )
        self.assertEqual('Passwords do not match', error_message.text)

    def test_full_registration_flow(self):
        """
        Test the registration flow completely
        """
        form, username_field = self.__util_get_form_and_username_field()
        self.__util_enter_non_existing_username(username_field)
        pwd1, pwd2 = RegistrationWebTest.util_get_password_fields(form)
        for pwd in [pwd1, pwd2]:
            pwd.click()
            pwd.send_keys('test1234')
            pwd.send_keys(Keys.ENTER)
        self.webdriver.implicitly_wait(5)
        # Neither password field may carry an alert after a valid submission.
        for field in ['password1', 'password2']:
            self.assertRaises(
                NoSuchElementException,
                form.find_element_by_xpath,
                '//xhtml:div[@data-fieldref="formlib_registration_{:s}" and @role="alert"]'.format(field)
            )
        success_message = form.find_element_by_xpath(
            '//xhtml:div[contains(@class, "alert") and contains(@class, "alert-success") and @role="alert"]'
        )
        self.assertEqual('Registration successful', success_message.text)
        # The confirmation button must be present as well.
        form.find_element_by_xpath(
            '//xhtml:button[contains(@class, "btn") and contains(@class, "btn-default")]'
        )
Beispiel #32
0
class BasicSpider:
    """Recursive real-estate listing scraper driven by a Firefox webdriver.

    Collects district (level-0) and sub-district (level-1) category URLs
    from the start page, then follows each category's pagination and
    appends one tab-separated row per listing to the output file.
    """

    total = 0  # running count of scraped rows
    name = 'basic'
    ip_url = "https://icanhazip.com"

    # start_url = 'http://sh.lianjia.com/ershoufang/'

    def __init__(self, start_url, filename):
        """Open the browser and output file, then collect category URLs.

        :param start_url: entry page that lists the district links
        :param filename: path of the tab-separated output file
        """
        self.driver = Firefox()
        self.start_url = start_url
        # newline="" is the documented way to open files for the csv module
        # (prevents spurious blank rows on Windows).
        self.file = open(filename, "w", newline="")
        self.csvWriter = csv.writer(self.file, delimiter='\t')

        # URL bookkeeping; the *_retrived sets hold already-visited pages.
        self.lvl1_urls = set()
        self.lvl1_urls_retrived = set()
        self.lvl0_urls = set()
        self.lvl0_urls_retrived = set()
        self.lvl0_urls_retrived.add(self.start_url)
        self._get_start_urls(self.start_url)

    def scrapingAll(self):
        """Scrape every collected sub-district category."""
        for url in self.lvl1_urls:
            self.parseCategroy(url)

    def parseCategroy(self, url=None):
        """Scrape one category page, then recurse into its "next" page.

        :param url: page to scrape; defaults to ``start_url``
        """
        if not url:
            url = self.start_url
        self._sleep()
        self.driver.get(url)
        # Wait for the pagination widget so the listing is fully rendered.
        WebDriverWait(self.driver, 50).until(
            EC.presence_of_element_located((By.CLASS_NAME, "c-pagination")))

        for item in self.getItems():
            item.append(url)
            self.csvWriter.writerow(item)
        self.file.flush()
        # Renamed from "next" to avoid shadowing the builtin.
        next_links = self.driver.find_elements_by_xpath(
            '//*[@class="c-pagination"]/'
            'a[@gahref="results_next_page"]')
        if len(next_links) == 1:
            nexturl = urljoin(url, next_links[0].get_attribute("href"))
            print(nexturl)
            self.parseCategroy(nexturl)
        else:
            print("in parseCategroy: %d next find, url -> %s" %
                  (len(next_links), url))

    def getItems(self):
        """Parse the current page into rows of [title, url, price, address]."""
        # Explicit parser avoids bs4's "no parser specified" warning and
        # keeps behaviour identical across environments.
        bs = BeautifulSoup(self.driver.page_source, "html.parser")
        rows = []
        for li in bs.find("ul", {"class": "js_fang_list"}).findAll("li"):
            prop_title = li.find("div", {"class": "prop-title"}).find("a")
            title = prop_title.text.replace(" ", "")
            url = urljoin(self.start_url, prop_title["href"])

            infos = li.findAll("div", {"class": "info-row"})
            price = infos[0].find("div", {
                "class": "info-col price-item main"
            }).text.replace("\n", "")

            address = ":".join([tag.text for tag in infos[1].findAll("a")])
            rows.append([title, url, price, address])
        self.total += len(rows)
        print("total current: ", self.total)
        return rows

    def changeIP(self):
        """Ask the local Tor control port for a fresh exit circuit."""
        with Controller.from_port(port=9051) as controller:
            controller.authenticate()
            controller.signal(Signal.NEWNYM)

    def refreshDriver(self):
        """Recreate the browser behind a fresh Tor circuit.

        Requires Tor plus an HTTP proxy (e.g. Privoxy) on localhost:8118.
        """
        if hasattr(self, "driver"):
            self.driver.close()
        self.changeIP()

        proxy_address = "localhost:8118"
        # The original also assigned Proxy().socksProxy here, but that
        # object was immediately discarded; the dict-configured Proxy below
        # is the one actually used.
        profile = FirefoxProfile()
        proxy = Proxy({
            'proxyType': ProxyType.MANUAL,
            'httpProxy': proxy_address,
            'httpsProxy': proxy_address,
            'ftpProxy': proxy_address,
            'sslProxy': proxy_address,
            'noProxy': ""
        })
        profile.set_proxy(proxy)
        self.driver = Firefox(firefox_profile=profile)

    def _get_start_urls(self, url):
        """Recursively visit every district page, harvesting URLs."""
        self._sleep()
        self.driver.get(url)
        self._get_level0_urls()

        if len(self.lvl0_urls):
            new_url = self.lvl0_urls.pop()
            print("new url: %s" % new_url)
            self.lvl0_urls_retrived.add(new_url)
            self._get_level1_urls()
            self._get_start_urls(new_url)

    def _get_level0_urls(self):
        """Collect not-yet-visited district URLs from the current page."""
        level0_urls = set()
        districts = self.driver.find_element_by_id("plateList")
        for district in districts.find_elements_by_xpath(
                "//div[@class='level1']/a"):
            dist_url = urljoin(self.start_url, district.get_attribute("href"))
            if dist_url not in self.lvl0_urls_retrived:
                level0_urls.add(dist_url)
        self.lvl0_urls.update(level0_urls)
        print("refresh level0 %s" % level0_urls)
        return level0_urls

    def _get_level1_urls(self):
        """Collect sub-district URLs not already tracked as level-0 pages."""
        districts = self.driver.find_element_by_id("plateList")
        hrefs = (a.get_attribute("href")
                 for a in districts.find_elements_by_xpath(
                     "//div[@class='level2-item']/a"))
        # Bug fix: the original tested the truthiness of
        # ``self.lvl0_urls_retrived`` (always true, it contains start_url)
        # instead of membership in it.
        level1_urls = set(
            href for href in hrefs
            if href not in self.lvl0_urls
            and href not in self.lvl0_urls_retrived
        )
        self.lvl1_urls.update(level1_urls)
        return level1_urls

    def __del__(self):
        # Guard: __init__ may have failed before these attributes existed,
        # and exceptions in __del__ must never propagate.
        try:
            self.driver.close()
        except Exception:
            pass
        try:
            self.file.close()
        except Exception:
            pass

    def _sleep(self):
        """Random 1-5 s pause between requests to look less bot-like."""
        time.sleep(random.randint(1, 5))

    def _conect_db(self):
        """Open (or create) the local url-tracking SQLite database."""
        cnx = sqlite3.connect("urls.db")
        self.cnx = cnx
        self.cursor = cnx.cursor()

    def _insert(self, table_name, url):
        """Insert *url* into *table_name* and commit."""
        stmt = "insert into %s(url) values(?) " % table_name
        self.cursor.execute(stmt, [url])
        self.cnx.commit()

    def _retrive(self, table_name):
        """Return the set of all URLs stored in *table_name*."""
        stmt = "select url from %s" % table_name
        self.cursor.execute(stmt)
        return set([item[0] for item in self.cursor.fetchall()])

    def _exist(self, tablename, url):
        """Return True if *url* is already stored in *tablename*."""
        # Bug fix: the original interpolated the URL into the SQL string,
        # which is both broken (unquoted value) and injectable; bind it
        # as a parameter instead.
        stmt = "select url from %s where url = ?" % tablename
        self.cursor.execute(stmt, [url])
        return len(self.cursor.fetchall()) > 0
Beispiel #33
0
def getHydrawiseData(outFileDir):
    """Log into Hydrawise and download today's watering report.

    Credentials are read from the HYDRA_USER / HYDRA_PW environment
    variables; the Excel report is auto-saved into *outFileDir*.

    :param outFileDir: directory the browser saves the download into
    :raises RuntimeError: if an expected report button cannot be found
    """
    # Auto-save Excel downloads into outFileDir without a prompt.
    profile = FirefoxProfile()
    profile.set_preference('browser.download.folderList', 2)
    profile.set_preference('browser.download.manager.showWhenStarting', False)
    profile.set_preference('browser.helperApps.neverAsk.saveToDisk',
                           'application/vnd.ms-excel')
    profile.set_preference('browser.download.dir', outFileDir)

    opts = Options()
    opts.set_headless()
    assert opts.headless

    browser = Firefox(options=opts, firefox_profile=profile)
    browser.implicitly_wait(10)

    browser.get('https://app.hydrawise.com/config/login')

    assert (browser.title == 'Hydrawise')

    hydraUser = os.environ['HYDRA_USER']
    hydraPW = os.environ['HYDRA_PW']

    login_form = browser.find_elements_by_class_name('form-control')
    login_form[0].send_keys(hydraUser)
    login_form[1].send_keys(hydraPW)
    browser.find_element_by_class_name('login-btn').click()

    try:
        WebDriverWait(browser, 10).until(
            EC.title_is('Hydrawise Configuration'))
    except TimeoutException:
        print('logged in failed!')

    assert (browser.title == 'Hydrawise Configuration')

    # After logging in, short circuit all the button clicking by just loading
    # up where we want to go.
    browser.get('https://app.hydrawise.com/config/reports')

    # Locate the day / today / Download buttons by their rendered markup.
    button_patterns = {
        'download': re.compile(r'>Download<'),
        'today': re.compile(r'>today<'),
        'day': re.compile(r'>day<'),
    }
    found = {}
    buttons = browser.find_elements_by_tag_name('button')
    for (i, button) in enumerate(buttons):
        html = button.get_attribute('outerHTML')
        for key, pattern in button_patterns.items():
            if pattern.search(html):
                found[key] = i

    # Bug fix: the original used -1 sentinels, so a missing button silently
    # clicked buttons[-1] (the last button on the page) instead of failing.
    missing = [key for key in ('day', 'today', 'download') if key not in found]
    if missing:
        browser.close()
        raise RuntimeError('report buttons not found: %s' % ', '.join(missing))

    buttons[found['day']].click()
    buttons[found['today']].click()
    buttons[found['download']].click()

    browser.close()
Beispiel #34
0
class Insta_automate:
    """Headless-browser Instagram automation: login, scraping, follow/unfollow."""

    def __init__(self, username, passwd):
        """Open the browser, load instagram.com and log in with the credentials."""
        self.username, self.passwd = username, passwd

        self.open_instagram()
        print(its_ok + "Opened instagram")
        self.login()
        print(its_ok + "logged in")

    def open_instagram(self):
        """Start a headless browser (Chrome on Windows, Firefox otherwise),
        open the login page and type the credentials into the form."""
        if os.name == "nt":  # if its windows
            try:
                self.brows = Chrome(executable_path=chrome_path,
                                    chrome_options=headless_for_chrome())
            except WebDriverException:
                print(its_not_ok +
                      "Cannot find Chrome binary...\nTrying Firefox")
                self.brows = Firefox(service=Service(firefox_path),
                                     options=headles_for_firefox())
        else:
            try:
                self.brows = Firefox(service=Service(firefox_path),
                                     options=headles_for_firefox())
            except WebDriverException:
                print(its_not_ok + "Cannot find gecko...\nTrying install")
                install_to_os()
                print(
                    its_ok +
                    "Installed Successfully Again do you want to headles or not ?"
                )
                self.brows = Firefox(service=Service(firefox_path),
                                     options=headles_for_firefox())

        self.brows.maximize_window()

        self.brows.implicitly_wait(20)

        self.brows.get("https://www.instagram.com/")
        # send_keys() returns None, so the original self.find_username /
        # self.find_passwd attributes were always None; dropped.
        self.brows.find_element(By.NAME, "username").send_keys(self.username)
        self.brows.find_element(By.NAME, "password").send_keys(self.passwd)

    def login(self):
        """Submit the login form and dismiss the "save your info" dialog."""
        self.brows.find_element(By.XPATH, xpaths["login_but"]).click()
        self.brows.find_element(By.CLASS_NAME, "cmbtv").click()

    def go_to_my_profile(self):
        """Open the logged-in account's own profile page."""
        # Bug fix: the original referenced an undefined global ``username``
        # and passed the page source to get_profile_info(), which takes no
        # arguments and would raise TypeError.
        self.brows.get("https://www.instagram.com/{}/".format(self.username))
        self.get_profile_info()

    def go_to_user_page(self, name_list):
        """Visit each profile page in *name_list* in turn."""
        for name in name_list:
            self.brows.get("https://www.instagram.com/{}/".format(name))

    def get_followers_and_followings(self, name):
        """Interactively scrape *name*'s follower or following list into self.data."""
        self.brows.get("https://www.instagram.com/{}/".format(
            name))  #go to desired userpage
        print(
            f"Scrape {Fore.MAGENTA}{name}{Fore.RESET}'s follower or following users [Default is 1]"
        )

        # Parse the profile counters once; the original re-invoked
        # get_profile_info() (a full page parse) five times here.
        info = self.get_profile_info()
        print(f"""
               [{Fore.BLUE} {Fore.RESET}]  POST       : {Fore.GREEN} {info["post"]}      {Fore.YELLOW}(in maintenance) {Fore.RESET}
               [{Fore.BLUE}1{Fore.RESET}]  FOLLOWING  : {Fore.GREEN} {info["following"]} {Fore.RESET} 
               [{Fore.BLUE}2{Fore.RESET}]  FOLLOWERS  : {Fore.GREEN} {info["followers"]} {Fore.RESET} 
               """)

        self.which = input(f"\n  -->")
        if self.which == "2":
            self.phr = "2"
            self.total_ = info["followers"]
        else:
            # "1" and any other input both mean "following" (the default).
            self.phr = "3"
            self.total_ = info["following"]

        self.brows.find_element(By.XPATH, f"//ul/li[{self.phr}]/a/div").click(
        )  #click followers or followed button
        self.data = []

        # Scroll enough times to load the full list (~3 names per scroll).
        if self.total_ < 24:
            self.total_ = 5
        else:
            self.total_ = round(self.total_ / 3)

        self.start_time = time.time()
        for i in range(self.total_):  #page down
            print(f" Loop : {self.total_}/{Fore.CYAN}{i}",
                  end="\r",
                  flush=True)

            self.brows.find_element(By.CLASS_NAME, "isgrP").send_keys(Keys.END)

            # Re-parse the page after each scroll to pick up new usernames.
            self.soup = BS(self.brows.page_source, "lxml")

            self.all_span = [
                span.text
                for span in self.soup.find_all("span", attrs={"class": "Jv7Aj"})
            ][len(self.data):]
            # Loop variable renamed: the original shadowed the *name* parameter.
            for scraped_name in self.all_span:
                if scraped_name not in self.data:
                    self.data.append(scraped_name)
            print(
                f"\t[Users Scraped : {Fore.MAGENTA}{len(self.data)} {Fore.RESET}]  Sec: {Fore.YELLOW} {round(time.time() - self.start_time)}"
                .expandtabs(30),
                end="\r",
                flush=True)
        print(
            f"\t[Users Scraped : {Fore.MAGENTA}{len(self.data)} {Fore.RESET}] | Sec: {Fore.YELLOW} {round(time.time() - self.start_time)}"
            .expandtabs(30))

    def get_profile_info(self):
        """Parse the current profile page's post/followers/following counters.

        :return: dict with keys "post", "followers", "following" (ints via
                 text_to_num for abbreviated values like "1.2k")
        """
        self.info = {"post": "", "followers": "", "following": ""}
        self.soup = BS(self.brows.page_source, "lxml")

        # Relies on the first three <li> elements being the counters, in
        # post/followers/following order — TODO confirm against page markup.
        for data, key in zip(self.soup.find_all("li"), self.info.keys()):
            checked_str = data.text.split()[0]
            try:
                self.info[key] = int(checked_str)
            except ValueError:
                self.info[key] = text_to_num(checked_str)

        return self.info

    def follow(self, username):
        """Open *username*'s profile and click the follow button."""
        self.brows.get("https://www.instagram.com/{}/".format(username))
        self.brows.find_element(By.XPATH, xpaths["follow_but"]).click()
        print(its_ok + f"Followed   > {Fore.BLUE}{username}")

    def unfollow(self, username):
        """Open *username*'s profile, click unfollow and confirm the dialog."""
        self.brows.get("https://www.instagram.com/{}/".format(username))
        self.brows.find_element(By.XPATH, xpaths["follow_but"]).click()
        self.notfy_but = WebDriverWait(self.brows, 10).until(
            EC.element_to_be_clickable((By.XPATH, xpaths["unfollow_notfy"])))
        self.notfy_but.click()
        print(its_ok + f"Unfollowed > {Fore.RED}{username}")

    def hack_with_foll_unfoll(self, user_list: list):
        """Repeatedly follow then unfollow every user in *user_list*."""
        self.follow_count = default_val("How much loop? [default 8] : ", "8")
        self.delay_unfollow = default_val(
            "Delay for unfollows [second] [default 10 sec] :", "10")
        self.delay_follow = default_val(
            "Delay for follows [second] [default 4 sec] :", "4")

        for i in range(self.follow_count):
            for name in user_list:
                try:
                    self.follow(name)
                except Exception:
                    # Best-effort: skip profiles that fail (narrowed from a
                    # bare except, which also swallowed KeyboardInterrupt).
                    pass
            self.delay(self.delay_unfollow)

            for name in user_list:
                try:
                    self.unfollow(name)
                except Exception:
                    pass
            self.delay(self.delay_follow)

            print(f"\nLoop : {i+1}/{self.follow_count}")

    def close_tab(self):
        """Close the current browser tab only."""
        self.brows.close()

    def close_all(self):
        """Shut down the whole browser session."""
        self.brows.quit()

    @staticmethod
    def delay(delay):
        """Sleep *delay* seconds, printing a live countdown."""
        for i in range(delay, 0, -1):
            time.sleep(1)
            print(f" Waiting for [{Fore.YELLOW}{i}{Fore.RESET}]  ", end="\r")
Beispiel #35
0
class FirefoxTestCase(LiveServerTestCase):
    """Browser-level tests for the site's navigation and grid-card links."""

    def create_app(self):
        """Build the Flask app under test (LiveServerTestCase hook)."""
        app = create_app("__test__")
        print(app.instance_path)
        print(app.root_path)
        print(app.template_folder)
        return app

    def setUp(self):
        """Start Firefox against the live test server."""
        options = Options()
        options.add_argument('--port=5000')
        options.log.level = 'debug'
        self.driver = Firefox(options=options)
        self.driver.implicitly_wait(10)
        self.driver.get(self.get_server_url())

    def tearDown(self):
        self.driver.close()

    def _wait_for_title(self, title):
        """Block (up to 10 s) until the page title equals *title*."""
        WebDriverWait(self.driver, 10).until(EC.title_is(title))

    def _check_grid_links(self, nav_text, list_title, click_class='card-title'):
        # Shared body of the four grid-link tests: open the *nav_text*
        # navbar entry, read the first card's title, click *click_class*
        # and assert the detail page's <h1> matches.
        driver = self.driver
        self.assertIn("Let's Get Fit", driver.title)
        driver.find_element_by_link_text(nav_text).click()
        self.assertIn(list_title, driver.title)  # also waits for the page

        item_name = driver.find_element_by_class_name('card-title').text
        driver.find_element_by_class_name(click_class).click()
        self._wait_for_title(item_name)
        self.assertEqual(item_name,
                         driver.find_element_by_tag_name('h1').text)

    # Test that all navbar links are functional
    def test_navbar_links(self):
        driver = self.driver
        self.assertIn("Let's Get Fit", driver.title)

        for link_text, expected_title in (('Foods', 'CKC - Foods'),
                                          ('Workouts', 'CKC - Workouts'),
                                          ('Gyms', 'CKC - Gyms'),
                                          ('Stores', 'CKC - Stores')):
            driver.find_element_by_link_text(link_text).click()
            self.assertIn(expected_title, driver.title)

        driver.find_element_by_link_text('About').click()
        self._wait_for_title('About Us')
        self.assertIn("About Us", driver.title)

    # Test that food grid card title links lead to the correct instance that displays the name properly
    def test_food_grid_links(self):
        self._check_grid_links('Foods', 'CKC - Foods', 'title-link')

    # Test that workout grid card title links lead to the correct instance that displays the name properly
    def test_workout_grid_links(self):
        self._check_grid_links('Workouts', 'CKC - Workouts')

    # Test that gym grid card title links lead to the correct instance that displays the name properly
    def test_gym_grid_links(self):
        self._check_grid_links('Gyms', 'CKC - Gyms')

    # Test that store grid card title links lead to the correct instance that displays the name properly
    def test_store_grid_links(self):
        self._check_grid_links('Stores', 'CKC - Stores', 'card-img-top')

    # Test that going back and forth in the navigation history doesn't break the website
    def test_navigation_history(self):
        driver = self.driver

        driver.find_element_by_link_text('About').click()
        self._wait_for_title('About Us')

        driver.find_element_by_link_text('Foods').click()
        self._wait_for_title('CKC - Foods')

        driver.find_element_by_link_text('Stores').click()
        driver.back()
        driver.forward()
        driver.back()
        driver.back()

        self._wait_for_title('About Us')
        self.assertEqual("About Calorie Killer Club",
                         driver.find_element_by_tag_name('h2').text)
Beispiel #36
0
    def execute(
        self,
        webdriver: Firefox,
        browser_params: BrowserParams,
        manager_params: ManagerParams,
        extension_socket: ClientSocket,
    ) -> None:
        """Dump the browser profile to a tar archive — currently a no-op.

        The method logs that profile dumping is unsupported and returns
        immediately.  Everything after the early ``return`` is dead legacy
        code kept for reference; note that it references names
        (``tar_location``, ``compress``, ``close_webdriver``) that are not
        defined in this scope, so it would raise NameError if it were ever
        reached.
        """
        logger.debug("BROWSER %i: Profile dumping is currently unsupported. "
                     "See: https://github.com/mozilla/OpenWPM/projects/2." %
                     browser_params.browser_id)
        return
        # ------------------------------------------------------------------
        # UNREACHABLE legacy implementation below (see docstring).
        # ------------------------------------------------------------------
        browser_profile_folder = browser_params.profile_path

        # ensures that folder paths end with slashes
        if browser_profile_folder[-1] != "/":
            browser_profile_folder = browser_profile_folder + "/"
        if tar_location[-1] != "/":
            tar_location = tar_location + "/"

        if not os.path.exists(tar_location):
            os.makedirs(tar_location)

        if compress:
            tar_name = "profile.tar.gz"
        else:
            tar_name = "profile.tar"

        # see if this file exists first
        # if it does, delete it before we try to save the current session
        if os.path.isfile(tar_location + tar_name):
            os.remove(tar_location + tar_name)

        # if this is a dump on close, close the webdriver and wait for checkpoint
        if close_webdriver:
            webdriver.close()
            sleep_until_sqlite_checkpoint(browser_profile_folder)

        # backup and tar profile
        if compress:
            tar = tarfile.open(tar_location + tar_name, "w:gz", errorlevel=1)
        else:
            tar = tarfile.open(tar_location + tar_name, "w", errorlevel=1)
        logger.debug("BROWSER %i: Backing up full profile from %s to %s" % (
            browser_params.browser_id,
            browser_profile_folder,
            tar_location + tar_name,
        ))
        # SQLite stores that hold the tracking-relevant state to back up.
        storage_vector_files = [
            "cookies.sqlite",  # cookies
            "cookies.sqlite-shm",
            "cookies.sqlite-wal",
            "places.sqlite",  # history
            "places.sqlite-shm",
            "places.sqlite-wal",
            "webappsstore.sqlite",  # localStorage
            "webappsstore.sqlite-shm",
            "webappsstore.sqlite-wal",
        ]
        storage_vector_dirs = [
            "webapps",  # related to localStorage?
            "storage",  # directory for IndexedDB
        ]
        for item in storage_vector_files:
            full_path = os.path.join(browser_profile_folder, item)
            # Missing -shm/-wal files are expected (they are transient
            # checkpoint files); anything else missing is logged loudly.
            if (not os.path.isfile(full_path) and full_path[-3:] != "shm"
                    and full_path[-3:] != "wal"):
                logger.critical(
                    "BROWSER %i: %s NOT FOUND IN profile folder, skipping." %
                    (browser_params.browser_id, full_path))
            elif not os.path.isfile(full_path) and (full_path[-3:] == "shm" or
                                                    full_path[-3:] == "wal"):
                continue  # These are just checkpoint files
            tar.add(full_path, arcname=item)
        for item in storage_vector_dirs:
            full_path = os.path.join(browser_profile_folder, item)
            if not os.path.isdir(full_path):
                logger.warning(
                    "BROWSER %i: %s NOT FOUND IN profile folder, skipping." %
                    (browser_params.browser_id, full_path))
                continue
            tar.add(full_path, arcname=item)
        tar.close()
Beispiel #37
0
        return str(company_soup.find('span', {
            'class': classItem
        }).text).strip()
    except:
        return None


# In[62]:

df = pd.read_excel('data.xlsx')
counter = 0
for index, row in tqdm(df.iterrows()):
    if counter == 20:
        import ctypes
        ctypes.windll.user32.MessageBoxW(0, "IP blocked", "Error", 1)
        driver.close()
        drivertwo.close()
        break
    #print(index, row['Company Name'])
    url = "https://www.dnb.com/business-directory/company-search.html?term=" + row[
        'Company Name'] + "&page=1"
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    try:
        counter = 0
        url = "https://www.dnb.com" + str(
            soup.find('div', {
                'class': 'primary_name'
            }).find('a')['href']).strip()
        drivertwo.get(url)
        company_soup = BeautifulSoup(drivertwo.page_source, 'html.parser')
class GetCompanyInfo(object):
    """Crawl company information from tianyancha.com with a headless Firefox."""

    # The login widgets all live inside the same heavily-classed container;
    # the original repeated this xpath prefix verbatim three times.
    _LOGIN_BOX_XPATH = (
        '//div[contains(@class,"in-block")'
        ' and contains(@class, "vertical-top")'
        ' and contains(@class, "float-right")'
        ' and contains(@class, "right_content")'
        ' and contains(@class, "mt50")'
        ' and contains(@class, "mr5")'
        ' and contains(@class, "mb5")'
        ']'
    )

    def __init__(self):
        """Initialize the crawler with a headless Firefox driver."""
        self.username = ''
        self.password = ''
        self.options = webdriver.FirefoxOptions()
        self.options.add_argument('-headless')  # run without a visible window
        self.geckodriver = r'geckodriver'
        self.driver = Firefox(executable_path=self.geckodriver,
                              firefox_options=self.options)

        self.start_url = 'https://www.tianyancha.com'

    def test(self):
        """Debugging helper: load a page with pre-set cookies and dump its source.

        NOTE(review): relies on a module-level ``cookies`` mapping and an
        empty local ``start_url`` — only usable after editing; confirm the
        ``cookies`` global exists before calling.
        """
        start_url = ''
        self.driver.get(start_url)

        for k, v in cookies.items():
            self.driver.add_cookie({'name': k, 'value': v})
        time.sleep(1)
        print(self.driver.page_source)
        self.driver.close()

    def login(self):
        """Log in via the home page and report whether it succeeded."""
        try:
            self.driver.get(self.start_url)

            print(self.driver.get_cookies())

            username = self.index_login()
            # When logged in the site masks the phone number as
            # "123 **** 4567"; finding that pattern in the page source
            # means the login succeeded.
            username_pattern = username[:3] + ' **** ' + username[-4:]
            print(username_pattern)
            page = self.driver.page_source
            is_login = page.find(username_pattern)

            print(is_login)
            if is_login != -1:
                print('登录成功')
        except Exception as e:
            print(e)

    def index_login(self):
        """Drive the home-page login dialog; returns the username used."""
        get_login = self.driver.find_elements_by_xpath(
            '//a[@class="media_port"]')[0]  # the "login / register" link
        print(get_login.text)
        get_login.click()  # opens the login dialog
        login_by_pwd = self.driver.find_element_by_xpath(
            '//div[@class="bgContent"]/div[2]/div[2]/div')  # switch to phone login
        print(login_by_pwd.text)
        login_by_pwd.click()
        input1 = self.driver.find_element_by_xpath(
            '//div[@class="bgContent"]/div[2]/div/div[2]/input')  # phone number

        input2 = self.driver.find_element_by_xpath(
            '//div[@class="bgContent"]/div[2]/div/div[3]/input')  # password
        print(input1.get_attribute('placeholder'))
        print(input2.get_attribute('placeholder'))

        username, password = self._check_user_pass()
        input1.send_keys(username)
        input2.send_keys(password)

        login_button = self.driver.find_element_by_xpath(
            '//div[@class="bgContent"]/div[2]/div/div[5]')  # login button
        print(login_button.text)
        time.sleep(1)  # must wait, otherwise the site flags us as a bot
        login_button.click()
        return username

    def _check_user_pass(self):
        """Return stored credentials, prompting interactively if unset."""
        if self.username and self.password:
            return self.username, self.password
        else:
            username = input('输入您的手机号码\n')
            password = input('输入您的密码\n')
            return username, password

    def login_page_login(self):
        """Login flow for the dedicated page at www.tianyancha.com/login."""
        input1 = self.driver.find_element_by_xpath(
            self._LOGIN_BOX_XPATH + '/div[2]/div[2]/div[2]/input')

        input2 = self.driver.find_element_by_xpath(
            self._LOGIN_BOX_XPATH + '/div[2]/div[2]/div[3]/input')
        print(input1.get_attribute('placeholder'))
        input1.send_keys("")
        print(input2.get_attribute('placeholder'))
        input2.send_keys('')

        login_button = self.driver.find_element_by_xpath(
            self._LOGIN_BOX_XPATH + '/div[2]/div[2]/div[5]')

        print(login_button.text)
        time.sleep(1)
        login_button.click()

    def get_company_info(self, company_name, company_onwer):
        """Search for *company_name* and scrape the result list.

        :param company_name: exact company name to search for
        :param company_onwer: accepted for interface compatibility; unused
        :return: a status string, or None if an exception was swallowed
        """
        try:
            time.sleep(1)
            index_input_company = self.driver.find_element_by_xpath(
                '//input[@id="home-main-search"]')  # home-page search box

            index_input_company.send_keys(company_name)
            self.driver.find_element_by_xpath(
                '//div[contains(@class, "input-group-addon")'
                ' and contains(@class, "search_button")'
                ' and contains(@class, " white-btn")'
                ']').click()  # click search

            # One <div> per company in the result list.
            company_list = self.driver.find_elements_by_xpath(
                '//div[contains(@class, "b-c-white")'
                ' and contains(@class, "search_result_container")'
                ']/div')
            company_info = list()
            for each_company in company_list:
                company_name_from_web = each_company.find_element_by_tag_name(
                    'img').get_attribute('alt')
                company_url = each_company.find_element_by_tag_name(
                    'a').get_attribute('href')
                company_reg_money = each_company.\
                    find_element_by_css_selector('div .search_row_new.pt20 div div:nth-child(2) span').text
                company_reg_time = each_company.\
                    find_element_by_css_selector('div .search_row_new.pt20 div div:nth-child(3) span').text
                company_score = each_company.find_element_by_css_selector(
                    '.c9.f20').text
                company_info.append([
                    company_name_from_web, company_url, company_reg_money,
                    company_reg_time, company_score + '分'
                ])
                print(company_info[-1])

            print('当前匹配公司数:', len(company_info))
            if company_info:
                # Return the exact-name match if one exists; otherwise a
                # generic success marker.
                for each_list in company_info:
                    if each_list[0] == company_name:
                        return '爬取成功: ' + str(each_list)
                return '爬取成功'
            else:
                return '爬取失败'
        except Exception as e:
            print(e)

    def main(self):
        """Log in, run one sample query and shut the driver down."""
        self.login()
        msg = self.get_company_info('*****软件有限公司', '')
        print(msg)
        print('crawl finish...')

        self.driver.close()
Beispiel #39
0
def getBhavData(driver_path="/home/prajwal/project/cherryApp/geckodriver"):
    """Download the BSE equity bhav copy and load it into Redis.

    Steps: open the BSE bhav-copy page in headless Firefox, pull the ZIP
    download link out of the report iframe, fetch and unzip the CSV into
    ./bhavDownload, push selected columns of every row into Redis via
    ``RedisDb``, then delete the downloaded files and close the browser.

    Args:
        driver_path: path to the geckodriver binary (previously hard-coded;
            the default preserves the old behavior).

    NOTE(review): depends on module-level names not visible in this chunk:
    Options, Firefox, BeautifulSoup, wget, RedisDb — confirm their imports
    in the enclosing module.
    """
    url = ("https://www.bseindia.com/markets/equity/EQReports/BhavCopyDebt.aspx"
           "?expandable=3&utm_campaign=website&utm_source=sendgrid.com&utm_medium=email")

    try:
        opts = Options()
        # Headless via argument: works on both Selenium 3.x and 4.x
        # (set_headless() is deprecated).
        opts.add_argument('--headless')
        browser = Firefox(executable_path=driver_path, options=opts)
        browser.get(url)
    except Exception:
        # Bug fix: the original fell through here with `browser` unbound,
        # raising a NameError that the outer bare `except` then swallowed.
        print('failed to connect firefox')
        return

    try:
        # The report markup lives inside an iframe; switch into it so that
        # page_source exposes the download link.
        iframeElements = browser.find_elements_by_xpath(
            '/html/body/form/div[3]/div/div[3]/div[2]/div/div[2]/div/div/table/tbody/tr/td/iframe'
        )
        browser.switch_to.frame(iframeElements[0])
        html_page = browser.page_source

        soup = BeautifulSoup(html_page, 'html.parser')
        link = soup.find('a', attrs={'id': 'btnhylZip'}, href=True)
        download_link = link['href']
        # File name is the last segment of the download URL.
        *_, file_name = download_link.split('/')
        print(file_name)

        download_dir = os.path.join(os.getcwd(), 'bhavDownload')
        os.makedirs(download_dir, exist_ok=True)  # create directory if missing

        wget.download(download_link, out=download_dir)
        download_filename = os.path.join(download_dir, file_name)
        print(download_filename)

        # `with` guarantees the archive handle is closed even if
        # extraction raises (original leaked it on error).
        with zipfile.ZipFile(download_filename, 'r') as zip_ref:
            zip_ref.extractall(download_dir)

        # Archive "EQ_DDMMYY.ZIP"-style name -> CSV name before the first '_',
        # upper-case ".CSV" suffix (matches the file inside the archive).
        csv_filename, *_ = file_name.split('_')
        csv_filename = '.'.join((csv_filename, 'CSV'))
        print(csv_filename)

        try:
            csv_filepath = os.path.join(download_dir, csv_filename)
            print(csv_filepath)
            with open(csv_filepath, 'r') as csvFile:
                print('file opened')
                reader = csv.DictReader(csvFile)
                print(reader)
                rdb = RedisDb('localhost', 'eqlist')
                conn = rdb.connect()
                print(conn)
                index_key = 'id'
                # Start from a clean equity list on every run.
                rdb.deleteEquityList(conn)
                for row in reader:
                    row = dict(row)
                    field = {col: row[col] for col in
                             ['SC_CODE', 'SC_NAME', 'OPEN', 'CLOSE', 'LOW', 'HIGH']}
                    field['HIGH'] = float(field['HIGH'])
                    field['LOW'] = float(field['LOW'])
                    value = rdb.getNewId(index_key, conn)
                    print(value)
                    rdb.setequityListindex(conn, value)
                    rdb.setequityHash(conn, value, field)
                    print(rdb.getequityHash(conn, value))
                print(rdb.getequityListindex(conn))
        except Exception as e:  # was a bare except that hid the real error
            print('failed to read csv')
            print(e)

        time.sleep(5)
        print('downloaded file deleting started')

        try:
            # Delete every file in the download directory; best-effort cleanup.
            for file in os.listdir(download_dir):
                os.remove(os.path.join(download_dir, file))
        except OSError:
            pass
    except Exception as e:  # was a bare except; message typo fixed
        print('failed to get data')
        print(e)
    finally:
        # Bug fix: the browser is now closed on error paths too.
        browser.close()
Beispiel #40
0
class WeixinSelenium(Base):
    """Selenium (Firefox) crawler for Weixin article search results.

    Pulls query words out of MongoDB, searches each word, walks the paged
    result list, and hands extracted (url, uid) pairs to ``Article`` for
    the actual article scraping.

    NOTE(review): relies on module-level names not visible in this chunk:
    START_PAGE, END_PAGE, REFER_FIRST, HOST, PORT, DB, COLLECTION,
    storage_word, in_collection, in_client, Article, and the ``Base``
    parent (which appears to provide ``self.logger``, ``self.trim`` and
    ``self.md5``) — confirm in the enclosing module.
    """

    def __init__(self):
        # Crawl window and search entry URL come from module-level config.
        self.start_page = START_PAGE
        self.end_page = END_PAGE
        self.weixin_url = REFER_FIRST

        self.driver = Firefox()

        self.client = MongoClient(HOST, PORT)
        self.collection = self.client[DB][COLLECTION]
        # Snapshot of already-seen article uids. Note the `uids` property
        # reads the module-level `in_collection`, not `self.collection`.
        self.all_uids = self.uids

    def open_weixin_browser(self, word):
        """Open the search page and submit a query for *word*.

        Returns True when anything failed (signal to the caller to break
        and reopen the browser), False on success.
        """
        try:
            self.driver.get(self.weixin_url)
            self.driver.set_page_load_timeout(3)

            # Type the query into the search box and click the search button.
            self.driver.find_element_by_id('upquery').send_keys(word)
            self.driver.find_element_by_class_name('swz').click()
            self.driver.implicitly_wait(3)

            urls_uids = self.extract_urls_uids(word=word)
            Article(urls_uids=urls_uids, word=word).extract()
        except Exception as e:
            # Record the failed word (page 0) so the crawl can be resumed later.
            # NOTE(review): "mag" in the log format looks like a typo for "msg".
            storage_word.append([word, 0])
            self.logger.info('Open weixin error: type <{}>, mag <{}>'.format(e.__class__, e))
            self.close_browser()
            return True
        return False

    def get_total_pages_to_word(self):
        """Return the result-page count parsed from the pager bar.

        Returns the last numeric token seen before the first non-numeric
        one, 1 when every token is numeric, or None when the pager is
        missing/empty (callers fall back to ``self.end_page`` via
        ``pages or self.end_page``).
        """
        pages = []
        page_id_css = 'pagebar_container'

        try:
            e = self.driver.find_element_by_id(page_id_css)
            for _p in e.text.split():
                _p = _p.strip()

                if not _p.isdigit():
                    # First non-digit token (e.g. a "next page" label) ends
                    # the numeric run; an empty `pages` raises IndexError,
                    # caught below and yielding None.
                    return pages[-1]
                else:
                    pages.append(int(_p))
            return 1
        except (NoSuchElementException, NoSuchWindowException, TypeError, IndexError):
            pass

    def get_query_words(self):
        """Collect de-duplicated query words ('conp' plus each 'rel' entry)
        from MongoDB, in _id order, then close the client."""
        query_words = []

        for docs in self.collection.find({}, {'rel': 1, 'conp': 1}).sort([('_id', 1)]):
            w = docs['conp']

            if w not in query_words:
                query_words.append(w)

            for item in docs['rel']:
                if item not in query_words:
                    query_words.append(item)

        self.client.close()
        return query_words

    @property
    def uids(self):
        """Set of article uids already stored (queried from the module-level
        ``in_collection``)."""
        return {docs['uid'] for docs in in_collection.find({}, {'uid': 1}) if 'uid' in docs}

    def extract_urls_uids(self, word):
        """Pair each result link with a uid derived from its timestamp,
        title and *word*; return only pairs whose uid is new.

        Returns an empty list when timestamps and titles don't line up
        one-to-one (page layout mismatch).
        """
        urls_uids = []
        timestamp = [_t.get_attribute('t') for _t in self.driver.find_elements_by_css_selector('div.s-p')]
        urls_tits = [(t.get_attribute('href'), self.trim(t.text))
                     for t in self.driver.find_elements_by_css_selector('h4 a')]

        if len(urls_tits) != len(timestamp):
            return urls_uids

        for index, url_tit in enumerate(urls_tits):
            try:
                # uid = md5(timestamp + title + query word): stable dedup key.
                uid = self.md5(timestamp[index] + url_tit[1] + word)

                if uid not in self.all_uids:
                    self.all_uids.add(uid)
                    urls_uids.append({'url': url_tit[0], 'uid': uid})
            except (TypeError, IndexError):
                pass
        return urls_uids

    @staticmethod
    def query_index(words, cut_word):
        """Return the index of *cut_word* in *words*, or 0 when absent
        (used to resume the crawl from a given word)."""
        try:
            index = words.index(cut_word)
            return index
        except ValueError:
            pass
        return 0

    @property
    def is_forbidden(self):
        """True when the anti-bot captcha form is present on the page."""
        css_id = 'seccodeForm'

        try:
            if self.driver.find_element_by_id(css_id):
                return True
        except NoSuchElementException:
            pass
        return False

    def appear_element(self, by):
        """Wait up to 20s for the element with id *by*, click it, and
        return True; False on timeout or missing element/window."""
        try:
            # Waits until the element is findable, then clicks it.
            tag = WebDriverWait(self.driver, 20).until(lambda driver: driver.find_element_by_id(by))
            tag.click()
            return True
        except (TimeoutException, NoSuchWindowException, NoSuchElementException):
            pass
        return False

    def crawl(self, word=None, go=0):
        """Crawl result pages for every query word starting at *word*.

        Args:
            word: query word to resume from (None starts at the beginning).
            go: page number to resume from within the first word processed.
        """
        is_go = True
        is_break = False
        go_page = int(go)
        next_page_css = 'sogou_page_%s'
        query_words = self.get_query_words()
        ind = self.query_index(query_words, word)

        for index, word in enumerate(query_words[ind:], 1):
            next_ind = ind + index
            is_break = self.open_weixin_browser(word)
            pages = self.get_total_pages_to_word()

            for page in range(self.start_page + 1, (pages or self.end_page) + 1):
                # Skip pages below the resume point, but only for the
                # first word (is_go flips off permanently after that).
                if is_go and page < go_page:
                    continue
                else:
                    is_go = False

                if not self.appear_element(by=next_page_css % page):
                    is_break = True
                    msg = '\tNot appear next page element, will break, new open browser!'
                elif self.is_forbidden:
                    is_break = True
                    msg = '\tSpider was forbidden, crawling again after sleeping a moment!'

                if is_break:
                    # Remember where we stopped so the crawl can resume here.
                    storage_word.append([word, page])
                    self.logger.info(msg)
                    break

                urls_uids = self.extract_urls_uids(word=word)
                Article(urls_uids=urls_uids, word=word).extract()

                # self.driver.find_element_by_id(next_page_css % page).click()
                # Randomized pause (longer every 5th page) to look less bot-like.
                wt = randint(10, 40) if page % 5 == 0 else randint(5, 18)
                self.logger.info('Index <{}>, Word <{}>, Page <{}> Done, sleeping {}s!'.format(next_ind, word, page, wt))
                self.driver.implicitly_wait(wt)

            if is_break:
                break

        # NOTE(review): closes the module-level `in_client`, not self.client
        # (already closed in get_query_words) — confirm this is intended.
        in_client.close()
        self.close_browser()

    def close_browser(self):
        """Close the driver window, ignoring an already-closed window."""
        try:
            self.driver.close()
        except (NoSuchWindowException,):
            pass
Beispiel #41
0
class Scraper:
    """A simple scraper example using Selenium.

    Crawls a paginated provider directory: prefers headless Chrome, falls
    back to Firefox, and appends one CSV line per provider to
    ``outfile.csv``.

    NOTE(review): depends on module-level names not visible in this chunk:
    Options, Chrome, Firefox, NoSuchElementException, ScrapePageError —
    confirm their imports in the enclosing module.
    """

    def __init__(self, base_url, query_params):
        """Back up any previous results, start a headless browser, and
        load ``base_url + query_params``."""
        self.__take_results_backup()
        options = Options()
        options.add_argument("--headless")
        try:
            self.driver = Chrome(options=options)
        except Exception as e:
            # Chrome unavailable: fall back to a (non-headless) Firefox.
            print(f'Error occured during Chrome driver : {e}')
            self.driver = Firefox()
        self.driver.get(base_url + query_params)
        # Kept for interface compatibility; note that `nextpage` re-locates
        # the element on every call rather than reusing this reference.
        self.nextpage_element = self.driver.find_element_by_css_selector(
                ".pager-next a")

    def __take_results_backup(self):
        """Move an existing outfile.csv aside under a timestamped name."""
        if os.path.exists('outfile.csv'):
            # Bug fix: the backup previously lost its .csv extension.
            stamp = f'outfile{time.asctime().replace(":", "-").replace(" ", "_")}.csv'
            shutil.move('outfile.csv', stamp)

    def __save_info(self, line):
        """Append one already-formatted CSV line to outfile.csv.

        Bug fix: the old version iterated its argument, which wrote the
        string one character per ``f.write`` call when given a single line.
        """
        with open('outfile.csv', 'a') as f:
            f.write(line)

    def nextpage(self, css_locator):
        """Click the element matching *css_locator* to advance a page."""
        self.driver.find_element_by_css_selector(
                css_locator).click()

    def scrape_page(self):
        """Extract name/email/website/location from every provider row on
        the current page and append them to the results file.

        Raises:
            ScrapePageError: on any unexpected error while reading a row.
        """
        providers = self.driver.find_elements_by_css_selector(".provider-row")

        for provider in providers:
            try:
                name = provider.find_element_by_css_selector(
                        ".provider-base-info h3 a").text
                email = provider.find_element_by_css_selector(
                        ".provider-link-details .icon-mail+a").get_attribute(
                                'href').replace('mailto:', '')
                website = provider.find_element_by_css_selector(
                        ".provider-link-details .website-link a").get_attribute('href')
                location = provider.find_element_by_css_selector(
                        ".provider-info__details div.list-item:nth-of-type(4)").text

                # Commas inside fields are replaced so the line stays
                # parseable as simple CSV.
                lineitem = f'{name.replace(",","-")},{email},{website},{location.replace(",","-")}'
                self.__save_info(lineitem + "\n")

            except NoSuchElementException:
                # Incomplete row: skip it and keep scraping the page.
                continue

            except Exception as e:
                # Unknown error: stop scraping this page.
                raise ScrapePageError(f"Error occured during scrape page : {e}")

    def scrape(self):
        """Scrape page after page until advancing fails unexpectedly."""
        while True:
            print(f"scraping the website... ")
            try:
                self.scrape_page()
                self.nextpage(".pager-next a")

            except ScrapePageError as e:
                # A bad page shouldn't stop the crawl; log it and move on.
                print(e)
                self.nextpage(".pager-next a")
                continue

            except Exception as e:
                # Usually means there is no next page: close and stop.
                print("Something went wrong: ", e)
                self.driver.close()
                break