Example #1
    def __init__(self, browser_wrapper):
        self._connected = False
        self._current_html = None
        self._browser_wrapper = browser_wrapper
        self._browser = StatefulBrowser()
        # Set default headers on the underlying requests session;
        # StatefulBrowser has no addHeaders attribute.
        self._browser.session.headers.update({
            'User-Agent': 'Firefox',
            'Accept-Language': 'en-US,en;q=0.5',
        })
Example #2
    def prepare(self, **kwargs):
        self.browser = StatefulBrowser()
        self._mediatypes = kwargs.get("mediatypes")
        self._qualities = kwargs.get("qualities")
        self._templates = kwargs.get("templates")
        auth_package = kwargs.get("auth")
        if auth_package[0] == types.AuthType.COOKIES:
            jar = requests.cookies.RequestsCookieJar()
            session_values = auth_package[1]["session"]

            jar.set(
                "session",
                session_values["value"],
                domain=".cloud.blender.org",
                path="/",
            )

            self.browser.session.cookies = jar
            response = self.browser.open(
                "https://cloud.blender.org/settings/profile")
            profile_page = self.browser.get_current_page()
            try:
                # This element only renders for a logged-in user, so use
                # it (plus the status code) as an authentication check.
                assert response.status_code == 200
                assert profile_page.find(class_="py-1") is not None
            except AssertionError:
                echo.error_msg("Authentication was not successful")
                exit(1)

            echo.debug_msg("Authentication successful")
Example #3
def extract_info(url):
    user_agent = UserAgent(fallback="Mozilla/5.0 (X11; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0")
    browser = StatefulBrowser(user_agent=user_agent.random)
    browser.open(url)
    page = browser.get_current_page()
    result = re.search(r'"bitcoin:([\w\d]+)\?amount=(\d+\.\d+)&', str(page))
    if result is None:
        raise ValueError("No payment info found at " + url)
    return PaymentInfo(float(result.group(2)), result.group(1))
Example #4
def extract_article(browser: mechanicalsoup.StatefulBrowser,
                    url: str) -> tuple:  # (Article, authors, comments)
    browser.open(url)
    page = browser.get_current_page()

    script_data = page.select('script[type="application/ld+json"]')

    first_script_data = json.loads(script_data[1].text)
    second_script_data = json.loads(script_data[0].text)

    header = second_script_data["headline"].replace(u'\xa0', ' ')
    description = second_script_data["description"].replace(u'\xa0', ' ')
    category = find_category(first_script_data)
    published_at = datetime.strptime(second_script_data["datePublished"],
                                     '%Y-%m-%dT%H:%M:%S.%fZ')
    modified_at = datetime.strptime(second_script_data["dateModified"],
                                    '%Y-%m-%dT%H:%M:%S.%fZ')
    authors = retrieve_authors(page)
    paragraphs = retrieve_paragraphs(page)

    print(f"Currently working on article {header}")

    comments = retrieve_comments(url.replace("clanek", "diskuze"))

    article = Article(link=url,
                      header=header,
                      description=description,
                      category=category,
                      published_at=published_at,
                      modified_at=modified_at,
                      paragraphs=paragraphs)
    return article, authors, comments
Example #5
class WebpageSummarizer(object):
    """
    Generates summary of a given web page.
    """
    def __init__(self):
        self.browser = StatefulBrowser(user_agent='Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:73.0) Gecko/20100101 Firefox/73.0')
        self.browser.session.headers.update({'Upgrade-Insecure-Requests': '1'})

    def summarize_webpage(self, url, summarization_ratio):
        """
        Takes a web page URL and returns the title and a summary of the web page.
        :param url: Web page URL.
        :param summarization_ratio: Fraction of original text to include in the summary.
        :return: Web page title and summarized web page text.
        """
        title = summarized_text = ''
        try:
            self.browser.open(url)
            page = self.browser.get_current_page()
            # Find all the paragraphs because they contain the main web page text
            page_text = ' '.join(map(lambda p: p.text, page.find_all('p')))
            title = page.title.text.strip()
            # Generate a summary; fall back to the full page text when
            # summarization returns nothing (e.g. the page is too short)
            summarized_text = summarize(page_text, ratio=summarization_ratio).strip()
            if summarized_text == '':
                summarized_text = page_text
        except Exception as e:
            print(e)
        finally:
            self.browser.close()

        return title, summarized_text
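A minimal usage sketch for the class above (assuming `summarize` comes from gensim and the page is reachable; the URL and ratio here are illustrative):

# Hypothetical usage; summarize_webpage returns ('', '') on failure.
summarizer = WebpageSummarizer()
title, summary = summarizer.summarize_webpage('https://example.com/article', 0.2)
print(title)
print(summary)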
Example #6
def click_ref_link(username: str, counter: int):
    try:
        url: str = f"https://ref.moneyguru.co/{username}"
        browser = StatefulBrowser()
        browser.open(url)
        if browser.url == "https://moneyguru.co":
            print(f'{counter}: clicked', url)
    except requests.exceptions.ConnectionError:
        print('You have a network connection problem')
Example #7
def get_network_fee():  # web3.py suggests 520 gwei, which is too much
    """
    Give an estimate of network fee for a simple ether transaction.
    from http://gasprice.dopedapp.com/
    :return: network cost
    """
    br = StatefulBrowser(user_agent="Firefox")
    page = br.open("http://gasprice.dopedapp.com/")
    response = page.json()
    gwei_price = float(response["safe_price_in_gwei"])
    return gwei_price * GWEI_TO_ETHER * NB_GAS_FOR_TRANSACTION
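A usage sketch; GWEI_TO_ETHER and NB_GAS_FOR_TRANSACTION are module constants assumed from context:

# Estimated fee for a simple ether transaction, in ether.
print(get_network_fee())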
Example #8
def isbn2url(isbn: str) -> Optional[str]:
    """Return the ketab.ir book-url for the given isbn."""
    browser = StatefulBrowser(user_agent=USER_AGENT)
    browser.open('http://www.ketab.ir/Search.aspx')
    browser.select_form()
    browser['ctl00$ContentPlaceHolder1$TxtIsbn'] = isbn
    browser.submit_selected()
    first_link = browser.get_current_page().select_one('.HyperLink2')
    if first_link is None:
        return
    return browser.absolute_url(first_link['href'])
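A usage sketch, assuming USER_AGENT is defined elsewhere in the module and ketab.ir is reachable:

# Hypothetical ISBN; prints the book URL, or None when nothing matches.
print(isbn2url('9789640000000'))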
Example #10
    def __init__(self, target_session, target_args):
        self._target_session = target_session
        self._browser = StatefulBrowser(session=target_session)
        self._args = target_args
        self._target_url = None
        self._username_password = None
        self._form = None
Example #11
    def _re_login_with_form(self):

        browser = StatefulBrowser(session=self._target_session)

        self._do_login(browser, self._target_url, self._username_password, self._form)

        return browser
Example #12
    def extract_info(cls, url):
        """
        Extracts amount and BitCoin address from a UndergroundPrivate payment URL.
        :param url: the URL like https://spectrocoin.com/en/order/view/1045356-0X6XzpZi.html
        :return: a tuple of the amount in BitCoin along with the address
        """
        user_agent = UserAgent(
            fallback=
            "Mozilla/5.0 (X11; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0"
        )
        browser = StatefulBrowser(user_agent=user_agent.random)
        browser.open(url)
        soup = browser.get_current_page()

        amount = soup.select_one('div.payAmount').text.split(" ")[0]
        address = soup.select_one('div.address').text
        return PaymentInfo(float(amount), address)
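A hedged usage sketch; the owning class name is assumed here, and the URL is the example from the docstring:

# extract_info is a classmethod, so it can be called on the class itself.
info = UndergroundPrivate.extract_info(
    'https://spectrocoin.com/en/order/view/1045356-0X6XzpZi.html')
print(info)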
Example #13
def search_postcode(searchterm: str):
    """ this does a search against the Australia Post site to
    grab postcode/suburb/state data

    params.searchterm: the search string
    type.searchterm: str

    :returns: postcode=int, suburb=str, state=str
    :rtype: dict

    """
    browser = StatefulBrowser(soup_config={'features': 'lxml'})
    # Uncomment for a more verbose output:
    # browser.set_verbose(2)

    # build the URL for search
    searchurl = "https://auspost.com.au/postcode/{}".format(
        searchterm.replace(' ', '%20'))

    # grab the page
    try:
        browser.open(searchurl)
        # get the page contents
        page = browser.get_current_page()
        # find the lis within the ol
        lis = page.find_all('ol')[0].find_all('li')
        # pull out the data
        data_lis = [li for li in lis if 'id="result' in str(li)]
        for list_element in data_lis:
            # this is the data found in June 2017
            #<span class="suburb-map-postcode">POSTCODE</span>
            #<h2>SUBURB, STATE</h2>
            postcode = list_element.find_all('span')
            if postcode:
                postcode = postcode[0].contents[0]
                secondfield = list_element.find_all('h2')
                if secondfield:
                    suburb, state = secondfield[0].contents[0].split(",")
                    if postcode and state.strip() and suburb.strip():
                        yield {
                            'postcode': int(postcode),
                            'state': state.strip(),
                            'suburb': suburb.strip(),
                        }
    except Exception as error:
        raise ConnectionError(
            "Failed to open the url '{}'".format(searchurl)) from error
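Because search_postcode is a generator, iterate over it to drain the results; a sketch with a hypothetical search term:

for result in search_postcode('sydney'):
    print(result['postcode'], result['suburb'], result['state'])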
Example #14
    def get_browser(self):
        browser = StatefulBrowser()
        url = 'https://tdserebro.ru/login?changeCity=1526384'
        browser.open(url)
        browser.select_form('form.login_form')
        browser['_username'] = tdserebro_login
        browser['_password'] = tdserebro_password
        browser.submit_selected()

        self.browser = browser
Example #15
    def login(
            self,
            username,  # type: Text
            password,  # type: Text
            customer,  # type: Text
    ):  # type: (...) -> Connection
        browser = StatefulBrowser()
        response = browser.open(self.url)
        response.raise_for_status()
        assert 'login' in (browser.get_url() or '').lower()

        browser.select_form('#loginForm')
        browser['UserName'] = username
        browser['Password'] = password
        browser['CustomerIdentifier'] = customer
        browser['Language'] = 'ENG'
        response = browser.submit_selected()
        response.raise_for_status()

        html_response = HtmlResponse.from_response(response)
        if html_response.soup.find(id='loginForm'):
            # Still on the login form, so login must have failed
            raise LoginFailed('Login failed')

        return Connection(browser, self)
Example #16
def create_browser():
    browser = StatefulBrowser()

    browser.open(f'{main_url}/wp-login.php')

    browser.select_form('form[name="loginform"]')
    browser['log'] = login
    browser['pwd'] = password

    browser.submit_selected()

    return browser
Example #17
def get_current_usd_to_cny():
    """
    Get the current China mainland bank transfer buying rate for USD to CNY.

    Casting the returned objects to string ensures they do not inadvertently
    contain a BS4 object, which presents pickling difficulties that can be hard
    to debug. If you use newt.db, scraped objects can be tested with newt.db's
    jsonpickle `dumps` function to ensure the object can be both pickled and
    serialized to json by newt.db and indexed/saved in a PostgreSQL jsonb field.

    :return: str(): rate in CNY, str(): time string
    """
    browser = StatefulBrowser()
    browser.open('http://www.boc.cn/sourcedb/whpj/enindex.html')
    trs = browser.get_current_page().find_all("tr")
    cells = _get_usd_row_cells(trs)
    rate = cells[0].text
    time = cells[5].text
    time = time.split()
    time = time[0] + ' ' + time[1]
    return str(rate), str(time)
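The str() casts in the return statement matter because BS4 objects can be hard to pickle; a quick sanity check, as a sketch:

import pickle

rate, time_str = get_current_usd_to_cny()
# Both values should be plain strings, so pickling must succeed.
assert isinstance(rate, str) and isinstance(time_str, str)
pickle.dumps((rate, time_str))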
Example #18
def scrape_HTML(url):
    """Scrapes the HTML from W4MPJobs"""
    browser = StatefulBrowser()
    page = browser.open(url)
    form = Form(page.soup.form)

    # Selects all on the number of results radio button
    # Radio values are compared against string attributes, so use "9999"
    number_results_data = {"ctl00$MainContent$RadioButtonList2": "9999"}
    form.set_radio(number_results_data)

    # Selects NWM or more on salary radio button
    salary_data = {"ctl00$MainContent$rblSalary": "nmwormore"}
    form.set_radio(salary_data)

    # Selects outside London on the location radio button – other options commented out
    location_data = {"ctl00$MainContent$rblJobs": "outside"}
    # location_data = {"ctl00$MainContent$rblJobs": "inlondon"}
    # location_data = {"ctl00$MainContent$rblJobs": "both"}
    form.set_radio(location_data)

    # Submits the form
    response = browser.submit(form, page.url)

    # Gets response as text
    response = response.text

    # Closes the browser
    browser.close()

    return response
Example #19
    def test_is_logged_in(self, requests_mock):
        requests_mock.get(TEST_URL + '/Account/Login',
                          text=(HTML / 'login.html').read_text())
        requests_mock.post(
            TEST_URL + '/Account/Login',
            cookies={'.ASPXAUTH': 'XXX'},
            text=(HTML / 'home.html').read_text(),
        )
        requests_mock.get(
            TEST_URL + '/Account/LogOff',
            cookies={'.ASPXAUTH': None},
            text=(HTML / 'login.html').read_text(),
        )
        browser = StatefulBrowser()
        session = Session(TEST_URL, browser)
        assert not session.is_logged_in
        session.log_in('joe.bloggs', 'abc123')

        # The ``requests-mock`` library currently doesn't mock cookies in sessions properly.
        # In the meantime, mock the cookie by directly setting it on the ``browser`` object.
        # https://github.com/jamielennox/requests-mock/issues/17
        browser.get_cookiejar().set_cookie(
            create_cookie(name='.ASPXAUTH', value='XXX'))

        assert session.is_logged_in
        session.log_out()

        # As above.
        browser.get_cookiejar().set_cookie(
            create_cookie(name='.ASPXAUTH', value=None))

        assert not session.is_logged_in
Example #20
def retreive_download_url(url, filename=None):
    '''
        Retrieve file URLs from the page body.

        If filename is given, filter by it.
    '''

    try:
        br = StatefulBrowser()
        response = br.open(url)
        soup = response.soup
        search_tag = soup.find('ul', {'class': 'resource-list'})
        title = soup.find('h1', {'itemprop': 'name'}).text.strip()

        if filename is None:
            urls = many_files(search_tag)
        else:
            urls = single_file(search_tag, filename)
        return title, urls

    except Exception as e:
        raise Exception('Bad URL') from e
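A usage sketch with a hypothetical dataset URL; many_files and single_file are helpers defined elsewhere in the module:

title, urls = retreive_download_url('https://data.example.org/dataset/some-dataset')
for file_url in urls:
    print(title, file_url)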
Example #21
    def set_browser(self):
        user_agents = (
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1'
            ' (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/534.50'
            ' (KHTML, like Gecko) Version/5.1 Safari/534.50',
            'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
            'Opera/9.99 (Windows NT 5.1; U; pl) Presto/9.9.9',
            'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-US)'
            ' AppleWebKit/530.5 (KHTML, like Gecko) Chrome/ Safari/530.5',
            'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.2'
            ' (KHTML, like Gecko) Chrome/6.0',
            'Mozilla/5.0 (Windows; U; Windows NT 6.1; pl; rv:1.9.1)'
            ' Gecko/20090624 Firefox/3.5 (.NET CLR 3.5.30729)')
        session = req_session()
        session.headers.update({'Referer': 'https://www.deviantart.com/'})
        if self.mature:
            session.cookies.update({'agegate_state': '1'})
        session.mount('https://', req_adapters.HTTPAdapter(max_retries=3))

        self.browser = StatefulBrowser(session=session,
                                       user_agent=choice(user_agents))
Example #22
def creaBrowser(config=Namespace()):
    browser = StatefulBrowser(soup_config={'features': "html.parser"},
                              raise_on_404=True,
                              user_agent="SMparser",
                              )

    if 'verbose' in config:
        browser.set_verbose(config.verbose)

    if 'debug' in config:
        browser.set_debug(config.debug)

    return browser
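A usage sketch; argparse.Namespace supports the membership tests above because `in` checks attribute presence:

from argparse import Namespace

# Both flags are optional; omit them for a quiet browser.
browser = creaBrowser(Namespace(verbose=2, debug=True))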
Example #24
    def test_save_logs(self, requests_mock, tmp_path: Path):
        requests_mock.get(TEST_URL + '/Account/Login',
                          text=(HTML / 'login.html').read_text())
        requests_mock.post(
            TEST_URL + '/Account/Login',
            cookies={'.ASPXAUTH': 'XXX'},
            text=(HTML / 'home.html').read_text(),
        )
        requests_mock.get(
            TEST_URL + '/Account/LogOff',
            cookies={'.ASPXAUTH': None},
            text=(HTML / 'login.html').read_text(),
        )
        browser = StatefulBrowser()
        session = Session(TEST_URL, browser)
        session.log_in('joe.bloggs', 'abc123')

        # The ``requests-mock`` library currently doesn't mock cookies in sessions properly.
        # In the meantime, mock the cookie by directly setting it on the ``browser`` object.
        # https://github.com/jamielennox/requests-mock/issues/17
        browser.get_cookiejar().set_cookie(
            create_cookie(name='.ASPXAUTH', value='XXX'))

        session.log_out()

        # As above.
        browser.get_cookiejar().set_cookie(
            create_cookie(name='.ASPXAUTH', value=None))

        session.save_logs(str(tmp_path))
        files = list(sorted(tmp_path.iterdir()))
        assert len(files) == 3
        assert files[0].name.endswith('Z-login-0.txt')
        assert files[1].name.endswith('Z-home-0.txt')
        assert files[2].name.endswith('Z-logout-0.txt')
        with files[0].open() as log:
            assert isinstance(datetime.fromisoformat(next(log).strip()),
                              datetime)
            assert next(log).startswith('GET ' + TEST_URL)
            assert next(log).startswith('200 None')
            assert next(log) == '\n'
        assert files[0].read_text().endswith((HTML / 'login.html').read_text())
Example #25
    def login(self, url, username, password):
        """ Login in the given url with given username and password
        returns a browser instance which has an active logged in sesssion
        raises exception if any error occurs, or if credentials are wrong
        """

        logging.info("Logging in with the given credentials")
        try:
            # Create a browser instance
            b = Browser()
            b.open(url)
            b.select_form(nr=0)
            
            # Enter the username password
            b["login_user"] = username
            b["password"] = password

            # Login with given credentials
            response = b.submit_selected()
            self.verifyCredentials(response.soup)
        except Exception as e:
            logging.error(e)
            raise Exception("Login Failed: Invalid Credentials")
        return b
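A usage sketch; `scraper` stands in for an instance of the owning class, and the form field names must match the target site:

# Hypothetical URL and credentials.
browser = scraper.login('https://example.com/login', 'user', 's3cret')
page = browser.get_current_page()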
Example #26
def _get_network_cost(speed):
    br = StatefulBrowser(user_agent='Firefox')
    page = br.open('https://bitcoinfees.21.co/api/v1/fees/recommended')
    response = page.json()
    satoshirate = float(response[speed])
    return satoshirate
Example #27
def query_TRILEGAL(RA: float, Dec: float):
    """
    Begins TRILEGAL query.
    Args:
        RA, Dec: Coordinates of the target.
    Returns:
        output_url (str): URL of page with query results.
    """
    # fill out and submit online TRILEGAL form
    browser = StatefulBrowser()
    browser.open("http://stev.oapd.inaf.it/cgi-bin/trilegal_1.6")
    browser.select_form(nr=0)
    browser["gal_coord"] = "2"
    browser["eq_alpha"] = str(RA)
    browser["eq_delta"] = str(Dec)
    browser["field"] = "0.1"
    browser["photsys_file"] = "tab_mag_odfnew/tab_mag_TESS_2mass.dat"
    browser["icm_lim"] = "1"
    browser["mag_lim"] = "21"
    browser["binary_kind"] = "0"
    browser.submit_selected()
    print("TRILEGAL form submitted.")
    sleep(5)
    if len(browser.get_current_page().select("a")) == 0:
        browser = StatefulBrowser()
        browser.open("http://stev.oapd.inaf.it/cgi-bin/trilegal_1.5")
        browser.select_form(nr=0)
        browser["gal_coord"] = "2"
        browser["eq_alpha"] = str(RA)
        browser["eq_delta"] = str(Dec)
        browser["field"] = "0.1"
        browser["photsys_file"] = "tab_mag_odfnew/tab_mag_2mass.dat"
        browser["icm_lim"] = "1"
        browser["mag_lim"] = "21"
        browser["binary_kind"] = "0"
        browser.submit_selected()
        # print("TRILEGAL form submitted.")
        sleep(5)
        if len(browser.get_current_page().select("a")) == 0:
            print("TRILEGAL too busy, \
                using saved stellar populations instead.")
            return None
        else:
            this_page = browser.get_current_page()
            data_link = this_page.select("a")[0].get("href")
            output_url = "http://stev.oapd.inaf.it/" + data_link[3:]
            return output_url
    else:
        this_page = browser.get_current_page()
        data_link = this_page.select("a")[0].get("href")
        output_url = "http://stev.oapd.inaf.it/" + data_link[3:]
        return output_url
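A usage sketch with example coordinates; the call blocks for a few seconds while waiting for the form results:

# The coordinates are an arbitrary illustrative target.
output_url = query_TRILEGAL(RA=280.6817, Dec=-7.7865)
if output_url is not None:
    print('Results at', output_url)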
Example #28
def scrapeScene(filename, date, url):
    ret = []
    browser = StatefulBrowser(session=None)
    browser.open("https://ifeelmyself.com/public/main.php")
    cookie_obj = create_cookie(name='tags_popup_shown',
                               value='true',
                               domain='ifeelmyself.com')
    browser.session.cookies.set_cookie(cookie_obj)
    if url:
        debugPrint("Url found, using that to scrape")
        browser.open(url)
        response = browser.page
        table = response.find(
            class_=["blog_wide_news_tbl entry ppss-scene", "entry ppss-scene"])
        if table:
            ret = extract_info(table)
    else:
        debugPrint("Analyzing filename...")
        artist_id_match = re.search(r"(f\d{3,5})", filename, re.I)
        if artist_id_match:
            artist_id = artist_id_match.group(0)
            video_id = re.search(r"-(\d+)", filename, re.I).group(1)
            browser.open("https://ifeelmyself.com/public/main.php?page=search")
            browser.select_form()
            browser['keyword'] = artist_id
            browser['view_by'] = "news"
            browser.submit_selected()
            response = browser.page
            debugPrint("Searching for video_id")
            debugPrint(artist_id + "-" + video_id)
            tables = response.find_all(class_=[
                "blog_wide_news_tbl entry ppss-scene", "entry ppss-scene"
            ])
            for table in tables:
                img = str(table.find("img")['src'])
                debugPrint(f"Image:{str(img)}")
                if (f"/{video_id}/{artist_id}-" in img) and img.endswith(
                    ("vg.jpg", "hs.jpg")):
                    debugPrint("Found a single match video!")
                    # Extract data from this single result
                    ret = extract_info(table)
                    break
            else:
                sys.stderr.write("0 matches found! Checking offset")
                pages = int(
                    response.find_all("a",
                                      class_="pagging_nonsel")[-1].get_text())
                if pages:
                    for offset in range(10, pages * 10, 10):
                        browser.open(
                            "https://ifeelmyself.com/public/main.php?page=search_results&offset="
                            + str(offset))
                        response = browser.page
                        tables = response.find_all(class_=[
                            "blog_wide_news_tbl entry ppss-scene",
                            "entry ppss-scene"
                        ])
                        for table in tables:
                            img = str(table.find("img"))
                            debugPrint(f"Image:{img}")
                            if (f"/{video_id}/{artist_id}-"
                                    in img) and img.endswith(
                                        ("vg.jpg", "hs.jpg")):
                                ret = extract_info(table)
                                break
                else:
                    sys.stderr.write("0 matches found!, check your filename")

        else:
            debugPrint("Name changed after downloading")
            filename = filename.lower()
            extract_from_filename = re.match(
                r"^([0-9\.]{6,10})?(?P<title>.+)\s(?P<artist>\w+)(\.mp4)?$",
                filename)
            if extract_from_filename:
                title = extract_from_filename.group('title')
                #if date:
                #    date_dbY = datetime.strptime(date, '%d.%m.%Y').date().strftime('%d %b %Y')
                #    month = datetime.strptime(date, '%d.%m.%Y').date().strftime('%B')
                #    year = datetime.strptime(date, '%d.%m.%Y').date().strftime('%Y')
                #    debugPrint("Date: "+date_dbY)
                if title:
                    title = title.lower().replace("ifeelmyself", "")
                    title = title.replace("-", "")
                    title = title.replace("by", "")
                    debugPrint(f"Title: {title}")
                browser.open(
                    "https://ifeelmyself.com/public/main.php?page=search")
                browser.select_form()
                debugPrint("Searching..")
                browser['keyword'] = title
                browser['view_by'] = "news"
                browser.submit_selected()
                response = browser.page
                #Obtaining and counting the results. Ideally you only have a single result
                matches = response.find_all(
                    "a", href='javascript:;'
                )  #This a href javascript contains all the titles
                if len(matches) == 1:
                    debugPrint("Found a single match!")
                    table = response.find(class_=[
                        "blog_wide_news_tbl entry ppss-scene",
                        "entry ppss-scene"
                    ])
                else:
                    if len(matches) == 0:
                        sys.stderr.write("0 matches found! Check filename")
                        print("{}")
                        sys.exit()
                    if len(matches) > 1:
                        debugPrint(
                            "Multiple videos found, maybe refine search term?")
                        index = [
                            i for i, s in enumerate(matches) if title in str(s)
                        ]
                        tables = response.find_all(class_=[
                            "blog_wide_news_tbl entry ppss-scene",
                            "entry ppss-scene"
                        ])
                        table = tables[0]  #Getting first
                if table:
                    ret = extract_info(table)
            else:
                debugPrint("Not a supported filename")
                print("{}")
                sys.exit()
    return ret
Example #29
class Dagr:
    """deviantArt gallery ripper class"""

    NAME = basename(__file__)
    __version__ = "0.71.3"
    MAX_DEVIATIONS = 1000000  # max deviations
    ART_PATTERN = (r"https://www\.deviantart\.com/"
                   r"[a-zA-Z0-9_-]*/art/[a-zA-Z0-9_-]*")

    def __init__(self):
        # Internals
        self.init_mimetypes()
        self.browser = None
        self.errors_count = dict()

        # Configuration
        self.directory = getcwd() + "/"
        self.mature = False
        self.overwrite = False
        self.reverse = False
        self.test_only = False
        self.verbose = False

        # Current status
        self.deviant = ""

    def init_mimetypes(self):
        mimetypes_init()
        # These MIME types may be missing from some systems
        add_mimetype('image/vnd.adobe.photoshop', '.psd')
        add_mimetype('image/photoshop', '.psd')
        add_mimetype('application/rar', '.rar')
        add_mimetype('application/x-rar-compressed', '.rar')
        add_mimetype('application/x-rar', '.rar')
        add_mimetype('image/x-canon-cr2', '.tif')
        add_mimetype('application/x-7z-compressed', '.7z')
        add_mimetype('application/x-lha', '.lzh')

    def load_configuration(self):
        my_conf = configparser.ConfigParser()
        # Try to read global then local configuration
        my_conf.read([expanduser("~/.config/dagr/dagr_settings.ini"),
                      path_join(getcwd(), "dagr_settings.ini")])
        if my_conf.has_option("DeviantArt", "MatureContent"):
            self.mature = my_conf.getboolean("DeviantArt", "MatureContent")
        if my_conf.has_option("Dagr", "OutputDirectory"):
            self.directory = abspath(
                expanduser(my_conf.get("Dagr", "OutputDirectory"))
                ) + "/"

    def start(self):
        if not self.browser:
            # Set up fake browser
            self.set_browser()

    def set_browser(self):
        user_agents = (
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1'
            ' (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/534.50'
            ' (KHTML, like Gecko) Version/5.1 Safari/534.50',
            'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
            'Opera/9.99 (Windows NT 5.1; U; pl) Presto/9.9.9',
            'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-US)'
            ' AppleWebKit/530.5 (KHTML, like Gecko) Chrome/ Safari/530.5',
            'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.2'
            ' (KHTML, like Gecko) Chrome/6.0',
            'Mozilla/5.0 (Windows; U; Windows NT 6.1; pl; rv:1.9.1)'
            ' Gecko/20090624 Firefox/3.5 (.NET CLR 3.5.30729)'
        )
        session = req_session()
        session.headers.update({'Referer': 'https://www.deviantart.com/'})
        if self.mature:
            session.cookies.update({'agegate_state': '1'})
        session.mount('https://', req_adapters.HTTPAdapter(max_retries=3))

        self.browser = StatefulBrowser(session=session,
                                       user_agent=choice(user_agents))

    def get(self, url, file_name=None):
        if (file_name and not self.overwrite and
                glob(file_name + ".*")):
            print(glob(file_name + ".*")[0] + " exists - skipping")
            return None

        if isinstance(url, Tag):
            # Download and save soup links
            get_resp = self.browser.download_link(url, file_name)
        else:
            # Direct URL
            get_resp = self.browser.session.get(url)
            if file_name:
                with open(file_name, "wb") as local_file:
                    local_file.write(get_resp.content)

        if get_resp.status_code != req_codes.ok:
            raise DagrException("incorrect status code - " +
                                str(get_resp.status_code))

        if file_name is None:
            return get_resp.text

        if get_resp.headers.get("last-modified"):
            # Set file dates to last modified time
            mod_time = mktime(parsedate(get_resp.headers.get("last-modified")))
            utime(file_name, (mod_time, mod_time))

        if get_resp.headers.get("content-type"):
            content_type = get_resp.headers.get("content-type").split(";")[0]
            file_ext = guess_extension(content_type)
            if file_ext:
                rename(file_name, file_name + file_ext)
            else:
                raise DagrException('unknown content-type - ' + content_type)

        return file_name

    def find_link(self, link):
        filelink = None
        filename = basename(link)
        mature_error = False
        self.browser.open(link)
        # Full image link (via download link)
        link_text = re.compile("Download( (Image|File))?")
        img_link = None
        for candidate in self.browser.links("a"):
            if link_text.search(candidate.text) and candidate.get("href"):
                img_link = candidate
                break

        if img_link and img_link.get("data-download_url"):
            return (filename, img_link)

        if self.verbose:
            print("Download link not found, falling back to direct image")

        current_page = self.browser.get_current_page()
        # Fallback 1: try meta (filtering blocked meta)
        filesearch = current_page.find("meta", {"property": "og:image"})
        if filesearch:
            filelink = filesearch['content']
            if basename(filelink).startswith("noentrythumb-"):
                filelink = None
                mature_error = True
        if not filelink:
            # Fallback 2: try collect_rid, full
            filesearch = current_page.find("img",
                                           {"collect_rid": True,
                                            "class": re.compile(".*full")})
            if not filesearch:
                # Fallback 3: try collect_rid, normal
                filesearch = current_page.find("img",
                                               {"collect_rid": True,
                                                "class":
                                                    re.compile(".*normal")})
            if filesearch:
                filelink = filesearch['src']

        title_span = current_page.find("span", {"itemprop": "title"})
        if title_span is not None and title_span.text == "Literature":
            filelink = self.browser.get_url()
            return (filename, filelink)

        if not filelink:
            if mature_error:
                if self.mature:
                    raise DagrException("maybe not an image")
                else:
                    raise DagrException("maybe a mature deviation/" +
                                        "not an image")
            else:
                raise DagrException("all attemps to find a link failed")

        return (filename, filelink)

    def handle_download_error(self, link, link_error):
        error_string = str(link_error)
        print("Download error (" + link + ") : " + error_string)
        if error_string in self.errors_count:
            self.errors_count[error_string] += 1
        else:
            self.errors_count[error_string] = 1

    def get_pages(self, mode, base_url):
        pages = []
        for i in range(0, int(Dagr.MAX_DEVIATIONS / 24), 24):
            html = ""
            url = base_url + str(i)

            try:
                html = self.get(url)
            except DagrException:
                print("Could not find " + self.deviant + "'s " + mode)
                return pages

            prelim = re.findall(Dagr.ART_PATTERN, html,
                                re.IGNORECASE | re.DOTALL)

            for match in prelim:
                if match not in pages:
                    pages.append(match)

            done = re.findall("(This section has no deviations yet!|"
                              "This collection has no items yet!)",
                              html, re.IGNORECASE | re.S)

            if done:
                break

            print(self.deviant + "'s " + mode + " page " +
                  str(int((i / 24) + 1)) + " crawled...")

        if not self.reverse:
            pages.reverse()

        return pages

    def get_images(self, mode, mode_arg, pages):
        base_dir = self.directory + self.deviant + "/" + mode
        if mode_arg:
            base_dir += "/" + mode_arg

        try:
            da_make_dirs(base_dir)
        except OSError as mkdir_error:
            print(str(mkdir_error))
            return

        # Find previously downloaded pages
        existing_pages = []
        try:
            with open(base_dir + "/.dagr_downloaded_pages", "r") as filehandle:
                existing_pages = json.load(filehandle)
        except FNF_ERROR:
            # May not exist (new directory, ...)
            pass
        if not self.overwrite:
            pages = [x for x in pages if x not in existing_pages]

        print("Total deviations to download: " + str(len(pages)))
        for count, link in enumerate(pages, start=1):
            if self.verbose:
                print("Downloading " + str(count) + " of " +
                      str(len(pages)) + " ( " + link + " )")
            filename = ""
            filelink = ""
            try:
                filename, filelink = self.find_link(link)
            except (KeyboardInterrupt, SystemExit):
                raise
            except DagrException as link_error:
                self.handle_download_error(link, link_error)
                continue

            if not self.test_only:
                try:
                    self.get(filelink, base_dir + "/" + filename)
                except DagrException as get_error:
                    self.handle_download_error(link, get_error)
                    continue
                else:
                    if link not in existing_pages:
                        existing_pages.append(link)
            else:
                print(filelink)

        # Update downloaded pages cache
        with open(base_dir + "/.dagr_downloaded_pages", "w") as filehandle:
            json.dump(existing_pages, filehandle)

    def deviant_get(self, mode, mode_arg=None):
        print("Ripping " + self.deviant + "'s " + mode + "...")

        base_url = "https://www.deviantart.com/" + self.deviant.lower() + "/"

        if mode == "favs":
            base_url += "favourites/?catpath=/&offset="
        elif mode == "collection":
            base_url += "favourites/" + mode_arg + "?offset="
        elif mode == "scraps":
            base_url += "gallery/?catpath=scraps&offset="
        elif mode == "gallery":
            base_url += "gallery/?catpath=/&offset="
        elif mode == "album":
            base_url += "gallery/" + mode_arg + "?offset="
        elif mode == "query":
            base_url += "gallery/?q=" + mode_arg + "&offset="
        elif mode == "category":
            base_url += "gallery/?catpath=" + mode_arg + "&offset="

        pages = self.get_pages(mode, base_url)
        if not pages:
            print(self.deviant + "'s " + mode + " had no deviations.")
            return
        print("Total deviations in " + self.deviant + "'s " +
              mode + " found: " + str(len(pages)))

        self.get_images(mode, mode_arg, pages)

        print(self.deviant + "'s " + mode + " successfully ripped.")

    def group_get(self, mode):
        print("Ripping " + self.deviant + "'s " + mode + "...")

        base_url = 'https://www.deviantart.com/' + self.deviant.lower() + '/'
        if mode == "favs":
            base_url += "favourites/"
        elif mode == "gallery":
            base_url += "gallery/"

        folders = []

        i = 0
        while True:
            html = self.get(base_url + '?offset=' + str(i))
            k = re.findall('class="ch-top" href="' + base_url +
                           '([0-9]*/[a-zA-Z0-9_-]*)"',
                           html, re.IGNORECASE)
            if k == []:
                break

            new_folder = False
            for match in k:
                if match not in folders:
                    folders.append(match)
                    new_folder = True
            if not new_folder:
                break
            i += 10

        # no repeats
        folders = list(set(folders))

        if not folders:
            print(self.deviant + "'s " + mode + " is empty.")

        print("Total folders in " + self.deviant + "'s " +
              mode + " found: " + str(len(folders)))

        if self.reverse:
            folders.reverse()

        pages = []
        for folder in folders:
            label = folder.split("/")[-1]
            print("Crawling folder " + label + "...")
            pages = self.get_pages(mode, base_url + folder + '?offset=')

            if not self.reverse:
                pages.reverse()

            self.get_images(mode, label, pages)

        print(self.deviant + "'s " + mode + " successfully ripped.")

    def print_errors(self):
        if self.errors_count:
            print("Download errors count:")
            for error in self.errors_count:
                print("* " + error + " : " + str(self.errors_count[error]))
Example #30
class UserScraper(object):
    """
    Scrapes fakeaddressgenerator.com for fake user data.
    It also adds some basic additional information for server configuration.
    """

    attributes = [
        'Full Name', 'Street', 'City', 'State Full', 'Zip Code',
        'Phone Number', 'Company', 'Username'
    ]

    pages = {
        'NL':
        'http://www.fakeaddressgenerator.com/World/Netherlands_address_generator',
        'US': 'http://www.fakeaddressgenerator.com/World/us_address_generator',
        'UK': 'http://www.fakeaddressgenerator.com/World/uk_address_generator',
        'CA': 'http://www.fakeaddressgenerator.com/World/ca_address_generator',
    }

    def __init__(self, country='NL'):
        self.country_code = country
        self.browser = StatefulBrowser()
        self.page = UserScraper.pages.get(country)

    def get_user(self):
        self.browser.open(self.page)
        attrs = {}

        for attr in self.attributes:
            attrs[attr] = self._get_attribute(attr)

        attrs['country_code'] = self.country_code
        attrs['password'] = ''.join(
            random.choice(string.ascii_letters + string.digits)
            for _ in range(12))
        attrs['email'] = 'authentic8989+' + attrs['Username'] + '@gmail.com'
        attrs['rootpw'] = attrs['password']
        attrs['ns1'] = 'ns1'
        attrs['ns2'] = 'ns2'
        attrs['hostname'] = attrs['Username'] + '.hostname.com'
        attrs['testnet'] = 'off'

        return self._map_to_config(attrs)

    @staticmethod
    def _map_to_config(attrs):
        config = {}
        # Treat full name separately because it needs to be split
        if 'Full Name' in attrs:
            config['user'] = {}
            config['user']['firstname'] = attrs['Full Name'].split('\xa0')[0]
            config['user']['lastname'] = attrs['Full Name'].split('\xa0')[-1]

        # Map the possible user attributes to their config names and sections
        mapping = {
            'Street': ('address', 'address'),
            'City': ('address', 'city'),
            'State Full': ('address', 'state'),
            'Zip Code': ('address', 'zipcode'),
            'Phone Number': ('user', 'phonenumber'),
            'Company': ('user', 'companyname'),
            'Username': ('user', 'username'),
            'country_code': ('address', 'countrycode'),
            'password': ('user', 'password'),
            'email': ('user', 'email'),
            'rootpw': ('server', 'root_password'),
            'ns1': ('server', 'ns1'),
            'ns2': ('server', 'ns2'),
            'hostname': ('server', 'hostname'),
            'testnet': ('user', 'testnet')
        }

        for attr in attrs.keys():
            if attr in mapping.keys():
                section, key = mapping[attr]
                if section not in config:
                    config[section] = {}
                config[section][key] = attrs[attr]
        return config

    def _get_attribute(self, attribute):
        return self.browser.get_current_page() \
            .find(string=attribute) \
            .parent.parent.parent \
            .find('input') \
            .get('value')
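A usage sketch; get_user returns the nested config dict built by _map_to_config:

scraper = UserScraper(country='US')
config = scraper.get_user()
print(config['user']['username'], config['address']['city'])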
Example #31
class Session:
    BASE_URL = 'https://m.facebook.com'

    def __init__(self, browser_wrapper):
        self._connected = False
        self._current_html = None
        self._browser_wrapper = browser_wrapper
        self._browser = StatefulBrowser()
        # Set default headers on the underlying requests session;
        # StatefulBrowser has no addHeaders attribute.
        self._browser.session.headers.update({
            'User-Agent': 'Firefox',
            'Accept-Language': 'en-US,en;q=0.5',
        })

    def __del__(self):
        self._dispose()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self._dispose()

    @property
    def connected(self):
        return self._connected

    def log_in(self, username, password):
        try:
            # Log in to non-mobile site is more reliable
            self._browser_wrapper.open(self._browser,
                                       'https://www.facebook.com')
            self._browser.select_form('form[id="login_form"]')
            self._browser['email'] = username
            self._browser['pass'] = password
            self._browser_wrapper.submit_selected(self._browser)
            # Check if we really are in account profile page
            if self._browser.get_current_page().find('form',
                                                     action='/search/top/'):
                self._connected = True
        except Exception as error:
            raise LogInError(f'Unable to log in as {username}') from error
        return self

    def log_out(self):
        if self._connected:
            self._browser.close()
            self._connected = False

    def profile_info(self, id_):
        """Retrieve informations for a given profile."""
        self._ensure_connected()
        try:
            self._browser_wrapper.open(self._browser,
                                       f'{Session.BASE_URL}/{id_}')
            name = self._sanitize_title(
                self._browser.get_current_page().find('title').text)
            image = parse_image(self._browser.get_current_page(), name)
            info = parse_info(self._browser.get_current_page())
            return name, image, info
        except Exception:
            return None

    def search(self, query):
        """
        Execute search of a given text returning a tuple with ID,
        descriptions and URI.
        """
        url_query = '+'.join(query.split())
        url_path = f'/search/top/?q={url_query}' \
            if self._connected else f'/public/{url_query}'
        try:
            # url_path already embeds the query, so don't append it again
            self._browser_wrapper.open(
                self._browser, f'{Session.BASE_URL}{url_path}')
            return parse_search(self._browser.get_current_page(),
                                Session.BASE_URL)
        except Exception:
            return None

    def _ensure_connected(self):
        if not self._connected:
            raise NotConnectedError('No active connection or required login')

    def _sanitize_title(self, title):
        # Handle cases like 'Some One - Home'
        if '-' in title:
            return title.split('-')[0].strip()
        return title

    def _dispose(self):
        if self._connected:
            self.log_out()
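Because the class defines __enter__/__exit__, it can be used as a context manager; a sketch with a hypothetical browser wrapper and credentials:

with Session(browser_wrapper) as session:  # browser_wrapper defined elsewhere
    session.log_in('user@example.com', 's3cret')
    results = session.search('john doe')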
Example #32
    def __init__(self, country='NL'):
        self.country_code = country
        self.browser = StatefulBrowser()
        self.page = UserScraper.pages.get(country)
Example #33
from mechanicalsoup import StatefulBrowser

home_url = "http://testing.chandrashekar.info/"

username = "******"
password = "******"

login_url = "http://testing.chandrashekar.info/wp-login.php"

logged_in_url = "http://testing.chandrashekar.info/wp-admin/"

add_new_post_url = "http://testing.chandrashekar.info/wp-admin/post-new.php"

browser = StatefulBrowser()

browser.open(login_url)
assert browser.get_url() == login_url

browser.select_form()
browser["log"] = username
browser["pwd"] = password
browser.submit_selected()
assert browser.get_url() == logged_in_url
print(browser.get_url())

browser.follow_link("post-new.php")
assert browser.get_url() == add_new_post_url
print(browser.get_url())