Example #1
    def dic_parse(self, session, url, html):
        def innerHtml(ele):
            return ele.decode_contents(formatter="html")

        soup = BeautifulSoup(html, "lxml")
        ticker = self.url_ticker_pat.search(url).group(1)
        exchange = "TSX"

        on_yahoo = soup.find(
            'section', attrs={'data-test': 'lookup-page'}) is None
        session.query(Listings).filter(
            Listings.exchange == exchange,
            Listings.ticker == ticker).update({Listings.onyahoo: on_yahoo})

        if not on_yahoo:  # if quote not found, exit
            LOGGER.error("Failed to find quote for {} skipping".format(url))
            return

        div_test = soup.find('section', attrs={'data-test': 'qsp-statistics'})
        if div_test is None:
            LOGGER.error("Unknown error for {} skipping".format(url))
            return

        db_dic = {}
        for table in div_test.find_all('table'):
            for row in table.find_all('tr'):
                td_list = row.find_all('td')
                title = innerHtml(td_list[0].find('span'))
                span = td_list[1].find('span')
                val = innerHtml(td_list[1]) if span is None else innerHtml(span)
                if title in self.y_to_db_map:
                    db_dic[self.y_to_db_map[title]] = self.parse_numeric(val)

        if db_dic:
            db_dic["ticker"] = ticker
            db_dic["exchange"] = exchange
            exists = session.query(KeyStatistics).filter_by(
                **db_dic).scalar() is not None

            if exists:
                LOGGER.info("Skipping {} due to prior existence".format(url))
            else:
                db_dic["update_date"] = self.today

                stmt = insert(KeyStatistics).values(db_dic).on_conflict_do_nothing(
                    constraint='key_statistics_pkey')
                session.execute(stmt)
                session.commit()

                LOGGER.info("Done parsing {}".format(url))
        else:
            LOGGER.info("Skipping {}".format(url))
Example #2
    def dic_parse(self, db, url, html):
        def innerHtml(ele):
            return ele.decode_contents(formatter="html")

        soup = BeautifulSoup(html, "lxml")
        ticker = self.url_ticker_pat.search(url).group(1)
        exchange = "TSX"

        on_yahoo = soup.find(
            'div', attrs={'data-test': 'unknown-quote'}) is None
        db.update("listings", ["onyahoo"], [on_yahoo],
                  "exchange=%s AND ticker=%s", [exchange, ticker])

        if not on_yahoo:  # if quote not found, exit
            LOGGER.error("Failed to find quote for", url, "skipping")
            return

        div_test = soup.find('div', attrs={'data-test': 'qsp-statistics'})
        if div_test is None:
            LOGGER.error("Unknown error for", url, "skipping")
            return

        db_dic = {}
        for table in div_test.find_all('table'):
            for row in table.find_all('tr'):
                td_list = row.find_all('td')
                title = innerHtml(td_list[0].find('span'))
                span = td_list[1].find('span')
                val = innerHtml(td_list[1]) if span is None else innerHtml(span)
                if title in self.y_to_db_map:
                    db_dic[self.y_to_db_map[title]] = self.parse_numeric(val)

        if db_dic:
            db_dic["ticker"] = ticker
            db_dic["exchange"] = exchange
            col_names, vals = list(db_dic.keys()), list(db_dic.values())
            where = db.create_conditional_string(col_names)
            if db.exists("key_statistics", where, vals):
                LOGGER.info("Skipping {} due to prior existence".format(url))
            else:
                col_names.append("update_date")
                vals.append(self.today)
                db.insert_into("key_statistics",
                               col_names,
                               vals,
                               multiple=False)
                LOGGER.info("Done parsing {}".format(url))
        else:
            LOGGER.info("Skipping {}".format(url))
Example #3
def get_html(urlQ, callback, xpath_hooks):
    """
    This page takes a url from the URL Queue (urlQ) and
    calls a callbac that will handle the page source.

    xpage_hooks is a list used to determine when the page is loaded,
    see the docs for more details (e.g. ["//div[@data-test='whatever']"] ).
    """
    svr = webkit_server.Server()
    svrconn = webkit_server.ServerConnection(server=svr)
    driver = dryscrape.driver.webkit.Driver(connection=svrconn)

    sess = dryscrape.Session(driver=driver)
    sess.set_header(
        "User-Agent",
        "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36"
    )
    sess.set_attribute("auto_load_images", False)

    def valid_page_func():
        return any(sess.at_xpath(xpath) for xpath in xpath_hooks)
    session = Session()

    while not urlQ.empty():
        url = urlQ.get()

        try:
            sess.visit(url)
        except webkit_server.InvalidResponseError:
            LOGGER.error(
                "Invalid response from the server, skipping {}".format(url))
            continue

        try:
            sess.wait_for(valid_page_func, interval=1, timeout=15)
        except dryscrape.mixins.WaitTimeoutError:
            LOGGER.error("Timeout so skipping {}".format(url))
            continue

        response = sess.body()
        callback(session, url, response)
        sess.reset()

    svr.kill()
    session.close()
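A usage sketch for get_html, assuming the URLs are queued up front and a callback with the (session, url, html) signature that Example #1's dic_parse has; the URLs, xpath hooks, and the stand-in callback below are illustrative:

    # Illustrative driver for get_html; URLs and xpath hooks are assumptions
    # modelled on the selectors used in Example #1.
    from queue import Queue

    def print_page(session, url, html):
        # Stand-in callback; Example #1's dic_parse matches this signature.
        print(url, len(html))

    urlQ = Queue()
    for url in ("https://finance.yahoo.com/quote/RY.TO/key-statistics",
                "https://finance.yahoo.com/quote/TD.TO/key-statistics"):
        urlQ.put(url)

    xpath_hooks = ["//section[@data-test='qsp-statistics']",
                   "//section[@data-test='lookup-page']"]

    get_html(urlQ, print_page, xpath_hooks)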
Example #4
    def handle_url(self, tickers, url, exchange):
        """
        Fetches the url and inserts the data into the appropriate cols in the DB.
        """
        LOGGER.info("Starting to add url: {} ...".format(url))

        req = urllib.request.Request(url, headers=self.headers)
        resp = urllib.request.urlopen(req)
        csv_r = csv.reader(codecs.iterdecode(resp, 'utf-8'))

        db_list = []
        for row, ticker in zip(csv_r, tickers):
            assert len(row) == len(self.url_flags)

            db_dic = {db_col: self.handle_csv_string(cell) for cell, db_col in zip(row, self.db_entries)}

            onyahoo = any(v is not None for v in db_dic.values())

            self.session.query(Listings).filter(
                Listings.exchange == exchange,
                Listings.ticker == ticker).update({Listings.onyahoo: onyahoo})

            if not onyahoo: # not found, skip
                LOGGER.error("Failed to find quote for {} skipping".format(ticker))
                continue

            db_dic["ticker"] = ticker
            db_dic["exchange"] = exchange

            exists = self.session.query(YahooKeyStatistics).filter_by(**db_dic).scalar() is not None
            if exists:
                LOGGER.info("Skipping {} due to prior existence".format(ticker))
                continue

            db_dic["update_date"] = self.today

            # Annoyingly enough, sqlalchemy doesn't allow PostgreSQL bulk inserts
            # when checking constraints, RIP performance
            stmt = insert(YahooKeyStatistics).values(db_dic).on_conflict_do_nothing(
                constraint='yahoo_key_statistics_pkey')
            self.session.execute(stmt)
        self.session.commit()

        LOGGER.info("Done url.")