Example #1
def __crawl(driver):
    """Pop pages off the queue and collect song / non-song links from each."""
    while True:
        while len(subHtmlUrlQueue) > 0:
            url = subHtmlUrlQueue.popleft()
            if url in visitedUrl:
                continue
            try:
                visitedUrl[url] = 1
                logger.info("visit page %s", url)
                driver.get(url)
                # Wait up to 20s for the page to render its script tags
                # (Selenium 3 API; Selenium 4 uses find_elements(By.TAG_NAME, ...)).
                WebDriverWait(
                    driver,
                    20).until(lambda x: x.find_elements_by_tag_name("script"))
                driver.switch_to.frame("contentFrame")
                atags = driver.find_elements_by_tag_name("a")
                hrefs = [a.get_attribute("href") for a in atags]
                links = filter_link(hrefs)
                song_links = get_song_link(links)
                logger.debug("got %s song links", len(song_links))
                notsong_links = get_nonsong_link(links)
                songUrlQueue.extend(song_links)
                subHtmlUrlQueue.extend(notsong_links)
            except Exception:
                logger.error("failed to crawl %s", url, exc_info=True)
        else:
            # The while/else body runs each time the queue has drained.
            logger.info("empty page queue")
            time.sleep(1)
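
__crawl leans on module-level shared state that this excerpt doesn't show. A minimal sketch of that wiring, assuming Selenium with Chrome and a placeholder seed url; everything below except __crawl itself (and the filter_link / get_song_link / get_nonsong_link helpers it calls) is an assumption:

import logging
import time
from collections import deque

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait

logger = logging.getLogger(__name__)

# Hypothetical shared crawl state referenced by __crawl().
subHtmlUrlQueue = deque()  # pages still to visit
songUrlQueue = deque()     # song links found so far
visitedUrl = {}            # url -> 1 once visited

driver = webdriver.Chrome()
subHtmlUrlQueue.append("https://example.com/start")  # placeholder seed url
__crawl(driver)  # blocks forever, sleeping whenever the queue drains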
Example #2
def fetch_yahoo_responses() -> List[Tuple]:
    """Fetch financial statements from Yahoo for every ticker needing an update."""
    tickers: List[List] = []
    for model in [IncomeStatement, BalanceSheetStatement, CashFlowStatement]:
        tickers.append(
            fetch_isins_not_updated_financials(model))  # type: ignore
    # De-duplicate the (isin, ticker) pairs across the three statement models.
    tickers_unique: List[Tuple] = union_of_list_elements(*tickers)
    logger.info('Fetching financials for %s stocks', len(tickers_unique))
    responses: List[Tuple[Any, ...]] = []
    for ticker_tuple in tickers_unique:
        if len(ticker_tuple) != 2:  # skip malformed entries
            continue
        isin, yahoo_ticker = ticker_tuple
        try:
            response = fetch_yahoo_data(
                yahoo_ticker,
                'balanceSheetHistory,incomeStatementHistory,cashflowStatementHistory'
            )
            logger.info('Succeeded getting ticker, isin: %s, %s',
                        yahoo_ticker, isin)
        except Exception:
            logger.error('Something went wrong getting ticker, isin: %s, %s',
                         yahoo_ticker, isin)
            logger.error(format_exc())
            continue
        responses.append((response, isin))
    return responses
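
union_of_list_elements isn't shown in this excerpt; a hypothetical implementation consistent with how fetch_yahoo_responses calls it (an order-preserving, de-duplicating union) might look like this:

from typing import List, Tuple

def union_of_list_elements(*lists: List[Tuple]) -> List[Tuple]:
    # Hypothetical helper: order-preserving union of several tuple lists.
    seen = set()
    union: List[Tuple] = []
    for lst in lists:
        for item in lst:
            if item not in seen:
                seen.add(item)
                union.append(item)
    return union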
Example #3
def clear_db():
    """Delete all rows from the db."""
    con = sqlite3.connect(DB_PATH)
    try:
        # As a context manager, the connection commits on success
        # and rolls back on error.
        with con:
            cur = con.cursor()
            cur.execute('DELETE FROM advego;')
        logger.info("All rows were successfully deleted")
    except sqlite3.Error:
        logger.exception("Failed to clear db")
    finally:
        con.close()
Example #4
def insert_to_db(rows):
    """Write rows into the db."""
    con = sqlite3.connect(DB_PATH)
    try:
        with con:
            cur = con.cursor()
            cur.executemany(
                "INSERT INTO advego (filename, coef, is_cheat) VALUES (?, ?, ?)",
                rows)
        logger.info("Data inserted successfully")
    except sqlite3.Error:
        logger.exception("Failed to insert rows")
    finally:
        con.close()
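
insert_to_db expects an iterable of 3-tuples matching the (filename, coef, is_cheat) columns; a quick usage sketch with invented sample values:

rows = [
    ("article_001.txt", 0.42, False),
    ("article_002.txt", 0.91, True),
]
insert_to_db(rows)  # sqlite3 stores the booleans as 0/1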
Example #5
def create_db():
    """Create the db if it doesn't exist."""
    con = sqlite3.connect(DB_PATH)
    try:
        with con:
            cur = con.cursor()
            cur.executescript("""
                CREATE TABLE IF NOT EXISTS advego (
                    "id" INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
                    "filename" TEXT NOT NULL,
                    "coef" REAL NOT NULL,
                    "is_cheat" BOOL NOT NULL
                );
            """)
        logger.info("DB was created successfully")
    except sqlite3.Error:
        logger.exception("Failed to create db")
    finally:
        con.close()
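
Examples #3-#5 all assume the same module-level setup; a minimal sketch of it, with a placeholder path:

import logging
import sqlite3

DB_PATH = "advego.db"  # placeholder; the original defines DB_PATH elsewhere
logger = logging.getLogger(__name__)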
Example #6
def run():
    logger.info("Start")
    create_db()
    clear_db()
    with ThreadPoolExecutor(max_workers=cpu_count()) as executor:
        try:
            file_list = [
                os.path.join(TEXT_FILES_PATH, f)
                for f in os.listdir(TEXT_FILES_PATH) if f.endswith(".txt")
            ]
        except OSError as err:
            logger.exception("Can't find txt files")
            raise TxtDirectoryException(str(err)) from err
        # Parse files concurrently; leaving the with-block waits for all workers.
        for f in file_list:
            executor.submit(parse_file, f)

    insert_to_db(result_to_insert)
    logger.info("Finish")
Example #7
def load_data(data: List[Base]) -> None:
    """Merge records into the database, committing in chunks of 100."""
    if len(data) > 0:
        session = Session()
        for idx, record in enumerate(data):
            try:
                session.merge(record)
            except Exception:
                logger.error('Something went wrong: %s', record)
                logger.error(format_exc())
                continue
            logger.debug(record)
            if idx > 0 and idx % 100 == 0:
                session.commit()
                logger.info('Chunked commit at %s records', idx)
        # Commit whatever remains after the last full chunk.
        session.commit()
        logger.info('Final commit at %s records', idx + 1)
        session.close()
    else:
        logger.info('No data to load')
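
load_data assumes a module-level Session factory and a declarative Base (used in its type hint); a minimal sketch of that SQLAlchemy wiring, with a placeholder database url:

from sqlalchemy import create_engine
from sqlalchemy.orm import declarative_base, sessionmaker

engine = create_engine("sqlite:///financials.db")  # placeholder url
Session = sessionmaker(bind=engine)
Base = declarative_base()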
Example #8
def crawl_songs():
    """Pop song pages off the queue and persist any songs not yet stored."""
    while True:
        while len(songUrlQueue) > 0:
            song_link = songUrlQueue.popleft()
            if song_link in visitedUrl:
                continue
            logger.info("visit song %s", song_link)
            try:
                visitedUrl[song_link] = 1
                # The song id is everything after the "=" in the url.
                sid = song_link[song_link.find("=") + 1:]
                if songService.is_existed(sid):
                    logger.info("%s already exists", sid)
                    continue
                info = extract.getSongInfo(song_link, song_driver)
                if not info:
                    continue
                songService.add(info)
            except Exception:
                logger.error("failed to crawl %s", song_link, exc_info=True)
        else:
            # The while/else body runs each time the queue has drained.
            logger.info("empty song queue")
            time.sleep(1)
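
Examples #1 and #8 form a producer/consumer pair over the shared deques, so they are presumably meant to run side by side; one way to wire that up, assuming two separate drivers (page_driver and song_driver are placeholders):

import threading

page_thread = threading.Thread(target=__crawl, args=(page_driver,), daemon=True)
song_thread = threading.Thread(target=crawl_songs, daemon=True)
page_thread.start()
song_thread.start()
# deque.popleft()/extend() are thread-safe, so the loops can share the queues.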