def __crawl(driver):
    while True:
        while len(subHtmlUrlQueue) > 0:
            url = subHtmlUrlQueue.popleft()
            if url in visitedUrl:
                continue
            try:
                visitedUrl[url] = 1
                logger.info("visit page %s", url)
                driver.get(url)
                # Wait until the page's scripts are present before touching frames.
                WebDriverWait(driver, 20).until(
                    lambda x: x.find_elements_by_tag_name("script"))
                driver.switch_to.frame("contentFrame")
                atags = driver.find_elements_by_tag_name("a")
                hrefs = [a.get_attribute("href") for a in atags]
                # Materialize once so the links can be scanned twice below.
                links = list(filter_link(hrefs))
                song_links = get_song_link(links)
                logger.debug("got %s song links", len(song_links))
                notsong_links = get_nonsong_link(links)
                songUrlQueue.extend(song_links)
                subHtmlUrlQueue.extend(notsong_links)
            except Exception:
                logger.exception("failed to crawl page %s", url)
                continue
        logger.info("empty page queue")
        time.sleep(1)
def fetch_yahoo_responses() -> List[Tuple]:
    tickers: List[List] = []
    for model in [IncomeStatement, BalanceSheetStatement, CashFlowStatement]:
        tickers.append(fetch_isins_not_updated_financials(model))  # type: ignore
    # Deduplicate pairs that are stale for more than one statement type.
    tickers_unique: List[Tuple] = union_of_list_elements(*tickers)
    logger.info('Fetching financials for %s stocks', len(tickers_unique))
    responses: List[Tuple[Any, ...]] = []
    for ticker_tuple in tickers_unique:
        if len(ticker_tuple) != 2:
            continue
        isin, yahoo_ticker = ticker_tuple
        try:
            response = fetch_yahoo_data(
                yahoo_ticker,
                'balanceSheetHistory,incomeStatementHistory,cashflowStatementHistory')
            logger.info('Succeeded getting ticker, isin: %s, %s', yahoo_ticker, isin)
        except Exception:
            logger.error('Failed getting ticker, isin: %s, %s', yahoo_ticker, isin)
            logger.error(format_exc())
            continue
        responses.append((response, isin))
    return responses
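# A minimal sketch of the union_of_list_elements helper used above; it is not
# shown in this module, so this is an assumption: each argument is a list of
# (isin, ticker) tuples and result order does not matter.
def union_of_list_elements(*lists: List[Tuple]) -> List[Tuple]:
    # set.union deduplicates pairs that appear in more than one input list.
    return list(set().union(*lists))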
def clear_db():
    """Clear all rows in db"""
    con = sqlite3.connect(DB_PATH)
    try:
        # The connection context manager commits on success, rolls back on error.
        with con:
            cur = con.cursor()
            cur.execute('DELETE FROM advego;')
        logger.info("All rows were deleted successfully")
    except sqlite3.Error:
        logger.exception("Failed to clear table")
    finally:
        con.close()
def insert_to_db(rows):
    """Write rows into db; rows is an iterable of (filename, coef, is_cheat) tuples"""
    con = sqlite3.connect(DB_PATH)
    try:
        with con:
            cur = con.cursor()
            cur.executemany(
                "INSERT INTO advego (filename, coef, is_cheat) VALUES (?, ?, ?)",
                rows)
        logger.info("Data inserted successfully")
    except sqlite3.Error:
        logger.exception("Failed to insert rows")
    finally:
        con.close()
def create_db():
    """Create db if it doesn't exist"""
    con = sqlite3.connect(DB_PATH)
    try:
        with con:
            cur = con.cursor()
            cur.executescript("""
                CREATE TABLE IF NOT EXISTS advego (
                    "id" INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
                    "filename" TEXT NOT NULL,
                    "coef" REAL NOT NULL,
                    "is_cheat" BOOL NOT NULL
                );
            """)
        logger.info("DB was created successfully")
    except sqlite3.Error:
        logger.exception("Failed to create db")
    finally:
        con.close()
def run():
    logger.info("Start")
    create_db()
    clear_db()
    try:
        file_list = [
            os.path.join(TEXT_FILES_PATH, f)
            for f in os.listdir(TEXT_FILES_PATH)
            if f.endswith(".txt")
        ]
    except OSError as err:
        logger.exception("Can't find txt files")
        raise TxtDirectoryException(str(err)) from err
    with ThreadPoolExecutor(max_workers=cpu_count()) as executor:
        for f in file_list:
            executor.submit(parse_file, f)
    # Leaving the with block waits for every worker, so all parse_file results
    # are in result_to_insert before the insert runs.
    insert_to_db(result_to_insert)
    logger.info("Finish")
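# Minimal sketch of how this entry point would be invoked when the module is
# executed directly; the guard itself is not shown in the original.
if __name__ == "__main__":
    run()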
def load_data(data: List[Base]) -> None:
    if len(data) > 0:
        session = Session()
        for idx, record in enumerate(data):
            try:
                session.merge(record)
            except Exception:
                logger.error('Failed to merge record: %s', record)
                logger.error(format_exc())
                continue
            logger.debug(record)
            # Commit in chunks of 100 to keep each transaction small.
            if idx > 0 and idx % 100 == 0:
                session.commit()
                logger.info('Chunked commit at %s records', idx)
        # Final commit for the records merged since the last chunk.
        session.commit()
        logger.info('Final commit after %s records', idx + 1)
        session.close()
    else:
        logger.info('No data to load')
def crawl_songs():
    while True:
        while len(songUrlQueue) > 0:
            song_link = songUrlQueue.popleft()
            if song_link in visitedUrl:
                continue
            logger.info("visit song %s", song_link)
            try:
                visitedUrl[song_link] = 1
                # The song id is the query-string value after "=".
                sid = song_link[song_link.find("=") + 1:]
                if songService.is_existed(sid):
                    logger.info("%s already exists", sid)
                    continue
                info = extract.getSongInfo(song_link, song_driver)
                if not info:
                    continue
                songService.add(info)
            except Exception:
                logger.exception("failed to crawl song %s", song_link)
                continue
        logger.info("empty song queue")
        time.sleep(1)
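# A minimal sketch of wiring the two crawl loops together; the thread setup,
# start_crawlers name, and seed_url parameter are assumptions, not part of the
# original module. The page crawler feeds songUrlQueue, which the song crawler
# drains; deque.popleft/extend are safe across CPython threads.
import threading

def start_crawlers(driver, seed_url):
    subHtmlUrlQueue.append(seed_url)
    # Both loops run forever, so each gets its own thread.
    threading.Thread(target=__crawl, args=(driver,)).start()
    threading.Thread(target=crawl_songs).start()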