def main():
    client = mon.mongo_connection('linode1', 'mongo')
    # each row of the csv holds one stock id to re-check
    with open('/Users/huangyiling/Github/stock/double_check_stock1.csv', newline='') as file:
        rows = csv.reader(file)
        for stock_id in rows:
            stock_id = ''.join(stock_id)
            print(f"get stock {stock_id}")
            coll_stock = mon.mongo_collection(client, 'stocks', f"stock{stock_id}")
            # for year in range(2010, 2021):
            #     # check whether data exists for that year
            #     test_month = 12
            #     test_url = f"""https://www.twse.com.tw/exchangeReport/STOCK_DAY?response=html&date={str(year)}{str(test_month).zfill(2)}01&stockNo={stock_id}"""
            #     print(f"test stock {stock_id} in {year} exist ?")
            #     test_docs = crawler.crawler(test_url)
            #     if test_docs:
            #         print("=> Yes, exist!")
            for month in range(1, 13):
                url = f"""https://www.twse.com.tw/exchangeReport/STOCK_DAY?response=html&date=2020{str(month).zfill(2)}01&stockNo={stock_id}"""
                print(f"-- Crawler >>> {url}")
                documents = crawler.crawler(url)
                if documents:
                    # print(documents)
                    for item in documents:
                        # write each crawled record to mongo
                        mon.insert_document(coll_stock, item)
                    print(f'stock: {stock_id} in 2020{str(month).zfill(2)} insert done.')
                time.sleep(10)
                print(f'stock: {stock_id} in 2020{str(month).zfill(2)} crawl done.')
def main():  # redis subscriber, index 0-3
    redisConnect = red.redis_connection('linode1', 'redis', db=0)
    client = mon.mongo_connection('linode1', 'mongo')
    coll_stockInfo = mon.mongo_collection(client, 'stocks', 'stockInfo')
    while True:
        # get all keys (and values) from redis
        keys = red.redis_get_all_kv(redisConnect)
        for key in keys:
            amount = int(os.environ.get("amount"))  # number of subscribers
            index = int(os.environ.get("index"))  # this subscriber's number
            num = int(key.split('No_')[-1])  # numeric part of the redis key
            # decide which records this subscriber should take
            if num % amount == index:
                stock_id = red.redis_get_value(redisConnect, key)
                print(f"get stock {stock_id}")
                # once the stock_id is taken, delete the key from redis
                red.redis_delete_key(redisConnect, key)
                coll_stockInfo.update_one(
                    {'_id': stock_id},
                    {'$set': {'crawlerStatus': 2}})  # 2 = removed from redis
                coll_stock = mon.mongo_collection(client, 'stocks', f"stock{stock_id}")
                for year in range(2010, 2022):
                    for month in range(1, 13):
                        if year == 2021 and month > 2:
                            break
                        url = f"""https://www.twse.com.tw/exchangeReport/STOCK_DAY?response=html&date={str(year)}{str(month).zfill(2)}01&stockNo={stock_id}"""
                        print(f"-- Crawler >>> {url}")
                        documents = crawler.crawler(url)
                        if documents:
                            for item in documents:
                                # write each crawled record to mongo
                                mon.insert_document(coll_stock, item)
                            coll_stockInfo.update_one(
                                {'_id': stock_id},
                                {'$set': {'monthStatus': str(year) + str(month).zfill(2)}})  # month finished
                            print(f'stock: {stock_id} in {year}{month} insert done.')
                        time.sleep(10)
                        print(f'stock: {stock_id} in {year}{month} crawl done.')
                    coll_stockInfo.update_one(
                        {'_id': stock_id},
                        {'$set': {'yearStatus': year}})  # year finished
                coll_stockInfo.update_one(
                    {'_id': stock_id},
                    {'$set': {'crawlerStatus': 3}})  # 3 = this stock is fully crawled
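# A sketch (not from the original repo) of the publisher side this subscriber
# expects: each stock id is stored in redis under a key 'No_<n>', so a worker
# started with env vars amount=4 and index=0..3 only handles the keys where
# n % amount == index. The plain redis-py client and the key/value layout used
# here are assumptions for illustration; the repo's red helper presumably wraps
# something similar.
import redis


def publish_stock_ids(stock_ids, host='localhost', port=6379, db=0):
    r = redis.Redis(host=host, port=port, db=db)
    for n, stock_id in enumerate(stock_ids):
        r.set(f'No_{n}', stock_id)  # key format the subscriber splits on 'No_'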
def create_urls(stock_ids):
    mongoClient = mon.mongo_connection('linode1', 'mongo')
    mongoCollection = mon.mongo_collection(mongoClient, 'stocks', 'crawlerURL')
    for stock_id in stock_ids:
        for year in year_list:
            for month in month_list:
                url = f'https://www.twse.com.tw/exchangeReport/STOCK_DAY?response=html&date={year + month.zfill(2)}01&stockNo={stock_id}'
                doc = {
                    '_id': stock_id + year + month.zfill(2),
                    'url': url,
                    'crawlerStatus': 0
                }
                mon.insert_document(mongoCollection, doc)
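# create_urls() depends on module-level year_list / month_list that are defined
# elsewhere in the repo. A sketch of plausible definitions and a call, assuming
# string years 2010-2021 and months '1'-'12' (zero-padded inside create_urls);
# the exact ranges and the sample stock ids are assumptions:
year_list = [str(y) for y in range(2010, 2022)]
month_list = [str(m) for m in range(1, 13)]
# create_urls(['2330', '2317'])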
def stock_crawler(stock_ids_list):
    for stocks in stock_ids_list:
        stock_ids = stocks['stocks_list']
        client = mon.mongo_connection('linode1', 'mongo')
        for stock_id in stock_ids:
            collection = mon.create_collection(client, 'stocks', f'stock{stock_id}')
            for year in range(10, 21):
                for month in [
                        '01', '02', '03', '04', '05', '06', '07', '08', '09',
                        '10', '11', '12'
                ]:
                    stock_url = f"https://www.twse.com.tw/exchangeReport/STOCK_DAY?response=html&date=20{str(year)}{month}01&stockNo={stock_id}"
                    res = requests.get(stock_url, headers=headers)
                    soup = BeautifulSoup(res.text, "lxml")
                    table = soup.find_all('table')[0]
                    df = pd.read_html(str(table))[0]
                    for index in range(len(df)):
                        date = df.iat[index, 0]  # trade date (ROC calendar)
                        date_ad = str(1911 + int(date.split('/')[0])) + ''.join(
                            date.split('/')[1:])
                        volume = int(df.iat[index, 1])  # trading volume (shares)
                        price = float(df.iat[index, 2])  # trade value
                        open_ = float(df.iat[index, 3])  # opening price
                        high = float(df.iat[index, 4])  # highest price
                        low = float(df.iat[index, 5])  # lowest price
                        close_ = float(df.iat[index, 6])  # closing price
                        change_ori = df.iat[index, 7]  # price change
                        if change_ori == 'X0.00':
                            change = float(0.00)
                        else:
                            change = float(change_ori)
                        trades = int(df.iat[index, 8])  # number of transactions
                        doc = {
                            '_id': stock_id + date_ad,
                            'trade_date': date_ad,
                            'volume': volume,
                            'price': price,
                            'open': open_,
                            'high': high,
                            'low': low,
                            'close': close_,
                            'change': change,
                            'trades': trades
                        }
                        print(doc)
                        mon.insert_document(collection, doc)
                    # df.to_csv(f'/Users/huangyiling/Desktop/stock/2330/stock{stock_id}_20{str(i)}{j}.csv')
                    time.sleep(20)
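# The other entry points above call crawler.crawler(url) and insert whatever it
# returns. A sketch of that helper, assuming it simply factors out the
# table-parsing logic shown inline in stock_crawler() and returns a list of
# per-day documents (the real crawler module may differ in details):
import requests
import pandas as pd
from bs4 import BeautifulSoup


def crawler(url, headers=None):
    res = requests.get(url, headers=headers)
    tables = BeautifulSoup(res.text, 'lxml').find_all('table')
    if not tables:
        return []  # no trading data returned for this stock/month
    df = pd.read_html(str(tables[0]))[0]
    stock_id = url.split('stockNo=')[-1]
    docs = []
    for i in range(len(df)):
        date = df.iat[i, 0]  # ROC-calendar trade date, e.g. '110/02/01'
        date_ad = str(1911 + int(date.split('/')[0])) + ''.join(date.split('/')[1:])
        change = df.iat[i, 7]
        docs.append({
            '_id': stock_id + date_ad,
            'trade_date': date_ad,
            'volume': int(df.iat[i, 1]),
            'price': float(df.iat[i, 2]),
            'open': float(df.iat[i, 3]),
            'high': float(df.iat[i, 4]),
            'low': float(df.iat[i, 5]),
            'close': float(df.iat[i, 6]),
            'change': 0.0 if str(change).startswith('X') else float(change),
            'trades': int(df.iat[i, 8]),
        })
    return docs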
def industry_crawler():
    url = 'https://www.cnyes.com/twstock/stock_astock.aspx?ga=nav'
    headers = {
        'user-agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
    }
    res = requests.get(url, headers=headers)
    # print(res.status_code)
    soup = BeautifulSoup(res.text, 'html.parser')
    industries = soup.select('div[id="kinditem_0"]>ul[class="kdlist"]>li')
    # get all industries
    for industry in industries:
        industry_name = industry.a.text
        print(industry_name)
        industry_url = 'https://www.cnyes.com/twstock/' + industry.a["href"]
        print(industry_url)
        industry_id = industry_url.split('groupId=')[-1].split('&stitle')[0]
        # get all stocks from the industry
        res_stock = requests.get(industry_url, headers=headers)
        # print(res_stock.status_code)
        soup_stock = BeautifulSoup(res_stock.text, 'html.parser')
        stocks = soup_stock.select('div[class="TableBox"]>table>tr')
        stock_list = []
        stock_dict = dict()
        for stock in stocks[1:]:
            stock_info = stock.find_all('td')
            stock_id = stock_info[1].text
            # print(stock_id)
            stock_name = stock_info[2].text
            # print(stock_name)
            stock_list.append(stock_id)
            stock_dict[stock_id] = stock_name
        industry_key_id = 'industry_' + industry_id
        doc = {
            '_id': industry_key_id,
            'industry': industry_kv[industry_name],
            'industry_name': industry_name,
            'stocks_list': stock_list,
            'stocks_count': len(stock_list),
            'stocks': stock_dict
        }
        # print(doc)
        mongo_client = mon.mongo_connection('linode1', 'mongo')
        mongo_collection = mon.mongo_collection(mongo_client, 'stocks', 'stockIndustry')
        mon.insert_document(mongo_collection, doc)
        time.sleep(20)
def stockInfo():
    client = mon.mongo_connection('linode1', 'mongo')
    coll_stockIndustry = mon.mongo_collection(client, 'stocks', 'stockIndustry')
    coll_stockInfo = mon.mongo_collection(client, 'stocks', 'stockInfo')
    for item in mon.find_all_mongo(coll_stockIndustry):
        # print(item)
        ids = item['stocks_list']
        for stock_id in ids:
            doc = {
                '_id': stock_id,
                'industry': item['_id'],
                'name': 'name',
                'abbreviation': item['stocks'][stock_id],
                'dailyStatus': 0,
                'monthStatus': 0,
                'yearStatus': 0
            }
            print(doc)
            mon.insert_document(coll_stockInfo, doc)
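# industry_crawler() fills the stocks.stockIndustry collection that stockInfo()
# reads, so the intended order is: crawl the industry pages first, then build
# one stockInfo document per stock. A minimal usage sketch; the guard and call
# order are inferred from the code above, not spelled out in the original source:
if __name__ == '__main__':
    industry_crawler()  # one document per industry, including its stocks_list
    stockInfo()         # then expand each stocks_list into stocks.stockInfo docs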
def crawler_daily():
    counts = 0
    # notify that the daily update is starting
    goo.main('stock_crawler', 'Stocks Daily Updation Starts!')
    # start time
    t1 = datetime.datetime.now()
    # reset daily status to zero as the default
    client = mon.mongo_connection('linode1', 'mongo')
    coll_stockInfo = mon.mongo_collection(client, 'stocks', 'stockInfo')
    coll_stockInfo.update_many({}, {'$set': {'dailyStatus': 0}})
    # today
    today = datetime.date.today()  # - datetime.timedelta(1)
    year = today.strftime("%Y")
    month = today.strftime("%m")
    day = today.strftime("%d")
    # get all stocks' ids
    for content in allStockID.all_stock_id():
        stock_id = content['_id']
        print(stock_id)
        retry = 0
        url = f"""https://www.twse.com.tw/exchangeReport/STOCK_DAY?response=html&date={year}{month}01&stockNo={stock_id}"""
        coll_stock = mon.mongo_collection(client, 'stocks', f"stock{stock_id}")
        while retry < 3:
            try:
                contents = crawler.crawler(url)
                # print(contents)
                for item in contents:
                    # write the daily record to mongo
                    mon.insert_document(coll_stock, item)
                # crawling and writing to mongo done, set daily status to today's date
                coll_stockInfo.update_one(
                    {'_id': stock_id},
                    {'$set': {'dailyStatus': f"{year+month+day}"}})
                counts += 1
                time.sleep(10)
                break
            except Exception as e:
                print(e)
                time.sleep(10)
                retry += 1
                if retry == 3:
                    # send a notification with the google bot
                    goo.main('stock_crawler',
                             f"{stock_id}, {year,month,day} Wrong: {e}")
                    wcsv.writeToCsv(
                        f'./dataStore/DailyCrawlerException_{today}',
                        [stock_id, year, month, day])
                    continue
    # check whether the daily update is done; if any stock is still not updated
    # today, rerun the whole daily crawl
    if coll_stockInfo.count_documents(
            {'dailyStatus': {'$ne': f"{year+month+day}"}}) != 0:
        crawler_daily()
    # notify that the daily update is done
    cost_time = datetime.datetime.now() - t1
    goo.main(
        'stock_crawler',
        f"{datetime.date.today()}: Daily Updation Finished!\nCheck amount of stock: {counts}, except: {938-counts}\nCost_time: {cost_time}"
    )
    return
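# crawler_daily() re-invokes itself until every stock's dailyStatus matches
# today's date, so it only needs to be triggered once per trading day. A sketch
# of one way to do that with the third-party `schedule` package; the package
# choice and the 15:00 run time (after TWSE market close) are assumptions, not
# part of the original repo:
import time
import schedule

if __name__ == '__main__':
    schedule.every().day.at("15:00").do(crawler_daily)
    while True:
        schedule.run_pending()
        time.sleep(60)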