class MongoHandler:
    """MongoDB access layer for the news crawler: stores crawled news and
    serves search-keyword metadata from the stock_category collection."""

    def __init__(self):
        self.log = Log(MongoHandler)
        conn = MongoClient(CONFIG['DB_ip'], CONFIG['DB_port'])
        # Collection that crawled news documents are written to.
        self.save_collection = conn[CONFIG['DB_name']][
            CONFIG['collection_name']]
        self.log.info("MongoDB connection to {0} collection. - {1}".format(
            CONFIG['collection_name'], self.save_collection))
        if not CONFIG['is_input_keywords']:
            # Keywords are looked up in the DB instead of taken from config.
            self.category_collection = conn[
                CONFIG['DB_name']]['stock_category']
            self.log.info(
                "MongoDB connection to category collection. - {0}".format(
                    self.category_collection))

    def get_search_keywords(self):
        """Return every document from the stock_category collection.

        Raises:
            RuntimeError: when keywords are configured to come from input
                (`is_input_keywords`), in which case this handler never
                opened the category collection.
        """
        if CONFIG['is_input_keywords']:
            # BUG FIX: the original did `raise self.log.warning(...)`, which
            # raises the logger's None return value and crashes with a
            # TypeError ("exceptions must derive from BaseException").
            # Log the warning, then raise a real exception.
            self.log.warning("Check is_input_keywords option")
            raise RuntimeError("Check is_input_keywords option")
        query = {}
        category_info = list(self.category_collection.find(query))
        return category_info

    def add_news_data(self, news_dict):
        """Insert a list of news documents into the save collection."""
        self.save_collection.insert_many(news_dict)
class DataHandler:
    """Loads the target-company list (from DB or file) and persists stock
    data through MongoHandler."""

    def __init__(self):
        self.log = Log(DataHandler)
        self.mongo = MongoHandler()
        # Populated by get_target_company(); None until then.
        self.company_info = None
        self.company_list = None
        # Select the company-lookup strategy at construction time.
        # FIX: the original wrapped already-bound methods in
        # types.MethodType(bound_method, self), which passed the instance a
        # second time as a dummy `obj` argument. Assigning the bound method
        # directly is equivalent and removes the pass-through parameter.
        check_target_location = CONFIG['company_name_location']
        if check_target_location == 'DB':
            self.get_target_company = self._get_company_by_mongo
        elif check_target_location == 'File':
            self.get_target_company = self._get_company_by_file

    def get_target_company(self):
        """Placeholder; rebound per-instance in __init__ to one of the
        private _get_company_by_* strategies."""
        pass

    def save_stock_data(self, stock_df):
        """Upsert the given stock DataFrame via the Mongo handler."""
        self.mongo.update_stock_data(stock_df)

    def _get_company_by_mongo(self, obj=None):
        """Load company info from MongoDB into company_info/company_list.

        `obj` is kept (defaulted) for backward compatibility with the old
        MethodType-based dispatch, which passed the instance twice.
        """
        self.log.debug("Get company information by database(MongoDB)")
        self.company_info = pd.DataFrame(self.mongo.get_company())
        self.company_list = self.company_info[['company', 'code']]

    def _get_company_by_file(self, obj=None):
        """File-based company lookup; not implemented yet."""
        pass
def __init__(self):
    """Open one MongoDB connection and bind the save and target collections."""
    self.log = Log(MongoHandler)
    client = MongoClient(CONFIG['DB_ip'], CONFIG['DB_port'])
    database = client[CONFIG['DB_name']]
    self.save_collection = database[CONFIG['save_collection_name']]
    self.target_collection = database[CONFIG['target_collection_name']]
    self.log.info(
        "MongoDB save collection {0}, target collection. - {1}".format(
            self.save_collection, self.target_collection))
def __init__(self):
    """Set up logging and Mongo access, then bind the company-lookup
    strategy chosen by CONFIG['company_name_location'] ('DB' or 'File')."""
    self.log = Log(DataHandler)
    self.mongo = MongoHandler()
    self.company_info = None
    self.company_list = None
    location = CONFIG['company_name_location']
    # NOTE(review): MethodType re-binds an already-bound method, so the
    # target receives the instance again as its extra `obj` argument; the
    # strategy methods appear to accept and ignore it — confirm intended.
    if location == 'DB':
        self.get_target_company = types.MethodType(
            self._get_company_by_mongo, self)
    elif location == 'File':
        self.get_target_company = types.MethodType(
            self._get_company_by_file, self)
class MongoHandler:
    """MongoDB access layer for the stock crawler: reads target companies
    and upserts/inserts per-day stock rows keyed on (Date, Code)."""

    # Columns copied from each DataFrame row into the stored document.
    # 'Adj Close' was deliberately excluded in the original; re-add here
    # if the source DataFrame starts carrying it.
    _STOCK_FIELDS = ('Company', 'Type', 'Code', 'Date', 'High', 'Low',
                     'Open', 'Close', 'candleCenter', 'Volume')

    def __init__(self):
        self.log = Log(MongoHandler)
        conn = MongoClient(CONFIG['DB_ip'], CONFIG['DB_port'])
        self.save_collection = conn[CONFIG['DB_name']][
            CONFIG['save_collection_name']]
        self.target_collection = conn[CONFIG['DB_name']][
            CONFIG['target_collection_name']]
        self.log.info(
            "MongoDB save collection {0}, target collection. - {1}".format(
                self.save_collection, self.target_collection))

    def get_company(self):
        """Return every document in the target (company) collection."""
        query = {}
        category_info = list(self.target_collection.find(query))
        return category_info

    def update_stock_data(self, stock_df):
        """Upsert each DataFrame row into the save collection.

        Rows are keyed on ('Date', 'Code'); all _STOCK_FIELDS columns are
        written via $set, inserting the document when the key is new.
        """
        updates = []
        for _, row in tqdm(stock_df.iterrows(), total=len(stock_df)):
            doc = {field: row[field] for field in self._STOCK_FIELDS}
            updates.append(
                UpdateOne({'Date': row['Date'], 'Code': row['Code']},
                          {'$set': doc},
                          upsert=True))
        self.log.debug("update list count- {0}".format(len(updates)))
        # FIX: pymongo's bulk_write raises InvalidOperation on an empty
        # request list, so skip the write when the DataFrame has no rows.
        if updates:
            self.save_collection.bulk_write(updates)

    def add_stock_data(self, stock_df):
        """Bulk-insert the DataFrame rows as documents (no upsert)."""
        self.save_collection.insert_many(stock_df.to_dict('records'))
def __init__(self):
    """Connect to MongoDB and bind the news save collection, plus the
    category collection when keywords come from the database."""
    self.log = Log(MongoHandler)
    client = MongoClient(CONFIG['DB_ip'], CONFIG['DB_port'])
    database = client[CONFIG['DB_name']]
    self.save_collection = database[CONFIG['collection_name']]
    self.log.info("MongoDB connection to {0} collection. - {1}".format(
        CONFIG['collection_name'], self.save_collection))
    if not CONFIG['is_input_keywords']:
        self.category_collection = database['stock_category']
        self.log.info(
            "MongoDB connection to category collection. - {0}".format(
                self.category_collection))
class DataHandler:
    """Bridges the news crawler with MongoDB storage and Excel file output."""

    def __init__(self):
        self.log = Log(DataHandler)
        self.mongo_handler = MongoHandler()

    def get_search_keywords(self):
        """Return (code, company, business_code, business) tuples built from
        the category documents stored in MongoDB."""
        frame = pd.DataFrame(self.mongo_handler.get_search_keywords())
        columns = ['code', 'company', 'business_code', 'business']
        search_keywords = list(
            frame[columns].itertuples(index=False, name=None))
        self.log.debug("search keywords count - {0}".format(
            len(search_keywords)))
        return search_keywords

    def get_range_search_date(self):
        """Return ('%Y.%m.%d.%H.%M' formatted) start/end timestamps covering
        the last two hours ending now."""
        end = datetime.now()
        start = end - timedelta(hours=2)
        s_date = start.strftime('%Y.%m.%d.%H.%M')
        e_date = end.strftime('%Y.%m.%d.%H.%M')
        self.log.debug("start date - {0}, end date - {1}".format(
            s_date, e_date))
        return s_date, e_date

    def save_file(self, df, keyword, size):
        """Write the DataFrame to <save_file_path>/<keyword>_<size>.xlsx,
        creating the directory on first use."""
        path = CONFIG['save_file_path']
        if not os.path.exists(path):
            os.makedirs(path)
        self.log.debug("save file name - {0}_{1}.xlsx".format(keyword, size))
        df.to_excel(path + '/{0}_{1}.xlsx'.format(keyword, size))

    def save_db(self, df):
        """Insert the DataFrame rows into MongoDB as news documents."""
        self.mongo_handler.add_news_data(df.to_dict('records'))
def __init__(self, class_obj):
    # Single delegating statement: build this object's logger named after
    # the given class (or other object accepted by Log).
    self.log = Log(class_obj)
import yaml
from Utils.utils import Log
from Handlers.data_handler import DataHandler
from Crawlers.naver_news_crawler import NaverNewsCrawler

with open("config.yaml", "rt", encoding="utf-8") as stream:
    # FIX: yaml.load() without an explicit Loader is deprecated and unsafe
    # on untrusted input. safe_load parses plain data only and emits no
    # YAMLLoadWarning, so the yaml.warnings({'YAMLLoadWarning': False})
    # suppression hack is no longer needed.
    CONFIG = yaml.safe_load(stream)['NewsCrawler']

if __name__ == '__main__':
    log = Log(__name__)
    data_handler = DataHandler()
    naver_crawler = NaverNewsCrawler(data_handler)

    # Keywords come straight from config, or are looked up in MongoDB.
    if CONFIG['is_input_keywords']:
        search_keywords = CONFIG['keywords']
    else:
        search_keywords = data_handler.get_search_keywords()

    # Crawl either a rolling 2-hour window or an explicit configured range.
    if CONFIG['iterate']:
        s_date, e_date = data_handler.get_range_search_date()
    else:
        s_date = CONFIG['start_date']
        e_date = CONFIG['end_date']

    url = naver_crawler.get_target_url(s_date, e_date)
    naver_crawler.execute_crawler(search_keywords, url)
def __init__(self):
    """Bind a class-scoped logger and the MongoDB handler this data
    handler delegates persistence to."""
    self.log = Log(DataHandler)
    self.mongo_handler = MongoHandler()