class StockService:
    """Orchestrates market-capitalization ("marcap") crawling.

    Registers crawl requests as ProcessTasks, runs a MarcapCrawler per
    request through the task runner, and mirrors crawler events into
    task-state updates and database writes.
    """

    def __init__(self, stockRepository: StockRepository, tasksRepository: TasksRepository, crawlerRepository: CrawlerRepository) -> None:
        self.stockRepository = stockRepository
        self.tasksRepository = tasksRepository
        self.crawlerRepository = crawlerRepository
        self.logger = Logger("StockService")

    async def getStockData(self, market: str, startDate: str, endDate: str) -> List[StockMarketCapital]:
        """Return stored market-cap rows for `market` in [startDate, endDate]."""
        return await self.stockRepository.getStockData(market, startDate, endDate)

    def crawlingMarcapStockData(self, dtoList: List[StockRunCrawling]) -> None:
        """Queue one crawling task per "marcap" request in `dtoList`.

        Each request becomes a ProcessTask with one sub-task per calendar
        day in its date range; a worker coroutine is handed to the task
        runner. Duplicate (taskId, taskUniqueId) pairs are skipped.
        """
        self.logger.info("crawlingMarcapStockData", str(len(dtoList)))
        for dto in dtoList:
            if dto.taskId != "marcap":
                continue

            async def marcapTaskWorker(runDto: StockRunCrawling, pool: Pool, taskPool: TaskPool) -> None:
                # One end-to-end crawl. The crawler is registered so it can be
                # cancelled externally and unregistered once it finishes.
                try:
                    self.logger.info("runCrawling&marcapTaskWorker", "start")
                    marcapCrawler = MarcapCrawler()
                    taskUniqueId = runDto.taskUniqueId
                    self.crawlerRepository.addCrawler(taskUniqueId, marcapCrawler)
                    self.createListners(marcapCrawler.ee)
                    self.logger.info("runCrawling&marcapTaskWorker", f"taskWorker:{taskUniqueId}")
                    await marcapCrawler.crawling(runDto)
                    taskPool.removeTaskPool(pool)
                    self.crawlerRepository.removeCrawler(taskUniqueId)
                except asyncio.CancelledError:
                    # fix: log tags previously said "convertFactorFileToDbTask"
                    # (copy-paste from FactorService) and made logs misleading.
                    self.logger.info("marcapTaskWorker", "cancel")
                except Exception:
                    self.logger.error("marcapTaskWorker", f"error: {traceback.format_exc()}")
                    self.tasksRepository.errorTask(runDto, traceback.format_exc())

            workerTask = Task(dto.taskUniqueId, marcapTaskWorker, {"runDto": dto})
            if self.tasksRepository.taskRunner:
                if self.tasksRepository.isExistTask(dto.taskId, dto.taskUniqueId):
                    # fix: was `return`, which silently dropped every remaining
                    # request in dtoList; a duplicate should skip only itself.
                    continue
                startDate = datetime.strptime(dto.startDateStr, "%Y%m%d")
                endDate = datetime.strptime(dto.endDateStr, "%Y%m%d")
                # One sub-task per calendar day, inclusive of both endpoints.
                taskDates = [
                    (startDate + timedelta(days=x)).strftime("%Y%m%d")
                    for x in range((endDate - startDate).days + 1)
                ]
                task = ProcessTask(**{
                    "market": dto.market,
                    "startDateStr": dto.startDateStr,
                    "endDateStr": dto.endDateStr,
                    "taskUniqueId": dto.taskUniqueId,
                    "taskId": dto.taskId,
                    "count": len(taskDates),
                    "tasks": deque(taskDates),
                    "restCount": len(taskDates),
                    "tasksRet": deque([0] * len(taskDates)),
                })
                task.state = "find worker"
                self.tasksRepository.addTask(task)
                self.tasksRepository.runTask(workerTask)
                self.logger.info("runMarcapTask", f"runTask {task.json()}")

    def createListners(self, ee: EventEmitter) -> None:
        """Subscribe this service's handlers to a crawler's event emitter."""
        ee.on(EVENT_MARCAP_CRAWLING_ON_RESULT_OF_STOCK_DATA, self.onResultOfStockData)
        ee.on(EVENT_MARCAP_CRAWLING_ON_CONNECTING_WEBDRIVER, self.onConnectingWebDriver)
        ee.on(EVENT_MARCAP_CRAWLING_ON_START_CRAWLING, self.onStartCrawling)
        ee.on(EVENT_MARCAP_CRAWLING_ON_DOWNLOAD_START, self.onDownloadStart)
        ee.on(EVENT_MARCAP_CRAWLING_ON_DOWNLOAD_COMPLETE, self.onDownloadComplete)
        ee.on(EVENT_MARCAP_CRAWLING_ON_PARSING_COMPLETE, self.onParsingComplete)
        ee.on(EVENT_MARCAP_CRAWLING_ON_ERROR, self.onError)
        ee.on(EVENT_MARCAP_CRAWLING_ON_CANCEL, self.onCancelled)

    # Persist a day's crawling result to the DB.
    def onResultOfStockData(self, dto: StockCrawlingDownloadTask, retDto: StockMarketCapitalResult) -> None:
        """Insert one day's crawled rows, then mark the sub-task done."""
        task = self.tasksRepository.getTask(dto.taskId, dto.taskUniqueId)
        task.state = "insert to database"
        self.tasksRepository.updateTask(task)

        async def completeMarcapTask() -> None:
            await self.stockRepository.insertMarcap(retDto)
            self.tasksRepository.completeStockCrawlingTask(True, retDto, dto)

        asyncio.create_task(completeMarcapTask())

    # Event: webdriver connection established.
    def onConnectingWebDriver(self, dto: StockRunCrawling) -> None:
        task = self.tasksRepository.getTask(dto.taskId, dto.taskUniqueId)
        task.state = "connecting webdriver"
        self.tasksRepository.updateTask(task)
        self.logger.info("onConnectingWebDriver", task.taskUniqueId)

    # Event: crawling started.
    def onStartCrawling(self, dto: StockRunCrawling) -> None:
        task = self.tasksRepository.getTask(dto.taskId, dto.taskUniqueId)
        task.state = "start crawling"
        self.tasksRepository.updateTask(task)
        self.logger.info("onStartCrawling", task.taskUniqueId)

    # Event: a day's CSV download started.
    def onDownloadStart(self, dto: StockCrawlingDownloadTask) -> None:
        task = self.tasksRepository.getTask(dto.taskId, dto.taskUniqueId)
        task.state = "download start"
        self.tasksRepository.updateTask(task)
        self.logger.info("onDownloadStart", task.taskUniqueId)

    # Event: a day's CSV download finished.
    def onDownloadComplete(self, dto: StockCrawlingDownloadTask) -> None:
        task = self.tasksRepository.getTask(dto.taskId, dto.taskUniqueId)
        task.state = "download complete"
        self.tasksRepository.updateTask(task)
        self.logger.info("onDownloadComplete", task.taskUniqueId)

    # Event: CSV parsing finished.
    def onParsingComplete(self, isSuccess: bool, retdto: StockMarketCapitalResult, dto: StockCrawlingDownloadTask) -> None:
        """On parse failure record the failed sub-task; success is recorded
        by onResultOfStockData after the DB insert."""
        self.logger.info("onParsingComplete")
        self.logger.info(f"taskId:{dto.taskId} taskUniqueId{dto.taskUniqueId}")
        tar = self.tasksRepository.tasksdto.tasks[dto.taskId]["list"]
        self.logger.info(f"taskDTO: {tar}")
        if not isSuccess:
            self.tasksRepository.completeStockCrawlingTask(isSuccess, retdto, dto)

    # Event: crawling cancelled. State bookkeeping is currently disabled.
    def onCancelled(self, dto: StockRunCrawling) -> None:
        self.logger.info("onCancelled")

    # Event: crawling errored — fail the remaining sub-tasks and record why.
    def onError(self, dto: StockRunCrawling, errorMsg: str) -> None:
        task = self.tasksRepository.getTask(dto.taskId, dto.taskUniqueId)
        self.tasksRepository.fail(task, task.restCount)
        task.state = "error"
        task.errMsg = errorMsg
        self.tasksRepository.updateTask(task)
        self.logger.error("onError", task.taskUniqueId)
class MarcapCrawler(object):
    """Selenium-driven crawler downloading daily market-cap CSVs from KRX.

    Emits EVENT_MARCAP_CRAWLING_* events so the service layer can track
    progress; downloaded files are detected by a DownloadObserver watching
    a per-run download directory.
    """

    def __init__(self) -> None:
        super().__init__()
        self.ee = EventEmitter()
        self.logger = Logger("MarcapCrawler")

    def createUUID(self) -> str:
        """Return a random per-run id used to isolate the download directory."""
        return str(uuid.uuid4())

    async def connectWebDriver(self, addr: str, uuid: str) -> WebDriver:
        """Connect to a remote Chrome, downloading into /home/seluser/Downloads/<uuid>."""
        chrome_options = webdriver.ChromeOptions()
        prefs = {
            'profile.default_content_setting_values.automatic_downloads': 1,
            'download.default_directory': f"/home/seluser/Downloads/{uuid}"
        }
        chrome_options.add_experimental_option("prefs", prefs)
        driver = webdriver.Remote(
            command_executor=addr,
            options=chrome_options,
        )
        driver.set_page_load_timeout(60)
        driver.set_script_timeout(60)
        self.logger.info("connectWebDriver", "create driver")
        return driver

    def connectLocalDriver(self, addr: str, uuid: str) -> WebDriver:
        """Local-development variant with hard-coded machine-specific paths.

        NOTE(review): `executable_path`/`chrome_options` are deprecated in
        Selenium 4 — confirm the pinned selenium version before changing.
        """
        chrome_options = webdriver.ChromeOptions()
        prefs = {
            'profile.default_content_setting_values.automatic_downloads': 1,
            'download.default_directory': f"/Users/iseongjae/Documents/PersonalProjects/fin-web/fin-crawling-server/server/downloads/{uuid}"
        }
        chrome_options.add_experimental_option("prefs", prefs)
        driver = webdriver.Chrome(executable_path="/Users/iseongjae/Downloads/chromedriver", chrome_options=chrome_options)
        return driver

    async def crawling(self, dto: StockRunCrawling) -> None:
        """Crawl every day in [dto.startDateStr, dto.endDateStr] for dto.market.

        Sets up the download observer and webdriver, then downloads one CSV
        per day. Observer and driver are always torn down, even on error
        (exceptions propagate to the caller's error handling).
        """
        driver = None
        downloadObserver = None
        try:
            uuid = self.createUUID()
            self.logger.info("crawling", uuid)
            self.ee.emit(EVENT_MARCAP_CRAWLING_ON_CONNECTING_WEBDRIVER, dto)
            downloadObserver = DownloadObserver()
            path = await asyncRetryNonBlock(5, 1, downloadObserver.makePath, uuid)
            downloadObserver.startObserver(path, self.ee)
            self.logger.info("crawling", "create observer and start")
            print("startObserver")
            driver = await asyncRetryNonBlock(5, 1, self.connectWebDriver, dto.driverAddr, uuid)
            print("connectWebDriver")
            driver.get("http://data.krx.co.kr/contents/MDC/MDI/mdiLoader/index.cmd?menuId=MDC0201020101")
            # Dismiss a possible alert; absence of one is not an error.
            try:
                alert = WebDriverWait(driver, timeout=3).until(EC.alert_is_present())
                alert.accept()
            except Exception as e:
                print("예외발생:" + str(e))
            print("start:" + dto.startDateStr)
            self.ee.emit(EVENT_MARCAP_CRAWLING_ON_START_CRAWLING, dto)
            WebDriverWait(driver, timeout=20, poll_frequency=1).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#mktId_0_1")))
            date = datetime.strptime(dto.startDateStr, "%Y%m%d")
            endDate = datetime.strptime(dto.endDateStr, "%Y%m%d")
            while date <= endDate:
                dateStr = date.strftime("%Y%m%d")
                downloadTask = StockCrawlingDownloadTask(**{
                    "dateStr": dateStr,
                    "market": dto.market,
                    "uuid": uuid,
                    "taskId": dto.taskId,
                    "taskUniqueId": dto.taskUniqueId
                })
                self.logger.info("crawling", f"create downloadTask taskId: {dto.taskId} market: {dto.market} date: {dateStr} taskUniqueId: {dto.taskUniqueId}")
                print(downloadTask.json())
                downloadObserver.event_handler.setDownloadTask(downloadTask)
                self.ee.emit(EVENT_MARCAP_CRAWLING_ON_DOWNLOAD_START, downloadTask)
                await asyncRetryNonBlock(5, 1, self.downloadData, downloadTask, downloadObserver, driver)
                date = date + timedelta(days=1)
        finally:
            # fix: dropped a no-op `except Exception as e: raise e` wrapper —
            # exceptions still propagate unchanged after this cleanup runs.
            if downloadObserver:
                downloadObserver.stopObserver()
            if driver:
                driver.quit()

    async def downloadData(self, downloadTask: StockCrawlingDownloadTask, downloadObserver: DownloadObserver, driver: WebDriver) -> None:
        """Drive the KRX page to export one day's CSV and wait for the file.

        Selects the market tab, sets the trade date, triggers the search,
        waits until the page timestamp changes (new data rendered), clicks
        the CSV export, then waits (<= 30s) for the file-created event that
        the DownloadObserver relays through the emitter.
        """
        self.logger.info("downloadData")
        if driver is None:
            return
        # Timestamp shown by the page; it changes once the new query renders.
        before = driver.execute_script("return $('.CI-MDI-UNIT-TIME').text()")
        if downloadTask.market == "kospi":
            driver.execute_script('$("#mktId_0_1").click()')
        elif downloadTask.market == "kosdaq":
            driver.execute_script('$("#mktId_0_2").click()')
        elif downloadTask.market == "konex":
            driver.execute_script('$("#mktId_0_3").click()')
        driver.execute_script(f'$("#trdDd")[0].value = "{downloadTask.dateStr}"')
        driver.execute_script('$(".btn_component_search").click()')
        after = before
        while before == after:
            after = driver.execute_script('return $(".CI-MDI-UNIT-TIME").text()')
            await sleepNonBlock(0.5)
        print("before:" + before)
        print("after:" + after)
        await sleepNonBlock(3)
        WebDriverWait(driver, timeout=10, poll_frequency=2).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "*[class='CI-MDI-UNIT-DOWNLOAD']")))
        driver.execute_script("$('[class=\"CI-MDI-UNIT-DOWNLOAD\"]').click()")
        WebDriverWait(driver, timeout=10, poll_frequency=2).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "*[data-type='csv']")))
        driver.execute_script("$(\"[data-type='csv']\").click()")
        print("wait:" + downloadTask.dateStr)
        loop = asyncio.get_running_loop()
        # fix: dropped `loop=loop` — the kwarg was deprecated in 3.8 and
        # removed in 3.10; the queue binds to the running loop regardless.
        queue: asyncio.Queue = asyncio.Queue(maxsize=1)

        async def fileResultOfData(event: FileCreatedEvent, downloadTask: StockCrawlingDownloadTask) -> None:
            result = {}
            result["event"] = event
            result["downloadTask"] = downloadTask
            await queue.put(result)

        @self.ee.once(FILE_SYSTEM_HANDLER(downloadTask.uuid))
        def downloadComplete(event: FileCreatedEvent, downloadTask: StockCrawlingDownloadTask) -> None:
            # Hop from the emitter callback back onto the event loop.
            loop.create_task(fileResultOfData(event, downloadTask))

        result = await asyncio.wait_for(queue.get(), timeout=30)
        try:
            self.ee.emit(EVENT_MARCAP_CRAWLING_ON_DOWNLOAD_COMPLETE, downloadTask)
            await asyncio.create_task(self.makeMarcapData(result["event"], result["downloadTask"]))
        finally:
            # fix: task_done() used to sit in a finally around queue.get() and
            # ran even on timeout, raising "task_done() called too many times"
            # and masking the TimeoutError; now it runs only after a
            # successful get().
            queue.task_done()

    def convertFileToDto(self, path: str, dto: StockMarketCapitalResult) -> None:
        """Parse a downloaded CSV into StockMarketCapital rows on dto.data.

        The first line is a header and is skipped. Non-kospi files carry one
        extra column after the name, shifting every later field by one.
        """
        with open(path, "r", encoding="utf-8") as f:
            lines = f.readlines()
        # fix(idiom): two near-identical **{...} literals collapsed into one
        # with a per-market column offset; the field mapping is unchanged.
        offset = 0 if dto.market == "kospi" else 1
        for i in range(1, len(lines)):
            data = lines[i].replace('"', '').split(",")
            marcap = StockMarketCapital(**{
                "date": dto.date,
                "market": dto.market,
                "code": data[0].strip(),
                "name": data[1].strip(),
                "close": data[2 + offset].strip(),
                "diff": data[3 + offset].strip(),
                "percent": data[4 + offset].strip(),
                "open": data[5 + offset].strip(),
                "high": data[6 + offset].strip(),
                "low": data[7 + offset].strip(),
                "volume": data[8 + offset].strip(),
                "price": data[9 + offset].strip(),
                "marcap": data[10 + offset].strip(),
                "number": data[11 + offset].strip()
            })
            dto.data.append(marcap)

    async def isExistFile(self, path: str, ext: str = ".csv") -> bool:
        """Return True when `path` carries the expected extension.

        NOTE(review): the retry loop re-tests `path.endswith(ext)`, a value
        that can never change between iterations — it only delays a False
        result by up to 4 seconds. It likely was meant to poll
        os.path.isfile; behavior kept as-is pending confirmation.
        """
        isExist = path.endswith(ext)
        restTimes = 3
        while not isExist and restTimes >= 0:
            await sleepNonBlock(1)
            isExist = path.endswith(ext)
            restTimes -= 1
        return isExist

    async def parseReceivedFile(self, event: FileCreatedEvent, downloadTask: StockCrawlingDownloadTask) -> None:
        """Re-encode a downloaded file to UTF-8, rename it to
        <market>-<date>.csv, parse it, and emit the success events."""
        retdto = StockMarketCapitalResult()
        date = downloadTask.dateStr
        market = downloadTask.market
        retdto.date = date
        retdto.market = market
        isExist = await self.isExistFile(event.src_path)
        if not isExist:
            return
        print("created: " + date)
        await sleepNonBlock(0.5)
        dest_path = f'{os.path.dirname(event.src_path)}/{market+"-"+date}.csv'
        if os.path.isfile(dest_path):
            # Already processed (duplicate file-created event).
            return
        self.changeCharSet(event.src_path)
        os.rename(event.src_path, dest_path)
        self.convertFileToDto(dest_path, retdto)
        retdto.result = "success"
        self.ee.emit(EVENT_MARCAP_CRAWLING_ON_PARSING_COMPLETE, True, retdto, downloadTask)
        self.ee.emit(EVENT_MARCAP_CRAWLING_ON_RESULT_OF_STOCK_DATA, downloadTask, retdto)
        self.logger.info("parseFile", f"success, {downloadTask.taskUniqueId}")

    async def makeMarcapData(self, event: FileCreatedEvent, downloadTask: StockCrawlingDownloadTask) -> None:
        """Parse the received file with up to 3 retries; emit a fail result
        (with the traceback) if every attempt fails."""
        try:
            await asyncRetry(3, 1, self.parseReceivedFile, event, downloadTask)
        except Exception:
            retdto = StockMarketCapitalResult()
            retdto.result = "fail"
            retdto.errorMsg = traceback.format_exc()
            self.ee.emit(EVENT_MARCAP_CRAWLING_ON_PARSING_COMPLETE, False, retdto, downloadTask)
            self.logger.error("parseFile", f"fail, {downloadTask.taskUniqueId} error: {traceback.format_exc()}")
        finally:
            self.logger.info("parseFile...")

    def changeCharSet(self, path: str) -> None:
        """Rewrite a euc-kr encoded file in place as UTF-8."""
        with open(path, "r", encoding="euc-kr") as f:
            lines = f.readlines()
        with open(path, 'w', encoding="utf-8") as f:
            f.writelines(lines)
class FactorDartMongoDataSource(MongoDataSource):
    """Mongo access for DART-sourced factor data and completed-task listing."""

    def __init__(self) -> None:
        super().__init__()
        self.logger = Logger("FactorDartMongoDataSource")

    async def getFactor(self, year: str = "*", month: str = "*", code: str = "*") -> list:
        """Fetch FactorData rows; "*" leaves that field unconstrained.

        Returns an empty list on any error (logged, not raised).
        """
        try:
            findObj: Dict[str, Any] = {}
            self.mergeFindObj(findObj, "dataYear", year)
            self.mergeFindObj(findObj, "dataMonth", month)
            self.mergeFindObj(findObj, "code", code)
            fields = [
                "code", "dataMonth", "dataName", "dataYear", "dataId",
                "dataValue", "name"
            ]
            # fix(idiom): map/lambda over a materialized list replaced with a
            # comprehension streaming straight off the cursor.
            return [
                FactorData(**{field: doc[field] for field in fields})
                for doc in self.factorDart.find(findObj)
            ]
        except Exception:
            self.logger.error("getFactor", traceback.format_exc())
            return list()

    async def insertFactor(self, li: List[FactorDao]) -> None:
        """Upsert each dao keyed by (code, dataYear, dataMonth, dataName),
        stamping updatedAt always and createdAt on first insert."""
        try:
            if not self.isSetupMarcap():
                self.setupMarcap()
            for one in li:
                data = one.dict()
                data["updatedAt"] = getNow()
                self.factorDart.update_one(
                    {
                        "code": data["code"],
                        "dataYear": data["dataYear"],
                        "dataMonth": data["dataMonth"],
                        "dataName": data["dataName"],
                    },
                    {
                        "$set": data,
                        "$setOnInsert": {"createdAt": getNow()}
                    },
                    upsert=True)
        except Exception:
            self.logger.error("insertFactor", traceback.format_exc())

    def getCompletedTask(self, dto: ListLimitData) -> ListLimitResponse:
        """Page through finished ("success"/"fail") tasks, newest first.

        NOTE(review): returns [] on error although the annotation says
        ListLimitResponse — callers appear to tolerate it; kept as-is.
        """
        try:
            data = dto.dict()
            # fix(consistency): the state filter was written out twice (for
            # find and count) and could drift; build it once.
            completedFilter = {"$or": [{"state": "success"}, {"state": "fail"}]}
            cursor = self.task.find(completedFilter)\
                .sort("createdAt", DESCENDING)\
                .skip(data["offset"])\
                .limit(data["limit"])
            # NOTE(review): Cursor.count() was removed in pymongo 4
            # (count_documents is the replacement) — confirm pinned version.
            count = self.task.find(completedFilter).count()
            res = ListLimitResponse(**{
                "count": count,
                "offset": data["offset"],
                "limit": data["limit"],
                "data": self.exceptId(list(cursor))
            })
            return res
        except Exception:
            self.logger.error("getCompletedTask", traceback.format_exc())
            return []
class FactorService:
    """Coordinates factor-data crawling (DART API) and file-to-DB conversion,
    mirroring crawler events into task-state updates."""

    def __init__(self, manager: ConnectionManager, factorRepository: FactorRepository, tasksRepository: TasksRepository, crawlerRepository: CrawlerRepository, taskService: 'TaskService') -> None:
        self.manager = manager
        self.factorRepository = factorRepository
        self.tasksRepository = tasksRepository
        self.crawlerRepository = crawlerRepository
        self.taskService = taskService
        self.logger = Logger("FactorService")

    async def getFactor(self, code: str, year: str, month: str, source: str) -> List[FactorData]:
        """Delegate a factor lookup to the repository."""
        return await self.factorRepository.getFactor(code, year, month, source)

    def crawlingFactorDartData(self, dto: DartApiCrawling) -> None:
        """Register a ProcessTask (one sub-task per year) and start a DART crawl."""
        async def crawlingFactorDartDataTask(pool: Pool, taskPool: TaskPool) -> None:
            try:
                crawler = DartApiCrawler()
                self.crawlerRepository.addCrawler(dto.taskUniqueId, crawler)
                self.createFactorDartListener(crawler.ee)
                await crawler.crawling(dto)
                self.crawlerRepository.removeCrawler(dto.taskUniqueId)
            except asyncio.CancelledError:
                self.logger.info("crawlingFactorDartDataTask", "cancel")
            except Exception:
                self.logger.error("crawlingFactorDartDataTask", f"error: {traceback.format_exc()}")
                self.tasksRepository.errorTask(dto, traceback.format_exc())
            finally:
                taskPool.removeTaskPool(pool)

        yearSpan = dto.endYear - dto.startYear + 1
        processTask = ProcessTask(
            market="",
            startDateStr=dto.startYear,
            endDateStr=dto.endYear,
            taskUniqueId=dto.taskUniqueId,
            taskId=dto.taskId,
            count=yearSpan,
            tasks=list(range(dto.startYear, dto.endYear + 1)),
            restCount=yearSpan,
            tasksRet=[0] * yearSpan,
            state="find worker",
        )
        self.tasksRepository.addTask(processTask)
        self.tasksRepository.runTask(Task(dto.taskUniqueId, crawlingFactorDartDataTask))

    # Load factors stored in files into the DB.
    def convertFactorFileToDb(self, dto: RunFactorFileConvert) -> None:
        """Register a single-step ProcessTask that converts file factors to DB rows."""
        self.logger.info("convertFactorFileToDb")

        async def convertFactorFileToDbTask(pool: Pool, taskPool: TaskPool) -> None:
            try:
                task = self.tasksRepository.getTask(dto.taskId, dto.taskUniqueId)
                data = await asyncio.create_task(self.factorRepository.getFactorsInFile())
                task.state = "make Factor Object"
                self.tasksRepository.updateTask(task)
                daoList = await batchFunction(100, data, self.makeFactorDaoList)
                task.state = "start insert db"
                self.tasksRepository.updateTask(task)
                self.logger.info("convertFactorFileToDbTask", f"insertCount: {str(len(daoList))}")
                await self.factorRepository.insertFactor(daoList)
                task.state = "complete"
                self.tasksRepository.completeFactorConvertFileToDbTask(task)
            except asyncio.CancelledError:
                self.logger.info("convertFactorFileToDbTask", "cancel")
            except Exception:
                self.logger.error("convertFactorFileToDbTask", f"error: {traceback.format_exc()}")
                self.tasksRepository.errorTask(dto, traceback.format_exc())
            finally:
                taskPool.removeTaskPool(pool)

        processTask = ProcessTask(
            market="",
            startDateStr="20070101",
            endDateStr="20191231",
            taskUniqueId=dto.taskUniqueId,
            taskId=dto.taskId,
            count=1,
            tasks=["convert"],
            restCount=1,
            tasksRet=[0],
            state="start get file",
        )
        self.tasksRepository.addTask(processTask)
        self.tasksRepository.runTask(Task(dto.taskUniqueId, convertFactorFileToDbTask))

    async def makeFactorDaoList(self, data: List[Dict]) -> List[FactorDao]:
        """Map raw file rows (Korean column names) onto FactorDao objects.

        Values whose unit column is "천원" (thousands of KRW) are scaled to KRW.
        """
        def rowToDao(row: Dict) -> FactorDao:
            scaled = (row["데이터값"] * 1000) if row["단위"] == "천원" else row["데이터값"]
            return FactorDao(
                code=row["종목코드"],
                name=row["종목명"],
                dataYear=row["년"],
                dataMonth=row["결산월"],
                dataName=row["데이터명"],
                dataValue=scaled,
            )

        return [rowToDao(row) for row in data]

    def createFactorDartListener(self, ee: EventEmitter) -> None:
        """Subscribe this service's handlers to a DART crawler's emitter."""
        handlers = {
            EVENT_DART_API_CRAWLING_ON_DOWNLOADING_CODES: self.onDownloadingCodes,
            EVENT_DART_API_CRAWLING_ON_CRAWLING_FACTOR_DATA: self.onCrawlingFactorData,
            EVENT_DART_API_CRAWLING_ON_COMPLETE_YEAR: self.onCompleteYear,
            EVENT_DART_API_CRAWLING_ON_RESULT_OF_FACTOR: self.onResultOfFactor,
            EVENT_DART_API_CRAWLING_ON_CANCEL: self.onCancelled,
        }
        for event, handler in handlers.items():
            ee.on(event, handler)

    # Event: crawler began downloading the corp-code table.
    def onDownloadingCodes(self, dto: DartApiCrawling) -> None:
        self.logger.info("onDownloadingCodes", dto.taskUniqueId)
        task = self.tasksRepository.getTask(dto.taskId, dto.taskUniqueId)
        task.state = "download Codes"
        self.tasksRepository.updateTask(task)

    # Event: crawler began fetching factor data.
    def onCrawlingFactorData(self, dto: DartApiCrawling) -> None:
        self.logger.info("onCrawlingFactorData", dto.taskUniqueId)
        task = self.tasksRepository.getTask(dto.taskId, dto.taskUniqueId)
        task.state = "crawling factor data"
        self.tasksRepository.updateTask(task)

    # Event: one business year finished crawling.
    def onCompleteYear(self, dto: DartApiCrawling, year: int) -> None:
        self.logger.info("onCompleteYear", dto.taskUniqueId)
        task = self.tasksRepository.getTask(dto.taskId, dto.taskUniqueId)
        self.tasksRepository.completeFactorDart(task, year)

    # Event: one company-year of factor records is ready to persist.
    def onResultOfFactor(self, dto: DartApiCrawling, year: int, obj: List) -> None:
        self.logger.info("onResultOfFactor", dto.taskUniqueId)
        daoList = [
            FactorDao(
                code=record["crawling_code"],
                name=record["crawling_name"],
                dataYear=record["bsns_year"],
                dataMonth=getMonthFromReprtCode(record["reprt_code"]),
                dataName=record["account_nm"],
                dataValue=record["thstrm_amount"],
                dataId=record["account_id"],
            )
            for record in obj
        ]
        asyncio.create_task(self.factorRepository.insertFactorDart(daoList))

    # Event: crawl cancelled; nothing to update beyond the log.
    def onCancelled(self, dto: DartApiCrawling) -> None:
        self.logger.info("onCancelled")
class DartApiCrawler(object):
    """Crawls yearly financial-statement factors from the OpenDART API."""

    def __init__(self) -> None:
        super().__init__()
        self.ee = EventEmitter()
        self.isLock = False
        # Set externally to request that crawling stop.
        self.isCancelled = False
        self.logger = Logger("DartApiCrawler")

    def createUUID(self) -> str:
        """Return a random unique id string."""
        return str(uuid.uuid4())

    async def downloadCodes(self, isCodeNew: bool, apiKey: str) -> Dict:
        """Return {stock_code: {corp_code, corp_name}} from DART's CORPCODE table.

        Downloads and extracts the zipped XML when `isCodeNew` is set or the
        cached file is missing; otherwise parses the cached copy. Only
        6-character stock codes (listed companies) are kept.
        """
        # Paths differ under pytest so tests don't touch app/static.
        if "pytest" in sys.modules:
            loadpath = Path('factors/codes')
            datapath = Path("factors/codes/CORPCODE.xml")
        else:
            loadpath = Path('app/static/factors/codes')
            datapath = Path("app/static/factors/codes/CORPCODE.xml")
        if isCodeNew or not os.path.exists(datapath.resolve()):
            headers = {
                'User-Agent': "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.2 (KHTML, like Gecko) Chrome/22.0.1216.0 Safari/537.2'",
                'accept-language': 'ko'
            }
            params = {"crtfc_key": apiKey}
            url = "https://opendart.fss.or.kr/api/corpCode.xml"
            async with aiohttp.ClientSession() as session:
                async with session.get(url, params=params, headers=headers) as response:
                    data = await response.read()
            ZipFile(io.BytesIO(data)).extractall(loadpath.resolve())
        tree = ET.parse(datapath.resolve())
        codes: Dict[str, Any] = {}
        for li in tree.findall("list"):
            el = li.find("stock_code")
            if el is not None:
                stockCode = el.text
                if isinstance(stockCode, str) and len(stockCode) == 6:
                    codeEl = li.find("corp_code")
                    nameEl = li.find("corp_name")
                    # Entry is created only when corp_code exists, so the
                    # corp_name assignment can never hit a missing key.
                    if codeEl is not None:
                        codes[stockCode] = {}
                        codes[stockCode]["corp_code"] = codeEl.text
                        if nameEl is not None:
                            codes[stockCode]["corp_name"] = nameEl.text
        return codes

    async def crawling(self, dto: DartApiCrawling) -> None:
        """Crawl factors for every year in [dto.startYear, dto.endYear].

        DART's full-statement API only covers 2015 onwards, so startYear is
        clamped. Emits per-company results and per-year completion events.
        """
        if dto.startYear < 2015:
            dto.startYear = 2015
        self.ee.emit(EVENT_DART_API_CRAWLING_ON_DOWNLOADING_CODES, dto)
        codes = await asyncRetryNonBlock(5, 1, self.downloadCodes, isCodeNew=dto.isCodeNew, apiKey=dto.apiKey)
        self.ee.emit(EVENT_DART_API_CRAWLING_ON_CRAWLING_FACTOR_DATA, dto)
        for year in range(dto.startYear, dto.endYear + 1):
            self.ee.emit(EVENT_DART_API_CRAWLING_ON_CRAWLING_FACTOR_DATA, dto)
            self.logger.info("crawling", str(len(codes)))
            for code in codes:
                # fix: the cancel flag previously emitted the cancel event but
                # kept crawling (re-emitting on every iteration); it now
                # emits once and stops.
                if self.isCancelled:
                    self.ee.emit(EVENT_DART_API_CRAWLING_ON_CANCEL, dto)
                    return
                newDf = await asyncRetryNonBlock(5, 1, self.getYearDf, dto.apiKey, code, codes, year)
                if newDf is not None:
                    self.logger.info("crawling", code)
                    self.ee.emit(EVENT_DART_API_CRAWLING_ON_RESULT_OF_FACTOR, dto, year, newDf.to_dict("records"))
            self.ee.emit(EVENT_DART_API_CRAWLING_ON_COMPLETE_YEAR, dto, year)
            self.logger.info("crawling", str(year))

    async def getYearDf(self, apiKey: str, code: str, codes: Dict, year: int) -> pd.DataFrame:
        """Fetch one company's full financial statement for `year`.

        Returns a DataFrame annotated with crawling_year/code/name, or None
        when DART has no data for that company-year. Network/parse errors
        are logged and re-raised so the caller's retry wrapper can act.
        """
        self.logger.info("getYearDf", f"crawling: {code}")
        df = None
        try:
            url = 'https://opendart.fss.or.kr/api/fnlttSinglAcntAll.json'
            headers = {
                'User-Agent': "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.2 (KHTML, like Gecko) Chrome/22.0.1216.0 Safari/537.2'",
                'accept-language': 'ko',
            }
            params = {
                'crtfc_key': apiKey,
                'corp_code': codes[code]["corp_code"],
                'bsns_year': year,       # business year
                'reprt_code': "11011",   # "11011": annual report
                'fs_div': "CFS",         # "CFS": consolidated, "OFS": separate
            }
            connector = aiohttp.TCPConnector(limit=50, force_close=True)
            async with aiohttp.ClientSession(connector=connector) as session:
                timeout = aiohttp.ClientTimeout(total=15)
                async with session.get(url, params=params, timeout=timeout, headers=headers) as response:
                    data = await response.json()
                    if 'list' not in data:
                        # DART signals "no data" by omitting the list key.
                        return None
                    df = pd.json_normalize(data, 'list')
        except Exception:
            self.logger.error("getYearDf", traceback.format_exc())
            # fix: was `raise e`, a no-op rewrap; bare raise preserves the
            # original traceback exactly.
            raise
        self.logger.info("df", str(df))
        if df is not None:
            df["crawling_year"] = year
            df["crawling_code"] = code
            df["crawling_name"] = codes[code]["corp_name"]
            name = codes[code]["corp_name"]
            self.logger.info("getYearDf", f"{str(year)} {str(code)} {str(name)}")
            return df
        return None
class TaskMongoDataSource(MongoDataSource):
    """Mongo access for task documents: completed listing, state projection,
    and upsert keyed by taskUniqueId."""

    def __init__(self) -> None:
        super().__init__()
        self.logger = Logger("TaskMongoDataSource")

    def getCompletedTask(self, dto: ListLimitDao) -> ListLimitDataDao:
        """Page through tasks in a terminal state, newest first.

        NOTE(review): returns [] on error although the annotation says
        ListLimitDataDao — callers appear to tolerate it; kept as-is.
        """
        try:
            data = dto.dict()
            # fix(consistency): the terminal-state filter was duplicated
            # between find() and count() and could drift; build it once.
            completedFilter = {"$or": [
                {"state": "success"},
                {"state": "fail"},
                {"state": "complete"},
                {"state": "error"},
                {"state": "cancelled"}
            ]}
            cursor = self.task.find(completedFilter)\
                .sort("createdAt", DESCENDING)\
                .skip(data["offset"])\
                .limit(data["limit"])
            # NOTE(review): Cursor.count() was removed in pymongo 4
            # (count_documents is the replacement) — confirm pinned version.
            count = self.task.find(completedFilter).count()
            # fix: dropped a leftover debug print ("res:start").
            res = ListLimitDataDao(**{
                "taskId": data["taskId"],
                "count": count,
                "offset": data["offset"],
                "limit": data["limit"],
                "data": self.exceptId(list(cursor))
            })
            return res
        except Exception:
            self.logger.error("getCompletedTask", traceback.format_exc())
            return []

    def getAllTaskState(self, taskId: str, market: str) -> list:
        """Return tasks/tasksRet projections for every (taskId, market) task."""
        try:
            cursor = self.task.find(
                {"taskId": taskId, "market": market},
                projection=["tasks", "tasksRet"])
            return list(cursor)
        except Exception:
            self.logger.error("getAllTaskState", traceback.format_exc())
            return []

    def upsertTask(self, value: dict) -> None:
        """Upsert a task document keyed by taskUniqueId, stamping updatedAt
        always and createdAt on first insert."""
        try:
            value["updatedAt"] = getNow()
            self.task.update_one({"taskUniqueId": value["taskUniqueId"]}, {
                "$set": value,
                "$setOnInsert": {"createdAt": getNow()}
            }, upsert=True)
        except Exception:
            self.logger.error("upsertTask", traceback.format_exc())
class StockMongoDataSource(MongoDataSource):
    """Mongo access for market-cap rows and stock-task documents."""

    def __init__(self) -> None:
        super().__init__()
        self.logger = Logger("StockMongoDataSource")

    async def insertMarcap(self, li: List[StockMarketCapital]) -> None:
        """Upsert every row in `li`.

        fix: rows were scheduled via fire-and-forget create_task, so this
        coroutine returned before any write finished — callers awaiting it
        (e.g. completion bookkeeping) wrongly assumed persistence, and the
        unreferenced tasks could be garbage-collected mid-flight. The
        writes are now gathered before returning.
        """
        try:
            if not self.isSetupMarcap():
                self.setupMarcap()
            await asyncio.gather(*(self.insertMarpcapOne(one) for one in li))
        except Exception:
            self.logger.error("insertMarcap", traceback.format_exc())

    async def insertMarpcapOne(self, one: StockMarketCapital) -> None:
        """Upsert a single row keyed by (code, date, market)."""
        try:
            data = one.dict()
            data["updatedAt"] = getNow()
            self.marcap.update_one({
                "code": data["code"],
                "date": data["date"],
                "market": data["market"]
            }, {
                "$set": data,
                "$setOnInsert": {"createdAt": getNow()}
            }, upsert=True)
        except Exception:
            self.logger.error("insertMarpcapOne", traceback.format_exc())

    async def getMarcap(self, market: str, startDate: str, endDate: str) -> List[StockMarketCapital]:
        """Return rows for `market` with startDate <= date <= endDate.

        Returns an empty list on any error (logged, not raised).
        """
        try:
            if not self.isSetupMarcap():
                self.setupMarcap()
            cursor = self.marcap.find({"$and": [
                {"date": {"$gte": startDate, "$lte": endDate}},
                {"market": market}
            ]})
            fields = [
                "date", "market", "code", "name", "close", "diff", "percent",
                "open", "high", "low", "volume", "price", "marcap", "number"
            ]
            # fix(idiom): map/lambda over a 14-entry literal replaced with a
            # field list; the constructed kwargs are unchanged.
            return [
                StockMarketCapital(**{field: doc[field] for field in fields})
                for doc in cursor
            ]
        except Exception:
            self.logger.error("getMarcap", traceback.format_exc())
            return list()

    def getCompletedTask(self, dto: ListLimitData) -> ListLimitResponse:
        """Page through terminal-state stock tasks, newest first.

        NOTE(review): returns [] on error although the annotation says
        ListLimitResponse — callers appear to tolerate it; kept as-is.
        """
        try:
            data = dto.dict()
            # fix(consistency): the state filter was duplicated between
            # find() and count() and could drift; build it once.
            completedFilter = {"$or": [
                {"state": "complete"},
                {"state": "error"},
                {"state": "cancelled"}
            ]}
            cursor = self.task.find(completedFilter)\
                .sort("createdAt", DESCENDING)\
                .skip(data["offset"])\
                .limit(data["limit"])
            # NOTE(review): Cursor.count() was removed in pymongo 4
            # (count_documents is the replacement) — confirm pinned version.
            count = self.task.find(completedFilter).count()
            res = ListLimitResponse(**{
                "count": count,
                "offset": data["offset"],
                "limit": data["limit"],
                "data": self.exceptId(list(cursor))
            })
            return res
        except Exception:
            self.logger.error("getCompletedTask", traceback.format_exc())
            return []

    def getAllTaskState(self, taskId: str, market: str) -> list:
        """Return tasks/tasksRet projections for every (taskId, market) task."""
        try:
            cursor = self.task.find({
                "taskId": taskId,
                "market": market
            }, projection=["tasks", "tasksRet"])
            return list(cursor)
        except Exception:
            self.logger.error("getAllTaskState", traceback.format_exc())
            return []

    def upsertTask(self, value: dict) -> None:
        """Upsert a task document keyed by taskUniqueId, stamping updatedAt
        always and createdAt on first insert."""
        try:
            value["updatedAt"] = getNow()
            self.task.update_one({
                "taskUniqueId": value["taskUniqueId"]
            }, {
                "$set": value,
                "$setOnInsert": {"createdAt": getNow()}
            }, upsert=True)
        except Exception:
            self.logger.error("upsertTask", traceback.format_exc())