def __init__(self, id: str, func: Callable, param: Optional[Any] = None) -> None:
    """Create a task wrapping an async callable and its keyword arguments.

    Args:
        id: Unique identifier of this task.
        func: Async callable executed when the task runs.
        param: Keyword arguments forwarded to ``func``. Defaults to a
            fresh empty dict per instance.
    """
    super().__init__()
    self.id = id
    self.func = func
    # Fix: the original used a mutable default argument (`param: Any = {}`),
    # which is shared across every Task created without an explicit param.
    # Build a fresh dict per instance instead; behavior for callers that
    # pass their own dict is unchanged.
    self.param = {} if param is None else param
    self.logger = Logger("Task")
    # Event loop is attached later by the runner; None until then.
    self.loop: Optional[AbstractEventLoop] = None
def __init__(self, stockRepository: StockRepository, tasksRepository: TasksRepository, crawlerRepository: CrawlerRepository) -> None:
    """Wire the stock service to its three backing repositories."""
    self.logger = Logger("StockService")
    self.stockRepository = stockRepository
    self.tasksRepository = tasksRepository
    self.crawlerRepository = crawlerRepository
def __init__(self) -> None:
    """Set up the runner's work queue, event loop handle and task pool.

    NOTE(review): ``asyncio.get_running_loop()`` raises unless this object
    is constructed from inside a running event loop — TODO confirm callers.
    """
    super().__init__()
    self.logger = Logger("TaskRunner")
    self.notifyCallback = None
    self.loop = asyncio.get_running_loop()
    self.queue: asyncio.Queue = asyncio.Queue()
    # The pool reports removals back through our callback.
    self.pool = TaskPool(notifyCallback=self.notifyRmOnPool)
def __init__(self, mongod: TaskMongoDataSource) -> None:
    """Hold in-memory task state, expose task events, and start the runner."""
    super().__init__()
    self.logger = Logger("TasksRepository")
    self.mongod = mongod
    self.taskEventEmitter = EventEmitter()
    self.tasksdto = ProcessTasks()
    # Runner is created exactly once; createTaskRunner() guards on None.
    self.taskRunner: Optional[TaskRunner] = None
    self.createTaskRunner()
def __init__(self, manager: ConnectionManager, factorRepository: FactorRepository, tasksRepository: TasksRepository, crawlerRepository: CrawlerRepository, taskService: 'TaskService') -> None:
    """Wire the factor service to its collaborators and create its logger."""
    self.logger = Logger("FactorService")
    self.manager = manager
    self.taskService = taskService
    self.factorRepository = factorRepository
    self.tasksRepository = tasksRepository
    self.crawlerRepository = crawlerRepository
class Task(object):
    """A unit of work: an id plus an async callable and its keyword args.

    The runner attaches an event loop to ``self.loop`` before ``run`` is
    called; ``run`` is a no-op if no loop has been attached.
    """

    def __init__(self, id: str, func: Callable, param: Optional[Any] = None) -> None:
        """Create a task.

        Args:
            id: Unique identifier of this task.
            func: Async callable executed when the task runs.
            param: Keyword arguments forwarded to ``func``; a fresh empty
                dict per instance when omitted.
        """
        super().__init__()
        self.id = id
        self.func = func
        # Fix: the original default `param: Any = {}` is a mutable default
        # argument shared by every Task constructed without a param.
        self.param = {} if param is None else param
        self.logger = Logger("Task")
        self.loop: Optional[AbstractEventLoop] = None

    async def run(self, taskPool: TaskPool, pool: Pool) -> None:
        """Inject the pool handles into the kwargs and await the callable.

        Args:
            taskPool: Pool manager, passed through to ``func`` as "taskPool".
            pool: The slot executing this task, passed through as "pool".
        """
        self.logger.info("run", "task run")
        if self.loop:
            self.param["taskPool"] = taskPool
            self.param["pool"] = pool
            await self.loop.create_task(self.func(**self.param))
def __init__(self, mongod: StockMongoDataSource, tasksRepository: TasksRepository) -> None:
    """Bind the repository to its Mongo data source and event emitter."""
    super().__init__()
    self.logger = Logger("StockRepository")
    self.ee = EventEmitter()
    self.mongod = mongod
    self.tasksRepository = tasksRepository
def __init__(self, factorMongod: FactorMongoDataSource, factorDartMongod: FactorDartMongoDataSource, filed: FactorFileDataSource) -> None:
    """Bind the factor repository to its Mongo and file data sources."""
    super().__init__()
    self.logger = Logger("FactorRepository")
    self.filed = filed
    self.factorMongod = factorMongod
    self.factorDartMongod = factorDartMongod
def __init__(
    self,
    manager: ConnectionManager,
    tasksRepository: TasksRepository,
    taskScheduler: TaskScheduler,
    factorService: FactorService,
    stockService: StockService,
    crawlerRepository: CrawlerRepository
) -> None:
    """Wire the task service, reuse the repository's emitter, hook events."""
    self.logger = Logger("TaskService")
    self.manager = manager
    self.taskScheduler = taskScheduler
    self.factorService = factorService
    self.stockService = stockService
    self.tasksRepository = tasksRepository
    self.crawlerRepository = crawlerRepository
    # Share the repository's emitter so repo events reach this service.
    self.ee = self.tasksRepository.taskEventEmitter
    self.setupEvents()
def marcapJob(marcapDtos: List[StockRunCrawling]) -> None:
    """Stamp each crawl request with a unique id and hand the batch off.

    Requests flagged ``isNow`` get their date range rewritten to today
    AFTER the unique id is built from the originally requested range.
    """
    logger = Logger("TaskService_marcapJob")
    service: StockService = Locator.getInstance().get(StockService)
    for dto in marcapDtos:
        logger.info("#### schedule job start ####")
        logger.info(f"command{dto.startDateStr}~{dto.endDateStr}")
        # Unique id uses the requested (pre-"now") range plus a UUID suffix.
        dto.taskUniqueId = f"{dto.taskId}{dto.market}{dto.startDateStr}{dto.endDateStr}{uuid.uuid4()}"
        if dto.isNow:
            today = getNowDateStr()
            dto.startDateStr = today
            dto.endDateStr = today
        logger.info(f"real:{dto.startDateStr}~{dto.endDateStr}")
    service.crawlingMarcapStockData(marcapDtos)
class Pool(object):
    """A slot that owns at most one running Task as an asyncio future."""

    def __init__(self) -> None:
        """Create an idle slot with no task attached."""
        super().__init__()
        self.isRun = False
        self.logger = Logger("Pool")
        self.task: Optional[Task] = None
        self.taskId = ""
        # Fix: the original never initialized poolTask, so calling cancel()
        # before run() raised AttributeError. Define it up front.
        self.poolTask: Optional[asyncio.Future] = None

    def setTask(self, task: Task) -> None:
        """Attach a task to this slot and remember its id."""
        self.task = task
        self.taskId = task.id

    def run(self, taskPool: TaskPool) -> None:
        """Schedule the attached task on the event loop as a future.

        Args:
            taskPool: Pool manager forwarded to ``Task.run``.
        """
        self.isRun = True
        self.logger.info("run", "task pool run")
        if self.task is not None:
            self.poolTask = asyncio.ensure_future(self.task.run(taskPool, self))

    def cancel(self) -> None:
        """Cancel the scheduled future, if any, and mark the slot idle."""
        self.isRun = False
        self.logger.info("cancel", "task pool cancel")
        if self.poolTask and not self.poolTask.cancelled():
            self.poolTask.cancel()
class MarcapCrawler(object):
    """Crawls daily market-capitalization CSVs from the KRX data portal.

    Drives a (remote) Selenium Chrome session to trigger CSV downloads,
    watches the download directory via a DownloadObserver, parses each
    received file and reports progress over an EventEmitter.
    """

    def __init__(self) -> None:
        super().__init__()
        self.ee = EventEmitter()
        self.logger = Logger("MarcapCrawler")

    def createUUID(self) -> str:
        """Return a fresh UUID string used to isolate a download directory."""
        return str(uuid.uuid4())

    async def connectWebDriver(self, addr: str, uuid: str) -> WebDriver:
        """Connect to a remote Chrome that downloads into a per-run folder.

        Args:
            addr: Selenium remote command-executor URL.
            uuid: Run id; becomes the download subdirectory name.
        """
        chrome_options = webdriver.ChromeOptions()
        prefs = {
            'profile.default_content_setting_values.automatic_downloads': 1,
            # Path inside the selenium container image — TODO confirm.
            'download.default_directory': f"/home/seluser/Downloads/{uuid}"
        }
        chrome_options.add_experimental_option("prefs", prefs)
        driver = webdriver.Remote(
            command_executor=addr,
            options=chrome_options,
        )
        driver.set_page_load_timeout(60)
        driver.set_script_timeout(60)
        self.logger.info("connectWebDriver", "create driver")
        return driver

    def connectLocalDriver(self, addr: str, uuid: str) -> WebDriver:
        """Local-development variant of connectWebDriver (hard-coded paths)."""
        chrome_options = webdriver.ChromeOptions()
        prefs = {
            'profile.default_content_setting_values.automatic_downloads': 1,
            'download.default_directory': f"/Users/iseongjae/Documents/PersonalProjects/fin-web/fin-crawling-server/server/downloads/{uuid}"
        }
        chrome_options.add_experimental_option("prefs", prefs)
        driver = webdriver.Chrome(executable_path="/Users/iseongjae/Downloads/chromedriver", chrome_options=chrome_options)
        return driver

    async def crawling(self, dto: StockRunCrawling) -> None:
        """Run one crawl: connect, then download one CSV per day in range.

        Emits lifecycle events as it goes; always stops the observer and
        quits the driver in the finally block.
        """
        driver = None
        downloadObserver = None
        try:
            uuid = self.createUUID()
            self.logger.info("crawling", uuid)
            self.ee.emit(EVENT_MARCAP_CRAWLING_ON_CONNECTING_WEBDRIVER, dto)
            downloadObserver = DownloadObserver()
            path = await asyncRetryNonBlock(5, 1, downloadObserver.makePath, uuid)
            downloadObserver.startObserver(path, self.ee)
            self.logger.info("crawling", "create observer and start")
            print("startObserver")
            driver = await asyncRetryNonBlock(5, 1, self.connectWebDriver, dto.driverAddr, uuid)
            print("connectWebDriver")
            driver.get("http://data.krx.co.kr/contents/MDC/MDI/mdiLoader/index.cmd?menuId=MDC0201020101")
            # The portal sometimes shows an alert on load; accept and move on.
            try:
                alert = WebDriverWait(driver, timeout=3).until(EC.alert_is_present())
                alert.accept()
            except Exception as e:
                print("예외발생:"+str(e))
            print("start:"+dto.startDateStr)
            self.ee.emit(EVENT_MARCAP_CRAWLING_ON_START_CRAWLING, dto)
            # Wait until the market radio button is clickable before looping.
            WebDriverWait(driver, timeout=20, poll_frequency=1).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#mktId_0_1")))
            date = datetime.strptime(dto.startDateStr, "%Y%m%d")
            endDate = datetime.strptime(dto.endDateStr, "%Y%m%d")
            # One download task per calendar day, inclusive of endDate.
            while date <= endDate:
                dateStr = date.strftime("%Y%m%d")
                downloadTask = StockCrawlingDownloadTask(**{
                    "dateStr": dateStr,
                    "market": dto.market,
                    "uuid": uuid,
                    "taskId": dto.taskId,
                    "taskUniqueId": dto.taskUniqueId
                })
                self.logger.info("crawling", f"create downloadTask taskId: {dto.taskId} market: {dto.market} date: {dateStr} taskUniqueId: {dto.taskUniqueId}")
                print(downloadTask.json())
                downloadObserver.event_handler.setDownloadTask(downloadTask)
                self.ee.emit(EVENT_MARCAP_CRAWLING_ON_DOWNLOAD_START, downloadTask)
                await asyncRetryNonBlock(5, 1, self.downloadData, downloadTask, downloadObserver, driver)
                # await self.downloadData(downloadTask, downloadObserver, driver)
                date = date + timedelta(days=1)
        except Exception as e:
            raise e
        finally:
            if downloadObserver:
                downloadObserver.stopObserver()
            if driver:
                driver.quit()

    async def downloadData(self, downloadTask: StockCrawlingDownloadTask, downloadObserver: DownloadObserver, driver: WebDriver) -> None:
        """Trigger one CSV download in the browser and wait for the file.

        Selects the market, sets the trade date, clicks search, waits for
        the page timestamp to change (i.e. data reloaded), then clicks the
        CSV download button and blocks (max 30s) until the file-system
        observer reports the created file.
        """
        self.logger.info("downloadData")
        if driver is None:
            return
        # pymitter
        # Remember the page's data timestamp so we can detect the reload.
        before = driver.execute_script("return $('.CI-MDI-UNIT-TIME').text()")
        if downloadTask.market == "kospi":
            driver.execute_script('$("#mktId_0_1").click()')
        elif downloadTask.market == "kosdaq":
            driver.execute_script('$("#mktId_0_2").click()')
        elif downloadTask.market == "konex":
            driver.execute_script('$("#mktId_0_3").click()')
        # driver.implicitly_wait(1)
        driver.execute_script(f'$("#trdDd")[0].value = "{downloadTask.dateStr}"')
        # driver.implicitly_wait(1)
        driver.execute_script('$(".btn_component_search").click()')
        # driver.implicitly_wait(1)
        # Poll until the timestamp changes, meaning the new date's data loaded.
        after = before
        while before == after:
            after = driver.execute_script('return $(".CI-MDI-UNIT-TIME").text()')
            await sleepNonBlock(0.5)
            # driver.implicitly_wait(1)
        print("before:"+before)
        print("after:"+after)
        await sleepNonBlock(3)
        WebDriverWait(driver, timeout=10, poll_frequency=2).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "*[class='CI-MDI-UNIT-DOWNLOAD']")))
        driver.execute_script("$('[class=\"CI-MDI-UNIT-DOWNLOAD\"]').click()")
        WebDriverWait(driver, timeout=10, poll_frequency=2).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "*[data-type='csv']")))
        driver.execute_script("$(\"[data-type='csv']\").click()")
        print("wait:"+downloadTask.dateStr)
        loop = asyncio.get_running_loop()
        # NOTE(review): the `loop=` kwarg was removed from asyncio.Queue in
        # Python 3.10 — this line only runs on <=3.9; confirm target version.
        queue: asyncio.Queue = asyncio.Queue(maxsize=1, loop=loop)

        # Bridge the sync file-created callback into the async queue.
        async def fileResultOfData(event: FileCreatedEvent, downloadTask: StockCrawlingDownloadTask) -> None:
            result = {}
            result["event"] = event
            result["downloadTask"] = downloadTask
            await queue.put(result)

        @self.ee.once(FILE_SYSTEM_HANDLER(downloadTask.uuid))
        def downloadComplete(event: FileCreatedEvent, downloadTask: StockCrawlingDownloadTask) -> None:
            loop.create_task(fileResultOfData(event, downloadTask))

        try:
            # Wait at most 30s for the observer to report the downloaded file.
            result = await asyncio.wait_for(queue.get(), timeout=30)
            self.ee.emit(EVENT_MARCAP_CRAWLING_ON_DOWNLOAD_COMPLETE, downloadTask)
            await asyncio.create_task(self.makeMarcapData(result["event"], result["downloadTask"]))
        except Exception as e:
            raise e
        finally:
            queue.task_done()

    def convertFileToDto(self, path: str, dto: StockMarketCapitalResult) -> None:
        """Parse the downloaded CSV at ``path`` into ``dto.data`` rows.

        KOSPI files have one fewer leading column than KOSDAQ/KONEX, hence
        the shifted indices in the else branch. Skips the header row.
        """
        lines = []
        with open(path, "r", encoding="utf-8") as f:
            # p = Path(f.name)
            # dto.date = p.stem
            lines = f.readlines()
        for i in range(1, len(lines)):
            data = lines[i].replace('"', '').split(",")
            if dto.market == "kospi":
                marcap = StockMarketCapital(**{
                    "date": dto.date,
                    "market": dto.market,
                    "code": data[0].strip(),
                    "name": data[1].strip(),
                    "close": data[2].strip(),
                    "diff": data[3].strip(),
                    "percent": data[4].strip(),
                    "open": data[5].strip(),
                    "high": data[6].strip(),
                    "low": data[7].strip(),
                    "volume": data[8].strip(),
                    "price": data[9].strip(),
                    "marcap": data[10].strip(),
                    "number": data[11].strip()
                })
            else:
                marcap = StockMarketCapital(**{
                    "date": dto.date,
                    "market": dto.market,
                    "code": data[0].strip(),
                    "name": data[1].strip(),
                    "close": data[3].strip(),
                    "diff": data[4].strip(),
                    "percent": data[5].strip(),
                    "open": data[6].strip(),
                    "high": data[7].strip(),
                    "low": data[8].strip(),
                    "volume": data[9].strip(),
                    "price": data[10].strip(),
                    "marcap": data[11].strip(),
                    "number": data[12].strip()
                })
            # print("append marcap: " + str(marcap))
            dto.data.append(marcap)

    async def isExistFile(self, path: str, ext: str = ".csv") -> bool:
        """Return True once ``path`` ends with ``ext``, retrying ~4 times.

        NOTE(review): this only re-checks the same string's suffix — it does
        not stat the file system; presumably the observer delivers temp-file
        paths that are renamed to .csv. Confirm intent.
        """
        isExist = path.endswith(ext)
        restTimes = 3
        while not isExist and restTimes >= 0:
            await sleepNonBlock(1)
            isExist = path.endswith(ext)
            restTimes -= 1
        return isExist

    async def parseReceivedFile(self, event: FileCreatedEvent, downloadTask: StockCrawlingDownloadTask) -> None:
        """Re-encode, rename and parse one downloaded file, then emit results.

        Returns early (no events) when the path never looks like a CSV or
        the destination file already exists (duplicate event).
        """
        retdto = StockMarketCapitalResult()
        date = downloadTask.dateStr
        market = downloadTask.market
        retdto.date = date
        retdto.market = market
        isExist = await self.isExistFile(event.src_path)
        if not isExist:
            return
        print("created: " + date)
        await sleepNonBlock(0.5)
        dest_path = f'{os.path.dirname(event.src_path)}/{market+"-"+date}.csv'
        if os.path.isfile(dest_path):
            return
        # KRX serves euc-kr; normalize to utf-8 before parsing.
        self.changeCharSet(event.src_path)
        os.rename(event.src_path, dest_path)
        self.convertFileToDto(dest_path, retdto)
        retdto.result = "success"
        self.ee.emit(EVENT_MARCAP_CRAWLING_ON_PARSING_COMPLETE, True, retdto, downloadTask)
        self.ee.emit(EVENT_MARCAP_CRAWLING_ON_RESULT_OF_STOCK_DATA, downloadTask, retdto)
        self.logger.info("parseFile", f"success, {downloadTask.taskUniqueId}")

    async def makeMarcapData(self, event: FileCreatedEvent, downloadTask: StockCrawlingDownloadTask) -> None:
        """Parse with up to 3 retries; on final failure emit a fail result."""
        try:
            await asyncRetry(3, 1, self.parseReceivedFile, event, downloadTask)
        except Exception:
            retdto = StockMarketCapitalResult()
            retdto.result = "fail"
            retdto.errorMsg = traceback.format_exc()
            self.ee.emit(EVENT_MARCAP_CRAWLING_ON_PARSING_COMPLETE, False, retdto, downloadTask)
            self.logger.error("parseFile", f"fail, {downloadTask.taskUniqueId} error: {traceback.format_exc()}")
        finally:
            self.logger.info("parseFile...")

    def changeCharSet(self, path: str) -> None:
        """Rewrite the file at ``path`` from euc-kr to utf-8 in place."""
        lines = None
        with open(path, "r", encoding="euc-kr") as f:
            lines = f.readlines()
        with open(path, 'w', encoding="utf-8") as f:
            f.writelines(lines)
class TasksRepository(object):
    """In-memory registry of running ProcessTasks, persisted to MongoDB.

    Keeps tasks grouped by taskId in ``self.tasksdto`` (each group has a
    "list" dict keyed by taskUniqueId and an "ids" list), broadcasts
    changes over ``taskEventEmitter``, and owns the TaskRunner that
    executes queued Tasks.
    """

    def __init__(self, mongod: TaskMongoDataSource) -> None:
        super().__init__()
        self.mongod = mongod
        self.logger = Logger("TasksRepository")
        self.taskEventEmitter = EventEmitter()
        self.tasksdto = ProcessTasks()
        self.taskRunner: Optional[TaskRunner] = None
        self.createTaskRunner()

    # Create the task runner (once).
    def createTaskRunner(self) -> None:
        if self.taskRunner is None:
            self.taskRunner = TaskRunner()
            self.taskRunner.notifyCallback = self.onUpdatePoolInfo
            self.logger.info("createTaskRunner", "created taskrunner")

    # Emit an event whenever the task-pool info is updated.
    def onUpdatePoolInfo(self, poolInfo: TaskPoolInfo) -> None:
        self.taskEventEmitter.emit(EVENT_TASK_REPO_UPDATE_POOL_INFO, poolInfo)
        self.logger.info("updatePoolInfo", f"{poolInfo.json()}")

    # Fetch the current pool info and broadcast it.
    def getPoolInfo(self) -> None:
        if self.taskRunner:
            poolInfo = self.taskRunner.getPoolInfo()
            self.taskEventEmitter.emit(EVENT_TASK_REPO_UPDATE_POOL_INFO, poolInfo)

    # Enqueue a task on the runner's pool.
    def runTask(self, task: Task) -> None:
        # print("runTask")
        if self.taskRunner:
            self.taskRunner.put(task)

    # Register a newly added task, creating its taskId group on first use.
    def addTask(self, task: ProcessTask) -> None:
        if task.taskId not in self.tasksdto.tasks:
            self.tasksdto.tasks[task.taskId] = dict()
            self.tasksdto.tasks[task.taskId]["list"] = dict()
            self.tasksdto.tasks[task.taskId]["ids"] = []
            self.tasksdto.taskIds.append(task.taskId)
        self.tasksdto.tasks[task.taskId]["list"][task.taskUniqueId] = task
        self.tasksdto.tasks[task.taskId]["ids"].append(task.taskUniqueId)
        self.taskEventEmitter.emit(EVENT_TASK_REPO_UPDATE_TASKS, self.tasksdto)
        self.logger.info("addTask", f"{task.taskUniqueId}")

    # Persist an updated task to Mongo and broadcast the new state.
    def updateTask(self, task: ProcessTask) -> None:
        self.tasksdto.tasks[task.taskId]["list"][task.taskUniqueId] = task
        self.logger.info("updateTask", f"{task.taskUniqueId}")
        self.mongod.upsertTask(task.dict())
        self.taskEventEmitter.emit(EVENT_TASK_REPO_UPDATE_TASKS, self.tasksdto)

    # Re-broadcast the whole task table unchanged.
    def updateAllTask(self) -> None:
        self.taskEventEmitter.emit(EVENT_TASK_REPO_UPDATE_TASKS, self.tasksdto)

    # Return the stored task, or None if absent.
    # NOTE(review): annotation says ProcessTask but None is returned on miss.
    def getTask(self, taskId: str, taskUniqueId: str) -> ProcessTask:
        if self.isExistTask(taskId, taskUniqueId):
            return self.tasksdto.tasks[taskId]["list"][taskUniqueId]
        return None

    # True when the (taskId, taskUniqueId) pair is registered.
    def isExistTask(self, taskId: str, taskUniqueId: str) -> bool:
        return taskId in self.tasksdto.tasks and taskUniqueId in self.tasksdto.tasks[taskId]["list"]

    # Remove a stored task from its group (the group itself is kept).
    def deleteTask(self, task: ProcessTask) -> None:
        if task.taskId in self.tasksdto.tasks:
            if task.taskUniqueId in self.tasksdto.tasks[task.taskId]["list"]:
                del self.tasksdto.tasks[task.taskId]["list"][task.taskUniqueId]
                self.tasksdto.tasks[task.taskId]["ids"].remove(task.taskUniqueId)
                self.logger.info("deleteTask", f"{task.taskUniqueId}")

    # Mark a task as errored with the given message and persist it.
    def errorTask(self, dto: TaskModel, errMsg: str) -> None:
        task = self.getTask(dto.taskId, dto.taskUniqueId)
        task.state = "error"
        task.errMsg = errMsg
        self.updateTask(task)

    # Finish a single-step factor file-to-db conversion task.
    def completeFactorConvertFileToDbTask(self, task: ProcessTask) -> None:
        self.success(task, 1)
        self.updateTask(task)
        self.deleteTask(task)
        self.taskEventEmitter.emit(EVENT_TASK_REPO_TASK_COMPLETE, "factorFile", None)

    # Record one completed year of DART factor crawling; finalize when done.
    def completeFactorDart(self, task: ProcessTask, year: int) -> None:
        self.success(task, 1)
        self.updateTask(task)
        if task.restCount <= 0:
            self.deleteTask(task)
            task.state = "complete"
            self.updateTask(task)
            self.logger.info("completeFactorDart", "complete")
        self.taskEventEmitter.emit(
            EVENT_TASK_REPO_TASK_COMPLETE, "factorDart",
            StockUpdateState(**{
                "taskId": task.taskId,
                "market": task.market,
                "date": year,
                "ret": 1
            }))

    # Record one finished (success or fail) stock-crawl subtask.
    def completeStockCrawlingTask(self, isSuccess: bool, retdto: StockMarketCapitalResult, dto: StockCrawlingDownloadTask) -> None:
        self.logger.info("##############completeStockCrawlingTask", str(isSuccess))
        task = self.getTask(dto.taskId, dto.taskUniqueId)
        if isSuccess:
            self.success(task, 1)
        else:
            self.fail(task, 1)
        if task.restCount <= 0:
            self.deleteTask(task)
            if retdto:
                task.errMsg = retdto.errorMsg
            task.state = "success"
            self.updateTask(task)
            self.logger.info("completeStockCrawlingTask", "complete")
        self.taskEventEmitter.emit(
            EVENT_TASK_REPO_TASK_COMPLETE, "marcap",
            StockUpdateState(**{
                "taskId": dto.taskId,
                "market": dto.market,
                "date": dto.dateStr,
                "ret": 1 if isSuccess else 2
            }))

    # Account for `count` successful subtasks and update progress/state.
    def success(self, task: ProcessTask, count: int) -> None:
        task.successCount = task.successCount + count
        task.restCount = task.restCount - count
        i = 0
        for _ in range(count):
            task.tasksRet[task.index + i] = SUCCESS
            i = i + 1
        task.index = task.index + count
        task.percent = (task.successCount + task.failCount) / task.count * 100
        if task.restCount <= 0:
            task.state = "success"
        else:
            task.state = "waiting next task"
        self.logger.info("success", f"{task.taskUniqueId}")

    # Account for `count` failed subtasks, remembering which inputs failed.
    def fail(self, task: ProcessTask, count: int) -> None:
        task.failCount = task.failCount + count
        task.restCount = task.restCount - count
        i = 0
        for _ in range(count):
            left = task.tasks[task.index + i]
            task.failTasks.append(left)
            task.tasksRet[task.index + i] = FAIL
            i = i + 1
        task.index = task.index + count
        task.percent = (task.successCount + task.failCount) / task.count * 100
        if task.restCount <= 0:
            task.state = "fail"
        else:
            task.state = "waiting next task"
        self.logger.info("fail", f"{task.taskUniqueId}")

    # Load completed tasks from Mongo and regroup them by taskId.
    def getCompletedTask(self, dto: ListLimitDao) -> ListLimitDataDao:
        taskData = self.mongod.getCompletedTask(dto)
        print(taskData)
        tasks: Dict = dict()
        taskIds = []
        for task in taskData.data:
            if task["taskId"] not in tasks:
                tasks[task["taskId"]] = dict()
                tasks[task["taskId"]]["list"] = dict()
                tasks[task["taskId"]]["ids"] = []
                taskIds.append(task["taskId"])
            tasks[task["taskId"]]["list"][task["taskUniqueId"]] = task
            tasks[task["taskId"]]["ids"].append(task["taskUniqueId"])
        stockCrawlingCompletedTasksDTO = StockCrawlingCompletedTasks(**{
            "history": tasks,
            "historyIds": taskIds
        })
        taskData.data = stockCrawlingCompletedTasksDTO
        self.logger.info("getCompletedTask", f"count: {len(taskIds)}")
        return taskData

    # Aggregate per-date results for a taskId across both markets.
    # A date counts as success (ret == 1) if ANY run of it succeeded;
    # `count` tallies distinct dates per year.
    def getAllTaskState(self, taskId: str) -> StockTaskState:
        markets = ["kospi", "kosdaq"]
        resultDict: YearData = YearData(**{"yearData": dict()})
        resultDict.yearData[taskId] = dict()
        for market in markets:
            data = self.mongod.getAllTaskState(taskId, market)
            compDict: Dict = {}
            count: Dict = {}
            for one in data:
                for idx, taskDate in enumerate(one["tasks"]):
                    if taskDate in compDict.keys():
                        if compDict[taskDate]["ret"] == 1 or one["tasksRet"][idx] == 1:
                            compDict[taskDate] = {"date": taskDate, "ret": 1}
                    else:
                        year = taskDate[0:4]
                        if year in count.keys():
                            count[year] = count[year] + 1
                        else:
                            count[year] = 1
                        compDict[taskDate] = {
                            "date": taskDate,
                            "ret": one["tasksRet"][idx]
                        }
            collect: List = list(compDict.values())
            collect = sorted(collect, key=lambda x: x["date"])
            resultDict.yearData[taskId][market] = StockTaskState(**{
                "taskStates": compDict,
                "taskKeys": compDict.keys(),
                "stocks": collect,
                "years": count,
                "market": market,
                "taskId": taskId
            })
        return resultDict
class StockService:
    """Orchestrates market-cap crawling: builds tasks, runs crawlers and
    reacts to crawler events by updating the task repository."""

    def __init__(self, stockRepository: StockRepository, tasksRepository: TasksRepository, crawlerRepository: CrawlerRepository) -> None:
        self.stockRepository = stockRepository
        self.tasksRepository = tasksRepository
        self.crawlerRepository = crawlerRepository
        self.logger = Logger("StockService")

    async def getStockData(self, market: str, startDate: str, endDate: str) -> List[StockMarketCapital]:
        """Fetch stored market-cap rows for a market and date range."""
        return await self.stockRepository.getStockData(market, startDate, endDate)

    def crawlingMarcapStockData(self, dtoList: List[StockRunCrawling]) -> None:
        """Create and enqueue one crawl worker task per "marcap" request.

        For each request: builds an async worker closure that runs a
        MarcapCrawler, registers a ProcessTask covering every date in the
        requested range, and submits the worker to the task runner.
        Duplicate (taskId, taskUniqueId) pairs abort the whole loop.
        """
        self.logger.info("crawlingMarcapStockData", str(len(dtoList)))
        for dto in dtoList:
            if dto.taskId == "marcap":
                async def marcapTaskWorker(runDto: StockRunCrawling, pool: Pool, taskPool: TaskPool) -> None:
                    try:
                        self.logger.info("runCrawling&marcapTaskWorker", "start")
                        marcapCrawler = MarcapCrawler()
                        taskUniqueId = runDto.taskUniqueId
                        self.crawlerRepository.addCrawler(taskUniqueId, marcapCrawler)
                        self.createListners(marcapCrawler.ee)
                        self.logger.info("runCrawling&marcapTaskWorker", f"taskWorker:{taskUniqueId}")
                        await marcapCrawler.crawling(runDto)
                        taskPool.removeTaskPool(pool)
                        self.crawlerRepository.removeCrawler(taskUniqueId)
                    except asyncio.CancelledError:
                        # NOTE(review): log tag "convertFactorFileToDbTask"
                        # looks copy-pasted from another worker — confirm.
                        self.logger.info("convertFactorFileToDbTask", "cancel")
                    except Exception:
                        self.logger.error("convertFactorFileToDbTask", f"error: {traceback.format_exc()}")
                        self.tasksRepository.errorTask(runDto, traceback.format_exc())
                workerTask = Task(dto.taskUniqueId, marcapTaskWorker, {"runDto": dto})
                if self.tasksRepository.taskRunner:
                    # Skip duplicates; note this returns from the whole
                    # method, not just this dto.
                    if self.tasksRepository.isExistTask(dto.taskId, dto.taskUniqueId):
                        return
                    startDate = datetime.strptime(dto.startDateStr, "%Y%m%d")
                    endDate = datetime.strptime(dto.endDateStr, "%Y%m%d")
                    # Every calendar day in the inclusive range becomes a subtask.
                    taskDates = [
                        (startDate + timedelta(days=x)).strftime("%Y%m%d")
                        for x in range((endDate - startDate).days + 1)
                    ]
                    task = ProcessTask(**{
                        "market": dto.market,
                        "startDateStr": dto.startDateStr,
                        "endDateStr": dto.endDateStr,
                        "taskUniqueId": dto.taskUniqueId,
                        "taskId": dto.taskId,
                        "count": len(taskDates),
                        "tasks": deque(taskDates),
                        "restCount": len(taskDates),
                        "tasksRet": deque(([0] * len(taskDates))),
                    })
                    task.state = "find worker"
                    self.tasksRepository.addTask(task)
                    self.tasksRepository.runTask(workerTask)
                    self.logger.info("runMarcapTask", f"runTask {task.json()}")

    def createListners(self, ee: EventEmitter) -> None:
        """Subscribe this service's handlers to a crawler's event emitter."""
        ee.on(EVENT_MARCAP_CRAWLING_ON_RESULT_OF_STOCK_DATA, self.onResultOfStockData)
        ee.on(EVENT_MARCAP_CRAWLING_ON_CONNECTING_WEBDRIVER, self.onConnectingWebDriver)
        ee.on(EVENT_MARCAP_CRAWLING_ON_START_CRAWLING, self.onStartCrawling)
        ee.on(EVENT_MARCAP_CRAWLING_ON_DOWNLOAD_START, self.onDownloadStart)
        ee.on(EVENT_MARCAP_CRAWLING_ON_DOWNLOAD_COMPLETE, self.onDownloadComplete)
        ee.on(EVENT_MARCAP_CRAWLING_ON_PARSING_COMPLETE, self.onParsingComplete)
        ee.on(EVENT_MARCAP_CRAWLING_ON_ERROR, self.onError)
        ee.on(EVENT_MARCAP_CRAWLING_ON_CANCEL, self.onCancelled)

    # Persist a crawled day's data to the DB, then mark the subtask done.
    def onResultOfStockData(self, dto: StockCrawlingDownloadTask, retDto: StockMarketCapitalResult) -> None:
        task = self.tasksRepository.getTask(dto.taskId, dto.taskUniqueId)
        task.state = "insert to database"
        self.tasksRepository.updateTask(task)

        async def completeMarcapTask() -> None:
            await self.stockRepository.insertMarcap(retDto)
            self.tasksRepository.completeStockCrawlingTask(True, retDto, dto)
        # Fire-and-forget insert so the event handler returns immediately.
        asyncio.create_task(completeMarcapTask())

    # Crawler connected to the webdriver.
    def onConnectingWebDriver(self, dto: StockRunCrawling) -> None:
        task = self.tasksRepository.getTask(dto.taskId, dto.taskUniqueId)
        task.state = "connecting webdriver"
        self.tasksRepository.updateTask(task)
        self.logger.info("onConnectingWebDriver", task.taskUniqueId)

    # Crawling started.
    def onStartCrawling(self, dto: StockRunCrawling) -> None:
        task = self.tasksRepository.getTask(dto.taskId, dto.taskUniqueId)
        task.state = "start crawling"
        self.tasksRepository.updateTask(task)
        self.logger.info("onStartCrawling", task.taskUniqueId)

    # A day's download started.
    def onDownloadStart(self, dto: StockCrawlingDownloadTask) -> None:
        # self.logger.info("onDownloadStart: "+dto.json())
        task = self.tasksRepository.getTask(dto.taskId, dto.taskUniqueId)
        task.state = "download start"
        self.tasksRepository.updateTask(task)
        self.logger.info("onDownloadStart", task.taskUniqueId)

    # A day's download finished.
    def onDownloadComplete(self, dto: StockCrawlingDownloadTask) -> None:
        task = self.tasksRepository.getTask(dto.taskId, dto.taskUniqueId)
        task.state = "download complete"
        self.tasksRepository.updateTask(task)
        self.logger.info("onDownloadComplete", task.taskUniqueId)

    # Parsing finished; only failures are finalized here — successes are
    # finalized in onResultOfStockData after the DB insert.
    def onParsingComplete(self, isSuccess: bool, retdto: StockMarketCapitalResult, dto: StockCrawlingDownloadTask) -> None:
        self.logger.info("onParsingComplete")
        self.logger.info(f"taskId:{dto.taskId} taskUniqueId{dto.taskUniqueId}")
        tar = self.tasksRepository.tasksdto.tasks[dto.taskId]["list"]
        self.logger.info(f"taskDTO: {tar}")
        if not isSuccess:
            self.tasksRepository.completeStockCrawlingTask(isSuccess, retdto, dto)

    # Crawl cancelled; state bookkeeping currently disabled.
    def onCancelled(self, dto: StockRunCrawling) -> None:
        self.logger.info("onCancelled")
        # self.tasksRepository.updateAllTask()
        # task = self.tasksRepository.getTask(dto.taskId, dto.taskUniqueId)
        # self.tasksRepository.fail(task, task.restCount)
        # task.state = "cancelled"
        # self.tasksRepository.updateTask(task)
        # self.logger.info("onCancelled", task.taskUniqueId)

    # Crawl errored: fail all remaining subtasks and record the message.
    def onError(self, dto: StockRunCrawling, errorMsg: str) -> None:
        task = self.tasksRepository.getTask(dto.taskId, dto.taskUniqueId)
        self.tasksRepository.fail(task, task.restCount)
        task.state = "error"
        task.errMsg = errorMsg
        self.tasksRepository.updateTask(task)
        self.logger.error("onError", task.taskUniqueId)
def __init__(self) -> None:
    """Initialise the data source with its named logger."""
    super().__init__()
    self.logger = Logger("FactorDartMongoDataSource")
class FactorDartMongoDataSource(MongoDataSource):
    """MongoDB access for DART-sourced factor data and completed-task lists."""

    def __init__(self) -> None:
        super().__init__()
        self.logger = Logger("FactorDartMongoDataSource")

    async def getFactor(self, year: str = "*", month: str = "*", code: str = "*") -> list:
        """Query factor rows; "*" arguments are treated as wildcards by
        ``mergeFindObj`` (presumably omitted from the filter — confirm).

        Returns a list of FactorData, or an empty list on any error.
        """
        try:
            findObj: Dict[str, Any] = {}
            self.mergeFindObj(findObj, "dataYear", year)
            self.mergeFindObj(findObj, "dataMonth", month)
            self.mergeFindObj(findObj, "code", code)
            cursor = self.factorDart.find(findObj)
            fields = [
                "code", "dataMonth", "dataName", "dataYear", "dataId",
                "dataValue", "name"
            ]
            return list(
                map(
                    lambda data: FactorData(
                        **{field: data[field] for field in fields}),
                    list(cursor)))
        except Exception:
            self.logger.error("getFactor", traceback.format_exc())
            return list()

    async def insertFactor(self, li: List[FactorDao]) -> None:
        """Upsert each factor row keyed by (code, year, month, name).

        Sets ``updatedAt`` on every write and ``createdAt`` only on insert.
        Errors are logged and swallowed.
        """
        try:
            if not self.isSetupMarcap():
                self.setupMarcap()
            for one in li:
                data = one.dict()
                data["updatedAt"] = getNow()
                self.factorDart.update_one(
                    {
                        "code": data["code"],
                        "dataYear": data["dataYear"],
                        "dataMonth": data["dataMonth"],
                        "dataName": data["dataName"],
                    }, {
                        "$set": data,
                        "$setOnInsert": {
                            "createdAt": getNow()
                        }
                    }, upsert=True)
        except Exception:
            self.logger.error("insertFactor", traceback.format_exc())

    def getCompletedTask(self, dto: ListLimitData) -> ListLimitResponse:
        """Page through finished (success/fail) tasks, newest first.

        NOTE(review): ``cursor.count()`` is removed in PyMongo 4
        (``count_documents`` replaces it) — confirm the pinned driver.
        Returns ``[]`` on error despite the ListLimitResponse annotation.
        """
        try:
            data = dto.dict()
            cursor = self.task.find({"$or": [
                {"state": "success"},
                {"state": "fail"}
            ]}
            ).sort("createdAt", DESCENDING)\
                .skip(data["offset"])\
                .limit(data["limit"])
            count = self.task.find({
                "$or": [{
                    "state": "success"
                }, {
                    "state": "fail"
                }]
            }).count()
            res = ListLimitResponse(
                **{
                    "count": count,
                    "offset": data["offset"],
                    "limit": data["limit"],
                    "data": self.exceptId(list(cursor))
                })
            return res
        except Exception:
            self.logger.error("getCompletedTask", traceback.format_exc())
            return []
class TaskMongoDataSource(MongoDataSource):
    """MongoDB access for the task collection (history, state, upserts)."""

    def __init__(self) -> None:
        super().__init__()
        self.logger = Logger("TaskMongoDataSource")

    def getCompletedTask(self, dto: ListLimitDao) -> ListLimitDataDao:
        """Page through tasks in any terminal state, newest first.

        NOTE(review): ``cursor.count()`` is removed in PyMongo 4 — confirm
        the pinned driver. Returns ``[]`` on error despite the annotation.
        """
        try:
            data = dto.dict()
            cursor = self.task.find({"$or": [
                {"state": "success"},
                {"state": "fail"},
                {"state": "complete"},
                {"state": "error"},
                {"state": "cancelled"}
            ]}
            ).sort("createdAt", DESCENDING)\
                .skip(data["offset"])\
                .limit(data["limit"])
            count = self.task.find({
                "$or": [{
                    "state": "success"
                }, {
                    "state": "fail"
                }, {
                    "state": "complete"
                }, {
                    "state": "error"
                }, {
                    "state": "cancelled"
                }]
            }).count()
            print("res:start")
            res = ListLimitDataDao(
                **{
                    "taskId": data["taskId"],
                    "count": count,
                    "offset": data["offset"],
                    "limit": data["limit"],
                    "data": self.exceptId(list(cursor))
                })
            return res
        except Exception:
            self.logger.error("getCompletedTask", traceback.format_exc())
            return []

    def getAllTaskState(self, taskId: str, market: str) -> list:
        """Return tasks/tasksRet projections for every run of (taskId, market)."""
        try:
            cursor = self.task.find(
                {
                    "taskId": taskId,
                    "market": market
                    # "$or": [{"state": "success"}, {"state": "fail"}, {"state": "error"}]
                },
                projection=["tasks", "tasksRet"])
            return list(cursor)
        except Exception:
            self.logger.error("getAllTaskState", traceback.format_exc())
            return []

    def upsertTask(self, value: dict) -> None:
        """Upsert a task document keyed by taskUniqueId; stamps updatedAt,
        and createdAt only on first insert. Errors are logged and swallowed."""
        try:
            value["updatedAt"] = getNow()
            self.task.update_one({"taskUniqueId": value["taskUniqueId"]}, {
                "$set": value,
                "$setOnInsert": {
                    "createdAt": getNow()
                }
            }, upsert=True)
        except Exception:
            self.logger.error("upsertTask", traceback.format_exc())
def __init__(self) -> None:
    """Create an idle pool slot with no task attached."""
    super().__init__()
    self.isRun = False
    self.logger = Logger("Pool")
    self.task: Optional[Task] = None
    self.taskId = ""
    # Fix: poolTask was only ever assigned in run(), so cancel() called
    # before run() raised AttributeError. Define it up front.
    self.poolTask: Optional[asyncio.Future] = None
def __init__(self) -> None:
    """Initialise the crawler's emitter, flags and logger."""
    super().__init__()
    self.logger = Logger("DartApiCrawler")
    self.ee = EventEmitter()
    self.isLock = False
    self.isCancelled = False
class DartApiCrawler(object):
    """Crawls yearly financial-statement factor data from the DART open API,
    emitting progress and per-company results over an EventEmitter."""

    def __init__(self) -> None:
        super().__init__()
        self.ee = EventEmitter()
        self.isLock = False
        self.isCancelled = False
        self.logger = Logger("DartApiCrawler")

    def createUUID(self) -> str:
        """Return a fresh UUID string."""
        return str(uuid.uuid4())

    async def downloadCodes(self, isCodeNew: bool, apiKey: str) -> Dict:
        """Download (or reuse) DART's corp-code zip and index it by ticker.

        Args:
            isCodeNew: Force a re-download even if CORPCODE.xml exists.
            apiKey: DART open-API key.

        Returns:
            Mapping of 6-digit stock code -> {"corp_code", "corp_name"}.
        """
        # Paths differ under pytest so tests use a local fixtures directory.
        if "pytest" in sys.modules:
            # savepath = Path('factors/codes.zip')
            loadpath = Path('factors/codes')
            datapath = Path("factors/codes/CORPCODE.xml")
        else:
            # savepath = Path('app/static/factors/codes.zip')
            loadpath = Path('app/static/factors/codes')
            datapath = Path("app/static/factors/codes/CORPCODE.xml")
        if isCodeNew or not os.path.exists(datapath.resolve()):
            # user_agent = UserAgent(cache=False, use_cache_server=True)
            headers = {
                'User-Agent': "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.2 (KHTML, like Gecko) Chrome/22.0.1216.0 Safari/537.2'",
                'accept-language': 'ko'
            }
            params = {"crtfc_key": apiKey}
            url = "https://opendart.fss.or.kr/api/corpCode.xml"
            async with aiohttp.ClientSession() as session:
                async with session.get(url, params=params, headers=headers) as response:
                    data = await response.read()
                    ZipFile(io.BytesIO(data)).extractall(loadpath.resolve())
        tree = ET.parse(datapath.resolve())
        codes: Dict[str, Any] = {}
        for li in tree.findall("list"):
            el = li.find("stock_code")
            if el is not None:
                stockCode = el.text
                # Only listed companies carry a 6-digit stock code.
                if isinstance(stockCode, str) and len(stockCode) == 6:
                    codeEl = li.find("corp_code")
                    nameEl = li.find("corp_name")
                    if codeEl is not None:
                        codes[stockCode] = {}
                        codes[stockCode]["corp_code"] = codeEl.text
                    if nameEl is not None:
                        codes[stockCode]["corp_name"] = nameEl.text
        return codes

    async def crawling(self, dto: DartApiCrawling) -> None:
        """Fetch factor data for every listed company, year by year.

        Clamps startYear to 2015 (earliest supported by this pipeline —
        TODO confirm) and emits a result event per company with data.

        NOTE(review): when ``isCancelled`` is set, a cancel event is emitted
        but the loop is NOT broken — crawling continues; confirm intent.
        """
        # CPU-bound work
        try:
            if dto.startYear < 2015:
                dto.startYear = 2015
            self.ee.emit(EVENT_DART_API_CRAWLING_ON_DOWNLOADING_CODES, dto)
            codes = await asyncRetryNonBlock(5, 1, self.downloadCodes, isCodeNew=dto.isCodeNew, apiKey=dto.apiKey)
            # codes = self.downloadCodes(dto.isCodeNew, dto.apiKey)
            self.ee.emit(EVENT_DART_API_CRAWLING_ON_CRAWLING_FACTOR_DATA, dto)
            for year in range(dto.startYear, dto.endYear + 1):
                self.ee.emit(EVENT_DART_API_CRAWLING_ON_CRAWLING_FACTOR_DATA, dto)
                self.logger.info("crawling", str(len(codes)))
                for code in codes:
                    # newDf = self.getYearDf(dart, code, codes, year)
                    newDf = await asyncRetryNonBlock(5, 1, self.getYearDf, dto.apiKey, code, codes, year)
                    if self.isCancelled:
                        self.ee.emit(EVENT_DART_API_CRAWLING_ON_CANCEL, dto)
                    if newDf is not None:
                        self.logger.info("crawling", code)
                        self.ee.emit(
                            EVENT_DART_API_CRAWLING_ON_RESULT_OF_FACTOR,
                            dto, year, newDf.to_dict("records"))
                    # yearDf = await self.getYearDf(dart, code, codes, year, yearDf)
                self.ee.emit(EVENT_DART_API_CRAWLING_ON_COMPLETE_YEAR, dto, year)
                self.logger.info("crawling", str(year))
        except Exception as e:
            raise e

    async def getYearDf(self, apiKey: str, code: str, codes: Dict, year: int) -> pd.DataFrame:
        """Fetch one company's annual report figures as a DataFrame.

        Returns None when the API response has no "list" payload; otherwise
        the normalized frame annotated with crawling_year/code/name columns.
        Network/parse errors are logged and re-raised (the caller retries).
        """
        self.logger.info("getYearDf", f"crawling: {code}")
        df = None
        try:
            url = 'https://opendart.fss.or.kr/api/fnlttSinglAcntAll.json'
            # user_agent = UserAgent(cache=False, use_cache_server=True)
            headers = {
                'User-Agent': "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.2 (KHTML, like Gecko) Chrome/22.0.1216.0 Safari/537.2'",
                'accept-language': 'ko',
            }
            params = {
                'crtfc_key': apiKey,
                'corp_code': codes[code]["corp_code"],
                'bsns_year': year,  # business year
                'reprt_code': "11011",  # "11011": annual report
                'fs_div': "CFS",  # "CFS": consolidated, "OFS": standalone statements
            }
            connector = aiohttp.TCPConnector(limit=50, force_close=True)
            async with aiohttp.ClientSession(connector=connector) as session:
                timeout = aiohttp.ClientTimeout(total=15)
                # async with session.get(url, params=params, headers=headers) as response:
                async with session.get(url, params=params, timeout=timeout, headers=headers) as response:
                    data = await response.json()
                    if 'list' not in data:
                        return None
                    df = pd.json_normalize(data, 'list')
                    # df = dart.finstate_all(code, year)
                    # df = await asyncio.create_task(dart.finstate_all(code, year))
                    # df = await loop.run_in_executor(self.pool, dart.finstate_all, code, year)
        except Exception as e:
            self.logger.error("getYearDf", traceback.format_exc())
            raise e
        self.logger.info("df", str(df))
        if df is not None:
            df["crawling_year"] = year
            df["crawling_code"] = code
            df["crawling_name"] = codes[code]["corp_name"]
            name = codes[code]["corp_name"]
            self.logger.info("getYearDf", f"{str(year)} {str(code)} {str(name)}")
            return df
        # allCodeDf = pd.concat([allCodeDf, df])
        # return allCodeDf
        return None
class StockMongoDataSource(MongoDataSource):
    """Mongo data source for market-capitalization ("marcap") documents plus
    the task-document queries used by the task list screens."""

    def __init__(self) -> None:
        super().__init__()
        self.logger = Logger("StockMongoDataSource")

    async def insertMarcap(self, li: List[StockMarketCapital]) -> None:
        """Upsert a batch of marcap documents, one fire-and-forget asyncio
        task per document, so the caller is not blocked per write.

        NOTE(review): the created tasks are neither stored nor awaited —
        failures surface only via insertMarpcapOne's own logging, and writes
        may still be pending when this coroutine returns. Confirm acceptable.
        """
        try:
            if not self.isSetupMarcap():
                self.setupMarcap()
            for one in li:
                asyncio.create_task(self.insertMarpcapOne(one))
        except Exception:
            self.logger.error("insertMarcap", traceback.format_exc())

    async def insertMarpcapOne(self, one: StockMarketCapital) -> None:
        """Upsert a single marcap document keyed by (code, date, market)."""
        try:
            data = one.dict()
            data["updatedAt"] = getNow()
            self.marcap.update_one({
                "code": data["code"],
                "date": data["date"],
                "market": data["market"]
            }, {
                "$set": data,
                # createdAt is stamped only on first insert.
                "$setOnInsert": {"createdAt": getNow()}
            }, upsert=True)
        except Exception:
            self.logger.error("insertMarpcapOne", traceback.format_exc())

    async def getMarcap(self, market: str, startDate: str, endDate: str) -> List[StockMarketCapital]:
        """Return marcap rows for one market with startDate <= date <= endDate.

        Dates are compared lexicographically as strings, so they must share
        the stored date format. Returns an empty list on any error.
        """
        try:
            if not self.isSetupMarcap():
                self.setupMarcap()
            cursor = self.marcap.find({"$and": [{"date": {"$gte": startDate, "$lte": endDate}}, {"market": market}]})
            return list(map(lambda data: StockMarketCapital(**{
                "date": data["date"],
                "market": data["market"],
                "code": data["code"],
                "name": data["name"],
                "close": data["close"],
                "diff": data["diff"],
                "percent": data["percent"],
                "open": data["open"],
                "high": data["high"],
                "low": data["low"],
                "volume": data["volume"],
                "price": data["price"],
                "marcap": data["marcap"],
                "number": data["number"]
            }), list(cursor)))
        except Exception:
            self.logger.error("getMarcap", traceback.format_exc())
            return list()

    def getCompletedTask(self, dto: ListLimitData) -> ListLimitResponse:
        """Return one page of finished tasks (complete/error/cancelled),
        newest first, plus the total count for pagination."""
        try:
            data = dto.dict()
            cursor = self.task.find({"$or": [
                {"state": "complete"},
                {"state": "error"},
                {"state": "cancelled"}
            ]}
            ).sort("createdAt", DESCENDING)\
                .skip(data["offset"])\
                .limit(data["limit"])
            # NOTE(review): Cursor.count() was removed in PyMongo 4.x; this
            # call requires PyMongo 3.x (count_documents is the modern API).
            count = self.task.find({"$or": [
                {"state": "complete"},
                {"state": "error"},
                {"state": "cancelled"}
            ]}
            ).count()
            res = ListLimitResponse(**{
                "count": count,
                "offset": data["offset"],
                "limit": data["limit"],
                "data": self.exceptId(list(cursor))
            })
            return res
        except Exception:
            self.logger.error("getCompletedTask", traceback.format_exc())
            # NOTE(review): returns a list here although the annotation says
            # ListLimitResponse — callers must tolerate both shapes.
            return []

    def getAllTaskState(self, taskId: str, market: str) -> list:
        """Return the tasks/tasksRet projections of every task document
        matching (taskId, market); empty list on error."""
        try:
            cursor = self.task.find({
                "taskId": taskId,
                "market": market
                # "$or": [{"state": "success"}, {"state": "fail"}, {"state": "error"}]
            }, projection=["tasks", "tasksRet"])
            return list(cursor)
        except Exception:
            self.logger.error("getAllTaskState", traceback.format_exc())
            return []

    def upsertTask(self, value: dict) -> None:
        """Upsert one task document keyed by taskUniqueId, stamping updatedAt
        on every write and createdAt on first insert."""
        try:
            value["updatedAt"] = getNow()
            self.task.update_one({
                "taskUniqueId": value["taskUniqueId"]
            }, {
                "$set": value,
                "$setOnInsert": {"createdAt": getNow()}
            }, upsert=True)
        except Exception:
            self.logger.error("upsertTask", traceback.format_exc())
class TaskService:
    """Coordinates task creation, scheduling, cancellation, and state updates,
    mirroring every change to clients over websockets."""

    def __init__(
        self,
        manager: ConnectionManager,
        tasksRepository: TasksRepository,
        taskScheduler: TaskScheduler,
        factorService: FactorService,
        stockService: StockService,
        crawlerRepository: CrawlerRepository
    ) -> None:
        self.tasksRepository = tasksRepository
        self.crawlerRepository = crawlerRepository
        self.manager = manager
        self.taskScheduler = taskScheduler
        self.factorService = factorService
        self.stockService = stockService
        self.logger = Logger("TaskService")
        # Share the repository's emitter so repository-side changes reach
        # websocket clients through the handlers registered below.
        self.ee = self.tasksRepository.taskEventEmitter
        self.setupEvents()

    def setupEvents(self) -> None:
        """Subscribe to repository events and fan them out to clients."""
        self.ee.on(EVENT_TASK_REPO_UPDATE_TASKS, self.fetchTasks)
        self.ee.on(EVENT_TASK_REPO_TASK_COMPLETE, self.updateTaskState)
        self.ee.on(EVENT_TASK_REPO_UPDATE_POOL_INFO, self.updateTaskPoolInfo)

    def getTaskSchedule(self, webSocket: WebSocket, isBroadCast: bool = False) -> None:
        """Send the current schedule list to one socket, or to all connected
        sockets when isBroadCast is True."""
        jobs = self.taskScheduler.getJobs()
        stockTaskScheduleList = StockTaskScheduleList(**{"list": []})
        for i in range(len(jobs)):
            # trigger.fields is indexed positionally; index 3 is skipped —
            # presumably the cron "week" field (APScheduler order). Confirm.
            fields = jobs[i].trigger.fields
            id = jobs[i].id
            self.logger.info(f"jobargs: {str(jobs[i].args[0])}")
            stockTaskScheduleList.list.append(StockTaskScheduleInfo(**{
                "id": id,
                "year": str(fields[0]),
                "month": str(fields[1]),
                "day": str(fields[2]),
                "dayOfWeek": str(fields[4]),
                "hour": str(fields[5]),
                "minute": str(fields[6]),
                "second": str(fields[7]),
                "taskList": list(jobs[i].args[0])
            }))
        if isBroadCast:
            self.manager.sendBroadCast(RES_SOCKET_TASK_FETCH_TASK_SCHEDULE, stockTaskScheduleList.dict())
        else:
            self.manager.send(RES_SOCKET_TASK_FETCH_TASK_SCHEDULE, stockTaskScheduleList.dict(), webSocket)

    @staticmethod
    def marcapJob(marcapDtos: List[StockRunCrawling]) -> None:
        """Scheduler entry point: refresh each dto's unique id (and its date
        range when isNow is set), then start the marcap crawl.

        Resolved via the service locator because scheduled jobs run without a
        reference to this service instance.
        """
        service: StockService = Locator.getInstance().get(StockService)
        logger = Logger("TaskService_marcapJob")
        for dto in marcapDtos:
            logger.info("#### schedule job start ####")
            logger.info("command" + dto.startDateStr + "~" + dto.endDateStr)
            # New unique id per scheduled run of the same job definition.
            dto.taskUniqueId = dto.taskId + dto.market+dto.startDateStr + dto.endDateStr + str(uuid.uuid4())
            if dto.isNow:
                # "run now" jobs always crawl today's data.
                dto.startDateStr = getNowDateStr()
                dto.endDateStr = getNowDateStr()
            logger.info("real:" + dto.startDateStr + "~" + dto.endDateStr)
        service.crawlingMarcapStockData(marcapDtos)

    def addTaskSchedule(self, scheduleDto: StockTaskSchedule, runCrawlingDto: List[StockRunCrawling], webSocket: WebSocket) -> None:
        """Register a scheduled marcap-crawl job and broadcast the updated
        schedule list to all clients."""
        marcapDtos = []
        for dto in runCrawlingDto:
            # Only marcap tasks are schedulable at the moment.
            if dto.taskId == "marcap":
                marcapDtos.append(dto)
        # NOTE(review): dayOfWeek is passed before day here — confirm this
        # matches TaskScheduler.addJob's parameter order.
        self.taskScheduler.addJob(
            self.marcapJob,
            scheduleDto.year,
            scheduleDto.month,
            scheduleDto.dayOfWeek,
            scheduleDto.day,
            scheduleDto.hour,
            scheduleDto.minute,
            scheduleDto.second,
            "marcap",
            args=[marcapDtos])
        self.getTaskSchedule(webSocket, True)

    def removeTaskSchedule(self, id: str, webSocket: WebSocket) -> None:
        """Remove a scheduled job by id and broadcast the updated schedule."""
        self.taskScheduler.removeJob(id)
        self.getTaskSchedule(webSocket, True)

    def fetchTasks(self, data: ProcessTasks = None, websocket: WebSocket = None) -> None:
        """Send the task list; uses the repository's cached dto when no data
        is supplied, and broadcasts when no socket is supplied."""
        if data is None:
            data = self.tasksRepository.tasksdto
        self.logger.info("fetchTasks", data.json())
        if websocket is None:
            self.manager.sendBroadCast(RES_SOCKET_TASK_FETCH_TASKS, data.dict())
        else:
            self.manager.send(RES_SOCKET_TASK_FETCH_TASKS, data.dict(), websocket)

    def getTaskState(self, taskId: str, webSocket: WebSocket) -> None:
        """Send the aggregated state for one taskId to the requesting socket."""
        data: YearData = self.tasksRepository.getAllTaskState(taskId)
        self.manager.send(RES_SOCKET_TASK_FETCH_TASK_STATE, data.dict(), webSocket)

    def updateTaskState(self, taskId: str, stockUpdateState: StockUpdateState = None) -> None:
        """Broadcast a single-task state change, then refresh the full list."""
        if stockUpdateState is not None:
            self.manager.sendBroadCast(RES_SOCKET_TASK_UPDATE_TASK_STATE, stockUpdateState.dict())
        self.fetchTasks()

    def getTaskPoolInfo(self, webSocket: WebSocket) -> None:
        """Send the worker-pool snapshot to the requesting socket."""
        taskPoolInfo: TaskPoolInfo = self.tasksRepository.getPoolInfo()
        self.manager.send(RES_SOCKET_TASK_FETCH_TASK_POOL_INFO, taskPoolInfo.dict(), webSocket)

    def updateTaskPoolInfo(self, poolInfo: TaskPoolInfo) -> None:
        """Broadcast a worker-pool snapshot to every connected client."""
        # logger.info(f"updateTaskPoolInfo:{poolInfo.json()}")
        self.manager.sendBroadCast(RES_SOCKET_TASK_FETCH_TASK_POOL_INFO, poolInfo.dict())

    def addTask(self, taskName: str, dto: Any) -> None:
        """Build task DTO(s) from a raw dict (websocket payload) or accept a
        pre-built DTO, then dispatch to the matching service method."""
        if isinstance(dto, dict):
            if taskName == "crawlingMarcapStockData":
                # One crawl task per requested market.
                data = []
                for market in dto["market"]:
                    taskUniqueId = dto["taskId"]+market+dto["startDate"]+dto["endDate"]+str(uuid.uuid4())
                    dtoOne = StockRunCrawling(**{
                        # NOTE(review): "carwling" looks like a typo, but it must
                        # match the actual webdriver hostname — verify before
                        # renaming.
                        "driverAddr": "http://fin-carwling-webdriver:4444",
                        "market": market,
                        "startDateStr": dto["startDate"],
                        "endDateStr": dto["endDate"],
                        "taskId": dto["taskId"],
                        "taskUniqueId": taskUniqueId
                    })
                    data.append(dtoOne)
            elif taskName == "convertFactorFileToDb":
                data = RunFactorFileConvert(**{
                    "taskId": dto["taskId"],
                    "taskUniqueId": dto["taskId"] + str(uuid.uuid4())
                })
            elif taskName == "crawlingFactorDartData":
                data = DartApiCrawling(**{
                    "apiKey": dto["apiKey"],
                    "isCodeNew": dto["isCodeNew"],
                    "startYear": dto["startYear"],
                    "endYear": dto["endYear"],
                    "taskId": dto["taskId"],
                    # NOTE(review): string concatenation assumes startYear and
                    # endYear arrive as strings in the payload — confirm.
                    "taskUniqueId": dto["taskId"] + dto["startYear"] + dto["endYear"] + str(uuid.uuid4())
                })
        else:
            data = dto
        if taskName == "convertFactorFileToDb":
            self.factorService.convertFactorFileToDb(data)
        elif taskName == "crawlingMarcapStockData":
            self.stockService.crawlingMarcapStockData(data)
        elif taskName == "crawlingFactorDartData":
            self.factorService.crawlingFactorDartData(data)

    def cancelTask(self, taskId: str, taskUniqueId: str) -> None:
        """Cancel a running or queued task and reconcile its stored state."""
        if taskUniqueId in self.crawlerRepository.getCrawlers():
            # Ask the crawler itself to stop at its next cancel checkpoint.
            self.crawlerRepository.getCrawler(taskUniqueId).isCancelled = True
        self.tasksRepository.taskRunner.cancel(taskUniqueId)
        task = self.tasksRepository.getTask(taskId, taskUniqueId)
        if task is not None:
            if task.state == "cancel":
                # A second cancel on an already-cancelled task removes it.
                self.tasksRepository.deleteTask(task)
                self.tasksRepository.updateAllTask()
            elif task.state == "error":
                self.tasksRepository.deleteTask(task)
                self.tasksRepository.updateAllTask()
            else:
                task.state = "cancel"
                self.tasksRepository.updateTask(task)
        else:
            self.tasksRepository.updateAllTask()

    def fetchCompletedTask(self, dto: ListLimitData, webSocket: WebSocket) -> None:
        """Send one page of finished tasks to the requesting socket."""
        dao = ListLimitDao(**{
            "limit": dto.limit,
            "offset": dto.offset,
            "taskId": dto.taskId
        })
        tasks = self.tasksRepository.getCompletedTask(dao)
        self.manager.send(RES_SOCKET_TASK_FETCH_COMPLETED_TASK, tasks.dict(), webSocket)
    def __init__(self) -> None:
        """Initialize the Mongo base class and a class-scoped logger."""
        super().__init__()
        self.logger = Logger("TaskMongoDataSource")
    def __init__(self) -> None:
        """Set up the crawler's own event emitter and a class-scoped logger."""
        super().__init__()
        # Listeners subscribe to this emitter to receive crawl progress events.
        self.ee = EventEmitter()
        self.logger = Logger("MarcapCrawler")
class FactorService:
    """Application service around factor data: lookups, DART open-API
    crawling, and converting factor files into the database."""

    def __init__(self, manager: ConnectionManager, factorRepository: FactorRepository, tasksRepository: TasksRepository, crawlerRepository: CrawlerRepository, taskService: 'TaskService') -> None:
        self.manager = manager
        self.factorRepository = factorRepository
        self.tasksRepository = tasksRepository
        self.crawlerRepository = crawlerRepository
        self.taskService = taskService
        self.logger = Logger("FactorService")

    async def getFactor(self, code: str, year: str, month: str, source: str) -> List[FactorData]:
        """Delegate a factor lookup to the repository."""
        return await self.factorRepository.getFactor(code, year, month, source)

    def crawlingFactorDartData(self, dto: DartApiCrawling) -> None:
        """Register and run a background task that crawls factor data from the
        DART open API for the years dto.startYear..dto.endYear."""
        async def crawlingFactorDartDataTask(pool: Pool, taskPool: TaskPool) -> None:
            # task = self.tasksRepository.getTask(dto.taskId, dto.taskUniqueId)
            try:
                crawler = DartApiCrawler()
                # Track the crawler so cancelTask can reach it by unique id.
                self.crawlerRepository.addCrawler(dto.taskUniqueId, crawler)
                self.createFactorDartListener(crawler.ee)
                await crawler.crawling(dto)
                self.crawlerRepository.removeCrawler(dto.taskUniqueId)
            except asyncio.CancelledError:
                self.logger.info("crawlingFactorDartDataTask", "cancel")
            except Exception:
                self.logger.error("crawlingFactorDartDataTask", f"error: {traceback.format_exc()}")
                self.tasksRepository.errorTask(dto, traceback.format_exc())
            finally:
                # Always free the worker slot, even on cancel/error.
                taskPool.removeTaskPool(pool)

        count = dto.endYear - dto.startYear + 1
        task = ProcessTask(
            **{
                "market": "",
                "startDateStr": dto.startYear,
                "endDateStr": dto.endYear,
                "taskUniqueId": dto.taskUniqueId,
                "taskId": dto.taskId,
                "count": count,
                "tasks": list(range(dto.startYear, dto.endYear + 1)),  # one entry per year
                "restCount": count,
                "tasksRet": [0] * count,
                "state": "find worker"
            })
        self.tasksRepository.addTask(task)
        workerTask = Task(dto.taskUniqueId, crawlingFactorDartDataTask)
        self.tasksRepository.runTask(workerTask)

    # Save the factors stored in files into the database.
    def convertFactorFileToDb(self, dto: RunFactorFileConvert) -> None:
        """Run a background task that loads factors from file storage and
        bulk-inserts them into the database, updating task state as it goes."""
        self.logger.info("convertFactorFileToDb")

        async def convertFactorFileToDbTask(pool: Pool, taskPool: TaskPool) -> None:
            try:
                task = self.tasksRepository.getTask(dto.taskId, dto.taskUniqueId)
                data = await asyncio.create_task(
                    self.factorRepository.getFactorsInFile())
                task.state = "make Factor Object"
                self.tasksRepository.updateTask(task)
                # Convert raw rows to DAOs in batches of 100 to keep the
                # event loop responsive.
                daoList = await batchFunction(100, data, self.makeFactorDaoList)
                task.state = "start insert db"
                self.tasksRepository.updateTask(task)
                self.logger.info("convertFactorFileToDbTask", f"insertCount: {str(len(daoList))}")
                await self.factorRepository.insertFactor(daoList)
                task.state = "complete"
                self.tasksRepository.completeFactorConvertFileToDbTask(task)
            except asyncio.CancelledError:
                self.logger.info("convertFactorFileToDbTask", "cancel")
            except Exception:
                self.logger.error("convertFactorFileToDbTask", f"error: {traceback.format_exc()}")
                self.tasksRepository.errorTask(dto, traceback.format_exc())
            finally:
                taskPool.removeTaskPool(pool)

        task = ProcessTask(
            **{
                "market": "",
                "startDateStr": "20070101",  # NOTE(review): hard-coded range label — confirm it matches the file data
                "endDateStr": "20191231",
                "taskUniqueId": dto.taskUniqueId,
                "taskId": dto.taskId,
                "count": 1,
                "tasks": ["convert"],
                "restCount": 1,
                "tasksRet": [0],
                "state": "start get file"
            })
        self.tasksRepository.addTask(task)
        workerTask = Task(dto.taskUniqueId, convertFactorFileToDbTask)
        self.tasksRepository.runTask(workerTask)

    async def makeFactorDaoList(self, data: List[Dict]) -> List[FactorDao]:
        """Map raw file rows (Korean column names) to FactorDao objects,
        normalizing values given in thousands of KRW to KRW."""
        daoList = []
        for one in data:
            dao = FactorDao(
                **{
                    "code": one["종목코드"],  # stock code
                    "name": one["종목명"],  # stock name
                    "dataYear": one["년"],  # settlement year
                    "dataMonth": one["결산월"],  # settlement month
                    "dataName": one["데이터명"],  # data (factor) name
                    "dataValue": (
                        one["데이터값"] * 1000) if one["단위"] == "천원" else one["데이터값"]  # value; "천원" = unit of 1,000 KRW
                })
            daoList.append(dao)
        return daoList

    def createFactorDartListener(self, ee: EventEmitter) -> None:
        """Wire crawler progress events to the task-state handlers below."""
        ee.on(EVENT_DART_API_CRAWLING_ON_DOWNLOADING_CODES, self.onDownloadingCodes)
        ee.on(EVENT_DART_API_CRAWLING_ON_CRAWLING_FACTOR_DATA, self.onCrawlingFactorData)
        ee.on(EVENT_DART_API_CRAWLING_ON_COMPLETE_YEAR, self.onCompleteYear)
        ee.on(EVENT_DART_API_CRAWLING_ON_RESULT_OF_FACTOR, self.onResultOfFactor)
        ee.on(EVENT_DART_API_CRAWLING_ON_CANCEL, self.onCancelled)

    def onDownloadingCodes(self, dto: DartApiCrawling) -> None:
        """Crawler started downloading the corp-code table."""
        self.logger.info("onDownloadingCodes", dto.taskUniqueId)
        task = self.tasksRepository.getTask(dto.taskId, dto.taskUniqueId)
        task.state = "download Codes"
        self.tasksRepository.updateTask(task)

    def onCrawlingFactorData(self, dto: DartApiCrawling) -> None:
        """Crawler moved on to fetching factor data."""
        self.logger.info("onCrawlingFactorData", dto.taskUniqueId)
        task = self.tasksRepository.getTask(dto.taskId, dto.taskUniqueId)
        task.state = "crawling factor data"
        self.tasksRepository.updateTask(task)

    def onCompleteYear(self, dto: DartApiCrawling, year: int) -> None:
        """One business year finished; record its progress on the task."""
        self.logger.info("onCompleteYear", dto.taskUniqueId)
        task = self.tasksRepository.getTask(dto.taskId, dto.taskUniqueId)
        self.tasksRepository.completeFactorDart(task, year)

    def onResultOfFactor(self, dto: DartApiCrawling, year: int, obj: List) -> None:
        """Persist one year's crawled records as FactorDao rows."""
        self.logger.info("onResultOfFactor", dto.taskUniqueId)
        listOfFactorDao = list(
            map(
                lambda one: FactorDao(
                    **{
                        "code": one["crawling_code"],
                        "name": one["crawling_name"],
                        "dataYear": one["bsns_year"],
                        "dataMonth": getMonthFromReprtCode(one["reprt_code"]),
                        "dataName": one["account_nm"],
                        "dataValue": one["thstrm_amount"],
                        "dataId": one["account_id"]
                    }), obj))
        # NOTE(review): fire-and-forget insert; failures surface only through
        # the repository's own error handling.
        asyncio.create_task(
            self.factorRepository.insertFactorDart(listOfFactorDao))

    def onCancelled(self, dto: DartApiCrawling) -> None:
        """Crawler acknowledged a cancel request."""
        self.logger.info("onCancelled")
from typing import Dict
from app.module.logger import Logger
from pymongo import ASCENDING, MongoClient, monitoring
from pymongo.collection import Collection
from pymongo.database import Database
from pymongo.monitoring import (CommandFailedEvent, CommandStartedEvent,
                                CommandSucceededEvent)
from dotenv import dotenv_values

# Module-scoped logger and .env configuration shared by the data sources.
log = Logger("MongoDataSource", "mongoDb")
config = dotenv_values('.env')


class CommandLogger(monitoring.CommandListener):
    """PyMongo command listener hooked into driver monitoring.

    All logging calls are currently disabled (pass) but kept commented out
    so they can be re-enabled quickly while debugging query traffic.
    """

    def started(self, event: CommandStartedEvent) -> None:
        # Called when a command is sent to the server.
        pass
        # log.info("started", "Command {0.command_name} with request id ""{0.request_id} started on server ""{0.connection_id}".format(event))

    def succeeded(self, event: CommandSucceededEvent) -> None:
        # Called when a command completes successfully.
        pass
        # log.info("succeeded", "Command {0.command_name} with request id ""{0.request_id} on server {0.connection_id} ""succeeded in {0.duration_micros} ""microseconds".format(event))

    def failed(self, event: CommandFailedEvent) -> None:
        # Called when a command fails on the server.
        pass
        # log.info("failed", "Command {0.command_name} with request id ""{0.request_id} on server {0.connection_id} ""failed in {0.duration_micros} ""microseconds".format(event))


# Register globally so every MongoClient created afterwards is monitored.
monitoring.register(CommandLogger())
class TaskRunner(object):
    """Runs queued Tasks on a bounded TaskPool using the current asyncio loop.

    Tasks enter through put(); whenever a task is enqueued or a pool slot is
    released, notifyToPool() tries to move one queued task into a free slot.
    """

    def __init__(self) -> None:
        super().__init__()
        self.logger = Logger("TaskRunner")
        self.queue: asyncio.Queue = asyncio.Queue()
        # Must be constructed inside a running event loop.
        self.loop = asyncio.get_running_loop()
        # The pool calls back when a slot frees up so we can drain the queue.
        self.pool = TaskPool(notifyCallback=self.notifyRmOnPool)
        # Optional observer for pool-state snapshots (assigned externally).
        self.notifyCallback = None
        # self.loop: asyncio.AbstractEventLoop = asyncio.new_event_loop()

    def getPoolInfo(self) -> TaskPoolInfo:
        """Snapshot of pool capacity/usage and current queue depth."""
        return TaskPoolInfo(
            **{
                "poolSize": self.pool.poolSize,
                "poolCount": self.pool.poolCount(),
                "runCount": self.pool.runCount(),
                "queueCount": self.queue.qsize()
            })

    def updatePoolInfo(self) -> None:
        """Log the current pool state and push it to the observer, if set."""
        self.logger.info(
            "updatePoolInfo",
            f"runCount:{self.pool.runCount()}, queueCount:{self.queue.qsize()}"
        )
        if self.notifyCallback:
            self.notifyCallback(
                TaskPoolInfo(
                    **{
                        "poolSize": self.pool.poolSize,
                        "poolCount": self.pool.poolCount(),
                        "runCount": self.pool.runCount(),
                        "queueCount": self.queue.qsize()
                    }))

    def notifyPutOnQueue(self) -> None:
        # A task was enqueued: try to dispatch it to the pool.
        self.loop.create_task(self.notifyToPool())

    def notifyRmOnPool(self) -> None:
        # A pool slot was freed: dispatch the next queued task, or just
        # publish the new pool state when the queue is empty.
        if self.queue.qsize() > 0:
            self.loop.create_task(self.notifyToPool())
        else:
            self.updatePoolInfo()

    def cancel(self, id: str) -> None:
        """Cancel the running task with the given id, if it occupies a slot."""
        pool: Optional[Pool] = self.pool.findPool(id)
        if pool is not None:
            self.logger.info("cancel", id)
            pool.cancel()
            self.pool.removeTaskPool(pool)
        else:
            self.logger.info("cancel", "pool is not exist")

    def isExist(self, id: str) -> bool:
        """True when a task with this id currently occupies a pool slot."""
        return self.pool.findPool(id) is not None

    async def notifyToPool(self) -> None:
        """Move one queued task into a free pool slot, if both exist."""
        try:
            if self.queue.qsize() > 0 and (self.pool.poolSize - self.pool.poolCount()) > 0:
                # Reserve the slot first, then pull a task from the queue.
                pool = self.pool.addTaskPool(Pool(), False)
                # With a timeout, get() behaves non-blockingly.
                task: Task = await asyncio.wait_for(self.queue.get(), timeout=1)
                if task:
                    pool.setTask(task)
                    pool.run(self.pool)
                else:
                    # Nothing to run: give the reserved slot back.
                    self.pool.removeTaskPool(pool, False)
            # if self.pool.poolSize > self.queue.qsize() and self.pool.poolCount() >= self.queue.qsize():
            #     print("exit")
            # elif self.pool.poolSize > self.pool.poolCount() and self.queue.qsize() > 0:
            #     pool = self.pool.addTaskPool(Pool(), False)
            #     print(f"before qsize:{self.queue.qsize()}")
            #     task: Task = await asyncio.wait_for(self.queue.get(), timeout=1)
            #     print(f"after qsize:{self.queue.qsize()}")
            #     if task:
            #         pool.setTask(task)
            #         pool.run(self.pool)
            #     else:
            #         self.pool.removeTaskPool(pool, False)
        except asyncio.TimeoutError as e:
            # Queue emptied between the qsize() check and get(): release the
            # reserved slot again.
            self.logger.info("notifyToPool", f"timeout:{str(e)}")
            self.pool.removeTaskPool(pool, False)
        finally:
            self.updatePoolInfo()

    def put(self, task: Task) -> None:
        """Enqueue a task, binding the runner's loop to it first so the task
        can schedule its own coroutine later."""
        task.loop = self.loop
        self.loop.create_task(self._put(task))

    async def _put(self, task: Task) -> None:
        # Internal: actually enqueue, then trigger dispatch.
        self.logger.info("_put", "task put")
        await self.queue.put(task)
        self.notifyPutOnQueue()