Exemple #1
0
class StockService:
    def __init__(self, stockRepository: StockRepository,
                 tasksRepository: TasksRepository,
                 crawlerRepository: CrawlerRepository) -> None:
        """Keep the injected repositories and create the service logger.

        Args:
            stockRepository: Used to read stock data and to insert crawled
                market-cap results.
            tasksRepository: Used to register, run and update tasks.
            crawlerRepository: Receives each crawler under its task's
                unique id while the crawl runs.
        """
        self.logger = Logger("StockService")
        self.crawlerRepository = crawlerRepository
        self.tasksRepository = tasksRepository
        self.stockRepository = stockRepository

    async def getStockData(self, market: str, startDate: str,
                           endDate: str) -> List[StockMarketCapital]:
        return await self.stockRepository.getStockData(market, startDate,
                                                       endDate)

    def crawlingMarcapStockData(self, dtoList: List[StockRunCrawling]) -> None:
        """Register and start a crawling task for every "marcap" request.

        Requests whose ``taskId`` is not "marcap" are ignored, and nothing is
        started while ``tasksRepository.taskRunner`` is falsy. A request whose
        task is already registered is skipped without affecting the remaining
        requests in ``dtoList``.

        Args:
            dtoList: Crawling requests; each supplies the market, the
                ``%Y%m%d`` start/end date strings and the task identifiers.
        """
        self.logger.info("crawlingMarcapStockData", str(len(dtoList)))

        async def marcapTaskWorker(runDto: StockRunCrawling, pool: Pool,
                                   taskPool: TaskPool) -> None:
            # Runs one crawl. The crawler is registered under the task's
            # unique id before crawling and removed again after it succeeds.
            try:
                self.logger.info("runCrawling&marcapTaskWorker", "start")
                marcapCrawler = MarcapCrawler()
                taskUniqueId = runDto.taskUniqueId
                self.crawlerRepository.addCrawler(taskUniqueId, marcapCrawler)
                self.createListners(marcapCrawler.ee)
                self.logger.info("runCrawling&marcapTaskWorker",
                                 f"taskWorker:{taskUniqueId}")
                await marcapCrawler.crawling(runDto)
                self.crawlerRepository.removeCrawler(taskUniqueId)
            except asyncio.CancelledError:
                self.logger.info("marcapTaskWorker", "cancel")
            except Exception:
                errorTrace = traceback.format_exc()
                self.logger.error("marcapTaskWorker", f"error: {errorTrace}")
                self.tasksRepository.errorTask(runDto, errorTrace)
            finally:
                # Release the pool entry on every exit path (success, cancel
                # and error), not only after a successful crawl.
                taskPool.removeTaskPool(pool)

        for dto in dtoList:
            if dto.taskId != "marcap":
                continue
            if not self.tasksRepository.taskRunner:
                continue
            if self.tasksRepository.isExistTask(dto.taskId, dto.taskUniqueId):
                # Already registered: skip this request only. Returning here
                # would silently drop every later request in the list.
                continue

            startDate = datetime.strptime(dto.startDateStr, "%Y%m%d")
            endDate = datetime.strptime(dto.endDateStr, "%Y%m%d")
            # One entry per calendar day, both ends inclusive.
            taskDates = [
                (startDate + timedelta(days=x)).strftime("%Y%m%d")
                for x in range((endDate - startDate).days + 1)
            ]
            task = ProcessTask(
                market=dto.market,
                startDateStr=dto.startDateStr,
                endDateStr=dto.endDateStr,
                taskUniqueId=dto.taskUniqueId,
                taskId=dto.taskId,
                count=len(taskDates),
                tasks=deque(taskDates),
                restCount=len(taskDates),
                tasksRet=deque([0] * len(taskDates)),
            )
            task.state = "find worker"
            self.tasksRepository.addTask(task)
            workerTask = Task(dto.taskUniqueId, marcapTaskWorker,
                              {"runDto": dto})
            self.tasksRepository.runTask(workerTask)
            self.logger.info("runMarcapTask", f"runTask {task.json()}")

    def createListners(self, ee: EventEmitter) -> None:
        """Subscribe this service's handlers to a crawler's event emitter.

        Args:
            ee: Emitter of the crawler whose events should be handled.
        """
        subscriptions = (
            (EVENT_MARCAP_CRAWLING_ON_RESULT_OF_STOCK_DATA,
             self.onResultOfStockData),
            (EVENT_MARCAP_CRAWLING_ON_CONNECTING_WEBDRIVER,
             self.onConnectingWebDriver),
            (EVENT_MARCAP_CRAWLING_ON_START_CRAWLING, self.onStartCrawling),
            (EVENT_MARCAP_CRAWLING_ON_DOWNLOAD_START, self.onDownloadStart),
            (EVENT_MARCAP_CRAWLING_ON_DOWNLOAD_COMPLETE,
             self.onDownloadComplete),
            (EVENT_MARCAP_CRAWLING_ON_PARSING_COMPLETE,
             self.onParsingComplete),
            (EVENT_MARCAP_CRAWLING_ON_ERROR, self.onError),
            (EVENT_MARCAP_CRAWLING_ON_CANCEL, self.onCancelled),
        )
        # Registration order is kept identical to the listing above.
        for eventName, handler in subscriptions:
            ee.on(eventName, handler)

    # Save the crawled stock data result to the database.
    def onResultOfStockData(self, dto: StockCrawlingDownloadTask,
                            retDto: StockMarketCapitalResult) -> None:
        """Store a crawled market-cap result and then complete its task step.

        The task state is set to "insert to database" synchronously. The
        insert and the completion call run in a background asyncio task.

        Args:
            dto: Download task the result belongs to.
            retDto: Parsed result to insert.
        """
        async def completeMarcapTask() -> None:
            await self.stockRepository.insertMarcap(retDto)
            self.tasksRepository.completeStockCrawlingTask(True, retDto, dto)

        repository = self.tasksRepository
        currentTask = repository.getTask(dto.taskId, dto.taskUniqueId)
        currentTask.state = "insert to database"
        repository.updateTask(currentTask)

        # NOTE(review): the created task is neither stored nor awaited, so a
        # failing insert is not handled here - confirm this is intended.
        asyncio.create_task(completeMarcapTask())

    # Event raised when the crawler connects to the web driver.
    def onConnectingWebDriver(self, dto: StockRunCrawling) -> None:
        """Mark the task as "connecting webdriver" and log its unique id."""
        repository = self.tasksRepository
        currentTask = repository.getTask(dto.taskId, dto.taskUniqueId)
        currentTask.state = "connecting webdriver"
        repository.updateTask(currentTask)
        self.logger.info("onConnectingWebDriver", currentTask.taskUniqueId)

    # Event raised when crawling starts.
    def onStartCrawling(self, dto: StockRunCrawling) -> None:
        """Mark the task as "start crawling" and log its unique id."""
        repository = self.tasksRepository
        currentTask = repository.getTask(dto.taskId, dto.taskUniqueId)
        currentTask.state = "start crawling"
        repository.updateTask(currentTask)
        self.logger.info("onStartCrawling", currentTask.taskUniqueId)

    # Event raised when the download of crawling data starts.
    def onDownloadStart(self, dto: StockCrawlingDownloadTask) -> None:
        """Mark the task as "download start" and log its unique id."""
        repository = self.tasksRepository
        currentTask = repository.getTask(dto.taskId, dto.taskUniqueId)
        currentTask.state = "download start"
        repository.updateTask(currentTask)
        self.logger.info("onDownloadStart", currentTask.taskUniqueId)

    # Event raised when the download of crawling data completes.
    def onDownloadComplete(self, dto: StockCrawlingDownloadTask) -> None:
        """Mark the task as "download complete" and log its unique id."""
        repository = self.tasksRepository
        currentTask = repository.getTask(dto.taskId, dto.taskUniqueId)
        currentTask.state = "download complete"
        repository.updateTask(currentTask)
        self.logger.info("onDownloadComplete", currentTask.taskUniqueId)

    # Event raised when conversion of the crawled data completes.
    def onParsingComplete(self, isSuccess: bool,
                          retdto: StockMarketCapitalResult,
                          dto: StockCrawlingDownloadTask) -> None:
        """Log the parsing outcome and complete the task step on failure.

        A successful parse is only logged here. A failed one is also passed
        to ``tasksRepository.completeStockCrawlingTask``.

        Args:
            isSuccess: Whether parsing succeeded.
            retdto: Result object produced by the parser.
            dto: Download task the result belongs to.
        """
        self.logger.info("onParsingComplete")
        self.logger.info(f"taskId:{dto.taskId} taskUniqueId{dto.taskUniqueId}")
        registeredTasks = self.tasksRepository.tasksdto.tasks[dto.taskId]["list"]
        self.logger.info(f"taskDTO: {registeredTasks}")
        if isSuccess:
            return
        self.tasksRepository.completeStockCrawlingTask(isSuccess, retdto, dto)

    # Event raised when crawling is cancelled.
    def onCancelled(self, dto: StockRunCrawling) -> None:
        self.logger.info("onCancelled")
        # self.tasksRepository.updateAllTask()
        # task = self.tasksRepository.getTask(dto.taskId, dto.taskUniqueId)
        # self.tasksRepository.fail(task, task.restCount)
        # task.state = "cancelled"
        # self.tasksRepository.updateTask(task)
        # self.logger.info("onCancelled", task.taskUniqueId)

    # Event raised when crawling fails with an error.
    def onError(self, dto: StockRunCrawling, errorMsg: str) -> None:
        """Fail the task's remaining work and record the error on the task.

        Args:
            dto: Crawling request identifying the task.
            errorMsg: Message stored in the task's ``errMsg``.
        """
        repository = self.tasksRepository
        failedTask = repository.getTask(dto.taskId, dto.taskUniqueId)
        # fail() is called before the state and message are set on the task.
        repository.fail(failedTask, failedTask.restCount)
        failedTask.state = "error"
        failedTask.errMsg = errorMsg
        repository.updateTask(failedTask)
        self.logger.error("onError", failedTask.taskUniqueId)
Exemple #2
0
class MarcapCrawler(object):
    
    def __init__(self) -> None:
        """Create the crawler's own event emitter and logger."""
        super().__init__()
        self.logger = Logger("MarcapCrawler")
        self.ee = EventEmitter()

    def createUUID(self) -> str:
        return str(uuid.uuid4())

    async def connectWebDriver(self, addr: str, uuid: str) -> WebDriver:
        """Open a remote Chrome session with a per-crawl download directory.

        Args:
            addr: Command executor address of the remote web driver.
            uuid: Identifier used as the download sub-directory name.

        Returns:
            The remote driver, with page-load and script timeouts of 60.
        """
        downloadPrefs = {
            'profile.default_content_setting_values.automatic_downloads': 1,
            'download.default_directory': f"/home/seluser/Downloads/{uuid}"
        }
        options = webdriver.ChromeOptions()
        options.add_experimental_option("prefs", downloadPrefs)
        remoteDriver = webdriver.Remote(command_executor=addr,
                                        options=options)
        remoteDriver.set_page_load_timeout(60)
        remoteDriver.set_script_timeout(60)
        self.logger.info("connectWebDriver", "create driver")
        return remoteDriver

    def connectLocalDriver(self, addr: str, uuid: str) -> WebDriver:
        """Start a local Chrome driver with a per-crawl download directory.

        The chromedriver binary and the download root are fixed local paths.

        Args:
            addr: Not used by this method.
            uuid: Identifier used as the download sub-directory name.
        """
        downloadPrefs = {
            'profile.default_content_setting_values.automatic_downloads': 1,
            'download.default_directory': f"/Users/iseongjae/Documents/PersonalProjects/fin-web/fin-crawling-server/server/downloads/{uuid}"
        }
        options = webdriver.ChromeOptions()
        options.add_experimental_option("prefs", downloadPrefs)
        return webdriver.Chrome(
            executable_path="/Users/iseongjae/Downloads/chromedriver",
            chrome_options=options)

    async def crawling(self, dto: StockRunCrawling) -> None:
        """Crawl market-cap data for every day in the requested date range.

        Starts a download observer on a fresh per-crawl directory, opens the
        remote web driver, loads the KRX page and then triggers one download
        per day from ``dto.startDateStr`` to ``dto.endDateStr`` (inclusive).
        Progress is reported through events on ``self.ee``.

        Args:
            dto: Crawling request (market, date range, driver address and
                task identifiers).

        Raises:
            Exception: Any error raised while connecting or downloading is
                propagated after the observer and the driver are released.
        """
        driver = None
        downloadObserver = None
        try:
            uuid = self.createUUID()
            self.logger.info("crawling", uuid)
            self.ee.emit(EVENT_MARCAP_CRAWLING_ON_CONNECTING_WEBDRIVER, dto)

            downloadObserver = DownloadObserver()
            path = await asyncRetryNonBlock(5, 1, downloadObserver.makePath, uuid)
            downloadObserver.startObserver(path, self.ee)
            self.logger.info("crawling", "create observer and start")
            print("startObserver")

            driver = await asyncRetryNonBlock(5, 1, self.connectWebDriver, dto.driverAddr, uuid)
            print("connectWebDriver")
            driver.get("http://data.krx.co.kr/contents/MDC/MDI/mdiLoader/index.cmd?menuId=MDC0201020101")
            try:
                # Best effort: accept an alert if one shows up within 3
                # seconds; no alert (or any other failure) is only printed.
                alert = WebDriverWait(driver, timeout=3).until(EC.alert_is_present())
                alert.accept()
            except Exception as e:
                print("예외발생:"+str(e))
            print("start:"+dto.startDateStr)

            self.ee.emit(EVENT_MARCAP_CRAWLING_ON_START_CRAWLING, dto)
            WebDriverWait(driver, timeout=20, poll_frequency=1).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#mktId_0_1")))
            date = datetime.strptime(dto.startDateStr, "%Y%m%d")
            endDate = datetime.strptime(dto.endDateStr, "%Y%m%d")

            while date <= endDate:
                dateStr = date.strftime("%Y%m%d")
                downloadTask = StockCrawlingDownloadTask(**{
                    "dateStr": dateStr,
                    "market": dto.market,
                    "uuid": uuid,
                    "taskId": dto.taskId,
                    "taskUniqueId": dto.taskUniqueId
                })
                self.logger.info("crawling", f"create downloadTask taskId: {dto.taskId} market: {dto.market} date: {dateStr} taskUniqueId: {dto.taskUniqueId}")
                print(downloadTask.json())
                downloadObserver.event_handler.setDownloadTask(downloadTask)
                self.ee.emit(EVENT_MARCAP_CRAWLING_ON_DOWNLOAD_START, downloadTask)
                await asyncRetryNonBlock(5, 1, self.downloadData, downloadTask, downloadObserver, driver)
                date = date + timedelta(days=1)
        finally:
            # Release the two resources independently: an error while
            # stopping the observer must not leave the browser session open.
            try:
                if downloadObserver:
                    downloadObserver.stopObserver()
            finally:
                if driver:
                    driver.quit()
    
    async def downloadData(self, downloadTask: StockCrawlingDownloadTask, downloadObserver: DownloadObserver, driver: WebDriver) -> None:
        """Trigger the CSV download for one day and parse the received file.

        Selects the market, sets the trade date, runs the search, waits for
        the page's timestamp text to change, clicks the CSV download and then
        waits up to 30 seconds for the file-created event before handing the
        file to ``makeMarcapData``.

        Args:
            downloadTask: Day/market to download, plus task identifiers.
            downloadObserver: Not used directly by this method.
            driver: Web driver with the KRX page loaded; ``None`` is a no-op.

        Raises:
            asyncio.TimeoutError: No file-created event arrived in 30 seconds.
        """
        self.logger.info("downloadData")
        if driver is None:
            return
        before = driver.execute_script("return $('.CI-MDI-UNIT-TIME').text()")

        # Radio-button element per supported market; others click nothing.
        marketSelectors = {
            "kospi": "#mktId_0_1",
            "kosdaq": "#mktId_0_2",
            "konex": "#mktId_0_3",
        }
        selector = marketSelectors.get(downloadTask.market)
        if selector is not None:
            driver.execute_script(f'$("{selector}").click()')

        driver.execute_script(f'$("#trdDd")[0].value = "{downloadTask.dateStr}"')
        driver.execute_script('$(".btn_component_search").click()')

        # Poll until the timestamp text differs from the value read before
        # the search was started.
        # NOTE(review): this wait has no upper bound - confirm the text
        # always changes after a search.
        after = before
        while before == after:
            after = driver.execute_script('return $(".CI-MDI-UNIT-TIME").text()')
            await sleepNonBlock(0.5)
        print("before:"+before)
        print("after:"+after)
        await sleepNonBlock(3)
        WebDriverWait(driver, timeout=10, poll_frequency=2).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "*[class='CI-MDI-UNIT-DOWNLOAD']")))
        driver.execute_script("$('[class=\"CI-MDI-UNIT-DOWNLOAD\"]').click()")
        WebDriverWait(driver, timeout=10, poll_frequency=2).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "*[data-type='csv']")))
        driver.execute_script("$(\"[data-type='csv']\").click()")
        print("wait:"+downloadTask.dateStr)

        loop = asyncio.get_running_loop()
        # The queue binds to the running loop by itself; the explicit
        # ``loop`` argument was removed from asyncio.Queue in Python 3.10.
        queue: asyncio.Queue = asyncio.Queue(maxsize=1)

        async def fileResultOfData(event: FileCreatedEvent, downloadTask: StockCrawlingDownloadTask) -> None:
            await queue.put({"event": event, "downloadTask": downloadTask})

        @self.ee.once(FILE_SYSTEM_HANDLER(downloadTask.uuid))
        def downloadComplete(event: FileCreatedEvent, downloadTask: StockCrawlingDownloadTask) -> None:
            loop.create_task(fileResultOfData(event, downloadTask))

        result = await asyncio.wait_for(queue.get(), timeout=30)
        # Acknowledge only an item that was really received: task_done()
        # after a timed-out get() raises ValueError and would hide the
        # timeout from the caller.
        queue.task_done()
        self.ee.emit(EVENT_MARCAP_CRAWLING_ON_DOWNLOAD_COMPLETE, downloadTask)
        await asyncio.create_task(self.makeMarcapData(result["event"], result["downloadTask"]))

    def convertFileToDto(self, path: str, dto: StockMarketCapitalResult) -> None:
        """Parse a UTF-8 CSV file and append one record per row to ``dto.data``.

        The first row is treated as a header and skipped, and empty rows are
        ignored. For the "kospi" market the value columns start at index 2;
        for every other market column 2 is skipped and they start at index 3.

        Args:
            path: Path of the UTF-8 encoded CSV file.
            dto: Result object; its ``date`` and ``market`` are copied into
                every record and its ``data`` list receives the records.

        Raises:
            IndexError: A non-empty row has fewer columns than expected.
        """
        # Local import keeps this method self-contained.
        import csv

        valueFields = ("close", "diff", "percent", "open", "high", "low",
                       "volume", "price", "marcap", "number")
        firstValueColumn = 2 if dto.market == "kospi" else 3

        # csv.reader handles quoted fields (including commas inside quotes),
        # which plain str.split(",") would cut in the wrong place.
        with open(path, "r", encoding="utf-8", newline="") as f:
            reader = csv.reader(f)
            next(reader, None)  # header row
            for row in reader:
                if not row:
                    continue
                record = {
                    "date": dto.date,
                    "market": dto.market,
                    "code": row[0].strip(),
                    "name": row[1].strip(),
                }
                for column, fieldName in enumerate(valueFields,
                                                   start=firstValueColumn):
                    record[fieldName] = row[column].strip()
                dto.data.append(StockMarketCapital(**record))

    async def isExistFile(self, path: str, ext: str = ".csv") -> bool:
        isExist = path.endswith(ext)
        restTimes = 3
        while not isExist and restTimes >= 0:
            await sleepNonBlock(1)
            isExist = path.endswith(ext)
            restTimes -= 1
        return isExist
    
    async def parseReceivedFile(self, event: FileCreatedEvent, downloadTask: StockCrawlingDownloadTask) -> None:
        """Convert a downloaded file into a result and emit it.

        Returns without emitting anything when ``isExistFile`` rejects the
        source path or when the destination ``<market>-<date>.csv`` already
        exists. Otherwise the file is re-encoded to UTF-8, renamed, parsed,
        and the parsing-complete and result events are emitted.

        Args:
            event: File-created event; ``src_path`` is the downloaded file.
            downloadTask: Day/market the file belongs to.
        """
        retdto = StockMarketCapitalResult()
        market = downloadTask.market
        date = downloadTask.dateStr
        retdto.date = date
        retdto.market = market

        sourcePath = event.src_path
        if not await self.isExistFile(sourcePath):
            return
        print("created: " + date)
        await sleepNonBlock(0.5)

        targetName = market + "-" + date
        dest_path = f'{os.path.dirname(sourcePath)}/{targetName}.csv'
        if os.path.isfile(dest_path):
            return

        # Re-encode in place first, then move to the final name and parse.
        self.changeCharSet(sourcePath)
        os.rename(sourcePath, dest_path)
        self.convertFileToDto(dest_path, retdto)
        retdto.result = "success"
        self.ee.emit(EVENT_MARCAP_CRAWLING_ON_PARSING_COMPLETE, True, retdto, downloadTask)
        self.ee.emit(EVENT_MARCAP_CRAWLING_ON_RESULT_OF_STOCK_DATA, downloadTask, retdto)
        self.logger.info("parseFile", f"success, {downloadTask.taskUniqueId}")
    
    async def makeMarcapData(self, event: FileCreatedEvent, downloadTask: StockCrawlingDownloadTask) -> None:
        """Parse a received file with retries and report a failure as an event.

        Any exception from the retried parse is caught here: a "fail" result
        carrying the traceback is emitted with the parsing-complete event and
        the error is logged. Nothing is re-raised.

        Args:
            event: File-created event for the downloaded file.
            downloadTask: Day/market the file belongs to.
        """
        try:
            await asyncRetry(3, 1, self.parseReceivedFile, event, downloadTask)
        except Exception:
            failed = StockMarketCapitalResult()
            errorTrace = traceback.format_exc()
            failed.result = "fail"
            failed.errorMsg = errorTrace
            self.ee.emit(EVENT_MARCAP_CRAWLING_ON_PARSING_COMPLETE, False, failed, downloadTask)
            self.logger.error("parseFile", f"fail, {downloadTask.taskUniqueId} error: {errorTrace}")
        finally:
            self.logger.info("parseFile...")

    def changeCharSet(self, path: str) -> None:
        lines = None
        with open(path, "r", encoding="euc-kr") as f:
            lines = f.readlines()
        with open(path, 'w', encoding="utf-8") as f:
            f.writelines(lines)
Exemple #3
0
class FactorDartMongoDataSource(MongoDataSource):
    def __init__(self) -> None:
        """Initialise the base data source and create this class's logger."""
        super().__init__()
        # Assigned after the base initialiser has run.
        self.logger = Logger("FactorDartMongoDataSource")

    async def getFactor(self,
                        year: str = "*",
                        month: str = "*",
                        code: str = "*") -> list:
        """Query the ``factorDart`` collection and return ``FactorData`` items.

        The filter is built by ``mergeFindObj`` from the three arguments.

        Args:
            year: Value merged into the filter under "dataYear".
            month: Value merged into the filter under "dataMonth".
            code: Value merged into the filter under "code".

        Returns:
            One ``FactorData`` per matching document, or an empty list when
            any error occurs (the traceback is logged).
        """
        fields = ("code", "dataMonth", "dataName", "dataYear", "dataId",
                  "dataValue", "name")
        try:
            findObj: Dict[str, Any] = {}
            for key, value in (("dataYear", year), ("dataMonth", month),
                               ("code", code)):
                self.mergeFindObj(findObj, key, value)
            # Fetch every document first, then convert them.
            documents = list(self.factorDart.find(findObj))
            return [
                FactorData(**{field: document[field] for field in fields})
                for document in documents
            ]
        except Exception:
            self.logger.error("getFactor", traceback.format_exc())
            return []

    async def insertFactor(self, li: List[FactorDao]) -> None:
        """Upsert every factor into the ``factorDart`` collection.

        A document is matched on code, dataYear, dataMonth and dataName. All
        fields are written with a fresh "updatedAt"; "createdAt" is set only
        when the document is inserted. Errors are logged, not raised.

        Args:
            li: Factors to upsert.
        """
        keyFields = ("code", "dataYear", "dataMonth", "dataName")
        try:
            if not self.isSetupMarcap():
                self.setupMarcap()
            for dao in li:
                document = dao.dict()
                document["updatedAt"] = getNow()
                selector = {key: document[key] for key in keyFields}
                changes = {
                    "$set": document,
                    "$setOnInsert": {"createdAt": getNow()},
                }
                self.factorDart.update_one(selector, changes, upsert=True)
        except Exception:
            self.logger.error("insertFactor", traceback.format_exc())

    def getCompletedTask(self, dto: ListLimitData) -> ListLimitResponse:
        """Return one page of finished tasks, newest first.

        Finished means the task's state is "success" or "fail". The page is
        sorted by "createdAt" descending and cut with the offset and limit
        taken from ``dto``.

        Args:
            dto: Paging request providing "offset" and "limit".

        Returns:
            A ``ListLimitResponse`` with the total number of finished tasks,
            the paging values and the page's documents passed through
            ``exceptId``. When any error occurs the traceback is logged and
            an empty list is returned instead.
        """
        try:
            data = dto.dict()
            offset, limit = data["offset"], data["limit"]
            finishedFilter = {
                "$or": [{"state": "success"}, {"state": "fail"}]
            }
            cursor = (self.task.find(finishedFilter)
                      .sort("createdAt", DESCENDING)
                      .skip(offset)
                      .limit(limit))
            # Cursor.count() no longer exists in PyMongo 4; count_documents
            # evaluates the same filter on the collection.
            count = self.task.count_documents(finishedFilter)
            return ListLimitResponse(
                count=count,
                offset=offset,
                limit=limit,
                data=self.exceptId(list(cursor)),
            )
        except Exception:
            self.logger.error("getCompletedTask", traceback.format_exc())
        return []
Exemple #4
0
class FactorService:
    def __init__(self, manager: ConnectionManager,
                 factorRepository: FactorRepository,
                 tasksRepository: TasksRepository,
                 crawlerRepository: CrawlerRepository,
                 taskService: 'TaskService') -> None:
        """Keep the injected collaborators and create the service logger.

        Args:
            manager: Connection manager; only stored here.
            factorRepository: Used to read and insert factor data.
            tasksRepository: Used to register, run and update tasks.
            crawlerRepository: Receives each crawler under its task's
                unique id while the crawl runs.
            taskService: Task service; only stored here.
        """
        self.logger = Logger("FactorService")
        self.taskService = taskService
        self.crawlerRepository = crawlerRepository
        self.tasksRepository = tasksRepository
        self.factorRepository = factorRepository
        self.manager = manager

    async def getFactor(self, code: str, year: str, month: str,
                        source: str) -> List[FactorData]:
        return await self.factorRepository.getFactor(code, year, month, source)

    def crawlingFactorDartData(self, dto: DartApiCrawling) -> None:
        """Register and start a DART factor crawl for a range of years.

        A task with one entry per year from ``dto.startYear`` to
        ``dto.endYear`` (inclusive) is added in state "find worker", then a
        worker running the crawler is handed to the task repository.

        Args:
            dto: Crawling request (year range and task identifiers).
        """
        async def crawlingFactorDartDataTask(pool: Pool,
                                             taskPool: TaskPool) -> None:
            # The crawler stays registered under the task's unique id while
            # it runs; the pool entry is released on every exit path.
            try:
                crawler = DartApiCrawler()
                self.crawlerRepository.addCrawler(dto.taskUniqueId, crawler)
                self.createFactorDartListener(crawler.ee)
                await crawler.crawling(dto)
                self.crawlerRepository.removeCrawler(dto.taskUniqueId)
            except asyncio.CancelledError:
                self.logger.info("crawlingFactorDartDataTask", "cancel")
            except Exception:
                errorTrace = traceback.format_exc()
                self.logger.error("crawlingFactorDartDataTask",
                                  f"error: {errorTrace}")
                self.tasksRepository.errorTask(dto, errorTrace)
            finally:
                taskPool.removeTaskPool(pool)

        firstYear, lastYear = dto.startYear, dto.endYear
        count = lastYear - firstYear + 1
        task = ProcessTask(
            market="",
            startDateStr=firstYear,
            endDateStr=lastYear,
            taskUniqueId=dto.taskUniqueId,
            taskId=dto.taskId,
            count=count,
            tasks=list(range(firstYear, lastYear + 1)),
            restCount=count,
            tasksRet=[0] * count,
            state="find worker",
        )
        self.tasksRepository.addTask(task)
        self.tasksRepository.runTask(
            Task(dto.taskUniqueId, crawlingFactorDartDataTask))

    # Store the factors found in the file into the database.
    def convertFactorFileToDb(self, dto: RunFactorFileConvert) -> None:
        """Register and start the task that loads file factors into the db.

        A single-step task is added in state "start get file", then a worker
        is handed to the task repository. The worker reads the factors from
        file, converts them in batches of 100 and inserts them.

        Args:
            dto: Conversion request carrying the task identifiers.
        """
        self.logger.info("convertFactorFileToDb")

        async def convertFactorFileToDbTask(pool: Pool,
                                            taskPool: TaskPool) -> None:
            repository = self.tasksRepository
            try:
                task = repository.getTask(dto.taskId, dto.taskUniqueId)
                data = await asyncio.create_task(
                    self.factorRepository.getFactorsInFile())

                task.state = "make Factor Object"
                repository.updateTask(task)
                daoList = await batchFunction(100, data,
                                              self.makeFactorDaoList)

                task.state = "start insert db"
                repository.updateTask(task)
                self.logger.info("convertFactorFileToDbTask",
                                 f"insertCount: {len(daoList)}")
                await self.factorRepository.insertFactor(daoList)

                task.state = "complete"
                repository.completeFactorConvertFileToDbTask(task)
            except asyncio.CancelledError:
                self.logger.info("convertFactorFileToDbTask", "cancel")
            except Exception:
                errorTrace = traceback.format_exc()
                self.logger.error("convertFactorFileToDbTask",
                                  f"error: {errorTrace}")
                repository.errorTask(dto, errorTrace)
            finally:
                # The pool entry is released on every exit path.
                taskPool.removeTaskPool(pool)

        task = ProcessTask(
            market="",
            startDateStr="20070101",
            endDateStr="20191231",
            taskUniqueId=dto.taskUniqueId,
            taskId=dto.taskId,
            count=1,
            tasks=["convert"],
            restCount=1,
            tasksRet=[0],
            state="start get file",
        )
        self.tasksRepository.addTask(task)
        self.tasksRepository.runTask(
            Task(dto.taskUniqueId, convertFactorFileToDbTask))

    async def makeFactorDaoList(self, data: List[Dict]) -> List[FactorDao]:
        """Convert raw factor rows (Korean column names) into ``FactorDao``s.

        Args:
            data: Rows keyed by the Korean column names read below.

        Returns:
            One ``FactorDao`` per row, in input order.
        """
        return [
            FactorDao(
                code=row["종목코드"],  # stock code
                name=row["종목명"],  # stock name
                dataYear=row["년"],  # settlement year
                dataMonth=row["결산월"],  # settlement month
                dataName=row["데이터명"],  # data name
                # The value is multiplied by 1000 when the unit is "천원"
                # (thousand won); otherwise it is taken as is.
                dataValue=(row["데이터값"] * 1000
                           if row["단위"] == "천원" else row["데이터값"]),
            ) for row in data
        ]

    def createFactorDartListener(self, ee: EventEmitter) -> None:
        """Subscribe this service's handlers to a DART crawler's emitter.

        Args:
            ee: Emitter of the crawler whose events should be handled.
        """
        subscriptions = (
            (EVENT_DART_API_CRAWLING_ON_DOWNLOADING_CODES,
             self.onDownloadingCodes),
            (EVENT_DART_API_CRAWLING_ON_CRAWLING_FACTOR_DATA,
             self.onCrawlingFactorData),
            (EVENT_DART_API_CRAWLING_ON_COMPLETE_YEAR, self.onCompleteYear),
            (EVENT_DART_API_CRAWLING_ON_RESULT_OF_FACTOR,
             self.onResultOfFactor),
            (EVENT_DART_API_CRAWLING_ON_CANCEL, self.onCancelled),
        )
        # Registration order is kept identical to the listing above.
        for eventName, handler in subscriptions:
            ee.on(eventName, handler)

    def onDownloadingCodes(self, dto: DartApiCrawling) -> None:
        """Log the event and mark the task as "download Codes"."""
        self.logger.info("onDownloadingCodes", dto.taskUniqueId)
        repository = self.tasksRepository
        currentTask = repository.getTask(dto.taskId, dto.taskUniqueId)
        currentTask.state = "download Codes"
        repository.updateTask(currentTask)

    def onCrawlingFactorData(self, dto: DartApiCrawling) -> None:
        """Log the event and mark the task as "crawling factor data"."""
        self.logger.info("onCrawlingFactorData", dto.taskUniqueId)
        repository = self.tasksRepository
        currentTask = repository.getTask(dto.taskId, dto.taskUniqueId)
        currentTask.state = "crawling factor data"
        repository.updateTask(currentTask)

    def onCompleteYear(self, dto: DartApiCrawling, year: int) -> None:
        """Log the event and report the finished year to the task repository.

        Args:
            dto: Crawling request identifying the task.
            year: Year passed on to ``completeFactorDart``.
        """
        self.logger.info("onCompleteYear", dto.taskUniqueId)
        repository = self.tasksRepository
        currentTask = repository.getTask(dto.taskId, dto.taskUniqueId)
        repository.completeFactorDart(currentTask, year)

    def onResultOfFactor(self, dto: DartApiCrawling, year: int,
                         obj: List) -> None:
        """Convert crawled DART rows to ``FactorDao``s and insert them.

        The insert is started as a background asyncio task.

        Args:
            dto: Crawling request; only its unique id is logged.
            year: Not used by this handler.
            obj: Crawled rows keyed by the field names read below.
        """
        self.logger.info("onResultOfFactor", dto.taskUniqueId)
        listOfFactorDao = [
            FactorDao(
                code=row["crawling_code"],
                name=row["crawling_name"],
                dataYear=row["bsns_year"],
                dataMonth=getMonthFromReprtCode(row["reprt_code"]),
                dataName=row["account_nm"],
                dataValue=row["thstrm_amount"],
                dataId=row["account_id"],
            ) for row in obj
        ]
        # NOTE(review): the created task is neither stored nor awaited, so a
        # failing insert is not handled here - confirm this is intended.
        insertion = self.factorRepository.insertFactorDart(listOfFactorDao)
        asyncio.create_task(insertion)

    def onCancelled(self, dto: DartApiCrawling) -> None:
        self.logger.info("onCancelled")
Exemple #5
0
class DartApiCrawler(object):
    """Crawl yearly financial-statement records from the Open DART API.

    Progress is reported by emitting events on ``self.ee``; callers attach
    listeners to it. Setting ``self.isCancelled`` to ``True`` makes
    ``crawling`` stop after the request that is currently in flight.
    """

    # Headers sent with every Open DART request (shared by both endpoints).
    _REQUEST_HEADERS = {
        'User-Agent':
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.2 (KHTML, like Gecko) Chrome/22.0.1216.0 Safari/537.2'",
        'accept-language': 'ko',
    }

    def __init__(self) -> None:
        super().__init__()
        self.ee = EventEmitter()
        self.isLock = False
        self.isCancelled = False
        self.logger = Logger("DartApiCrawler")

    def createUUID(self) -> str:
        """Return a new random (version 4) UUID as a string."""
        return str(uuid.uuid4())

    async def downloadCodes(self, isCodeNew: bool, apiKey: str) -> Dict:
        """Load the stock-code to company-code table, downloading if needed.

        The table is read from ``CORPCODE.xml``. The archive containing it
        is downloaded and extracted first when ``isCodeNew`` is true or the
        XML file does not exist yet.

        Args:
            isCodeNew: Force a fresh download even if the XML file exists.
            apiKey: Open DART API key, sent as ``crtfc_key``.

        Returns:
            Dict keyed by 6-character stock code. Each value is a dict with
            ``"corp_code"`` and, when the XML entry has a ``corp_name``
            element, ``"corp_name"``.

        Raises:
            Whatever the HTTP request, the zip extraction or the XML parsing
            raises; nothing is caught here.
        """
        # When running under pytest the data directory is relative to the
        # test working directory instead of the application tree.
        if "pytest" in sys.modules:
            loadpath = Path('factors/codes')
        else:
            loadpath = Path('app/static/factors/codes')
        datapath = loadpath / "CORPCODE.xml"

        if isCodeNew or not datapath.resolve().exists():
            params = {"crtfc_key": apiKey}
            url = "https://opendart.fss.or.kr/api/corpCode.xml"
            async with aiohttp.ClientSession() as session:
                async with session.get(
                        url, params=params,
                        headers=self._REQUEST_HEADERS) as response:
                    data = await response.read()
            # Open the archive in a context manager so it is always closed.
            with ZipFile(io.BytesIO(data)) as archive:
                archive.extractall(loadpath.resolve())

        tree = ET.parse(datapath.resolve())
        codes: Dict[str, Any] = {}
        for entry in tree.findall("list"):
            stockEl = entry.find("stock_code")
            if stockEl is None:
                continue
            stockCode = stockEl.text
            # Only entries with a 6-character stock code are kept.
            if not isinstance(stockCode, str) or len(stockCode) != 6:
                continue
            codeEl = entry.find("corp_code")
            if codeEl is None:
                continue
            info: Dict[str, Any] = {"corp_code": codeEl.text}
            nameEl = entry.find("corp_name")
            if nameEl is not None:
                info["corp_name"] = nameEl.text
            codes[stockCode] = info
        return codes

    async def crawling(self, dto: DartApiCrawling) -> None:
        """Crawl every known company for each year in the requested range.

        Events emitted on ``self.ee`` (all carry ``dto`` first):
            * downloading-codes, before the code table is loaded;
            * crawling-factor-data, once after loading and once per year;
            * result-of-factor, with ``year`` and the records as a list of
              dicts, for every company that returned data;
            * complete-year, with ``year``, after all companies of a year;
            * cancel, when ``self.isCancelled`` is set.

        Args:
            dto: Crawl request. ``startYear`` values below 2015 are raised
                to 2015 in place; ``endYear`` is inclusive.

        Raises:
            Whatever the retried download/fetch calls raise.
        """
        # Original note (translated): CPU-bound work.
        if dto.startYear < 2015:
            dto.startYear = 2015
        self.ee.emit(EVENT_DART_API_CRAWLING_ON_DOWNLOADING_CODES, dto)
        # NOTE(review): 5 and 1 are presumably the retry count and the delay
        # between attempts; confirm against asyncRetryNonBlock.
        codes = await asyncRetryNonBlock(5,
                                         1,
                                         self.downloadCodes,
                                         isCodeNew=dto.isCodeNew,
                                         apiKey=dto.apiKey)
        self.ee.emit(EVENT_DART_API_CRAWLING_ON_CRAWLING_FACTOR_DATA, dto)
        for year in range(dto.startYear, dto.endYear + 1):
            self.ee.emit(EVENT_DART_API_CRAWLING_ON_CRAWLING_FACTOR_DATA,
                         dto)
            self.logger.info("crawling", str(len(codes)))
            for code in codes:
                newDf = await asyncRetryNonBlock(5, 1, self.getYearDf,
                                                 dto.apiKey, code, codes,
                                                 year)
                if self.isCancelled:
                    self.ee.emit(EVENT_DART_API_CRAWLING_ON_CANCEL, dto)
                    # Stop here: previously the loop kept crawling (and kept
                    # emitting results and year-complete events) after the
                    # cancel event had been sent.
                    return
                if newDf is not None:
                    self.logger.info("crawling", code)
                    self.ee.emit(
                        EVENT_DART_API_CRAWLING_ON_RESULT_OF_FACTOR, dto,
                        year, newDf.to_dict("records"))
            self.ee.emit(EVENT_DART_API_CRAWLING_ON_COMPLETE_YEAR, dto,
                         year)
            self.logger.info("crawling", str(year))

    async def getYearDf(self, apiKey: str, code: str, codes: Dict,
                        year: int) -> pd.DataFrame:
        """Fetch one company's records for one year as a DataFrame.

        Args:
            apiKey: Open DART API key, sent as ``crtfc_key``.
            code: Stock code; must be a key of ``codes``.
            codes: Table as returned by ``downloadCodes``.
            year: Business year to request.

        Returns:
            A DataFrame built from the ``list`` field of the JSON response,
            with ``crawling_year``, ``crawling_code`` and ``crawling_name``
            columns added, or ``None`` when the response has no ``list``
            field.

        Raises:
            Any exception from building the request, the HTTP call or the
            JSON handling is logged with its traceback and re-raised.
            ``KeyError`` if the entry for ``code`` has no ``corp_name``.
        """
        self.logger.info("getYearDf", f"crawling: {code}")
        try:
            url = 'https://opendart.fss.or.kr/api/fnlttSinglAcntAll.json'
            params = {
                'crtfc_key': apiKey,
                'corp_code': codes[code]["corp_code"],
                'bsns_year': year,  # business year
                'reprt_code': "11011",  # "11011": annual business report
                # "CFS": consolidated statements, "OFS": separate statements
                'fs_div': "CFS",
            }
            connector = aiohttp.TCPConnector(limit=50, force_close=True)
            async with aiohttp.ClientSession(connector=connector) as session:
                timeout = aiohttp.ClientTimeout(total=15)
                async with session.get(
                        url,
                        params=params,
                        timeout=timeout,
                        headers=self._REQUEST_HEADERS) as response:
                    data = await response.json()
                    if 'list' not in data:
                        return None
                    df = pd.json_normalize(data, 'list')
        except Exception:
            self.logger.error("getYearDf", traceback.format_exc())
            raise
        self.logger.info("df", str(df))
        # Tag every row with what was requested so downstream consumers do
        # not need the request context.
        name = codes[code]["corp_name"]
        df["crawling_year"] = year
        df["crawling_code"] = code
        df["crawling_name"] = name
        self.logger.info("getYearDf",
                         f"{str(year)} {str(code)} {str(name)}")
        return df
Exemple #6
0
class TaskMongoDataSource(MongoDataSource):
    """Queries and upserts on the task collection (``self.task``).

    Every method logs failures with a traceback instead of raising.
    NOTE(review): ``self.task`` and ``self.exceptId`` are not defined here;
    presumably they come from ``MongoDataSource``, with ``self.task`` being
    a PyMongo collection.
    """

    # Task states that getCompletedTask treats as finished.
    _COMPLETED_STATES = ["success", "fail", "complete", "error", "cancelled"]

    def __init__(self) -> None:
        super().__init__()
        self.logger = Logger("TaskMongoDataSource")

    def getCompletedTask(self, dto: ListLimitDao) -> ListLimitDataDao:
        """Return one page of finished tasks, newest ``createdAt`` first.

        Args:
            dto: Paging request; its ``dict()`` must provide ``taskId``,
                ``offset`` and ``limit``.

        Returns:
            A ``ListLimitDataDao`` holding the page (passed through
            ``self.exceptId``), the total number of finished tasks and the
            paging values echoed back. If anything raises, the traceback is
            logged and an empty list is returned instead.
        """
        try:
            data = dto.dict()
            # One filter shared by the page query and the total count.
            finished = {"state": {"$in": self._COMPLETED_STATES}}
            cursor = (self.task.find(finished)
                      .sort("createdAt", DESCENDING)
                      .skip(data["offset"])
                      .limit(data["limit"]))
            # Cursor.count() is deprecated since PyMongo 3.7 and removed in
            # 4.0; count_documents is its replacement.
            count = self.task.count_documents(finished)
            return ListLimitDataDao(
                **{
                    "taskId": data["taskId"],
                    "count": count,
                    "offset": data["offset"],
                    "limit": data["limit"],
                    "data": self.exceptId(list(cursor))
                })
        except Exception:
            self.logger.error("getCompletedTask", traceback.format_exc())
        return []

    def getAllTaskState(self, taskId: str, market: str) -> list:
        """Return the ``tasks`` / ``tasksRet`` fields of matching tasks.

        Args:
            taskId: Value the documents' ``taskId`` must equal.
            market: Value the documents' ``market`` must equal.

        Returns:
            List of matching documents restricted to the projected fields,
            or an empty list if the query raises (the error is logged).
        """
        try:
            cursor = self.task.find({
                "taskId": taskId,
                "market": market
            },
                                    projection=["tasks", "tasksRet"])
            return list(cursor)
        except Exception:
            self.logger.error("getAllTaskState", traceback.format_exc())
        return []

    def upsertTask(self, value: dict) -> None:
        """Insert or update the task identified by ``value["taskUniqueId"]``.

        ``value`` is modified in place: ``updatedAt`` is set before the
        write. ``createdAt`` is only written when the document is inserted.
        Errors are logged, not raised.
        """
        try:
            # Take the timestamp once so a fresh insert gets identical
            # createdAt and updatedAt values.
            now = getNow()
            value["updatedAt"] = now
            self.task.update_one(
                {"taskUniqueId": value["taskUniqueId"]},
                {
                    "$set": value,
                    "$setOnInsert": {
                        "createdAt": now
                    }
                },
                upsert=True)
        except Exception:
            self.logger.error("upsertTask", traceback.format_exc())
class StockMongoDataSource(MongoDataSource):
    """Market-capitalisation and task storage on top of ``MongoDataSource``.

    Every method logs failures with a traceback instead of raising.
    NOTE(review): ``self.marcap``, ``self.task``, ``isSetupMarcap``,
    ``setupMarcap`` and ``exceptId`` are not defined here; presumably they
    come from ``MongoDataSource``, with the collections being PyMongo
    collections.
    """

    # Fields copied from a stored document into StockMarketCapital.
    _MARCAP_FIELDS = ("date", "market", "code", "name", "close", "diff",
                      "percent", "open", "high", "low", "volume", "price",
                      "marcap", "number")

    # Task states that getCompletedTask treats as finished.
    _COMPLETED_STATES = ["complete", "error", "cancelled"]

    def __init__(self) -> None:
        super().__init__()
        self.logger = Logger("StockMongoDataSource")

    async def insertMarcap(self, li: List[StockMarketCapital]) -> None:
        """Schedule an upsert for every record in ``li``.

        One task per record is created with ``asyncio.create_task``; the
        tasks are not awaited, so this coroutine returns before the writes
        have run. ``setupMarcap`` is called first when ``isSetupMarcap``
        reports false.
        """
        try:
            if not self.isSetupMarcap():
                self.setupMarcap()
            for one in li:
                asyncio.create_task(self.insertMarpcapOne(one))
        except Exception:
            self.logger.error("insertMarcap", traceback.format_exc())

    async def insertMarpcapOne(self, one: StockMarketCapital) -> None:
        """Upsert a single record keyed by ``code``, ``date`` and ``market``.

        ``updatedAt`` is always written; ``createdAt`` only when the
        document is inserted. Errors are logged, not raised.
        """
        try:
            data = one.dict()
            # Take the timestamp once so a fresh insert gets identical
            # createdAt and updatedAt values.
            now = getNow()
            data["updatedAt"] = now
            key = {
                "code": data["code"],
                "date": data["date"],
                "market": data["market"]
            }
            self.marcap.update_one(
                key,
                {
                    "$set": data,
                    "$setOnInsert": {"createdAt": now}
                },
                upsert=True)
        except Exception:
            self.logger.error("insertMarpcapOne", traceback.format_exc())

    async def getMarcap(self, market: str, startDate: str,
                        endDate: str) -> List[StockMarketCapital]:
        """Return the records of ``market`` dated within the given range.

        Args:
            market: Value the documents' ``market`` must equal.
            startDate: Inclusive lower bound compared against ``date``.
            endDate: Inclusive upper bound compared against ``date``.

        Returns:
            One ``StockMarketCapital`` per matching document, or an empty
            list if anything raises (the error is logged).
        """
        try:
            if not self.isSetupMarcap():
                self.setupMarcap()
            cursor = self.marcap.find({
                "$and": [{
                    "date": {
                        "$gte": startDate,
                        "$lte": endDate
                    }
                }, {
                    "market": market
                }]
            })
            # Copy only the model's fields; stored documents carry extra
            # keys such as the timestamps written on upsert.
            return [
                StockMarketCapital(
                    **{field: doc[field]
                       for field in self._MARCAP_FIELDS})
                for doc in cursor
            ]
        except Exception:
            self.logger.error("getMarcap", traceback.format_exc())
            return list()

    def getCompletedTask(self, dto: ListLimitData) -> ListLimitResponse:
        """Return one page of finished tasks, newest ``createdAt`` first.

        Args:
            dto: Paging request; its ``dict()`` must provide ``offset`` and
                ``limit``.

        Returns:
            A ``ListLimitResponse`` holding the page (passed through
            ``self.exceptId``), the total number of finished tasks and the
            paging values echoed back. If anything raises, the traceback is
            logged and an empty list is returned instead.
        """
        try:
            data = dto.dict()
            # One filter shared by the page query and the total count.
            finished = {"state": {"$in": self._COMPLETED_STATES}}
            cursor = (self.task.find(finished)
                      .sort("createdAt", DESCENDING)
                      .skip(data["offset"])
                      .limit(data["limit"]))
            # Cursor.count() is deprecated since PyMongo 3.7 and removed in
            # 4.0; count_documents is its replacement.
            count = self.task.count_documents(finished)
            return ListLimitResponse(**{
                "count": count,
                "offset": data["offset"],
                "limit": data["limit"],
                "data": self.exceptId(list(cursor))
            })
        except Exception:
            self.logger.error("getCompletedTask", traceback.format_exc())
        return []

    def getAllTaskState(self, taskId: str, market: str) -> list:
        """Return the ``tasks`` / ``tasksRet`` fields of matching tasks.

        Args:
            taskId: Value the documents' ``taskId`` must equal.
            market: Value the documents' ``market`` must equal.

        Returns:
            List of matching documents restricted to the projected fields,
            or an empty list if the query raises (the error is logged).
        """
        try:
            cursor = self.task.find({
                "taskId": taskId,
                "market": market
            }, projection=["tasks", "tasksRet"])
            return list(cursor)
        except Exception:
            self.logger.error("getAllTaskState", traceback.format_exc())
        return []

    def upsertTask(self, value: dict) -> None:
        """Insert or update the task identified by ``value["taskUniqueId"]``.

        ``value`` is modified in place: ``updatedAt`` is set before the
        write. ``createdAt`` is only written when the document is inserted.
        Errors are logged, not raised.
        """
        try:
            # Take the timestamp once so a fresh insert gets identical
            # createdAt and updatedAt values.
            now = getNow()
            value["updatedAt"] = now
            self.task.update_one({
                "taskUniqueId": value["taskUniqueId"]
            }, {
                "$set": value,
                "$setOnInsert": {"createdAt": now}
            }, upsert=True)
        except Exception:
            self.logger.error("upsertTask", traceback.format_exc())