Example #1
class JrjSpider(scrapy.Spider):
    name = 'jrj'
    allowed_domains = ['stock.jrj.com.cn']
    start_urls = ['http://*****:*****']

    def parse(self, response):
        temp0 = './/div[@class="titmain"]//h1//text()'
        temp = './/div[@class="texttit_m1"]//p//text()'
        #        print(response)
        item = CrawlerItem()
        item['link'] = response.url
        ans0 = response.xpath(temp0).getall()
        ans1 = response.xpath(temp).getall()
        item['title'] = ans0
        item['content'] = ans1
        preInfo = None
        if self.preInfoUrlDict is not None:
            preInfo = self.preInfoUrlDict[item['link']]
        elif len(self.preInfoList) == 1:
            preInfo = self.preInfoList[0]
        ansFinal = {
            'type': 'crawlerResult',
            'content': {
                'link': item['link'],
                'title': ans0,
                'content': ans1,
                'preInfo': preInfo
            }
        }
        ansJson = json.dumps(ansFinal)
        self.cacheAgent.push(ansJson)
        #        self.cache.close()
        #        self.cacheAgent.close()
        yield item
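The spider above serializes each parsed page into a 'crawlerResult' JSON message and pushes it onto the agent-side cache. As a rough illustration of the other end of that queue, here is a minimal consumer sketch; it assumes the Cache class used by the spiders is diskcache.Cache (whose push/pull calls match the ones above) and uses a hypothetical cache directory path:

import json

from diskcache import Cache  # assumption: the Cache used by the spiders is diskcache.Cache


def drainAgentCache(cacheDir):
    """Pull every queued message and collect the 'crawlerResult' payloads."""
    cache = Cache(cacheDir)
    results = []
    while True:
        _, raw = cache.pull()  # returns (None, None) once the queue is empty
        if raw is None:
            break
        msg = json.loads(raw)
        if msg.get('type') == 'crawlerResult':
            results.append(msg['content'])
    cache.close()
    return results


# Hypothetical usage:
# articles = drainAgentCache('/tmp/cacheAgentFolder')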
Example #2
class GeneralSpider(scrapy.Spider):
    name = 'general'
    allowed_domains = []
    start_urls = ['http://finance.jrj.com.cn/2020/04/24012529362098.shtml']

    def __init__(self,
                 cacheCrawlerPath='',
                 cacheKey='',
                 cacheAgentPath='',
                 *args,
                 **kwargs):
        super().__init__(*args, **kwargs)
        self.cacheKey = cacheKey
        self.cacheCrawlerPath = cacheCrawlerPath
        print('aa', cacheCrawlerPath, 'bb', cacheAgentPath, cacheKey)
        self.cache = Cache(cacheCrawlerPath)
        self.cacheAgent = Cache(cacheAgentPath)
        self.oContentExtract = CContentExtract('boilerpipe')
        #        jsonStr = self.cache[int(cacheKey)]
        _, jsonStr = self.cache.pull()
        print('cc', jsonStr)
        if jsonStr is None:
            # No job was queued in the crawler cache: fall back to a single
            # default URL and empty preInfo metadata so parse() can still
            # read preInfoList / preInfoUrlDict safely.
            self.preInfoList = []
            self.preInfoUrlDict = None
            oUrlList = [
                'http://finance.jrj.com.cn/2020/04/24012529362098.shtml'
            ]
            self.start_urls = oUrlList
        else:
            oUrlList = json.loads(jsonStr)
            self.start_urls = oUrlList['urlList']
            self.logInfo = oUrlList['logInfo']
            self.preInfoList: list = oUrlList['preInfo']
            self.preInfoUrlDict = None
            if (len(self.preInfoList) == len(self.start_urls)):
                self.preInfoUrlDict = dict()
                for idx, url in enumerate(self.start_urls):
                    self.preInfoUrlDict[url] = self.preInfoList[idx]

            logInfo = {'type': 'logInfo', 'content': {'data': self.logInfo}}
            logInfoStr = json.dumps(logInfo)
            self.cacheAgent.push(logInfoStr)

    def parse(self, response):
        temp0 = './/div[@class="titmain"]//h1//text()'
        temp = './/div[@class="texttit_m1"]//p//text()'
        #        print(response)
        item = CrawlerItem()
        item['link'] = response.url
        html = response.text
        #        print(html)
        #        ans0 = response.xpath(temp0).getall()
        #        ans1 = response.xpath(temp).getall()
        ans0, ans1 = self.oContentExtract.boilerpipe(html)
        item['title'] = ans0
        item['content'] = ans1
        print(ans0, ans1)
        preInfo = None
        if self.preInfoUrlDict is not None:
            preInfo = self.preInfoUrlDict[item['link']]
        elif len(self.preInfoList) == 1:
            preInfo = self.preInfoList[0]
        ansFinal = {
            'type': 'crawlerResult',
            'content': {
                'data': {
                    'link': item['link'],
                    'title': ans0,
                    'content': ans1
                },
                'preInfo': preInfo
            }
        }
        ansJson = json.dumps(ansFinal)
        self.cacheAgent.push(ansJson)
        #        self.cache.close()
        #        self.cacheAgent.close()
        yield item
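GeneralSpider reads its job (URL list plus preInfo metadata) from the crawler cache and receives the cache paths through its constructor. A minimal launch sketch, assuming the Cache class is diskcache.Cache, that the spider's own dependencies (CrawlerItem, CContentExtract) are importable, and that the directory paths below are placeholders:

import json

from diskcache import Cache               # assumption: same Cache class the spider constructs
from scrapy.crawler import CrawlerProcess

crawlerDir = '/tmp/cacheCrawlerFolder'    # hypothetical paths; the real ones come from
agentDir = '/tmp/cacheAgentFolder'        # CDirectoryConfig in Example #3

# Queue one job in the shape GeneralSpider.__init__ expects.
job = {
    'urlList': ['http://finance.jrj.com.cn/2020/04/24012529362098.shtml'],
    'logInfo': {'jobId': 'demo'},
    'preInfo': [{'source': 'jrj'}],
}
Cache(crawlerDir).push(json.dumps(job))

# Keyword arguments passed to crawl() are forwarded to the spider's __init__.
process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
process.crawl(GeneralSpider,
              cacheCrawlerPath=crawlerDir,
              cacheAgentPath=agentDir,
              cacheKey='0')
process.start()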
Example #3
class CAgent:
    def __init__(self,
                 name,
                 oDir: CDirectoryConfig,
                 oConfigByYaml: CConfigByYaml,
                 connectKnowlegeServer=False):
        self.name = name
        self.crawlerManager: CCrawlerManager = None
        self.storageManager: CStorage = None
        self.knowledgeManagerClient: CKnowledgeClient = None
        self.oDir: CDirectoryConfig = oDir
        self.oConf = oConfigByYaml
        self.oLog = CLog(oDir['Log'], self.name + '_log')
        self.dbWeb = ''
        self.cacheAgent = Cache(oDir['cacheAgentFolder'])
        self.cacheCrawler = Cache(oDir['cacheCrawlerFolder'])
        self.flagConnectKnowlegeServer = connectKnowlegeServer
        fKeyboardInterruptRegistrar(self._callbackKeyboardInterrupt)
        self.flagUserClose = False
#        fKeyboardInterruptRegistrar._register['test'] = self._callbackKeyboardInterrupt

    def _configStorage(self, mode='mongoDB'):
        oSubConfig = self.oConf['Storage']
        self.dbWeb = oSubConfig['dbWeb']
        if oSubConfig.get('mode') is not None:
            mode = oSubConfig['mode']
        path = self.dbWeb
        if (mode == 'mongoDB'):
            self.storageManager = CStorageMongoDB(self.name, path)

    def _configCrawler(self):
        self.crawlerManager = CCrawlerManager(self.name,
                                              self.oDir['crawlerCWD'],
                                              self.oLog,
                                              self.oDir['cacheCrawlerFolder'],
                                              self.oDir['cacheAgentFolder'])

    def _configKnowledgeManager(self):
        oSubConfig = self.oConf['KnowledgeManager']
        addressTuple = (oSubConfig['address'], oSubConfig['port'])
        key = oSubConfig['password']
        key = bytes(key, 'utf-8')
        print(key)
        self.knowledgeManagerClient = CKnowledgeClient(addressTuple, key,
                                                       self.oLog)
        if self.flagConnectKnowlegeServer:
            err = self.knowledgeManagerClient.connect()
            if err is False:
                raise ValueError("KnowledgeManager connection failed")

    def configAll(self):
        self._configCrawler()
        self.oLog.safeRecordTime('CrawlerManager conf finished')
        self._configKnowledgeManager()
        self.oLog.safeRecordTime('KnowledgeManager conf finished')
        self._configStorage()
        self.oLog.safeRecordTime('StorageManager conf finished')

    def startCrawling(self, jobsList: list):
        return self.crawlerManager.engineStart(jobsList)

    def fetchResult(self,
                    handler,
                    subProcHandle,
                    timeWaitStep=1,
                    maxWaitTimes=5):
        # Total continuous wait time will be (timeWaitStep * maxWaitTimes).
        result = ''
        cnt = 0
        global WRITE_TO_STORAGE_FLAG
        WRITE_TO_STORAGE_FLAG = True
        while True:
            _, result = self.cacheAgent.pull()
            if result is not None:
                result = json.loads(result)
                ans = handler(result['type'], result['content'])
                #                print(ans)
                for temp in ans:
                    self.storageManager.storeData(temp[0], temp[1], temp[2])
                cnt = 0  # reset the continuous-wait counter
            elif timeWaitStep * maxWaitTimes > 0:
                if cnt >= maxWaitTimes:  # waited maxWaitTimes times in a row with no data
                    WRITE_TO_STORAGE_FLAG = False
                    return False
                elif subProcHandle.poll() is not None:  # the crawler subprocess has finished
                    WRITE_TO_STORAGE_FLAG = False
                    return subProcHandle.poll()
                else:
                    time.sleep(timeWaitStep)
                    cnt += 1  # increment the continuous-wait counter
            else:
                WRITE_TO_STORAGE_FLAG = False
                raise ValueError(
                    "timeWaitStep * maxWaitTimes should be greater than 0")

    def clearCache(self):
        self.cacheAgent.clear()
        self.cacheCrawler.clear()

    def closeCache(self):
        self.cacheAgent.close()
        self.cacheCrawler.close()
        self.crawlerManager.closeCache()

    def _callbackKeyboardInterrupt(self, *args, **kwargs):
        global WRITE_TO_STORAGE_FLAG
        self.flagUserClose = True
        if WRITE_TO_STORAGE_FLAG is True:
            numRemainedMsg = len(self.cacheAgent)
            MSG = ("Agent is still writing results to the Storage, "
                   "number of remaining items: " + str(numRemainedMsg) +
                   ", will close later.")
            return False, MSG
        else:
            return True, ''

    def test(self):
        # Code for testing the keyboard-interrupt handler.
        global WRITE_TO_STORAGE_FLAG
        WRITE_TO_STORAGE_FLAG = True
        for i in range(1000):
            time.sleep(0.01)
        WRITE_TO_STORAGE_FLAG = False
        # print('Press Ctrl+C')
        # for x in range(1, 100):
        #     time.sleep(0.2)
        #     print(x)

    def close(self):
        self.knowledgeManagerClient.close()
        self.closeCache()
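CAgent wires the crawler, the knowledge client, and the storage backend together: configAll() sets everything up, startCrawling() hands a job list to CCrawlerManager, and fetchResult() drains the agent cache and writes each decoded message to storage via a caller-supplied handler. A rough driver sketch, assuming oDir/oConfigByYaml are already-built CDirectoryConfig/CConfigByYaml instances, that engineStart() returns the subprocess handle polled by fetchResult(), and that storeData() takes (collection, key, document) triples as implied by the storeData(temp[0], temp[1], temp[2]) call above:

import json


def resultHandler(msgType, content):
    # Map one pulled message to zero or more (collection, key, document) triples,
    # following the 'crawlerResult' shape produced by GeneralSpider in Example #2.
    if msgType == 'crawlerResult':
        data = content['data']
        return [('articles', data['link'], data)]
    if msgType == 'logInfo':
        return [('logs', None, content['data'])]
    return []


# oDir and oConfigByYaml are hypothetical, pre-built configuration objects.
agent = CAgent('demoAgent', oDir, oConfigByYaml)
agent.configAll()
agent.clearCache()

# The job format expected by engineStart is not shown above; a JSON job like
# the one in Example #2 is assumed here.
jobStr = json.dumps({
    'urlList': ['http://finance.jrj.com.cn/2020/04/24012529362098.shtml'],
    'logInfo': {'jobId': 'demo'},
    'preInfo': [{'source': 'jrj'}],
})
proc = agent.startCrawling([jobStr])  # assumed to return the crawler subprocess handle
agent.fetchResult(resultHandler, proc, timeWaitStep=1, maxWaitTimes=10)
agent.close()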