def crawYiCaiStockDailyNews(link):
    """Crawl stock news entries (<dl> items) from the YiCai page at *link*.

    Returns rows of
    [uuid, linkUrl, imageUrl, title, pubDate, description, 'STOCK', 'YICAINET'].
    """
    currentArray = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        for context in browsor.find_elements_by_tag_name('dl'):
            try:
                titleValue = context.find_element_by_tag_name('h1')
                descriptContext = context.find_element_by_tag_name('p').text
                pubDate = CommonsInitValue.initNowTime()
                linkUrl = context.find_element_by_tag_name('a').get_attribute('href')
                try:
                    imageUrl = context.find_element_by_tag_name('img').get_attribute('src')
                except NoSuchElementException:
                    # item has no picture -- fall back to the placeholder image
                    imageUrl = CommonsInitValue.initTempImage()
            except NoSuchElementException:
                # a mandatory element is missing: skip this entry
                continue
            currentArray.append([str(uuid.uuid1()), linkUrl, imageUrl,
                                 titleValue.text, pubDate, descriptContext,
                                 'STOCK', 'YICAINET'])
        # FIX: result list was built but never returned
        return currentArray
    finally:
        # FIX: always shut down PhantomJS, otherwise each call leaks a process
        browsor.quit()
def crawYCFinanceHLDataSource(link):
    """Crawl finance headline entries (<dl> items) from the YC page at *link*.

    Returns rows of
    [uuid, linkUrl, imageUrl, title, pubDate, description, 'CHINA', 'YCNET'].
    """
    listArray = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        for currentDiv in browsor.find_elements_by_tag_name('dl'):
            try:
                titleObj = currentDiv.find_element_by_tag_name('h1')
                title = titleObj.text
                linkUrl = titleObj.find_element_by_tag_name('a').get_attribute('href')
                descriptContext = currentDiv.find_element_by_tag_name('p').text
                pubDate = CommonsInitValue.initNowTime()
                try:
                    imageUrl = currentDiv.find_element_by_tag_name('img').get_attribute('src')
                except NoSuchElementException:
                    # item has no picture -- fall back to the placeholder image
                    imageUrl = CommonsInitValue.initTempImage()
            except NoSuchElementException:
                # a mandatory element is missing: skip this entry
                continue
            listArray.append([str(uuid.uuid1()), linkUrl, imageUrl, title,
                              pubDate, descriptContext, 'CHINA', 'YCNET'])
        # FIX: result list was built but never returned
        return listArray
    finally:
        # FIX: always shut down PhantomJS, otherwise each call leaks a process
        browsor.quit()
def crawMorningOilDailyNews(linkUrl):
    """Crawl oil news list items from the XIDU page at *linkUrl*.

    Returns rows of
    [uuid, linkUrl, imageUrl, title, pubDate, description, 'OIL', 'XIDU'].
    """
    currentArray = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(linkUrl)
        items = browsor.find_element_by_class_name(
            'news_list_all').find_elements_by_tag_name('li')
        for context in items:
            imageUrl = CommonsInitValue.initoiltempimage()
            descriptContext = context.find_element_by_tag_name('p').text
            # look the anchor up once for both href and text
            anchor = context.find_element_by_tag_name('a')
            currentArray.append([str(uuid.uuid1()),
                                 anchor.get_attribute('href'),
                                 imageUrl, anchor.text,
                                 CommonsInitValue.initNowTime(),
                                 descriptContext, 'OIL', 'XIDU'])
        return currentArray
    finally:
        # FIX: always shut down PhantomJS, otherwise each call leaks a process
        browsor.quit()
def crawZBNewsNetDataSource(link):
    """Crawl news entries ('l_title' blocks) from the ZB page at *link*.

    Returns rows of
    [uuid, linkUrl, imageUrl, title, pubDate, description, 'CHINA', 'ZBNET'].
    """
    currentArray = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        for context in browsor.find_elements_by_class_name('l_title'):
            pubDate = CommonsInitValue.initNowTime()
            try:
                imageUrl = context.find_element_by_tag_name('img').get_attribute('src')
            except NoSuchElementException:
                # item has no picture -- fall back to the placeholder image
                imageUrl = CommonsInitValue.initTempImage()
            title = context.find_element_by_class_name('title').text
            descriptContext = context.find_element_by_class_name('text').text
            linkUrl = context.find_element_by_tag_name('a').get_attribute('href')
            currentArray.append([str(uuid.uuid1()), linkUrl, imageUrl, title,
                                 pubDate, descriptContext, 'CHINA', 'ZBNET'])
        # FIX: result list was built but never returned
        return currentArray
    finally:
        # FIX: always shut down PhantomJS, otherwise each call leaks a process
        browsor.quit()
def crawCNFinanceNetDailyNews(link):
    """Crawl finance news ('art-list' blocks) from the 21CN page at *link*.

    Returns rows of
    [uuid, linkUrl, imageUrl, title, pubDate, description, 'CHINA', '21CNNET'].
    """
    currentList = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        for context in browsor.find_elements_by_class_name('art-list'):
            # look the anchor up once for both href and text
            anchor = context.find_element_by_tag_name('a')
            descriptContext = context.find_element_by_class_name('pic-details').text
            timeText = context.find_element_by_class_name('time').text
            # pubDate = "<date part> <time part>" parsed from the raw time text
            # (also: no longer shadows the `datetime` module name)
            pubDate = (CommonsInitValue.returnCreateDate(timeText) + ' ' +
                       CommonsInitValue.splitCreateDate(timeText, ' ', 1))
            currentList.append([str(uuid.uuid1()),
                                anchor.get_attribute('href'),
                                CommonsInitValue.initTempImage(),
                                anchor.text, pubDate, descriptContext,
                                'CHINA', '21CNNET'])
        return currentList
    finally:
        # FIX: always shut down PhantomJS, otherwise each call leaks a process
        browsor.quit()
def crawMorningOilDailyNews(linkUrl):
    """Crawl oil news list items from the XIDU page at *linkUrl*.

    Returns rows of
    [uuid, linkUrl, imageUrl, title, pubDate, description, 'OIL', 'XIDU'].
    """
    currentArray = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(linkUrl)
        items = browsor.find_element_by_class_name(
            'news_list_all').find_elements_by_tag_name('li')
        for context in items:
            imageUrl = CommonsInitValue.initoiltempimage()
            descriptContext = context.find_element_by_tag_name('p').text
            # look the anchor up once for both href and text
            anchor = context.find_element_by_tag_name('a')
            currentArray.append([str(uuid.uuid1()),
                                 anchor.get_attribute('href'),
                                 imageUrl, anchor.text,
                                 CommonsInitValue.initNowTime(),
                                 descriptContext, 'OIL', 'XIDU'])
        return currentArray
    finally:
        # FIX: always shut down PhantomJS, otherwise each call leaks a process
        browsor.quit()
def crawZBNewsNetDataSource(link):
    """Crawl news entries ('l_title' blocks) from the ZB page at *link*.

    Returns rows of
    [uuid, linkUrl, imageUrl, title, pubDate, description, 'CHINA', 'ZBNET'].
    """
    currentArray = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        for context in browsor.find_elements_by_class_name('l_title'):
            pubDate = CommonsInitValue.initNowTime()
            try:
                imageUrl = context.find_element_by_tag_name('img').get_attribute('src')
            except NoSuchElementException:
                # item has no picture -- fall back to the placeholder image
                imageUrl = CommonsInitValue.initTempImage()
            title = context.find_element_by_class_name('title').text
            descriptContext = context.find_element_by_class_name('text').text
            linkUrl = context.find_element_by_tag_name('a').get_attribute('href')
            currentArray.append([str(uuid.uuid1()), linkUrl, imageUrl, title,
                                 pubDate, descriptContext, 'CHINA', 'ZBNET'])
        # FIX: result list was built but never returned
        return currentArray
    finally:
        # FIX: always shut down PhantomJS, otherwise each call leaks a process
        browsor.quit()
def crawDailyOilNews():
    """Run the XDNet oil-news spider.

    Returns a list of [time, uuid, jobName, exception] rows, one per failed job
    (empty when everything succeeded).
    """
    currentList = []
    currentTime = CommonsInitValue.initNowTime()
    print('----START CRAW XDNETNEWS OIL NEWS----')
    try:
        XDNetOilNewsSpider.writeMorningOilDailyNews()
    except Exception as e:
        # record the failure instead of aborting the whole job
        currentList.append([currentTime, str(uuid.uuid1()),
                            'XDNetOilNewsSpider.writeMorningOilDailyNews', e])
    # FIX: the error log was collected but never returned
    return currentList
def crawDailyStockComments(link):
    """Crawl stock comment list items from the HG page at *link*.

    Returns rows of
    [uuid, linkUrl, title, pubDate, "[...]", "STOCK", "HGNET"].
    """
    currentList = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        mainlist = browsor.find_element_by_class_name(
            "ul-news-list").find_elements_by_tag_name("li")
        for context in mainlist:
            linkUrl = context.find_element_by_tag_name("a").get_attribute("href")
            currentList.append([str(uuid.uuid1()), linkUrl, context.text,
                                CommonsInitValue.initNowTime(), "[...]",
                                "STOCK", "HGNET"])
        return currentList
    finally:
        # FIX: always shut down PhantomJS, otherwise each call leaks a process
        browsor.quit()
def crawFinanceHLDataSource(link):
    """Crawl the single headline picture item from the TAKCHINA page at *link*.

    Returns one row of
    [uuid, linkUrl, imageUrl, title, pubDate, '[...]', 'MACRO', 'TAKCHINA'].
    """
    currentList = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        maincontext = browsor.find_element_by_id(
            'news_pic').find_element_by_class_name('changeDiv')
        linkUrl = maincontext.find_element_by_tag_name('a').get_attribute('href')
        pubDate = CommonsInitValue.initNowTime()
        # FIX: look the <img> up once for both src and alt (was two DOM queries)
        imageObj = maincontext.find_element_by_tag_name('img')
        currentList.append([str(uuid.uuid1()), linkUrl,
                            imageObj.get_attribute('src'),
                            imageObj.get_attribute('alt'),
                            pubDate, '[...]', 'MACRO', 'TAKCHINA'])
        return currentList
    finally:
        # FIX: always shut down PhantomJS, otherwise each call leaks a process
        browsor.quit()
def crawCNStockNetDailyNews(link):
    """Crawl stock news ('art-list' blocks) from the 21CN page at *link*.

    Returns rows of
    [uuid, linkUrl, imageUrl, title, pubDate, description, 'STOCK', '21CNNET'].
    """
    currentList = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        for context in browsor.find_elements_by_class_name('art-list'):
            # look the anchor up once for both href and text
            anchor = context.find_element_by_tag_name('a')
            descriptContext = context.find_element_by_class_name('pic-details').text
            timeText = context.find_element_by_class_name('time').text
            # pubDate = "<date part> <time part>" parsed from the raw time text
            # (also: no longer shadows the `datetime` module name)
            pubDate = (CommonsInitValue.returnCreateDate(timeText) + ' ' +
                       CommonsInitValue.splitCreateDate(timeText, ' ', 1))
            currentList.append([str(uuid.uuid1()),
                                anchor.get_attribute('href'),
                                CommonsInitValue.initTempImage(),
                                anchor.text, pubDate, descriptContext,
                                'STOCK', '21CNNET'])
        return currentList
    finally:
        # FIX: always shut down PhantomJS, otherwise each call leaks a process
        browsor.quit()
def crawThemeNews():
    """Run the important-news spider.

    Returns a list of [time, uuid, jobName, exception] rows, one per failed job
    (empty when everything succeeded).
    """
    currentList = []
    currentTime = CommonsInitValue.initNowTime()
    # CRAW THE IMPORT NEWS
    print('----START CRAW THE IMPORT NEWS----')
    try:
        ImportantNewsSpider.writeCompanyNews()
    except Exception as e:
        # record the failure instead of aborting the whole job
        currentList.append([currentTime, str(uuid.uuid1()),
                            'ImportantNewsSpider.writeCompanyNews', e])
    # FIX: the error log was collected but never returned
    return currentList
def crawDataCenter():
    """Run the forex/gold data-source spider.

    Returns a list of [time, uuid, jobName, exception] rows, one per failed job
    (empty when everything succeeded).
    """
    currentList = []
    currentTime = CommonsInitValue.initNowTime()
    # CRAW FOREXGOLD DATA SIPDER
    print('----START CRAW FOREXGOLD DATA----')
    try:
        ForexGoldDataNetSpider.writeForexGoldDataSource()
    except Exception as e:
        # record the failure instead of aborting the whole job
        currentList.append([currentTime, str(uuid.uuid1()),
                            'ForexGoldDataNetSpider.writeForexGoldDataSource()', e])
    # FIX: the error log was collected but never returned
    return currentList
def crawYiCaiStockDailyNews(link):
    """Crawl stock news entries (<dl> items) from the YiCai page at *link*.

    Returns rows of
    [uuid, linkUrl, imageUrl, title, pubDate, description, 'STOCK', 'YICAINET'].
    """
    currentArray = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        for context in browsor.find_elements_by_tag_name('dl'):
            try:
                titleValue = context.find_element_by_tag_name('h1')
                descriptContext = context.find_element_by_tag_name('p').text
                pubDate = CommonsInitValue.initNowTime()
                linkUrl = context.find_element_by_tag_name('a').get_attribute('href')
                try:
                    imageUrl = context.find_element_by_tag_name('img').get_attribute('src')
                except NoSuchElementException:
                    # item has no picture -- fall back to the placeholder image
                    imageUrl = CommonsInitValue.initTempImage()
            except NoSuchElementException:
                # a mandatory element is missing: skip this entry
                continue
            currentArray.append([str(uuid.uuid1()), linkUrl, imageUrl,
                                 titleValue.text, pubDate, descriptContext,
                                 'STOCK', 'YICAINET'])
        # FIX: result list was built but never returned
        return currentArray
    finally:
        # FIX: always shut down PhantomJS, otherwise each call leaks a process
        browsor.quit()
def crawYCFinanceHLDataSource(link):
    """Crawl finance headline entries (<dl> items) from the YC page at *link*.

    Returns rows of
    [uuid, linkUrl, imageUrl, title, pubDate, description, 'CHINA', 'YCNET'].
    """
    listArray = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        for currentDiv in browsor.find_elements_by_tag_name('dl'):
            try:
                titleObj = currentDiv.find_element_by_tag_name('h1')
                title = titleObj.text
                linkUrl = titleObj.find_element_by_tag_name('a').get_attribute('href')
                descriptContext = currentDiv.find_element_by_tag_name('p').text
                pubDate = CommonsInitValue.initNowTime()
                try:
                    imageUrl = currentDiv.find_element_by_tag_name('img').get_attribute('src')
                except NoSuchElementException:
                    # item has no picture -- fall back to the placeholder image
                    imageUrl = CommonsInitValue.initTempImage()
            except NoSuchElementException:
                # a mandatory element is missing: skip this entry
                continue
            listArray.append([str(uuid.uuid1()), linkUrl, imageUrl, title,
                              pubDate, descriptContext, 'CHINA', 'YCNET'])
        # FIX: result list was built but never returned
        return listArray
    finally:
        # FIX: always shut down PhantomJS, otherwise each call leaks a process
        browsor.quit()
def crawMorningOilDailyNews(linkUrl):
    """Print title/href of every 'evenrow' table row on the page at *linkUrl*.

    NOTE(review): currentArray is never populated -- the function currently
    only prints; presumably rows should also be appended like the sibling
    crawlers do.  Returning the (empty) list keeps the caller contract
    consistent; confirm intended behavior.
    """
    currentArray = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(linkUrl)
        rows = browsor.find_element_by_id(
            'table').find_elements_by_class_name('evenrow')
        for context in rows:
            # look the anchor up once for both href and text
            anchor = context.find_element_by_tag_name('a')
            linkUrl = anchor.get_attribute('href')
            title = anchor.text
            if title == '':
                # skip rows with no visible title
                continue
            print(title + ":" + linkUrl)
        return currentArray
    finally:
        # FIX: always shut down PhantomJS, otherwise each call leaks a process
        browsor.quit()
def crawFinanceHLDataSource(link):
    """Crawl the single headline item from the FT-China page at *link*.

    Returns one row of
    [uuid, linkUrl, imageUrl, title, pubDate, description, 'MACRO', 'FTCHINA'].
    """
    currentList = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        mainContext = browsor.find_element_by_class_name('column11')
        # look the anchor up once for both text and href
        anchor = mainContext.find_element_by_tag_name('a')
        currentList.append([str(uuid.uuid1()),
                            anchor.get_attribute('href'),
                            mainContext.find_element_by_tag_name('img').get_attribute('src'),
                            anchor.text,
                            CommonsInitValue.initNowTime(),
                            mainContext.find_element_by_class_name('lead').text,
                            'MACRO', 'FTCHINA'])
        return currentList
    finally:
        # FIX: always shut down PhantomJS, otherwise each call leaks a process
        browsor.quit()
def crawDailyComments(link):
    """Crawl forex comment entries ('news-item' blocks) from the page at *link*.

    Returns rows of
    [uuid, linkUrl, title, pubDate, description, 'FOREX', 'ADSNET'].
    """
    currentList = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        for mainContext in browsor.find_elements_by_class_name('news-item'):
            pubDate = CommonsInitValue.initNowTime()
            # look the anchor up once for both text and href
            anchor = mainContext.find_element_by_tag_name('a')
            currentList.append([str(uuid.uuid1()),
                                anchor.get_attribute('href'),
                                anchor.text, pubDate,
                                mainContext.find_element_by_class_name('desc').text,
                                'FOREX', 'ADSNET'])
        return currentList
    finally:
        # FIX: always shut down PhantomJS, otherwise each call leaks a process
        browsor.quit()
def crawDailyOilNews():
    """Run the XDNet oil-news spider.

    Returns a list of [time, uuid, jobName, exception] rows, one per failed job
    (empty when everything succeeded).
    """
    currentList = []
    currentTime = CommonsInitValue.initNowTime()
    print('----START CRAW XDNETNEWS OIL NEWS----')
    try:
        XDNetOilNewsSpider.writeMorningOilDailyNews()
    except Exception as e:
        # record the failure instead of aborting the whole job
        currentList.append([currentTime, str(uuid.uuid1()),
                            'XDNetOilNewsSpider.writeMorningOilDailyNews', e])
    # FIX: the error log was collected but never returned
    return currentList
def crawMorningDailyNews(linkUrl):
    """Crawl morning news entries ('mt24' blocks) from the NBD page at *linkUrl*.

    Returns rows of
    [uuid, linkUrl, imageUrl, title, pubDate, description, 'CHINA', 'NBDNET'].
    """
    currentList = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(linkUrl)
        for div in browsor.find_elements_by_class_name('mt24'):
            imageUrl = div.find_element_by_tag_name('img').get_attribute('src')
            # look the anchor up once for both href and text
            anchor = div.find_element_by_tag_name('a')
            currentList.append([str(uuid.uuid1()),
                                anchor.get_attribute('href'),
                                imageUrl, anchor.text,
                                CommonsInitValue.initNowTime(),
                                div.find_element_by_class_name('news-p').text,
                                'CHINA', 'NBDNET'])
        return currentList
    finally:
        # FIX: always shut down PhantomJS, otherwise each call leaks a process
        browsor.quit()
def crawMorningFinanceDailyNews(linkUrl):
    """Crawl finance news list items from the iFeng page at *linkUrl*.

    Returns rows of
    [uuid, linkUrl, imageUrl, title, pubDate, description, 'CHINA', 'IFengNET'].
    """
    currentArray = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(linkUrl)
        mainList = browsor.find_element_by_id(
            'list01').find_elements_by_tag_name('li')
        for context in mainList:
            # look the anchor up once for both href and text
            anchor = context.find_element_by_tag_name('a')
            currentArray.append([str(uuid.uuid1()),
                                 anchor.get_attribute('href'),
                                 CommonsInitValue.initTempImage(),
                                 anchor.text,
                                 context.find_element_by_class_name('date').text,
                                 context.find_element_by_tag_name('p').text,
                                 'CHINA', 'IFengNET'])
        return currentArray
    finally:
        # FIX: always shut down PhantomJS, otherwise each call leaks a process
        browsor.quit()
def crawFinanceHLDataSource(link):
    """Crawl stock headline entries ('list' blocks) from the TAKCHINA page at *link*.

    Returns rows of
    [uuid, linkUrl, imageUrl, title, pubDate, description, 'STOCK', 'TAKCHINA'].
    """
    currentArray = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        mainlist = browsor.find_element_by_class_name(
            'news_list').find_elements_by_class_name('list')
        for context in mainlist:
            imageUrl = context.find_element_by_tag_name('img').get_attribute('src')
            # FIX: look the title node up once for both text and link
            titleBox = context.find_element_by_class_name('title')
            currentArray.append([str(uuid.uuid1()),
                                 titleBox.find_element_by_tag_name('a').get_attribute('href'),
                                 imageUrl, titleBox.text,
                                 CommonsInitValue.initNowTime(),
                                 context.find_element_by_tag_name('p').text,
                                 'STOCK', 'TAKCHINA'])
        return currentArray
    finally:
        # FIX: always shut down PhantomJS, otherwise each call leaks a process
        browsor.quit()
def crawMorningFinanceDailyNews(linkUrl):
    """Crawl finance news ('list_item' blocks) from the XXCB page at *linkUrl*.

    Returns rows of
    [uuid, linkUrl, imageUrl, title, pubDate, description, 'CHINA', 'XXCB'].
    """
    currentArray = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(linkUrl)
        maincontext = (browsor.find_element_by_class_name('area_left')
                       .find_elements_by_class_name('list_item'))
        for context in maincontext:
            currentArray.append([str(uuid.uuid1()),
                                 context.find_element_by_tag_name('a').get_attribute('href'),
                                 context.find_element_by_tag_name('img').get_attribute('src'),
                                 context.find_element_by_tag_name('h2').text,
                                 CommonsInitValue.initNowTime(),
                                 context.find_element_by_tag_name('p').text,
                                 'CHINA', 'XXCB'])
        return currentArray
    finally:
        # FIX: always shut down PhantomJS, otherwise each call leaks a process
        browsor.quit()
def crawDailyStockComments(link):
    """Crawl stock comment list items from the JF page at *link*.

    Returns rows of
    [uuid, linkUrl, title, pubDate, description, 'STOCK', 'JFNET'].
    """
    currentArray = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        mainlist = (browsor.find_element_by_class_name('w_660')
                    .find_element_by_tag_name('ul')
                    .find_elements_by_tag_name('li'))
        for context in mainlist:
            # look the anchor up once for both text and href
            anchor = context.find_element_by_tag_name('a')
            currentArray.append([str(uuid.uuid1()),
                                 anchor.get_attribute('href'),
                                 anchor.text,
                                 CommonsInitValue.initNowTime(),
                                 context.find_element_by_tag_name('p').text,
                                 'STOCK', 'JFNET'])
        return currentArray
    finally:
        # FIX: always shut down PhantomJS, otherwise each call leaks a process
        browsor.quit()
def updateDailyForexPic():
    """Refresh the forex picture sources (CnForex + HeXun spiders).

    Returns a list of [time, uuid, jobName, exception] rows, one per failed job
    (empty when everything succeeded).
    """
    currentList = []
    currentTime = CommonsInitValue.initNowTime()
    log.info('The system crawling the resource of forex picture ')
    print('----START CRAW THE FOREX PICTURE----')
    try:
        # FIX: guard this spider too, so one failure cannot kill the whole job
        # (the HeXun spider below was already guarded; this one was not)
        CnForexImageSpider.writeForexImages()
    except Exception as e:
        currentList.append([currentTime, str(uuid.uuid1()),
                            'CnForexImageSpider.writeForexImages', e])
    print('----START CRAW THE XEHUN PICTURE----')
    try:
        HeXunForexImageSpider.writeHeXunForexImage()
    except Exception as e:
        currentList.append([currentTime, str(uuid.uuid1()),
                            'HeXunForexImageSpider.writeHeXunForexImage', e])
    # FIX: the error log was collected but never returned
    return currentList
def crawDailyMetalComments(link):
    """Crawl metal comment list items from the GX page at *link*.

    Returns rows of
    [uuid, linkUrl, title, pubDate, description, 'METAL', 'GXNET'].
    """
    currentArray = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        contextList = (browsor.find_element_by_class_name('right_box796')
                       .find_element_by_tag_name('ul')
                       .find_elements_by_tag_name('li'))
        for context in contextList:
            pubDate = context.find_element_by_class_name('time').text
            # look the anchor up once for both text and href
            anchor = context.find_element_by_tag_name('a')
            descriptContext = CommonsInitValue.removeSpecialCharacter(context.text)
            currentArray.append([str(uuid.uuid1()),
                                 anchor.get_attribute('href'),
                                 anchor.text, pubDate, descriptContext,
                                 'METAL', 'GXNET'])
        return currentArray
    finally:
        # FIX: always shut down PhantomJS, otherwise each call leaks a process
        browsor.quit()
def crawMorningMetalDailyNews(link):
    """Crawl metal news ('articleItem' blocks) from the Investing page at *link*.

    Returns rows of
    [uuid, linkUrl, imageUrl, title, pubDate, description, 'METAL', 'INVESTINGNET'].
    """
    currentList = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        for context in browsor.find_elements_by_class_name('articleItem'):
            try:
                # the 'img' box carries the article link; its <img> carries the picture
                imgBox = context.find_element_by_class_name('img')
                linkUrl = imgBox.get_attribute('href')
                imageUrl = imgBox.find_element_by_tag_name('img').get_attribute('src')
                title = context.find_element_by_class_name('title').text
                pubDate = CommonsInitValue.initNowTime()
                descriptContext = context.find_element_by_tag_name('p').text
            except NoSuchElementException:
                # a mandatory element is missing: skip this entry
                continue
            currentList.append([str(uuid.uuid1()), linkUrl, imageUrl, title,
                                pubDate, descriptContext, 'METAL', 'INVESTINGNET'])
        # FIX: result list was built but never returned
        return currentList
    finally:
        # FIX: always shut down PhantomJS, otherwise each call leaks a process
        browsor.quit()
def crawFinanceHLDataSource(link):
    """Crawl the single headline item from the FT-China page at *link*.

    Returns one row of
    [uuid, linkUrl, imageUrl, title, pubDate, description, 'MACRO', 'FTCHINA'].
    """
    currentList = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        mainContext = browsor.find_element_by_class_name('column11')
        # look the anchor up once for both text and href
        anchor = mainContext.find_element_by_tag_name('a')
        currentList.append([str(uuid.uuid1()),
                            anchor.get_attribute('href'),
                            mainContext.find_element_by_tag_name('img').get_attribute('src'),
                            anchor.text,
                            CommonsInitValue.initNowTime(),
                            mainContext.find_element_by_class_name('lead').text,
                            'MACRO', 'FTCHINA'])
        return currentList
    finally:
        # FIX: always shut down PhantomJS, otherwise each call leaks a process
        browsor.quit()
def crawDailyNews():
    """Run the QQ morning-news spider.

    Returns a list of [time, uuid, jobName, exception] rows, one per failed job
    (empty when everything succeeded).
    """
    currentList = []
    currentTime = CommonsInitValue.initNowTime()
    # CRAW HEJNEWS COMMENTS NEWS SIPDER
    #print '----START CRAW HEJNEWS NEWS----'
    #HEJNewsNetSpider.writeMorningDailyNews()
    # CRAW QQNEWS COMMENTS NEWS SIPDER
    print('----START CRAW QQNEWS NEWS----')
    try:
        QQNewsNetSpider.writeMorningQQDailyNews()
    except Exception as e:
        # FIX: record the failure like sibling craw* jobs do, instead of only printing
        print(e)
        currentList.append([currentTime, str(uuid.uuid1()),
                            'QQNewsNetSpider.writeMorningQQDailyNews', e])
    # FIX: the error log was collected but never returned
    return currentList
def crawDailyNews():
    """Run the QQ morning-news spider.

    Returns a list of [time, uuid, jobName, exception] rows, one per failed job
    (empty when everything succeeded).
    """
    currentList = []
    currentTime = CommonsInitValue.initNowTime()
    # CRAW HEJNEWS COMMENTS NEWS SIPDER
    #print '----START CRAW HEJNEWS NEWS----'
    #HEJNewsNetSpider.writeMorningDailyNews()
    # CRAW QQNEWS COMMENTS NEWS SIPDER
    print('----START CRAW QQNEWS NEWS----')
    try:
        QQNewsNetSpider.writeMorningQQDailyNews()
    except Exception as e:
        # FIX: record the failure like sibling craw* jobs do, instead of only printing
        print(e)
        currentList.append([currentTime, str(uuid.uuid1()),
                            'QQNewsNetSpider.writeMorningQQDailyNews', e])
    # FIX: the error log was collected but never returned
    return currentList
def crawDataCenter():
    """Run the forex/gold data-source spider.

    Returns a list of [time, uuid, jobName, exception] rows, one per failed job
    (empty when everything succeeded).
    """
    currentList = []
    currentTime = CommonsInitValue.initNowTime()
    # CRAW FOREXGOLD DATA SIPDER
    print('----START CRAW FOREXGOLD DATA----')
    try:
        ForexGoldDataNetSpider.writeForexGoldDataSource()
    except Exception as e:
        # record the failure instead of aborting the whole job
        currentList.append([currentTime, str(uuid.uuid1()),
                            'ForexGoldDataNetSpider.writeForexGoldDataSource()', e])
    # FIX: the error log was collected but never returned
    return currentList
def crawMorningForexDailyNews(link):
    """Crawl forex news ('articleItem' blocks) from the Investing page at *link*.

    Returns rows of
    [uuid, linkUrl, imageUrl, title, pubDate, description, 'FOREX', 'INVESTINGNET'].
    """
    currentList = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        for context in browsor.find_elements_by_class_name('articleItem'):
            try:
                # the 'img' box carries the article link; its <img> carries the picture
                imgBox = context.find_element_by_class_name('img')
                linkUrl = imgBox.get_attribute('href')
                imageUrl = imgBox.find_element_by_tag_name('img').get_attribute('src')
                title = context.find_element_by_class_name('title').text
                pubDate = CommonsInitValue.initNowTime()
                descriptContext = context.find_element_by_tag_name('p').text
            except NoSuchElementException:
                # a mandatory element is missing: skip this entry
                continue
            currentList.append([str(uuid.uuid1()), linkUrl, imageUrl, title,
                                pubDate, descriptContext, 'FOREX', 'INVESTINGNET'])
        # FIX: result list was built but never returned
        return currentList
    finally:
        # FIX: always shut down PhantomJS, otherwise each call leaks a process
        browsor.quit()
def crawCnForexImages(link, keyList):
    """Crawl forex chart images from *link*, skipping any whose src is in *keyList*.

    Side effect: writes a [mainId, linkUrl] detail row per new image via
    CnForexImageDetailSpider.  Returns rows of
    [id, imageUrl, linkUrl, pubDate, 'CNFOREXNET'].
    """
    currentArray = []
    detaiArray = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        for model in browsor.find_elements_by_class_name('imgModel'):
            linkUrl = model.find_element_by_tag_name('a').get_attribute('href')
            imageUrl = model.find_element_by_tag_name('img').get_attribute('src')
            pubDate = CommonsInitValue.returnCreateDate(
                model.find_element_by_tag_name('p').text)
            # only keep images not already known to the caller
            if imageUrl not in keyList:
                mianId = str(uuid.uuid1())
                currentArray.append([mianId, imageUrl, linkUrl, pubDate, 'CNFOREXNET'])
                detaiArray.append([mianId, linkUrl])
        CnForexImageDetailSpider.writeCnForexImageDetail(detaiArray)
        return currentArray
    finally:
        # FIX: always shut down PhantomJS, otherwise each call leaks a process
        browsor.quit()
def crawMorningFinanceDailyNews(linkUrl):
    """Crawl finance news list items from the iFeng page at *linkUrl*.

    Returns rows of
    [uuid, linkUrl, imageUrl, title, pubDate, description, 'CHINA', 'IFengNET'].
    """
    currentArray = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(linkUrl)
        mainList = browsor.find_element_by_id(
            'list01').find_elements_by_tag_name('li')
        for context in mainList:
            # look the anchor up once for both href and text
            anchor = context.find_element_by_tag_name('a')
            currentArray.append([str(uuid.uuid1()),
                                 anchor.get_attribute('href'),
                                 CommonsInitValue.initTempImage(),
                                 anchor.text,
                                 context.find_element_by_class_name('date').text,
                                 context.find_element_by_tag_name('p').text,
                                 'CHINA', 'IFengNET'])
        return currentArray
    finally:
        # FIX: always shut down PhantomJS, otherwise each call leaks a process
        browsor.quit()
def crawMorningForexDailyNews(link):
    """Crawl forex news ('articleItem' blocks) from the Investing page at *link*.

    Returns rows of
    [uuid, linkUrl, imageUrl, title, pubDate, description, "FOREX", "INVESTINGNET"].
    """
    currentList = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        for context in browsor.find_elements_by_class_name("articleItem"):
            try:
                # the "img" box carries the article link; its <img> carries the picture
                imgBox = context.find_element_by_class_name("img")
                linkUrl = imgBox.get_attribute("href")
                imageUrl = imgBox.find_element_by_tag_name("img").get_attribute("src")
                title = context.find_element_by_class_name("title").text
                pubDate = CommonsInitValue.initNowTime()
                descriptContext = context.find_element_by_tag_name("p").text
            except NoSuchElementException:
                # a mandatory element is missing: skip this entry
                continue
            currentList.append([str(uuid.uuid1()), linkUrl, imageUrl, title,
                                pubDate, descriptContext, "FOREX", "INVESTINGNET"])
        # FIX: result list was built but never returned
        return currentList
    finally:
        # FIX: always shut down PhantomJS, otherwise each call leaks a process
        browsor.quit()
def crawDailyMetalComments(link):
    """Crawl metal comment list items from the GX page at *link*.

    Returns rows of
    [uuid, linkUrl, title, pubDate, description, 'METAL', 'GXNET'].
    """
    currentArray = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        contextList = (browsor.find_element_by_class_name('right_box796')
                       .find_element_by_tag_name('ul')
                       .find_elements_by_tag_name('li'))
        for context in contextList:
            pubDate = context.find_element_by_class_name('time').text
            # look the anchor up once for both text and href
            anchor = context.find_element_by_tag_name('a')
            descriptContext = CommonsInitValue.removeSpecialCharacter(context.text)
            currentArray.append([str(uuid.uuid1()),
                                 anchor.get_attribute('href'),
                                 anchor.text, pubDate, descriptContext,
                                 'METAL', 'GXNET'])
        return currentArray
    finally:
        # FIX: always shut down PhantomJS, otherwise each call leaks a process
        browsor.quit()
def crawDailyComments(link):
    """Crawl forex comment entries ('news-item' blocks) from the page at *link*.

    Returns rows of
    [uuid, linkUrl, title, pubDate, description, 'FOREX', 'ADSNET'].
    """
    currentList = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        for mainContext in browsor.find_elements_by_class_name('news-item'):
            pubDate = CommonsInitValue.initNowTime()
            # look the anchor up once for both text and href
            anchor = mainContext.find_element_by_tag_name('a')
            currentList.append([str(uuid.uuid1()),
                                anchor.get_attribute('href'),
                                anchor.text, pubDate,
                                mainContext.find_element_by_class_name('desc').text,
                                'FOREX', 'ADSNET'])
        return currentList
    finally:
        # FIX: always shut down PhantomJS, otherwise each call leaks a process
        browsor.quit()
def crawMorningFinanceDailyNews(linkUrl):
    """Crawl finance news ('list_item' blocks) from the XXCB page at *linkUrl*.

    Returns rows of
    [uuid, linkUrl, imageUrl, title, pubDate, description, 'CHINA', 'XXCB'].
    """
    currentArray = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(linkUrl)
        maincontext = (browsor.find_element_by_class_name('area_left')
                       .find_elements_by_class_name('list_item'))
        for context in maincontext:
            currentArray.append([str(uuid.uuid1()),
                                 context.find_element_by_tag_name('a').get_attribute('href'),
                                 context.find_element_by_tag_name('img').get_attribute('src'),
                                 context.find_element_by_tag_name('h2').text,
                                 CommonsInitValue.initNowTime(),
                                 context.find_element_by_tag_name('p').text,
                                 'CHINA', 'XXCB'])
        return currentArray
    finally:
        # FIX: always shut down PhantomJS, otherwise each call leaks a process
        browsor.quit()
def crawDailyStockComments(link):
    """Crawl stock comment list items from the JF page at *link*.

    Returns rows of
    [uuid, linkUrl, title, pubDate, description, 'STOCK', 'JFNET'].
    """
    currentArray = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        mainlist = (browsor.find_element_by_class_name('w_660')
                    .find_element_by_tag_name('ul')
                    .find_elements_by_tag_name('li'))
        for context in mainlist:
            # look the anchor up once for both text and href
            anchor = context.find_element_by_tag_name('a')
            currentArray.append([str(uuid.uuid1()),
                                 anchor.get_attribute('href'),
                                 anchor.text,
                                 CommonsInitValue.initNowTime(),
                                 context.find_element_by_tag_name('p').text,
                                 'STOCK', 'JFNET'])
        return currentArray
    finally:
        # FIX: always shut down PhantomJS, otherwise each call leaks a process
        browsor.quit()
def crawFinanceHLDataSource(link):
    """Crawl stock headline entries ('list' blocks) from the TAKCHINA page at *link*.

    Returns rows of
    [uuid, linkUrl, imageUrl, title, pubDate, description, 'STOCK', 'TAKCHINA'].
    """
    currentArray = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        mainlist = browsor.find_element_by_class_name(
            'news_list').find_elements_by_class_name('list')
        for context in mainlist:
            imageUrl = context.find_element_by_tag_name('img').get_attribute('src')
            # FIX: look the title node up once for both text and link
            titleBox = context.find_element_by_class_name('title')
            currentArray.append([str(uuid.uuid1()),
                                 titleBox.find_element_by_tag_name('a').get_attribute('href'),
                                 imageUrl, titleBox.text,
                                 CommonsInitValue.initNowTime(),
                                 context.find_element_by_tag_name('p').text,
                                 'STOCK', 'TAKCHINA'])
        return currentArray
    finally:
        # FIX: always shut down PhantomJS, otherwise each call leaks a process
        browsor.quit()
def crawCnForexImages(link, keyList):
    """Crawl forex chart images from *link*, skipping any whose src is in *keyList*.

    Side effect: writes a [mainId, linkUrl] detail row per new image via
    CnForexImageDetailSpider.  Returns rows of
    [id, imageUrl, linkUrl, pubDate, 'CNFOREXNET'].
    """
    currentArray = []
    detaiArray = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        for model in browsor.find_elements_by_class_name('imgModel'):
            linkUrl = model.find_element_by_tag_name('a').get_attribute('href')
            imageUrl = model.find_element_by_tag_name('img').get_attribute('src')
            pubDate = CommonsInitValue.returnCreateDate(
                model.find_element_by_tag_name('p').text)
            # only keep images not already known to the caller
            if imageUrl not in keyList:
                mianId = str(uuid.uuid1())
                currentArray.append([mianId, imageUrl, linkUrl, pubDate, 'CNFOREXNET'])
                detaiArray.append([mianId, linkUrl])
        CnForexImageDetailSpider.writeCnForexImageDetail(detaiArray)
        return currentArray
    finally:
        # FIX: always shut down PhantomJS, otherwise each call leaks a process
        browsor.quit()
def crawMorningDailyNews(link):
    """Crawl stock news ('boxa' blocks) from the CX page at *link*.

    Returns rows of
    [uuid, linkUrl, imageUrl, title, pubDate, description, 'STOCK', 'CXNET'].
    """
    listArray = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        mainContext = browsor.find_element_by_id('listArticle')
        # placeholder used when an entry has no picture
        initImage = CommonsInitValue.initTempImage()
        for context in mainContext.find_elements_by_class_name('boxa'):
            try:
                imageUrl = (context.find_element_by_class_name('pic')
                            .find_element_by_tag_name('img').get_attribute('src'))
            except NoSuchElementException:
                imageUrl = initImage
            # look the heading up once for both title text and link
            heading = context.find_element_by_tag_name('h4')
            title = heading.text
            linkUrl = heading.find_element_by_tag_name('a').get_attribute('href')
            descriptContext = context.find_element_by_tag_name('p').text
            pubDate = time.strftime("%Y-%m-%d %X", time.localtime())
            listArray.append([str(uuid.uuid1()), linkUrl, imageUrl, title,
                              pubDate, descriptContext, 'STOCK', 'CXNET'])
        # FIX: result list was built but never returned
        return listArray
    finally:
        # FIX: always shut down PhantomJS, otherwise each call leaks a process
        browsor.quit()
def crawHeXunForexImage(link, keyList):
    """Crawl HeXun forex images from *link*, skipping any whose src is in *keyList*.

    Returns rows of
    [id, imageUrl, linkUrl, pubDate, 'HEXUNFOREXNET', description].

    NOTE(review): detaiArray is collected but never persisted -- presumably it
    should be written via an image-detail spider like crawCnForexImages does;
    confirm intended behavior before wiring that up.
    """
    currentArray = []
    detaiArray = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        mainList = browsor.find_element_by_class_name(
            'tupianpindao').find_elements_by_tag_name('div')
        for context in mainList:
            try:
                linkUrl = context.find_element_by_tag_name('a').get_attribute('href')
                imageUrl = context.find_element_by_tag_name('img').get_attribute('src')
                # the publish date is the 4th path segment of the article URL
                pubDate = CommonsInitValue.splitCreateDate(linkUrl, '/', 3)
                descriptContext = context.find_element_by_tag_name('p').text
                if imageUrl not in keyList:
                    mianId = str(uuid.uuid1())
                    currentArray.append([mianId, imageUrl, linkUrl, pubDate,
                                         'HEXUNFOREXNET', descriptContext])
                    detaiArray.append([mianId, linkUrl])
            except NoSuchElementException:
                # a mandatory element is missing: skip this entry
                continue
        # FIX: result list was built but never returned
        return currentArray
    finally:
        # FIX: always shut down PhantomJS, otherwise each call leaks a process
        browsor.quit()
def crawMorningDailyNews(link):
    """Crawl stock news ('boxa' blocks) from the CX page at *link*.

    Returns rows of
    [uuid, linkUrl, imageUrl, title, pubDate, description, 'STOCK', 'CXNET'].
    """
    listArray = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        mainContext = browsor.find_element_by_id('listArticle')
        # placeholder used when an entry has no picture
        initImage = CommonsInitValue.initTempImage()
        for context in mainContext.find_elements_by_class_name('boxa'):
            try:
                imageUrl = (context.find_element_by_class_name('pic')
                            .find_element_by_tag_name('img').get_attribute('src'))
            except NoSuchElementException:
                imageUrl = initImage
            # look the heading up once for both title text and link
            heading = context.find_element_by_tag_name('h4')
            title = heading.text
            linkUrl = heading.find_element_by_tag_name('a').get_attribute('href')
            descriptContext = context.find_element_by_tag_name('p').text
            pubDate = time.strftime("%Y-%m-%d %X", time.localtime())
            listArray.append([str(uuid.uuid1()), linkUrl, imageUrl, title,
                              pubDate, descriptContext, 'STOCK', 'CXNET'])
        # FIX: result list was built but never returned
        return listArray
    finally:
        # FIX: always shut down PhantomJS, otherwise each call leaks a process
        browsor.quit()
def crawHeXunForexImage(link, keyList):
    """Crawl HeXun forex images from *link*, skipping any whose src is in *keyList*.

    Returns rows of
    [id, imageUrl, linkUrl, pubDate, 'HEXUNFOREXNET', description].

    NOTE(review): detaiArray is collected but never persisted -- presumably it
    should be written via an image-detail spider like crawCnForexImages does;
    confirm intended behavior before wiring that up.
    """
    currentArray = []
    detaiArray = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        mainList = browsor.find_element_by_class_name(
            'tupianpindao').find_elements_by_tag_name('div')
        for context in mainList:
            try:
                linkUrl = context.find_element_by_tag_name('a').get_attribute('href')
                imageUrl = context.find_element_by_tag_name('img').get_attribute('src')
                # the publish date is the 4th path segment of the article URL
                pubDate = CommonsInitValue.splitCreateDate(linkUrl, '/', 3)
                descriptContext = context.find_element_by_tag_name('p').text
                if imageUrl not in keyList:
                    mianId = str(uuid.uuid1())
                    currentArray.append([mianId, imageUrl, linkUrl, pubDate,
                                         'HEXUNFOREXNET', descriptContext])
                    detaiArray.append([mianId, linkUrl])
            except NoSuchElementException:
                # a mandatory element is missing: skip this entry
                continue
        # FIX: result list was built but never returned
        return currentArray
    finally:
        # FIX: always shut down PhantomJS, otherwise each call leaks a process
        browsor.quit()