def crawYCFinanceHLDataSource(link): listArray = [] browsor = webdriver.PhantomJS() browsor.get(link) courrentContext = browsor.find_elements_by_tag_name('dl') for currentDiv in courrentContext: try: titleObj = currentDiv.find_element_by_tag_name('h1') title = titleObj.text linkUrl = titleObj.find_element_by_tag_name('a').get_attribute( 'href') descriptContext = currentDiv.find_element_by_tag_name('p').text pubDate = CommonsInitValue.initNowTime() try: imageObj = currentDiv.find_element_by_tag_name('img') imageUrl = imageObj.get_attribute('src') except NoSuchElementException, e: imageUrl = CommonsInitValue.initTempImage() except NoSuchElementException, e: continue listArray.append([ str(uuid.uuid1()), linkUrl, imageUrl, title, pubDate, descriptContext, 'CHINA', 'YCNET' ])
def crawYiCaiStockDailyNews(link): currentArray = [] browsor = webdriver.PhantomJS() browsor.get(link) contextArray = browsor.find_elements_by_tag_name('dl') for context in contextArray: try: titleValue = context.find_element_by_tag_name('h1') descriptContext = context.find_element_by_tag_name('p').text pubDate = CommonsInitValue.initNowTime() linkUrl = context.find_element_by_tag_name('a').get_attribute( 'href') try: imageObj = context.find_element_by_tag_name('img') imageUrl = imageObj.get_attribute('src') except NoSuchElementException, e: imageUrl = CommonsInitValue.initTempImage() except NoSuchElementException, e: continue title = titleValue.text currentArray.append([ str(uuid.uuid1()), linkUrl, imageUrl, title, pubDate, descriptContext, 'STOCK', 'YICAINET' ])
def crawDailyOilNews(): currentList = [] currentTime = CommonsInitValue.initNowTime() print '----START CRAW XDNETNEWS OIL NEWS----' try: XDNetOilNewsSpider.writeMorningOilDailyNews() except Exception, e: currentList.append([currentTime,str(uuid.uuid1()),'XDNetOilNewsSpider.writeMorningOilDailyNews',e])
def crawDailyStockComments(link):
    """Crawl HGNET stock comment entries from the list page at *link*.

    Returns rows: [uuid, linkUrl, title, pubDate, '[...]', 'STOCK', 'HGNET'].
    """
    currentList = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        mainlist = browsor.find_element_by_class_name("ul-news-list") \
                          .find_elements_by_tag_name("li")
        for context in mainlist:
            linkUrl = context.find_element_by_tag_name("a").get_attribute("href")
            title = context.text
            pubDate = CommonsInitValue.initNowTime()
            currentList.append([str(uuid.uuid1()), linkUrl, title, pubDate,
                                "[...]", "STOCK", "HGNET"])
    finally:
        # FIX: quit PhantomJS so the headless process is not leaked.
        browsor.quit()
    return currentList
def crawFinanceHLDataSource(link):
    """Crawl the single TAKCHINA headline picture entry at *link*.

    Returns one row:
    [uuid, linkUrl, imageUrl, title, pubDate, '[...]', 'MACRO', 'TAKCHINA'].
    """
    currentList = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        maincontext = browsor.find_element_by_id('news_pic') \
                             .find_element_by_class_name('changeDiv')
        linkUrl = maincontext.find_element_by_tag_name('a').get_attribute('href')
        pubDate = CommonsInitValue.initNowTime()
        # Hoist the duplicate <img> lookup: src and alt come from the same node.
        imageObj = maincontext.find_element_by_tag_name('img')
        imageUrl = imageObj.get_attribute('src')
        title = imageObj.get_attribute('alt')
        currentList.append([str(uuid.uuid1()), linkUrl, imageUrl, title,
                            pubDate, '[...]', 'MACRO', 'TAKCHINA'])
    finally:
        # FIX: quit PhantomJS so the headless process is not leaked.
        browsor.quit()
    return currentList
def crawThemeNews(): currentList = [] currentTime = CommonsInitValue.initNowTime() # CRAW THE IMPORT NEWS print '----START CRAW THE IMPORT NEWS----' try: ImportantNewsSpider.writeCompanyNews() except Exception, e: currentList.append([currentTime,str(uuid.uuid1()),'ImportantNewsSpider.writeCompanyNews',e])
def crawDataCenter(): currentList = [] currentTime = CommonsInitValue.initNowTime() # CRAW FOREXGOLD DATA SIPDER print '----START CRAW FOREXGOLD DATA----' try: ForexGoldDataNetSpider.writeForexGoldDataSource() except Exception, e: currentList.append([currentTime,str(uuid.uuid1()),'ForexGoldDataNetSpider.writeForexGoldDataSource()',e])
def crawMorningOilDailyNews(linkUrl): currentArray = [] browsor = webdriver.PhantomJS() browsor.get(linkUrl) mainlist = browsor.find_element_by_id('table').find_elements_by_class_name('evenrow') for context in mainlist: linkUrl = context.find_element_by_tag_name('a').get_attribute('href') title = context.find_element_by_tag_name('a').text pubDate = CommonsInitValue.initNowTime() if title =='': continue print title+":"+linkUrl
def crawFinanceHLDataSource(link):
    """Crawl the single FTCHINA headline entry at *link*.

    Returns one row:
    [uuid, linkUrl, imageUrl, title, pubDate, description, 'MACRO', 'FTCHINA'].
    """
    currentList = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        mainContext = browsor.find_element_by_class_name('column11')
        # Hoist the duplicate <a> lookup: text and href come from the same node.
        anchorObj = mainContext.find_element_by_tag_name('a')
        title = anchorObj.text
        linkUrl = anchorObj.get_attribute('href')
        imageUrl = mainContext.find_element_by_tag_name('img').get_attribute('src')
        descriptContext = mainContext.find_element_by_class_name('lead').text
        pubDate = CommonsInitValue.initNowTime()
        currentList.append([str(uuid.uuid1()), linkUrl, imageUrl, title,
                            pubDate, descriptContext, 'MACRO', 'FTCHINA'])
    finally:
        # FIX: quit PhantomJS so the headless process is not leaked.
        browsor.quit()
    return currentList
def crawMorningOilDailyNews(linkUrl):
    """Crawl XIDU oil-news entries from the list page at *linkUrl*.

    Returns rows:
    [uuid, linkUrl, imageUrl, title, pubDate, description, 'OIL', 'XIDU'].
    """
    currentArray = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(linkUrl)
        maincontext = browsor.find_element_by_class_name('news_list_all') \
                             .find_elements_by_tag_name('li')
        for context in maincontext:
            # This page carries no images; use the oil placeholder for every row.
            imageUrl = CommonsInitValue.initoiltempimage()
            descriptContext = context.find_element_by_tag_name('p').text
            linkUrl = context.find_element_by_tag_name('a').get_attribute('href')
            title = context.find_element_by_tag_name('a').text
            pubDate = CommonsInitValue.initNowTime()
            currentArray.append([str(uuid.uuid1()), linkUrl, imageUrl, title,
                                 pubDate, descriptContext, 'OIL', 'XIDU'])
    finally:
        # FIX: quit PhantomJS so the headless process is not leaked.
        browsor.quit()
    return currentArray
def crawFinanceHLDataSource(link):
    """Crawl TAKCHINA stock-news list entries at *link*.

    Returns rows:
    [uuid, linkUrl, imageUrl, title, pubDate, description, 'STOCK', 'TAKCHINA'].
    """
    currentArray = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        mainlist = browsor.find_element_by_class_name('news_list') \
                          .find_elements_by_class_name('list')
        for context in mainlist:
            imageUrl = context.find_element_by_tag_name('img').get_attribute('src')
            # Hoist the duplicate 'title' lookup: text and link share the node.
            titleObj = context.find_element_by_class_name('title')
            title = titleObj.text
            linkUrl = titleObj.find_element_by_tag_name('a').get_attribute('href')
            descriptContext = context.find_element_by_tag_name('p').text
            pubDate = CommonsInitValue.initNowTime()
            currentArray.append([str(uuid.uuid1()), linkUrl, imageUrl, title,
                                 pubDate, descriptContext, 'STOCK', 'TAKCHINA'])
    finally:
        # FIX: quit PhantomJS so the headless process is not leaked.
        browsor.quit()
    return currentArray
def crawMorningDailyNews(linkUrl):
    """Crawl NBDNET morning news entries from the page at *linkUrl*.

    Returns rows:
    [uuid, linkUrl, imageUrl, title, pubDate, description, 'CHINA', 'NBDNET'].
    """
    currentList = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(linkUrl)
        resultList = browsor.find_elements_by_class_name('mt24')
        for div in resultList:
            imageUrl = div.find_element_by_tag_name('img').get_attribute('src')
            # Hoist the duplicate <a> lookup: href and text share the node.
            anchorObj = div.find_element_by_tag_name('a')
            linkUrl = anchorObj.get_attribute('href')
            title = anchorObj.text
            descriptContext = div.find_element_by_class_name('news-p').text
            pubDate = CommonsInitValue.initNowTime()
            currentList.append([str(uuid.uuid1()), linkUrl, imageUrl, title,
                                pubDate, descriptContext, 'CHINA', 'NBDNET'])
    finally:
        # FIX: quit PhantomJS so the headless process is not leaked.
        browsor.quit()
    return currentList
def crawDailyOilNews(): currentList = [] currentTime = CommonsInitValue.initNowTime() print '----START CRAW XDNETNEWS OIL NEWS----' try: XDNetOilNewsSpider.writeMorningOilDailyNews() except Exception, e: currentList.append([ currentTime, str(uuid.uuid1()), 'XDNetOilNewsSpider.writeMorningOilDailyNews', e ])
def crawDailyComments(link):
    """Crawl ADSNET forex comment entries from the page at *link*.

    Returns rows:
    [uuid, linkUrl, title, pubDate, description, 'FOREX', 'ADSNET'].
    """
    currentList = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        contextList = browsor.find_elements_by_class_name('news-item')
        for mainContext in contextList:
            pubDate = CommonsInitValue.initNowTime()
            # Hoist the duplicate <a> lookup: text and href share the node.
            anchorObj = mainContext.find_element_by_tag_name('a')
            title = anchorObj.text
            linkUrl = anchorObj.get_attribute('href')
            descriptContext = mainContext.find_element_by_class_name('desc').text
            currentList.append([str(uuid.uuid1()), linkUrl, title, pubDate,
                                descriptContext, 'FOREX', 'ADSNET'])
    finally:
        # FIX: quit PhantomJS so the headless process is not leaked.
        browsor.quit()
    return currentList
def crawMorningFinanceDailyNews(linkUrl):
    """Crawl XXCB morning finance entries from the page at *linkUrl*.

    Returns rows:
    [uuid, linkUrl, imageUrl, title, pubDate, description, 'CHINA', 'XXCB'].
    """
    currentArray = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(linkUrl)
        maincontext = browsor.find_element_by_class_name('area_left') \
                             .find_elements_by_class_name('list_item')
        for context in maincontext:
            imageUrl = context.find_element_by_tag_name('img').get_attribute('src')
            descriptContext = context.find_element_by_tag_name('p').text
            linkUrl = context.find_element_by_tag_name('a').get_attribute('href')
            title = context.find_element_by_tag_name('h2').text
            pubDate = CommonsInitValue.initNowTime()
            currentArray.append([str(uuid.uuid1()), linkUrl, imageUrl, title,
                                 pubDate, descriptContext, 'CHINA', 'XXCB'])
    finally:
        # FIX: quit PhantomJS so the headless process is not leaked.
        browsor.quit()
    return currentArray
def updateDailyForexPic(): currentList = [] currentTime = CommonsInitValue.initNowTime() log.info('The system crawling the resource of forex picture ') print '----START CRAW THE FOREX PICTURE----' CnForexImageSpider.writeForexImages() print '----START CRAW THE XEHUN PICTURE----' try: HeXunForexImageSpider.writeHeXunForexImage() except Exception,e: currentList.append([currentTime,str(uuid.uuid1()),'HeXunForexImageSpider.writeHeXunForexImage',e])
def crawDailyStockComments(link):
    """Crawl JFNET stock comment entries from the page at *link*.

    Returns rows:
    [uuid, linkUrl, title, pubDate, description, 'STOCK', 'JFNET'].
    """
    currentArray = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        mainlist = browsor.find_element_by_class_name('w_660') \
                          .find_element_by_tag_name('ul') \
                          .find_elements_by_tag_name('li')
        for context in mainlist:
            # Hoist the duplicate <a> lookup: text and href share the node.
            anchorObj = context.find_element_by_tag_name('a')
            title = anchorObj.text
            linkUrl = anchorObj.get_attribute('href')
            descriptContext = context.find_element_by_tag_name('p').text
            pubDate = CommonsInitValue.initNowTime()
            currentArray.append([str(uuid.uuid1()), linkUrl, title, pubDate,
                                 descriptContext, 'STOCK', 'JFNET'])
    finally:
        # FIX: quit PhantomJS so the headless process is not leaked.
        browsor.quit()
    return currentArray
def crawMorningMetalDailyNews(link): currentList = [] browsor = webdriver.PhantomJS() browsor.get(link) contextList = browsor.find_elements_by_class_name('articleItem') for context in contextList: try: linkUrl = context.find_element_by_class_name('img').get_attribute('href') imageUrl = context.find_element_by_class_name('img').find_element_by_tag_name('img').get_attribute('src') title = context.find_element_by_class_name('title').text pubDate = CommonsInitValue.initNowTime() descriptContext = context.find_element_by_tag_name('p').text except NoSuchElementException,e: continue currentList.append([str(uuid.uuid1()),linkUrl,imageUrl,title,pubDate,descriptContext,'METAL','INVESTINGNET'])
def crawFinanceHLDataSource(link):
    """Crawl the single FTCHINA headline entry at *link*.

    Returns one row:
    [uuid, linkUrl, imageUrl, title, pubDate, description, 'MACRO', 'FTCHINA'].
    """
    currentList = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        mainContext = browsor.find_element_by_class_name('column11')
        # Hoist the duplicate <a> lookup: text and href come from the same node.
        anchorObj = mainContext.find_element_by_tag_name('a')
        title = anchorObj.text
        linkUrl = anchorObj.get_attribute('href')
        imageUrl = mainContext.find_element_by_tag_name('img').get_attribute('src')
        descriptContext = mainContext.find_element_by_class_name('lead').text
        pubDate = CommonsInitValue.initNowTime()
        currentList.append([str(uuid.uuid1()), linkUrl, imageUrl, title,
                            pubDate, descriptContext, 'MACRO', 'FTCHINA'])
    finally:
        # FIX: quit PhantomJS so the headless process is not leaked.
        browsor.quit()
    return currentList
def crawDailyNews(): currentList = [] currentTime = CommonsInitValue.initNowTime() # CRAW HEJNEWS COMMENTS NEWS SIPDER #print '----START CRAW HEJNEWS NEWS----' #HEJNewsNetSpider.writeMorningDailyNews() # CRAW QQNEWS COMMENTS NEWS SIPDER print '----START CRAW QQNEWS NEWS----' try: QQNewsNetSpider.writeMorningQQDailyNews() except Exception, e: print e
def crawMorningForexDailyNews(link): currentList = [] browsor = webdriver.PhantomJS() browsor.get(link) contextList = browsor.find_elements_by_class_name('articleItem') for context in contextList: try: linkUrl = context.find_element_by_class_name('img').get_attribute('href') imageUrl = context.find_element_by_class_name('img').find_element_by_tag_name('img').get_attribute('src') title = context.find_element_by_class_name('title').text pubDate = CommonsInitValue.initNowTime() descriptContext = context.find_element_by_tag_name('p').text except NoSuchElementException,e: continue currentList.append([str(uuid.uuid1()),linkUrl,imageUrl,title,pubDate,descriptContext,'FOREX','INVESTINGNET'])
def crawDataCenter(): currentList = [] currentTime = CommonsInitValue.initNowTime() # CRAW FOREXGOLD DATA SIPDER print '----START CRAW FOREXGOLD DATA----' try: ForexGoldDataNetSpider.writeForexGoldDataSource() except Exception, e: currentList.append([ currentTime, str(uuid.uuid1()), 'ForexGoldDataNetSpider.writeForexGoldDataSource()', e ])
def crawZBNewsNetDataSource(link): currentArray = [] browsor = webdriver.PhantomJS() browsor.get(link) contextList = browsor.find_elements_by_class_name('l_title') for context in contextList: pubDate = CommonsInitValue.initNowTime() try: imageUrl = context.find_element_by_tag_name('img').get_attribute('src') except NoSuchElementException,e: imageUrl = CommonsInitValue.initTempImage() title = context.find_element_by_class_name('title').text descriptContext = context.find_element_by_class_name('text').text linkUrl = context.find_element_by_tag_name('a').get_attribute('href') currentArray.append([str(uuid.uuid1()),linkUrl,imageUrl,title,pubDate,descriptContext,'CHINA','ZBNET'])
def crawDailyNews(): currentList = [] currentTime = CommonsInitValue.initNowTime() # CRAW HEJNEWS COMMENTS NEWS SIPDER #print '----START CRAW HEJNEWS NEWS----' #HEJNewsNetSpider.writeMorningDailyNews() # CRAW QQNEWS COMMENTS NEWS SIPDER print '----START CRAW QQNEWS NEWS----' try: QQNewsNetSpider.writeMorningQQDailyNews() except Exception,e: print e
def crawMorningForexDailyNews(link): currentList = [] browsor = webdriver.PhantomJS() browsor.get(link) contextList = browsor.find_elements_by_class_name("articleItem") for context in contextList: try: linkUrl = context.find_element_by_class_name("img").get_attribute("href") imageUrl = context.find_element_by_class_name("img").find_element_by_tag_name("img").get_attribute("src") title = context.find_element_by_class_name("title").text pubDate = CommonsInitValue.initNowTime() descriptContext = context.find_element_by_tag_name("p").text except NoSuchElementException, e: continue currentList.append( [str(uuid.uuid1()), linkUrl, imageUrl, title, pubDate, descriptContext, "FOREX", "INVESTINGNET"] )
def crawMorningFinanceDailyNews(linkUrl):
    """Crawl XXCB morning finance entries from the page at *linkUrl*.

    Returns rows:
    [uuid, linkUrl, imageUrl, title, pubDate, description, 'CHINA', 'XXCB'].
    """
    currentArray = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(linkUrl)
        maincontext = browsor.find_element_by_class_name('area_left') \
                             .find_elements_by_class_name('list_item')
        for context in maincontext:
            imageUrl = context.find_element_by_tag_name('img').get_attribute('src')
            descriptContext = context.find_element_by_tag_name('p').text
            linkUrl = context.find_element_by_tag_name('a').get_attribute('href')
            title = context.find_element_by_tag_name('h2').text
            pubDate = CommonsInitValue.initNowTime()
            currentArray.append([str(uuid.uuid1()), linkUrl, imageUrl, title,
                                 pubDate, descriptContext, 'CHINA', 'XXCB'])
    finally:
        # FIX: quit PhantomJS so the headless process is not leaked.
        browsor.quit()
    return currentArray
def crawMorningOilDailyNews(linkUrl):
    """Crawl XIDU oil-news entries from the list page at *linkUrl*.

    Returns rows:
    [uuid, linkUrl, imageUrl, title, pubDate, description, 'OIL', 'XIDU'].
    """
    currentArray = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(linkUrl)
        maincontext = browsor.find_element_by_class_name('news_list_all') \
                             .find_elements_by_tag_name('li')
        for context in maincontext:
            # This page carries no images; use the oil placeholder for every row.
            imageUrl = CommonsInitValue.initoiltempimage()
            descriptContext = context.find_element_by_tag_name('p').text
            linkUrl = context.find_element_by_tag_name('a').get_attribute('href')
            title = context.find_element_by_tag_name('a').text
            pubDate = CommonsInitValue.initNowTime()
            currentArray.append([str(uuid.uuid1()), linkUrl, imageUrl, title,
                                 pubDate, descriptContext, 'OIL', 'XIDU'])
    finally:
        # FIX: quit PhantomJS so the headless process is not leaked.
        browsor.quit()
    return currentArray
def crawDailyStockComments(link):
    """Crawl JFNET stock comment entries from the page at *link*.

    Returns rows:
    [uuid, linkUrl, title, pubDate, description, 'STOCK', 'JFNET'].
    """
    currentArray = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        mainlist = browsor.find_element_by_class_name('w_660') \
                          .find_element_by_tag_name('ul') \
                          .find_elements_by_tag_name('li')
        for context in mainlist:
            # Hoist the duplicate <a> lookup: text and href share the node.
            anchorObj = context.find_element_by_tag_name('a')
            title = anchorObj.text
            linkUrl = anchorObj.get_attribute('href')
            descriptContext = context.find_element_by_tag_name('p').text
            pubDate = CommonsInitValue.initNowTime()
            currentArray.append([str(uuid.uuid1()), linkUrl, title, pubDate,
                                 descriptContext, 'STOCK', 'JFNET'])
    finally:
        # FIX: quit PhantomJS so the headless process is not leaked.
        browsor.quit()
    return currentArray
def crawDailyComments(link):
    """Crawl ADSNET forex comment entries from the page at *link*.

    Returns rows:
    [uuid, linkUrl, title, pubDate, description, 'FOREX', 'ADSNET'].
    """
    currentList = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        contextList = browsor.find_elements_by_class_name('news-item')
        for mainContext in contextList:
            pubDate = CommonsInitValue.initNowTime()
            # Hoist the duplicate <a> lookup: text and href share the node.
            anchorObj = mainContext.find_element_by_tag_name('a')
            title = anchorObj.text
            linkUrl = anchorObj.get_attribute('href')
            descriptContext = mainContext.find_element_by_class_name('desc').text
            currentList.append([str(uuid.uuid1()), linkUrl, title, pubDate,
                                descriptContext, 'FOREX', 'ADSNET'])
    finally:
        # FIX: quit PhantomJS so the headless process is not leaked.
        browsor.quit()
    return currentList
def crawFinanceHLDataSource(link):
    """Crawl TAKCHINA stock-news list entries at *link*.

    Returns rows:
    [uuid, linkUrl, imageUrl, title, pubDate, description, 'STOCK', 'TAKCHINA'].
    """
    currentArray = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        mainlist = browsor.find_element_by_class_name('news_list') \
                          .find_elements_by_class_name('list')
        for context in mainlist:
            imageUrl = context.find_element_by_tag_name('img').get_attribute('src')
            # Hoist the duplicate 'title' lookup: text and link share the node.
            titleObj = context.find_element_by_class_name('title')
            title = titleObj.text
            linkUrl = titleObj.find_element_by_tag_name('a').get_attribute('href')
            descriptContext = context.find_element_by_tag_name('p').text
            pubDate = CommonsInitValue.initNowTime()
            currentArray.append([str(uuid.uuid1()), linkUrl, imageUrl, title,
                                 pubDate, descriptContext, 'STOCK', 'TAKCHINA'])
    finally:
        # FIX: quit PhantomJS so the headless process is not leaked.
        browsor.quit()
    return currentArray
def crawZBNewsNetDataSource(link): currentArray = [] browsor = webdriver.PhantomJS() browsor.get(link) contextList = browsor.find_elements_by_class_name('l_title') for context in contextList: pubDate = CommonsInitValue.initNowTime() try: imageUrl = context.find_element_by_tag_name('img').get_attribute( 'src') except NoSuchElementException, e: imageUrl = CommonsInitValue.initTempImage() title = context.find_element_by_class_name('title').text descriptContext = context.find_element_by_class_name('text').text linkUrl = context.find_element_by_tag_name('a').get_attribute('href') currentArray.append([ str(uuid.uuid1()), linkUrl, imageUrl, title, pubDate, descriptContext, 'CHINA', 'ZBNET' ])
def crawYiCaiStockDailyNews(link): currentArray = [] browsor = webdriver.PhantomJS() browsor.get(link) contextArray = browsor.find_elements_by_tag_name('dl') for context in contextArray: try: titleValue = context.find_element_by_tag_name('h1') descriptContext = context.find_element_by_tag_name('p').text pubDate = CommonsInitValue.initNowTime() linkUrl = context.find_element_by_tag_name('a').get_attribute('href') try: imageObj = context.find_element_by_tag_name('img') imageUrl = imageObj.get_attribute('src') except NoSuchElementException,e: imageUrl = CommonsInitValue.initTempImage() except NoSuchElementException,e: continue title = titleValue.text currentArray.append([str(uuid.uuid1()),linkUrl,imageUrl,title,pubDate,descriptContext,'STOCK','YICAINET'])
def crawYCFinanceHLDataSource(link): listArray = [] browsor = webdriver.PhantomJS() browsor.get(link) courrentContext = browsor.find_elements_by_tag_name('dl') for currentDiv in courrentContext: try: titleObj = currentDiv.find_element_by_tag_name('h1') title = titleObj.text linkUrl = titleObj.find_element_by_tag_name('a').get_attribute('href') descriptContext = currentDiv.find_element_by_tag_name('p').text pubDate = CommonsInitValue.initNowTime() try: imageObj = currentDiv.find_element_by_tag_name('img') imageUrl = imageObj.get_attribute('src') except NoSuchElementException,e: imageUrl = CommonsInitValue.initTempImage() except NoSuchElementException,e: continue listArray.append([str(uuid.uuid1()),linkUrl,imageUrl,title,pubDate,descriptContext,'CHINA','YCNET'])