def crawYCFinanceHLDataSource(link):
    """Scrape headline entries from the page at ``link``.

    The page is loaded in a PhantomJS browser and every ``<dl>`` element is
    inspected.  For each one the ``<h1>`` text becomes the title, the
    ``href`` of the ``<a>`` nested in that ``<h1>`` becomes the link, and
    the text of the first ``<p>`` becomes the description.  A ``<dl>`` that
    lacks any of these elements is skipped.

    Args:
        link: URL of the page to load.

    Returns:
        A list of rows, each of the form
        ``[id, linkUrl, imageUrl, title, pubDate, description, 'CHINA',
        'YCNET']`` where ``id`` is ``str(uuid.uuid1())``.
    """
    rows = []
    browser = webdriver.PhantomJS()
    try:
        browser.get(link)
        for entry in browser.find_elements_by_tag_name('dl'):
            try:
                heading = entry.find_element_by_tag_name('h1')
                title = heading.text
                linkUrl = heading.find_element_by_tag_name('a') \
                    .get_attribute('href')
                descriptContext = entry.find_element_by_tag_name('p').text
                # Value comes from a project helper; presumably the current
                # timestamp -- confirm against CommonsInitValue.
                pubDate = CommonsInitValue.initNowTime()
                try:
                    imageUrl = entry.find_element_by_tag_name('img') \
                        .get_attribute('src')
                except NoSuchElementException:
                    # No <img> in this entry: use the helper-provided image.
                    imageUrl = CommonsInitValue.initTempImage()
            except NoSuchElementException:
                # Entry lacks a heading, link or paragraph: not an article.
                continue
            rows.append([
                str(uuid.uuid1()), linkUrl, imageUrl, title, pubDate,
                descriptContext, 'CHINA', 'YCNET'
            ])
    finally:
        # Always shut the PhantomJS process down, even if scraping failed.
        browser.quit()
    # The rows were previously built and then discarded; hand them back.
    return rows
def crawYiCaiStockDailyNews(link):
    """Scrape stock news entries from the page at ``link``.

    The page is loaded in a PhantomJS browser and every ``<dl>`` element is
    inspected.  Each entry needs an ``<h1>`` (title), a ``<p>``
    (description) and an ``<a>`` (link); entries missing any of them are
    skipped.  The image is the ``src`` of the entry's ``<img>`` when there
    is one, otherwise ``CommonsInitValue.initTempImage()``.

    Args:
        link: URL of the page to load.

    Returns:
        A list of rows, each of the form
        ``[id, linkUrl, imageUrl, title, pubDate, description, 'STOCK',
        'YICAINET']`` where ``id`` is ``str(uuid.uuid1())``.
    """
    rows = []
    browser = webdriver.PhantomJS()
    try:
        browser.get(link)
        for entry in browser.find_elements_by_tag_name('dl'):
            try:
                heading = entry.find_element_by_tag_name('h1')
                descriptContext = entry.find_element_by_tag_name('p').text
                # Value comes from a project helper; presumably the current
                # timestamp -- confirm against CommonsInitValue.
                pubDate = CommonsInitValue.initNowTime()
                linkUrl = entry.find_element_by_tag_name('a') \
                    .get_attribute('href')
                try:
                    imageUrl = entry.find_element_by_tag_name('img') \
                        .get_attribute('src')
                except NoSuchElementException:
                    # No <img> in this entry: use the helper-provided image.
                    imageUrl = CommonsInitValue.initTempImage()
            except NoSuchElementException:
                # Missing heading, paragraph or anchor: skip this entry.
                continue
            title = heading.text
            rows.append([
                str(uuid.uuid1()), linkUrl, imageUrl, title, pubDate,
                descriptContext, 'STOCK', 'YICAINET'
            ])
    finally:
        # Always shut the PhantomJS process down, even if scraping failed.
        browser.quit()
    # The rows were previously built and then discarded; hand them back.
    return rows
def crawMorningFinanceDailyNews(linkUrl):
    """Scrape the news list from the page at ``linkUrl``.

    The page is loaded in a PhantomJS browser, the element with id
    ``list01`` is located, and each of its ``<li>`` children is read: the
    first ``<a>`` supplies link and title, the element with class ``date``
    supplies the publication date text, and the first ``<p>`` supplies the
    description.  The image is always ``CommonsInitValue.initTempImage()``.

    Args:
        linkUrl: URL of the page to load.

    Returns:
        A list of rows, each of the form
        ``[id, link, imageUrl, title, pubDate, description, 'CHINA',
        'IFengNET']`` where ``id`` is ``str(uuid.uuid1())``.

    Raises:
        NoSuchElementException: propagated from Selenium when ``list01`` or
            any per-item element is missing (not handled here).
    """
    rows = []
    browser = webdriver.PhantomJS()
    try:
        browser.get(linkUrl)
        items = browser.find_element_by_id('list01') \
            .find_elements_by_tag_name('li')
        for item in items:
            # Look the anchor up once; it provides both link and title.
            anchor = item.find_element_by_tag_name('a')
            # Use a separate name so the ``linkUrl`` argument is not
            # overwritten by per-article values.
            articleUrl = anchor.get_attribute('href')
            title = anchor.text
            pubDate = item.find_element_by_class_name('date').text
            descriptContext = item.find_element_by_tag_name('p').text
            imageUrl = CommonsInitValue.initTempImage()
            rows.append([
                str(uuid.uuid1()), articleUrl, imageUrl, title, pubDate,
                descriptContext, 'CHINA', 'IFengNET'
            ])
    finally:
        # Always shut the PhantomJS process down, even if scraping failed.
        browser.quit()
    return rows
def crawZBNewsNetDataSource(link):
    """Scrape news entries from the page at ``link``.

    The page is loaded in a PhantomJS browser and every element with class
    ``l_title`` is read: class ``title`` supplies the title text, class
    ``text`` supplies the description, and the first ``<a>`` supplies the
    link.  The image is the ``src`` of the entry's ``<img>`` when there is
    one, otherwise ``CommonsInitValue.initTempImage()``.

    Args:
        link: URL of the page to load.

    Returns:
        A list of rows, each of the form
        ``[id, linkUrl, imageUrl, title, pubDate, description, 'CHINA',
        'ZBNET']`` where ``id`` is ``str(uuid.uuid1())``.

    Raises:
        NoSuchElementException: propagated from Selenium when an entry has
            no ``title``, ``text`` or ``<a>`` element (only the missing
            image case is handled here).
    """
    rows = []
    browser = webdriver.PhantomJS()
    try:
        browser.get(link)
        for entry in browser.find_elements_by_class_name('l_title'):
            # Value comes from a project helper; presumably the current
            # timestamp -- confirm against CommonsInitValue.
            pubDate = CommonsInitValue.initNowTime()
            try:
                imageUrl = entry.find_element_by_tag_name('img') \
                    .get_attribute('src')
            except NoSuchElementException:
                # No <img> in this entry: use the helper-provided image.
                imageUrl = CommonsInitValue.initTempImage()
            title = entry.find_element_by_class_name('title').text
            descriptContext = entry.find_element_by_class_name('text').text
            linkUrl = entry.find_element_by_tag_name('a') \
                .get_attribute('href')
            rows.append([
                str(uuid.uuid1()), linkUrl, imageUrl, title, pubDate,
                descriptContext, 'CHINA', 'ZBNET'
            ])
    finally:
        # Always shut the PhantomJS process down, even if scraping failed.
        browser.quit()
    # The rows were previously built and then discarded; hand them back.
    return rows
def crawCNFinanceNetDailyNews(link):
    """Scrape finance news entries from the page at ``link``.

    The page is loaded in a PhantomJS browser and every element with class
    ``art-list`` is read: the first ``<a>`` supplies link and title, class
    ``pic-details`` supplies the description, and class ``time`` supplies
    the raw time text.  The publication date is
    ``returnCreateDate(timeText) + ' ' + splitCreateDate(timeText, ' ', 1)``
    using the ``CommonsInitValue`` helpers.  The image is always
    ``CommonsInitValue.initTempImage()``.

    Args:
        link: URL of the page to load.

    Returns:
        A list of rows, each of the form
        ``[id, linkUrl, imageUrl, title, pubDate, description, 'CHINA',
        '21CNNET']`` where ``id`` is ``str(uuid.uuid1())``.

    Raises:
        NoSuchElementException: propagated from Selenium when an entry is
            missing one of the expected elements (not handled here).
    """
    rows = []
    browser = webdriver.PhantomJS()
    try:
        browser.get(link)
        for entry in browser.find_elements_by_class_name('art-list'):
            # Look the anchor up once; it provides both link and title.
            anchor = entry.find_element_by_tag_name('a')
            linkUrl = anchor.get_attribute('href')
            title = anchor.text
            descriptContext = entry.find_element_by_class_name(
                'pic-details').text
            timeText = entry.find_element_by_class_name('time').text
            # Named datePart/timePart so the stdlib ``datetime`` module name
            # is not shadowed.  Presumably date and clock time respectively
            # -- confirm against the CommonsInitValue helpers.
            datePart = CommonsInitValue.returnCreateDate(timeText)
            timePart = CommonsInitValue.splitCreateDate(timeText, ' ', 1)
            pubDate = datePart + ' ' + timePart
            imageUrl = CommonsInitValue.initTempImage()
            rows.append([
                str(uuid.uuid1()), linkUrl, imageUrl, title, pubDate,
                descriptContext, 'CHINA', '21CNNET'
            ])
    finally:
        # Always shut the PhantomJS process down, even if scraping failed.
        browser.quit()
    return rows
def crawMorningFinanceDailyNews(linkUrl):
    """Collect the articles listed under ``#list01`` on the given page.

    A PhantomJS browser loads ``linkUrl``; every ``<li>`` inside the element
    with id ``list01`` yields one row.  Link and title come from the item's
    first ``<a>``, the publication date from its ``date``-class element, and
    the description from its first ``<p>``.  The image is always
    ``CommonsInitValue.initTempImage()``.

    Args:
        linkUrl: URL of the page to load.

    Returns:
        A list of rows, each of the form
        ``[id, link, imageUrl, title, pubDate, description, 'CHINA',
        'IFengNET']`` where ``id`` is ``str(uuid.uuid1())``.

    Raises:
        NoSuchElementException: propagated from Selenium when ``list01`` or
            any per-item element is missing (not handled here).
    """
    collected = []
    driver = webdriver.PhantomJS()
    try:
        driver.get(linkUrl)
        container = driver.find_element_by_id('list01')
        for entry in container.find_elements_by_tag_name('li'):
            # One anchor lookup serves both the href and the title text.
            anchor = entry.find_element_by_tag_name('a')
            # Kept separate from the ``linkUrl`` argument so the page URL
            # is not clobbered by article URLs.
            articleUrl = anchor.get_attribute('href')
            title = anchor.text
            pubDate = entry.find_element_by_class_name('date').text
            descriptContext = entry.find_element_by_tag_name('p').text
            imageUrl = CommonsInitValue.initTempImage()
            collected.append([
                str(uuid.uuid1()), articleUrl, imageUrl, title, pubDate,
                descriptContext, 'CHINA', 'IFengNET'
            ])
    finally:
        # Release the PhantomJS process whether or not scraping succeeded.
        driver.quit()
    return collected
def crawMorningDailyNews(link):
    """Scrape news entries from the page at ``link``.

    The page is loaded in a PhantomJS browser, the element with id
    ``listArticle`` is located, and each of its ``boxa``-class descendants
    is read: the ``<h4>`` supplies the title text and (via its nested
    ``<a>``) the link, and the first ``<p>`` supplies the description.  The
    image is the ``src`` of the ``<img>`` inside the ``pic``-class element
    when present, otherwise ``CommonsInitValue.initTempImage()``.  The
    publication date is the local time at scraping, formatted
    ``"%Y-%m-%d %X"``.

    Args:
        link: URL of the page to load.

    Returns:
        A list of rows, each of the form
        ``[id, linkUrl, imageUrl, title, pubDate, description, 'STOCK',
        'CXNET']`` where ``id`` is ``str(uuid.uuid1())``.

    Raises:
        NoSuchElementException: propagated from Selenium when
            ``listArticle`` or an entry's ``<h4>``/``<a>``/``<p>`` is
            missing (only the missing image case is handled here).
    """
    rows = []
    browser = webdriver.PhantomJS()
    try:
        browser.get(link)
        container = browser.find_element_by_id('listArticle')
        entries = container.find_elements_by_class_name('boxa')
        # Fetch the fallback image once rather than per entry.
        fallbackImage = CommonsInitValue.initTempImage()
        for entry in entries:
            try:
                picture = entry.find_element_by_class_name('pic')
                imageUrl = picture.find_element_by_tag_name('img') \
                    .get_attribute('src')
            except NoSuchElementException:
                imageUrl = fallbackImage
            # Look the heading up once; it provides both title and link.
            heading = entry.find_element_by_tag_name('h4')
            title = heading.text
            linkUrl = heading.find_element_by_tag_name('a') \
                .get_attribute('href')
            descriptContext = entry.find_element_by_tag_name('p').text
            pubDate = time.strftime("%Y-%m-%d %X", time.localtime())
            rows.append([
                str(uuid.uuid1()), linkUrl, imageUrl, title, pubDate,
                descriptContext, 'STOCK', 'CXNET'
            ])
    finally:
        # Always shut the PhantomJS process down, even if scraping failed.
        browser.quit()
    # The rows were previously built and then discarded; hand them back.
    return rows
def crawZBNewsNetDataSource(link):
    """Collect one row per ``l_title``-class element on the given page.

    A PhantomJS browser loads ``link``.  For every ``l_title`` element the
    title is the text of its ``title``-class child, the description is the
    text of its ``text``-class child, and the link is the ``href`` of its
    first ``<a>``.  When the element contains an ``<img>`` its ``src`` is
    used as the image; otherwise ``CommonsInitValue.initTempImage()`` is.

    Args:
        link: URL of the page to load.

    Returns:
        A list of rows, each of the form
        ``[id, linkUrl, imageUrl, title, pubDate, description, 'CHINA',
        'ZBNET']`` where ``id`` is ``str(uuid.uuid1())``.

    Raises:
        NoSuchElementException: propagated from Selenium when an entry has
            no ``title``, ``text`` or ``<a>`` element (only the missing
            image case is handled here).
    """
    collected = []
    driver = webdriver.PhantomJS()
    try:
        driver.get(link)
        for block in driver.find_elements_by_class_name('l_title'):
            # Helper-provided value; presumably "now" -- confirm against
            # CommonsInitValue.
            pubDate = CommonsInitValue.initNowTime()
            try:
                image = block.find_element_by_tag_name('img')
                imageUrl = image.get_attribute('src')
            except NoSuchElementException:
                # Entry has no picture: fall back to the helper's image.
                imageUrl = CommonsInitValue.initTempImage()
            title = block.find_element_by_class_name('title').text
            descriptContext = block.find_element_by_class_name('text').text
            anchor = block.find_element_by_tag_name('a')
            linkUrl = anchor.get_attribute('href')
            collected.append([
                str(uuid.uuid1()), linkUrl, imageUrl, title, pubDate,
                descriptContext, 'CHINA', 'ZBNET'
            ])
    finally:
        # Release the PhantomJS process whether or not scraping succeeded.
        driver.quit()
    # Without this return the collected rows were silently lost.
    return collected
def crawCNStockNetDailyNews(link):
    """Scrape stock news entries from the page at ``link``.

    The page is loaded in a PhantomJS browser and every element with class
    ``art-list`` is read: the first ``<a>`` supplies link and title, class
    ``pic-details`` supplies the description, and class ``time`` supplies
    the raw time text.  The publication date is
    ``returnCreateDate(timeText) + ' ' + splitCreateDate(timeText, ' ', 1)``
    using the ``CommonsInitValue`` helpers.  The image is always
    ``CommonsInitValue.initTempImage()``.

    Args:
        link: URL of the page to load.

    Returns:
        A list of rows, each of the form
        ``[id, linkUrl, imageUrl, title, pubDate, description, 'STOCK',
        '21CNNET']`` where ``id`` is ``str(uuid.uuid1())``.

    Raises:
        NoSuchElementException: propagated from Selenium when an entry is
            missing one of the expected elements (not handled here).
    """
    rows = []
    browser = webdriver.PhantomJS()
    try:
        browser.get(link)
        for entry in browser.find_elements_by_class_name('art-list'):
            # Look the anchor up once; it provides both link and title.
            anchor = entry.find_element_by_tag_name('a')
            linkUrl = anchor.get_attribute('href')
            title = anchor.text
            descriptContext = entry.find_element_by_class_name(
                'pic-details').text
            timeText = entry.find_element_by_class_name('time').text
            # Named datePart/timePart so the stdlib ``datetime`` module name
            # is not shadowed.  Presumably date and clock time respectively
            # -- confirm against the CommonsInitValue helpers.
            datePart = CommonsInitValue.returnCreateDate(timeText)
            timePart = CommonsInitValue.splitCreateDate(timeText, ' ', 1)
            pubDate = datePart + ' ' + timePart
            imageUrl = CommonsInitValue.initTempImage()
            rows.append([
                str(uuid.uuid1()), linkUrl, imageUrl, title, pubDate,
                descriptContext, 'STOCK', '21CNNET'
            ])
    finally:
        # Always shut the PhantomJS process down, even if scraping failed.
        browser.quit()
    return rows
def crawYiCaiStockDailyNews(link):
    """Collect one row per complete ``<dl>`` entry on the given page.

    A PhantomJS browser loads ``link``.  A ``<dl>`` counts as complete when
    it contains an ``<h1>`` (title), a ``<p>`` (description) and an ``<a>``
    (link); incomplete ones are skipped.  When the entry contains an
    ``<img>`` its ``src`` is used as the image; otherwise
    ``CommonsInitValue.initTempImage()`` is.

    Args:
        link: URL of the page to load.

    Returns:
        A list of rows, each of the form
        ``[id, linkUrl, imageUrl, title, pubDate, description, 'STOCK',
        'YICAINET']`` where ``id`` is ``str(uuid.uuid1())``.
    """
    collected = []
    driver = webdriver.PhantomJS()
    try:
        driver.get(link)
        for block in driver.find_elements_by_tag_name('dl'):
            try:
                titleElement = block.find_element_by_tag_name('h1')
                descriptContext = block.find_element_by_tag_name('p').text
                # Helper-provided value; presumably "now" -- confirm against
                # CommonsInitValue.
                pubDate = CommonsInitValue.initNowTime()
                anchor = block.find_element_by_tag_name('a')
                linkUrl = anchor.get_attribute('href')
                try:
                    image = block.find_element_by_tag_name('img')
                    imageUrl = image.get_attribute('src')
                except NoSuchElementException:
                    # Entry has no picture: fall back to the helper's image.
                    imageUrl = CommonsInitValue.initTempImage()
            except NoSuchElementException:
                # A required element is absent, so this <dl> is not a story.
                continue
            title = titleElement.text
            collected.append([
                str(uuid.uuid1()), linkUrl, imageUrl, title, pubDate,
                descriptContext, 'STOCK', 'YICAINET'
            ])
    finally:
        # Release the PhantomJS process whether or not scraping succeeded.
        driver.quit()
    # Without this return the collected rows were silently lost.
    return collected
def crawYCFinanceHLDataSource(link):
    """Collect one row per complete ``<dl>`` entry on the given page.

    A PhantomJS browser loads ``link``.  A ``<dl>`` counts as complete when
    it contains an ``<h1>`` whose text is the title, an ``<a>`` inside that
    ``<h1>`` whose ``href`` is the link, and a ``<p>`` whose text is the
    description; incomplete ones are skipped.  When the entry contains an
    ``<img>`` its ``src`` is used as the image; otherwise
    ``CommonsInitValue.initTempImage()`` is.

    Args:
        link: URL of the page to load.

    Returns:
        A list of rows, each of the form
        ``[id, linkUrl, imageUrl, title, pubDate, description, 'CHINA',
        'YCNET']`` where ``id`` is ``str(uuid.uuid1())``.
    """
    collected = []
    driver = webdriver.PhantomJS()
    try:
        driver.get(link)
        for block in driver.find_elements_by_tag_name('dl'):
            try:
                titleElement = block.find_element_by_tag_name('h1')
                title = titleElement.text
                anchor = titleElement.find_element_by_tag_name('a')
                linkUrl = anchor.get_attribute('href')
                descriptContext = block.find_element_by_tag_name('p').text
                # Helper-provided value; presumably "now" -- confirm against
                # CommonsInitValue.
                pubDate = CommonsInitValue.initNowTime()
                try:
                    image = block.find_element_by_tag_name('img')
                    imageUrl = image.get_attribute('src')
                except NoSuchElementException:
                    # Entry has no picture: fall back to the helper's image.
                    imageUrl = CommonsInitValue.initTempImage()
            except NoSuchElementException:
                # A required element is absent, so this <dl> is not a story.
                continue
            collected.append([
                str(uuid.uuid1()), linkUrl, imageUrl, title, pubDate,
                descriptContext, 'CHINA', 'YCNET'
            ])
    finally:
        # Release the PhantomJS process whether or not scraping succeeded.
        driver.quit()
    # Without this return the collected rows were silently lost.
    return collected
def crawMorningDailyNews(link):
    """Collect one row per ``boxa``-class element under ``#listArticle``.

    A PhantomJS browser loads ``link``.  For every ``boxa`` element the
    ``<h4>`` gives the title text and, through its nested ``<a>``, the link;
    the first ``<p>`` gives the description.  The image is the ``src`` of
    the ``<img>`` inside the ``pic``-class child when there is one,
    otherwise ``CommonsInitValue.initTempImage()``.  The publication date
    is the local time at scraping, formatted ``"%Y-%m-%d %X"``.

    Args:
        link: URL of the page to load.

    Returns:
        A list of rows, each of the form
        ``[id, linkUrl, imageUrl, title, pubDate, description, 'STOCK',
        'CXNET']`` where ``id`` is ``str(uuid.uuid1())``.

    Raises:
        NoSuchElementException: propagated from Selenium when
            ``listArticle`` or an entry's ``<h4>``/``<a>``/``<p>`` is
            missing (only the missing image case is handled here).
    """
    collected = []
    driver = webdriver.PhantomJS()
    try:
        driver.get(link)
        articleList = driver.find_element_by_id('listArticle')
        boxes = articleList.find_elements_by_class_name('boxa')
        # Resolved once up front and reused for every picture-less entry.
        defaultImage = CommonsInitValue.initTempImage()
        for box in boxes:
            try:
                picHolder = box.find_element_by_class_name('pic')
                image = picHolder.find_element_by_tag_name('img')
                imageUrl = image.get_attribute('src')
            except NoSuchElementException:
                imageUrl = defaultImage
            # One <h4> lookup serves both the title text and the link.
            headline = box.find_element_by_tag_name('h4')
            title = headline.text
            linkUrl = headline.find_element_by_tag_name('a') \
                .get_attribute('href')
            descriptContext = box.find_element_by_tag_name('p').text
            pubDate = time.strftime("%Y-%m-%d %X", time.localtime())
            collected.append([
                str(uuid.uuid1()), linkUrl, imageUrl, title, pubDate,
                descriptContext, 'STOCK', 'CXNET'
            ])
    finally:
        # Release the PhantomJS process whether or not scraping succeeded.
        driver.quit()
    # Without this return the collected rows were silently lost.
    return collected