def crawCNFinanceNetDailyNews(link):
    """Scrape the article list rendered at *link* and return one row per item.

    The page is loaded in a PhantomJS browser and every element with the CSS
    class ``art-list`` is turned into a row of the form::

        [uuid string, article URL, image URL, title, publish date,
         description, 'CHINA', '21CNNET']

    The description is the text of the item's ``pic-details`` element and the
    publish date is derived from the text of its ``time`` element.

    Any exception raised while loading or parsing the page (for example a
    Selenium lookup failure for a missing child element) propagates to the
    caller; the browser is shut down first in every case.
    """
    currentList = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        mainlist = browsor.find_elements_by_class_name('art-list')
        for context in mainlist:
            # Look the anchor up once; it supplies both the URL and the title.
            anchor = context.find_element_by_tag_name('a')
            linkUrl = anchor.get_attribute('href')
            title = anchor.text
            descriptContext = context.find_element_by_class_name('pic-details').text
            timeText = context.find_element_by_class_name('time').text
            # NOTE(review): presumably returnCreateDate yields the date part
            # and splitCreateDate(..., ' ', 1) the time-of-day part of
            # timeText -- confirm against CommonsInitValue.
            dateText = CommonsInitValue.returnCreateDate(timeText)
            currentTime = CommonsInitValue.splitCreateDate(timeText, ' ', 1)
            pubDate = dateText + ' ' + currentTime
            imageUrl = CommonsInitValue.initTempImage()
            currentList.append([
                str(uuid.uuid1()), linkUrl, imageUrl, title, pubDate,
                descriptContext, 'CHINA', '21CNNET'
            ])
    finally:
        # Always terminate the PhantomJS process; without this every call
        # leaves a headless browser running.
        browsor.quit()
    return currentList
def crawCNStockNetDailyNews(link):
    """Scrape the article list rendered at *link* and return one row per item.

    The page is loaded in a PhantomJS browser and every element with the CSS
    class ``art-list`` is turned into a row of the form::

        [uuid string, article URL, image URL, title, publish date,
         description, 'STOCK', '21CNNET']

    The description is the text of the item's ``pic-details`` element and the
    publish date is derived from the text of its ``time`` element.

    Any exception raised while loading or parsing the page (for example a
    Selenium lookup failure for a missing child element) propagates to the
    caller; the browser is shut down first in every case.
    """
    currentList = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        mainlist = browsor.find_elements_by_class_name('art-list')
        for context in mainlist:
            # Look the anchor up once; it supplies both the URL and the title.
            anchor = context.find_element_by_tag_name('a')
            linkUrl = anchor.get_attribute('href')
            title = anchor.text
            descriptContext = context.find_element_by_class_name('pic-details').text
            timeText = context.find_element_by_class_name('time').text
            # NOTE(review): presumably returnCreateDate yields the date part
            # and splitCreateDate(..., ' ', 1) the time-of-day part of
            # timeText -- confirm against CommonsInitValue.
            dateText = CommonsInitValue.returnCreateDate(timeText)
            currentTime = CommonsInitValue.splitCreateDate(timeText, ' ', 1)
            pubDate = dateText + ' ' + currentTime
            imageUrl = CommonsInitValue.initTempImage()
            currentList.append([
                str(uuid.uuid1()), linkUrl, imageUrl, title, pubDate,
                descriptContext, 'STOCK', '21CNNET'
            ])
    finally:
        # Always terminate the PhantomJS process; without this every call
        # leaves a headless browser running.
        browsor.quit()
    return currentList
def crawHeXunForexImage(link, keyList):
    """Collect image entries from the ``tupianpindao`` section of *link*.

    The page is loaded in a PhantomJS browser. For every ``div`` inside the
    element with class ``tupianpindao`` the anchor URL, image ``src`` and the
    text of the ``p`` element are read. Entries whose image URL is already in
    *keyList* are skipped; a ``div`` missing any of the three child elements
    is skipped as well.

    For each accepted entry two rows are built:

    * ``[id, image URL, link URL, publish date, 'HEXUNFOREXNET', description]``
    * ``[id, link URL]``

    NOTE(review): the visible code only accumulates these rows; it neither
    returns nor stores them. Confirm whether a return or persistence step is
    missing. This file also contains a second definition with the same name.
    """
    currentArray = []
    detaiArray = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        imageList = browsor.find_element_by_class_name('tupianpindao')
        mainList = imageList.find_elements_by_tag_name('div')
        for context in mainList:
            try:
                linkObj = context.find_element_by_tag_name('a')
                linkUrl = linkObj.get_attribute('href')
                imageUrl = context.find_element_by_tag_name('img').get_attribute('src')
                # NOTE(review): presumably picks a date-like segment out of
                # the URL path -- confirm against CommonsInitValue.
                pubDate = CommonsInitValue.splitCreateDate(linkUrl, '/', 3)
                descriptContext = context.find_element_by_tag_name('p').text
                if imageUrl not in keyList:
                    mianId = str(uuid.uuid1())
                    currentArray.append([
                        mianId, imageUrl, linkUrl, pubDate, 'HEXUNFOREXNET',
                        descriptContext
                    ])
                    detaiArray.append([mianId, linkUrl])
            except NoSuchElementException:
                # Not every div is an image card; skip the ones lacking an
                # anchor, image or paragraph.
                continue
    finally:
        # Always terminate the PhantomJS process; without this every call
        # leaves a headless browser running.
        browsor.quit()
def crawHeXunForexImage(link, keyList):
    """Gather image cards from the ``tupianpindao`` block of the page at *link*.

    A PhantomJS browser renders the page, then each ``div`` under the element
    with class ``tupianpindao`` is inspected for an anchor (``href``), an
    image (``src``) and a paragraph (text). Cards whose image URL appears in
    *keyList* are ignored, and so is any ``div`` that lacks one of those
    three children.

    Each accepted card produces:

    * a main row ``[id, image URL, link URL, publish date, 'HEXUNFOREXNET',
      description]``
    * a detail row ``[id, link URL]``

    NOTE(review): as far as the visible code goes, both row lists are built
    but never returned or saved -- confirm whether the function is meant to
    hand them back. This definition reuses the name of an earlier function in
    this file.
    """
    currentArray = []
    detaiArray = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        imageList = browsor.find_element_by_class_name('tupianpindao')
        mainList = imageList.find_elements_by_tag_name('div')
        for context in mainList:
            try:
                linkObj = context.find_element_by_tag_name('a')
                linkUrl = linkObj.get_attribute('href')
                imageUrl = context.find_element_by_tag_name('img').get_attribute('src')
                # NOTE(review): presumably extracts a date-like segment from
                # the URL path -- confirm against CommonsInitValue.
                pubDate = CommonsInitValue.splitCreateDate(linkUrl, '/', 3)
                descriptContext = context.find_element_by_tag_name('p').text
                if imageUrl not in keyList:
                    mianId = str(uuid.uuid1())
                    currentArray.append([
                        mianId, imageUrl, linkUrl, pubDate, 'HEXUNFOREXNET',
                        descriptContext
                    ])
                    detaiArray.append([mianId, linkUrl])
            except NoSuchElementException:
                # Divs without the expected anchor/image/paragraph are not
                # image cards; move on to the next one.
                continue
    finally:
        # Shut the headless browser down even when loading or parsing fails,
        # so no PhantomJS process is left behind.
        browsor.quit()