def crawCnForexImages(link, keyList):
    """Collect image entries from the page at *link* that are not yet known.

    Every element with class ``imgModel`` is inspected. Entries whose image
    ``src`` is already contained in *keyList* are skipped.

    Args:
        link: URL of the listing page to load in the headless browser.
        keyList: container of image URLs that were already collected; only
            used for ``in`` membership tests.

    Returns:
        A list of ``[id, imageUrl, linkUrl, pubDate, 'CNFOREXNET']`` rows,
        one per new image. ``id`` is a fresh ``uuid1`` string and ``pubDate``
        is whatever ``CommonsInitValue.returnCreateDate`` yields for the text
        of the entry's first ``<p>`` element.

    Side effects:
        Passes the ``[id, linkUrl]`` pairs of all new entries to
        ``CnForexImageDetailSpider.writeCnForexImageDetail`` (called even
        when the list is empty, as before).
    """
    # NOTE(review): a function with this same name appears again further
    # down in this source; if both live in one module the later definition
    # shadows this one -- confirm and remove the duplicate.
    currentArray = []
    detaiArray = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        for model in browsor.find_elements_by_class_name('imgModel'):
            linkUrl = model.find_element_by_tag_name('a').get_attribute('href')
            imageUrl = model.find_element_by_tag_name('img').get_attribute('src')
            pubDate = CommonsInitValue.returnCreateDate(
                model.find_element_by_tag_name('p').text)
            if imageUrl in keyList:
                continue
            # The same id ties the summary row to its detail row.
            mianId = str(uuid.uuid1())
            currentArray.append(
                [mianId, imageUrl, linkUrl, pubDate, 'CNFOREXNET'])
            detaiArray.append([mianId, linkUrl])
    finally:
        # Always shut the driver down; otherwise each call leaves a
        # PhantomJS process running, even when scraping raised.
        browsor.quit()
    # Everything needed was copied into plain strings above, so the detail
    # writer can run after the browser has been released.
    CnForexImageDetailSpider.writeCnForexImageDetail(detaiArray)
    return currentArray
def crawCNFinanceNetDailyNews(link):
    """Scrape the news entries listed on the page at *link*.

    Every element with class ``art-list`` yields one row. The link and title
    come from the entry's first ``<a>``, the description from the element
    with class ``pic-details`` and the timestamp text from the element with
    class ``time``.

    Args:
        link: URL of the listing page to load in the headless browser.

    Returns:
        A list of ``[id, linkUrl, imageUrl, title, pubDate, description,
        'CHINA', '21CNNET']`` rows. ``id`` is a fresh ``uuid1`` string,
        ``imageUrl`` is the value of ``CommonsInitValue.initTempImage()``,
        and ``pubDate`` joins the results of
        ``CommonsInitValue.returnCreateDate`` and
        ``CommonsInitValue.splitCreateDate(timeText, ' ', 1)`` with a space
        (presumably a date part and a time part -- confirm against those
        helpers).
    """
    currentList = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        for context in browsor.find_elements_by_class_name('art-list'):
            anchor = context.find_element_by_tag_name('a')
            linkUrl = anchor.get_attribute('href')
            title = anchor.text
            descriptContext = context.find_element_by_class_name(
                'pic-details').text
            timeText = context.find_element_by_class_name('time').text
            # Named dateText rather than ``datetime`` so the stdlib module
            # name is not shadowed inside this function.
            dateText = CommonsInitValue.returnCreateDate(timeText)
            currentTime = CommonsInitValue.splitCreateDate(timeText, ' ', 1)
            pubDate = dateText + ' ' + currentTime
            imageUrl = CommonsInitValue.initTempImage()
            currentList.append([
                str(uuid.uuid1()), linkUrl, imageUrl, title, pubDate,
                descriptContext, 'CHINA', '21CNNET'
            ])
    finally:
        # Always shut the driver down; otherwise each call leaves a
        # PhantomJS process running, even when scraping raised.
        browsor.quit()
    return currentList
def crawCnForexImages(link, keyList):
    """Scrape not-yet-seen image entries from a listing page.

    Loads *link* in a headless PhantomJS browser, walks every element with
    class ``imgModel`` and keeps those whose image ``src`` is not contained
    in *keyList*.

    Args:
        link: URL of the page to load.
        keyList: container of already-collected image URLs; only used for
            ``in`` membership tests.

    Returns:
        A list with one ``[id, imageUrl, linkUrl, pubDate, 'CNFOREXNET']``
        row per new image. ``id`` is a newly generated ``uuid1`` string;
        ``pubDate`` is the result of ``CommonsInitValue.returnCreateDate``
        applied to the text of the entry's first ``<p>`` element.

    Side effects:
        Hands the ``[id, linkUrl]`` pairs of the new entries to
        ``CnForexImageDetailSpider.writeCnForexImageDetail``; the call is
        made even when no new entries were found.
    """
    # NOTE(review): an earlier definition with this same name is visible in
    # this source; if both live in one module this later one is the
    # effective definition -- confirm and drop the redundant copy.
    currentArray = []
    detaiArray = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        imageList = browsor.find_elements_by_class_name('imgModel')
        for model in imageList:
            linkUrl = model.find_element_by_tag_name('a').get_attribute('href')
            imageUrl = model.find_element_by_tag_name('img').get_attribute('src')
            pubDate = CommonsInitValue.returnCreateDate(
                model.find_element_by_tag_name('p').text)
            if imageUrl not in keyList:
                # One shared id links the summary row and the detail row.
                mianId = str(uuid.uuid1())
                currentArray.append(
                    [mianId, imageUrl, linkUrl, pubDate, 'CNFOREXNET'])
                detaiArray.append([mianId, linkUrl])
    finally:
        # Release the PhantomJS process on success and on failure alike;
        # without this every call leaks a headless browser.
        browsor.quit()
    # Only plain strings are passed on, so the driver is no longer needed.
    CnForexImageDetailSpider.writeCnForexImageDetail(detaiArray)
    return currentArray
def crawCNStockNetDailyNews(link):
    """Scrape the stock news entries listed on the page at *link*.

    Each element with class ``art-list`` produces one row. Link and title
    are read from the entry's first ``<a>``, the description from the
    element with class ``pic-details`` and the timestamp text from the
    element with class ``time``.

    Args:
        link: URL of the listing page to load in the headless browser.

    Returns:
        A list of ``[id, linkUrl, imageUrl, title, pubDate, description,
        'STOCK', '21CNNET']`` rows. ``id`` is a fresh ``uuid1`` string,
        ``imageUrl`` comes from ``CommonsInitValue.initTempImage()``, and
        ``pubDate`` is the space-joined result of
        ``CommonsInitValue.returnCreateDate`` and
        ``CommonsInitValue.splitCreateDate(timeText, ' ', 1)`` (presumably
        a date part and a time part -- confirm against those helpers).
    """
    currentList = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        for context in browsor.find_elements_by_class_name('art-list'):
            anchor = context.find_element_by_tag_name('a')
            linkUrl = anchor.get_attribute('href')
            title = anchor.text
            descriptContext = context.find_element_by_class_name(
                'pic-details').text
            timeText = context.find_element_by_class_name('time').text
            # Avoid the name ``datetime`` here so the stdlib module is not
            # shadowed inside this function.
            dateText = CommonsInitValue.returnCreateDate(timeText)
            currentTime = CommonsInitValue.splitCreateDate(timeText, ' ', 1)
            pubDate = dateText + ' ' + currentTime
            imageUrl = CommonsInitValue.initTempImage()
            currentList.append([
                str(uuid.uuid1()), linkUrl, imageUrl, title, pubDate,
                descriptContext, 'STOCK', '21CNNET'
            ])
    finally:
        # Release the PhantomJS process on success and on failure alike;
        # without this every call leaks a headless browser.
        browsor.quit()
    return currentList