예제 #1
0
def crawCNFinanceNetDailyNews(link):
    """Collect news rows from the page at ``link`` using a PhantomJS browser.

    One row is produced for every element with class ``art-list``.

    Args:
        link: URL handed to the browser's ``get``.

    Returns:
        A list of rows shaped
        ``[id, linkUrl, imageUrl, title, pubDate, descriptContext,
        'CHINA', '21CNNET']``, where ``id`` is a fresh ``uuid.uuid1()``
        string, ``imageUrl`` comes from ``CommonsInitValue.initTempImage()``
        and ``pubDate`` joins the results of
        ``CommonsInitValue.returnCreateDate`` and
        ``CommonsInitValue.splitCreateDate`` with a single space.

    Raises:
        Exceptions from the element lookups are not caught here; they
        propagate after the browser has been shut down.
    """
    currentList = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        for context in browsor.find_elements_by_class_name('art-list'):
            # One lookup of the first anchor supplies both URL and title.
            anchor = context.find_element_by_tag_name('a')
            linkUrl = anchor.get_attribute('href')
            title = anchor.text
            descriptContext = context.find_element_by_class_name(
                'pic-details').text
            timeText = context.find_element_by_class_name('time').text
            # Named dateText (not datetime) so the stdlib module name is
            # not shadowed inside this function.
            dateText = CommonsInitValue.returnCreateDate(timeText)
            currentTime = CommonsInitValue.splitCreateDate(timeText, ' ', 1)
            pubDate = dateText + ' ' + currentTime
            imageUrl = CommonsInitValue.initTempImage()
            currentList.append([
                str(uuid.uuid1()), linkUrl, imageUrl, title, pubDate,
                descriptContext, 'CHINA', '21CNNET'
            ])
    finally:
        # Shut the browser down on success and on error alike; otherwise
        # each call leaves a PhantomJS process running.
        browsor.quit()
    return currentList
def crawCNStockNetDailyNews(link):
    """Collect stock-news rows from the page at ``link`` via PhantomJS.

    Each element with class ``art-list`` on the loaded page becomes one row.

    Args:
        link: URL handed to the browser's ``get``.

    Returns:
        A list of rows shaped
        ``[id, linkUrl, imageUrl, title, pubDate, descriptContext,
        'STOCK', '21CNNET']``. ``id`` is a new ``uuid.uuid1()`` string,
        ``imageUrl`` is whatever ``CommonsInitValue.initTempImage()``
        returns, and ``pubDate`` is the output of
        ``CommonsInitValue.returnCreateDate`` followed by a space and the
        output of ``CommonsInitValue.splitCreateDate``.

    Raises:
        Element-lookup exceptions are not handled here and propagate to
        the caller once the browser has been closed.
    """
    currentList = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        mainlist = browsor.find_elements_by_class_name('art-list')
        for context in mainlist:
            # Reuse a single anchor lookup for both the href and the text.
            anchor = context.find_element_by_tag_name('a')
            linkUrl = anchor.get_attribute('href')
            title = anchor.text
            descriptContext = context.find_element_by_class_name(
                'pic-details').text
            timeText = context.find_element_by_class_name('time').text
            # Avoid the name "datetime" so the stdlib module is not shadowed.
            createDate = CommonsInitValue.returnCreateDate(timeText)
            currentTime = CommonsInitValue.splitCreateDate(timeText, ' ', 1)
            pubDate = createDate + ' ' + currentTime
            imageUrl = CommonsInitValue.initTempImage()
            currentList.append([
                str(uuid.uuid1()), linkUrl, imageUrl, title, pubDate,
                descriptContext, 'STOCK', '21CNNET'
            ])
    finally:
        # Without this the PhantomJS process outlives the call, including
        # when a lookup above raises.
        browsor.quit()
    return currentList
예제 #3
0
def crawHeXunForexImage(link, keyList):
    """Scan the ``tupianpindao`` section of ``link`` for image entries.

    Every ``div`` under the element with class ``tupianpindao`` is
    inspected. A ``div`` lacking an ``a``, ``img`` or ``p`` child is
    skipped. An entry is recorded only when its image URL is not already
    in ``keyList``.

    Args:
        link: URL handed to the browser's ``get``.
        keyList: container of image URLs to skip (tested with ``in``).

    NOTE(review): the code visible here builds ``currentArray`` and
    ``detaiArray`` but never returns or stores them, so the function
    returns ``None``. Confirm whether a return statement was lost.
    """
    currentArray = []
    detaiArray = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        imageList = browsor.find_element_by_class_name('tupianpindao')
        for context in imageList.find_elements_by_tag_name('div'):
            try:
                linkUrl = context.find_element_by_tag_name('a').get_attribute('href')
                imageUrl = context.find_element_by_tag_name('img').get_attribute('src')
                pubDate = CommonsInitValue.splitCreateDate(linkUrl, '/', 3)
                descriptContext = context.find_element_by_tag_name('p').text
                if imageUrl not in keyList:
                    # The same id ties the image row to its detail row.
                    mianId = str(uuid.uuid1())
                    currentArray.append([mianId, imageUrl, linkUrl, pubDate,
                                         'HEXUNFOREXNET', descriptContext])
                    detaiArray.append([mianId, linkUrl])
            # "except X, e" is Python 2-only syntax; the bound name was
            # unused, so this form works on both Python 2 and 3.
            except NoSuchElementException:
                continue
    finally:
        # Shut PhantomJS down even when the page lacks the expected
        # container, so no browser process is left behind.
        browsor.quit()
예제 #4
0
def crawHeXunForexImage(link, keyList):
    """Gather image entries from the ``tupianpindao`` block of ``link``.

    Walks each ``div`` inside the element with class ``tupianpindao``.
    When a ``div`` has no ``a``, ``img`` or ``p`` child it is skipped.
    Entries whose image URL already appears in ``keyList`` are ignored.

    Args:
        link: URL handed to the browser's ``get``.
        keyList: container of already-known image URLs (tested with ``in``).

    NOTE(review): within the lines visible here the collected
    ``currentArray`` / ``detaiArray`` lists are neither returned nor
    stored. Verify the intended result against the callers.
    """
    currentArray = []
    detaiArray = []
    browsor = webdriver.PhantomJS()
    try:
        browsor.get(link)
        imageList = browsor.find_element_by_class_name('tupianpindao')
        mainList = imageList.find_elements_by_tag_name('div')
        for context in mainList:
            try:
                linkObj = context.find_element_by_tag_name('a')
                linkUrl = linkObj.get_attribute('href')
                imageUrl = context.find_element_by_tag_name(
                    'img').get_attribute('src')
                pubDate = CommonsInitValue.splitCreateDate(linkUrl, '/', 3)
                descriptContext = context.find_element_by_tag_name('p').text
                if imageUrl not in keyList:
                    # One generated id links the image row and detail row.
                    mianId = str(uuid.uuid1())
                    currentArray.append([
                        mianId, imageUrl, linkUrl, pubDate, 'HEXUNFOREXNET',
                        descriptContext
                    ])
                    detaiArray.append([mianId, linkUrl])
            # The original "except X, e" form is a SyntaxError on
            # Python 3, and the bound exception was never used.
            except NoSuchElementException:
                continue
    finally:
        # Release the PhantomJS process whether or not scraping succeeded.
        browsor.quit()