예제 #1
0
def crawYCFinanceHLDataSource(link):
    """Scrape headline entries from the page at ``link``.

    Every ``<dl>`` element on the page is treated as one candidate entry. A
    candidate is skipped when its ``<h1>``, the ``<a>`` inside that ``<h1>``
    or its ``<p>`` cannot be found. A missing ``<img>`` is not fatal: the
    value of ``CommonsInitValue.initTempImage()`` is used instead.

    Args:
        link: URL to load in a PhantomJS browser.

    Returns:
        A list of rows shaped
        ``[id, linkUrl, imageUrl, title, pubDate, description, 'CHINA', 'YCNET']``.
        ``id`` is a fresh ``uuid1`` string; ``pubDate`` is whatever
        ``CommonsInitValue.initNowTime()`` returns (presumably the current
        time -- confirm against that helper).
    """
    listArray = []
    browser = webdriver.PhantomJS()
    try:
        browser.get(link)
        for currentDiv in browser.find_elements_by_tag_name('dl'):
            try:
                titleObj = currentDiv.find_element_by_tag_name('h1')
                title = titleObj.text
                linkUrl = titleObj.find_element_by_tag_name('a').get_attribute(
                    'href')
                descriptContext = currentDiv.find_element_by_tag_name('p').text
            except NoSuchElementException:
                # Not a complete news entry (e.g. a layout <dl>); ignore it.
                continue
            pubDate = CommonsInitValue.initNowTime()
            try:
                imageUrl = currentDiv.find_element_by_tag_name(
                    'img').get_attribute('src')
            except NoSuchElementException:
                # Entries without a picture get the shared fallback image.
                imageUrl = CommonsInitValue.initTempImage()
            listArray.append([
                str(uuid.uuid1()), linkUrl, imageUrl, title, pubDate,
                descriptContext, 'CHINA', 'YCNET'
            ])
    finally:
        # Always shut the browser down so no PhantomJS process is leaked,
        # even when loading or parsing the page fails.
        browser.quit()
    return listArray
def crawYiCaiStockDailyNews(link):
    """Scrape stock news entries from the page at ``link``.

    Every ``<dl>`` element on the page is treated as one candidate entry. A
    candidate is skipped when it has no ``<h1>``, ``<p>`` or ``<a>``
    descendant. A missing ``<img>`` is not fatal: the value of
    ``CommonsInitValue.initTempImage()`` is used instead.

    Args:
        link: URL to load in a PhantomJS browser.

    Returns:
        A list of rows shaped
        ``[id, linkUrl, imageUrl, title, pubDate, description, 'STOCK', 'YICAINET']``.
        ``id`` is a fresh ``uuid1`` string; ``pubDate`` is whatever
        ``CommonsInitValue.initNowTime()`` returns (presumably the current
        time -- confirm against that helper).
    """
    currentArray = []
    browser = webdriver.PhantomJS()
    try:
        browser.get(link)
        for context in browser.find_elements_by_tag_name('dl'):
            try:
                titleValue = context.find_element_by_tag_name('h1')
                descriptContext = context.find_element_by_tag_name('p').text
                # Note: this is the first <a> anywhere inside the <dl>, not
                # necessarily the one inside the <h1>.
                linkUrl = context.find_element_by_tag_name('a').get_attribute(
                    'href')
            except NoSuchElementException:
                # Not a complete news entry; ignore it.
                continue
            pubDate = CommonsInitValue.initNowTime()
            try:
                imageUrl = context.find_element_by_tag_name(
                    'img').get_attribute('src')
            except NoSuchElementException:
                # Entries without a picture get the shared fallback image.
                imageUrl = CommonsInitValue.initTempImage()
            title = titleValue.text
            currentArray.append([
                str(uuid.uuid1()), linkUrl, imageUrl, title, pubDate,
                descriptContext, 'STOCK', 'YICAINET'
            ])
    finally:
        # Always shut the browser down so no PhantomJS process is leaked,
        # even when loading or parsing the page fails.
        browser.quit()
    return currentArray
def crawMorningFinanceDailyNews(linkUrl):
    """Scrape the news list from the page at ``linkUrl``.

    Each ``<li>`` under the element with id ``list01`` is one entry. Its first
    ``<a>`` provides the link and title, the element with class ``date`` the
    publication date text, and its first ``<p>`` the description. Every entry
    uses ``CommonsInitValue.initTempImage()`` as its image.

    Args:
        linkUrl: URL to load in a PhantomJS browser.

    Returns:
        A list of rows shaped
        ``[id, linkUrl, imageUrl, title, pubDate, description, 'CHINA', 'IFengNET']``
        where ``id`` is a fresh ``uuid1`` string.

    Raises:
        NoSuchElementException: propagated from Selenium when ``list01`` or
            any of the per-entry elements is missing.
    """
    currentArray = []
    browser = webdriver.PhantomJS()
    try:
        browser.get(linkUrl)
        mainList = browser.find_element_by_id(
            'list01').find_elements_by_tag_name('li')
        for context in mainList:
            # Look the anchor up once; it supplies both the URL and the title.
            anchor = context.find_element_by_tag_name('a')
            itemUrl = anchor.get_attribute('href')
            title = anchor.text
            pubDate = context.find_element_by_class_name('date').text
            descriptContext = context.find_element_by_tag_name('p').text
            imageUrl = CommonsInitValue.initTempImage()
            currentArray.append([
                str(uuid.uuid1()), itemUrl, imageUrl, title, pubDate,
                descriptContext, 'CHINA', 'IFengNET'
            ])
    finally:
        # Always shut the browser down so no PhantomJS process is leaked,
        # even when an element lookup raises.
        browser.quit()
    return currentArray
예제 #4
0
def crawZBNewsNetDataSource(link):
    """Scrape news entries from the page at ``link``.

    Each element with class ``l_title`` is one entry. Its title comes from the
    child with class ``title``, its description from the child with class
    ``text`` and its link from its first ``<a>``. A missing ``<img>`` is not
    fatal: the value of ``CommonsInitValue.initTempImage()`` is used instead.

    Args:
        link: URL to load in a PhantomJS browser.

    Returns:
        A list of rows shaped
        ``[id, linkUrl, imageUrl, title, pubDate, description, 'CHINA', 'ZBNET']``.
        ``id`` is a fresh ``uuid1`` string; ``pubDate`` is whatever
        ``CommonsInitValue.initNowTime()`` returns (presumably the current
        time -- confirm against that helper).

    Raises:
        NoSuchElementException: propagated from Selenium when an entry lacks
            its title, text or anchor element.
    """
    currentArray = []
    browser = webdriver.PhantomJS()
    try:
        browser.get(link)
        for context in browser.find_elements_by_class_name('l_title'):
            pubDate = CommonsInitValue.initNowTime()
            try:
                imageUrl = context.find_element_by_tag_name(
                    'img').get_attribute('src')
            except NoSuchElementException:
                # Entries without a picture get the shared fallback image.
                imageUrl = CommonsInitValue.initTempImage()
            title = context.find_element_by_class_name('title').text
            descriptContext = context.find_element_by_class_name('text').text
            linkUrl = context.find_element_by_tag_name('a').get_attribute(
                'href')
            currentArray.append([
                str(uuid.uuid1()), linkUrl, imageUrl, title, pubDate,
                descriptContext, 'CHINA', 'ZBNET'
            ])
    finally:
        # Always shut the browser down so no PhantomJS process is leaked,
        # even when an element lookup raises.
        browser.quit()
    return currentArray
예제 #5
0
def crawCNFinanceNetDailyNews(link):
    """Scrape finance news entries from the page at ``link``.

    Each element with class ``art-list`` is one entry. Its first ``<a>``
    provides the link and title, the child with class ``pic-details`` the
    description, and the child with class ``time`` the raw time text. Every
    entry uses ``CommonsInitValue.initTempImage()`` as its image.

    Args:
        link: URL to load in a PhantomJS browser.

    Returns:
        A list of rows shaped
        ``[id, linkUrl, imageUrl, title, pubDate, description, 'CHINA', '21CNNET']``.
        ``id`` is a fresh ``uuid1`` string. ``pubDate`` is
        ``returnCreateDate(timeText) + ' ' + splitCreateDate(timeText, ' ', 1)``
        (presumably a date part and a time-of-day part -- confirm against
        ``CommonsInitValue``).

    Raises:
        NoSuchElementException: propagated from Selenium when an entry lacks
            one of the expected child elements.
    """
    currentList = []
    browser = webdriver.PhantomJS()
    try:
        browser.get(link)
        for context in browser.find_elements_by_class_name('art-list'):
            # Look the anchor up once; it supplies both the URL and the title.
            anchor = context.find_element_by_tag_name('a')
            linkUrl = anchor.get_attribute('href')
            title = anchor.text
            descriptContext = context.find_element_by_class_name(
                'pic-details').text
            timeText = context.find_element_by_class_name('time').text
            # Named datePart rather than "datetime" to avoid shadowing the
            # standard-library module name.
            datePart = CommonsInitValue.returnCreateDate(timeText)
            timePart = CommonsInitValue.splitCreateDate(timeText, ' ', 1)
            pubDate = datePart + ' ' + timePart
            imageUrl = CommonsInitValue.initTempImage()
            currentList.append([
                str(uuid.uuid1()), linkUrl, imageUrl, title, pubDate,
                descriptContext, 'CHINA', '21CNNET'
            ])
    finally:
        # Always shut the browser down so no PhantomJS process is leaked,
        # even when an element lookup raises.
        browser.quit()
    return currentList
예제 #6
0
def crawMorningFinanceDailyNews(linkUrl):
    """Scrape the news list from the page at ``linkUrl``.

    Each ``<li>`` under the element with id ``list01`` is one entry. Its first
    ``<a>`` provides the link and title, the element with class ``date`` the
    publication date text, and its first ``<p>`` the description. Every entry
    uses ``CommonsInitValue.initTempImage()`` as its image.

    Args:
        linkUrl: URL to load in a PhantomJS browser.

    Returns:
        A list of rows shaped
        ``[id, linkUrl, imageUrl, title, pubDate, description, 'CHINA', 'IFengNET']``
        where ``id`` is a fresh ``uuid1`` string.

    Raises:
        NoSuchElementException: propagated from Selenium when ``list01`` or
            any of the per-entry elements is missing.
    """
    currentArray = []
    browser = webdriver.PhantomJS()
    try:
        browser.get(linkUrl)
        listRoot = browser.find_element_by_id('list01')
        for context in listRoot.find_elements_by_tag_name('li'):
            # One anchor lookup serves both the URL and the title; a separate
            # local keeps the ``linkUrl`` parameter from being rebound.
            anchor = context.find_element_by_tag_name('a')
            itemUrl = anchor.get_attribute('href')
            title = anchor.text
            pubDate = context.find_element_by_class_name('date').text
            descriptContext = context.find_element_by_tag_name('p').text
            imageUrl = CommonsInitValue.initTempImage()
            currentArray.append([
                str(uuid.uuid1()), itemUrl, imageUrl, title, pubDate,
                descriptContext, 'CHINA', 'IFengNET'
            ])
    finally:
        # Always shut the browser down so no PhantomJS process is leaked,
        # even when an element lookup raises.
        browser.quit()
    return currentArray
예제 #7
0
def crawMorningDailyNews(link):
    """Scrape news entries from the page at ``link``.

    Each element with class ``boxa`` under the element with id
    ``listArticle`` is one entry. Its ``<h4>`` provides the title, the ``<a>``
    inside that ``<h4>`` the link, and its first ``<p>`` the description. The
    image is the ``<img>`` inside the child with class ``pic``; when either is
    missing, ``CommonsInitValue.initTempImage()`` is used instead.

    Args:
        link: URL to load in a PhantomJS browser.

    Returns:
        A list of rows shaped
        ``[id, linkUrl, imageUrl, title, pubDate, description, 'STOCK', 'CXNET']``.
        ``id`` is a fresh ``uuid1`` string and ``pubDate`` is the local time
        at scraping, formatted with ``"%Y-%m-%d %X"``.

    Raises:
        NoSuchElementException: propagated from Selenium when ``listArticle``
            or an entry's heading, anchor or paragraph is missing.
    """
    listArray = []
    browser = webdriver.PhantomJS()
    try:
        browser.get(link)
        mainContext = browser.find_element_by_id('listArticle')
        # Fetched once: the fallback image is the same for every entry.
        initImage = CommonsInitValue.initTempImage()
        for context in mainContext.find_elements_by_class_name('boxa'):
            try:
                imageContext = context.find_element_by_class_name('pic')
                imageUrl = imageContext.find_element_by_tag_name(
                    'img').get_attribute('src')
            except NoSuchElementException:
                imageUrl = initImage
            # Look the heading up once; it supplies both the title and link.
            heading = context.find_element_by_tag_name('h4')
            title = heading.text
            linkUrl = heading.find_element_by_tag_name('a').get_attribute(
                'href')
            descriptContext = context.find_element_by_tag_name('p').text
            pubDate = time.strftime("%Y-%m-%d %X", time.localtime())
            listArray.append([
                str(uuid.uuid1()), linkUrl, imageUrl, title, pubDate,
                descriptContext, 'STOCK', 'CXNET'
            ])
    finally:
        # Always shut the browser down so no PhantomJS process is leaked,
        # even when an element lookup raises.
        browser.quit()
    return listArray
예제 #8
0
def crawZBNewsNetDataSource(link):
    """Scrape news entries from the page at ``link``.

    Each element with class ``l_title`` is one entry. Its title comes from the
    child with class ``title``, its description from the child with class
    ``text`` and its link from its first ``<a>``. A missing ``<img>`` is not
    fatal: the value of ``CommonsInitValue.initTempImage()`` is used instead.

    Args:
        link: URL to load in a PhantomJS browser.

    Returns:
        A list of rows shaped
        ``[id, linkUrl, imageUrl, title, pubDate, description, 'CHINA', 'ZBNET']``.
        ``id`` is a fresh ``uuid1`` string; ``pubDate`` is whatever
        ``CommonsInitValue.initNowTime()`` returns (presumably the current
        time -- confirm against that helper).

    Raises:
        NoSuchElementException: propagated from Selenium when an entry lacks
            its title, text or anchor element.
    """
    currentArray = []
    browser = webdriver.PhantomJS()
    try:
        browser.get(link)
        contextList = browser.find_elements_by_class_name('l_title')
        for context in contextList:
            pubDate = CommonsInitValue.initNowTime()
            try:
                imageObj = context.find_element_by_tag_name('img')
                imageUrl = imageObj.get_attribute('src')
            except NoSuchElementException:
                # Entries without a picture get the shared fallback image.
                imageUrl = CommonsInitValue.initTempImage()
            title = context.find_element_by_class_name('title').text
            descriptContext = context.find_element_by_class_name('text').text
            linkUrl = context.find_element_by_tag_name('a').get_attribute(
                'href')
            currentArray.append([
                str(uuid.uuid1()), linkUrl, imageUrl, title, pubDate,
                descriptContext, 'CHINA', 'ZBNET'
            ])
    finally:
        # Always shut the browser down so no PhantomJS process is leaked,
        # even when an element lookup raises.
        browser.quit()
    return currentArray
def crawCNStockNetDailyNews(link):
    """Scrape stock news entries from the page at ``link``.

    Each element with class ``art-list`` is one entry. Its first ``<a>``
    provides the link and title, the child with class ``pic-details`` the
    description, and the child with class ``time`` the raw time text. Every
    entry uses ``CommonsInitValue.initTempImage()`` as its image.

    Args:
        link: URL to load in a PhantomJS browser.

    Returns:
        A list of rows shaped
        ``[id, linkUrl, imageUrl, title, pubDate, description, 'STOCK', '21CNNET']``.
        ``id`` is a fresh ``uuid1`` string. ``pubDate`` is
        ``returnCreateDate(timeText) + ' ' + splitCreateDate(timeText, ' ', 1)``
        (presumably a date part and a time-of-day part -- confirm against
        ``CommonsInitValue``).

    Raises:
        NoSuchElementException: propagated from Selenium when an entry lacks
            one of the expected child elements.
    """
    currentList = []
    browser = webdriver.PhantomJS()
    try:
        browser.get(link)
        for context in browser.find_elements_by_class_name('art-list'):
            # Look the anchor up once; it supplies both the URL and the title.
            anchor = context.find_element_by_tag_name('a')
            linkUrl = anchor.get_attribute('href')
            title = anchor.text
            descriptContext = context.find_element_by_class_name(
                'pic-details').text
            timeText = context.find_element_by_class_name('time').text
            # Named datePart rather than "datetime" to avoid shadowing the
            # standard-library module name.
            datePart = CommonsInitValue.returnCreateDate(timeText)
            timePart = CommonsInitValue.splitCreateDate(timeText, ' ', 1)
            pubDate = datePart + ' ' + timePart
            imageUrl = CommonsInitValue.initTempImage()
            currentList.append([
                str(uuid.uuid1()), linkUrl, imageUrl, title, pubDate,
                descriptContext, 'STOCK', '21CNNET'
            ])
    finally:
        # Always shut the browser down so no PhantomJS process is leaked,
        # even when an element lookup raises.
        browser.quit()
    return currentList
예제 #10
0
def crawYiCaiStockDailyNews(link):
    """Scrape stock news entries from the page at ``link``.

    Every ``<dl>`` element on the page is treated as one candidate entry. A
    candidate is skipped when it has no ``<h1>``, ``<p>`` or ``<a>``
    descendant. A missing ``<img>`` is not fatal: the value of
    ``CommonsInitValue.initTempImage()`` is used instead.

    Args:
        link: URL to load in a PhantomJS browser.

    Returns:
        A list of rows shaped
        ``[id, linkUrl, imageUrl, title, pubDate, description, 'STOCK', 'YICAINET']``.
        ``id`` is a fresh ``uuid1`` string; ``pubDate`` is whatever
        ``CommonsInitValue.initNowTime()`` returns (presumably the current
        time -- confirm against that helper).
    """
    currentArray = []
    browser = webdriver.PhantomJS()
    try:
        browser.get(link)
        contextArray = browser.find_elements_by_tag_name('dl')
        for context in contextArray:
            try:
                titleValue = context.find_element_by_tag_name('h1')
                descriptContext = context.find_element_by_tag_name('p').text
                # Note: this is the first <a> anywhere inside the <dl>, not
                # necessarily the one inside the <h1>.
                linkUrl = context.find_element_by_tag_name('a').get_attribute(
                    'href')
            except NoSuchElementException:
                # Not a complete news entry; ignore it.
                continue
            pubDate = CommonsInitValue.initNowTime()
            try:
                imageObj = context.find_element_by_tag_name('img')
                imageUrl = imageObj.get_attribute('src')
            except NoSuchElementException:
                # Entries without a picture get the shared fallback image.
                imageUrl = CommonsInitValue.initTempImage()
            title = titleValue.text
            currentArray.append([
                str(uuid.uuid1()), linkUrl, imageUrl, title, pubDate,
                descriptContext, 'STOCK', 'YICAINET'
            ])
    finally:
        # Always shut the browser down so no PhantomJS process is leaked,
        # even when loading or parsing the page fails.
        browser.quit()
    return currentArray
예제 #11
0
def crawYCFinanceHLDataSource(link):
    """Scrape headline entries from the page at ``link``.

    Every ``<dl>`` element on the page is treated as one candidate entry. A
    candidate is skipped when its ``<h1>``, the ``<a>`` inside that ``<h1>``
    or its ``<p>`` cannot be found. A missing ``<img>`` is not fatal: the
    value of ``CommonsInitValue.initTempImage()`` is used instead.

    Args:
        link: URL to load in a PhantomJS browser.

    Returns:
        A list of rows shaped
        ``[id, linkUrl, imageUrl, title, pubDate, description, 'CHINA', 'YCNET']``.
        ``id`` is a fresh ``uuid1`` string; ``pubDate`` is whatever
        ``CommonsInitValue.initNowTime()`` returns (presumably the current
        time -- confirm against that helper).
    """
    listArray = []
    browser = webdriver.PhantomJS()
    try:
        browser.get(link)
        candidates = browser.find_elements_by_tag_name('dl')
        for currentDiv in candidates:
            try:
                titleObj = currentDiv.find_element_by_tag_name('h1')
                title = titleObj.text
                linkUrl = titleObj.find_element_by_tag_name('a').get_attribute(
                    'href')
                descriptContext = currentDiv.find_element_by_tag_name('p').text
            except NoSuchElementException:
                # Not a complete news entry (e.g. a layout <dl>); ignore it.
                continue
            pubDate = CommonsInitValue.initNowTime()
            try:
                imageObj = currentDiv.find_element_by_tag_name('img')
                imageUrl = imageObj.get_attribute('src')
            except NoSuchElementException:
                # Entries without a picture get the shared fallback image.
                imageUrl = CommonsInitValue.initTempImage()
            listArray.append([
                str(uuid.uuid1()), linkUrl, imageUrl, title, pubDate,
                descriptContext, 'CHINA', 'YCNET'
            ])
    finally:
        # Always shut the browser down so no PhantomJS process is leaked,
        # even when loading or parsing the page fails.
        browser.quit()
    return listArray
예제 #12
0
def crawMorningDailyNews(link):
    """Scrape news entries from the page at ``link``.

    Each element with class ``boxa`` under the element with id
    ``listArticle`` is one entry. Its ``<h4>`` provides the title, the ``<a>``
    inside that ``<h4>`` the link, and its first ``<p>`` the description. The
    image is the ``<img>`` inside the child with class ``pic``; when either is
    missing, ``CommonsInitValue.initTempImage()`` is used instead.

    Args:
        link: URL to load in a PhantomJS browser.

    Returns:
        A list of rows shaped
        ``[id, linkUrl, imageUrl, title, pubDate, description, 'STOCK', 'CXNET']``.
        ``id`` is a fresh ``uuid1`` string and ``pubDate`` is the local time
        at scraping, formatted with ``"%Y-%m-%d %X"``.

    Raises:
        NoSuchElementException: propagated from Selenium when ``listArticle``
            or an entry's heading, anchor or paragraph is missing.
    """
    listArray = []
    browser = webdriver.PhantomJS()
    try:
        browser.get(link)
        mainContext = browser.find_element_by_id('listArticle')
        listContext = mainContext.find_elements_by_class_name('boxa')
        # Fetched once: the fallback image is the same for every entry.
        initImage = CommonsInitValue.initTempImage()
        for context in listContext:
            try:
                imageContext = context.find_element_by_class_name('pic')
                imageUrl = imageContext.find_element_by_tag_name(
                    'img').get_attribute('src')
            except NoSuchElementException:
                imageUrl = initImage
            # Look the heading up once; it supplies both the title and link.
            heading = context.find_element_by_tag_name('h4')
            title = heading.text
            linkUrl = heading.find_element_by_tag_name('a').get_attribute(
                'href')
            descriptContext = context.find_element_by_tag_name('p').text
            pubDate = time.strftime("%Y-%m-%d %X", time.localtime())
            listArray.append([
                str(uuid.uuid1()), linkUrl, imageUrl, title, pubDate,
                descriptContext, 'STOCK', 'CXNET'
            ])
    finally:
        # Always shut the browser down so no PhantomJS process is leaked,
        # even when an element lookup raises.
        browser.quit()
    return listArray