# Shared imports assumed by the crawler examples below; CommonsInitValue and the
# various *Spider helpers are project-specific modules from the original crawler
# package and are not shown here.
import time
import uuid

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException


def crawYiCaiStockDailyNews(link):
    currentArray = []
    browsor = webdriver.PhantomJS()
    browsor.get(link)
    contextArray = browsor.find_elements_by_tag_name('dl')
    for context in contextArray:
        try:
            titleValue = context.find_element_by_tag_name('h1')
            descriptContext = context.find_element_by_tag_name('p').text
            pubDate = CommonsInitValue.initNowTime()
            linkUrl = context.find_element_by_tag_name('a').get_attribute(
                'href')
            try:
                imageObj = context.find_element_by_tag_name('img')
                imageUrl = imageObj.get_attribute('src')
            except NoSuchElementException:
                imageUrl = CommonsInitValue.initTempImage()

        except NoSuchElementException:
            continue
        title = titleValue.text
        currentArray.append([
            str(uuid.uuid1()), linkUrl, imageUrl, title, pubDate,
            descriptContext, 'STOCK', 'YICAINET'
        ])
    return currentArray
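
# Hypothetical usage sketch (not part of the original source): crawYiCaiStockDailyNews,
# like most of the news crawlers below, returns rows shaped
#   [uuid, linkUrl, imageUrl, title, pubDate, description, category, source]
# so a caller only needs to pass the listing URL and consume the rows.
def _demoListYiCaiRows(link):
    rows = crawYiCaiStockDailyNews(link)
    for row in rows:
        # row[3] is the headline, row[1] the article link
        print row[3] + ' -> ' + row[1]
    return rows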
def crawYCFinanceHLDataSource(link):
    listArray = []
    browsor = webdriver.PhantomJS()
    browsor.get(link)
    courrentContext = browsor.find_elements_by_tag_name('dl')

    for currentDiv in courrentContext:
        try:
            titleObj = currentDiv.find_element_by_tag_name('h1')
            title = titleObj.text
            linkUrl = titleObj.find_element_by_tag_name('a').get_attribute(
                'href')
            descriptContext = currentDiv.find_element_by_tag_name('p').text
            pubDate = CommonsInitValue.initNowTime()
            try:
                imageObj = currentDiv.find_element_by_tag_name('img')
                imageUrl = imageObj.get_attribute('src')
            except NoSuchElementException:
                imageUrl = CommonsInitValue.initTempImage()
        except NoSuchElementException:
            continue
        listArray.append([
            str(uuid.uuid1()), linkUrl, imageUrl, title, pubDate,
            descriptContext, 'CHINA', 'YCNET'
        ])
    return listArray
def crawMorningOilDailyNews(linkUrl):
    currentArray = []
    browsor = webdriver.PhantomJS()
    browsor.get(linkUrl)
    maincontext = browsor.find_element_by_class_name('news_list_all').find_elements_by_tag_name('li')
    for context in maincontext:
        imageUrl = CommonsInitValue.initoiltempimage()
        descriptContext = context.find_element_by_tag_name('p').text
        linkUrl = context.find_element_by_tag_name('a').get_attribute('href')
        title = context.find_element_by_tag_name('a').text
        pubDate = CommonsInitValue.initNowTime()
        currentArray.append([str(uuid.uuid1()),linkUrl,imageUrl,title,pubDate,descriptContext,'OIL','XIDU'])
    return currentArray
def crawZBNewsNetDataSource(link):
    currentArray = []
    browsor = webdriver.PhantomJS()
    browsor.get(link)
    contextList = browsor.find_elements_by_class_name('l_title')
    for context in contextList:
        pubDate = CommonsInitValue.initNowTime()
        try:
            imageUrl = context.find_element_by_tag_name('img').get_attribute('src')
        except NoSuchElementException:
            imageUrl = CommonsInitValue.initTempImage()
        title = context.find_element_by_class_name('title').text
        descriptContext = context.find_element_by_class_name('text').text
        linkUrl = context.find_element_by_tag_name('a').get_attribute('href')
        currentArray.append([str(uuid.uuid1()),linkUrl,imageUrl,title,pubDate,descriptContext,'CHINA','ZBNET'])
    return currentArray
def crawCNFinanceNetDailyNews(link):
    currentList = []
    browsor = webdriver.PhantomJS()
    browsor.get(link)
    mainlist = browsor.find_elements_by_class_name('art-list')
    for context in mainlist:
        linkUrl = context.find_element_by_tag_name('a').get_attribute('href')
        title = context.find_element_by_tag_name('a').text
        descriptContext = context.find_element_by_class_name('pic-details').text
        timeText = context.find_element_by_class_name('time').text
        datetime = CommonsInitValue.returnCreateDate(timeText)
        currentTime = CommonsInitValue.splitCreateDate(timeText,' ',1)
        pubDate =datetime+' '+currentTime
        imageUrl = CommonsInitValue.initTempImage()
        currentList.append([str(uuid.uuid1()),linkUrl,imageUrl,title,pubDate,descriptContext,'CHINA','21CNNET'])
    return currentList
def crawMorningOilDailyNews(linkUrl):
    currentArray = []
    browsor = webdriver.PhantomJS()
    browsor.get(linkUrl)
    maincontext = browsor.find_element_by_class_name(
        'news_list_all').find_elements_by_tag_name('li')
    for context in maincontext:
        imageUrl = CommonsInitValue.initoiltempimage()
        descriptContext = context.find_element_by_tag_name('p').text
        linkUrl = context.find_element_by_tag_name('a').get_attribute('href')
        title = context.find_element_by_tag_name('a').text
        pubDate = CommonsInitValue.initNowTime()
        currentArray.append([
            str(uuid.uuid1()), linkUrl, imageUrl, title, pubDate,
            descriptContext, 'OIL', 'XIDU'
        ])
    return currentArray
def crawZBNewsNetDataSource(link):
    currentArray = []
    browsor = webdriver.PhantomJS()
    browsor.get(link)
    contextList = browsor.find_elements_by_class_name('l_title')
    for context in contextList:
        pubDate = CommonsInitValue.initNowTime()
        try:
            imageUrl = context.find_element_by_tag_name('img').get_attribute(
                'src')
        except NoSuchElementException:
            imageUrl = CommonsInitValue.initTempImage()
        title = context.find_element_by_class_name('title').text
        descriptContext = context.find_element_by_class_name('text').text
        linkUrl = context.find_element_by_tag_name('a').get_attribute('href')
        currentArray.append([
            str(uuid.uuid1()), linkUrl, imageUrl, title, pubDate,
            descriptContext, 'CHINA', 'ZBNET'
        ])
    return currentArray
def crawDailyOilNews():

    currentList = []
    currentTime = CommonsInitValue.initNowTime()

    print '----START CRAW XDNETNEWS OIL NEWS----'
    try:
        XDNetOilNewsSpider.writeMorningOilDailyNews()
    except Exception as e:
        currentList.append([currentTime,str(uuid.uuid1()),'XDNetOilNewsSpider.writeMorningOilDailyNews',e])
def crawDailyStockComments(link):
    currentList = []
    browsor = webdriver.PhantomJS()
    browsor.get(link)
    mainlist = browsor.find_element_by_class_name("ul-news-list").find_elements_by_tag_name("li")
    for context in mainlist:
        linkUrl = context.find_element_by_tag_name("a").get_attribute("href")
        title = context.text
        pubDate = CommonsInitValue.initNowTime()
        currentList.append([str(uuid.uuid1()), linkUrl, title, pubDate, "[...]", "STOCK", "HGNET"])
    return currentList
def crawFinanceHLDataSource(link):
    currentList = []
    browsor = webdriver.PhantomJS()
    browsor.get(link)
    maincontext = browsor.find_element_by_id('news_pic').find_element_by_class_name('changeDiv')
    linkUrl = maincontext.find_element_by_tag_name('a').get_attribute('href')
    pubDate = CommonsInitValue.initNowTime()
    imageUrl = maincontext.find_element_by_tag_name('img').get_attribute('src')
    title = maincontext.find_element_by_tag_name('img').get_attribute('alt')
    currentList.append([str(uuid.uuid1()),linkUrl,imageUrl,title,pubDate,'[...]','MACRO','TAKCHINA'])
    return currentList
def crawCNStockNetDailyNews(link):
    currentList = []
    browsor = webdriver.PhantomJS()
    browsor.get(link)
    mainlist = browsor.find_elements_by_class_name('art-list')
    for context in mainlist:
        linkUrl = context.find_element_by_tag_name('a').get_attribute('href')
        title = context.find_element_by_tag_name('a').text
        descriptContext = context.find_element_by_class_name(
            'pic-details').text
        timeText = context.find_element_by_class_name('time').text
        datetime = CommonsInitValue.returnCreateDate(timeText)
        currentTime = CommonsInitValue.splitCreateDate(timeText, ' ', 1)
        pubDate = datetime + ' ' + currentTime
        imageUrl = CommonsInitValue.initTempImage()
        currentList.append([
            str(uuid.uuid1()), linkUrl, imageUrl, title, pubDate,
            descriptContext, 'STOCK', '21CNNET'
        ])
    return currentList
def crawThemeNews():

    currentList = []
    currentTime = CommonsInitValue.initNowTime()

    # CRAW THE IMPORTANT NEWS
    print '----START CRAW THE IMPORTANT NEWS----'
    try:
        ImportantNewsSpider.writeCompanyNews()
    except Exception as e:
        currentList.append([currentTime,str(uuid.uuid1()),'ImportantNewsSpider.writeCompanyNews',e])
def crawDataCenter():

    currentList = []
    currentTime = CommonsInitValue.initNowTime()

    # CRAW FOREXGOLD DATA SPIDER
    print '----START CRAW FOREXGOLD DATA----'
    try:
        ForexGoldDataNetSpider.writeForexGoldDataSource()
    except Exception as e:
        currentList.append([currentTime,str(uuid.uuid1()),'ForexGoldDataNetSpider.writeForexGoldDataSource()',e])
def crawYiCaiStockDailyNews(link):
    currentArray = []
    browsor = webdriver.PhantomJS()
    browsor.get(link)
    contextArray = browsor.find_elements_by_tag_name('dl')
    for context in contextArray:
        try:
            titleValue = context.find_element_by_tag_name('h1')
            descriptContext = context.find_element_by_tag_name('p').text
            pubDate = CommonsInitValue.initNowTime()
            linkUrl = context.find_element_by_tag_name('a').get_attribute('href')
            try:
                imageObj = context.find_element_by_tag_name('img')
                imageUrl = imageObj.get_attribute('src')
            except NoSuchElementException:
                imageUrl = CommonsInitValue.initTempImage()
        except NoSuchElementException:
            continue
        title = titleValue.text
        currentArray.append([str(uuid.uuid1()),linkUrl,imageUrl,title,pubDate,descriptContext,'STOCK','YICAINET'])
    return currentArray
def crawYCFinanceHLDataSource(link):
    listArray = []
    browsor = webdriver.PhantomJS()
    browsor.get(link)
    courrentContext = browsor.find_elements_by_tag_name('dl')

    for currentDiv in courrentContext:
        try:
            titleObj = currentDiv.find_element_by_tag_name('h1')
            title = titleObj.text
            linkUrl = titleObj.find_element_by_tag_name('a').get_attribute('href')
            descriptContext = currentDiv.find_element_by_tag_name('p').text
            pubDate = CommonsInitValue.initNowTime()
            try:
                imageObj = currentDiv.find_element_by_tag_name('img')
                imageUrl = imageObj.get_attribute('src')
            except NoSuchElementException:
                imageUrl = CommonsInitValue.initTempImage()
        except NoSuchElementException:
            continue
        listArray.append([str(uuid.uuid1()),linkUrl,imageUrl,title,pubDate,descriptContext,'CHINA','YCNET'])
    return listArray
def crawMorningOilDailyNews(linkUrl):
    currentArray = []
    browsor = webdriver.PhantomJS()
    browsor.get(linkUrl)
    mainlist = browsor.find_element_by_id('table').find_elements_by_class_name('evenrow')
    for context in mainlist:
        linkUrl = context.find_element_by_tag_name('a').get_attribute('href')
        title = context.find_element_by_tag_name('a').text
        pubDate = CommonsInitValue.initNowTime()
        if title =='':
            continue
        print title+":"+linkUrl
def crawFinanceHLDataSource(link):
    currentList = []
    browsor = webdriver.PhantomJS()
    browsor.get(link)
    mainContext = browsor.find_element_by_class_name('column11')
    title = mainContext.find_element_by_tag_name('a').text
    linkUrl = mainContext.find_element_by_tag_name('a').get_attribute('href')
    imageUrl = mainContext.find_element_by_tag_name('img').get_attribute('src')
    descriptContext = mainContext.find_element_by_class_name('lead').text
    pubDate = CommonsInitValue.initNowTime()
    currentList.append([str(uuid.uuid1()),linkUrl,imageUrl,title,pubDate,descriptContext,'MACRO','FTCHINA'])
    return currentList
def crawDailyComments(link):

    currentList = []
    browsor = webdriver.PhantomJS()
    browsor.get(link)
    contextList = browsor.find_elements_by_class_name('news-item')
    for mainContext in contextList:
        pubDate = CommonsInitValue.initNowTime()
        title = mainContext.find_element_by_tag_name('a').text
        linkUrl = mainContext.find_element_by_tag_name('a').get_attribute('href')
        descriptContext = mainContext.find_element_by_class_name('desc').text
        currentList.append([str(uuid.uuid1()),linkUrl,title,pubDate,descriptContext,'FOREX','ADSNET'])
    return currentList
def crawDailyOilNews():

    currentList = []
    currentTime = CommonsInitValue.initNowTime()

    print '----START CRAW XDNETNEWS OIL NEWS----'
    try:
        XDNetOilNewsSpider.writeMorningOilDailyNews()
    except Exception as e:
        currentList.append([
            currentTime,
            str(uuid.uuid1()), 'XDNetOilNewsSpider.writeMorningOilDailyNews', e
        ])
def crawMorningDailyNews(linkUrl):
    currentList = []
    browsor = webdriver.PhantomJS()
    browsor.get(linkUrl)
    resultList = browsor.find_elements_by_class_name('mt24')
    for div in resultList:
        imageUrl = div.find_element_by_tag_name('img').get_attribute('src')
        linkUrl = div.find_element_by_tag_name('a').get_attribute('href')
        title = div.find_element_by_tag_name('a').text
        descriptContext  = div.find_element_by_class_name('news-p').text
        pubDate = CommonsInitValue.initNowTime()
        currentList.append([str(uuid.uuid1()),linkUrl,imageUrl,title,pubDate,descriptContext,'CHINA','NBDNET'])
    return currentList
def crawMorningFinanceDailyNews(linkUrl):
    currentArray=[]
    browsor = webdriver.PhantomJS()
    browsor.get(linkUrl)
    mainList = browsor.find_element_by_id('list01').find_elements_by_tag_name('li')
    for context in mainList:
        linkUrl = context.find_element_by_tag_name('a').get_attribute('href')
        title = context.find_element_by_tag_name('a').text
        pubDate = context.find_element_by_class_name('date').text
        descriptContext = context.find_element_by_tag_name('p').text
        imageUrl = CommonsInitValue.initTempImage()
        currentArray.append([str(uuid.uuid1()),linkUrl,imageUrl,title,pubDate,descriptContext,'CHINA','IFengNET'])
    return currentArray
def crawFinanceHLDataSource(link):
    currentArray = []
    browsor = webdriver.PhantomJS()
    browsor.get(link)
    mainlist = browsor.find_element_by_class_name('news_list').find_elements_by_class_name('list')
    for context in mainlist:
        imageUrl = context.find_element_by_tag_name('img').get_attribute('src')
        title = context.find_element_by_class_name('title').text
        linkUrl = context.find_element_by_class_name('title').find_element_by_tag_name('a').get_attribute('href')
        descriptContext = context.find_element_by_tag_name('p').text
        pubDate = CommonsInitValue.initNowTime()
        currentArray.append([str(uuid.uuid1()),linkUrl,imageUrl,title,pubDate,descriptContext,'STOCK','TAKCHINA'])
    return currentArray
def crawMorningFinanceDailyNews(linkUrl):
    currentArray = []
    browsor = webdriver.PhantomJS()
    browsor.get(linkUrl)
    maincontext = browsor.find_element_by_class_name('area_left')\
            .find_elements_by_class_name('list_item')
    for context in maincontext:
        imageUrl = context.find_element_by_tag_name('img').get_attribute('src')
        descriptContext = context.find_element_by_tag_name('p').text
        linkUrl = context.find_element_by_tag_name('a').get_attribute('href')
        title = context.find_element_by_tag_name('h2').text
        pubDate = CommonsInitValue.initNowTime()
        currentArray.append([str(uuid.uuid1()),linkUrl,imageUrl,title,pubDate,descriptContext,'CHINA','XXCB'])
    return currentArray
def crawDailyStockComments(link):
    currentArray = []
    browsor = webdriver.PhantomJS()
    browsor.get(link)
    mainlist = browsor.find_element_by_class_name('w_660')\
        .find_element_by_tag_name('ul')\
        .find_elements_by_tag_name('li')
    for context in mainlist:
        title = context.find_element_by_tag_name('a').text
        linkUrl = context.find_element_by_tag_name('a').get_attribute('href')
        descriptContext = context.find_element_by_tag_name('p').text
        pubDate = CommonsInitValue.initNowTime()
        currentArray.append([str(uuid.uuid1()),linkUrl,title,pubDate,descriptContext,'STOCK','JFNET'])
    return currentArray
def updateDailyForexPic():

    currentList = []
    currentTime = CommonsInitValue.initNowTime()

    log.info('The system crawling the resource of forex picture ')
    print '----START CRAW THE FOREX PICTURE----'
    CnForexImageSpider.writeForexImages()

    print '----START CRAW THE HEXUN PICTURE----'
    try:
        HeXunForexImageSpider.writeHeXunForexImage()
    except Exception as e:
        currentList.append([currentTime,str(uuid.uuid1()),'HeXunForexImageSpider.writeHeXunForexImage',e])
def crawDailyMetalComments(link):
    currentArray =[]
    browsor = webdriver.PhantomJS()
    browsor.get(link)
    contextList = browsor.find_element_by_class_name('right_box796')\
        .find_element_by_tag_name('ul')\
        .find_elements_by_tag_name('li')
    for context in contextList:
        pubDate = context.find_element_by_class_name('time').text
        title = context.find_element_by_tag_name('a').text
        linkUrl = context.find_element_by_tag_name('a').get_attribute('href')
        descriptContext = CommonsInitValue.removeSpecialCharacter(context.text)
        currentArray.append([str(uuid.uuid1()),linkUrl,title,pubDate,descriptContext,'METAL','GXNET'])
    return currentArray
def crawMorningMetalDailyNews(link):
    currentList = []
    browsor = webdriver.PhantomJS()
    browsor.get(link)
    contextList = browsor.find_elements_by_class_name('articleItem')
    for context in contextList:
        try:
            linkUrl = context.find_element_by_class_name('img').get_attribute('href')
            imageUrl = context.find_element_by_class_name('img').find_element_by_tag_name('img').get_attribute('src')
            title = context.find_element_by_class_name('title').text
            pubDate = CommonsInitValue.initNowTime()
            descriptContext = context.find_element_by_tag_name('p').text
        except NoSuchElementException:
            continue
        currentList.append([str(uuid.uuid1()),linkUrl,imageUrl,title,pubDate,descriptContext,'METAL','INVESTINGNET'])
    return currentList
def crawFinanceHLDataSource(link):
    currentList = []
    browsor = webdriver.PhantomJS()
    browsor.get(link)
    mainContext = browsor.find_element_by_class_name('column11')
    title = mainContext.find_element_by_tag_name('a').text
    linkUrl = mainContext.find_element_by_tag_name('a').get_attribute('href')
    imageUrl = mainContext.find_element_by_tag_name('img').get_attribute('src')
    descriptContext = mainContext.find_element_by_class_name('lead').text
    pubDate = CommonsInitValue.initNowTime()
    currentList.append([
        str(uuid.uuid1()), linkUrl, imageUrl, title, pubDate, descriptContext,
        'MACRO', 'FTCHINA'
    ])
    return currentList
def crawDailyNews():

    currentList = []
    currentTime = CommonsInitValue.initNowTime()

    # CRAW HEJNEWS COMMENTS NEWS SPIDER
    #print '----START CRAW HEJNEWS NEWS----'
    #HEJNewsNetSpider.writeMorningDailyNews()

    # CRAW QQNEWS COMMENTS NEWS SPIDER
    print '----START CRAW QQNEWS NEWS----'
    try:
        QQNewsNetSpider.writeMorningQQDailyNews()
    except Exception as e:
        print e
def crawDailyNews():

    currentList = []
    currentTime = CommonsInitValue.initNowTime()

    # CRAW HEJNEWS COMMENTS NEWS SPIDER
    #print '----START CRAW HEJNEWS NEWS----'
    #HEJNewsNetSpider.writeMorningDailyNews()

    # CRAW QQNEWS COMMENTS NEWS SPIDER
    print '----START CRAW QQNEWS NEWS----'
    try:
        QQNewsNetSpider.writeMorningQQDailyNews()
    except Exception as e:
        print e
def crawDataCenter():

    currentList = []
    currentTime = CommonsInitValue.initNowTime()

    # CRAW FOREXGOLD DATA SPIDER
    print '----START CRAW FOREXGOLD DATA----'
    try:
        ForexGoldDataNetSpider.writeForexGoldDataSource()
    except Exception as e:
        currentList.append([
            currentTime,
            str(uuid.uuid1()),
            'ForexGoldDataNetSpider.writeForexGoldDataSource()', e
        ])
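
# Hypothetical pattern (not in the original source): the orchestration helpers
# such as crawDataCenter collect one error row per failed spider, shaped
#   [currentTime, uuid, 'SpiderName.method', exception]
# so a wrapper could report every failure after all spiders have been tried.
def _demoReportSpiderErrors(errorRows):
    for row in errorRows:
        print '[%s] %s failed: %s' % (row[0], row[2], row[3])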
def crawMorningForexDailyNews(link):
    currentList = []
    browsor = webdriver.PhantomJS()
    browsor.get(link)
    contextList = browsor.find_elements_by_class_name('articleItem')
    for context in contextList:
        try:
            linkUrl = context.find_element_by_class_name('img').get_attribute('href')
            imageUrl = context.find_element_by_class_name('img').find_element_by_tag_name('img').get_attribute('src')
            title = context.find_element_by_class_name('title').text
            pubDate = CommonsInitValue.initNowTime()
            descriptContext = context.find_element_by_tag_name('p').text
        except NoSuchElementException:
            continue
        currentList.append([str(uuid.uuid1()),linkUrl,imageUrl,title,pubDate,descriptContext,'FOREX','INVESTINGNET'])
    return currentList
def crawCnForexImages(link,keyList):
    currentArray = []
    detaiArray = []
    browsor = webdriver.PhantomJS()
    browsor.get(link)
    imageList = browsor.find_elements_by_class_name('imgModel')
    for model in imageList:
        linkUrl = model.find_element_by_tag_name('a').get_attribute('href')
        imageUrl = model.find_element_by_tag_name('img').get_attribute('src')
        pubDate = CommonsInitValue.returnCreateDate(model.find_element_by_tag_name('p').text)
        if not (imageUrl in keyList):
            mianId = str(uuid.uuid1())
            currentArray.append([mianId,imageUrl,linkUrl,pubDate,'CNFOREXNET'])
            detaiArray.append([mianId,linkUrl])
    CnForexImageDetailSpider.writeCnForexImageDetail(detaiArray)
    return currentArray
def crawMorningFinanceDailyNews(linkUrl):
    currentArray = []
    browsor = webdriver.PhantomJS()
    browsor.get(linkUrl)
    mainList = browsor.find_element_by_id('list01').find_elements_by_tag_name(
        'li')
    for context in mainList:
        linkUrl = context.find_element_by_tag_name('a').get_attribute('href')
        title = context.find_element_by_tag_name('a').text
        pubDate = context.find_element_by_class_name('date').text
        descriptContext = context.find_element_by_tag_name('p').text
        imageUrl = CommonsInitValue.initTempImage()
        currentArray.append([
            str(uuid.uuid1()), linkUrl, imageUrl, title, pubDate,
            descriptContext, 'CHINA', 'IFengNET'
        ])
    return currentArray
def crawMorningForexDailyNews(link):
    currentList = []
    browsor = webdriver.PhantomJS()
    browsor.get(link)
    contextList = browsor.find_elements_by_class_name("articleItem")
    for context in contextList:
        try:
            linkUrl = context.find_element_by_class_name("img").get_attribute("href")
            imageUrl = context.find_element_by_class_name("img").find_element_by_tag_name("img").get_attribute("src")
            title = context.find_element_by_class_name("title").text
            pubDate = CommonsInitValue.initNowTime()
            descriptContext = context.find_element_by_tag_name("p").text
        except NoSuchElementException:
            continue
        currentList.append(
            [str(uuid.uuid1()), linkUrl, imageUrl, title, pubDate, descriptContext, "FOREX", "INVESTINGNET"]
        )
    return currentList
def crawDailyMetalComments(link):
    currentArray = []
    browsor = webdriver.PhantomJS()
    browsor.get(link)
    contextList = browsor.find_element_by_class_name('right_box796')\
        .find_element_by_tag_name('ul')\
        .find_elements_by_tag_name('li')
    for context in contextList:
        pubDate = context.find_element_by_class_name('time').text
        title = context.find_element_by_tag_name('a').text
        linkUrl = context.find_element_by_tag_name('a').get_attribute('href')
        descriptContext = CommonsInitValue.removeSpecialCharacter(context.text)
        currentArray.append([
            str(uuid.uuid1()), linkUrl, title, pubDate, descriptContext,
            'METAL', 'GXNET'
        ])
    return currentArray
def crawDailyComments(link):

    currentList = []
    browsor = webdriver.PhantomJS()
    browsor.get(link)
    contextList = browsor.find_elements_by_class_name('news-item')
    for mainContext in contextList:
        pubDate = CommonsInitValue.initNowTime()
        title = mainContext.find_element_by_tag_name('a').text
        linkUrl = mainContext.find_element_by_tag_name('a').get_attribute(
            'href')
        descriptContext = mainContext.find_element_by_class_name('desc').text
        currentList.append([
            str(uuid.uuid1()), linkUrl, title, pubDate, descriptContext,
            'FOREX', 'ADSNET'
        ])
    return currentList
def crawMorningFinanceDailyNews(linkUrl):
    currentArray = []
    browsor = webdriver.PhantomJS()
    browsor.get(linkUrl)
    maincontext = browsor.find_element_by_class_name('area_left')\
            .find_elements_by_class_name('list_item')
    for context in maincontext:
        imageUrl = context.find_element_by_tag_name('img').get_attribute('src')
        descriptContext = context.find_element_by_tag_name('p').text
        linkUrl = context.find_element_by_tag_name('a').get_attribute('href')
        title = context.find_element_by_tag_name('h2').text
        pubDate = CommonsInitValue.initNowTime()
        currentArray.append([
            str(uuid.uuid1()), linkUrl, imageUrl, title, pubDate,
            descriptContext, 'CHINA', 'XXCB'
        ])
    return currentArray
def crawDailyStockComments(link):
    currentArray = []
    browsor = webdriver.PhantomJS()
    browsor.get(link)
    mainlist = browsor.find_element_by_class_name('w_660')\
        .find_element_by_tag_name('ul')\
        .find_elements_by_tag_name('li')
    for context in mainlist:
        title = context.find_element_by_tag_name('a').text
        linkUrl = context.find_element_by_tag_name('a').get_attribute('href')
        descriptContext = context.find_element_by_tag_name('p').text
        pubDate = CommonsInitValue.initNowTime()
        currentArray.append([
            str(uuid.uuid1()), linkUrl, title, pubDate, descriptContext,
            'STOCK', 'JFNET'
        ])
    return currentArray
def crawFinanceHLDataSource(link):
    currentArray = []
    browsor = webdriver.PhantomJS()
    browsor.get(link)
    mainlist = browsor.find_element_by_class_name(
        'news_list').find_elements_by_class_name('list')
    for context in mainlist:
        imageUrl = context.find_element_by_tag_name('img').get_attribute('src')
        title = context.find_element_by_class_name('title').text
        linkUrl = context.find_element_by_class_name(
            'title').find_element_by_tag_name('a').get_attribute('href')
        descriptContext = context.find_element_by_tag_name('p').text
        pubDate = CommonsInitValue.initNowTime()
        currentArray.append([
            str(uuid.uuid1()), linkUrl, imageUrl, title, pubDate,
            descriptContext, 'STOCK', 'TAKCHINA'
        ])
    return currentArray
def crawCnForexImages(link, keyList):
    currentArray = []
    detaiArray = []
    browsor = webdriver.PhantomJS()
    browsor.get(link)
    imageList = browsor.find_elements_by_class_name('imgModel')
    for model in imageList:
        linkUrl = model.find_element_by_tag_name('a').get_attribute('href')
        imageUrl = model.find_element_by_tag_name('img').get_attribute('src')
        pubDate = CommonsInitValue.returnCreateDate(
            model.find_element_by_tag_name('p').text)
        if not (imageUrl in keyList):
            mianId = str(uuid.uuid1())
            currentArray.append(
                [mianId, imageUrl, linkUrl, pubDate, 'CNFOREXNET'])
            detaiArray.append([mianId, linkUrl])
    CnForexImageDetailSpider.writeCnForexImageDetail(detaiArray)
    return currentArray
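
# Hypothetical helper (not in the original source): crawCnForexImages skips any
# picture whose src is already in keyList, so a caller would typically rebuild
# that list from the rows of an earlier run before crawling again.
def _demoCrawNewForexImages(link, storedRows):
    # each stored row is [mianId, imageUrl, linkUrl, pubDate, 'CNFOREXNET']
    knownImages = [row[1] for row in storedRows]
    return crawCnForexImages(link, knownImages)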
def crawMorningDailyNews(link):
    listArray = []
    browsor = webdriver.PhantomJS()
    browsor.get(link)
    mainContext = browsor.find_element_by_id('listArticle')
    listContext = mainContext.find_elements_by_class_name('boxa')
    initImage = CommonsInitValue.initTempImage()
    for context in listContext:
        try:
            imageContext = context.find_element_by_class_name('pic')
            imageUrl = imageContext.find_element_by_tag_name('img').get_attribute('src')
        except NoSuchElementException:
            imageUrl = initImage
        title = context.find_element_by_tag_name('h4').text
        linkUrl = context.find_element_by_tag_name('h4')\
                          .find_element_by_tag_name('a').get_attribute('href')
        descriptContext = context.find_element_by_tag_name('p').text
        pubDate = time.strftime("%Y-%m-%d %X",time.localtime())
        listArray.append([str(uuid.uuid1()),linkUrl,imageUrl,title,pubDate,descriptContext,'STOCK','CXNET'])
    return listArray
def crawHeXunForexImage(link,keyList):
    currentArray =[]
    detaiArray=[]
    browsor = webdriver.PhantomJS()
    browsor.get(link)
    imageList = browsor.find_element_by_class_name('tupianpindao')
    mainList = imageList.find_elements_by_tag_name('div')
    for context in mainList:
        try:
            linkObj = context.find_element_by_tag_name('a')
            linkUrl = linkObj.get_attribute('href')
            imageUrl = context.find_element_by_tag_name('img').get_attribute('src')
            pubDate = CommonsInitValue.splitCreateDate(linkUrl,'/',3)
            descriptContext = context.find_element_by_tag_name('p').text
            if not (imageUrl in keyList):
                mianId = str(uuid.uuid1())
                currentArray.append([mianId,imageUrl,linkUrl,pubDate,'HEXUNFOREXNET',descriptContext])
                detaiArray.append([mianId,linkUrl])
        except NoSuchElementException:
            continue
def crawMorningDailyNews(link):
    listArray = []
    browsor = webdriver.PhantomJS()
    browsor.get(link)
    mainContext = browsor.find_element_by_id('listArticle')
    listContext = mainContext.find_elements_by_class_name('boxa')
    initImage = CommonsInitValue.initTempImage()
    for context in listContext:
        try:
            imageContext = context.find_element_by_class_name('pic')
            imageUrl = imageContext.find_element_by_tag_name(
                'img').get_attribute('src')
        except NoSuchElementException:
            imageUrl = initImage
        title = context.find_element_by_tag_name('h4').text
        linkUrl = context.find_element_by_tag_name('h4')\
                          .find_element_by_tag_name('a').get_attribute('href')
        descriptContext = context.find_element_by_tag_name('p').text
        pubDate = time.strftime("%Y-%m-%d %X", time.localtime())
        listArray.append([
            str(uuid.uuid1()), linkUrl, imageUrl, title, pubDate,
            descriptContext, 'STOCK', 'CXNET'
        ])
    return listArray
def crawHeXunForexImage(link, keyList):
    currentArray = []
    detaiArray = []
    browsor = webdriver.PhantomJS()
    browsor.get(link)
    imageList = browsor.find_element_by_class_name('tupianpindao')
    mainList = imageList.find_elements_by_tag_name('div')
    for context in mainList:
        try:
            linkObj = context.find_element_by_tag_name('a')
            linkUrl = linkObj.get_attribute('href')
            imageUrl = context.find_element_by_tag_name('img').get_attribute(
                'src')
            pubDate = CommonsInitValue.splitCreateDate(linkUrl, '/', 3)
            descriptContext = context.find_element_by_tag_name('p').text
            if not (imageUrl in keyList):
                mianId = str(uuid.uuid1())
                currentArray.append([
                    mianId, imageUrl, linkUrl, pubDate, 'HEXUNFOREXNET',
                    descriptContext
                ])
                detaiArray.append([mianId, linkUrl])
        except NoSuchElementException:
            continue