Beispiel #1
0
        def save_result(data):
            """Store one crawled 'altxw' news record.

            Asks ``get_result_name`` for the record's result name and stops
            when that name is falsy.  Otherwise the name is printed together
            with the current time, ``data['spider_time']`` is set to the
            current time, and ``data`` is handed first to ``Save_result`` and
            then to ``save_data_to_mongodb`` (with ``self.cache_data_Queue``
            as the cache list).

            :param data: mapping that must provide ``'publish_time'``,
                ``'url'`` and ``'id'``; it is modified in place.
            """
            time_format = '%Y-%m-%d %H:%M:%S'
            site_en = 'altxw'
            site_cn = '阿勒泰新闻网'
            kind = 'news'

            result_file = get_result_name(
                plantform_e=site_en,
                plantform_c=site_cn,
                date_time=data['publish_time'],
                urlOruid=data['url'],
                newsidOrtid=data['id'],
                datatype=kind,
                full_data=data)
            if not result_file:
                return

            # One pre-formatted string prints the same text as the
            # space-separated items of a print statement.
            logged_at = datetime.datetime.now().strftime(time_format)
            print('%s -------- %s' % (logged_at, result_file))

            # The crawl time is taken separately from the logged time.
            data['spider_time'] = datetime.datetime.now().strftime(time_format)

            Save_result(
                plantform=site_en,
                date_time=data['publish_time'],
                urlOruid=data['url'],
                newsidOrtid=data['id'],
                datatype=kind,
                full_data=data)

            save_data_to_mongodb(
                data={'data': data},
                item_id=result_file,
                platform_e=site_en,
                platform_c=site_cn,
                cache_data_list=self.cache_data_Queue)
Beispiel #2
0
        def save_result(data):
            """Store one crawled 'xilu' news record.

            Asks ``get_result_name`` for the record's result name; a falsy
            name ends the call without saving.  Otherwise the name is printed
            with the current time and ``data`` is passed to
            ``save_data_to_mongodb`` together with ``self.cache_data_list``.

            :param data: mapping that must provide ``'publish_time'``,
                ``'url'`` and ``'id'``.
            """
            site_en = 'xilu'
            site_cn = '西陆网'

            result_file = get_result_name(
                plantform_e=site_en,
                plantform_c=site_cn,
                date_time=data['publish_time'],
                urlOruid=data['url'],
                newsidOrtid=data['id'],
                datatype='news',
                full_data=data)
            if not result_file:
                return

            # Unformatted datetime on purpose: the log line uses str(now).
            print('%s -------- %s' % (datetime.datetime.now(), result_file))

            save_data_to_mongodb(
                data={'data': data},
                item_id=result_file,
                platform_e=site_en,
                platform_c=site_cn,
                cache_data_list=self.cache_data_list)
Beispiel #3
0
        def save_result(data):
            """Store one crawled 'csdn' forum record.

            Asks ``get_result_name`` for the record's result name, prints it
            with the current time, and passes ``data`` to
            ``save_data_to_mongodb`` under that name.

            A falsy result name is still logged but the record is not saved,
            so that no document is written with an empty ``item_id``.
            NOTE(review): this assumes a falsy name from ``get_result_name``
            means "skip this record" -- confirm against the helper.

            :param data: mapping that must provide ``'publish_time'``,
                ``'url'`` and ``'id'``.
            """
            result_file = get_result_name(plantform_e='csdn',
                                          plantform_c='CSDN论坛',
                                          date_time=data['publish_time'],
                                          urlOruid=data['url'],
                                          newsidOrtid=data['id'],
                                          datatype='forum',
                                          full_data=data)

            # One pre-formatted string prints the same text as the
            # space-separated items of a print statement.
            print('%s -------- %s' % (
                datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                result_file))

            # Guard against saving a record that has no usable name.
            if not result_file:
                return

            save_data_to_mongodb(data={'data': data},
                                 platform_c='CSDN论坛',
                                 platform_e='csdn',
                                 item_id=result_file)
Beispiel #4
0
        def save_result(data):
            """Store one crawled 'people' forum record.

            Asks ``get_result_name`` for the record's result name and prints
            it with the current time (the name is printed even when it is
            falsy).  A falsy name then ends the call; otherwise ``data`` is
            passed to ``save_data_to_mongodb`` with ``self.cache_data_list``.

            :param data: mapping that must provide ``'publish_time'``,
                ``'url'`` and ``'id'``.
            """
            site_en = 'people'
            site_cn = '人民网强国社区'

            result_file = get_result_name(
                plantform_e=site_en,
                plantform_c=site_cn,
                date_time=data['publish_time'],
                urlOruid=data['url'],
                newsidOrtid=data['id'],
                datatype='forum',
                full_data=data)

            # Logged before the emptiness check, so skipped records show up too.
            print('%s -------- %s' % (datetime.datetime.now(), result_file))

            if result_file:
                save_data_to_mongodb(
                    data={'data': data},
                    item_id=result_file,
                    platform_e=site_en,
                    platform_c=site_cn,
                    cache_data_list=self.cache_data_list)
Beispiel #5
0
        def save_result(data):
            """Store one crawled 'ChengDuQuanSouSuo' news record.

            Asks ``get_result_name`` for the record's result name.  When the
            name is truthy it is printed with the current time,
            ``data['spider_time']`` is set to the current time, and ``data``
            is passed to ``save_data_to_mongodb`` with
            ``self.cache_data_list``.  Any exception raised along the way is
            printed and not re-raised.

            :param data: mapping expected to provide ``'publish_time'``,
                ``'url'`` and ``'id'``; it is modified in place.
            """
            time_format = '%Y-%m-%d %H:%M:%S'
            site_en = 'ChengDuQuanSouSuo'
            site_cn = '成都全搜索'

            # Some pages occasionally fail to parse, which leaves the record
            # without proper content and therefore without 'publish_time'.
            # The try block filters out such incompletely scraped records.
            try:
                result_file = get_result_name(
                    plantform_e=site_en,
                    plantform_c=site_cn,
                    date_time=data['publish_time'],
                    urlOruid=data['url'],
                    newsidOrtid=data['id'],
                    datatype='news',
                    full_data=data)

                if result_file:
                    logged_at = datetime.datetime.now().strftime(time_format)
                    print('%s -------- %s' % (logged_at, result_file))

                    data['spider_time'] = datetime.datetime.now().strftime(
                        time_format)

                    save_data_to_mongodb(
                        data={'data': data},
                        item_id=result_file,
                        platform_e=site_en,
                        platform_c=site_cn,
                        cache_data_list=self.cache_data_list)
            except Exception as err:
                print(err)
Beispiel #6
0
        def save_result(data):
            """Store one crawled 'toutiao' news record.

            Removes the ``'item_id'`` entry from ``data`` if present, asks
            ``get_result_name`` for the record's result name, and stops when
            that name is falsy.  Otherwise the name is printed with the
            current time and ``data`` is passed to ``save_data_to_mongodb``
            with ``self.cache_data_list``.

            Saving stays best-effort: no exception escapes this function.
            Failures are printed instead of being silently discarded, so a
            dropped record leaves a trace in the output.

            :param data: mapping expected to provide ``'publish_time'``,
                ``'url'`` and ``'id'``; it is modified in place.
            """
            try:
                # A missing 'item_id' is normal and must not be reported.
                data.pop('item_id', None)

                result_file = get_result_name(plantform_e='toutiao',
                                              plantform_c='今日头条',
                                              date_time=data['publish_time'],
                                              urlOruid=data['url'],
                                              newsidOrtid=data['id'],
                                              datatype='news',
                                              full_data=data)
                if not result_file:
                    return

                # One pre-formatted string prints the same text as the
                # space-separated items of a print statement.
                print('%s -------- %s' % (datetime.datetime.now(),
                                          result_file))

                save_data_to_mongodb(data={'data': data},
                                     item_id=result_file,
                                     platform_e='toutiao',
                                     platform_c='今日头条',
                                     cache_data_list=self.cache_data_list)
            except Exception as e:
                # Best-effort: report the failure and carry on with the crawl.
                print(e)
Beispiel #7
0
        def save_result(data):
            """Store one crawled 'chengshiluntan' forum record.

            Asks ``get_result_name`` for the record's result name and stops
            when that name is falsy.  Otherwise the name is printed with the
            current time, ``data['spider_time']`` is set to the current time,
            and ``data`` is passed to ``save_data_to_mongodb`` with
            ``self.cache_data_list``.

            :param data: mapping that must provide ``'publish_time'``,
                ``'url'`` and ``'id'``; it is modified in place.
            """
            time_format = '%Y-%m-%d %H:%M:%S'
            site_en = 'chengshiluntan'
            site_cn = '城市论坛'

            result_file = get_result_name(
                plantform_e=site_en,
                plantform_c=site_cn,
                date_time=data['publish_time'],
                urlOruid=data['url'],
                newsidOrtid=data['id'],
                datatype='forum',
                full_data=data)
            if not result_file:
                return

            logged_at = datetime.datetime.now().strftime(time_format)
            print('%s -------- %s' % (logged_at, result_file))

            data['spider_time'] = datetime.datetime.now().strftime(time_format)

            save_data_to_mongodb(
                data={'data': data},
                platform_c=site_cn,
                platform_e=site_en,
                item_id=result_file,
                cache_data_list=self.cache_data_list)
Beispiel #8
0
        def save_result(data):
            """Store one crawled 'PengPai' news record.

            Deletes ``data['is_movie']``; if the deletion raises, the
            exception is printed and processing continues.  Then asks
            ``get_result_name`` for the record's result name and stops when
            it is falsy.  Otherwise the name is printed with the current time
            and ``data`` is passed to ``save_data_to_mongodb`` with
            ``self.cache_data_list``.

            :param data: mapping that must provide ``'publish_time'``,
                ``'url'`` and ``'id'``; it is modified in place.
            """
            site_en = 'PengPai'
            site_cn = '澎湃新闻'

            try:
                del data['is_movie']
            except Exception as err:
                print(err)

            result_file = get_result_name(
                plantform_e=site_en,
                plantform_c=site_cn,
                date_time=data['publish_time'],
                urlOruid=data['url'],
                newsidOrtid=data['id'],
                datatype='news',
                full_data=data)
            if not result_file:
                return

            # One pre-formatted string prints the same text as the
            # space-separated items of a print statement.
            print('%s --- %s' % (datetime.now(), result_file))

            save_data_to_mongodb(
                data={'data': data},
                item_id=result_file,
                platform_c=site_cn,
                platform_e=site_en,
                cache_data_list=self.cache_data_list)
Beispiel #9
0
def saveData(content, filename, platform_e, platform_c, news_type):
    """Save one crawled record to MongoDB.

    Decodes ``content`` from JSON, asks ``get_result_name`` for the record's
    result name and, when that name is truthy, passes the decoded object to
    ``save_data_to_mongodb`` under that name.

    No ``RemoteProducer`` is built here: the producer was never used after
    construction, so creating one per call only cost a remote connection
    setup and kept host/credential literals in the function.
    NOTE(review): assumes the ``RemoteProducer`` constructor has no side
    effect that callers rely on -- confirm.

    A falsy result name ends the call without saving, so that no document is
    written with an empty ``item_id``.
    NOTE(review): this assumes a falsy name from ``get_result_name`` means
    "skip this record" -- confirm against the helper.

    :param content: JSON text; the decoded object must contain a ``'data'``
        mapping with ``'publish_time'``, ``'url'`` and ``'id'``.
    :param filename: unused; kept so existing callers keep working.
    :param platform_e: forwarded to both helpers as the ``platform_e`` /
        ``plantform_e`` argument.
    :param platform_c: forwarded to both helpers as the ``platform_c`` /
        ``plantform_c`` argument.
    :param news_type: forwarded to ``get_result_name`` as ``datatype``.
    :raises ValueError: if ``content`` is not valid JSON.
    :raises KeyError: if a required key is missing from the decoded object.
    """
    content = json.loads(content)
    record = content['data']

    result_file = get_result_name(plantform_e=platform_e,
                                  plantform_c=platform_c,
                                  date_time=record['publish_time'],
                                  urlOruid=record['url'],
                                  newsidOrtid=record['id'],
                                  datatype=news_type,
                                  full_data=content)
    # Guard against saving a record that has no usable name.
    if not result_file:
        return

    save_data_to_mongodb(data=content,
                         item_id=result_file,
                         platform_e=platform_e,
                         platform_c=platform_c)