def save_result(data):
    """Persist one crawled news item for the 'altxw' platform.

    ``get_result_name`` is asked for the item's identifier first; a falsy
    answer ends the call with nothing printed or stored.  Otherwise the
    identifier is printed next to the current time, ``data['spider_time']``
    is stamped, and the record is passed to ``Save_result`` and then to
    ``save_data_to_mongodb``.

    Args:
        data: mapping providing 'publish_time', 'url' and 'id'.  It is
            modified in place: a 'spider_time' string is added.

    Returns:
        None.
    """
    # NOTE(review): ``self`` is not a parameter here; presumably this function
    # is nested inside a method that supplies it -- confirm against the caller.
    time_format = '%Y-%m-%d %H:%M:%S'

    result_file = get_result_name(
        plantform_e='altxw',
        plantform_c='阿勒泰新闻网',
        date_time=data['publish_time'],
        urlOruid=data['url'],
        newsidOrtid=data['id'],
        datatype='news',
        full_data=data
    )

    if result_file:
        # Single-argument print(...) also parses as a Python 2 print statement.
        print('%s -------- %s' % (
            datetime.datetime.now().strftime(time_format), result_file))

        # Stamped after the log line, from a fresh clock reading.
        data['spider_time'] = datetime.datetime.now().strftime(time_format)

        # The fields are read from ``data`` again rather than cached, because
        # ``data`` itself was handed to get_result_name above.
        Save_result(
            plantform='altxw',
            date_time=data['publish_time'],
            urlOruid=data['url'],
            newsidOrtid=data['id'],
            datatype='news',
            full_data=data
        )
        save_data_to_mongodb(
            data={'data': data},
            item_id=result_file,
            platform_e='altxw',
            platform_c='阿勒泰新闻网',
            cache_data_list=self.cache_data_Queue
        )
def save_result(data):
    """Persist one crawled news item for the 'xilu' platform.

    The item's identifier comes from ``get_result_name``.  A falsy identifier
    stops the call before anything is printed or stored; otherwise the
    identifier is printed with the current time and the record is passed to
    ``save_data_to_mongodb``.

    Args:
        data: mapping providing 'publish_time', 'url' and 'id'.

    Returns:
        None.
    """
    # NOTE(review): ``self`` is not a parameter here; presumably it comes from
    # an enclosing method scope -- confirm against the caller.
    naming_args = {
        'plantform_e': 'xilu',
        'plantform_c': '西陆网',
        'date_time': data['publish_time'],
        'urlOruid': data['url'],
        'newsidOrtid': data['id'],
        'datatype': 'news',
        'full_data': data,
    }
    result_file = get_result_name(**naming_args)
    if not result_file:
        return

    # Single-argument print(...) also parses as a Python 2 print statement.
    print('%s -------- %s' % (datetime.datetime.now(), result_file))

    save_data_to_mongodb(
        data={'data': data},
        item_id=result_file,
        platform_e='xilu',
        platform_c='西陆网',
        cache_data_list=self.cache_data_list
    )
def save_result(data):
    """Persist one crawled forum post for the 'csdn' platform.

    The item's identifier comes from ``get_result_name`` and is printed with
    the current time.  If the identifier is falsy the record is not stored;
    otherwise it is passed to ``save_data_to_mongodb``.

    Args:
        data: mapping providing 'publish_time', 'url' and 'id'.

    Returns:
        None.
    """
    result_file = get_result_name(
        plantform_e='csdn',
        plantform_c='CSDN论坛',
        date_time=data['publish_time'],
        urlOruid=data['url'],
        newsidOrtid=data['id'],
        datatype='forum',
        full_data=data
    )

    # Logged before the guard so the existing console output is unchanged.
    # Single-argument print(...) also parses as a Python 2 print statement.
    print('%s -------- %s' % (
        datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), result_file))

    # The other save_result helpers in this code base skip the write when no
    # name comes back; without this guard the record would be stored under a
    # falsy item_id.  NOTE(review): presumably a falsy name means "skip this
    # item" -- confirm against get_result_name.
    if not result_file:
        return

    save_data_to_mongodb(
        data={'data': data},
        platform_c='CSDN论坛',
        platform_e='csdn',
        item_id=result_file
    )
def save_result(data):
    """Persist one crawled forum post for the 'people' platform.

    The identifier returned by ``get_result_name`` is always printed with the
    current time, even when it is falsy.  Only a truthy identifier leads to
    the record being passed to ``save_data_to_mongodb``.

    Args:
        data: mapping providing 'publish_time', 'url' and 'id'.

    Returns:
        None.
    """
    # NOTE(review): ``self`` is not a parameter here; presumably it comes from
    # an enclosing method scope -- confirm against the caller.
    english_name = 'people'
    chinese_name = '人民网强国社区'

    result_file = get_result_name(
        plantform_e=english_name,
        plantform_c=chinese_name,
        date_time=data['publish_time'],
        urlOruid=data['url'],
        newsidOrtid=data['id'],
        datatype='forum',
        full_data=data
    )

    # The log line comes before the emptiness check, so skipped items are
    # still reported.  Single-argument print(...) also parses as a Python 2
    # print statement.
    print('%s -------- %s' % (datetime.datetime.now(), result_file))

    if result_file:
        save_data_to_mongodb(
            data={'data': data},
            item_id=result_file,
            platform_e=english_name,
            platform_c=chinese_name,
            cache_data_list=self.cache_data_list
        )
def save_result(data):
    """Persist one crawled news item for the 'ChengDuQuanSouSuo' platform.

    Every step runs inside one ``try`` block; any ``Exception`` is printed
    and otherwise ignored.  On the normal path the identifier comes from
    ``get_result_name``; a falsy identifier ends the call, otherwise it is
    printed with the current time, ``data['spider_time']`` is stamped and the
    record is passed to ``save_data_to_mongodb``.

    Args:
        data: mapping expected to provide 'publish_time', 'url' and 'id'.
            It is modified in place: a 'spider_time' string is added.

    Returns:
        None.
    """
    # NOTE(review): ``self`` is not a parameter here; presumably it comes from
    # an enclosing method scope -- confirm against the caller.
    clock_format = '%Y-%m-%d %H:%M:%S'
    try:
        # Some pages are occasionally parsed incorrectly, which leaves them
        # without proper content and therefore without a publish_time entry;
        # the try block filters out those incompletely crawled records.
        lookup = {
            'plantform_e': 'ChengDuQuanSouSuo',
            'plantform_c': '成都全搜索',
            'date_time': data['publish_time'],
            'urlOruid': data['url'],
            'newsidOrtid': data['id'],
            'datatype': 'news',
            'full_data': data,
        }
        result_file = get_result_name(**lookup)
        if result_file:
            # Single-argument print(...) also parses as a Python 2 print
            # statement.
            print('%s -------- %s' % (
                datetime.datetime.now().strftime(clock_format), result_file))
            data['spider_time'] = datetime.datetime.now().strftime(clock_format)
            save_data_to_mongodb(
                data={'data': data},
                item_id=result_file,
                platform_e='ChengDuQuanSouSuo',
                platform_c='成都全搜索',
                cache_data_list=self.cache_data_list
            )
    except Exception as failure:
        print(failure)
def save_result(data):
    """Persist one crawled news item for the 'toutiao' platform.

    Any 'item_id' entry is first removed from ``data``.  The identifier then
    comes from ``get_result_name``; a falsy identifier ends the call,
    otherwise it is printed with the current time and the record is passed
    to ``save_data_to_mongodb``.

    Failures while naming or saving never propagate to the caller, but they
    are now reported on stdout instead of being dropped silently.

    Args:
        data: mapping expected to provide 'publish_time', 'url' and 'id'.
            It is modified in place: 'item_id' is deleted when present.

    Returns:
        None.
    """
    # NOTE(review): ``self`` is not a parameter here; presumably it comes from
    # an enclosing method scope -- confirm against the caller.

    # Best effort: the key may simply be absent.  The handler stays broad so
    # that this step can never raise, exactly as before.
    try:
        del data['item_id']
    except Exception:
        pass

    try:
        result_file = get_result_name(
            plantform_e='toutiao',
            plantform_c='今日头条',
            date_time=data['publish_time'],
            urlOruid=data['url'],
            newsidOrtid=data['id'],
            datatype='news',
            full_data=data
        )
        if not result_file:
            return

        # Single-argument print(...) also parses as a Python 2 print
        # statement.
        print('%s -------- %s' % (datetime.datetime.now(), result_file))

        save_data_to_mongodb(
            data={'data': data},
            item_id=result_file,
            platform_e='toutiao',
            platform_c='今日头条',
            cache_data_list=self.cache_data_list
        )
    except Exception as err:
        # This used to be a bare ``pass``, which hid failed saves as well as
        # malformed records.  It is still non-fatal, but the error is now
        # visible; %r keeps the report safe for non-ASCII messages.
        print('%s -------- save_result failed: %r' % (
            datetime.datetime.now(), err))
def save_result(data):
    """Persist one crawled forum post for the 'chengshiluntan' platform.

    The identifier comes from ``get_result_name``; a falsy identifier ends
    the call with nothing printed or stored.  Otherwise the identifier is
    printed with the current time, ``data['spider_time']`` is stamped and the
    record is passed to ``save_data_to_mongodb``.

    Args:
        data: mapping providing 'publish_time', 'url' and 'id'.  It is
            modified in place: a 'spider_time' string is added.

    Returns:
        None.
    """
    # NOTE(review): ``self`` is not a parameter here; presumably it comes from
    # an enclosing method scope -- confirm against the caller.

    def _clock_text():
        # Fresh reading on every call; the log line and spider_time each take
        # their own.
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    result_file = get_result_name(
        plantform_e='chengshiluntan',
        plantform_c='城市论坛',
        date_time=data['publish_time'],
        urlOruid=data['url'],
        newsidOrtid=data['id'],
        datatype='forum',
        full_data=data
    )
    if not result_file:
        return

    # Single-argument print(...) also parses as a Python 2 print statement.
    print('%s -------- %s' % (_clock_text(), result_file))
    data['spider_time'] = _clock_text()

    save_data_to_mongodb(
        data={'data': data},
        platform_c='城市论坛',
        platform_e='chengshiluntan',
        item_id=result_file,
        cache_data_list=self.cache_data_list
    )
def save_result(data):
    """Persist one crawled news item for the 'PengPai' platform.

    The 'is_movie' entry is deleted from ``data`` first; if that fails the
    exception is printed and processing continues.  The identifier then comes
    from ``get_result_name``; a falsy identifier ends the call, otherwise it
    is printed with the current time and the record is passed to
    ``save_data_to_mongodb``.

    Args:
        data: mapping providing 'publish_time', 'url' and 'id'.  It is
            modified in place: 'is_movie' is deleted when present.

    Returns:
        None.
    """
    # NOTE(review): ``self`` is not a parameter here; presumably it comes from
    # an enclosing method scope.  ``datetime`` is used as the class here
    # (``datetime.now()``) -- confirm the file imports it that way.
    try:
        del data['is_movie']
    except Exception as problem:
        # Reported but not fatal, so a record lacking the key is still saved.
        print(problem)

    name_request = {
        'plantform_e': 'PengPai',
        'plantform_c': '澎湃新闻',
        'date_time': data['publish_time'],
        'urlOruid': data['url'],
        'newsidOrtid': data['id'],
        'datatype': 'news',
        'full_data': data,
    }
    result_file = get_result_name(**name_request)

    if result_file:
        # Single-argument print(...) also parses as a Python 2 print
        # statement.
        print('%s --- %s' % (datetime.now(), result_file))
        save_data_to_mongodb(
            data={'data': data},
            item_id=result_file,
            platform_c='澎湃新闻',
            platform_e='PengPai',
            cache_data_list=self.cache_data_list
        )
def saveData(content, filename, platform_e, platform_c, news_type):
    """Decode a JSON record and store it in MongoDB under its generated name.

    The decoded record is passed to ``get_result_name`` to obtain its
    identifier.  If the identifier is falsy the record is not stored;
    otherwise it is passed to ``save_data_to_mongodb``.

    Args:
        content: JSON text.  The decoded object must provide
            ``['data']['publish_time']``, ``['data']['url']`` and
            ``['data']['id']``.
        filename: not used by this function; kept so existing callers keep
            working.
        platform_e: forwarded to ``get_result_name`` (as ``plantform_e``) and
            to ``save_data_to_mongodb``.
        platform_c: forwarded to ``get_result_name`` (as ``plantform_c``) and
            to ``save_data_to_mongodb``.
        news_type: forwarded to ``get_result_name`` as ``datatype``.

    Returns:
        None.

    Raises:
        ValueError: if ``content`` is not valid JSON (from ``json.loads``).
        KeyError: if the decoded record lacks one of the required fields.
    """
    # The previous version built a RemoteProducer from a hard-coded host and
    # credentials on every call and never used it.  That construction and its
    # embedded credentials have been removed.
    record = json.loads(content)
    payload = record['data']

    result_file = get_result_name(
        plantform_e=platform_e,
        plantform_c=platform_c,
        date_time=payload['publish_time'],
        urlOruid=payload['url'],
        newsidOrtid=payload['id'],
        datatype=news_type,
        full_data=record
    )

    # The save_result helpers in this code base skip the write when no name
    # comes back; do the same here instead of storing under a falsy item_id.
    # NOTE(review): presumably a falsy name means "skip this item" -- confirm
    # against get_result_name.
    if not result_file:
        return

    save_data_to_mongodb(
        data=record,
        item_id=result_file,
        platform_e=platform_e,
        platform_c=platform_c
    )