def process_item(self, item, spider):
    """Tag a Sina individual-stock news item, publish it to Kafka, and mark its URL crawled in Redis."""
    item["source"] = SpiderSourceName.sina
    item["type"] = SpiderSourceCode.individual_stock
    item["id"] = shortuuid.uuid()
    item["scope"] = u"个股"
    payload = AvroUtils.createAvroMemoryRecord(item, AvroUtils.getNewsSchema())
    kafka_producer.send(kafkaTopic.news, value=payload)
    sleep(1)  # throttle the producer
    self.logger.info("send data to kafka, from " + item["source"] + " , url: " + item["url"])
    # Member = URL, score = pub date collapsed to YYYYMMDD.
    date_score = item['pub_date'][0:10].replace("-", "")
    get_redis_conn().zadd(RedisKeys.sina_individual_crawled, item["url"], date_score)
    return item
def process_item(self, item, spider):
    """Stamp a Baidu stock-opinion item with identity fields and record its pub_date in Redis."""
    item["id"] = shortuuid.uuid()
    item["source"] = SpiderSourceName.baidu
    item["type"] = SpiderSourceCode.baidu_stock_opinion
    # Kafka publishing is currently disabled for this pipeline.
    #contents=AvroUtils.createAvroMemoryRecord(item,AvroUtils.getConsensusSchema())
    #kafka_producer.send(kafkaTopic.consensus,value=contents)
    self.logger.info("send data to kafka, from " + item["source"] + " , batch: " + str(item["batch"]))
    #sleep(1)
    crawled_key = RedisKeys.baidu_opinion_crawled + str(item["flag"])
    # pub_date serves as both member and score in the sorted set.
    get_redis_conn().zadd(crawled_key, item['pub_date'], item['pub_date'])
    return item
def process_item(self, item, spider):
    """Tag a Dongfang news item, publish it to Kafka, and mark its URL crawled in Redis."""
    item["source"] = SpiderSourceName.dongfang
    item["type"] = SpiderSourceCode.dongfang
    item["id"] = shortuuid.uuid()
    item["scope"] = u"新闻"
    # Dongfang items carry no stock association; blank out code/name.
    item['code'] = ""
    item['name'] = ""
    payload = AvroUtils.createAvroMemoryRecord(item, AvroUtils.getNewsSchema())
    self.logger.info("AvroUtils successful!")
    kafka_producer.send(kafkaTopic.news, value=payload)
    sleep(1)  # throttle the producer
    self.logger.info("send data to kafka, from " + item["source"] + " , url: " + item["url"])
    # Member = URL, score = pub date collapsed to YYYYMMDD.
    date_score = item['pub_date'][0:10].replace("-", "")
    get_redis_conn().zadd(RedisKeys.dongfang_url_crawl, item["url"], date_score)
    return item
def process_item(self, item, spider):
    """Assign identity fields to a Baidu stock-opinion item and log its pub_date to Redis."""
    item["id"] = shortuuid.uuid()
    item["source"] = SpiderSourceName.baidu
    item["type"] = SpiderSourceCode.baidu_stock_opinion
    # The Kafka/Avro path is switched off for this pipeline.
    #contents=AvroUtils.createAvroMemoryRecord(item,AvroUtils.getConsensusSchema())
    #kafka_producer.send(kafkaTopic.consensus,value=contents)
    self.logger.info("send data to kafka, from " + item["source"] + " , batch: " + str(item["batch"]))
    #sleep(1)
    pub_date = item['pub_date']
    # Same value is used as member and score of the sorted set.
    get_redis_conn().zadd(RedisKeys.baidu_opinion_crawled + str(item["flag"]), pub_date, pub_date)
    return item
def process_item(self, item, spider):
    """Label a Sina individual-stock news item, send it to Kafka, then remember the URL in Redis."""
    item["source"] = SpiderSourceName.sina
    item["type"] = SpiderSourceCode.individual_stock
    item["id"] = shortuuid.uuid()
    item["scope"] = u"个股"
    record = AvroUtils.createAvroMemoryRecord(item, AvroUtils.getNewsSchema())
    kafka_producer.send(kafkaTopic.news, value=record)
    sleep(1)  # pace outgoing messages
    self.logger.info("send data to kafka, from " + item["source"] + " , url: " + item["url"])
    # Score the crawled URL with its publication date as YYYYMMDD.
    get_redis_conn().zadd(
        RedisKeys.sina_individual_crawled,
        item["url"],
        item['pub_date'][0:10].replace("-", ""))
    return item
def parse(self, response):
    """Extract the Xueqiu access token, then schedule one comment-list request per stored article relation."""
    tokens = response.xpath('//script').re('SNB.data.access_token.*\|\| "(.*)";')
    if not tokens:
        self.logger.error("get access_token error")
        return
    self.headers["cookie"] = "xq_a_token=" + tokens[0]
    self.redis_conn = get_redis_conn()
    # Fetch every relation; switch to zrangebyscore to fetch a sub-range.
    relations = self.redis_conn.zrange(
        RedisKeys.xueqiu_comment_relation,
        start=0, end=-1, desc=False, withscores=True)
    for entry in relations:
        # Member format: user_id&&article_id&&hive_uuid
        parts = entry[0].split("&&")
        self.logger.info("article corelation:" + str(parts))
        user_id, article_id, article_hive_id = parts[0], parts[1], parts[2]
        url = ("https://xueqiu.com/service/comment/list?id=" + article_id +
               "&user_id=" + user_id + "&type=status&sort=false&page=1")
        req = scrapy.Request(url, headers=self.headers, callback=self.parse_comment)
        # Pass identifiers along so parse_comment can map comments back to the article.
        req.meta["article_hive_id"] = article_hive_id
        req.meta["article_id"] = article_id
        req.meta["user_id"] = user_id
        yield req
def process_item(self, item, spider):
    """Label a Dongfang news item, send it to Kafka, then remember the URL in Redis."""
    item["source"] = SpiderSourceName.dongfang
    item["type"] = SpiderSourceCode.dongfang
    item["id"] = shortuuid.uuid()
    item["scope"] = u"新闻"
    # No stock code/name is attached to Dongfang news items.
    item['code'] = ""
    item['name'] = ""
    record = AvroUtils.createAvroMemoryRecord(item, AvroUtils.getNewsSchema())
    self.logger.info("AvroUtils successful!")
    kafka_producer.send(kafkaTopic.news, value=record)
    sleep(1)  # pace outgoing messages
    self.logger.info("send data to kafka, from " + item["source"] + " , url: " + item["url"])
    # Score the crawled URL with its publication date as YYYYMMDD.
    get_redis_conn().zadd(
        RedisKeys.dongfang_url_crawl,
        item["url"],
        item['pub_date'][0:10].replace("-", ""))
    return item
def __init__(self, *a, **kw):
    """Validate the optional ``endDate`` spider argument and open a Redis connection.

    Raises CloseSpider when endDate is not in YYYY-MM-DD form.
    """
    # `in` replaces the deprecated dict.has_key() (removed in Python 3).
    if "endDate" in kw:
        if TimeUtils.isValidEndDate(kw["endDate"]):
            self.endDate = kw["endDate"]
        else:
            self.logger.error(kw["endDate"] + ': error format, must be like 2016-05-15')
            raise CloseSpider(kw["endDate"] + ' error format')
    self.redis_conn = get_redis_conn()
def __init__(self, *a, **kw):
    """Validate the optional ``endDate`` spider argument and open a Redis connection.

    Raises CloseSpider when endDate is not in YYYY-MM-DD form.
    """
    # `in` replaces the deprecated dict.has_key() (removed in Python 3).
    if "endDate" in kw:
        if TimeUtils.isValidEndDate(kw["endDate"]):
            self.endDate = kw["endDate"]
        else:
            self.logger.error(kw["endDate"] + ': error format, must be like 2016-05-15')
            raise CloseSpider(kw["endDate"] + ' error format')
    self.redis_conn = get_redis_conn()
def __init__(self, *a, **kw):
    """Validate optional ``endDate`` and seed start_urls with one Sina news listing page per SHA stock code.

    Raises CloseSpider when endDate is not in YYYY-MM-DD form.
    """
    # `in` replaces the deprecated dict.has_key() (removed in Python 3).
    if "endDate" in kw:
        if TimeUtils.isValidEndDate(kw["endDate"]):
            self.endDate = kw["endDate"]
        else:
            self.logger.error(kw["endDate"] + ': error format, must be like 2016-05-15')
            raise CloseSpider(kw["endDate"] + ' error format')
    self.redis_conn = get_redis_conn()
    # One first-page listing URL per Shanghai A-share code stored in Redis.
    url_tpl = 'http://vip.stock.finance.sina.com.cn/corp/view/vCB_AllNewsStock.php?symbol=sh%s&Page=1'
    for code in self.redis_conn.smembers(RedisKeys.SHAStockCode):
        self.start_urls.append(url_tpl % code)
def __init__(self, *a, **kw):
    """Validate optional ``endDate`` and seed start_urls with one Sina news listing page per SHA stock code.

    Raises CloseSpider when endDate is not in YYYY-MM-DD form.
    """
    # `in` replaces the deprecated dict.has_key() (removed in Python 3).
    if "endDate" in kw:
        if TimeUtils.isValidEndDate(kw["endDate"]):
            self.endDate = kw["endDate"]
        else:
            self.logger.error(kw["endDate"] + ': error format, must be like 2016-05-15')
            raise CloseSpider(kw["endDate"] + ' error format')
    self.redis_conn = get_redis_conn()
    # One first-page listing URL per Shanghai A-share code stored in Redis.
    url_tpl = 'http://vip.stock.finance.sina.com.cn/corp/view/vCB_AllNewsStock.php?symbol=sh%s&Page=1'
    for code in self.redis_conn.smembers(RedisKeys.SHAStockCode):
        self.start_urls.append(url_tpl % code)
def parse(self, response):
    """Pull the Xueqiu access token from the page, then queue a comment-list request for each stored relation."""
    tokens = response.xpath('//script').re('SNB.data.access_token.*\|\| "(.*)";')
    if not tokens:
        self.logger.error("get access_token error")
        return
    self.headers["cookie"] = "xq_a_token=" + tokens[0]
    self.redis_conn = get_redis_conn()
    # Read every relation; use zrangebyscore instead to take a slice.
    relations = self.redis_conn.zrange(
        RedisKeys.xueqiu_comment_relation,
        start=0, end=-1, desc=False, withscores=True)
    for entry in relations:
        # Member format: user_id&&article_id&&hive_uuid
        parts = entry[0].split("&&")
        self.logger.info("article corelation:" + str(parts))
        user_id, article_id, article_hive_id = parts[0], parts[1], parts[2]
        url = ("https://xueqiu.com/service/comment/list?id=" + article_id +
               "&user_id=" + user_id + "&type=status&sort=false&page=1")
        req = scrapy.Request(url, headers=self.headers, callback=self.parse_comment)
        # Carry identifiers forward so parse_comment can link comments to the article.
        req.meta["article_hive_id"] = article_hive_id
        req.meta["article_id"] = article_id
        req.meta["user_id"] = user_id
        yield req
def __init__(self):
    """Acquire a Redis connection and keep it for the lifetime of this instance."""
    self.redis_conn = get_redis_conn()
def __init__(self):
    """Acquire a Redis connection and keep it for the lifetime of this instance."""
    self.redis_conn = get_redis_conn()
#-*- coding: UTF-8 -*- import sys import os from os.path import dirname path = dirname(os.path.abspath(os.path.dirname(__file__))) sys.path.append(path) #print path from scrapy_redis.connection import get_redis_conn conn=get_redis_conn() fp=open(path+'/tools/SHACode.txt') for line in fp.readlines(): text=line.strip() conn.sadd("SHAStockCode",text) print "add success" #print text print 'contents of SHAStockCode' codes=conn.smembers('SHAStockCode') for code in codes: print code