Example 1
    def parse_list(self, response):
        result = json.loads(response.body_as_unicode())
        scope = response.meta['scope']
        nextPage = True
        less_than_endDate_time = int(0)
        for i in range(20):
            target = result[i]["target"]
            if target == "_blank":
                continue

            item = NewsItem()
            url = "https://xueqiu.com/" + target
            item["url"] = url
            if self.redis_conn.zscore(RedisKeys.xueqiu_url_crawled,
                                      url) is not None:
                self.logger.debug('url already crawled: ' + url)
                continue

            item["scope"] = scope
            item["title"] = result[i]["topic_title"]
            print "title: " + item["title"]
            pub_date = result[i]['timeBefore']  #str
            print "pub_date: " + pub_date
            if pub_date.find(u"今天") != -1:  # e.g. "今天 17:24" (today HH:MM)
                pub_date = TimeUtils.getCurrentDate() + pub_date[2:] + ":00"
            elif pub_date.find(u"分钟前") != -1:
                minute_before = pub_date[0:pub_date.find(u"分钟前")]
                pub_date = TimeUtils.getDateSubtractMinutes(int(minute_before))
            else:  # e.g. "07-13 14:08" (MM-DD HH:MM)
                pub_date = TimeUtils.getCurrentYear() + "-" + pub_date + ":00"
            print "pub_date: " + pub_date
            item["pub_date"] = pub_date
            if pub_date < self.endDate:
                less_than_endDate_time += 1

            article_id = str(result[i]["id"])
            user_id = str(result[i]["user_id"])
            item["id"] = user_id + "&&" + article_id + "&&"
            request = scrapy.Request(url,
                                     headers=self.headers,
                                     callback=self.parse_article)
            request.meta['item'] = item
            request.meta['user_id'] = user_id
            request.meta['id'] = article_id
            yield request

        if less_than_endDate_time > 4:  # need more than 4 old items: edited old posts can resurface at the front
            nextPage = False

        if nextPage is True:
            page = int(response.url[-1]) + 1
            time_stamp = str(time.time())[0:-3]
            url = "https://xueqiu.com/statuses/topic.json?simple_user=1&filter_text=1&topicType=0&_=" + time_stamp + "&page=" + str(
                page)
            request = scrapy.Request(url,
                                     headers=self.headers,
                                     callback=self.parse_list)
            yield request
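The spiders in these examples rely on a shared TimeUtils helper whose implementation is not included here. As a rough guide to what the calls above expect, here is a minimal sketch under the assumption that the helpers simply wrap datetime/time; the project's real module may differ:

# Hypothetical sketch of the TimeUtils helpers used by the spiders; not the project's actual code.
import time
from datetime import datetime, timedelta


class TimeUtils(object):
    @staticmethod
    def getCurrentDate():
        return datetime.now().strftime("%Y-%m-%d")    # e.g. "2016-07-13"

    @staticmethod
    def getCurrentYear():
        return datetime.now().strftime("%Y")          # e.g. "2016"

    @staticmethod
    def getDateSubtractMinutes(minutes):
        # "N分钟前" (N minutes ago) -> "YYYY-MM-DD HH:MM:SS"
        then = datetime.now() - timedelta(minutes=minutes)
        return then.strftime("%Y-%m-%d %H:%M:%S")

    @staticmethod
    def getCurrentTimeStamp():
        return int(time.time())                       # seconds since epoch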
Example 3
def get_or_create_user_by_email(email):
    defaults = {
        'is_superuser': False,
        'is_staff': False,
        'last_login': TimeUtils.now_ts(),
        'create_time': TimeUtils.now_ts(),
    }
    user, created = User.objects.get_or_create(email=email, defaults=defaults)

    return user
Example 5
    def parse_article(self, response):
        item = response.meta['item']
        item['crawl_ts'] = TimeUtils.getCurrentTimeStamp()
        # retrieve document body
        converter = html2text.HTML2Text()
        converter.ignore_links = True
        raw = response.xpath('//*[@id="artibody"]').extract()[0]
        if raw.find(u"原始正文start") != -1:
            real_content_start = raw.find(u"原始正文start") + 13
            raw = raw[real_content_start:].strip()

        content = converter.handle(raw)
        item['content'] = content

        # retrieve source
        src_raw = response.xpath('//span[@class="time-source"]').extract()[0]
        src_txt = converter.handle(src_raw).strip()
        source = src_txt.split(" ", 1)[1]
        item['article_source'] = source
        pub_date = str(
            datetime.fromtimestamp(
                mktime(
                    time.strptime(
                        src_txt.replace(u"\xa0", "").replace(" ", "")[:16],
                        u"%Y年%m月%d日%H:%M"))))
        item['pub_date'] = pub_date
        self.logger.info('pub_date: ' + item['pub_date'])
        yield item
Example 6
    def parse(self, response):

        nextPage = True

        for sel in response.xpath('//div[@class="list"]/ul/li'):
            item = NewsItem()
            url = sel.xpath('a/@href').extract()[0]
            title = sel.xpath('a/text()').extract()[0]
            time = sel.xpath('span/text()').extract()[0] + ':00'
            if time < self.endDate:
                nextPage = False
                break
            item['url'] = url
            item['title'] = title
            item['pub_date'] = time
            item['crawl_ts'] = TimeUtils.getCurrentTimeStamp()

            self.logger.debug("get article time %s" % time)

            request = scrapy.Request(url, callback=self.parse_article)
            request.meta['item'] = item

            yield request

        if nextPage is True:
            page = int(response.url[response.url.find("_") +
                                    1:len(response.url) - 5]) + 1
            url = ('http://finance.eastmoney.com/news/ccjdd_' + str(page) +
                   '.html')

            self.logger.debug("get nextPage url %s" % url)
            request = scrapy.Request(url, callback=self.parse)
            yield request
Example 8
def update_course_score(course_id, score):
    course = Course.objects.filter(id=course_id).first()
    if not course:
        log.error("update_course_score_course_not_found|course_id={}".format(course_id))
        return False
    review_count = course.review_count
    rate = review_count / (review_count + 1)
    course.recommend_score = course.recommend_score * rate + score[
        "recommend_score"] / (review_count + 1)
    course.content_score = course.content_score * rate + score[
        "content_score"] / (review_count + 1)
    course.work_score = course.work_score * rate + score["work_score"] / (
        review_count + 1)
    course.exam_score = course.exam_score * rate + score["exam_score"] / (
        review_count + 1)
    course.review_count = review_count + 1
    course.last_review = TimeUtils.now_ts()
    try:
        course.save()
        return True
    except Exception as e:
        log.error(
            "update_course_score_exception|course_id={},exception={}".format(
                course_id, e))
        return False
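The update above is an incremental (running) average: with review_count existing reviews, each stored score a becomes a * n/(n+1) + x/(n+1), which is exactly the mean over all n+1 reviews. A quick check of the arithmetic (plain Python, nothing project-specific):

# Incremental-average check: scores [4, 5] (mean 4.5) updated with a new score of 2.
n, old_avg, new_score = 2, 4.5, 2
rate = n / (n + 1)
updated = old_avg * rate + new_score / (n + 1)
print(updated)            # 3.666...
print((4 + 5 + 2) / 3)    # same value, computed directly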
Example 9
    def parse_article(self, response):
        item = response.meta["item"]
        item["crawl_ts"] = TimeUtils.getCurrentTimeStamp()
        # some articles have no title in the content, so the title extraction below stays commented out
        #title = response.xpath("//div[@class='status-content']/h4[@class='status-title']/text()").extract()[0]
        #print "title: "+title

        #retrieve content
        converter = html2text.HTML2Text()
        converter.ignore_links = True
        raw = response.xpath(
            "//div[@class='status-content']/div[@class='detail']/text()"
        ).extract()[0]
        content = converter.handle(raw)
        item["content"] = content
        print "content: " + content

        # retrieve source
        src_raw = response.xpath(
            "//div[@class='subtitle']/span[@class='source']/text()").extract(
            )[0]
        src_txt = converter.handle(src_raw).strip()
        source = src_txt[2:]
        item["article_source"] = source
        print "article_source: " + source

        yield item
Example 10
def generate_token(user):
    token = encode_jwt({
        "user_id": user.id,
        "email": user.email,
        "last_login": user.last_login,
        "expiry": TimeUtils.now_ts() + TimeUtils.DAY * 7
    })
    return token
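encode_jwt is project code that is not shown in these examples. A plausible minimal version, assuming it wraps PyJWT with a server-side secret (the SECRET_KEY name and the HS256 algorithm below are assumptions, not taken from the source), might look like:

# Hypothetical encode_jwt wrapper around PyJWT; SECRET_KEY and HS256 are assumptions.
import jwt  # pip install PyJWT

SECRET_KEY = "replace-with-a-real-secret"


def encode_jwt(payload):
    # PyJWT 2.x returns a str token; 1.x returns bytes.
    return jwt.encode(payload, SECRET_KEY, algorithm="HS256")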
Example 11
def interact_review(review_id, action, user_id):
    ReviewInteract.objects.update_or_create(review_id=review_id,
                                            create_by=user_id,
                                            defaults={
                                                "action": action,
                                                "create_time":
                                                TimeUtils.now_ts(),
                                            })
Example 13
    def __init__(self, *a, **kw):
        if "endDate" in kw:
            if TimeUtils.isValidEndDate(kw["endDate"]):
                self.endDate = kw["endDate"]
            else:
                self.logger.error(kw["endDate"] +
                                  ': invalid format, must be like 2016-05-15')
                raise CloseSpider(kw["endDate"] + ' invalid format')

        self.redis_conn = get_redis_conn()
Example 15
def upload_file(request):
    try:
        pi_img = request.FILES.get('pi_img')
        if pi_img:
            dest = '{0}/{1}.jpg'.format(settings.PI_IMG_STORE_PATH,
                                        TimeUtils.int_time_to_str_format())
            with open(dest, "wb+") as destination:
                for chunk in pi_img.chunks():
                    destination.write(chunk)
            return JsonResponse({'result': 'upload_success'})
    except Exception, e:
        print 'upload_file Exception:', e
    # a view must always return an HttpResponse; report the failure explicitly
    return JsonResponse({'result': 'upload_failed'})
Example 16
    def get_context_data(self, **kwargs):
        context = super().get_context_data(**kwargs)

        attach_email = models.Emails.objects.filter(
            ~Q(attaches='') & ~Q(sender='*****@*****.**'))

        # total count
        context['totals'] = attach_email.filter(attaches__isnull=False).count()

        # this week
        thisweek = TimeUtils().get_this_week_start()
        context['thisweek'] = attach_email.filter(
            send_date__gte=thisweek).count()

        # this month
        thismonth = TimeUtils().get_this_month_start()
        context['thismonth'] = attach_email.filter(
            send_date__gte=thismonth).count()

        return context
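The TimeUtils used in this view is a different helper from the spiders' one: it is instantiated and exposes week/month-start methods. A minimal sketch, assuming both methods return naive datetimes at midnight of the current week's Monday and of the first day of the current month:

# Hypothetical sketch of the week/month helpers used above.
from datetime import datetime, timedelta


class TimeUtils:
    def get_this_week_start(self):
        today = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
        return today - timedelta(days=today.weekday())   # Monday 00:00 of this week

    def get_this_month_start(self):
        return datetime.now().replace(day=1, hour=0, minute=0,
                                      second=0, microsecond=0)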
Example 18
    def __init__(self, *a, **kw):
        if "endDate" in kw:
            if TimeUtils.isValidEndDate(kw["endDate"]):
                self.endDate = kw["endDate"]
            else:
                self.logger.error(kw["endDate"] +
                                  ': invalid format, must be like 2016-05-15')
                raise CloseSpider(kw["endDate"] + ' invalid format')

        self.redis_conn = get_redis_conn()
        #if not self.redis_conn.exists('sina_individual_stock:requests'):
        #    print "set start urls"
        #    self.start_urls = [
        #         "http://vip.stock.finance.sina.com.cn/corp/go.php/vCB_AllNewsStock/symbol/sh600000.phtml",
        #     "http://vip.stock.finance.sina.com.cn/corp/go.php/vCB_AllNewsStock/symbol/sh600004.phtml"
        #     ]
        sha_stock_codes = self.redis_conn.smembers(RedisKeys.SHAStockCode)
        for code in sha_stock_codes:
            url = (
                'http://vip.stock.finance.sina.com.cn/corp/view/vCB_AllNewsStock.php?symbol=sh%s&Page=1'
                % code)
            self.start_urls.append(url)
Example 21
def create_review(course_id, teacher_id, class_id, title, content, score,
                  create_by):
    try:
        with transaction.atomic():
            review = Review.objects.create(course_id=course_id,
                                           class_id=class_id,
                                           teacher_id=teacher_id,
                                           title=title,
                                           content=content,
                                           **score,
                                           create_by=create_by,
                                           create_time=TimeUtils.now_ts())
            if not review:
                return False
            if not course_manager.update_course_score(course_id, score):
                raise IntegrityError
            if not class_manager.update_class_score(class_id, score):
                raise IntegrityError
            if not teacher_manager.update_teacher_score(teacher_id, score):
                raise IntegrityError
            return True
    except IntegrityError as e:
        print(e)
        return False
Example 22
    def parse(self, response):
        result = re.findall(r'type=(\d+)', response.url)
        flag = result[0]

        jsonresponse = json.loads(response.body_as_unicode())
        try:
            if self.endDate is None:
                historyOpinion = jsonresponse["data"]["historyOpinion"][
                    0]  #only get current day
                latest_date = historyOpinion['opinionTime']
                self.logger.debug("latest_date: " + latest_date)
                self.logger.debug("current_date: " + self.current_date)
                latest_date = latest_date.encode('UTF-8', 'ignore')
                if latest_date != self.current_date:
                    current_time = str(
                        time.strftime("%Y%m%d  %H:%M:%S", time.localtime()))
                    self.logger.debug("there is no data in current time: " +
                                      current_time)
                    return

                self.logger.debug("start to crawl date: " + latest_date)
                stocks = historyOpinion['hotSearchOpinionDetail']
                for i in range(5):
                    item = BaiduStockOpinionItem()
                    item['pub_date'] = latest_date
                    item['batch'] = int(self.batch)
                    item['code'] = stocks[i]['stockCode']
                    item['name'] = stocks[i]['stockName']
                    rankString = stocks[i]['showtext']  # No.1
                    item['rank'] = int(rankString.split('.')[1])
                    opinionKeywords = stocks[i]['opinionKeywords']
                    item['keywords'] = ",".join(opinionKeywords)
                    item['flag'] = int(flag)
                    item['crawl_ts'] = TimeUtils.getCurrentTimeStamp()
                    yield item
            else:
                for historyOpinion in jsonresponse["data"]["historyOpinion"]:
                    latest_date = historyOpinion['opinionTime']
                    #print "test in redis"
                    if self.endDate <= latest_date and self.redis_conn.zscore(RedisKeys.baidu_opinion_crawled+flag, \
                        latest_date) is None:  # not yet recorded in redis as crawled
                        #print "crawl Date: "+latest_date
                        self.logger.info("start to crawl date: " + latest_date)
                        stocks = historyOpinion['hotSearchOpinionDetail']
                        for i in range(5):
                            item = BaiduStockOpinionItem()
                            item['pub_date'] = latest_date
                            item['batch'] = int(self.batch)
                            item['code'] = stocks[i]['stockCode']
                            item['name'] = stocks[i]['stockName']
                            rankString = stocks[i]['showtext']  # No.1
                            item['rank'] = int(rankString.split('.')[1])
                            opinionKeywords = stocks[i]['opinionKeywords']
                            item['keywords'] = ",".join(opinionKeywords)
                            item['flag'] = int(flag)
                            item['crawl_ts'] = TimeUtils.getCurrentTimeStamp()
                            yield item
                    elif self.endDate <= latest_date and self.redis_conn.zscore(RedisKeys.baidu_opinion_crawled+flag, \
                        latest_date) is not None:
                        continue
                    else:
                        self.nextPage = False
                        break

                if self.nextPage:
                    page = int(response.url[-1]) + 1
                    url = response.url[0:-1] + str(page)
                    request = scrapy.Request(url, callback=self.parse)
                    yield request
        except KeyError as e:
            self.logger.error("exception is %s" % e)
Example 23
    def parse_comment(self, response):
        print "parse_comment, url: " + response.url
        article_hive_id = response.meta["article_hive_id"]
        nextPage = True

        respons_xpath = response.xpath(
            "//div[@class='comment-mod-bd']/div[@*]")
        if len(respons_xpath) == 0:  #last page
            return

        for sel in respons_xpath:
            item = CommentItem()
            comment_id = sel.xpath('@id').extract()[0][8:]
            print "comment_id:" + comment_id
            item["id"] = article_hive_id + "&&" + comment_id

            if self.redis_conn.zscore(RedisKeys.xueqiu_comment_crawled,
                                      comment_id) is not None:
                nextPage = False
                print "nextPage=False"
                break

            userName = sel.xpath(
                "div[@class='comment-item-bd']/h4/a[@class='name']/text()"
            ).extract()[0]
            item["username"] = userName
            comment_content = sel.xpath(
                "div[@class='comment-item-bd']/div[@class='comment-item-content']"
            ).extract()[0]  #div[@class='detail']/i
            converter = html2text.HTML2Text()
            converter.ignore_links = True
            comment_content = converter.handle(comment_content)
            item["content"] = comment_content

            comment_pub_date = sel.xpath(
                "div[@class='comment-item-ft']/div[@class='comment-meta']/div[@class='meta-info']/span[@class='time']/text()"
            ).extract()[0]
            #print "comment_pub_date original:"+comment_pub_date
            if comment_pub_date.find(u"今天") != -1:  # e.g. "今天 17:24" (today HH:MM)
                comment_pub_date = TimeUtils.getCurrentDate(
                ) + comment_pub_date[2:] + ":00"
            elif comment_pub_date.find(u"分钟前") != -1:
                minute_before = comment_pub_date[0:comment_pub_date.find(u"分钟前"
                                                                         )]
                comment_pub_date = TimeUtils.getDateSubtractMinutes(
                    int(minute_before))
            else:  # e.g. "07-13 14:08" (MM-DD HH:MM)
                comment_pub_date = TimeUtils.getCurrentYear(
                ) + "-" + comment_pub_date + ":00"

            item["pub_date"] = comment_pub_date
            item['crawl_ts'] = TimeUtils.getCurrentTimeStamp()

            #print "userName: " + userName + ", comment_content: " + comment_content
            print "comment_pub_date: " + comment_pub_date
            self.logger.info("comment_pub_date:" + comment_pub_date)

            yield item

        if nextPage == True:
            page = int(response.url[-1]) + 1
            article_id = response.meta["article_id"]
            user_id = response.meta["user_id"]
            comment_url = "https://xueqiu.com/service/comment/list?id=" + article_id + "&user_id=" + user_id + "&type=status&sort=false&page=" + str(
                page)
            request = scrapy.Request(comment_url,
                                     headers=self.headers,
                                     callback=self.parse_comment)
            request.meta["article_hive_id"] = article_hive_id
            request.meta["article_id"] = article_id
            request.meta["user_id"] = user_id
            yield request
Example 25
def import_courses(course_type):
	filename = os.path.join(os.getcwd(), COURSE_DATA_DIR, course_type + '.json')
	with open(filename, "r", encoding='utf8') as f:
		courses = json.load(f)
	course_objs = []
	teacher_names = []
	for course in courses:
		school_id = SchoolEnum.get(course['school'])
		if school_id is None:
			print("No school_id for {}".format(course['school']))
			continue
		type_name = COURSE_TYPE_MAP[course_type]
		if course_type == 'english':
			type_name = type_name + course['level']
		elif course_type == 'trans_choice':
			type_name = type_name + course['type']
		type_id = CourseTypeEnum.get(type_name)
		if type_id is None:
			print("No type_id for {}".format(type_name))
			continue
		credit = course['credits'] if course_type in ['english', 'trans_choice'] else course['credit']
		teacher_names.extend(course['teachers'])
		course_obj = Course(
			name=course['name'],
			course_no=course['id'],
			credit=credit,
			school_id=school_id,
			type=type_id,
			review_count=0,
			last_review=0,
			create_time=TimeUtils.now_ts(),
		)
		course_objs.append(course_obj)
	print("bulk_create courses {}".format(course_type))
	Course.objects.bulk_create(course_objs)
	print("bulk_create courses finished")

	teacher_names = list(set(teacher_names))
	created_names = set(Teacher.objects.filter(name__in=teacher_names).values_list("name", flat=True))
	teacher_objs = [
		Teacher(
			name=name,
			review_count=0,
			create_time=TimeUtils.now_ts()
		)
		for name in teacher_names if name not in created_names
	]
	print("bulk_create teachers")
	Teacher.objects.bulk_create(teacher_objs)
	print("bulk_create teachers finished")

	class_objs = []

	for course in courses:
		school_id = SchoolEnum.get(course['school'])
		course_row = Course.objects.filter(name=course['name'], school_id=school_id).first()
		if course_row is None:
			# course was skipped during creation (unknown school or type), so no classes to add
			continue
		course_id = course_row.id
		teachers = list(Teacher.objects.filter(name__in=course['teachers']))
		for teacher in teachers:
			class_objs.append(Class(
				course_id=course_id,
				teacher_id=teacher.id,
				semester="20-21-2",
				review_count=0,
				create_time=TimeUtils.now_ts()
			))
	print("bulk_create classes")
	Class.objects.bulk_create(class_objs)
	print("bulk_create classes finished")