def parse_list(self, response):
    result = json.loads(response.body_as_unicode())
    scope = response.meta['scope']
    nextPage = True
    less_than_endDate_time = 0
    for i in range(20):
        target = result[i]["target"]
        if target == "_blank":
            continue
        item = NewsItem()
        url = "https://xueqiu.com/" + target
        item["url"] = url
        if self.redis_conn.zscore(RedisKeys.xueqiu_url_crawled, url) is not None:
            self.logger.debug('url has been crawled: ' + url)
            continue
        item["scope"] = scope
        item["title"] = result[i]["topic_title"]
        print "title: " + item["title"]
        pub_date = result[i]['timeBefore']  # str
        print "pub_date: " + pub_date
        if pub_date.find(u"今天") != -1:  # e.g. "今天 17:24"
            pub_date = TimeUtils.getCurrentDate() + pub_date[2:] + ":00"
        elif pub_date.find(u"分钟前") != -1:  # e.g. "5分钟前"
            minute_before = pub_date[0:pub_date.find(u"分钟前")]
            pub_date = TimeUtils.getDateSubtractMinutes(int(minute_before))
        else:  # e.g. "07-13 14:08"
            pub_date = TimeUtils.getCurrentYear() + "-" + pub_date + ":00"
        print "pub_date: " + pub_date
        item["pub_date"] = pub_date
        if pub_date < self.endDate:
            less_than_endDate_time += 1
        article_id = str(result[i]["id"])
        user_id = str(result[i]["user_id"])
        item["id"] = user_id + "&&" + article_id + "&&"
        request = scrapy.Request(url, headers=self.headers, callback=self.parse_article)
        request.meta['item'] = item
        request.meta['user_id'] = user_id
        request.meta['id'] = article_id
        yield request
    if less_than_endDate_time > 4:  # require several hits so old posts bumped to the top by edits do not stop the crawl too early
        nextPage = False
    if nextPage is True:
        page = int(response.url[-1]) + 1
        time_stamp = str(time.time())[0:-3]
        url = ("https://xueqiu.com/statuses/topic.json?simple_user=1&filter_text=1"
               "&topicType=0&_=" + time_stamp + "&page=" + str(page))
        request = scrapy.Request(url, headers=self.headers, callback=self.parse_list)
        yield request
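# The scrapy spiders in this collection all lean on a project-local TimeUtils
# helper whose implementation is not included here.  The sketch below is only
# an assumption reconstructed from the call sites (string dates in
# "YYYY-MM-DD HH:MM:SS" form, an epoch-second crawl timestamp); the real
# helper may differ.
import time
from datetime import datetime, timedelta


class TimeUtils(object):

    @staticmethod
    def getCurrentDate():
        # "2016-07-13", prefixed to "今天 17:24" style dates
        return datetime.now().strftime("%Y-%m-%d")

    @staticmethod
    def getCurrentYear():
        # "2016", prefixed to "07-13 14:08" style dates
        return datetime.now().strftime("%Y")

    @staticmethod
    def getDateSubtractMinutes(minutes):
        # full timestamp N minutes in the past, used for "N分钟前"
        return (datetime.now() - timedelta(minutes=minutes)).strftime("%Y-%m-%d %H:%M:%S")

    @staticmethod
    def getCurrentTimeStamp():
        # epoch seconds stored in item['crawl_ts']
        return int(time.time())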
def parse_comment(self, response):
    print "parse_comment, url: " + response.url
    article_hive_id = response.meta["article_hive_id"]
    nextPage = True
    respons_xpath = response.xpath("//div[@class='comment-mod-bd']/div[@*]")
    if len(respons_xpath) == 0:  # last page
        return
    for sel in respons_xpath:
        item = CommentItem()
        comment_id = sel.xpath('@id').extract()[0][8:]  # drop the fixed prefix of the id attribute
        print "comment_id:" + comment_id
        item["id"] = article_hive_id + "&&" + comment_id
        if self.redis_conn.zscore(RedisKeys.xueqiu_comment_crawled, comment_id) is not None:
            nextPage = False
            print "nextPage=False"
            break
        userName = sel.xpath("div[@class='comment-item-bd']/h4/a[@class='name']/text()").extract()[0]
        item["username"] = userName
        comment_content = sel.xpath("div[@class='comment-item-bd']/div[@class='comment-item-content']").extract()[0]  # div[@class='detail']/i
        converter = html2text.HTML2Text()
        converter.ignore_links = True
        comment_content = converter.handle(comment_content)
        item["content"] = comment_content
        comment_pub_date = sel.xpath("div[@class='comment-item-ft']/div[@class='comment-meta']/div[@class='meta-info']/span[@class='time']/text()").extract()[0]
        #print "comment_pub_date original:" + comment_pub_date
        if comment_pub_date.find(u"今天") != -1:  # e.g. "今天 17:24"
            comment_pub_date = TimeUtils.getCurrentDate() + comment_pub_date[2:] + ":00"
        elif comment_pub_date.find(u"分钟前") != -1:  # e.g. "5分钟前"
            minute_before = comment_pub_date[0:comment_pub_date.find(u"分钟前")]
            comment_pub_date = TimeUtils.getDateSubtractMinutes(int(minute_before))
        else:  # e.g. "07-13 14:08"
            comment_pub_date = TimeUtils.getCurrentYear() + "-" + comment_pub_date + ":00"
        item["pub_date"] = comment_pub_date
        item['crawl_ts'] = TimeUtils.getCurrentTimeStamp()
        #print "userName: " + userName
        #print "comment_content: " + comment_content
        print "comment_pub_date: " + comment_pub_date
        self.logger.info("comment_pub_date:" + comment_pub_date)
        yield item
    if nextPage == True:
        page = int(response.url[-1]) + 1
        article_id = response.meta["article_id"]
        user_id = response.meta["user_id"]
        comment_url = ("https://xueqiu.com/service/comment/list?id=" + article_id +
                       "&user_id=" + user_id + "&type=status&sort=false&page=" + str(page))
        request = scrapy.Request(comment_url, headers=self.headers, callback=self.parse_comment)
        request.meta["article_hive_id"] = article_hive_id
        request.meta["article_id"] = article_id
        request.meta["user_id"] = user_id
        yield request
def get_or_create_user_by_email(email):
    defaults = {
        'is_superuser': False,
        'is_staff': False,
        'last_login': TimeUtils.now_ts(),
        'create_time': TimeUtils.now_ts(),
    }
    user, created = User.objects.get_or_create(email=email, defaults=defaults)
    return user
def parse_article(self, response):
    item = response.meta['item']
    item['crawl_ts'] = TimeUtils.getCurrentTimeStamp()
    # retrieve document body
    converter = html2text.HTML2Text()
    converter.ignore_links = True
    raw = response.xpath('//*[@id="artibody"]').extract()[0]
    if raw.find(u"原始正文start") != -1:
        # skip past the "原始正文start" marker embedded before the real body
        real_content_start = raw.find(u"原始正文start") + 13
        raw = raw[real_content_start:].strip()
    content = converter.handle(raw)
    item['content'] = content
    # retrieve source
    src_raw = response.xpath('//span[@class="time-source"]').extract()[0]
    src_txt = converter.handle(src_raw).strip()
    source = src_txt.split(" ", 1)[1]
    item['article_source'] = source
    pub_date = str(datetime.fromtimestamp(mktime(
        time.strptime(src_txt.replace(u"\xa0", "").replace(" ", "")[:16],
                      u"%Y年%m月%d日%H:%M"))))
    item['pub_date'] = pub_date
    self.logger.info('pub_date: ' + item['pub_date'])
    yield item
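# Quick illustration of the pub_date conversion above, using a made-up
# time-source string of the shape the Sina pages carry; the exact markup
# varies, so treat this only as a format check.
import time
from time import mktime
from datetime import datetime

src_txt = u"2016年07月13日14:08  新浪财经"
cleaned = src_txt.replace(u"\xa0", "").replace(" ", "")[:16]  # "2016年07月13日14:08"
pub_date = str(datetime.fromtimestamp(mktime(time.strptime(cleaned, u"%Y年%m月%d日%H:%M"))))
# pub_date == "2016-07-13 14:08:00"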
def parse(self, response):
    nextPage = True
    for sel in response.xpath('//div[@class="list"]/ul/li'):
        item = NewsItem()
        url = sel.xpath('a/@href').extract()[0]
        title = sel.xpath('a/text()').extract()[0]
        pub_time = sel.xpath('span/text()').extract()[0] + ':00'
        if pub_time < self.endDate:
            nextPage = False
            break
        item['url'] = url
        item['title'] = title
        item['pub_date'] = pub_time
        item['crawl_ts'] = TimeUtils.getCurrentTimeStamp()
        self.logger.debug("get article time %s" % pub_time)
        request = scrapy.Request(url, callback=self.parse_article)
        request.meta['item'] = item
        yield request
    if nextPage is True:
        # the page number sits between "_" and the trailing ".html"
        page = int(response.url[response.url.find("_") + 1:len(response.url) - 5]) + 1
        url = 'http://finance.eastmoney.com/news/ccjdd_' + str(page) + '.html'
        self.logger.debug("get nextPage url %s" % url)
        request = scrapy.Request(url, callback=self.parse)
        yield request
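# The page arithmetic above works because the eastmoney list URLs end in
# "_<page>.html"; slicing from the underscore to the last five characters
# (".html") isolates the current page number, e.g.:
url = 'http://finance.eastmoney.com/news/ccjdd_3.html'
next_page = int(url[url.find("_") + 1:len(url) - 5]) + 1  # 3 -> 4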
def update_course_score(course_id, score):
    course = Course.objects.filter(id=course_id).first()
    if not course:
        log.error("update_course_score_course_not_found|course_id={}".format(course_id))
        return False
    review_count = course.review_count
    rate = review_count / (review_count + 1)
    course.recommend_score = course.recommend_score * rate + score["recommend_score"] / (review_count + 1)
    course.content_score = course.content_score * rate + score["content_score"] / (review_count + 1)
    course.work_score = course.work_score * rate + score["work_score"] / (review_count + 1)
    course.exam_score = course.exam_score * rate + score["exam_score"] / (review_count + 1)
    course.review_count = review_count + 1
    course.last_review = TimeUtils.now_ts()
    try:
        course.save()
        return True
    except Exception as e:
        log.error("update_course_score_exception|course_id={},exception={}".format(course_id, e))
        return False
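# update_course_score keeps a running mean: with n existing reviews,
#   new_avg = old_avg * n / (n + 1) + new_score / (n + 1)
#           = (old_avg * n + new_score) / (n + 1)
# Note this relies on true division (assuming Python 3 here); under integer
# division, review_count / (review_count + 1) would truncate to 0.
# A tiny sanity check of the equivalence:
old_avg, n, new_score = 4.0, 3, 5.0
rate = n / (n + 1)
assert abs((old_avg * rate + new_score / (n + 1)) - (old_avg * n + new_score) / (n + 1)) < 1e-9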
def parse_article(self, response):
    item = response.meta["item"]
    item["crawl_ts"] = TimeUtils.getCurrentTimeStamp()
    # some articles have no title in the body, so the title from the list page is kept
    #title = response.xpath("//div[@class='status-content']/h4[@class='status-title']/text()").extract()[0]
    #print "title: " + title
    # retrieve content
    converter = html2text.HTML2Text()
    converter.ignore_links = True
    raw = response.xpath("//div[@class='status-content']/div[@class='detail']/text()").extract()[0]
    content = converter.handle(raw)
    item["content"] = content
    print "content: " + content
    # retrieve source
    src_raw = response.xpath("//div[@class='subtitle']/span[@class='source']/text()").extract()[0]
    src_txt = converter.handle(src_raw).strip()
    source = src_txt[2:]
    item["article_source"] = source
    print "article_source: " + source
    yield item
def generate_token(user):
    token = encode_jwt({
        "user_id": user.id,
        "email": user.email,
        "last_login": user.last_login,
        "expiry": TimeUtils.now_ts() + TimeUtils.DAY * 7,
    })
    return token
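# encode_jwt is not defined in this collection.  One plausible sketch, assuming
# PyJWT and an HS256 secret (both are assumptions, not the project's actual code):
import jwt  # PyJWT

JWT_SECRET = "change-me"  # hypothetical; normally read from settings


def encode_jwt(payload):
    return jwt.encode(payload, JWT_SECRET, algorithm="HS256")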
def interact_review(review_id, action, user_id):
    ReviewInteract.objects.update_or_create(
        review_id=review_id,
        create_by=user_id,
        defaults={
            "action": action,
            "create_time": TimeUtils.now_ts(),
        })
def __init__(self, *a, **kw):
    if kw.has_key("endDate"):
        if TimeUtils.isValidEndDate(kw["endDate"]):
            self.endDate = kw["endDate"]
        else:
            self.logger.error(kw["endDate"] + ': invalid format, must be like 2016-05-15')
            raise CloseSpider(kw["endDate"] + ': invalid endDate format')
    self.redis_conn = get_redis_conn()
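# TimeUtils.isValidEndDate is not shown here; judging from the error message
# ("must be like 2016-05-15") it validates a YYYY-MM-DD string.  A plausible
# stand-alone sketch:
from datetime import datetime


def isValidEndDate(date_str):
    try:
        datetime.strptime(date_str, "%Y-%m-%d")
        return True
    except ValueError:
        return False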
def upload_file(request):
    try:
        pi_img = request.FILES.get('pi_img')
        if pi_img:
            dest = '{0}/{1}.jpg'.format(settings.PI_IMG_STORE_PATH,
                                        TimeUtils.int_time_to_str_format())
            with open(dest, "wb+") as destination:
                for chunk in pi_img.chunks():
                    destination.write(chunk)
            return JsonResponse({'result': 'upload_success'})
    except Exception, e:
        print 'upload_file Exception:', e
def get_context_data(self, **kwargs):
    context = super().get_context_data(**kwargs)
    attach_email = models.Emails.objects.filter(
        ~Q(attaches='') & ~Q(sender='*****@*****.**'))
    # total
    context['totals'] = attach_email.filter(attaches__isnull=False).count()
    # this week
    thisweek = TimeUtils().get_this_week_start()
    context['thisweek'] = attach_email.filter(send_date__gte=thisweek).count()
    # this month
    thismonth = TimeUtils().get_this_month_start()
    context['thismonth'] = attach_email.filter(send_date__gte=thismonth).count()
    return context
def __init__(self, *a, **kw):
    if kw.has_key("endDate"):
        if TimeUtils.isValidEndDate(kw["endDate"]):
            self.endDate = kw["endDate"]
        else:
            self.logger.error(kw["endDate"] + ': invalid format, must be like 2016-05-15')
            raise CloseSpider(kw["endDate"] + ': invalid endDate format')
    self.redis_conn = get_redis_conn()
    #if not self.redis_conn.exists('sina_individual_stock:requests'):
    #    print "set start urls"
    #    self.start_urls = [
    #        "http://vip.stock.finance.sina.com.cn/corp/go.php/vCB_AllNewsStock/symbol/sh600000.phtml",
    #        "http://vip.stock.finance.sina.com.cn/corp/go.php/vCB_AllNewsStock/symbol/sh600004.phtml"
    #    ]
    sha_stock_codes = self.redis_conn.smembers(RedisKeys.SHAStockCode)
    for code in sha_stock_codes:
        url = ('http://vip.stock.finance.sina.com.cn/corp/view/vCB_AllNewsStock.php?symbol=sh%s&Page=1' % code)
        self.start_urls.append(url)
def create_review(course_id, teacher_id, class_id, title, content, score, create_by):
    try:
        with transaction.atomic():
            review = Review.objects.create(course_id=course_id,
                                           class_id=class_id,
                                           teacher_id=teacher_id,
                                           title=title,
                                           content=content,
                                           **score,
                                           create_by=create_by,
                                           create_time=TimeUtils.now_ts())
            if not review:
                return False
            if not course_manager.update_course_score(course_id, score):
                raise IntegrityError
            if not class_manager.update_class_score(class_id, score):
                raise IntegrityError
            if not teacher_manager.update_teacher_score(teacher_id, score):
                raise IntegrityError
            return True
    except IntegrityError as e:
        print(e)
        return False
def parse(self, response):
    result = re.findall(r'type=(\d+)', response.url)
    flag = result[0]
    jsonresponse = json.loads(response.body_as_unicode())
    try:
        if self.endDate is None:
            # only crawl the latest day
            historyOpinion = jsonresponse["data"]["historyOpinion"][0]
            latest_date = historyOpinion['opinionTime']
            self.logger.debug("latest_date: " + latest_date)
            self.logger.debug("current_date: " + self.current_date)
            latest_date = latest_date.encode('UTF-8', 'ignore')
            if latest_date != self.current_date:
                current_time = str(time.strftime("%Y%m%d %H:%M:%S", time.localtime()))
                self.logger.debug("there is no data in current time: " + current_time)
                return
            self.logger.debug("start to crawl date: " + latest_date)
            stocks = historyOpinion['hotSearchOpinionDetail']
            for i in range(5):
                item = BaiduStockOpinionItem()
                item['pub_date'] = latest_date
                item['batch'] = int(self.batch)
                item['code'] = stocks[i]['stockCode']
                item['name'] = stocks[i]['stockName']
                rankString = stocks[i]['showtext']  # e.g. "No.1"
                item['rank'] = int(rankString.split('.')[1])
                opinionKeywords = stocks[i]['opinionKeywords']
                item['keywords'] = ",".join(opinionKeywords)
                item['flag'] = int(flag)
                item['crawl_ts'] = TimeUtils.getCurrentTimeStamp()
                yield item
        else:
            for historyOpinion in jsonresponse["data"]["historyOpinion"]:
                latest_date = historyOpinion['opinionTime']
                if self.endDate <= latest_date and \
                        self.redis_conn.zscore(RedisKeys.baidu_opinion_crawled + flag, latest_date) is None:
                    # not yet recorded in Redis, so crawl this date
                    self.logger.info("start to crawl date: " + latest_date)
                    stocks = historyOpinion['hotSearchOpinionDetail']
                    for i in range(5):
                        item = BaiduStockOpinionItem()
                        item['pub_date'] = latest_date
                        item['batch'] = int(self.batch)
                        item['code'] = stocks[i]['stockCode']
                        item['name'] = stocks[i]['stockName']
                        rankString = stocks[i]['showtext']  # e.g. "No.1"
                        item['rank'] = int(rankString.split('.')[1])
                        opinionKeywords = stocks[i]['opinionKeywords']
                        item['keywords'] = ",".join(opinionKeywords)
                        item['flag'] = int(flag)
                        item['crawl_ts'] = TimeUtils.getCurrentTimeStamp()
                        yield item
                elif self.endDate <= latest_date and \
                        self.redis_conn.zscore(RedisKeys.baidu_opinion_crawled + flag, latest_date) is not None:
                    continue
                else:
                    self.nextPage = False
                    break
            if self.nextPage:
                page = int(response.url[-1]) + 1
                url = response.url[0:-1] + str(page)
                request = scrapy.Request(url, callback=self.parse)
                yield request
    except KeyError as e:
        self.logger.error("exception is %s" % e)
def import_courses(course_type):
    filename = os.path.join(os.getcwd(), COURSE_DATA_DIR, course_type + '.json')
    with open(filename, "r", encoding='utf8') as f:
        courses = json.load(f)
    course_objs = []
    teacher_names = []
    for course in courses:
        school_id = SchoolEnum.get(course['school'])
        if school_id is None:
            print("No school_id for {}".format(course['school']))
            continue
        type_name = COURSE_TYPE_MAP[course_type]
        if course_type == 'english':
            type_name = type_name + course['level']
        elif course_type == 'trans_choice':
            type_name = type_name + course['type']
        type_id = CourseTypeEnum.get(type_name)
        if type_id is None:
            print("No type_id for {}".format(type_name))
            continue
        credit = course['credits'] if course_type in ['english', 'trans_choice'] else course['credit']
        teacher_names.extend(course['teachers'])
        course_obj = Course(
            name=course['name'],
            course_no=course['id'],
            credit=credit,
            school_id=school_id,
            type=type_id,
            review_count=0,
            last_review=0,
            create_time=TimeUtils.now_ts(),
        )
        course_objs.append(course_obj)
    print("bulk_create courses {}".format(course_type))
    Course.objects.bulk_create(course_objs)
    print("bulk_create courses finished")
    teacher_names = list(set(teacher_names))
    created_names = set(Teacher.objects.filter(name__in=teacher_names).values_list("name", flat=True))
    teacher_objs = [
        Teacher(name=name, review_count=0, create_time=TimeUtils.now_ts())
        for name in teacher_names if name not in created_names
    ]
    print("bulk_create teachers")
    Teacher.objects.bulk_create(teacher_objs)
    print("bulk_create teachers finished")
    class_objs = []
    for course in courses:
        school_id = SchoolEnum.get(course['school'])
        course_id = Course.objects.filter(name=course['name'], school_id=school_id).first().id
        teachers = list(Teacher.objects.filter(name__in=course['teachers']))
        for teacher in teachers:
            class_objs.append(Class(
                course_id=course_id,
                teacher_id=teacher.id,
                semester="20-21-2",
                review_count=0,
                create_time=TimeUtils.now_ts()
            ))
    print("bulk_create classes")
    Class.objects.bulk_create(class_objs)
    print("bulk_create classes finished")