def create_product_to_sell(self, product_params):
    """ create a product and save into db """
    try:
        # change the size of the image
        big_img, small_img = change_img_size(product_params["image"])
        product_to_sell = ProductToSell(
            owner=product_params["owner"],
            product_name=product_params["productname"],
            broad_type=product_params["broadtype"],
            sub_type=product_params["subtype"],
            belong_campus=product_params["belongcampus"],
            trade_type=product_params["tradetype"],
            purity=product_params["purity"],
            big_img=big_img,
            small_img=small_img,
            price=product_params["price"],
            trade_title=product_params["tradetitle"],
            trade_detail=product_params["tradedetail"],
            release_time=product_params["releasetime"],
        )
        product_to_sell.save()
        # the nid depends on the auto-generated primary key, so save twice
        nid = create_nid(product_to_sell.id)
        product_to_sell.nid = nid
        product_to_sell.save()
        return product_to_sell
    except Exception as e:
        print e
        return None
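# --- Usage sketch (not part of the original module) -----------------------
# A minimal illustration of how create_product_to_sell might be called from
# a Django view.  The dao object, the view name and the form-field handling
# are assumptions; only the product_params keys come from the method above.
import datetime


def post_product_for_sale(request, dao):
    product_params = {
        "owner": request.user,
        "productname": request.POST.get("productname", ""),
        "broadtype": request.POST.get("broadtype", ""),
        "subtype": request.POST.get("subtype", ""),
        "belongcampus": request.POST.get("belongcampus", ""),
        "tradetype": request.POST.get("tradetype", ""),
        "purity": request.POST.get("purity", ""),
        "image": request.FILES.get("image"),
        "price": request.POST.get("price", "0"),
        "tradetitle": request.POST.get("tradetitle", ""),
        "tradedetail": request.POST.get("tradedetail", ""),
        "releasetime": datetime.datetime.now(),
    }
    product = dao.create_product_to_sell(product_params)
    return product  # None means creation failed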
def test_crawler(fiction_url):
    fiction_infor = get_book_infor('http://qidian.com', 'qidian', fiction_url)
    if not fiction_infor:
        return
    print fiction_infor
    # save the fiction information into the database
    # (hard-coded sample data for this test)
    fiction_title = '武炼巅峰'
    author_name = '莫默'
    ids = ['2494758']
    types = '1'
    try:
        web_site = FictionWebSite.objects.get(title='qidian')
    except:
        web_site = FictionWebSite(title='qidian', url='http://qidian.com')
        web_site.save()
    try:
        # a fiction with the same title may already be stored;
        # uniqueness is determined by title and author
        fic = Fiction.objects.get(fiction_title=fiction_title,
                                  author=author_name)
        # if it comes from the same site, do not process it again
        DG.trace("get it")
        print fic.source_site.title
        print web_site.title
        if fic.source_site.title == web_site.title:
            return
    except Exception as e:
        print e
        fic = Fiction(fiction_title=fiction_title,
                      fiction_avatar_url=fiction_infor['avatar'],
                      fiction_intro=fiction_infor['intro'],
                      fiction_id=ids[0],
                      fiction_style=types,
                      total_word=fiction_infor['total_word'],
                      stock_time=10,
                      com_word="",
                      source_site=web_site,
                      click_time=fiction_infor['click_time'],
                      rec_time=fiction_infor['rec_time'],
                      author=author_name,
                      author_url="")
        fic.save()
        fic.fiction_nid = create_nid(fic.id)
        fic.save()
    if isinstance(fiction_title, unicode):
        fiction_title = fiction_title.encode('utf-8')
    # if the fiction is newly added, build a search index for it
    t1 = threading.Thread(target=build_index_database,
                          args=(fiction_title, fic, '1'))
    t1.start()
    t1.join()
def register(request):
    if request.user.is_authenticated():
        return HttpResponseRedirect('/')
    if request.method == 'GET':
        return RTR('register.html', {'error_msg': False},
                   context_instance=RequestContext(request))
    # check the validation code against the one stored in the session
    validate_code = request.POST.get('validate', '')
    if not_legal(validate_code) or \
            validate_code.lower() != request.session['validate'].lower():
        return RTR('register.html', {'error_msg': True},
                   context_instance=RequestContext(request))
    email = request.POST.get('email', '')
    password = request.POST.get('password', '')
    name = request.POST.get('name', '')
    remember = request.POST.get('remember', '')
    try:
        try:
            user = Account.objects.get(name=name)
            # the name is already taken
            return HttpResponse("you ren le")
        except:
            pass
        # build the new user's shelf
        shelf = Shelf(fiction_number=0)
        shelf.save()
        user = Account(name=name,
                       password=hashlib.md5(password).hexdigest(),
                       is_active='0',
                       email=email,
                       shelf=shelf)
        user.save()
        user.nid = create_nid(user.id)
        user.save()
        user = authenticate(name=user.name, password=password)
        if user:
            login(request, user)
            response = HttpResponseRedirect('/' + user.nid)
            if remember == 'on':
                response.set_cookie('login', 'True', max_age=7 * 24 * 60 * 60)
            return response
        else:
            return HttpResponse('ok')
    except:
        # "server error, please try again later" with a link back
        return HttpResponse("服务器故障,请稍候再试<a href = '%s'>返回</a>"
                            % request.META['HTTP_REFERER'])
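# --- Illustrative sketch (not part of the original module) ----------------
# not_legal() is referenced by register() but not defined in this section.
# Presumably it rejects empty or malformed validation codes; one plausible
# shape, shown here purely as an assumption:
import re


def not_legal(value):
    # reject empty input or anything that is not 4-8 alphanumerics
    if not value:
        return True
    return re.match(r'^[A-Za-z0-9]{4,8}$', value) is None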
def create_user(self, user_params):
    """ create a user and save into db """
    try:
        user = Account(nick_name=user_params['nickname'],
                       real_name=user_params['realname'],
                       password=hashlib.md5(user_params['password']).hexdigest(),
                       email=user_params['email'],
                       qq=user_params['qq'],
                       person_intro=user_params['intro'],
                       phone=user_params['phone'])
        user.save()
        nid = create_nid(user.id)
        user.nid = nid
        user.save()
        return user
    except Exception as e:
        print e
        return None
def create_product_to_buy(self, product_params):
    """ create a product and save into db """
    try:
        product_to_buy = ProductToBuy(
            owner=product_params["owner"],
            product_name=product_params["productname"],
            broad_type=product_params["broadtype"],
            trade_type=product_params["tradetype"],
            sub_type=product_params["subtype"],
            release_time=product_params["releasetime"],
        )
        product_to_buy.save()
        nid = create_nid(product_to_buy.id)
        product_to_buy.nid = nid
        product_to_buy.save()
        return product_to_buy
    except Exception as e:
        print e
        return None
def crawler_types(types, web_site, url):
    # index stands for the page index
    pattern = ALL_PATTERN[web_site.title]
    for index in range(1, 100):
        new_url = build_url_page(url, web_site.title, index, types)
        html_page = urllib2.urlopen(new_url)
        html_content = html_page.read()
        html_content = gzip_content(html_content)
        # get the list block that contains all fictions on this page
        content = BeautifulSoup(html_content)
        out = content.findAll(pattern['all_content_tag'],
                              pattern['all_content_dict'])
        contents = ''.join([str(item) for item in out])
        chapter = BeautifulSoup(contents)
        fictions = chapter.findAll(pattern['all_fiction_tag'],
                                   pattern['all_fiction_dict'])
        for item in fictions:
            contents = str(item)
            try:
                # get fiction title
                fiction_title = re.findall(pattern['all_fiction_title'], contents)
                fiction_title = ''.join([str(_item) for _item in fiction_title])
                # get fiction url
                fiction_url = re.findall(pattern['all_fiction_url'], contents)
                fiction_url = ''.join([str(_item) for _item in fiction_url[0]])
                # get fiction type
                fiction_type = re.findall(pattern['all_fiction_type'], contents)
                fiction_type = ''.join([str(_item) for _item in fiction_type])
                # get author name
                author_name = re.findall(pattern['all_author_name'], contents)
                author_name = ''.join([str(_item) for _item in author_name])
                # get the fiction id on this site
                ids = re.findall(pattern['ids_pattern'], fiction_url)
                if not fiction_title or not fiction_url or not fiction_type or not author_name:
                    continue
            except:
                continue
            print '小说%s抓取开始' % fiction_title  # crawl of this fiction started
            try:
                # skip fictions whose url has already been collected
                hash_url = HashUrl.objects.get(urls=fiction_url)
                continue
            except:
                hash_url = HashUrl(urls=fiction_url)
                hash_url.save()
            fiction_infor = get_book_infor(web_site.url, web_site.title, fiction_url)
            if not fiction_infor:
                continue
            # save the fiction into the database
            fic = Fiction(fiction_title=fiction_title,
                          fiction_avatar_url=fiction_infor['avatar'],
                          fiction_intro=fiction_infor['intro'],
                          fiction_id=ids[0],
                          fiction_style=types,
                          total_word=fiction_infor['total_word'],
                          stock_time=10,
                          com_word="",
                          source_site=web_site,
                          click_time=fiction_infor['click_time'],
                          rec_time=fiction_infor['rec_time'],
                          author=author_name,
                          author_url="")
            fic.save()
            fic.fiction_nid = create_nid(fic.id)
            fic.save()
            print '小说 %s 基本信息存入数据库' % fiction_title  # basic info saved
            # save tags
            for tags in fiction_infor['tags']:
                try:
                    tag = Tag.objects.get(tag=tags)
                except:
                    tag = Tag(tag=tags)
                    tag.save()
                fic.tag.add(tag)
            fic.save()
            ms = MemberShip(fiction=fic, website=web_site)
            ms.save()
            threads = []
            print '获取小说 %s 章节' % fiction_title  # fetching chapters
            chapter_url = build_url_fiction(ids[0], web_site.title)
            #chapter_infor = chapter_func[web_site.title](chapter_url)
            fiction_intro = fiction_infor['intro']
            if isinstance(fiction_title, unicode):
                fiction_title = fiction_title.encode('utf-8')
            t1 = threading.Thread(target=build_index_database,
                                  args=(fiction_title, fic, '1'))
            t2 = threading.Thread(target=chapter_func[web_site.title],
                                  args=(chapter_url, fic, web_site))
            t1.start()
            t2.start()
            t1.join()
            t2.join()
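# --- Illustrative sketch (not part of the original module) ----------------
# gzip_content() is called above but not defined in this section.  The
# inline gzip handling in the run() method below suggests what it does;
# a minimal sketch under that assumption:
import gzip
import StringIO


def gzip_content(html_content):
    # decompress a gzip-encoded response body, or return it unchanged
    try:
        buf = StringIO.StringIO(html_content)
        return gzip.GzipFile(fileobj=buf).read()
    except IOError:
        return html_content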
def run(self):
    """thread method: crawl the newest-update list and store new fictions"""
    # fetch the page that lists the freshest updates
    _headers = {
        "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1) Gecko/20090624 Firefox/3.5",
        "Accept": "text/plain",
    }
    print self.newest_url
    request = urllib2.Request(self.newest_url, headers=_headers)
    html_page = urllib2.urlopen(request).read()
    try:
        # the response body may be gzip-compressed
        import gzip, StringIO
        data = StringIO.StringIO(html_page)
        gzipper = gzip.GzipFile(fileobj=data)
        html_page = gzipper.read()
    except:
        pass
    html_page = BeautifulSoup(str(html_page))
    content = html_page.findAll(self.content_tag, self.content_dict)
    contents = ''.join([str(item) for item in content])
    chapter_infor = BeautifulSoup(contents)
    content = chapter_infor.findAll(self.chapter_tag, self.chapter_dict)
    indexs = 1
    print content
    for item in content:
        indexs += 1
        contents = str(item)
        types = ''.join(re.findall(self.types_pattern, contents))
        title = ''.join(re.findall(self.title_pattern, contents))
        chapter = ''.join(re.findall(self.chapter_pattern, contents))
        author = ''.join(re.findall(self.author_pattern, contents))
        fiction_url = ''.join(re.findall(self.fiction_url_pattern, contents))
        chapter_url = ''.join(re.findall(self.chapter_url_pattern, contents))
        if not types or not title or \
                not chapter or not author or not fiction_url or not chapter_url:
            continue
        newest_chapter_url = chapter_url
        host = self.host
        print author
        # normalise relative urls against the host
        if self.host[len(self.host) - 1] == '/':
            host = self.host[:len(self.host) - 1]
        if chapter_url[0] == '/':
            chapter_url = host + chapter_url
        if fiction_url[0] == '/':
            fiction_url = host + fiction_url
        try:
            web_site = FictionWebSite.objects.get(url=self.host)
        except:
            web_site = FictionWebSite(title=self.thread_name, url=self.host)
            web_site.save()
        try:
            hash_url = HashUrl.objects.get(urls=fiction_url)
            is_exit = True
            fic = Fiction.objects.get(fiction_title=title, author=author)
        except:
            is_exit = False
            print 'here'
        if not is_exit:
            try:
                hash_url = HashUrl(urls=fiction_url)
                hash_url.save()
            except:
                continue
            # the fiction is new: fetch its book information
            book_infor = get_book_infor(self.host, self.thread_name,
                                        fiction_url, True)
            ids = re.findall(ALL_PATTERN[web_site.title]['ids_pattern'],
                             fiction_url)
            types = '4' if not STYLE[self.thread_name].has_key(book_infor['types']) \
                else STYLE[self.thread_name][book_infor['types']]
            try:
                fic = Fiction(fiction_title=title,
                              fiction_avatar_url=book_infor['avatar'],
                              fiction_intro=book_infor['intro'],
                              fiction_id=ids[0],
                              fiction_style=types,
                              total_word=book_infor['total_word'],
                              com_word="",
                              source_site=web_site,
                              click_time=book_infor['click_time'],
                              rec_time=book_infor['rec_time'],
                              author=author,
                              stock_time=0,
                              author_url="")
                fic.save()
                fic.fiction_nid = create_nid(fic.id)
                fic.save()
                member = MemberShip(fiction=fic, website=web_site,
                                    fiction_url=fiction_url)
                member.save()
                del member
            except Exception as e:
                print 'Ever'
                continue
            # build the search index, keyed by the fiction title only
            for item in mmseg.Algorithm(title):
                try:
                    index = Index.objects.get(key=item.text)
                except:
                    index = Index(key=item.text)
                    index.save()
                IndexFictionRelationship.objects.create(
                    key=index,
                    fiction=fic,
                    position=','.join([str(item.start), str(item.end)]),
                    bit='2',  # chapter
                )
            # get all chapters
            if book_infor.has_key('read_url'):
                chapter_url = book_infor['read_url']
            else:
                chapter_url = build_url_fiction(ids[0], web_site.title)
            get_chapters_thread = threading.Thread(
                target=chapter_func[web_site.title],
                args=(chapter_url, fic, web_site, '1'))
            get_chapters_thread.start()
            get_chapters_thread.join()
        else:
            # the fiction has been inserted into the database before
            print "yes it is!"
            # get the max index of its chapters
            try:
                chapter_index = ChapterIndex.objects.get(
                    fiction=fic.id, web_site=web_site.title)
            except Exception as e1:
                try:
                    chapter_index = ChapterIndex.objects.filter(
                        fiction=fic.id, web_site=web_site.title)[0]
                except Exception as e:
                    print e
                    print e1
                    continue
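# --- Design note (not part of the original module) ------------------------
# The manual '/'-stripping and host-prefixing in run() could also be done
# with the standard library: urlparse.urljoin handles both absolute urls and
# paths that start with '/'.  The values below are illustrative only.
import urlparse

host = 'http://example.com/'
chapter_url = '/book/123/ch1.html'
full_chapter_url = urlparse.urljoin(host, chapter_url)
# -> 'http://example.com/book/123/ch1.html'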
def crawler_types(types, web_site, url):
    # index stands for the page index
    pattern = ALL_PATTERN[web_site.title]
    for index in range(1, 100):
        new_url = build_url_page(url, web_site.title, index, types)
        html_page = urllib2.urlopen(new_url)
        html_content = html_page.read()
        html_content = gzip_content(html_content)
        # get the list block that contains all fictions on this page
        content = BeautifulSoup(html_content)
        out = content.findAll(pattern['all_content_tag'],
                              pattern['all_content_dict'])
        if not out:
            break
        contents = ''.join([str(item) for item in out])
        chapter = BeautifulSoup(contents)
        fictions = chapter.findAll(pattern['all_fiction_tag'],
                                   pattern['all_fiction_dict'])
        for item in fictions:
            contents = str(item)
            try:
                # get fiction title
                fiction_title = re.findall(pattern['all_fiction_title'], contents)
                fiction_title = ''.join([str(_item) for _item in fiction_title])
                # get fiction url
                fiction_url = re.findall(pattern['all_fiction_url'], contents)
                fiction_url = ''.join([str(_item) for _item in fiction_url[0]])
                # get fiction type
                fiction_type = re.findall(pattern['all_fiction_type'], contents)
                fiction_type = ''.join([str(_item) for _item in fiction_type])
                # get author name
                author_name = re.findall(pattern['all_author_name'], contents)
                author_name = ''.join([str(_item) for _item in author_name])
                # get the fiction id on this site
                ids = re.findall(pattern['ids_pattern'], fiction_url)
                if not fiction_title or not fiction_url or not author_name:
                    continue
            except Exception as e:
                continue
            try:
                # skip fictions whose url has already been collected
                hash_url = HashUrl.objects.get(urls=fiction_url)
                continue
            except:
                hash_url = HashUrl(urls=fiction_url)
                hash_url.save()
            fiction_infor = get_book_infor(web_site.url, web_site.title, fiction_url)
            if not fiction_infor:
                continue
            # save the fiction into the database
            try:
                # a fiction with the same title may already be stored;
                # uniqueness is determined by title and author
                fic = Fiction.objects.get(fiction_title=fiction_title,
                                          author=author_name)
                # if it comes from the same site, do not process it again
                DG.trace("get it")
                if fic.source_site.title == web_site.title:
                    continue
            except:
                fic = Fiction(fiction_title=fiction_title,
                              fiction_avatar_url=fiction_infor['avatar'],
                              fiction_intro=fiction_infor['intro'],
                              fiction_id=ids[0],
                              fiction_style=types,
                              total_word=fiction_infor['total_word'],
                              stock_time=10,
                              com_word="",
                              source_site=web_site,
                              click_time=fiction_infor['click_time'],
                              rec_time=fiction_infor['rec_time'],
                              author=author_name,
                              author_url="")
                fic.save()
                fic.fiction_nid = create_nid(fic.id)
                fic.save()
                if isinstance(fiction_title, unicode):
                    fiction_title = fiction_title.encode('utf-8')
                # if the fiction is newly added, build a search index for it
                #t1 = threading.Thread(target=build_index_database,
                #                      args=(fiction_title, fic, '1'))
                #t1.start()
                #t1.join()
            # save tags
            for tags in fiction_infor['tags']:
                try:
                    tag = Tag.objects.get(tag=tags)
                except:
                    tag = Tag(tag=tags)
                    tag.save()
                fic.tag.add(tag)
            fic.save()
            # create the membership linking the fiction and the site
            ms = MemberShip(fiction=fic, website=web_site,
                            fiction_url=fiction_url)
            ms.save()
            # fetch all chapters of this fiction
            chapter_url = build_url_fiction(ids[0], web_site.title)
            t2 = threading.Thread(target=chapter_func[web_site.title],
                                  args=(chapter_url, fic, web_site))
            t2.start()
            t2.join()
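# --- Design note (not part of the original module) ------------------------
# The lookup-then-create try/except pattern used for Tag (and elsewhere for
# FictionWebSite and HashUrl) can be written more compactly with Django's
# get_or_create.  This sketch only rewrites the tag loop; fic and
# fiction_infor refer to the variables in crawler_types above.
def save_tags(fic, fiction_infor):
    for tags in fiction_infor['tags']:
        # returns the existing Tag or creates and saves a new one
        tag, created = Tag.objects.get_or_create(tag=tags)
        fic.tag.add(tag)
    fic.save()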