def get(self):
    """Pick a batch of un-skinned news entries sharing one RSS feed and
    hand them to the skinning routine.

    Strategy (translated from the original author's note): sites that
    publish news RSS tend to be large sites with clean, optimized HTML,
    so extracting the text between <p></p> tags is considered a workable
    first-pass "skinning" step.
    """
    newsHtmlList = Contents.all().filter("status =", "2").fetch(1)
    if not newsHtmlList:
        return
    try:
        # Probe: entries whose RSS reference is broken cannot be skinned.
        newsHtmlList[0].rss
    except Exception:
        # Demote the broken entry back to status "1" instead of deleting
        # it outright, and persist the change (the original forgot .put(),
        # so the demotion was silently lost — cf. the cleanup handler
        # which does call put() after the same status change).
        newsHtmlList[0].status = "1"
        newsHtmlList[0].put()
        # db.delete(newsHtmlList)
        logging.error("delete one news ,has no rss")
        return
    # Up to 20 pending entries from the same feed ...
    feed = newsHtmlList[0].rss
    newsHtmlList = (
        Contents.all()
        .filter("rss =", feed)
        .filter("status =", "2")
        .fetch(20)
    )
    # ... plus up to 10 already-processed ones from that feed, cached
    # per feed for three days to spare the datastore.
    cache_key = "oldhtmllist" + str(feed.key().id())
    oldHtmlList = memcache.get(cache_key)
    if not oldHtmlList:
        oldHtmlList = (
            Contents.all()
            .filter("rss =", feed)
            .filter("status >", "2")
            .fetch(10)
        )
        try:
            memcache.set(cache_key, oldHtmlList, 3600 * 24 * 3)
        except Exception:
            # Best-effort cache write; a memcache failure is not fatal.
            pass
    skinSubTwo(oldHtmlList, newsHtmlList)
def get(self):
    """Render the look page for a specific content item, or for the next
    unreviewed item (optionally scoped to one RSS feed), else go home."""
    cid = self.request.get("content")
    if cid:
        # Direct view of one item by id.
        content = Contents.get_by_id(int(cid))
        self.render("templates/look.html", {"content": content, "view": True})
        return
    rssid = self.request.get("rss")
    candidates = None
    if rssid:
        # First try an unreviewed item from the requested feed.
        candidates = (
            Contents.all()
            .filter("realContentResult =", 0)
            .filter("rss =", RSS.get_by_id(int(rssid)))
            .fetch(1)
        )
    if not candidates:
        # Fall back to any unreviewed item.
        candidates = Contents.all().filter("realContentResult =", 0).fetch(1)
    if candidates:
        self.render("templates/look.html", {"content": candidates[0]})
    else:
        self.redirect("/")
def parse(self):
    """Build and return a Contents object populated from the .xcloc
    bundle at self.xcloc_path."""
    result = Contents(file=str(self.xcloc_path))
    # Run each extraction pass over the same accumulator.
    for step in (
        self.__parse_contents_json,
        self.__parse_xliff,
        self.__parse_screenshots,
    ):
        step(result)
    return result
def write_file_to_database(self, m_group_id, m_group_id2, m_title, m_path, m_url, m_level): print 'write_to_database' '''doc = Documents.objects.filter(url = m_url) if (len(doc) == 0): doc = Documents(url=m_url, uid=self.uid, title=m_title, level = m_level) doc.save() print 'create\n' else: doc[0].uid = self.uid doc[0].title = m_title # doc.update() doc[0].save() doc = doc[0] print 'update\n''' # doc = Documents(title='title', url='url') # doc.save() # print 'docid is: %s' % str(doc) # Delete old records res = ResAddr() id = res.saveResaddr(m_url, m_level, '', m_title) doc = res.getById(id) oldContents = Contents.objects.filter(doc=doc) oldContents.delete() try: print '\n\n\nstart write url:', m_url reader = ReadBigFile(m_path, blockSize=1024 * 16) block = reader.getNextBlock() while (block != None): # print 'm_content' + para con = Contents(group_id=doc.id, group_id2=m_group_id2, doc=doc, paragraph=unicode(block), tag='p') con.save() block = reader.getNextBlock() os.remove(m_path) return doc.id except Exception as e: print e print str(e) exit() return None
def get(self):
    """Render the detail-look page listing one feed's reviewed contents
    (verdict > 0 and not None)."""
    feed = RSS.get_by_id(int(self.request.get("rss")))
    query = Contents.all()
    query.filter("rss =", feed)
    query.filter("realContentResult >", 0)
    query.filter("realContentResult !=", None)
    # The (lazy) query itself is handed to the template for iteration.
    self.render("templates/detailLook.html", {"content": query})
def parse(self):
    """Build and return a Contents object populated from the workbook
    at self.file_path."""
    result = Contents(file=str(self.file_path))
    workbook = openpyxl.load_workbook(self.file_path)
    # Metadata first, then every translation sheet.
    self.__parse_metadata_sheet(result, workbook)
    self.__parse_translation_sheets(result, workbook)
    return result
def get(self):
    """Cron cleanup: delete fetched entries that ended up empty and
    entries flagged for deletion, then log the counts removed."""
    # Fetched ("1") entries that produced no content.
    nocontent = Contents.all().filter("status =", "1").filter("hasContent =", False).fetch(10)
    db.delete(nocontent)
    # Fetched entries explicitly flagged for deletion; note the offset of
    # 30 into the result set, kept from the original implementation.
    deletecontent = Contents.all().filter("status =", "1").filter("hasDelete =", True).fetch(10, 30)
    db.delete(deletecontent)
    logging.info("nocontent:" + str(len(nocontent)) + "-" + "deletecontent:" + str(len(deletecontent)))
def post(self):
    """Store a manual review verdict for the content matching 'link',
    then bounce back to the look page for the current feed."""
    link = self.request.get("link")
    matches = Contents.all().filter("link =", link).fetch(1)
    if not matches:
        # Unknown link: nothing to update, go home.
        self.redirect("/")
        return
    content = matches[0]
    content.realContentResult = int(self.request.get("realContentResult"))
    content.realContentBz = self.request.get("realContentBz")
    content.put()
    self.redirect("/look?rss=%s" % self.request.get("rss"))
def write_file_to_database(self,m_group_id, m_group_id2, m_title, m_path, m_url, m_level): print 'write_to_database' '''doc = Documents.objects.filter(url = m_url) if (len(doc) == 0): doc = Documents(url=m_url, uid=self.uid, title=m_title, level = m_level) doc.save() print 'create\n' else: doc[0].uid = self.uid doc[0].title = m_title # doc.update() doc[0].save() doc = doc[0] print 'update\n''' # doc = Documents(title='title', url='url') # doc.save() # print 'docid is: %s' % str(doc) # Delete old records res = ResAddr() id = res.saveResaddr(m_url, m_level, '', m_title) doc = res.getById(id) oldContents = Contents.objects.filter(doc = doc) oldContents.delete() try: print '\n\n\nstart write url:', m_url reader = ReadBigFile(m_path, blockSize=1024*16) block = reader.getNextBlock() while(block!=None): # print 'm_content' + para con = Contents(group_id=doc.id, group_id2=m_group_id2, doc=doc, paragraph=unicode(block), tag='p') con.save() block = reader.getNextBlock() os.remove(m_path) return doc.id except Exception as e: print e print str(e) exit() return None
def write_to_database(self,m_group_id, m_group_id2, m_title, m_content, m_url, m_level): print 'write_to_database' '''doc = Documents.objects.filter(url = m_url) if (len(doc) == 0): doc = Documents(url=m_url, uid=self.uid, title=m_title, level = m_level) doc.save() print 'create\n' else: doc[0].uid = self.uid doc[0].title = m_title # doc.update() doc[0].save() doc = doc[0] print 'update\n''' # doc = Documents(title='title', url='url') # doc.save() # print 'docid is: %s' % str(doc) # Delete old records res = ResAddr() id = res.saveResaddr(m_url, m_level, '', m_title) doc = res.getById(id) oldContents = Contents.objects.filter(doc = doc) oldContents.delete() import types if type(m_content) is types.ListType: for para in m_content: # print 'm_content' + para con = Contents(group_id=doc.id, group_id2=m_group_id2, doc=doc, paragraph=para, tag='p') con.save() else: con = Contents(group_id=doc.id, group_id2=m_group_id2, doc=doc, paragraph=m_content, tag='p') con.save() return doc.id
def get(self):
    """Render per-feed counts of each review verdict.

    For every RSS feed, attributes r0..r5 hold the number of contents
    with realContentResult 0..5, and r6 holds the count of entries never
    reviewed (realContentResult is None). The seven copy-pasted queries
    of the original are collapsed into one loop.
    """
    rsslist = []
    for r in RSS.all():
        for i, verdict in enumerate([0, 1, 2, 3, 4, 5, None]):
            count = (
                Contents.all()
                .filter("rss =", r)
                .filter("realContentResult =", verdict)
                .count()
            )
            setattr(r, "r%d" % i, count)
        rsslist.append(r)
    self.render("templates/analysis.html", {"RSSs": rsslist})
def write_to_database(self, m_group_id, m_group_id2, m_title, m_content, m_url, m_level): print 'write_to_database' '''doc = Documents.objects.filter(url = m_url) if (len(doc) == 0): doc = Documents(url=m_url, uid=self.uid, title=m_title, level = m_level) doc.save() print 'create\n' else: doc[0].uid = self.uid doc[0].title = m_title # doc.update() doc[0].save() doc = doc[0] print 'update\n''' # doc = Documents(title='title', url='url') # doc.save() # print 'docid is: %s' % str(doc) # Delete old records res = ResAddr() id = res.saveResaddr(m_url, m_level, '', m_title) doc = res.getById(id) oldContents = Contents.objects.filter(doc=doc) oldContents.delete() import types if type(m_content) is types.ListType: for para in m_content: # print 'm_content' + para con = Contents(group_id=doc.id, group_id2=m_group_id2, doc=doc, paragraph=para, tag='p') con.save() else: con = Contents(group_id=doc.id, group_id2=m_group_id2, doc=doc, paragraph=m_content, tag='p') con.save() return doc.id
def get(self, limit):
    """Collect up to `limit` fetched entries as (content, link) pairs on
    self.urls, then kick off RSS discovery over them."""
    pending = Contents.all().filter("status =", "1").fetch(int(limit))
    self.urls = [(entry, entry.link) for entry in pending]
    self.searchRSS()