def if_exist(self, title, news_from, id=None, url=""):
    # Make sure the title is unicode before querying.
    try:
        if not isinstance(title, unicode):
            title = title.decode('utf-8')
    except:
        print traceback.format_exc()
    hoy = date.today()
    # Look up today's entry for this url/source combination.
    edit = News3.query(
        News3.url == url,
        News3.news_from == news_from,
        News3.date == hoy,
    ).get()
    if edit:
        edit.id = int(id)
        edit.put()
        print "Exists, updating"
        return True
    else:
        print "Does not exist"
        return False
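# Usage sketch for if_exist (illustrative, not taken verbatim from this
# repo): a scraper would check for today's copy of an article before
# saving it, mirroring the saveme() call used in the uy_press handler:
#
#   s = SaveNews()
#   if not s.if_exist(title, "uy_press", id=c, url=url):
#       s.saveme(title=title, subtitle="", image="", url=url,
#                category=category, keyword=[], news_from="uy_press",
#                id=c, html=text)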
def get(self, idd):
    try:
        print idd
        print "uy press"
        hoy = date.today()
        # Fetch the single news item by its datastore key.
        try:
            data = News3.query(
                News3.key == ndb.Key(News3, int(idd)),
            )
        except:
            print traceback.format_exc()
        dia = hoy.strftime('%d')
        mes = hoy.strftime('%m')
        year = hoy.strftime('%Y')
        week_day = datetime.today().weekday()
        days_week = [
            "Lunes", "Martes", "Miércoles", "Jueves",
            "Viernes", "Sábado", "Domingo",
        ]
        week_day = days_week[week_day]
        # date.today() carries no time component, so subtracting hours from
        # it and formatting "%H:%M:%S" always yielded "00:00:00"; use the
        # current datetime shifted to UY time (UTC-3) instead.
        hora = datetime.now() - timedelta(hours=3)
        hora = hora.strftime("%H:%M:%S")
        try:
            clima = Yahoo()
            cli = clima.get()
        except:
            cli = ""
        template_values = {
            'data': data,
            'clima': cli,
            'dia': dia,
            'mes': mes,
            'year': year,
            'week_day': week_day,
            'hora': hora,
        }
        template = JINJA_ENVIRONMENT.get_template('/templates/mono4.html')
        self.response.write(template.render(template_values))
    except:
        error = traceback.format_exc()
        print error
        template_values = {
            'error': error,
        }
        template = JINJA_ENVIRONMENT.get_template('/templates/mono5.html')
        self.response.write(template.render(template_values))
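# Note: a single entity can also be fetched directly by key instead of a
# query; this is standard ndb and equivalent for this lookup:
#
#   item = ndb.Key(News3, int(idd)).get()   # returns the entity or None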
def similitude(self, news_from):
    # Compare today's news against items from other sources.
    try:
        hoy = date.today()
        edit = News3.query(
            News3.news_from != news_from,
            News3.date == hoy,
        ).fetch()
        for i in edit:
            print i.keyword
        sys.exit()  # debug: stop the request after dumping keywords
    except:
        print traceback.format_exc()
        return traceback.format_exc()
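# A minimal similarity sketch (assumption: News3.keyword holds a list of
# strings, and jaccard() is a hypothetical helper, not part of this repo).
# Jaccard overlap between two keyword lists could rank near-duplicate
# stories from different sources:
#
#   def jaccard(a, b):
#       sa, sb = set(a), set(b)
#       if not sa or not sb:
#           return 0.0
#       return float(len(sa & sb)) / len(sa | sb)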
def get(self):
    hoy = date.today()
    # Delete today's "ciento_ochenta" entries.
    f = News3.query(
        News3.date == hoy,
        News3.news_from.IN([
            "ciento_ochenta",
        ]),
    ).fetch()
    for i in f:
        i.key.delete()
    template_values = {
        'data': f,
        'fecha': hoy,
    }
    template = JINJA_ENVIRONMENT.get_template('/templates/index.html')
    self.response.write(template.render(template_values))
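# Deleting one key at a time issues an RPC per entity; ndb also supports
# batch deletes, which would be equivalent here in a single call:
#
#   ndb.delete_multi([i.key for i in f])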
def get(self):
    try:
        print " "
        print "TestClassifier start"
        print " "
        # Load the stop words from the file into a list.
        with open("stop_words.txt", "r") as ins:
            array = []
            for line in ins:
                array.append((line.rstrip('\n')).decode('unicode-escape'))
        newsTrainer = Trainer(
            tokenizer.Tokenizer(stop_words=array, signs_to_remove=["?!#%&_"]))
        hoy = date.today()
        query = News3.query(News3.date == hoy,
                            News3.news_from.IN([
                                "uy_press",
                            ]),
                            News3.category == "Política")
        # Train the system by passing each text to the trainer module, one by one.
        #newsSet = [
        #    {'text': 'not to eat too much is not enough to lose weight', 'category': 'health'},
        #    {'text': 'Russia try to invade Ukraine', 'category': 'politics'},
        #    {'text': 'do not neglect exercise', 'category': 'health'},
        #    {'text': 'Syria is the main issue, Obama says', 'category': 'politics'},
        #    {'text': 'eat to lose weight', 'category': 'health'},
        #    {'text': 'you should not eat much', 'category': 'health'}
        #]
        query2 = News3.query(News3.date == hoy,
                             News3.news_from == "uy_press",
                             News3.category == "deportes")
        query4 = News3.query(News3.date == hoy,
                             News3.news_from == "uy_press",
                             News3.category == "salud")
        c = 0
        for i in query:
            print " "
            print i.category
            newsTrainer.train(i.html, 'politica')
            c += 1
        #for i in query2:
        #    newsTrainer.train(i.html, 'deportes')
        #for i in query4:
        #    newsTrainer.train(i.html, 'salud')
        # With enough trained data, the classifier can try to label news
        # text whose category is not yet known.
        query3 = News3.query(
            News3.date == hoy,
            News3.news_from.IN([
                "el_pais",
            ]),
            News3.id.IN([0]),
        )
        newsClassifier = Classifier(
            newsTrainer.data,
            tokenizer.Tokenizer(stop_words=array, signs_to_remove=["?!#%&"]))
        classification = newsClassifier.classify(
            "Vidalín: No quiero que me llamen para saber qué tramite hay "
            "que hacer para poner un prostíbulo")
        # classification holds the detected categories, sorted.
        print " classification "
        print(classification)
    except:
        print traceback.format_exc()
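# Sketch: with only one trained category the classifier can answer nothing
# but 'politica'; re-enabling the commented query2/query4 loops above would
# make the output meaningful (assumption: today's 'deportes' and 'salud'
# entities exist in the datastore):
#
#   for i in query2:
#       newsTrainer.train(i.html, 'deportes')
#   for i in query4:
#       newsTrainer.train(i.html, 'salud')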
def get(self):
    try:
        page = "http://www.uypress.net/acategoria.aspx?57"
        sock = urllib2.urlopen(page)
        htmlSource = sock.read()
        sock.close()
        soup = BeautifulSoup(htmlSource, "html.parser")
        # Front page only.
        portada = soup.find_all("div", attrs={"id": "principalUC"})
        portada = portada[0].find_all("div")
        l = []
        c = 0
        for i in portada:
            try:
                print " "
                print c
                if c == 0:
                    c += 1
                    continue
                url = i.find_all("a")[0]["href"]
                title = (i.find_all("a")[0]).string
                print [title]
                category = "deportes"
                img = ""
                # Skip items already stored for this source.
                try:
                    edit = News3.query(
                        News3.url == url,
                        News3.news_from == "uy_press_deportes",
                    ).get()
                    if edit:
                        print "Exists"
                        c = c + 1
                        continue
                    else:
                        print "Does not exist"
                except:
                    print traceback.format_exc()
                # Extract the article body as plain text.
                try:
                    print "pre mk"
                    text = HtmlToTextMain()
                    text = text.main(url)
                    print ""
                    print [text]
                    print "post mk"
                except:
                    print traceback.format_exc()
                    continue
                try:
                    print "PRE SAVE"
                    s = SaveNews()
                    sav = s.saveme(
                        title=title,
                        subtitle="",
                        image="",
                        url=url,
                        category=category,
                        keyword=[],
                        news_from="uy_press_deportes",
                        id=c,
                        html=text,
                    )
                    print "SAVE"
                    print sav
                except:
                    print traceback.format_exc()
                    continue
                # Be polite to the server between requests.
                print "sleep"
                time.sleep(2)
                print "wake up"
                c += 1
                print c
            except:
                print traceback.format_exc()
                return traceback.format_exc()
    except:
        print traceback.format_exc()
        return traceback.format_exc()
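# Hardening sketch (assumption: no retry helper exists in this repo).
# urllib2.urlopen accepts a timeout argument, which prevents the scrape
# from hanging indefinitely if the site stalls:
#
#   sock = urllib2.urlopen(page, timeout=30)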