def get(self):
    try:
        print "  "
        print "TestClassifier start"
        print "  "
        # load the stop words from the file into a list
        with open("stop_words.txt", "r") as ins:
            array = []
            for line in ins:
                array.append((line.rstrip('\n')).decode('unicode-escape'))
        #print array
        newsTrainer = Trainer(
            tokenizer.Tokenizer(stop_words=array,
                                signs_to_remove=["?!#%&_"]))

        hoy = date.today()

        query = News3.query(News3.date == hoy,
                            News3.news_from.IN([
                                "uy_press",
                            ]), News3.category == "Política")

        query2 = News3.query(News3.date == hoy,
                             News3.news_from == "uy_press",
                             News3.category == "deportes")

        query4 = News3.query(News3.date == hoy,
                             News3.news_from == "uy_press",
                             News3.category == "salud")

        #for news in newsSet:
        #    newsTrainer.train(news['text'], news['category'])
        c = 0
        #print query
        for i in query:
            print "  "
            print i.category
            newsTrainer.train(i.html, 'politica')
            #if c == 10: break
            c += 1

        query3 = News3.query(
            News3.date == hoy,
            News3.news_from.IN([
                "el_pais",
            ]),
            News3.id.IN([0]),
        )

        ###
        newsClassifier = Classifier(
            newsTrainer.data,
            tokenizer.Tokenizer(stop_words=array,
                                signs_to_remove=["?!#%&"]))
        #print unknownInstance
        classification = newsClassifier.classify(
            "Vidalín: No quiero que me llamen para saber qué tramite hay que hacer para poner un prostíbulo"
        )

        # the classification variable holds the detected categories, sorted
        print " classification "
        print(classification)
    except:
        print traceback.format_exc()
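
A minimal, self-contained sketch of the same train/classify flow, for reference. The import paths assume the Trainer, Classifier and tokenizer used above come from the naiveBayesClassifier package (the snippet does not show its imports), and the training texts are placeholders rather than real News3 data.

# Sketch only: imports assumed to be the naiveBayesClassifier package; texts are placeholders.
from naiveBayesClassifier import tokenizer
from naiveBayesClassifier.trainer import Trainer
from naiveBayesClassifier.classifier import Classifier

stop_words = ["de", "la", "que", "el"]  # placeholder stop word list
trainer = Trainer(tokenizer.Tokenizer(stop_words=stop_words,
                                      signs_to_remove=["?!#%&_"]))

# In the handler above each trained text is a News3.html field; these are stand-ins.
trainer.train("el gobierno anuncia nuevas medidas economicas", "politica")
trainer.train("el equipo gano el partido de ayer", "deportes")

classifier = Classifier(trainer.data,
                        tokenizer.Tokenizer(stop_words=stop_words,
                                            signs_to_remove=["?!#%&"]))
# Per the snippet's own comment, classify returns the detected categories, sorted.
print classifier.classify("nuevas medidas del gobierno")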
Example #2
    def if_exist(self, title, news_from, id=None, url=""):
        # Return True (and refresh the stored id) if a news item with this
        # url/news_from was already saved today, False otherwise.
        try:
            if isinstance(title, unicode): pass
            else: title = title.decode('utf-8')
        except:
            print traceback.format_exc()
        hoy = date.today()
        #edit = (ndb.Key("News", date==hoy)).urlsafe()  # get the key
        #print edit
        #e = News.query(ndb.GenericProperty("__key__ =") == edit)  # filter by the key
        #print e
        #sys.exit()
        edit = News3.query(
            News3.url == url,
            News3.news_from == news_from,
            News3.date == hoy,
        ).get()
        #edit = edit.get()
        #print edit.date
        #print [edit.title]
        #print edit.news_from
        #print edit
        #f = News.query(News.news_from == news_from,News.date == hoy,).get()
        #edit = f.filter(ndb.GenericProperty('news_from') == news_from)
        #print f
        #sys.exit()
        if edit:
            edit.id = int(id)
            edit.put()
            print "Exist and update"
            return True
        else:
            print "No exist"
            return False
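
A hedged sketch of how if_exist might be called before saving a scraped item; SaveNews, if_exist and the saveme arguments are taken from these snippets, while the concrete title/url values below are placeholders.

# Hypothetical call site; the title/url values are placeholders, not real data.
s = SaveNews()
title = u"Algún título de ejemplo"
url = "http://www.uypress.net/alguna-nota.aspx"

if not s.if_exist(title, "uy_press", id=1, url=url):
    s.saveme(title=title, subtitle="", image="", url=url,
             category="politica", keyword=[], news_from="uy_press",
             id=1, html="texto de la nota")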
    def get(self, idd):
        try:
            #c = UyPress()
            #idd = self.request.get('idd')
            print idd
            #c = c.get()
            print "uy press"
            hoy = date.today()
            try:
                data = News3.query(
                    #News3.date == hoy,
                    News3.key == ndb.Key(News3, int(idd)), )
            except:
                print traceback.format_exc()

            dia = hoy.strftime('%d')
            mes = hoy.strftime('%m')
            year = hoy.strftime('%Y')
            week_day = datetime.today().weekday()
            days_week = [
                "Lunes", "Martes", "Miércoles", "Jueves", "Viernes",
                "Sábado", "Domingo"
            ]
            week_day = days_week[week_day]
            # hoy is a date with no time component, so the original "hoy - timedelta(hours=3)"
            # always rendered as 00:00:00; use a datetime for the local (UTC-3) time instead.
            hora = datetime.now() - timedelta(hours=3)
            hora = hora.strftime("%H:%M:%S")

            try:
                clima = Yahoo()
                cli = clima.get()
            except:
                cli = ""

            template_values = {
                'data': data,
                'clima': cli,
                'dia': dia,
                'mes': mes,
                'year': year,
                'week_day': week_day,
                'hora': hora,
            }

            template = JINJA_ENVIRONMENT.get_template('/templates/mono4.html')
            self.response.write(template.render(template_values))
        except:
            error = traceback.format_exc()
            print error
            template_values = {
                'error': error,
            }
            template = JINJA_ENVIRONMENT.get_template('/templates/mono5.html')
            self.response.write(template.render(template_values))
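
As a side note, a sketch of a more direct ndb lookup for the same record; it assumes, as the query above implies, that News3 entities are keyed by the integer idd.

# Sketch: fetch the entity by key instead of building a query around News3.key.
# Assumes News3 is keyed by the integer id, as "News3.key == ndb.Key(News3, int(idd))" implies.
news = ndb.Key(News3, int(idd)).get()
if news is None:
    print "No News3 entity with id %s" % idd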
Example #4
    def similitude(self, news_from):
        try:
            hoy = date.today()
            edit = News3.query(
                #News.title == title,
                News3.news_from != news_from,
                News3.date == hoy,
            ).fetch()

            for i in edit:
                print i.keyword

            sys.exit()
        except:
            print traceback.format_exc()
            return traceback.format_exc()
Example #5
    def get(self):
        hoy = date.today()
        f = News3.query(
            News3.date == hoy,
            News3.news_from.IN([
                "ciento_ochenta",
            ])
        ).fetch()
        for i in f:
            i.key.delete()

        template_values = {
            'data': f,
            'fecha': hoy,
            #'data3': data3,
        }
        template = JINJA_ENVIRONMENT.get_template('/templates/index.html')
        self.response.write(template.render(template_values))
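
For larger result sets, a hedged alternative sketch: a keys-only query plus ndb.delete_multi deletes the same rows without loading full entities first (note that the template would then no longer receive the deleted items).

# Sketch: batch-delete today's "ciento_ochenta" items.
hoy = date.today()
keys = News3.query(
    News3.date == hoy,
    News3.news_from.IN(["ciento_ochenta"]),
).fetch(keys_only=True)
ndb.delete_multi(keys)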
Example #6
    def get(self):
        try:
            print "  "
            print "TestClassifier start"
            print "  "
            # load the stop words from the file into a list
            with open("stop_words.txt", "r") as ins:
                array = []
                for line in ins:
                    array.append((line.rstrip('\n')).decode('unicode-escape'))
            #print array
            newsTrainer = Trainer(
                tokenizer.Tokenizer(stop_words=array,
                                    signs_to_remove=["?!#%&_"]))

            hoy = date.today()

            query = News3.query(News3.date == hoy,
                                News3.news_from.IN([
                                    "uy_press",
                                ]), News3.category == "Política")

            # You need to train the system by passing each text, one by one, to the trainer module.
            #newsSet =[
            #    {'text': 'not to eat too much is not enough to lose weight', 'category': 'health'},
            #    {'text': 'Russia try to invade Ukraine', 'category': 'politics'},
            #    {'text': 'do not neglect exercise', 'category': 'health'},
            #    {'text': 'Syria is the main issue, Obama says', 'category': 'politics'},
            #    {'text': 'eat to lose weight', 'category': 'health'},
            #    {'text': 'you should not eat much', 'category': 'health'}
            #]

            query2 = News3.query(News3.date == hoy,
                                 News3.news_from == "uy_press",
                                 News3.category == "deportes")

            query4 = News3.query(News3.date == hoy,
                                 News3.news_from == "uy_press",
                                 News3.category == "salud")

            #for news in newsSet:
            #    newsTrainer.train(news['text'], news['category'])
            c = 0
            #print query
            for i in query:
                print "  "
                print i.category
                newsTrainer.train(i.html, 'politica')
                #if c == 10: break
                c += 1

            #for i in query2:
            #	newsTrainer.train(i.html, 'deportes')
            #raise Exception('I know Python!')

            #for i in query4:
            #	newsTrainer.train(i.html, 'salud')

            # Once you have enough training data, you can start using a classifier.

            # Now you have a classifier that can try to classify news text whose
            # category is still unknown.
            query3 = News3.query(
                News3.date == hoy,
                News3.news_from.IN([
                    "el_pais",
                ]),
                News3.id.IN([0]),
            )

            ###
            newsClassifier = Classifier(
                newsTrainer.data,
                tokenizer.Tokenizer(stop_words=array,
                                    signs_to_remove=["?!#%&"]))
            #print unknownInstance
            classification = newsClassifier.classify(
                "Vidalín: No quiero que me llamen para saber qué tramite hay que hacer para poner un prostíbulo"
            )

            # the classification variable holds the detected categories, sorted
            print " classification "
            print(classification)
        except:
            print traceback.format_exc()
    def get(self):
        try:
            page = "http://www.uypress.net/acategoria.aspx?57"
            sock = urllib2.urlopen(page)
            htmlSource = sock.read()
            sock.close()
            soup = BeautifulSoup(htmlSource, "html.parser")
            # front page only
            portada = soup.find_all("div", attrs={"id": "principalUC"})
            portada = portada[0].find_all("div")
            l = []
            c = 0
            for i in portada:
                try:
                    print " "
                    print c
                    if (c == 0):
                        c += 1
                        continue

                    url = i.find_all("a")[0]["href"]
                    title = (i.find_all("a")[0]).string
                    print [title]
                    category = "deportes"
                    img = ""
                    try:
                        #e = SaveNews()
                        #e = e.if_exist(title, "uy_press_politica", c, url)
                        edit = News3.query(
                            News3.url == url,
                            News3.news_from == "uy_press_deportes",
                        ).get()
                        if edit:
                            print "Exists"
                            c = c + 1
                            #print i
                            continue
                        else:
                            print "Does not exist"

                    except:
                        print traceback.format_exc()

                    try:
                        print "pre mk"
                        text = HtmlToTextMain()
                        text = text.main(url)
                        #print text
                        print ""
                        print [text]
                        print "post mk"
                    except:
                        #print text
                        print traceback.format_exc()
                        continue

                    #break
                    try:
                        print "PRE SAVE"
                        s = SaveNews()
                        sav = s.saveme(
                            title=title,
                            subtitle="",
                            image="",
                            url=url,
                            category=category,
                            keyword=[],
                            news_from="uy_press_deportes",
                            id=c,
                            html=text,
                        )
                        print "SAVE"
                        print sav
                    except:
                        print traceback.format_exc()
                        continue

                    print "sleep"
                    time.sleep(2)
                    print "wake up"

                    c += 1
                    print c

                except:
                    print traceback.format_exc()
                    return traceback.format_exc()
        except:
            print traceback.format_exc()
            return traceback.format_exc()
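
A condensed, self-contained sketch of the listing scrape above: fetch the category page and collect (title, url) pairs. It assumes BeautifulSoup here is bs4, and the "principalUC" container id is taken from the snippet, so it will break if uypress.net changes its markup.

# Sketch only: collect (title, url) pairs from the uy_press category page.
import urllib2
from bs4 import BeautifulSoup

page = "http://www.uypress.net/acategoria.aspx?57"
soup = BeautifulSoup(urllib2.urlopen(page).read(), "html.parser")

items = []
for div in soup.find_all("div", attrs={"id": "principalUC"})[0].find_all("div"):
    links = div.find_all("a")
    if not links:
        continue
    items.append((links[0].string, links[0]["href"]))

for title, url in items:
    print [title], url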