from datastorage import Stock  # interacts with MongoDB

db = Stock()
site = db.url()
while site:
    # print the current record's URL before fetching the next one,
    # so the loop ends cleanly once db.url() returns nothing
    print(site['url'])
    # db.update(site)
    site = db.url()
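Neither snippet shows datastorage.Stock itself. As a point of reference, here is a minimal sketch of what it could look like, assuming a pymongo backend; the method names (count, save_data, url, visit, update) are inferred from the calls in these scripts, and the database/collection names are made up.

from pymongo import MongoClient


class Stock(object):
    """Minimal sketch of the storage wrapper; interface inferred from usage."""

    def __init__(self, host='localhost', port=27017):
        # 'crawler' and 'pages' are hypothetical database/collection names
        self.col = MongoClient(host, port)['crawler']['pages']

    def count(self):
        # number of stored records
        return self.col.count_documents({})

    def save_data(self, doc):
        # insert a new record, e.g. {'visit': False, 'url': ''}
        self.col.insert_one(doc)

    def url(self):
        # one record that has not been visited yet, or None
        return self.col.find_one({'visit': False})

    def visit(self):
        # cursor over every visited record
        return self.col.find({'visit': True})

    def update(self, site):
        # mark a record as visited
        self.col.update_one({'_id': site['_id']}, {'$set': {'visit': True}})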
import re

from datastorage import Stock

db = Stock()
for page in db.visit():
    try:
        # collapse runs of whitespace and non-breaking spaces into single spaces
        page['text'] = u" ".join(page['text'].replace(u"\xa0", u" ").strip().split())
        # keep only letters, spaces and dashes, then turn separators into spaces
        print(str(page['_id']) + " " +
              re.sub(r'[-_/]', ' ',
                     re.sub(r'[^a-zA-Z\- ]', '', page['text'].lower())))
    except Exception:
        continue
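To see what this normalization produces, a small self-contained example (the input string is invented):

import re

sample = u"Caf\xe9\xa0 menu - 2024"  # made-up input with an accent and a non-breaking space
text = u" ".join(sample.replace(u"\xa0", u" ").strip().split())
print(re.sub(r'[-_/]', ' ', re.sub(r'[^a-zA-Z\- ]', '', text.lower())))
# prints "caf menu" plus leftover spaces: accented letters and digits are dropped outright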
import re
from unicodedata import normalize

from datastorage import Stock


def clean_html(data):  # helper name is assumed; only its tail survived
    # remove the css styles (re.DOTALL so multi-line style blocks match)
    p = re.compile(r'<style[^<>]*?>.*?</style>', re.DOTALL)
    data = p.sub('', data)
    # remove html comments (pattern reconstructed; the snippet's was empty)
    p = re.compile(r'<!--.*?-->', re.DOTALL)
    data = p.sub('', data)
    # remove all the remaining tags
    p = re.compile(r'<[^<]*?>')
    data = p.sub('', data)
    return data


db = Stock()
pages = db.visit()
for page in pages:
    try:
        # prefer the stored HTML when it looks substantial, else fall back to text
        if len(page['html']) > 100:
            html = page['html']
        else:
            html = page['text']
        clear_html = re.sub('<[^<]+?>', '', html)
        # strip accents, force ASCII and lowercase everything
        normalizado = normalize('NFKD', clear_html.decode('utf-8')).encode('ASCII', 'ignore').lower()
        text = re.sub(r'[^a-zA-Z\- ]', '', normalizado)
        # separators, leftover punctuation and very long (13+ letter) tokens become spaces
        text = re.sub(r'[-_/]|[a-z]{13,}|\W+|[ \t]+', ' ', text)
        token = text.split()
    except Exception:
        continue
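A quick sanity check of the reconstructed clean_html helper on a made-up fragment:

raw = "<style type='text/css'>p{color:red}</style><!-- nav --><p>Hello <b>world</b></p>"
print(clean_html(raw))
# prints "Hello world"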
import time
import hashlib
# import nltk  # NLP

from spider import Spider      # class that visits the web sites
from datastorage import Stock  # interacts with MongoDB
from unidecode import unidecode

stop = True
db = Stock()  # storage instance

# seed the collection so db.url() has something to return on the first run
if not db.count():
    db.save_data({'visit': False, 'url': ''})

while stop:
    site = db.url()  # get an unvisited URL
    if not site:
        break
    url = site['url']
    m = hashlib.sha1()  # sha1 digest for the fetched page
    date = time.strftime("%Y-%m-%d %H:%M")  # %M = minutes
    print("[ Visit ] " + url)
    response = Spider.get_source(url)  # fetch the HTML for the URL
    if not response:  # no response: mark the URL as visited and move on
        db.update(site)
        continue
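spider.Spider is not shown either. A minimal sketch of a get_source that would satisfy the call above, assuming it simply fetches the URL and returns the HTML body, or None on failure (the requests dependency and the timeout value are assumptions):

import requests


class Spider(object):

    @staticmethod
    def get_source(url, timeout=10):
        # fetch the page; return its body, or None on any failure
        try:
            r = requests.get(url, timeout=timeout)
            if r.status_code == 200:
                return r.text
        except requests.RequestException:
            pass
        return None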