def setUp(self):
    """Prepare one crawler test session against the Presidencia aspirantes site.

    Opens the landing page, caches the advanced-search UI controls, and wires
    up the Mongo DAOs, two Redis connections, and the entity metadata used by
    the tests.
    """
    # Landing page of the crawl.  The second argument to driver.setup is
    # presumably a wait/timeout in seconds -- TODO confirm against dcrawl.
    self.url = "https://aspirantes.presidencia.gov.co/"
    self.driver.setup(self.url, 2)
    # Locate the advanced-search controls once up front via the XPath
    # expressions defined on the test class (self.xpath*).
    self.btnBusqAvan = self.driver.getelement(self.xpathBusqAvan)
    self.selectElement = self.driver.getelement(self.xpathSelect)
    self.btnBuscar = self.driver.getelement(self.xpathBuscar)
    self.btnCancelar = self.driver.getelement(self.xpathCancelar)
    # NOTE(review): `database` is not defined in this method; it is
    # presumably a module-level Mongo handle -- verify it is in scope.
    self.metadatapages = metadataDAO.MetadataDAO(database)
    self.pages = pageDAO.PageDAO(database)
    # Redis DB 1 appears to hold page links and DB 2 people, matching the
    # sibling scripts in this project -- confirm.
    self.redis = connection_to_redis(1)
    self.redispeople = connection_to_redis(2)
    # Metadata describing how to scrape this entity's pages.
    self.metadata = self.metadatapages.get_metadata_by_entity(
        "ASPIRANTESPRESIDENCIA")
def search_item():
    """Handle the search form: query Redis for people or Mongo for pages.

    On POST, reads ``keywords`` and ``items`` from the submitted form.  When
    ``items`` is ``'person'`` the people Redis DB (2) is searched and the
    prettified hits are rendered on ``index.html``; otherwise the Mongo page
    store is searched and any hits are rendered as pretty-printed JSON on
    ``result_crawler.html``.  On GET, or when Mongo finds nothing, the plain
    ``index.html`` search form is rendered.

    Returns:
        The rendered template response (``result2`` carries the results).
    """
    result = None
    # Default template: previously this stayed None on a GET request and
    # render_template(None, ...) crashed.  Fall back to the search form.
    template = 'index.html'
    if request.method == 'POST':
        keywords = request.form["keywords"]
        item = request.form["items"]
        if item == 'person':
            redispeople = connection_to_redis(2)
            # Use a distinct loop variable: the original reused ``item``
            # here, which clobbered the outer ``item`` under Python 2.
            result = [prettify(hit) for hit in redispeople.search(keywords)]
        else:
            database = connection_to_pages()
            db_mongo = PageDAO(database.get_db())
            results = db_mongo.search(keywords)
            if results:
                try:
                    # Round-trip through latin-1 so payloads the downstream
                    # template layer cannot represent are rejected early --
                    # TODO confirm this is the intent.
                    result = json.dumps(
                        results, sort_keys=True, indent=4,
                        separators=(',', ': '),
                        ensure_ascii=False).encode("latin1").decode('latin1')
                except UnicodeError:
                    # Not representable in latin-1: fall back to
                    # ASCII-escaped JSON instead.
                    result = json.dumps(results, sort_keys=True, indent=4,
                                        separators=(',', ': '))
                template = 'result_crawler.html'
    return render_template(template, result2=result)
def get_item():
    """Render every stored version of the page matching ``keyword``.

    Resolves the ``keyword`` query parameter to Mongo version ids via the
    people Redis DB (2), formats each version through the page DAO, and
    shows the collection as pretty-printed JSON on ``result_crawler.html``.
    """
    mongo = connection_to_pages()
    people_redis = connection_to_redis(2)
    keyword = request.args.get("keyword")
    page_dao = PageDAO(mongo.get_db())
    # Collect every formatted version referenced by the keyword.
    versions = [page_dao.get_page_formatted(vid)
                for vid in people_redis.get_mongoid(keyword)]
    try:
        # Prefer readable (non-escaped) JSON when it survives latin-1.
        payload = json.dumps(
            versions, sort_keys=True, indent=4, separators=(',', ': '),
            ensure_ascii=False).encode("latin1").decode('latin1')
    except Exception:
        # Fallback: ASCII-escaped JSON.
        payload = json.dumps(versions, sort_keys=True, indent=4,
                             separators=(',', ': '))
    return render_template('result_crawler.html', result2=payload)
# Loader script (Python 2: note the `print` statement below): seeds the
# metadata collection from json_pages.json and enqueues each page URL as a
# pending link in Redis DB 1.
import pymongo
import metadataDAO
import json
import sys
from datetime import datetime
from connection_redis import connection_to_redis
from connection_mongo import connection_to_pages

# Redis DB 1 holds the crawl link queue.
redis = connection_to_redis(1)
db_mongo = connection_to_pages()
# NOTE(review): PageDAO is never imported in this file -- this line would
# raise NameError as written; presumably `pageDAO.PageDAO` was intended.
database = PageDAO(db_mongo.get_db())
# NOTE(review): MetadataDAO is handed the PageDAO instance rather than the
# raw db handle -- confirm that is what MetadataDAO expects.
metadatapages = metadataDAO.MetadataDAO(database)


def fecha_auditoria():
    """Return the current local time as a 'YYYY-MM-DD HH:MM:SS' audit stamp."""
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S")


# Seed metadata and enqueue every URL of every entity as a fresh (estado=0)
# link carrying its metadata id, audit timestamp and structure type.
with open('json_pages.json', 'r') as f:
    pages_dict = json.load(f)
for metadata in pages_dict:
    metadatapages.add_metadata(metadata)
    print metadata["_id"]
    for enlace in metadata['urls']:
        # NOTE(review): this call appears cut off in the visible source --
        # the dict and call are never closed; verify against the full file.
        redis.new_link(
            enlace,
            {
                'estado': 0,
                "metadata_id": metadata['_id'],
                "fecha": fecha_auditoria(),
                'tipo': metadata['t_estructura']
# Crawler driver script (Python 2: `sets` and `urlparse` imports).  Wires up
# Mongo/Redis access and defines the simple-page extraction routine.
from datetime import datetime
from sets import Set
import pymongo
import metadataDAO
import pageDAO
from CrawlerSigep import extraerSigep
from CrawlerPresidencia import extraerPresidencia
from json_to_neo4j import JsonToNeo4j
import urlparse
from connection_mongo import connection_to_pages

db_mongo = connection_to_pages()
# NOTE(review): bare `PageDAO` is not imported here (only the pageDAO
# module is) -- as written this raises NameError; presumably
# `pageDAO.PageDAO` was intended.
database = PageDAO(db_mongo.get_db())
# NOTE(review): MetadataDAO/PageDAO below receive the PageDAO instance
# rather than the raw db handle -- confirm that is intentional.
metadatapages = metadataDAO.MetadataDAO(database)
pages = pageDAO.PageDAO(database)
# NOTE(review): connection_to_redis is not imported in the visible source
# (only connection_to_pages is) -- verify against the full file.
redis = connection_to_redis(1)
redispeople = connection_to_redis(2)


def exp_pag_sencilla(metadata, url):
    """Scrape a single 'simple' page into a Person attached to a new Pagina.

    Drives the browser to ``url``, extracts each field described by
    ``metadata['info']`` into a Person, and attaches the person to a new
    page version when a name was found.

    NOTE(review): ``dcrawl``, ``Person`` and ``Pagina`` are not among the
    visible imports, and the function ends without persisting or returning
    ``new_version`` -- it may be truncated in this view; confirm.
    """
    driver = dcrawl()
    driver.setup(url, 2)
    persona = Person()
    new_version = Pagina(url, metadata['entidad'])
    # metadata['info'] maps attribute name -> extraction spec for the driver.
    for dato in metadata['info']:
        result_data = driver.explotar_tipo(metadata['info'][dato])
        persona.add_attribute(dato, result_data)
    driver.close()
    persona.set_timestamp()
    # Only pages where a person name was actually extracted get the person.
    if persona.name:
        new_version.agregar_persona(persona.persona)