コード例 #1
0
ファイル: CrawlerPresidencia.py プロジェクト: AngyeM/Palliri
 def setUp(self):
     """Prepare the crawler fixture: open the target page and cache the
     UI elements and data-store handles the test methods use."""
     # Landing page of the "aspirantes" search UI.
     self.url = "https://aspirantes.presidencia.gov.co/"
     # NOTE(review): self.driver and the self.xpath* attributes are
     # presumably defined on the class elsewhere — not visible in this chunk.
     self.driver.setup(self.url, 2)  # second arg looks like a wait/timeout — confirm
     self.btnBusqAvan = self.driver.getelement(self.xpathBusqAvan)  # "advanced search" button
     self.selectElement = self.driver.getelement(self.xpathSelect)
     self.btnBuscar = self.driver.getelement(self.xpathBuscar)      # "search" button
     self.btnCancelar = self.driver.getelement(self.xpathCancelar)  # "cancel" button
     # Data-access objects; `database` is a module-level name not shown here.
     self.metadatapages = metadataDAO.MetadataDAO(database)
     self.pages = pageDAO.PageDAO(database)
     # Redis db 1 is used as the link queue and db 2 as the people index
     # elsewhere in this project (see the initmetadata.py / main.py snippets).
     self.redis = connection_to_redis(1)
     self.redispeople = connection_to_redis(2)
     self.metadata = self.metadatapages.get_metadata_by_entity(
         "ASPIRANTESPRESIDENCIA")
コード例 #2
0
ファイル: main.py プロジェクト: AngyeM/Palliri
def search_item():
    """Handle the search form: query Redis for people or Mongo for pages.

    On POST, reads ``keywords`` and ``items`` from the form.  A ``person``
    search goes to the Redis people index (db 2); anything else is a page
    search in Mongo.  Renders ``result_crawler.html`` with the results as
    pretty-printed JSON, or ``index.html`` when there is nothing to show
    (empty page search, person search, or a plain GET).
    """
    result = None
    # BUG FIX: template was initialised to None, so a plain GET request
    # reached render_template(None, ...) and crashed.
    template = 'index.html'
    if request.method == 'POST':
        keywords = request.form["keywords"]
        item = request.form["items"]
        if item == 'person':
            redispeople = connection_to_redis(2)
            # Loop variable renamed so it no longer shadows `item` above.
            result = [prettify(hit) for hit in redispeople.search(keywords)]
        else:
            database = connection_to_pages()
            db_mongo = PageDAO(database.get_db())
            results = db_mongo.search(keywords)
            if results:
                try:
                    # ensure_ascii=False keeps accented characters readable;
                    # the latin1 round-trip mirrors how the original stored
                    # the payload for the template.
                    result = json.dumps(
                        results, sort_keys=True, indent=4,
                        separators=(',', ': '), ensure_ascii=False
                    ).encode("latin1").decode('latin1')
                except UnicodeEncodeError:
                    # Payload contains characters outside latin1 — fall back
                    # to ASCII-escaped JSON instead of silently swallowing
                    # every exception as the original did.
                    result = json.dumps(results, sort_keys=True, indent=4,
                                        separators=(',', ': '))
                template = 'result_crawler.html'
    return render_template(template, result2=result)
コード例 #3
0
ファイル: main.py プロジェクト: AngyeM/Palliri
def get_item():
    """Render the stored page versions for a person keyword.

    Looks ``keyword`` (query-string arg) up in the Redis people index
    (db 2) to obtain Mongo version ids, fetches each formatted page
    version, and renders them all as pretty-printed JSON.
    """
    database = connection_to_pages()
    keyword = request.args.get("keyword")
    redispeople = connection_to_redis(2)
    page = PageDAO(database.get_db())
    # Comprehension replaces the manual append loop.
    versions = [page.get_page_formatted(version_id)
                for version_id in redispeople.get_mongoid(keyword)]
    try:
        # ensure_ascii=False keeps accented characters readable; the latin1
        # round-trip mirrors how the original stored the payload.
        result2 = json.dumps(versions, sort_keys=True, indent=4,
                             separators=(',', ': '),
                             ensure_ascii=False).encode("latin1").decode('latin1')
    except UnicodeEncodeError:
        # Characters outside latin1 — fall back to ASCII-escaped JSON.
        # (Narrowed from the original bare `except Exception as e`.)
        result2 = json.dumps(versions, sort_keys=True, indent=4,
                             separators=(',', ': '))
    return render_template('result_crawler.html', result2=result2)
コード例 #4
0
ファイル: initmetadata.py プロジェクト: AngyeM/Palliri
import pymongo
import metadataDAO
import json
import sys
from datetime import datetime
from connection_redis import connection_to_redis
from connection_mongo import connection_to_pages
# Module-level handles: Redis db 1 is the crawl/link queue (see new_link below).
redis = connection_to_redis(1)

db_mongo = connection_to_pages()
# NOTE(review): PageDAO is referenced unqualified but never imported in this
# module — likely a missing `from pageDAO import PageDAO`; confirm against repo.
database = PageDAO(db_mongo.get_db())
metadatapages = metadataDAO.MetadataDAO(database)


def fecha_auditoria():
    """Return the current local time as 'YYYY-MM-DD HH:MM:SS' for audit fields."""
    return "{:%Y-%m-%d %H:%M:%S}".format(datetime.now())


# Seed the metadata collection and the Redis crawl queue from a static manifest.
with open('json_pages.json', 'r') as f:
    pages_dict = json.load(f)

for metadata in pages_dict:
    metadatapages.add_metadata(metadata)
    # NOTE(review): Python 2 `print` statement — this module is py2-only.
    print metadata["_id"]
    for enlace in metadata['urls']:
        # Enqueue every URL with estado 0 — presumably "not yet crawled"; confirm.
        redis.new_link(
            enlace, {
                'estado': 0,
                "metadata_id": metadata['_id'],
                "fecha": fecha_auditoria(),
                'tipo': metadata['t_estructura']
                # NOTE(review): this call appears truncated in this excerpt —
                # the closing braces/parenthesis are missing from the scrape.
コード例 #5
0
from datetime import datetime
from sets import Set
import pymongo
import metadataDAO
import pageDAO
from CrawlerSigep import extraerSigep
from CrawlerPresidencia import extraerPresidencia
from json_to_neo4j import JsonToNeo4j
import urlparse
from connection_mongo import connection_to_pages

db_mongo = connection_to_pages()
# NOTE(review): PageDAO is used unqualified although only the `pageDAO` module
# is imported above — presumably a missing `from pageDAO import PageDAO`.
database = PageDAO(db_mongo.get_db())
metadatapages = metadataDAO.MetadataDAO(database)
pages = pageDAO.PageDAO(database)
# NOTE(review): connection_to_redis is not imported in this excerpt — confirm.
redis = connection_to_redis(1)        # db 1: crawl/link queue
redispeople = connection_to_redis(2)  # db 2: people index


def exp_pag_sencilla(metadata, url):
    driver = dcrawl()
    driver.setup(url, 2)
    persona = Person()
    new_version = Pagina(url, metadata['entidad'])
    for dato in metadata['info']:
        result_data = driver.explotar_tipo(metadata['info'][dato])
        persona.add_attribute(dato, result_data)
    driver.close()
    persona.set_timestamp()
    if persona.name:
        new_version.agregar_persona(persona.persona)