Beispiel #1
0
 def __init__(self, unoporuno_root, regex_type):
     """Locate and open the regex resource file for ``regex_type``.

     Args:
         unoporuno_root: Root directory of the installation; ``/module/``
             and ``/resources/regex/`` are resolved beneath it.
         regex_type: Name of the regex feature; must be a key of
             ``regex_files`` below.

     Raises:
         FeatureError: If ``regex_type`` is not recognized or the
             resource file cannot be opened.
     """
     self.exit_at_first = True
     logging.basicConfig(level=logging.DEBUG)
     # Test for the directory that is actually appended; testing the bare
     # root made the guard ineffective and grew sys.path on every call.
     module_path = unoporuno_root + '/module/'
     if module_path not in sys.path:
         sys.path.append(module_path)
     from dospordos.tools import Limpieza
     self.limpieza = Limpieza()
     self.compiled_regex_list = []

     # Feature name -> file name under <root>/resources/regex/.
     regex_files = {
         'organization': 'organization.regex',
         'biographical phrases': 'biographical.phrases.regex',
         'profession': 'profession.regex',
         'degree': 'degree.regex',
         'cv general': 'cv.regex',
         'cv http': 'cv.http.regex',
         'latin nationalities': 'latin.american.nat.regex',
         'world nationalities es': 'world.nat.esp.regex',
         'email': 'email.regex',
         'publication': 'publication.regex',
         'publication http': 'publication.http.regex',
         'thesis': 'thesis.regex',
         'thesis http': 'thesis.http.regex',
         'blacklist http': 'blacklist.http.regex',
     }
     if regex_type not in regex_files:
         raise FeatureError('Unrecognized regex feature type '+regex_type)
     regex_file_path = unoporuno_root + '/resources/regex/' + regex_files[regex_type]

     try:
         regex_file = open(regex_file_path, 'r')
     except Exception:
         logging.error('Error opening regex resource file '+regex_file_path)
         # The original referenced an undefined name here, so callers got a
         # NameError instead of the intended FeatureError.
         raise FeatureError('Error opening regex resource file '+regex_file_path)
     logging.info("Start loading gazetteer from " +regex_file_path+ " at " +time.asctime())
     for line in regex_file:
         if len(line.strip()) == 0:
             continue
         # Columns are tab-separated; an optional second column carries the
         # case flag, which defaults to 'cd' when absent.
         regex_line = line.split('\t')
         if len(regex_line) > 1:
             case = regex_line[1].strip()
         else:
             case = 'cd'
         # NOTE(review): the visible code ends here -- `case` is not yet
         # consumed and `regex_file` is not closed; confirm against the
         # complete method before relying on this loop.
Beispiel #2
0
    def __init__(self, unoporuno_root, gazetteer_type, case_dependent=False):
        """Load a gazetteer resource file into a list of compiled regexes.

        Every non-blank line of the selected file becomes one
        ``(compiled_regex, cleaned_entry)`` tuple in ``self.compiled_regex``.

        Args:
            unoporuno_root: Root directory of the installation; ``/module/``
                and ``/resources/gazetteer/`` are resolved beneath it.
            gazetteer_type: Name of the gazetteer; must be a key of
                ``gazetteer_files`` below.
            case_dependent: When false, entries are lower-cased before being
                compiled. For ``'city'`` it also selects the ``.cd``/``.ci``
                variant of the resource file.

        Raises:
            GazetteerFeatureError: If ``gazetteer_type`` is not recognized.
            FeatureError: If the resource file cannot be opened.
        """
        logging.basicConfig(level=logging.DEBUG)
        self.case_dependency = case_dependent
        self.exit_at_first = True
        # Test for the directory that is actually appended; testing the bare
        # root made the guard ineffective and grew sys.path on every call.
        module_path = unoporuno_root + '/module/'
        if module_path not in sys.path:
            sys.path.append(module_path)
        from dospordos.tools import Limpieza
        self.limpieza = Limpieza()
        self.compiled_regex = []

        if case_dependent:
            cities_file = 'world.cities.cd.gazt'
        else:
            cities_file = 'world.cities.ci.gazt'
        # Gazetteer name -> file name under <root>/resources/gazetteer/.
        gazetteer_files = {
            'country': 'country.gazt',
            'city': cities_file,
            'accronym': 'accronyms.cd.gazt',
            'profession': 'profession.ci.gazt',
            'degree': 'degree.ci.gazt',
            'world nationalities en': 'world.nat.eng.ci.gazt',
        }
        if gazetteer_type not in gazetteer_files:
            raise GazetteerFeatureError('Unrecognized gazetteer feature type '+gazetteer_type)
        gazetteer_file_path = (unoporuno_root + '/resources/gazetteer/'
                               + gazetteer_files[gazetteer_type])

        try:
            gazetteer_file = open(gazetteer_file_path, 'r')
        except Exception:
            logging.error('Error opening gazetteer resource file '+gazetteer_file_path)
            # The original referenced an undefined name here, so callers got a
            # NameError instead of the intended FeatureError.
            raise FeatureError('Error opening gazetteer resource file '+gazetteer_file_path)

        logging.info("Start loading gazetteer list from " +gazetteer_file_path)

        # try/finally guarantees the handle is released even if an entry
        # fails to compile.
        try:
            for line in gazetteer_file:
                entry = line.strip()
                if not entry:
                    continue
                # Limpieza helpers are defined in dospordos.tools; presumably
                # they escape regex metacharacters and strip accents -- confirm.
                gazet_str = self.limpieza.limpia_reservados_regex(entry)
                gazet_str = self.limpieza.limpia_acentos(gazet_str)
                if not self.case_dependency:
                    # Case-independent gazetteers are stored lower-cased.
                    gazet_str = gazet_str.lower()
                # The entry must be preceded and followed by exactly one
                # delimiter character from the respective class.
                regex_str = r'[ .\-,:;–(<"¿¡]{1,1}'+gazet_str+r'[ .\-,:;)–>"?!]{1,1}'
                self.compiled_regex.append((re.compile(regex_str), gazet_str))
        finally:
            gazetteer_file.close()

        logging.info("End loading gazetteer list from " +gazetteer_file_path)
Beispiel #3
0
    def __init__(self, unoporuno_root, gazetteer_type, case_dependent=False):
        """Load a typed gazetteer file into compiled regexes and a type map.

        Each usable line holds ``expression<TAB>type``. The expression becomes
        one ``(compiled_regex, cleaned_expression)`` tuple in
        ``self.compiled_regex`` and ``self.type_dict`` maps the cleaned
        expression to its type. Lines without a second tab-separated column
        are skipped.

        Args:
            unoporuno_root: Root directory of the installation; ``/module/``
                and ``/resources/gazetteer/`` are resolved beneath it.
            gazetteer_type: Either ``'country'`` or ``'city'``.
            case_dependent: Selects the ``.cd`` (true) or ``.ci`` (false)
                resource file; when false, expressions are lower-cased.

        Raises:
            GazetteerFeatureError: If ``gazetteer_type`` is not recognized.
            FeatureError: If the resource file cannot be opened.
        """
        logging.basicConfig(level=logging.DEBUG)
        self.case_dependency = case_dependent
        self.exit_at_first = True
        # Test for the directory that is actually appended; testing the bare
        # root made the guard ineffective and grew sys.path on every call.
        module_path = unoporuno_root + '/module/'
        if module_path not in sys.path:
            sys.path.append(module_path)
        from dospordos.tools import Limpieza
        self.limpieza = Limpieza()
        self.compiled_regex = []
        self.type_dict = dict()

        case = 'cd' if case_dependent else 'ci'
        # Gazetteer name -> file-name stem; the case tag and extension follow.
        gazetteer_stems = {
            'country': 'world.countries.',
            'city': 'world.cities.country.',
        }
        if gazetteer_type not in gazetteer_stems:
            raise GazetteerFeatureError('Unrecognized gazetteer feature type '+gazetteer_type)
        gazetteer_file_path = (unoporuno_root + '/resources/gazetteer/'
                               + gazetteer_stems[gazetteer_type] + case + '.gazt')

        try:
            gazetteer_file = open(gazetteer_file_path, 'r')
        except Exception:
            logging.error('Error opening gazetteer resource file '+gazetteer_file_path)
            raise FeatureError('Error opening gazetteer resource file '+gazetteer_file_path)

        logging.info("Start loading gazetteer list from " +gazetteer_file_path)

        # try/finally guarantees the handle is released even if an entry
        # fails to compile.
        try:
            for line in gazetteer_file:
                fields = line.strip().split('\t')
                # A blank line splits into a single empty field, so this one
                # check also covers empty lines.
                if len(fields) < 2:
                    continue
                # Limpieza helpers are defined in dospordos.tools; presumably
                # they escape regex metacharacters and strip accents -- confirm.
                gazet_str = self.limpieza.limpia_reservados_regex(fields[0])
                gazet_type = fields[1]
                if self.case_dependency:
                    gazet_str = self.limpieza.limpia_acentos(gazet_str)
                    logging.debug (gazet_str+ ' of type ' +gazet_type+ ' added to case dependant gazetter self set')
                else:
                    gazet_str = self.limpieza.limpia_acentos(gazet_str).lower()
                    logging.debug (gazet_str+ ' of type ' +gazet_type+ ' added to case independant gazetter self set')
                # The expression may sit at the start/end of the text or be
                # bounded by one delimiter character on either side.
                regex_str = r'(^|[ .\-,:;–(<"¿¡]{1,1})'+gazet_str+r'([ .\-,:;)–>"?!]{1,1}|$)'
                self.compiled_regex.append((re.compile(regex_str), gazet_str))
                self.type_dict[gazet_str] = gazet_type
        finally:
            gazetteer_file.close()
        logging.info("End loading gazetteer list from " +gazetteer_file_path)
Beispiel #4
0
from unoporuno.models import Busqueda, Persona, Snippet, Vinculo

if not UNOPORUNO_PATH in sys.path:
    sys.path.append(UNOPORUNO_PATH)
from dospordos.tools import DiasporaOutput, Limpieza

# Read the XML file name from the command line and parse it; on any failure
# log a usage hint and terminate with a non-zero status.
if len(sys.argv) < 2:
    # Without this guard the error handler below would itself fail with a
    # NameError, because xml_file would never have been assigned.
    logging.error('Usage: python unoporuno_import xml_file_name')
    sys.exit(-1)
xml_file = sys.argv[1]
try:
    busqueda_xml = etree.parse(xml_file)
except Exception:
    logging.error('Invalid file ' + xml_file)
    logging.error('Usage: python unoporuno_import xml_file_name')
    sys.exit(-1)

# Build a Busqueda record from the root element of the parsed XML and save it.
logging.info('Processing file ' + xml_file)
L = Limpieza()
x_busqueda = busqueda_xml.getroot()
# Child elements of the root that carry the search's own fields.
# NOTE(review): each .text below is read without checking that find()
# located the element -- presumably the input always carries all four;
# confirm against the export format.
x_nombre = x_busqueda.find('nombre')
x_fecha = x_busqueda.find('fecha')
x_usuario = x_busqueda.find('usuario')
x_descripcion = x_busqueda.find('descripcion')
busqueda = Busqueda()
# nombre, usuario and descripcion pass through limpia_reservados_xml;
# fecha is stored exactly as read.
busqueda.nombre = L.limpia_reservados_xml(x_nombre.text)
busqueda.fecha = x_fecha.text
busqueda.usuario = L.limpia_reservados_xml(x_usuario.text)
busqueda.descripcion = L.limpia_reservados_xml(x_descripcion.text)
busqueda.save()
logging.info('Importing busqueda ' + busqueda.nombre)
# Collect the <person> elements nested under <personas>.
x_personas = x_busqueda.find('personas')
x_personas_set = x_personas.findall('person')
# Short alias for the XML clean-up method.
limpia = L.limpia_reservados_xml