def __init__(self, unoporuno_root, regex_type):
    """Select and open the regex resource file that matches ``regex_type``.

    Args:
        unoporuno_root: Root directory of the unoporuno installation. The
            ``/module/`` import path and the ``/resources/regex/`` file
            paths are built by string concatenation from it.
        regex_type: Name of the regex feature (for example
            ``'organization'`` or ``'cv http'``); it selects the resource
            file to read.

    Raises:
        FeatureError: If ``regex_type`` is not one of the known feature
            names, or if the resource file cannot be opened.
    """
    self.exit_at_first = True
    logging.basicConfig(level=logging.DEBUG)

    # Test the exact path that gets appended; testing the bare root never
    # matched, so the module path was re-added on every instantiation.
    module_path = unoporuno_root + '/module/'
    if module_path not in sys.path:
        sys.path.append(module_path)
    # Imported here because it is only resolvable once sys.path is extended.
    from dospordos.tools import Limpieza
    self.limpieza = Limpieza()
    self.compiled_regex_list = []

    # Feature name -> file name under <root>/resources/regex/.
    regex_files = {
        'organization': 'organization.regex',
        'biographical phrases': 'biographical.phrases.regex',
        'profession': 'profession.regex',
        'degree': 'degree.regex',
        'cv general': 'cv.regex',
        'cv http': 'cv.http.regex',
        'latin nationalities': 'latin.american.nat.regex',
        'world nationalities es': 'world.nat.esp.regex',
        'email': 'email.regex',
        'publication': 'publication.regex',
        'publication http': 'publication.http.regex',
        'thesis': 'thesis.regex',
        'thesis http': 'thesis.http.regex',
        'blacklist http': 'blacklist.http.regex',
    }
    if regex_type not in regex_files:
        raise FeatureError('Unrecognized regex feature type ' + regex_type)
    regex_file_path = unoporuno_root + '/resources/regex/' + regex_files[regex_type]

    try:
        regex_file = open(regex_file_path, 'r')
    except (IOError, OSError):
        logging.error('Error opening regex resource file ' + regex_file_path)
        # The original referenced an undefined name here, which turned the
        # intended FeatureError into a NameError.
        raise FeatureError('Error opening regex resource file ' + regex_file_path)

    logging.info("Start loading gazetteer from " + regex_file_path + " at " + time.asctime())
    with regex_file:
        for line in regex_file:
            if not line.strip():
                continue
            regex_line = line.split('\t')
            # The second tab-separated column, when present, is the case
            # flag; lines without it default to 'cd'.
            case = regex_line[1].strip() if len(regex_line) > 1 else 'cd'
            # NOTE(review): the loop body visible in this chunk stops here;
            # nothing is appended to self.compiled_regex_list and `case` is
            # unused. The rest of the loop appears to be missing - confirm
            # against the full file before applying this edit.
def __init__(self, unoporuno_root, gazetteer_type, case_dependent=False):
    """Load a gazetteer file and compile one regex per non-empty line.

    Each entry is stored in ``self.compiled_regex`` as a
    ``(compiled_pattern, cleaned_entry)`` tuple. The pattern requires one
    delimiter character on each side of the entry.

    Args:
        unoporuno_root: Root directory of the unoporuno installation. The
            ``/module/`` import path and the ``/resources/gazetteer/`` file
            paths are built by string concatenation from it.
        gazetteer_type: One of ``'country'``, ``'city'``, ``'accronym'``,
            ``'profession'``, ``'degree'`` or ``'world nationalities en'``.
        case_dependent: When false, entries are lower-cased before being
            compiled. For ``'city'`` it also selects the ``.cd`` or ``.ci``
            resource file.

    Raises:
        GazetteerFeatureError: If ``gazetteer_type`` is not recognised.
        FeatureError: If the gazetteer resource file cannot be opened.
    """
    logging.basicConfig(level=logging.DEBUG)
    self.case_dependency = case_dependent
    self.exit_at_first = True

    # Test the exact path that gets appended; testing the bare root never
    # matched, so the module path was re-added on every instantiation.
    module_path = unoporuno_root + '/module/'
    if module_path not in sys.path:
        sys.path.append(module_path)
    # Imported here because it is only resolvable once sys.path is extended.
    from dospordos.tools import Limpieza
    self.limpieza = Limpieza()
    self.compiled_regex = []

    # Gazetteer type -> file name under <root>/resources/gazetteer/.
    gazetteer_files = {
        'country': 'country.gazt',
        'city': 'world.cities.cd.gazt' if case_dependent else 'world.cities.ci.gazt',
        'accronym': 'accronyms.cd.gazt',
        'profession': 'profession.ci.gazt',
        'degree': 'degree.ci.gazt',
        'world nationalities en': 'world.nat.eng.ci.gazt',
    }
    if gazetteer_type not in gazetteer_files:
        raise GazetteerFeatureError('Unrecognized gazetteer feature type ' + gazetteer_type)
    gazetteer_file_path = (unoporuno_root + '/resources/gazetteer/'
                           + gazetteer_files[gazetteer_type])

    try:
        gazetteer_file = open(gazetteer_file_path, 'r')
    except (IOError, OSError):
        logging.error('Error opening gazetteer resource file ' + gazetteer_file_path)
        # The original referenced an undefined name here, which turned the
        # intended FeatureError into a NameError.
        raise FeatureError('Error opening gazetteer resource file ' + gazetteer_file_path)

    logging.info("Start loading gazetteer list from " + gazetteer_file_path)
    with gazetteer_file:
        for line in gazetteer_file:
            entry = line.strip()
            if not entry:
                continue
            # presumably escapes regex metacharacters and strips accents;
            # verify against dospordos.tools.Limpieza
            gazet_str = self.limpieza.limpia_reservados_regex(entry)
            gazet_str = self.limpieza.limpia_acentos(gazet_str)
            if not self.case_dependency:
                gazet_str = gazet_str.lower()
            # Raw strings keep the pattern text unchanged while avoiding the
            # invalid "\-" escape of a normal string literal.
            regex_str = (r'[ .\-,:;–(<"¿¡]{1,1}' + gazet_str
                         + r'[ .\-,:;)–>"?!]{1,1}')
            self.compiled_regex.append((re.compile(regex_str), gazet_str))
    logging.info("End loading gazetteer list from " + gazetteer_file_path)
def __init__(self, unoporuno_root, gazetteer_type, case_dependent=False):
    """Load a typed gazetteer: tab-separated ``expression<TAB>type`` lines.

    For each usable line, a ``(compiled_pattern, cleaned_expression)``
    tuple is appended to ``self.compiled_regex`` and the expression's type
    is recorded in ``self.type_dict``. The pattern accepts the expression
    at the start or end of the text, or with a delimiter on that side.

    Args:
        unoporuno_root: Root directory of the unoporuno installation. The
            ``/module/`` import path and the ``/resources/gazetteer/`` file
            paths are built by string concatenation from it.
        gazetteer_type: ``'country'`` or ``'city'``.
        case_dependent: Selects the ``.cd.`` (true) or ``.ci.`` (false)
            resource file. When false, expressions are also lower-cased.

    Raises:
        GazetteerFeatureError: If ``gazetteer_type`` is not recognised.
        FeatureError: If the gazetteer resource file cannot be opened.
    """
    logging.basicConfig(level=logging.DEBUG)
    self.case_dependency = case_dependent
    self.exit_at_first = True

    # Test the exact path that gets appended; testing the bare root never
    # matched, so the module path was re-added on every instantiation.
    module_path = unoporuno_root + '/module/'
    if module_path not in sys.path:
        sys.path.append(module_path)
    # Imported here because it is only resolvable once sys.path is extended.
    from dospordos.tools import Limpieza
    self.limpieza = Limpieza()
    self.compiled_regex = []
    self.type_dict = dict()

    case = 'cd' if case_dependent else 'ci'
    # Gazetteer type -> file-name stem; the case suffix is added below.
    gazetteer_stems = {
        'country': 'world.countries.',
        'city': 'world.cities.country.',
    }
    if gazetteer_type not in gazetteer_stems:
        raise GazetteerFeatureError('Unrecognized gazetteer feature type ' + gazetteer_type)
    gazetteer_file_path = (unoporuno_root + '/resources/gazetteer/'
                           + gazetteer_stems[gazetteer_type] + case + '.gazt')

    try:
        gazetteer_file = open(gazetteer_file_path, 'r')
    except (IOError, OSError):
        logging.error('Error opening gazetteer resource file ' + gazetteer_file_path)
        raise FeatureError('Error opening gazetteer resource file ' + gazetteer_file_path)

    logging.info("Start loading gazetteer list from " + gazetteer_file_path)
    with gazetteer_file:
        for line in gazetteer_file:
            qualified_expression = line.strip().split('\t')
            # Blank lines split to a single field, so this one check skips
            # both empty lines and lines without a type column.
            if len(qualified_expression) < 2:
                continue
            # presumably escapes regex metacharacters and strips accents;
            # verify against dospordos.tools.Limpieza
            gazet_str = self.limpieza.limpia_reservados_regex(qualified_expression[0])
            gazet_type = qualified_expression[1]
            gazet_str = self.limpieza.limpia_acentos(gazet_str)
            if self.case_dependency:
                logging.debug(gazet_str + ' of type ' + gazet_type
                              + ' added to case dependant gazetter self set')
            else:
                gazet_str = gazet_str.lower()
                logging.debug(gazet_str + ' of type ' + gazet_type
                              + ' added to case independant gazetter self set')
            # Raw strings keep the pattern text unchanged while avoiding the
            # invalid "\-" escape of a normal string literal.
            regex_str = (r'(^|[ .\-,:;–(<"¿¡]{1,1})' + gazet_str
                         + r'([ .\-,:;)–>"?!]{1,1}|$)')
            self.compiled_regex.append((re.compile(regex_str), gazet_str))
            self.type_dict[gazet_str] = gazet_type
    logging.info("End loading gazetteer list from " + gazetteer_file_path)
# Import a "busqueda" XML export: read the file named on the command line
# and store its header fields (nombre, fecha, usuario, descripcion) as a new
# Busqueda record, then locate the <person> elements.
from unoporuno.models import Busqueda, Persona, Snippet, Vinculo
if UNOPORUNO_PATH not in sys.path:
    sys.path.append(UNOPORUNO_PATH)
from dospordos.tools import DiasporaOutput, Limpieza

# Check the argument up front: previously a missing argument was caught by
# the bare except, whose handler then failed with NameError on the unbound
# xml_file instead of printing the usage message.
if len(sys.argv) < 2:
    logging.error('Usage: python unoporuno_import xml_file_name')
    sys.exit(-1)
xml_file = sys.argv[1]
try:
    busqueda_xml = etree.parse(xml_file)
except Exception:
    # Deliberately broad: the concrete parse/IO exception types depend on
    # which etree implementation the file imports.
    logging.error('Invalid file' + xml_file)
    logging.error('Usage: python unoporuno_import xml_file_name')
    sys.exit(-1)

logging.info('Processing file ' + xml_file)
L = Limpieza()
x_busqueda = busqueda_xml.getroot()
x_nombre = x_busqueda.find('nombre')
x_fecha = x_busqueda.find('fecha')
x_usuario = x_busqueda.find('usuario')
x_descripcion = x_busqueda.find('descripcion')

# NOTE(review): find() returns None for a missing child element, so the
# .text accesses below assume all four header elements are present.
busqueda = Busqueda()
busqueda.nombre = L.limpia_reservados_xml(x_nombre.text)
busqueda.fecha = x_fecha.text
busqueda.usuario = L.limpia_reservados_xml(x_usuario.text)
busqueda.descripcion = L.limpia_reservados_xml(x_descripcion.text)
busqueda.save()
logging.info('Importing busqueda ' + busqueda.nombre)

x_personas = x_busqueda.find('personas')
x_personas_set = x_personas.findall('person')
# Short alias for the XML clean-up helper.
limpia = L.limpia_reservados_xml