Beispiel #1
0
class QualifiedGazetteerFeature(object):
    """Gazetteer matcher whose entries carry a type qualifier.

    A qualified gazetteer file holds one entry per line, made of two
    tab-separated fields: ``expression<TAB>type``.

    * ``expression`` is the text searched for in a snippet.
    * ``type`` is a comma-separated list of qualifiers; the typed lookups
      return it split on ``','``.

    Every expression is compiled into a regex that only matches when the
    expression is surrounded by a delimiter character or sits at the very
    start / end of the searched text.  In case-independent mode both the
    expressions and the searched text are lower-cased before matching.

    Attributes:
        case_dependency: True when matching is case sensitive.
        exit_at_first: always set to True; not read by this class.
        limpieza: ``dospordos.tools.Limpieza`` instance used to normalise
            text (presumably strips accents / escapes regex metacharacters;
            verify against that module).
        compiled_regex: list of ``(compiled pattern, expression)`` tuples.
        type_dict: maps each expression to its raw type string.
    """

    # Gazetteer file (relative to the unoporuno root) for every supported
    # gazetteer type; '%s' is replaced by 'cd' or 'ci'.
    _GAZETTEER_FILES = {
        'country': '/resources/gazetteer/world.countries.%s.gazt',
        'city': '/resources/gazetteer/world.cities.country.%s.gazt',
    }

    # An expression must be preceded / followed by one of these delimiters,
    # or by the start / end of the text.
    _LEFT_BOUNDARY = r'(^|[ .\-,:;–(<"¿¡]{1,1})'
    _RIGHT_BOUNDARY = r'([ .\-,:;)–>"?!]{1,1}|$)'

    def __init__(self, unoporuno_root, gazetteer_type, case_dependent=False):
        """Load the gazetteer of ``gazetteer_type`` found under ``unoporuno_root``.

        Args:
            unoporuno_root: root directory holding ``module/`` and
                ``resources/gazetteer/``.
            gazetteer_type: ``'country'`` or ``'city'``.
            case_dependent: match case sensitively when True.

        Raises:
            GazetteerFeatureError: ``gazetteer_type`` is not supported.
            FeatureError: the gazetteer resource file cannot be opened.
        """
        logging.basicConfig(level=logging.DEBUG)
        self.case_dependency = case_dependent
        self.exit_at_first = True

        # Test the exact path that gets appended; testing the bare root
        # never matched and grew sys.path on every instantiation.
        module_path = unoporuno_root + '/module/'
        if module_path not in sys.path:
            sys.path.append(module_path)
        from dospordos.tools import Limpieza
        self.limpieza = Limpieza()
        self.compiled_regex = []
        self.type_dict = dict()

        case = 'cd' if case_dependent else 'ci'
        path_template = self._GAZETTEER_FILES.get(gazetteer_type)
        if path_template is None:
            raise GazetteerFeatureError(
                'Unrecognized gazetteer feature type %s' % (gazetteer_type,))
        gazetteer_file_path = unoporuno_root + (path_template % case)

        try:
            gazetteer_file = open(gazetteer_file_path, 'r')
        except (IOError, OSError):
            logging.error('Error opening gazetteer resource file %s',
                          gazetteer_file_path)
            raise FeatureError(
                'Error opening gazetteer resource file ' + gazetteer_file_path)

        logging.info('Start loading gazetteer list from %s', gazetteer_file_path)
        # The context manager closes the file even if an entry fails to load.
        with gazetteer_file:
            for line in gazetteer_file:
                self._add_entry(line)
        logging.info('End loading gazetteer list from %s', gazetteer_file_path)

    def _add_entry(self, line):
        """Compile one ``expression<TAB>type`` line and register it."""
        fields = line.strip().split('\t')
        if len(fields) < 2:
            # Blank line or line without a type qualifier: ignore it.
            return
        gazet_type = fields[1]
        gazet_str = self.limpieza.limpia_acentos(
            self.limpieza.limpia_reservados_regex(fields[0]))
        if self.case_dependency:
            logging.debug(
                '%s of type %s added to case dependant gazetter self set',
                gazet_str, gazet_type)
        else:
            gazet_str = gazet_str.lower()
            logging.debug(
                '%s of type %s added to case independant gazetter self set',
                gazet_str, gazet_type)
        regex = re.compile(
            self._LEFT_BOUNDARY + gazet_str + self._RIGHT_BOUNDARY)
        self.compiled_regex.append((regex, gazet_str))
        self.type_dict[gazet_str] = gazet_type

    def _case_label(self):
        """Return the case-mode prefix used in debug messages."""
        return 'case dependent' if self.case_dependency else 'case independent'

    def _normalise(self, line):
        """Clean ``line`` the same way the gazetteer expressions were cleaned."""
        clean_line = self.limpieza.limpia_acentos(line)
        if self.case_dependency:
            return clean_line
        return clean_line.lower()

    def _matching_expressions(self, line):
        """Yield, in load order, every expression found in ``line``."""
        text = self._normalise(line)
        for regex, gazet_str in self.compiled_regex:
            if regex.search(text):
                yield gazet_str

    def _lookup_type(self, gazet_str, line):
        """Return the raw type string of ``gazet_str`` (None if unknown), logging the hit."""
        label = self._case_label()
        if gazet_str in self.type_dict:
            tipo = self.type_dict[gazet_str]
            logging.debug('%s gazetteer hit! %s of type:%s in line %s',
                          label, gazet_str, tipo, line)
            return tipo
        logging.debug('%s gazetteer hit! %s of type: NOT FOUND in line %s',
                      label, gazet_str, line)
        return None

    def test(self, line):
        """Return True when at least one gazetteer expression occurs in ``line``."""
        for gazet_str in self._matching_expressions(line):
            logging.debug('%s gazetteer hit! %s in line %s',
                          self._case_label(), gazet_str, line)
            return True
        return False

    def typed_test(self, line):
        """Return the types of the first typed expression found in ``line``.

        Returns:
            The list obtained by splitting the type string on ``','``, or the
            empty string ``''`` when nothing matches (kept for callers that
            rely on that falsy value).
        """
        for gazet_str in self._matching_expressions(line):
            tipo = self._lookup_type(gazet_str, line)
            if tipo is not None:
                return tipo.split(',')
        return ''

    def typed_list_test(self, line):
        """Return the set of all types of every expression found in ``line``."""
        type_set = set()
        for gazet_str in self._matching_expressions(line):
            tipo = self._lookup_type(gazet_str, line)
            if tipo is not None:
                type_set.update(tipo.split(','))
        return type_set

    def list_test(self, line):
        """Return, in load order, every gazetteer expression found in ``line``."""
        text = self._normalise(line)
        label = self._case_label()
        entity_list = []
        for regex, gazet_str in self.compiled_regex:
            # findall is kept (rather than search) so the debug message can
            # show every occurrence.
            result = regex.findall(text)
            if result:
                entity_list.append(gazet_str)
                logging.debug('%s gazetteer hit! %s entities=%s in line %s',
                              label, regex.pattern, result, line)
        return entity_list
Beispiel #2
0
class GazetteerFeature(object):
    """Gazetteer matcher for plain (untyped) word lists.

    ``ci`` stands for case independent, ``cd`` for case dependent.

    Every gazetteer line is compiled into a regex that isolates the
    expression between delimiter characters, i.e.
    ``[ .\\-,:;–(<"¿¡]expression[ .\\-,:;)–>"?!]``.  In ``ci`` mode both the
    expressions and the searched text are lower-cased before matching.

    NOTE(review): the pattern requires a delimiter on *both* sides, so an
    expression located at the very start or end of the searched text is not
    matched unless the caller pads the text -- confirm this is intended.

    Attributes:
        case_dependency: True when matching is case sensitive.
        exit_at_first: always set to True; not read by this class.
        limpieza: ``dospordos.tools.Limpieza`` instance used to normalise
            text (presumably strips accents / escapes regex metacharacters;
            verify against that module).
        compiled_regex: list of ``(compiled pattern, expression)`` tuples.
    """

    _GAZETTEER_DIR = '/resources/gazetteer/'

    # File name for every gazetteer type whose file does not depend on the
    # case mode ('city' is resolved separately in __init__).
    _GAZETTEER_FILES = {
        'country': 'country.gazt',
        'accronym': 'accronyms.cd.gazt',
        'profession': 'profession.ci.gazt',
        'degree': 'degree.ci.gazt',
        'world nationalities en': 'world.nat.eng.ci.gazt',
    }

    # Delimiter classes that must surround a matched expression.
    _LEFT_BOUNDARY = r'[ .\-,:;–(<"¿¡]{1,1}'
    _RIGHT_BOUNDARY = r'[ .\-,:;)–>"?!]{1,1}'

    def __init__(self, unoporuno_root, gazetteer_type, case_dependent=False):
        """Load the gazetteer of ``gazetteer_type`` found under ``unoporuno_root``.

        Args:
            unoporuno_root: root directory holding ``module/`` and
                ``resources/gazetteer/``.
            gazetteer_type: one of ``'country'``, ``'city'``, ``'accronym'``,
                ``'profession'``, ``'degree'``, ``'world nationalities en'``.
            case_dependent: match case sensitively when True; for ``'city'``
                it also selects which resource file is loaded.

        Raises:
            GazetteerFeatureError: ``gazetteer_type`` is not supported.
            FeatureError: the gazetteer resource file cannot be opened.
        """
        logging.basicConfig(level=logging.DEBUG)
        self.case_dependency = case_dependent
        self.exit_at_first = True

        # Test the exact path that gets appended; testing the bare root
        # never matched and grew sys.path on every instantiation.
        module_path = unoporuno_root + '/module/'
        if module_path not in sys.path:
            sys.path.append(module_path)
        from dospordos.tools import Limpieza
        self.limpieza = Limpieza()
        self.compiled_regex = []

        if gazetteer_type == 'city':
            if case_dependent:
                file_name = 'world.cities.cd.gazt'
            else:
                file_name = 'world.cities.ci.gazt'
        else:
            file_name = self._GAZETTEER_FILES.get(gazetteer_type)
        if file_name is None:
            raise GazetteerFeatureError(
                'Unrecognized gazetteer feature type %s' % (gazetteer_type,))
        gazetteer_file_path = unoporuno_root + self._GAZETTEER_DIR + file_name

        try:
            gazetteer_file = open(gazetteer_file_path, 'r')
        except (IOError, OSError):
            logging.error('Error opening gazetteer resource file %s',
                          gazetteer_file_path)
            # The message used to reference an undefined name, which turned
            # this error path into a NameError.
            raise FeatureError(
                'Error opening gazetteer resource file ' + gazetteer_file_path)

        logging.info('Start loading gazetteer list from %s', gazetteer_file_path)
        # The context manager closes the file even if an entry fails to load.
        with gazetteer_file:
            for line in gazetteer_file:
                self._add_entry(line)
        logging.info('End loading gazetteer list from %s', gazetteer_file_path)

    def _add_entry(self, line):
        """Compile one gazetteer line and register it; blank lines are skipped."""
        expression = line.strip()
        if not expression:
            return
        gazet_str = self.limpieza.limpia_acentos(
            self.limpieza.limpia_reservados_regex(expression))
        if not self.case_dependency:
            gazet_str = gazet_str.lower()
        regex = re.compile(
            self._LEFT_BOUNDARY + gazet_str + self._RIGHT_BOUNDARY)
        self.compiled_regex.append((regex, gazet_str))

    def _normalise(self, line):
        """Clean ``line`` the same way the gazetteer expressions were cleaned."""
        clean_line = self.limpieza.limpia_acentos(line)
        if self.case_dependency:
            return clean_line
        return clean_line.lower()

    def _matching_expressions(self, line):
        """Yield, in load order, every expression found in ``line`` (logging each hit)."""
        text = self._normalise(line)
        if self.case_dependency:
            label = 'case dependent'
        else:
            label = 'case independent'
        for regex, gazet_str in self.compiled_regex:
            if regex.search(text):
                logging.debug('%s gazetteer hit! %s in line %s',
                              label, gazet_str, line)
                yield gazet_str

    def test(self, line):
        """Return True when at least one gazetteer expression occurs in ``line``."""
        for _ in self._matching_expressions(line):
            return True
        return False

    def entity_test(self, line):
        """Return the first gazetteer expression found in ``line``, or ``''``."""
        for gazet_str in self._matching_expressions(line):
            return gazet_str
        return ''

    def list_test(self, line):
        """Return, in load order, every gazetteer expression found in ``line``.

        In case-independent mode the returned expressions are lower-cased.
        """
        entity_list = []
        for gazet_str in self._matching_expressions(line):
            if self.case_dependency:
                entity_list.append(gazet_str)
            else:
                entity_list.append(gazet_str.lower())
        return entity_list