Python Limpieza.limpia_reservados_regex Exemples

Langage de programmation: Python

Espace de nommage/Paquet: dospordos.tools

Class/Type: Limpieza

Méthode/Fonction: limpia_reservados_regex

Exemples sur hotexamples.com: 2

Python Limpieza.limpia_reservados_regex - 2 exemples trouvés. Ce sont les exemples réels les mieux notés de dospordos.tools.Limpieza.limpia_reservados_regex extraits de projets open source. Vous pouvez noter les exemples pour nous aider à en améliorer la qualité.

Méthodes fréquemment utilisées

Afficher Cacher

Limpieza(4)

limpia_acentos(2)

limpia_reservados_regex(2)

limpia_reservados_xml(1)

Méthodes fréquemment utilisées

Limpieza (4)

limpia_acentos (2)

limpia_reservados_regex (2)

limpia_reservados_xml (1)

Exemple #1

0

Afficher le fichier

class QualifiedGazetteerFeature(object):
    """Regex-based gazetteer matcher whose entries each carry a type.

    A qualified gazetteer file has one entry per line with the structure
    ``expression<TAB>type``:

    * ``expression`` is the item to search for in a snippet;
    * ``type`` is the qualifier attached to it.  The ``typed_*`` methods
      split it on commas, so it may hold several comma-separated labels.

    Attributes set by ``__init__``:

    * ``case_dependency`` -- True when matching is case sensitive.
    * ``exit_at_first`` -- always set to True; not read inside this class.
    * ``limpieza`` -- ``dospordos.tools.Limpieza`` instance used to clean
      both the gazetteer entries and the lines being tested.
    * ``compiled_regex`` -- list of ``(compiled_pattern, expression)``
      tuples, in file order.
    * ``type_dict`` -- maps each cleaned expression to its type string.
    """

    def __init__(self, unoporuno_root, gazetteer_type, case_dependent=False):
        """Load the gazetteer selected by *gazetteer_type*.

        Args:
            unoporuno_root: installation root; ``/module/`` below it is added
                to ``sys.path`` and ``/resources/gazetteer/`` below it holds
                the gazetteer files.
            gazetteer_type: ``'country'`` or ``'city'``.
            case_dependent: match case sensitively when True.

        Raises:
            GazetteerFeatureError: *gazetteer_type* is not recognized.
            FeatureError: the gazetteer resource file cannot be opened.
        """
        logging.basicConfig(level=logging.DEBUG)
        self.case_dependency = case_dependent
        self.exit_at_first = True
        # Guard on the path that is actually appended; the previous guard
        # tested the bare root, so every instance appended a duplicate.
        module_path = unoporuno_root + '/module/'
        if module_path not in sys.path:
            sys.path.append(module_path)
        # Imported here because it is only resolvable after the path update.
        from dospordos.tools import Limpieza
        self.limpieza = Limpieza()
        self.compiled_regex = []
        self.type_dict = dict()

        case = 'cd' if case_dependent else 'ci'
        if gazetteer_type == 'country':
            gazetteer_file_path = (unoporuno_root
                                   + '/resources/gazetteer/world.countries.'
                                   + case + '.gazt')
        elif gazetteer_type == 'city':
            gazetteer_file_path = (unoporuno_root
                                   + '/resources/gazetteer/world.cities.country.'
                                   + case + '.gazt')
        else:
            raise GazetteerFeatureError(
                'Unrecognized gazetteer feature type ' + gazetteer_type)

        try:
            gazetteer_file = open(gazetteer_file_path, 'r')
        except (IOError, OSError):
            logging.error('Error opening gazetteer resource file '
                          + gazetteer_file_path)
            raise FeatureError('Error opening gazetteer resource file '
                               + gazetteer_file_path)

        logging.info("Start loading gazetteer list from " + gazetteer_file_path)
        try:
            self._load_entries(gazetteer_file)
        finally:
            # The handle used to be leaked; always release it.
            gazetteer_file.close()
        logging.info("End loading gazetteer list from " + gazetteer_file_path)

    def _load_entries(self, lines):
        """Compile one regex per ``expression<TAB>type`` line in *lines*.

        Blank lines and lines without a tab-separated type are skipped.
        Results are appended to ``compiled_regex`` and ``type_dict``.
        """
        for line in lines:
            line_n = line.strip()
            qualified_expression = line_n.split('\t')
            if not line_n or len(qualified_expression) < 2:
                continue
            gazet_str = self.limpieza.limpia_reservados_regex(
                qualified_expression[0])
            gazet_type = qualified_expression[1]
            if self.case_dependency:
                gazet_str = self.limpieza.limpia_acentos(gazet_str)
                logging.debug(gazet_str + ' of type ' + gazet_type
                              + ' added to case dependant gazetter self set')
            else:
                gazet_str = self.limpieza.limpia_acentos(gazet_str).lower()
                logging.debug(gazet_str + ' of type ' + gazet_type
                              + ' added to case independant gazetter self set')
            # The expression must be delimited by punctuation/space or by the
            # start/end of the searched text.  Raw strings keep the pattern
            # text identical while avoiding the invalid "\-" string escape.
            regex_str = (r'(^|[ .\-,:;–(<"¿¡]{1,1})' + gazet_str
                         + r'([ .\-,:;)–>"?!]{1,1}|$)')
            # re.compile either returns a pattern object or raises, so no
            # truthiness check is needed on its result.
            regex = re.compile(regex_str)
            self.compiled_regex.append((regex, gazet_str))
            self.type_dict[gazet_str] = gazet_type

    def _normalize(self, line):
        """Clean *line* with ``limpia_acentos``; lower-case it unless case dependent."""
        clean_line = self.limpieza.limpia_acentos(line)
        if self.case_dependency:
            return clean_line
        return clean_line.lower()

    def _case_label(self):
        """Return the prefix used in debug messages for the current mode."""
        return 'case dependent' if self.case_dependency else 'case independent'

    def _typed_hits(self, line):
        """Yield the type string of every gazetteer entry found in *line*."""
        text = self._normalize(line)
        label = self._case_label()
        for regex, entity in self.compiled_regex:
            if not regex.search(text):
                continue
            if entity in self.type_dict:
                tipo = self.type_dict[entity]
                logging.debug('%s gazetteer hit! %s of type:%s in line %s',
                              label, entity, tipo, line)
                yield tipo
            else:
                logging.debug('%s gazetteer hit! %s of type: NOT FOUND in line %s',
                              label, entity, line)

    def test(self, line):
        """Return True if any gazetteer entry occurs in *line*, else False."""
        text = self._normalize(line)
        label = self._case_label()
        for regex, entity in self.compiled_regex:
            if regex.search(text):
                logging.debug('%s gazetteer hit! %s in line %s',
                              label, entity, line)
                return True
        return False

    def typed_test(self, line):
        """Return the types of the first entry found in *line*.

        Returns:
            The matching entry's type string split on commas (a list), or
            the empty string ``''`` when no typed entry matches.
        """
        for tipo in self._typed_hits(line):
            return tipo.split(',')
        return ''

    def typed_list_test(self, line):
        """Return the set of all types of all entries found in *line*."""
        type_set = set()
        for tipo in self._typed_hits(line):
            type_set.update(tipo.split(','))
        return type_set

    def list_test(self, line):
        """Return the list of gazetteer expressions found in *line*, in load order."""
        text = self._normalize(line)
        label = self._case_label()
        entity_list = []
        for regex, entity in self.compiled_regex:
            result = regex.findall(text)
            if result:
                entity_list.append(entity)
                logging.debug('%s gazetteer hit! %s entities=%s in line %s',
                              label, regex.pattern, str(result), line)
        return entity_list

Exemple #2

0

Afficher le fichier

class GazetteerFeature(object):
    """Regex-based gazetteer matcher (untyped entries).

    ``ci`` = case independent, ``cd`` = case dependent.

    Every gazetteer entry is compiled into a regex that isolates the word
    between delimiter characters, i.e.
    ``[ .\\-,:;–(<"¿¡]{1,1}word[ .\\-,:;)–>"?!]{1,1}``.  In ``ci`` mode both
    the entries and the tested lines are lower-cased before matching.

    Attributes set by ``__init__``:

    * ``case_dependency`` -- True when matching is case sensitive.
    * ``exit_at_first`` -- always set to True; not read inside this class.
    * ``limpieza`` -- ``dospordos.tools.Limpieza`` instance used to clean
      both the gazetteer entries and the lines being tested.
    * ``compiled_regex`` -- list of ``(compiled_pattern, entry)`` tuples, in
      file order.
    """

    def __init__(self, unoporuno_root, gazetteer_type, case_dependent=False):
        """Load the gazetteer selected by *gazetteer_type*.

        Args:
            unoporuno_root: installation root; ``/module/`` below it is added
                to ``sys.path`` and ``/resources/gazetteer/`` below it holds
                the gazetteer files.
            gazetteer_type: one of ``'country'``, ``'city'``, ``'accronym'``,
                ``'profession'``, ``'degree'`` or ``'world nationalities en'``.
            case_dependent: match case sensitively when True.  Only the
                ``'city'`` gazetteer has a separate file per mode.

        Raises:
            GazetteerFeatureError: *gazetteer_type* is not recognized.
            FeatureError: the gazetteer resource file cannot be opened.
        """
        logging.basicConfig(level=logging.DEBUG)
        self.case_dependency = case_dependent
        self.exit_at_first = True
        # Guard on the path that is actually appended; the previous guard
        # tested the bare root, so every instance appended a duplicate.
        module_path = unoporuno_root + '/module/'
        if module_path not in sys.path:
            sys.path.append(module_path)
        # Imported here because it is only resolvable after the path update.
        from dospordos.tools import Limpieza
        self.limpieza = Limpieza()
        self.compiled_regex = []

        file_names = {
            'country': 'country.gazt',
            'city': ('world.cities.cd.gazt' if case_dependent
                     else 'world.cities.ci.gazt'),
            'accronym': 'accronyms.cd.gazt',
            'profession': 'profession.ci.gazt',
            'degree': 'degree.ci.gazt',
            'world nationalities en': 'world.nat.eng.ci.gazt',
        }
        if gazetteer_type not in file_names:
            raise GazetteerFeatureError(
                'Unrecognized gazetteer feature type ' + gazetteer_type)
        gazetteer_file_path = (unoporuno_root + '/resources/gazetteer/'
                               + file_names[gazetteer_type])

        try:
            gazetteer_file = open(gazetteer_file_path, 'r')
        except (IOError, OSError):
            logging.error('Error opening gazetteer resource file '
                          + gazetteer_file_path)
            # The message used to reference an undefined name
            # (p_gazetteer_file_path), turning this into a NameError.
            raise FeatureError('Error opening gazetteer resource file '
                               + gazetteer_file_path)

        logging.info("Start loading gazetteer list from " + gazetteer_file_path)
        try:
            self._load_entries(gazetteer_file)
        finally:
            # The handle used to be leaked; always release it.
            gazetteer_file.close()
        logging.info("End loading gazetteer list from " + gazetteer_file_path)

    def _load_entries(self, lines):
        """Compile one regex per non-blank line in *lines* into ``compiled_regex``."""
        for line in lines:
            entry = line.strip()
            if not entry:
                continue
            gazet_str = self.limpieza.limpia_reservados_regex(entry)
            gazet_str = self.limpieza.limpia_acentos(gazet_str)
            if not self.case_dependency:
                gazet_str = gazet_str.lower()
            # NOTE: a delimiter is required on BOTH sides, so an entry at the
            # very start or end of the tested text does not match.  Raw
            # strings keep the pattern text identical while avoiding the
            # invalid "\-" string escape.
            regex_str = (r'[ .\-,:;–(<"¿¡]{1,1}' + gazet_str
                         + r'[ .\-,:;)–>"?!]{1,1}')
            # re.compile either returns a pattern object or raises, so no
            # truthiness check is needed on its result.
            regex = re.compile(regex_str)
            self.compiled_regex.append((regex, gazet_str))

    def _hits(self, line):
        """Yield each gazetteer entry found in *line*, logging every hit."""
        clean_line = self.limpieza.limpia_acentos(line)
        if self.case_dependency:
            label = 'case dependent'
        else:
            label = 'case independent'
            clean_line = clean_line.lower()
        for regex, entity in self.compiled_regex:
            if regex.search(clean_line):
                logging.debug('%s gazetteer hit! %s in line %s',
                              label, entity, line)
                yield entity

    def test(self, line):
        """Return True if any gazetteer entry occurs in *line*, else False."""
        for _ in self._hits(line):
            return True
        return False

    def entity_test(self, line):
        """Return the first gazetteer entry found in *line*, or ``''`` if none."""
        for entity in self._hits(line):
            return entity
        return ''

    def list_test(self, line):
        """Return all gazetteer entries found in *line*, in load order.

        In case-independent mode the returned entries are lower-cased.
        """
        if self.case_dependency:
            return list(self._hits(line))
        return [entity.lower() for entity in self._hits(line)]