def get_label_names(self, sequence, pos, escape_ascii=False):
    """Return the surface form, lowercase form and stem of one token.

    Args:
        sequence: sequence whose ``x[pos]`` holds the word id; the id is
            resolved through ``sequence.sequence_list.x_dict``.
        pos: index of the token inside ``sequence``.
        escape_ascii: when true, the lowercase word is NFKD-normalized and
            reduced to ASCII before stemming. This only affects the stem;
            the word itself is emitted exactly as it is.

    Returns:
        ``(word, low_word, stem)``. A word found in ``filter_names`` is
        returned unchanged in all three positions.
    """
    # The POS tag is not part of the returned tuple, so it is not looked up.
    word = sequence.sequence_list.x_dict.get_label_name(sequence.x[pos])

    if word in filter_names:
        return (word, word, word)

    low_word = word.lower()
    if escape_ascii:
        # encode('ascii', 'ignore') leaves pure ASCII bytes, so decode them as
        # ASCII. The 'unicode_escape' codec would reinterpret backslash
        # sequences that occur in the token text (and raise on a malformed
        # one such as a trailing backslash).
        word_ascii = (unicodedata.normalize('NFKD', low_word)
                      .encode('ascii', 'ignore')
                      .decode('ascii'))
        stem = stemAugmented(word_ascii)
    else:
        stem = stemAugmented(low_word)
    return (word, low_word, stem)
def include_gazzeter(self):
    """Load the external career gazetteers into the trigger-word tables.

    Reads the files ``carreras`` and ``outter_carreras`` from
    ``EXTERNAL_GAZZETER_DIR``. Every line is lowercased, stripped of its
    newline, NFKD-normalized, reduced to ASCII and passed through
    ``stemAugmented``.

    Side effects:
        * ``self.inner_trigger_words['I']`` becomes a set holding its previous
          entries plus the stemmed ``carreras`` lines, and
          ``self.inner_trigger_words['B']`` is bound to that same set object.
        * ``self.outer_trigger_words['B']`` is replaced by a set of the
          stemmed ``outter_carreras`` lines, and
          ``self.outer_trigger_words['I']`` is bound to that same set object.
          Previous outer entries are discarded (the union with the old
          ``'I'`` entries was already disabled in the original code).
    """
    def _stem_line(line):
        # The bytes are pure ASCII after encode('ascii', 'ignore'); decoding
        # with 'unicode_escape' would reinterpret backslashes in the text.
        ascii_line = (unicodedata.normalize('NFKD', line.lower().strip('\n'))
                      .encode('ascii', 'ignore')
                      .decode('ascii'))
        return stemAugmented(ascii_line)

    careers_path = os.path.join(EXTERNAL_GAZZETER_DIR, 'carreras')
    outter_path = os.path.join(EXTERNAL_GAZZETER_DIR, 'outter_carreras')

    # Open both files before touching any table (as the original did) and
    # make sure both handles are closed afterwards.
    with open(careers_path, 'r') as careers_gazzeter, \
            open(outter_path, 'r') as outter_careers_gazzeter:
        # set(...) accepts the existing list as well as an already-converted
        # set, so calling this method twice no longer fails on .append().
        inner = set(self.inner_trigger_words['I'])
        inner.update(_stem_line(carrera) for carrera in careers_gazzeter)
        self.inner_trigger_words['I'] = inner
        self.inner_trigger_words['B'] = inner

        outer = set()
        outer.update(_stem_line(outter) for outter in outter_careers_gazzeter)
        self.outer_trigger_words['B'] = outer
        self.outer_trigger_words['I'] = outer
def get_label_names(self, sequence, pos, escape_ascii=False):
    """Return the surface form, lowercase form and stem of one token.

    Args:
        sequence: sequence whose ``x[pos]`` holds the word id; the id is
            resolved through ``sequence.sequence_list.x_dict``.
        pos: index of the token inside ``sequence``.
        escape_ascii: when true, the lowercase word is NFKD-normalized and
            reduced to ASCII before stemming. This only affects the stem;
            the word itself is emitted exactly as it is.

    Returns:
        ``(word, low_word, stem)``. A word found in ``filter_names`` is
        returned unchanged in all three positions.
    """
    # The POS tag is not part of the returned tuple, so it is not looked up.
    word = sequence.sequence_list.x_dict.get_label_name(sequence.x[pos])

    if word in filter_names:
        return (word, word, word)

    low_word = word.lower()
    if escape_ascii:
        # encode('ascii', 'ignore') leaves pure ASCII bytes, so decode them as
        # ASCII. The 'unicode_escape' codec would reinterpret backslash
        # sequences that occur in the token text (and raise on a malformed
        # one such as a trailing backslash).
        word_ascii = (unicodedata.normalize('NFKD', low_word)
                      .encode('ascii', 'ignore')
                      .decode('ascii'))
        stem = stemAugmented(word_ascii)
    else:
        stem = stemAugmented(low_word)
    return (word, low_word, stem)
def include_gazzeter(self):
    """Load the external career gazetteers into the trigger-word tables.

    Reads the files ``carreras`` and ``outter_carreras`` from
    ``EXTERNAL_GAZZETER_DIR``. Every line is lowercased, stripped of its
    newline, NFKD-normalized, reduced to ASCII and passed through
    ``stemAugmented``.

    Side effects:
        * ``self.inner_trigger_words['I']`` becomes a set holding its previous
          entries plus the stemmed ``carreras`` lines, and
          ``self.inner_trigger_words['B']`` is bound to that same set object.
        * ``self.outer_trigger_words['B']`` is replaced by a set of the
          stemmed ``outter_carreras`` lines, and
          ``self.outer_trigger_words['I']`` is bound to that same set object.
          Previous outer entries are discarded (the union with the old
          ``'I'`` entries was already disabled in the original code).
    """
    def _stem_line(line):
        # The bytes are pure ASCII after encode('ascii', 'ignore'); decoding
        # with 'unicode_escape' would reinterpret backslashes in the text.
        ascii_line = (unicodedata.normalize('NFKD', line.lower().strip('\n'))
                      .encode('ascii', 'ignore')
                      .decode('ascii'))
        return stemAugmented(ascii_line)

    careers_path = os.path.join(EXTERNAL_GAZZETER_DIR, 'carreras')
    outter_path = os.path.join(EXTERNAL_GAZZETER_DIR, 'outter_carreras')

    # Open both files before touching any table (as the original did) and
    # make sure both handles are closed afterwards.
    with open(careers_path, 'r') as careers_gazzeter, \
            open(outter_path, 'r') as outter_careers_gazzeter:
        # set(...) accepts the existing list as well as an already-converted
        # set, so calling this method twice no longer fails on .append().
        inner = set(self.inner_trigger_words['I'])
        inner.update(_stem_line(carrera) for carrera in careers_gazzeter)
        self.inner_trigger_words['I'] = inner
        self.inner_trigger_words['B'] = inner

        outer = set()
        outer.update(_stem_line(outter) for outter in outter_careers_gazzeter)
        self.outer_trigger_words['B'] = outer
        self.outer_trigger_words['I'] = outer
def get_trigger_features(self, word, y_name, prefix, pos_tag=False,
                         _dict=None, pos=None, features=None):
    """Add a trigger feature when the normalized ``word`` is in ``_dict``.

    Args:
        word: token to look up. Unless it is in ``filter_names`` it is
            lowercased, NFKD-normalized, reduced to ASCII and stemmed first.
        y_name: label name appended to the feature name.
        prefix: feature-name prefix; the name is ``prefix + '::'``, then
            ``str(pos) + ':'`` when ``pos`` is given, then ``y_name``.
        pos_tag: accepted for signature compatibility; not used here.
        _dict: container of trigger words (tested with ``in``). Defaults to
            an empty mapping.
        pos: optional position embedded in the feature name.
        features: feature collection handed to ``self.insert_feature``.
            Defaults to a fresh empty list on every call.

    Returns:
        The value returned by ``self.insert_feature`` when the word is a
        trigger, otherwise ``features`` unchanged.
    """
    # Mutable defaults ({} / []) would be shared between calls, so features
    # inserted during one call could leak into the next one.
    if _dict is None:
        _dict = {}
    if features is None:
        features = []

    name_pattern = prefix + '::'
    if pos is not None:
        name_pattern += str(pos) + ':'

    # NOTE(review): the original indentation was lost; stemming is assumed to
    # belong inside this branch (filter names are left untouched) - confirm.
    if word not in filter_names:
        # Pure ASCII bytes: decode as ASCII rather than 'unicode_escape',
        # which would reinterpret backslashes contained in the token.
        word = (unicodedata.normalize('NFKD', word.lower())
                .encode('ascii', 'ignore')
                .decode('ascii'))
        word = stemAugmented(word)

    if word in _dict:
        feat_name = name_pattern + y_name
        features = self.insert_feature(feat_name, features)
    return features
def get_trigger_features(self, word, y_name, prefix, pos_tag=False,
                         _dict=None, pos=None, features=None):
    """Add a trigger feature when ``word`` is listed under ``_dict[y_name]``.

    Args:
        word: token to look up. Unless it is in ``filter_names`` it is
            lowercased, NFKD-normalized and reduced to ASCII first.
        y_name: label name; used as key into ``_dict`` and appended to the
            feature name.
        prefix: feature-name prefix; the name is ``prefix + "::"``, then
            ``str(pos) + ":"`` when ``pos`` is given, then ``y_name``.
        pos_tag: when true the value is not stemmed (presumably because
            ``word`` is then a POS tag rather than a word - confirm with
            callers).
        _dict: mapping from label name to a container of triggers.
        pos: optional position embedded in the feature name.
        features: feature collection handed to ``self.insert_feature``.
            Defaults to a fresh empty list on every call.

    Returns:
        The value returned by ``self.insert_feature`` when the word is a
        trigger for ``y_name``, otherwise ``features`` unchanged.

    Raises:
        KeyError: if ``y_name`` is not a key of ``_dict``.
    """
    # Mutable defaults ({} / []) would be shared between calls, so features
    # inserted during one call could leak into the next one.
    if _dict is None:
        _dict = {}
    if features is None:
        features = []

    name_pattern = prefix + "::"
    if pos is not None:
        name_pattern += str(pos) + ":"

    # NOTE(review): the original indentation was lost; the stemming step is
    # assumed to sit inside this branch (filter names are left untouched),
    # mirroring the other helpers in this file - confirm.
    if word not in filter_names:
        # Pure ASCII bytes: decode as ASCII rather than 'unicode_escape',
        # which would reinterpret backslashes contained in the token.
        word = (unicodedata.normalize("NFKD", word.lower())
                .encode("ascii", "ignore")
                .decode("ascii"))
        if not pos_tag:
            word = stemAugmented(word)

    # Trusting that y_name is a key of _dict.
    if word in _dict[y_name]:
        feat_name = name_pattern + y_name
        features = self.insert_feature(feat_name, features)
    return features
def get_label_names(self, sequence, pos):
    """Describe the token at ``pos`` as ``(word, low_word, pos_tag, stem)``.

    The word and POS tag names are resolved through the dictionaries of
    ``sequence.sequence_list``. A POS tag whose id in
    ``self.dataset.pos_dict`` is ``-1`` is replaced by ``NOUN``. A word found
    in ``filter_names`` is returned unchanged as its own lowercase form and
    stem; any other word is lowercased and stemmed with ``stemAugmented``.
    """
    word_id = sequence.x[pos]
    sequence.y[pos]  # the label is read but never used, as in the original
    tag_id = sequence.pos[pos]

    dictionaries = sequence.sequence_list
    word = dictionaries.x_dict.get_label_name(word_id)
    pos_tag = dictionaries.pos_dict.get_label_name(tag_id)

    tag_is_unknown = self.dataset.pos_dict.get_label_id(pos_tag) == -1
    if tag_is_unknown:
        pos_tag = NOUN

    # Filter names pass through untouched.
    if word in filter_names:
        return (word, word, pos_tag, word)

    lowered = word.lower()
    return (word, lowered, pos_tag, stemAugmented(lowered))
def get_label_names(self, sequence, pos):
    """Describe the token at ``pos`` as ``(word, low_word, pos_tag, stem)``.

    The word and POS tag names are resolved through the dictionaries of
    ``sequence.sequence_list``. A POS tag whose id in
    ``self.dataset.pos_dict`` is ``-1`` is replaced by ``NOUN``. A word found
    in ``filter_names`` is returned unchanged as its own lowercase form and
    stem; any other word is lowercased and stemmed with ``stemAugmented``.
    """
    word_id = sequence.x[pos]
    sequence.y[pos]  # the label is read but never used, as in the original
    tag_id = sequence.pos[pos]

    dictionaries = sequence.sequence_list
    word = dictionaries.x_dict.get_label_name(word_id)
    pos_tag = dictionaries.pos_dict.get_label_name(tag_id)

    tag_is_unknown = self.dataset.pos_dict.get_label_id(pos_tag) == -1
    if tag_is_unknown:
        pos_tag = NOUN

    # Filter names pass through untouched.
    if word in filter_names:
        return (word, word, pos_tag, word)

    lowered = word.lower()
    return (word, lowered, pos_tag, stemAugmented(lowered))
def update_tw(self, sequence, pos_current):
    """Count the token at ``pos_current`` as an inner trigger word / POS tag.

    A token is counted only when the first character of its label is neither
    ``'O'`` nor ``END_TAG``, its word is not in ``filter_names``, and its POS
    tag does not start with ``s`` (preposition), ``c`` (conjunction) or ``d``
    (determiner). The word is NFKD-normalized, reduced to ASCII, lowercased
    and stemmed before counting.

    Collection of outer (context) trigger words is disabled in this version:
    in the original that code was wrapped in a string literal and never ran.

    Side effects:
        * ``self.inner_trigger_words['I'][word]`` is incremented (all labels
          share the ``'I'`` table).
        * ``self.inner_trigger_pos[y_name][pos_tag]`` is incremented.
    """
    y_name = self.get_y_name(sequence, pos_current)
    word = sequence.sequence_list.x_dict.get_label_name(
        sequence.x[pos_current])

    if y_name[0] == 'O' or y_name[0] == END_TAG or word in filter_names:
        return

    # Pure ASCII bytes: decode as ASCII rather than 'unicode_escape', which
    # would reinterpret backslashes contained in the token.
    word = (unicodedata.normalize('NFKD', word)
            .encode('ascii', 'ignore')
            .decode('ascii'))
    word = stemAugmented(word.lower())

    pos_tag = sequence.sequence_list.pos_dict.get_label_name(
        sequence.pos[pos_current])
    if self.dataset.pos_dict.get_label_id(pos_tag) == -1:
        pos_tag = NOUN  # tags unknown to the dataset are treated as NOUN

    # Function words are not informative as triggers.
    if pos_tag[0] in ('s', 'c', 'd'):  # PREPOS, CONJ, DETERM
        return

    pos_counts = self.inner_trigger_pos.setdefault(y_name, {})

    # TRIGGER WORD
    word_counts = self.inner_trigger_words['I']
    word_counts[word] = word_counts.get(word, 0) + 1

    # TRIGGER POS
    pos_counts[pos_tag] = pos_counts.get(pos_tag, 0) + 1
def update_tw(self, sequence, pos_current):
    """Count the token at ``pos_current`` as an inner trigger word / POS tag.

    A token is counted only when the first character of its label is neither
    ``'O'`` nor ``END_TAG``, its word is not in ``filter_names``, and its POS
    tag does not start with ``s`` (preposition), ``c`` (conjunction) or ``d``
    (determiner). The word is NFKD-normalized, reduced to ASCII, lowercased
    and stemmed before counting.

    Collection of outer (context) trigger words is disabled in this version:
    in the original that code was wrapped in a string literal and never ran.

    Side effects:
        * ``self.inner_trigger_words['I'][word]`` is incremented (all labels
          share the ``'I'`` table).
        * ``self.inner_trigger_pos[y_name][pos_tag]`` is incremented.
    """
    y_name = self.get_y_name(sequence, pos_current)
    word = sequence.sequence_list.x_dict.get_label_name(
        sequence.x[pos_current])

    if y_name[0] == 'O' or y_name[0] == END_TAG or word in filter_names:
        return

    # Pure ASCII bytes: decode as ASCII rather than 'unicode_escape', which
    # would reinterpret backslashes contained in the token.
    word = (unicodedata.normalize('NFKD', word)
            .encode('ascii', 'ignore')
            .decode('ascii'))
    word = stemAugmented(word.lower())

    pos_tag = sequence.sequence_list.pos_dict.get_label_name(
        sequence.pos[pos_current])
    if self.dataset.pos_dict.get_label_id(pos_tag) == -1:
        pos_tag = NOUN  # tags unknown to the dataset are treated as NOUN

    # Function words are not informative as triggers.
    if pos_tag[0] in ('s', 'c', 'd'):  # PREPOS, CONJ, DETERM
        return

    pos_counts = self.inner_trigger_pos.setdefault(y_name, {})

    # TRIGGER WORD
    word_counts = self.inner_trigger_words['I']
    word_counts[word] = word_counts.get(word, 0) + 1

    # TRIGGER POS
    pos_counts[pos_tag] = pos_counts.get(pos_tag, 0) + 1
def update_tw(self, sequence, pos_current):
    """Update the outer (context) and inner trigger word / POS counters.

    Behaviour by label of the token at ``pos_current``:

    * label starts with ``B``: the tokens in the ``TW_WINDOW`` positions
      before ``pos_current`` are counted as outer context, and the token
      itself is counted as inner.
    * any other label not starting with ``O`` and different from ``BR``:
      the token is counted as inner.
    * label starts with ``O``: only when the previous label starts with
      ``I`` or ``B``; the tokens from ``pos_current`` up to
      ``pos_current + TW_WINDOW`` are then counted as outer context.

    Tokens whose POS tag starts with ``s`` (preposition), ``c``
    (conjunction) or ``d`` (determiner) are never counted. Words are
    lowercased, NFKD-normalized and reduced to ASCII; a word whose stem is
    out of vocabulary is replaced by ``assignFilterTag(word)``.

    Side effects:
        Increments ``self.outer_trigger_words`` / ``self.outer_trigger_pos``
        and ``self.inner_trigger_words`` / ``self.inner_trigger_pos``, all
        keyed first by ``y_name``.
    """
    TW_WINDOW = 5
    SKIPPED_TAG_INITIALS = ('s', 'c', 'd')  # PREPOS, CONJ, DETERM

    def _ascii_word(index):
        # Lowercased surface form reduced to ASCII. The bytes are pure ASCII,
        # so decode as ASCII; 'unicode_escape' would reinterpret backslashes.
        raw = sequence.sequence_list.x_dict.get_label_name(
            sequence.x[index]).lower()
        return (unicodedata.normalize('NFKD', raw)
                .encode('ascii', 'ignore')
                .decode('ascii'))

    def _pos_tag(index):
        # POS tag name; tags unknown to the dataset are treated as NOUN.
        tag = sequence.sequence_list.pos_dict.get_label_name(
            sequence.pos[index])
        if self.dataset.pos_dict.get_label_id(tag) == -1:
            tag = NOUN
        return tag

    length = len(sequence.x)
    y_dict = sequence.sequence_list.y_dict
    y_name = y_dict.get_label_name(sequence.y[pos_current])
    initial = y_name[0]

    # There is no previous token at position 0; indexing with -1 would wrap
    # around and read the label of the last token instead.
    if pos_current > 0:
        prev_initial = y_dict.get_label_name(sequence.y[pos_current - 1])[:1]
    else:
        prev_initial = ''

    low = max(0, pos_current - TW_WINDOW)
    high = min(pos_current + TW_WINDOW + 1, length)
    if initial == 'B':
        # context to the left of the entity start
        window = range(low, min(pos_current, high))
    elif initial == 'O' and prev_initial in ('I', 'B'):
        # first O after an entity: context from here to the right
        window = range(max(low, pos_current), high)
    else:
        window = ()

    ## OUTER TRIGGER WORD & POS
    for pos in window:
        word = _ascii_word(pos)
        stem = stemAugmented(word)
        if (stem not in filter_names
                and stem not in self.dataset.stem_vocabulary):
            word = assignFilterTag(word)

        pos_tag = _pos_tag(pos)
        if pos_tag[0] in SKIPPED_TAG_INITIALS:
            continue

        word_counts = self.outer_trigger_words.setdefault(y_name, {})
        pos_counts = self.outer_trigger_pos.setdefault(y_name, {})
        word_counts[word] = word_counts.get(word, 0) + 1
        pos_counts[pos_tag] = pos_counts.get(pos_tag, 0) + 1

    ## INNER TRIGGER WORD & POS
    if initial != 'O' and y_name != BR:
        word = _ascii_word(pos_current)
        stem = stemAugmented(word)
        if stem not in self.dataset.stem_vocabulary:
            word = assignFilterTag(word)

        pos_tag = _pos_tag(pos_current)
        if pos_tag[0] not in SKIPPED_TAG_INITIALS:
            word_counts = self.inner_trigger_words.setdefault(y_name, {})
            pos_counts = self.inner_trigger_pos.setdefault(y_name, {})
            # The original tested `y_name not in ...` here, which reset the
            # counter to 0 on every call; the key being counted is the word.
            word_counts[word] = word_counts.get(word, 0) + 1
            pos_counts[pos_tag] = pos_counts.get(pos_tag, 0) + 1
import os
import sys
import unicodedata
import pdb
import ipdb

path_utils = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
PROJECT_DIR = os.path.dirname(path_utils)
CRAWLER_DIR = os.path.join(PROJECT_DIR, 'crawler')
IDENTIFIER_DIR = os.path.join(CRAWLER_DIR, 'Identifiers')
IDENTIFIER_STEM_DIR = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), 'identifiers')

# utils_new lives two directories above this file, so that directory has to
# be on sys.path before the import below.
sys.path.append(path_utils)
from utils_new import stemAugmented

# Write a stemmed copy of every identifier file found under IDENTIFIER_DIR
# into IDENTIFIER_STEM_DIR, under the same file name. Files from different
# sub-directories that share a name overwrite each other.
for root, dirs, filenames in os.walk(IDENTIFIER_DIR):
    for f in filenames:
        if f.endswith('~'):
            # file names ending in '~' are skipped
            continue
        # The destination is opened first (as in the original); the with
        # block guarantees both handles are closed and the output flushed.
        with open(os.path.join(IDENTIFIER_STEM_DIR, f), 'w') as dest, \
                open(os.path.join(root, f), 'r') as source:
            for line in source:
                line = line.lower().strip('\n').strip(' ').replace('.', '')
                if line != '':
                    # Drop everything that has no ASCII representation.
                    text = (unicodedata.normalize('NFKD', line)
                            .encode('ascii', 'ignore')
                            .decode('utf-8'))
                    ident = ' '.join(
                        stemAugmented(word) for word in text.split(' '))
                    dest.write(ident + '\n')
import os
import sys
import unicodedata
import pdb
import ipdb

path_utils = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
PROJECT_DIR = os.path.dirname(path_utils)
CRAWLER_DIR = os.path.join(PROJECT_DIR, 'crawler')
IDENTIFIER_DIR = os.path.join(CRAWLER_DIR, 'Identifiers')
IDENTIFIER_STEM_DIR = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), 'identifiers')

# utils_new lives two directories above this file, so that directory has to
# be on sys.path before the import below.
sys.path.append(path_utils)
from utils_new import stemAugmented

# Write a stemmed copy of every identifier file found under IDENTIFIER_DIR
# into IDENTIFIER_STEM_DIR, under the same file name. Files from different
# sub-directories that share a name overwrite each other.
for root, dirs, filenames in os.walk(IDENTIFIER_DIR):
    for f in filenames:
        if f.endswith('~'):
            # file names ending in '~' are skipped
            continue
        # The destination is opened first (as in the original); the with
        # block guarantees both handles are closed and the output flushed.
        with open(os.path.join(IDENTIFIER_STEM_DIR, f), 'w') as dest, \
                open(os.path.join(root, f), 'r') as source:
            for line in source:
                line = line.lower().strip('\n').strip(' ').replace('.', '')
                if line != '':
                    # Drop everything that has no ASCII representation.
                    text = (unicodedata.normalize('NFKD', line)
                            .encode('ascii', 'ignore')
                            .decode('utf-8'))
                    ident = ' '.join(
                        stemAugmented(word) for word in text.split(' '))
                    dest.write(ident + '\n')
def update_tw(self, sequence, pos_current):
    """Update the outer (context) and inner trigger word / POS counters.

    Behaviour by label of the token at ``pos_current``:

    * label starts with ``B``: the tokens in the ``TW_WINDOW`` positions
      before ``pos_current`` are counted as outer context, and the token
      itself is counted as inner.
    * any other label not starting with ``O`` and different from ``BR``:
      the token is counted as inner.
    * label starts with ``O``: only when the previous label starts with
      ``I`` or ``B``; the tokens from ``pos_current`` up to
      ``pos_current + TW_WINDOW`` are then counted as outer context.

    Tokens whose POS tag starts with ``s`` (preposition), ``c``
    (conjunction) or ``d`` (determiner) are never counted. Words are
    lowercased, NFKD-normalized and reduced to ASCII; a word whose stem is
    out of vocabulary is replaced by ``assignFilterTag(word)``.

    Side effects:
        Increments ``self.outer_trigger_words`` / ``self.outer_trigger_pos``
        and ``self.inner_trigger_words`` / ``self.inner_trigger_pos``, all
        keyed first by ``y_name``.
    """
    TW_WINDOW = 5
    SKIPPED_TAG_INITIALS = ('s', 'c', 'd')  # PREPOS, CONJ, DETERM

    def _ascii_word(index):
        # Lowercased surface form reduced to ASCII. The bytes are pure ASCII,
        # so decode as ASCII; 'unicode_escape' would reinterpret backslashes.
        raw = sequence.sequence_list.x_dict.get_label_name(
            sequence.x[index]).lower()
        return (unicodedata.normalize('NFKD', raw)
                .encode('ascii', 'ignore')
                .decode('ascii'))

    def _pos_tag(index):
        # POS tag name; tags unknown to the dataset are treated as NOUN.
        tag = sequence.sequence_list.pos_dict.get_label_name(
            sequence.pos[index])
        if self.dataset.pos_dict.get_label_id(tag) == -1:
            tag = NOUN
        return tag

    length = len(sequence.x)
    y_dict = sequence.sequence_list.y_dict
    y_name = y_dict.get_label_name(sequence.y[pos_current])
    initial = y_name[0]

    # There is no previous token at position 0; indexing with -1 would wrap
    # around and read the label of the last token instead.
    if pos_current > 0:
        prev_initial = y_dict.get_label_name(sequence.y[pos_current - 1])[:1]
    else:
        prev_initial = ''

    low = max(0, pos_current - TW_WINDOW)
    high = min(pos_current + TW_WINDOW + 1, length)
    if initial == 'B':
        # context to the left of the entity start
        window = range(low, min(pos_current, high))
    elif initial == 'O' and prev_initial in ('I', 'B'):
        # first O after an entity: context from here to the right
        window = range(max(low, pos_current), high)
    else:
        window = ()

    ## OUTER TRIGGER WORD & POS
    for pos in window:
        word = _ascii_word(pos)
        stem = stemAugmented(word)
        if (stem not in filter_names
                and stem not in self.dataset.stem_vocabulary):
            word = assignFilterTag(word)

        pos_tag = _pos_tag(pos)
        if pos_tag[0] in SKIPPED_TAG_INITIALS:
            continue

        word_counts = self.outer_trigger_words.setdefault(y_name, {})
        pos_counts = self.outer_trigger_pos.setdefault(y_name, {})
        word_counts[word] = word_counts.get(word, 0) + 1
        pos_counts[pos_tag] = pos_counts.get(pos_tag, 0) + 1

    ## INNER TRIGGER WORD & POS
    if initial != 'O' and y_name != BR:
        word = _ascii_word(pos_current)
        stem = stemAugmented(word)
        if stem not in self.dataset.stem_vocabulary:
            word = assignFilterTag(word)

        pos_tag = _pos_tag(pos_current)
        if pos_tag[0] not in SKIPPED_TAG_INITIALS:
            word_counts = self.inner_trigger_words.setdefault(y_name, {})
            pos_counts = self.inner_trigger_pos.setdefault(y_name, {})
            # The original tested `y_name not in ...` here, which reset the
            # counter to 0 on every call; the key being counted is the word.
            word_counts[word] = word_counts.get(word, 0) + 1
            pos_counts[pos_tag] = pos_counts.get(pos_tag, 0) + 1