def get_label_names(self, sequence, pos, escape_ascii=False):
    '''
    Return the surface form, lowercased form and stem of the token at `pos`.

    sequence: object exposing `x` (token ids) and `sequence_list.x_dict`,
        whose `get_label_name` maps a token id to its string.
    pos: index of the token inside `sequence`.
    escape_ascii: only affects the stem. When true, the lowercased word is
        NFKD-normalized and its non-ASCII characters are dropped before
        stemming. `word` is always returned exactly as stored.

    Returns the tuple `(word, low_word, stem)`. A token found in
    `filter_names` is returned unchanged in all three positions.
    '''
    token_id = sequence.x[pos]
    word = sequence.sequence_list.x_dict.get_label_name(token_id)

    # Tokens listed in filter_names are passed through untouched.
    if word in filter_names:
        return (word, word, word)

    low_word = word.lower()
    stem_input = low_word
    if escape_ascii:
        # The bytes are already plain ASCII, so decode them as 'ascii'.
        # 'unicode_escape' would reinterpret any literal backslash in the
        # token (e.g. "\n" -> newline) or raise on a malformed escape.
        stem_input = unicodedata.normalize('NFKD', low_word).encode(
            'ascii', 'ignore').decode('ascii')

    return (word, low_word, stemAugmented(stem_input))
コード例 #2
0
    def include_gazzeter(self):
        '''
        Load the external career gazetteers into the trigger-word sets.

        Reads the files 'carreras' and 'outter_carreras' located in
        EXTERNAL_GAZZETER_DIR, one entry per line. Every entry is
        lowercased, stripped of newline characters at its ends, reduced to
        ASCII and stemmed with stemAugmented.

        Side effects:
          * stems from 'carreras' are appended to
            self.inner_trigger_words['I'], which is then turned into a set;
            self.inner_trigger_words['B'] refers to that same set;
          * stems from 'outter_carreras' replace
            self.outer_trigger_words['B'];
            self.outer_trigger_words['I'] refers to that same set.
        '''
        def normalize(entry):
            # Decode as 'ascii': the bytes are already ASCII-only, and
            # 'unicode_escape' would reinterpret literal backslashes.
            ascii_entry = unicodedata.normalize(
                'NFKD', entry.lower().strip('\n')).encode(
                    'ascii', 'ignore').decode('ascii')
            return stemAugmented(ascii_entry)

        careers_path = os.path.join(EXTERNAL_GAZZETER_DIR, 'carreras')
        outter_path = os.path.join(EXTERNAL_GAZZETER_DIR, 'outter_carreras')

        # Both files are opened before anything is modified (as before), and
        # the context managers guarantee they are closed afterwards.
        with open(careers_path, 'r') as careers_gazzeter:
            with open(outter_path, 'r') as outter_careers_gazzeter:
                for carrera in careers_gazzeter:
                    self.inner_trigger_words['I'].append(normalize(carrera))
                self.inner_trigger_words['I'] = set(
                    self.inner_trigger_words['I'])
                self.inner_trigger_words['B'] = self.inner_trigger_words['I']

                self.outer_trigger_words['B'] = set()
                for outter in outter_careers_gazzeter:
                    self.outer_trigger_words['B'].add(normalize(outter))

        # CHANGE (desperate measure): 'I' simply aliases 'B'; the union with
        # the entries previously stored under 'I' is intentionally disabled.
        self.outer_trigger_words['I'] = self.outer_trigger_words['B']
    def get_label_names(self,sequence,pos, escape_ascii=False):
        '''
        Return `(word, low_word, stem)` for the token at index `pos`.

        sequence: object exposing `x` (token ids) and
            `sequence_list.x_dict`, whose `get_label_name` maps a token id
            to its string.
        pos: index of the token inside `sequence`.
        escape_ascii: only affects the stem; the word itself is emitted
            as is. When true, the lowercased word is NFKD-normalized and
            its non-ASCII characters are dropped before stemming.

        A token found in `filter_names` is returned unchanged in all three
        positions of the tuple.
        '''
        word = sequence.sequence_list.x_dict.get_label_name(sequence.x[pos])

        if word in filter_names:
            # Filtered tokens are neither lowercased nor stemmed.
            return (word, word, word)

        low_word = word.lower()
        if escape_ascii:
            # Decode with 'ascii' instead of 'unicode_escape': the latter
            # treats backslashes inside the token as escape sequences and
            # may alter the text or raise on malformed input.
            folded = unicodedata.normalize('NFKD', low_word)
            stem = stemAugmented(
                folded.encode('ascii', 'ignore').decode('ascii'))
        else:
            stem = stemAugmented(low_word)

        return (word, low_word, stem)
コード例 #4
0
    def include_gazzeter(self):
        '''
        Fill the trigger-word sets from the external career gazetteers.

        Two files inside EXTERNAL_GAZZETER_DIR are read line by line:
        'carreras' (inner trigger words) and 'outter_carreras' (outer
        trigger words). Each line is lowercased, stripped of newline
        characters at its ends, reduced to ASCII and stemmed.

        Side effects:
          * the 'carreras' stems are appended to
            self.inner_trigger_words['I'], which then becomes a set that
            self.inner_trigger_words['B'] also refers to;
          * self.outer_trigger_words['B'] is replaced by the set of
            'outter_carreras' stems and self.outer_trigger_words['I']
            refers to that same set.
        '''
        def to_stem(line):
            # 'ascii' decoding keeps backslashes literal; 'unicode_escape'
            # would interpret them as escape sequences.
            text = unicodedata.normalize('NFKD', line.lower().strip('\n'))
            return stemAugmented(text.encode('ascii','ignore').decode('ascii'))

        # `with` closes both gazetteer files, which were previously leaked.
        # Both are opened up front so a missing file fails before any state
        # is modified, as in the original ordering.
        with open(os.path.join(EXTERNAL_GAZZETER_DIR,'carreras'),'r') as careers_gazzeter:
            with open(os.path.join(EXTERNAL_GAZZETER_DIR,'outter_carreras'),'r') as outter_careers_gazzeter:
                for carrera in careers_gazzeter:
                    self.inner_trigger_words['I'].append(to_stem(carrera))
                self.inner_trigger_words['I'] = set(self.inner_trigger_words['I'])
                self.inner_trigger_words['B'] = self.inner_trigger_words['I']

                self.outer_trigger_words['B'] = set()
                for outter in outter_careers_gazzeter:
                    self.outer_trigger_words['B'].add(to_stem(outter))

        # CHANGE (desperate measure): 'I' reuses the 'B' set; merging in the
        # entries previously stored under 'I' is intentionally disabled.
        self.outer_trigger_words['I'] = self.outer_trigger_words['B']
    def get_trigger_features(self, word, y_name, prefix, pos_tag=False, _dict=None, pos=None, features=None):
        '''
        Add a trigger feature for `word` when it is contained in `_dict`.

        word: token to test. Unless it is in `filter_names`, it is
            lowercased, reduced to ASCII and stemmed before the lookup.
        y_name: tag name appended to the feature name.
        prefix: feature name prefix; the name is "<prefix>::<y_name>", or
            "<prefix>::<pos>:<y_name>" when `pos` is given.
        pos_tag: accepted for signature compatibility; not used here.
        _dict: container of trigger words (defaults to an empty one).
        pos: optional position marker embedded in the feature name.
        features: feature list to extend (defaults to a new empty list).

        Returns `features`, or whatever self.insert_feature returns when
        the feature is added.
        '''
        # Fresh containers per call: with mutable defaults, a list changed
        # by insert_feature would be shared by every later call.
        if _dict is None:
            _dict = {}
        if features is None:
            features = []

        name_pattern = prefix + '::'
        if pos is not None:
            name_pattern += str(pos) + ':'

        if word not in filter_names:
            # Decode as 'ascii'; 'unicode_escape' would reinterpret literal
            # backslashes contained in the token.
            word = unicodedata.normalize('NFKD', word.lower()).encode('ascii','ignore').decode('ascii')
            word = stemAugmented(word)

        if word in _dict:
            features = self.insert_feature(name_pattern + y_name, features)
        return features
コード例 #6
0
    def get_trigger_features(self, word, y_name, prefix, pos_tag=False, _dict=None, pos=None, features=None):
        """
        Add a trigger feature when `word` is listed in `_dict[y_name]`.

        word: token (or POS tag, see `pos_tag`) to test. Unless it is in
            `filter_names`, it is lowercased and reduced to ASCII.
        y_name: tag name; selects the entry of `_dict` to search and is
            appended to the feature name.
        prefix: feature name prefix; the name is "<prefix>::<y_name>", or
            "<prefix>::<pos>:<y_name>" when `pos` is given.
        pos_tag: when true the normalized word is NOT stemmed.
        _dict: mapping from tag name to a container of triggers.
        pos: optional position marker embedded in the feature name.
        features: feature list to extend (defaults to a new empty list).

        Returns `features`, or whatever self.insert_feature returns when
        the feature is added. Raises KeyError when `y_name` is not a key of
        `_dict` (the caller is trusted to provide it).
        """
        # Fresh containers per call: with mutable defaults, a list changed
        # by insert_feature would be shared by every later call.
        if _dict is None:
            _dict = {}
        if features is None:
            features = []

        name_pattern = prefix + "::"
        if pos is not None:
            name_pattern += str(pos) + ":"

        if word not in filter_names:
            # Decode as "ascii"; "unicode_escape" would reinterpret literal
            # backslashes contained in the token.
            word = unicodedata.normalize("NFKD", word.lower()).encode("ascii", "ignore").decode("ascii")
            if not pos_tag:
                word = stemAugmented(word)

        # Trusting that y_name is in _dict.
        if word in _dict[y_name]:
            features = self.insert_feature(name_pattern + y_name, features)
        return features
コード例 #7
0
    def get_label_names(self,sequence,pos):
        '''
        Return `(word, low_word, pos_tag, stem)` for the token at `pos`.

        The POS tag is replaced by NOUN when self.dataset.pos_dict reports
        it as unknown (id -1). A token found in `filter_names` is returned
        unchanged as word, lowercased word and stem.
        '''
        token_id = sequence.x[pos]
        sequence.y[pos]  # the label is read but not used by this method
        tag_id = sequence.pos[pos]

        names = sequence.sequence_list
        word = names.x_dict.get_label_name(token_id)
        pos_tag = names.pos_dict.get_label_name(tag_id)

        # Fall back to NOUN for tags the dataset does not know.
        if self.dataset.pos_dict.get_label_id(pos_tag) == -1:
            pos_tag = NOUN

        if word in filter_names:
            return (word, word, pos_tag, word)

        lowered = word.lower()
        return (word, lowered, pos_tag, stemAugmented(lowered))
コード例 #8
0
    def get_label_names(self, sequence, pos):
        '''
        Look up the token at `pos` and return its derived names.

        Returns the tuple `(word, low_word, pos_tag, stem)`:
          * word: token string from sequence.sequence_list.x_dict;
          * low_word / stem: lowercased word and its stemAugmented stem,
            or the word itself when it is in `filter_names`;
          * pos_tag: POS tag name, or NOUN when self.dataset.pos_dict
            returns -1 for it.
        '''
        word_id, _, pos_id = sequence.x[pos], sequence.y[pos], sequence.pos[pos]

        word = sequence.sequence_list.x_dict.get_label_name(word_id)
        raw_tag = sequence.sequence_list.pos_dict.get_label_name(pos_id)
        tag_is_unknown = self.dataset.pos_dict.get_label_id(raw_tag) == -1
        pos_tag = NOUN if tag_is_unknown else raw_tag

        if word in filter_names:
            # Filtered tokens keep their original form everywhere.
            low_word = stem = word
        else:
            low_word = word.lower()
            stem = stemAugmented(low_word)

        return (word, low_word, pos_tag, stem)
コード例 #9
0
    def update_tw(self, sequence, pos_current):
        '''
        Count the token at `pos_current` as an inner trigger word / POS tag.

        Nothing is counted when the tag name starts with 'O', when its
        first character equals END_TAG, when the token is in
        `filter_names`, or when the POS tag starts with 's', 'c' or 'd'.

        Otherwise:
          * the stem of the ASCII-reduced, lowercased token is counted in
            self.inner_trigger_words['I'] (always under 'I', whatever the
            tag is);
          * the POS tag is counted in self.inner_trigger_pos[y_name].

        Only inner counters are maintained; no outer-context counters are
        updated by this method.
        '''
        y_name = self.get_y_name(sequence,pos_current)

        ## INNER TRIGGER WORD & POS
        x = sequence.x[pos_current]
        word = sequence.sequence_list.x_dict.get_label_name(x)
        # NOTE(review): only the first character of the tag is compared with
        # END_TAG -- confirm END_TAG is a single character.
        if y_name[0]=='O' or y_name[0]==END_TAG or word in filter_names:
            return

        # Decode as 'ascii'; 'unicode_escape' would reinterpret literal
        # backslashes contained in the token.
        word = unicodedata.normalize('NFKD', word).encode('ascii','ignore').decode('ascii')
        word = stemAugmented(word.lower())

        pos_id = sequence.pos[pos_current]
        pos_tag = sequence.sequence_list.pos_dict.get_label_name(pos_id)
        if self.dataset.pos_dict.get_label_id(pos_tag) == -1:
            pos_tag = NOUN

        # Skip PREPOS ('s'), CONJ ('c') and DETERM ('d') tags.
        if pos_tag[0] in ('s', 'c', 'd'):
            return

        pos_counts = self.inner_trigger_pos.setdefault(y_name, {})

        # TRIGGER WORD
        word_counts = self.inner_trigger_words['I']
        word_counts[word] = word_counts.get(word, 0) + 1
        # TRIGGER POS
        pos_counts[pos_tag] = pos_counts.get(pos_tag, 0) + 1
コード例 #10
0
    def update_tw(self, sequence, pos_current):
        '''
        Update the inner trigger counters with the token at `pos_current`.

        The token is ignored when its tag name starts with 'O', when the
        first character of the tag equals END_TAG, when the token is in
        `filter_names`, or when its POS tag starts with 's', 'c' or 'd'.

        For every other token:
          * the stem of its ASCII-reduced, lowercased form is counted in
            self.inner_trigger_words['I'] (the key is always 'I');
          * its POS tag is counted in self.inner_trigger_pos[y_name].

        No outer-context counters are updated by this method.
        '''
        y_name = self.get_y_name(sequence, pos_current)

        ## INNER TRIGGER WORD & POS
        x = sequence.x[pos_current]
        word = sequence.sequence_list.x_dict.get_label_name(x)

        # NOTE(review): only the first character of the tag is compared with
        # END_TAG -- confirm END_TAG is a single character.
        is_entity_tag = y_name[0] != 'O' and y_name[0] != END_TAG
        if not is_entity_tag or word in filter_names:
            return

        # Decode as 'ascii'; 'unicode_escape' would reinterpret literal
        # backslashes contained in the token.
        word = unicodedata.normalize('NFKD', word).encode(
            'ascii', 'ignore').decode('ascii')
        word = stemAugmented(word.lower())

        pos_id = sequence.pos[pos_current]
        pos_tag = sequence.sequence_list.pos_dict.get_label_name(pos_id)
        if self.dataset.pos_dict.get_label_id(pos_tag) == -1:
            pos_tag = NOUN

        # PREPOS ('s'), CONJ ('c') and DETERM ('d') are never counted.
        if pos_tag[0] in ('s', 'c', 'd'):
            return

        if y_name not in self.inner_trigger_pos:
            self.inner_trigger_pos[y_name] = {}

        # TRIGGER WORD
        word_counts = self.inner_trigger_words['I']
        word_counts[word] = word_counts.get(word, 0) + 1
        # TRIGGER POS
        pos_counts = self.inner_trigger_pos[y_name]
        pos_counts[pos_tag] = pos_counts.get(pos_tag, 0) + 1
コード例 #11
0
    def update_tw(self, sequence, pos_current):
        '''
        Update the trigger word / POS counters for the token at `pos_current`.

        Outer (context) counters, stored under the current tag name in
        self.outer_trigger_words / self.outer_trigger_pos:
          * tag starts with 'B': the tokens of the window that precede
            `pos_current` are counted;
          * tag starts with 'O' and the previous tag starts with 'I' or
            'B': the tokens from `pos_current` to the window end are
            counted.
        The window spans TW_WINDOW positions on each side, clipped to the
        sequence.

        Inner counters (self.inner_trigger_words / self.inner_trigger_pos):
        the current token is counted when its tag does not start with 'O'
        and is not BR.

        Tokens whose POS tag starts with 's', 'c' or 'd' are never counted.
        Words are lowercased and reduced to ASCII; a word is replaced by
        assignFilterTag(word) when its stem is not in
        self.dataset.stem_vocabulary (and, for the outer context only, not
        in filter_names either).
        '''
        TW_WINDOW = 5
        SKIPPED_POS = ('s', 'c', 'd')  # PREPOS, CONJ, DETERM

        def ascii_lower_word(index):
            # Lowercased, ASCII-only form of the token at `index`. Decoded
            # as 'ascii': 'unicode_escape' would reinterpret backslashes.
            name = sequence.sequence_list.x_dict.get_label_name(sequence.x[index]).lower()
            return unicodedata.normalize('NFKD', name).encode('ascii','ignore').decode('ascii')

        def pos_tag_at(index):
            # POS tag name at `index`; tags unknown to the dataset -> NOUN.
            tag = sequence.sequence_list.pos_dict.get_label_name(sequence.pos[index])
            if self.dataset.pos_dict.get_label_id(tag) == -1:
                return NOUN
            return tag

        def count(table, key):
            # Increment the counter stored under `key`.
            table[key] = table.get(key, 0) + 1

        length = len(sequence.x)
        y_name = sequence.sequence_list.y_dict.get_label_name(sequence.y[pos_current])
        # NOTE(review): for pos_current == 0 this reads index -1, i.e. the
        # last tag of the sequence -- confirm that is the intended
        # "previous" tag.
        y_1_name = sequence.sequence_list.y_dict.get_label_name(sequence.y[pos_current-1])

        ## outer TRIGGER WORD & POS
        if y_name[0] == 'B':
            # Context that precedes the start of the entity.
            context = range(max(0, pos_current-TW_WINDOW), pos_current)
        elif y_name[0] == 'O' and y_1_name[0] in ('I', 'B'):
            # First 'O' after an entity: this token and what follows it.
            context = range(pos_current, min(pos_current+TW_WINDOW + 1, length))
        else:
            context = ()

        for pos in context:
            word = ascii_lower_word(pos)
            stem = stemAugmented(word)
            if stem not in filter_names and stem not in self.dataset.stem_vocabulary:
                word = assignFilterTag(word)

            pos_tag = pos_tag_at(pos)
            if pos_tag[0] in SKIPPED_POS:
                continue

            outer_words = self.outer_trigger_words.setdefault(y_name, {})
            outer_pos = self.outer_trigger_pos.setdefault(y_name, {})
            count(outer_words, word)    # TRIGGER WORD
            count(outer_pos, pos_tag)   # TRIGGER POS

        ## INNER TRIGGER WORD & POS
        if y_name[0] != 'O' and y_name!=BR:
            word = ascii_lower_word(pos_current)
            stem = stemAugmented(word)
            if stem not in self.dataset.stem_vocabulary:
                word = assignFilterTag(word)

            pos_tag = pos_tag_at(pos_current)
            if pos_tag[0] not in SKIPPED_POS:
                inner_words = self.inner_trigger_words.setdefault(y_name, {})
                inner_pos = self.inner_trigger_pos.setdefault(y_name, {})
                # TRIGGER WORD: the membership test must be on the word,
                # not on the tag name; testing the tag reset the count to
                # 0 on every call, so no word count ever exceeded 1.
                count(inner_words, word)
                # TRIGGER POS
                count(inner_pos, pos_tag)
コード例 #12
0
import os,sys
import unicodedata
import pdb,ipdb

# Parent of the directory that contains this file; appended to sys.path
# below so that `utils_new` can be imported.
path_utils = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

PROJECT_DIR = os.path.dirname(path_utils)
CRAWLER_DIR = os.path.join(PROJECT_DIR,'crawler')
# Source directory with the raw identifier files.
IDENTIFIER_DIR = os.path.join(CRAWLER_DIR, 'Identifiers')
# Destination directory (next to this file) for the stemmed identifiers.
IDENTIFIER_STEM_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'identifiers')

sys.path.append(path_utils)

from utils_new import stemAugmented

# For every file under IDENTIFIER_DIR (names ending in '~' are skipped),
# write a file of the same name into IDENTIFIER_STEM_DIR in which each
# non-empty line is lowercased, stripped of dots, reduced to ASCII and
# stemmed word by word.
for root, dirs, filenames in os.walk(IDENTIFIER_DIR):
  for f in filenames:
    if f[-1]=='~':
      continue
    # `with` closes (and therefore flushes) both files; they were
    # previously left open.
    with open(os.path.join(IDENTIFIER_STEM_DIR, f), 'w') as dest:
      with open(os.path.join(root, f), 'r') as source:
        for line in source:
          line = line.lower().strip('\n').strip(' ').replace('.','')
          if line!='':
            text = unicodedata.normalize('NFKD', line).encode('ascii','ignore').decode('utf-8')
            ident = ' '.join([stemAugmented(word) for word in text.split(' ')])
            dest.write(ident+'\n')
import os, sys
import unicodedata
import pdb, ipdb

# Parent of the directory that contains this file; appended to sys.path
# below so that `utils_new` can be imported.
path_utils = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

PROJECT_DIR = os.path.dirname(path_utils)
CRAWLER_DIR = os.path.join(PROJECT_DIR, 'crawler')
# Source directory with the raw identifier files.
IDENTIFIER_DIR = os.path.join(CRAWLER_DIR, 'Identifiers')
# Destination directory (next to this file) for the stemmed identifiers.
IDENTIFIER_STEM_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                   'identifiers')

sys.path.append(path_utils)

from utils_new import stemAugmented

# For every file under IDENTIFIER_DIR (names ending in '~' are skipped),
# write a file of the same name into IDENTIFIER_STEM_DIR in which each
# non-empty line is lowercased, stripped of dots, reduced to ASCII and
# stemmed word by word.
for root, dirs, filenames in os.walk(IDENTIFIER_DIR):
    for f in filenames:
        if f[-1] == '~':
            continue
        # `with` closes (and therefore flushes) both files; they were
        # previously left open.
        with open(os.path.join(IDENTIFIER_STEM_DIR, f), 'w') as dest:
            with open(os.path.join(root, f), 'r') as source:
                for line in source:
                    line = line.lower().strip('\n').strip(' ').replace(
                        '.', '')
                    if line == '':
                        continue
                    text = unicodedata.normalize('NFKD', line).encode(
                        'ascii', 'ignore').decode('utf-8')
                    ident = ' '.join(
                        [stemAugmented(word) for word in text.split(' ')])
                    dest.write(ident + '\n')
コード例 #14
0
    def update_tw(self, sequence, pos_current):
        '''
        Update the trigger word / POS counters for the token at `pos_current`.

        Outer (context) counters, stored under the current tag name in
        self.outer_trigger_words / self.outer_trigger_pos:
          * tag starts with 'B': the tokens of the window that precede
            `pos_current` are counted;
          * tag starts with 'O' and the previous tag starts with 'I' or
            'B': the tokens from `pos_current` to the window end are
            counted.
        The window spans TW_WINDOW positions on each side, clipped to the
        sequence.

        Inner counters (self.inner_trigger_words / self.inner_trigger_pos):
        the current token is counted when its tag does not start with 'O'
        and is not BR.

        Tokens whose POS tag starts with 's', 'c' or 'd' are never counted.
        Words are lowercased and reduced to ASCII; a word is replaced by
        assignFilterTag(word) when its stem is not in
        self.dataset.stem_vocabulary (and, for the outer context only, not
        in filter_names either).
        '''
        TW_WINDOW = 5
        SKIPPED_POS = ('s', 'c', 'd')  # PREPOS, CONJ, DETERM

        x_dict = sequence.sequence_list.x_dict
        y_dict = sequence.sequence_list.y_dict
        pos_dict = sequence.sequence_list.pos_dict

        def ascii_lower_word(index):
            # Lowercased, ASCII-only form of the token at `index`. Decoded
            # as 'ascii': 'unicode_escape' would reinterpret backslashes.
            name = x_dict.get_label_name(sequence.x[index]).lower()
            return unicodedata.normalize('NFKD', name).encode(
                'ascii', 'ignore').decode('ascii')

        def pos_tag_at(index):
            # POS tag name at `index`; tags unknown to the dataset -> NOUN.
            tag = pos_dict.get_label_name(sequence.pos[index])
            if self.dataset.pos_dict.get_label_id(tag) == -1:
                return NOUN
            return tag

        def count(table, key):
            # Increment the counter stored under `key`.
            table[key] = table.get(key, 0) + 1

        length = len(sequence.x)
        y_name = y_dict.get_label_name(sequence.y[pos_current])
        # NOTE(review): for pos_current == 0 this reads index -1, i.e. the
        # last tag of the sequence -- confirm that is the intended
        # "previous" tag.
        y_1_name = y_dict.get_label_name(sequence.y[pos_current - 1])

        ## outer TRIGGER WORD & POS
        if y_name[0] == 'B':
            # Context that precedes the start of the entity.
            context = range(max(0, pos_current - TW_WINDOW), pos_current)
        elif y_name[0] == 'O' and y_1_name[0] in ('I', 'B'):
            # First 'O' after an entity: this token and what follows it.
            context = range(pos_current,
                            min(pos_current + TW_WINDOW + 1, length))
        else:
            context = ()

        for pos in context:
            word = ascii_lower_word(pos)
            stem = stemAugmented(word)
            if stem not in filter_names and stem not in self.dataset.stem_vocabulary:
                word = assignFilterTag(word)

            pos_tag = pos_tag_at(pos)
            if pos_tag[0] in SKIPPED_POS:
                continue

            outer_words = self.outer_trigger_words.setdefault(y_name, {})
            outer_pos = self.outer_trigger_pos.setdefault(y_name, {})
            count(outer_words, word)  # TRIGGER WORD
            count(outer_pos, pos_tag)  # TRIGGER POS

        ## INNER TRIGGER WORD & POS
        if y_name[0] != 'O' and y_name != BR:
            word = ascii_lower_word(pos_current)
            stem = stemAugmented(word)
            if stem not in self.dataset.stem_vocabulary:
                word = assignFilterTag(word)

            pos_tag = pos_tag_at(pos_current)
            if pos_tag[0] not in SKIPPED_POS:
                inner_words = self.inner_trigger_words.setdefault(y_name, {})
                inner_pos = self.inner_trigger_pos.setdefault(y_name, {})
                # TRIGGER WORD: the membership test must be on the word,
                # not on the tag name; testing the tag reset the count to
                # 0 on every call, so no word count ever exceeded 1.
                count(inner_words, word)
                # TRIGGER POS
                count(inner_pos, pos_tag)