Example #1
    def generate_dictionary(self, dict_size=None, minimum_occurrences=2):
        """
        Generates a token dictionary based on the given sentences.

        :param dict_size: Max number of tokens to be included in the dictionary.
        :param minimum_occurrences: Minimum number of times that a token must
            appear in the text in order to be included in the dictionary.
        """
        logger = logging.getLogger("Logger")

        tokens = [token for sent in self.sentences for token, _ in sent]
        self.word_dict = WordDictionary(tokens, dict_size, minimum_occurrences)
        logger.info("Created dictionary with %d types" %
                    self.word_dict.num_tokens)
Example #2
def convertWord2embedding(word2embeddingsFile, types_feats_file,
                          word_dict_file, vocabularyFile):
    m = pickle.load(open(word2embeddingsFile, 'rb'))
    numpy.save(types_feats_file, m.get_word_embeddings())

    words = ['<UNK>', '<S>', '</S>', '<PAD>']
    words.extend(
        open(vocabularyFile, 'rb').read().decode('utf-8').strip().splitlines())
    out = open(word_dict_file, 'wb')
    pickle.dump(WordDictionary(None, wordlist=words, variant='polyglot'), out)
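A minimal usage sketch for this converter. The file names below are hypothetical; the function expects a pickled word2embeddings model and a plain-text vocabulary file with one word per line:

# hypothetical paths; adjust to wherever the model and vocabulary actually live
convertWord2embedding('word2embeddings-model.pkl',  # pickled word2embeddings model
                      'types-features.npy',         # output: embedding matrix
                      'word-dict.pickle',           # output: pickled WordDictionary
                      'vocabulary.txt')             # one word per line, UTF-8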
Example #3
def convertSenna(sennaFile, types_feats_file, word_dict_file):
    words = []
    embeddings = []
    for line in open(sennaFile):
        items = line.split()
        words.append(items[0].decode('utf-8'))
        embeddings.append([float(x) for x in items[1:]])
    numpy.save(types_feats_file, embeddings)
    out = open(word_dict_file, 'wb')
    pickle.dump(WordDictionary(None, wordlist=words, variant='senna'), out)
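A usage sketch under the assumption that the SENNA data has been merged into a single text file holding one word followed by its embedding vector per line (the paths are hypothetical):

convertSenna('senna-words-and-embeddings.txt',  # word + vector per line
             'types-features.npy',              # output: embedding matrix
             'word-dict.pickle')                # output: pickled WordDictionary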
Example #4
    def generate_dictionary(self, dict_size=None, minimum_occurrences=2):
        '''
        Generates a token dictionary based on the given sentences.

        :param dict_size: Max number of tokens
            to be included in the dictionary.
        :param minimum_occurrences: Minimum number of times that a token must
            appear in the text in order to be included in the dictionary.
        '''
        logger = logging.getLogger('Logger')
        all_tokens = [token.word
                      for sent in self.sentences
                      for token in sent]
        self.word_dict = WordDictionary(
            all_tokens, dict_size, minimum_occurrences)
        logger.info(
            'Created dictionary with %d tokens' % self.word_dict.num_tokens)
Example #5
    def load_dictionary(self):
        """Read a file with a word list and create a dictionary."""
        logger = logging.getLogger("Logger")
        logger.info("Loading vocabulary")

        # try to load vocabulary specific for the task
        key = 'vocabulary_%s' % self.task
        filename = self.md.paths[key]
        if not os.path.isfile(filename):
            # fallback to generic vocabulary
            filename = self.md.paths['vocabulary']
            if not os.path.isfile(filename):
                raise FileNotFoundException()

        words = []
        with open(filename, 'rb') as f:
            for word in f:
                word = word.decode('utf-8').strip()
                if word:
                    words.append(word)

        wd = WordDictionary.init_from_wordlist(words)
        self.word_dict = wd
        logger.debug("Done. Dictionary size is %d types" % wd.num_tokens)
Example #6
def load_network():
    """
    Loads the network from the default file and returns it.
    """
    file = open(senna_dump)
    words, type_features = load_features(file)
    word_dict = WordDictionary(None, wordlist=words, variant='senna')
    tables = [type_features]
    
    # PADDING, allcaps, hascap, initcap, nocaps
    caps, caps_features = load_features(file)
    tables.append(caps_features)

    suff, suffix_features = load_features(file)
    tables.append(suffix_features)

    hidden_weights = load_weights(file) # (hidden_size, input_size)
    hidden_bias = load_bias(file)
    output_weights = load_weights(file) # (output_size, hidden_size)
    output_bias = load_bias(file)
        
    transition0 = load_bias(file)
    transitions = load_weights(file).T
    transitions = np.vstack((transitions, transition0))

    word_window_size = 5
    input_size = hidden_weights.shape[1]
    hidden_size = hidden_weights.shape[0]
    output_size = output_bias.shape[0]
        
    nn = Network(word_window_size, input_size, hidden_size, output_size,
                 hidden_weights, hidden_bias, output_weights, output_bias)
    nn.feature_tables = tables
    nn.transitions = transitions 
    
    return nn, word_dict, suff
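A sketch of how the returned objects might be inspected. Only attributes set above (feature_tables, transitions) and WordDictionary's get method are assumed; nothing else about the Network class is:

nn, word_dict, suffixes = load_network()
print(nn.feature_tables[0].shape)  # embeddings for word types (one row per type)
print(nn.transitions.shape)        # tag transition scores, plus initial scores
print(word_dict.get('the'))        # index of a word (unknown words map to a special index)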
Example #7
class TaggerReader(object):
    """
    Abstract class extending TextReader with useful functions
    for tagging tasks. 
    """
    __metaclass__ = abc.ABCMeta
    
    def __init__(self, md=None, load_dictionaries=True):
        '''
        This class shouldn't be used directly. The constructor only
        provides method calls for subclasses. Subclasses should call
        this constructor after initializing the `task` attribute.
        '''
        self._set_metadata(md)
        self.codified = False
        self._converter = None
        
        if load_dictionaries:
            self.load_or_create_dictionary()
            self.load_or_create_tag_dict()
    
    @abc.abstractmethod
    def task(self):
        """
        The task the tagger reads data for.
        Must be defined in subclasses.
        """
        return None
    
    def load_or_create_dictionary(self):
        """
        Try to load the vocabulary from the default location. If the vocabulary
        file is not available, create a new one from the sentences available
        and save it.
        """
        try:
            self.load_dictionary()
        except FileNotFoundException:
            self.generate_dictionary(minimum_occurrences=2)
            self.save_dictionary()
    
    def load_or_create_tag_dict(self):
        """
        Try to load the tag dictionary from the default location. If the
        dictionary file is not available, scan the available sentences and
        create a new one.
        """
        key = '%s_tag_dict' % self.task
        filename = self.md.paths[key]
        if os.path.isfile(filename):
            self.load_tag_dict(filename)
            return
        
        tags = {tag for sent in self.sentences for _, tag in sent}
        self.tag_dict = {tag: code for code, tag in enumerate(tags)}
        self.save_tag_dict(filename)
    
    def generate_dictionary(self, dict_size=None, minimum_occurrences=2):
        """
        Generates a token dictionary based on the given sentences.
        
        :param dict_size: Max number of tokens to be included in the dictionary.
        :param minimum_occurrences: Minimum number of times that a token must
            appear in the text in order to be included in the dictionary. 
        """
        logger = logging.getLogger("Logger")
                
        tokens = [token for sent in self.sentences for token, _ in sent]
        self.word_dict = WordDictionary(tokens, dict_size, minimum_occurrences)
        logger.info("Created dictionary with %d types" %
                    self.word_dict.num_tokens)
        
    def get_inverse_tag_dictionary(self):
        """
        Returns a version of the tag dictionary that maps numbers to tags.
        Used for consulting the meaning of the network's output.
        """
        tuples = [(x[1], x[0]) for x in self.tag_dict.items()]
        ret = dict(tuples)
        
        return ret
    
    def codify_sentence(self, sentence):
        """
        Converts a given sentence into the indices used by the neural network.
        
        :param sentence: a sequence of tokens, already tokenized
        """
        if self._converter is None:
            self.create_converter()
        return np.array([self.converter.convert(t) for t in sentence])
    
    def codify_sentences(self):
        """
        Converts each token in each sentence into indices into the feature
        matrices. After this call, the original sentence text is no longer
        accessible.
        """
        if self._converter is None:
            self.create_converter()
        
        new_sentences = []
        self.tags = []
        rare_tag_value = self.tag_dict.get(self.rare_tag)
        
        for sent in self.sentences:
            new_sent = []
            sentence_tags = []
            
            for token, tag in sent:
                new_token = self.converter.convert(token)
                new_sent.append(new_token)
                sentence_tags.append(self.tag_dict.get(tag, rare_tag_value))
            
            new_sentences.append(np.array(new_sent))
            self.tags.append(np.array(sentence_tags))
        
        self.sentences = new_sentences
        self.codified = True
    
    def get_word_counter(self):
        """
        Returns a Counter object with word type occurrences.
        """
        c = Counter(token.lower()
                    for sent in self.sentences for token, _ in sent)
        return c
    
    def get_tag_counter(self):
        """
        Returns a Counter object with tag occurrences.
        """
        c = Counter(tag for sent in self.sentences for _, tag in sent)
        return c
    
    def save_tag_dict(self, filename=None, tag_dict=None):
        """
        Saves a tag dictionary to a file as a list of tags.
        
        :param tag_dict: the dictionary to save. If None, the default
            tag_dict for the class will be saved.
        :param filename: the file where the dictionary should be saved.
            If None, the class default tag_dict filename will be used.
        """
        if tag_dict is None:
            tag_dict = self.tag_dict
        if filename is None:
            key = '%s_tag_dict' % self.task
            filename = self.md.paths[key]
        
        save_tag_dict(filename, tag_dict)
    
    def load_tag_dict(self, filename=None):
        """
        Load the tag dictionary from the default file and assign
        it to the tag_dict attribute. 
        """
        if filename is None:
            key = '%s_tag_dict' % self.task
            filename = self.md.paths[key]
            
        self.tag_dict = load_tag_dict(filename)
       
    def _set_metadata(self, md):
        if md is None:
            # no metadata provided: use the files in the global data directory
            self.md = metadata.Metadata(self.task, config.FILES)
        else:
            self.md = md
        
    def add_text(self, text):
        """
        Adds more text to the reader. The text must be a sequence of sequences
        of tokens.
        """
        self.sentences.extend(text)
    
    def load_dictionary(self):
        """Read a file with a word list and create a dictionary."""
        logger = logging.getLogger("Logger")
        logger.info("Loading vocabulary")
        
        # try to load vocabulary specific for the task
        key = 'vocabulary_%s' % self.task
        filename = self.md.paths[key]
        if not os.path.isfile(filename):
            # fallback to generic vocabulary
            filename = self.md.paths['vocabulary']
            if not os.path.isfile(filename):
                raise FileNotFoundException()
        
        words = []
        with open(filename, 'rb') as f:
            for word in f:
                word = word.decode('utf-8').strip()
                if word:
                    words.append(word)
        
        wd = WordDictionary.init_from_wordlist(words)
        self.word_dict = wd
        logger.debug("Done. Dictionary size is %d types" % wd.num_tokens)
    
    def save_dictionary(self, filename=None):
        """
        Saves the reader's word dictionary as a list of words.
        
        :param filename: path to the file to save the dictionary.
            If not given, it will be saved in the default nlpnet
            data directory.
        """
        logger = logging.getLogger("Logger")
        if filename is None:
            key = 'vocabulary_%s' % self.task
            filename = self.md.paths[key]
        
        self.word_dict.save(filename)
        logger.info("Dictionary saved in %s" % filename)
    
    def create_affix_list(self, prefix_or_suffix, max_size, min_occurrences):
        """
        Handle the creation of suffix and prefix lists.
        
        Check if there exists an affix list in the data directory. If there
        isn't, create a new one based on the training sentences.
        
        :param prefix_or_suffix: string 'prefix' or 'suffix'
        """
        affix_type = prefix_or_suffix.lower()
        assert affix_type == 'suffix' or affix_type == 'prefix' 
        
        filename = self.md.paths['%ses' % affix_type]
        if os.path.isfile(filename):
            return
        
        logger = logging.getLogger("Logger")
        affixes_all_lengths = []
        
        # only get the affix size n from words with length at least (n+1)
        types = {re.sub(r'\d', '9', token.lower()) 
                 for sent in self.sentences for token, _ in sent}
        
        for length in range(1, max_size + 1):
            if affix_type == 'suffix':
                c = Counter(type_[-length:]
                            for type_ in types
                            if len(type_) > length)
            else:
                c = Counter(type_[:length]
                            for type_ in types
                            if len(type_) > length)
            affixes_this_length = [affix for affix in c 
                                   if c[affix] >= min_occurrences]
            affixes_all_lengths.extend(affixes_this_length)
        
        logger.info('Created a list of %d %ses.' % (len(affixes_all_lengths),
                                                    affix_type))
        text = '\n'.join(affixes_all_lengths)
        with open(filename, 'wb') as f:
            f.write(text.encode('utf-8'))
    
    @property
    def converter(self):
        """
        Return the token converter, which transforms tokens into their feature
        vector indices. If it doesn't exist, one is created. 
        """
        if self._converter is None:
            self.create_converter()
        
        return self._converter
    
    @converter.setter
    def converter(self, value):
        self._converter = value
    
    def create_converter(self):
        """
        Sets up the token converter, which is responsible for transforming
        tokens into their feature vector indices.
        """
        def add_affix_extractors(affix):
            """
            Helper function that works for both suffixes and prefixes.
            The parameter affix should be 'suffix' or 'prefix'.
            """
            loader_function = getattr(attributes.Affix, 'load_%ses' % affix)
            loader_function(self.md)
            
            # deal with gaps between sizes (i.e., if there are sizes 2, 3,
            # and 5)
            codes = getattr(attributes.Affix, '%s_codes' % affix)
            sizes = sorted(codes)
            
            getter = getattr(attributes.Affix, 'get_%s' % affix)
            for size in sizes:
                
                # size=size is needed; otherwise the closure would keep only
                # the last value of the loop variable size
                def f(word, size=size):
                    return getter(re.sub(r'\d', '9', word), size)
                
                self.converter.add_extractor(f)
        
        self._converter = attributes.TokenConverter()
        self.converter.add_extractor(self.word_dict.get)
        if self.md.use_caps:
            self.converter.add_extractor(get_capitalization)
        if self.md.use_prefix:
            add_affix_extractors('prefix')
        if self.md.use_suffix:
            add_affix_extractors('suffix')
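A minimal, hypothetical subclass sketch showing the intended usage pattern: provide sentences as lists of (token, tag) pairs, define the task, and let the base class build its dictionaries. The ToyPOSReader name and in-memory setup are assumptions, not part of nlpnet, and the base class still needs nlpnet's data directory (or an explicit metadata object) for its file paths:

class ToyPOSReader(TaggerReader):
    """Hypothetical reader whose sentences are lists of (token, tag) pairs."""

    def __init__(self, sentences, md=None):
        self.sentences = sentences
        # skip loading dictionaries from disk; build the word dictionary here
        super(ToyPOSReader, self).__init__(md, load_dictionaries=False)
        self.generate_dictionary(minimum_occurrences=1)

    @property
    def task(self):
        return 'pos'

# assumes nlpnet's data directory has been configured, or pass a metadata object as md
sents = [[('The', 'DET'), ('dog', 'NOUN'), ('barks', 'VERB')]]
reader = ToyPOSReader(sents)
print(reader.word_dict.num_tokens)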
Example #8
class SRLReader(reader.TaggerReader):
    def __init__(self,
                 md=None,
                 filename=None,
                 only_boundaries=False,
                 only_classify=False,
                 only_predicates=False):
        """
        The reader will read sentences from a given file. This file must
        be in the correct format (one token per line, columns indicating
        which tokens are predicates and their argument structure).
        
        :param filename: a file with CoNLL-like format data. If it is None,
            the reader will be created with no data.
        :param only_boundaries: train to identify only argument boundaries
        :param only_classify: train to classify pre-determined arguments
        :param only_predicates: train to identify only predicates
        """

        if only_boundaries:
            self.taskname = 'srl_boundary'
            self._generate_iobes_dictionary()
        elif only_classify:
            self.taskname = 'srl_classify'
        elif only_predicates:
            self.taskname = 'srl_predicates'
            self._generate_predicate_id_dictionary()
        else:
            self.taskname = 'srl'

        self.rare_tag = 'O'
        if filename is not None:
            self._read_conll(filename)
            self._clean_text()

        super(SRLReader, self).__init__(md)

    @property
    def task(self):
        """
        Abstract Base Class (ABC) attribute.
        """
        return self.taskname

    def _read_conll(self, filename):
        '''
        Read a file in CoNLL format and extracts semantic role tags
        for each token.
        '''
        lines = []
        with open(filename, 'rb') as f:
            for line in f:
                line = line.decode('utf-8').strip()
                lines.append(line)

        self.sentences = []
        self.predicates = []
        tokens = []
        sent_predicates = []
        sent_tags = []
        token_number = 0

        for line in lines:
            line = line.strip()

            if line == '':
                # blank line between sentences
                if len(tokens) > 0:
                    sentence = (tokens, sent_tags)
                    self.sentences.append(sentence)
                    self.predicates.append(np.array(sent_predicates))
                    tokens = []
                    sent_predicates = []
                    sent_tags = []
                    token_number = 0

                continue

            fields = line.split()
            word = fields[ConllPos.word]
            lemma = fields[ConllPos.lemma]
            pos = fields[ConllPos.pos].lower()
            is_predicate = fields[ConllPos.pred] != '-'
            tags = fields[ConllPos.semantic_role:]

            # if this is the first token in the sentence, find out how many
            # predicates there are and initialize a tag list for each of them
            if sent_tags == []:
                expected_roles = []
                for tag in tags:
                    tag, expected_role = self._read_role(tag, 'O', True)
                    sent_tags.append([tag])
                    expected_roles.append(expected_role)
            else:
                for i, tag in enumerate(tags):
                    expected_role = expected_roles[i]
                    tag, expected_role = self._read_role(
                        tag, expected_role, True)
                    sent_tags[i].append(tag)
                    expected_roles[i] = expected_role

            token = attributes.Token(word, lemma, pos)
            tokens.append(token)
            if is_predicate:
                sent_predicates.append(token_number)

            token_number += 1

        if len(tokens) > 0:
            # last sentence
            sentence = (tokens, sent_tags)
            self.sentences.append(sentence)
            self.predicates.append(np.array(sent_predicates))

    @classmethod
    def _read_role(cls, role, expected_role, remove_continuation):
        """
        Reads the next semantic role from a CoNLL-style file.
        
        :param role: what is read from the conll file (something like
            *, (A0* or *)
        :param expected_role: the expected role if a * is found
        :param remove_continuation: if True, remove the C- prefix from
            non-continuous arguments, so C-A0 becomes A0.
        :return: a tuple (role, expected next role)
        """
        if role == '*':
            # signals continuation of the last block
            role = expected_role
        elif role == '*)':
            # finishes block
            role = expected_role
            expected_role = 'O'
        else:
            # verifies if it is a single argument
            match = re.search(r'\(([-\w]+)\*\)', role)
            if match:
                role = match.group(1)
                expected_role = 'O'
            else:
                # verifies if it opens an argument
                match = re.search(r'\(([-\w]+)\*', role)
                if match:
                    role = match.group(1)
                    expected_role = role
                else:
                    raise ValueError('Unexpected role data: %s' % role)

        if role.startswith('C-') and remove_continuation:
            # removes C-
            role = role[2:]

        return role, expected_role

    def extend(self, data):
        """
        Adds more data to the reader.
        :param data: a list of tuples in the format (tokens, tags, predicates), 
        one for each sentence.
        """
        self.sentences.extend([(sent, tags) for sent, tags, _ in data])
        self.predicates.extend([np.array(preds) for _, _, preds in data])

    def load_or_create_tag_dict(self):
        """
        In the case of SRL argument classification or one step SRL, try to 
        load the tag dictionary. If the file with the tags is not present,
        a new one is created from the available sentences. 
        
        In the case of argument detection or predicate detection, 
        this function does nothing.
        """
        if self.task == 'srl_predicates' or self.task == 'srl_boundary':
            return

        # only one-step SRL uses IOB tags
        iob = self.task == 'srl'
        if os.path.isfile(self.md.paths['srl_tags']):
            self.load_tag_dict(iob=iob)
            return

        self._create_tag_dict(iob)
        logger = logging.getLogger('Logger')
        logger.info('Created SRL tag dictionary')

    def _create_tag_dict(self, iob=False):
        """
        Examine the available sentences and create a tag dictionary.
        
        :param iob: If True, this function will generate an entry for B-[tag] 
            and one for I-[tag], except for the tag 'O'.
        """
        logger = logging.getLogger("Logger")
        tags = {
            tag
            for _, tag_groups in self.sentences for tags in tag_groups
            for tag in tags
        }

        # create a dictionary now even when using IOB, so that it is saved
        # in a deterministic order
        self.tag_dict = {tag: code for code, tag in enumerate(tags)}
        reader.save_tag_dict(self.md.paths['srl_tags'], self.tag_dict)
        logger.debug("Saved SRL tag dictionary.")
        if not iob:
            return

        # insert I- and B- preserving the ordering
        new_dict = {}
        code = 0
        for tag in sorted(self.tag_dict, key=self.tag_dict.get):
            if tag == 'O':
                new_dict[tag] = code
            else:
                new_dict['B-%s' % tag] = code
                code += 1
                new_dict['I-%s' % tag] = code

            code += 1

        self.tag_dict = new_dict

    def load_tag_dict(self, filename=None, iob=False):
        """
        Loads the tag dictionary from the default file. The dictionary file
        should have one tag per line.
        
        :param iob: If True, this function will generate an entry for B-[tag] 
            and one for I-[tag], except for the tag 'O'.
        """
        if filename is None:
            filename = self.md.paths['srl_tags']

        if not iob:
            super(SRLReader, self).load_tag_dict(filename)
            return

        self.tag_dict = {}
        code = 0
        with open(filename, 'rb') as f:
            for tag in f:
                tag = tag.decode('utf-8').strip()
                if tag == '':
                    continue

                if tag == 'O':
                    self.tag_dict[tag] = code
                else:
                    self.tag_dict['B-%s' % tag] = code
                    code += 1
                    self.tag_dict['I-%s' % tag] = code

                code += 1

        if 'O' not in self.tag_dict:
            self.tag_dict['O'] = code

    def _generate_iobes_dictionary(self):
        """
        Generate the reader's tag dictionary mapping the IOBES tags to numeric
        codes.
        """
        self.tag_dict = {tag: code for code, tag in enumerate('IOBES')}

    def _generate_predicate_id_dictionary(self):
        """
        Generate a tag dictionary for identifying predicates.
        It has two tags: V for predicates and O for others.
        """
        self.tag_dict = {'O': 0, 'V': 1}

    def generate_dictionary(self, dict_size=None, minimum_occurrences=2):
        """
        Generates a token dictionary based on the given sentences.
        
        :param dict_size: Max number of tokens to be included in the dictionary.
        :param minimum_occurrences: Minimum number of times that a token must
            appear in the text in order to be included in the dictionary.
        """
        logger = logging.getLogger("Logger")
        all_tokens = [
            token.word for tokens, _ in self.sentences for token in tokens
        ]
        self.word_dict = WordDictionary(all_tokens, dict_size,
                                        minimum_occurrences)
        logger.info("Created dictionary with %d tokens" %
                    self.word_dict.num_tokens)

    def _clean_text(self):
        """
        Cleans the sentence text: replaces numbers with a keyword, normalizes
        different kinds of quotation marks to a single one, etc.
        """
        for sent, _ in self.sentences:
            for i, token in enumerate(sent):
                new_word = utils.clean_text(token.word, correct=False)
                new_lemma = utils.clean_text(token.lemma, correct=False)
                token.word = new_word
                token.lemma = new_lemma
                sent[i] = token

    def create_converter(self):
        """
        This function overrides the TextReader's one in order to deal with Token
        objects instead of raw strings.
        """
        self.converter = attributes.TokenConverter()

        if self.md.use_lemma:
            # look up word lemmas
            word_lookup = lambda t: self.word_dict.get(t.lemma)
        else:
            # look up the word itself
            word_lookup = lambda t: self.word_dict.get(t.word)

        self.converter.add_extractor(word_lookup)

        if self.md.use_caps:
            caps_lookup = lambda t: attributes.get_capitalization(t.word)
            self.converter.add_extractor(caps_lookup)

        if self.md.use_pos:
            with open(self.md.paths['pos_tag_dict']) as f:
                pos_dict = cPickle.load(f)

            pos_def_dict = defaultdict(lambda: pos_dict['other'])
            pos_def_dict.update(pos_dict)
            pos_lookup = lambda t: pos_def_dict[t.pos]
            self.converter.add_extractor(pos_lookup)

        if self.md.use_chunk:
            with open(self.md.paths['chunk_tag_dict']) as f:
                chunk_dict = cPickle.load(f)

            chunk_def_dict = defaultdict(lambda: chunk_dict['O'])
            chunk_def_dict.update(chunk_dict)
            chunk_lookup = lambda t: chunk_def_dict[t.chunk]
            self.converter.add_extractor(chunk_lookup)

    def generate_tag_dict(self):
        """
        Generates a tag dictionary that converts the tag itself
        to an index to be used in the neural network.
        """
        self.tagset = set(tag for _, props in self.sentences for prop in props
                          for tag in prop)

        self.tag_dict = dict(zip(self.tagset, range(len(self.tagset))))

    def _remove_tag_names(self):
        """
        Removes the actual tag names, leaving only IOB or IOBES block
        delimiters.
        """
        for _, propositions in self.sentences:
            for tags in propositions:
                for i, tag in enumerate(tags):
                    tags[i] = tag[0]

    def _codify_sentences(self):
        """Internal helper function."""
        new_sentences = []
        self.tags = []

        for (sent, props), preds in zip(self.sentences, self.predicates):
            new_sent = []
            sentence_tags = []

            for token in sent:
                new_token = self.converter.convert(token)
                new_sent.append(new_token)

            if self.task == 'srl_predicates':
                sentence_tags = np.zeros(len(sent), dtype=int)
                if len(preds) > 0:
                    sentence_tags[preds] = 1
            else:
                for prop in props:
                    # for classifying arguments, leave the names. they will be
                    # changed later
                    if self.task == 'srl_classify':
                        prop_tags = prop
                    else:
                        prop_tags = np.array(
                            [self.tag_dict[tag] for tag in prop])
                    sentence_tags.append(prop_tags)

            new_sentences.append(np.array(new_sent))
            self.tags.append(sentence_tags)

        self.sentences = new_sentences
        self.codified = True

    def codify_sentences(self):
        """
        Converts each token in each sentence into indices into the feature
        matrices. After this call, the original sentence text is no longer
        accessible. Tags are also encoded. This function takes care of the
        case of classifying pre-delimited arguments.
        """
        if self.converter is None:
            self.create_converter()

        self._codify_sentences()
        self.arg_limits = []

        if self.task == 'srl_classify':
            # generate the tags for each argument
            start = 0

            for i, propositions in enumerate(self.tags):
                new_sent_tags = []
                sent_args = []

                for prop_tags in propositions:

                    new_prop_tags = []
                    prop_args = []
                    last_tag = 'O'

                    for j, tag in enumerate(prop_tags):
                        if tag != last_tag:
                            # if we were inside an argument, it ended
                            # we may have started a new one
                            if last_tag != 'O':
                                end = j - 1
                                prop_args.append(np.array([start, end]))

                            if tag != 'O':
                                start = j
                                new_prop_tags.append(self.tag_dict[tag])

                        last_tag = tag
                    else:
                        # after last iteration, check the last tag
                        if last_tag != 'O':
                            end = j
                            prop_args.append(np.array([start, end]))

                    sent_args.append(np.array(prop_args))
                    new_sent_tags.append(np.array(new_prop_tags))

                self.arg_limits.append(sent_args)
                self.tags[i] = new_sent_tags

    def convert_tags(self,
                     scheme,
                     update_tag_dict=True,
                     only_boundaries=False):
        """
        Replaces each word label with an IOB or IOBES version, appending a
        prefix to them.
        
        :param scheme: IOB or IOBES (Inside, Outside, Begin, End, Single).
        :param update_tag_dict: whether or not to update the tag dictionary
            after converting the tags.
        :param only_boundaries: if True, keep only the IOB/IOBES boundary tags
            and remove the actual tag names. Also avoids updating the tag
            dictionary.
        """
        scheme = scheme.lower()
        if scheme not in ('iob', 'iobes'):
            raise ValueError("Unknown tagging scheme: %s" % scheme)

        for _, props in self.sentences:
            for prop in props:

                last_tag = None
                for i, tag in enumerate(prop):

                    if tag == 'O':
                        # the O tag is independent of the IOB/IOBES scheme
                        last_tag = tag
                        continue

                    try:
                        next_tag = prop[i + 1]
                    except IndexError:
                        # last word already
                        next_tag = None

                    if tag != last_tag:
                        # a new block starts here.
                        last_tag = tag
                        if scheme == 'iob' or next_tag == tag:
                            prop[i] = 'B-%s' % tag
                        else:
                            prop[i] = 'S-%s' % tag
                    else:
                        # the block continues.
                        if scheme == 'iob' or next_tag == tag:
                            prop[i] = 'I-%s' % tag
                        else:
                            prop[i] = 'E-%s' % tag

        if only_boundaries:
            self._remove_tag_names()
        elif update_tag_dict:
            self.generate_tag_dict()
        else:
            # treat any tag not appearing in the tag dictionary as O
            actual_tagset = {
                tag
                for _, props in self.sentences for prop in props
                for tag in prop
            }
            for tag in actual_tagset:
                if tag not in self.tag_dict:
                    self.tag_dict[tag] = self.tag_dict[self.rare_tag]
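A usage sketch for the boundary-detection variant, assuming a CoNLL-formatted training file and an nlpnet data directory that has already been set up (the file name is hypothetical):

reader = SRLReader(filename='propbank-train.conll', only_boundaries=True)
reader.convert_tags('iobes', only_boundaries=True)  # keep only I/O/B/E/S delimiters
reader.codify_sentences()
print('%d sentences codified' % len(reader.sentences))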
Example #9
class SRLReader(reader.TaggerReader):
    
    def __init__(self, md=None, filename=None, only_boundaries=False, 
                 only_classify=False, only_predicates=False):
        """
        The reader will read sentences from a given file. This file must
        be in the correct format (one token per line, columns indicating
        which tokens are predicates and their argument structure).
        
        :param filename: a file with CoNLL-like format data. If it is None,
            the reader will be created with no data.
        :param only_boundaries: train to identify only argument boundaries
        :param only_classify: train to classify pre-determined arguments
        :param only_predicates: train to identify only predicates
        """
        
        if only_boundaries:
            self.taskname = 'srl_boundary'
            self._generate_iobes_dictionary()
        elif only_classify:
            self.taskname = 'srl_classify'
        elif only_predicates:
            self.taskname = 'srl_predicates'
            self._generate_predicate_id_dictionary()
        else:
            self.taskname = 'srl'
        
        self.rare_tag = 'O'
        if filename is not None:
            self._read_conll(filename)
            self._clean_text()
        
        super(SRLReader, self).__init__(md)

    @property
    def task(self):
        """
        Abstract Base Class (ABC) attribute.
        """
        return self.taskname

    def _read_conll(self, filename):
        '''
        Read a file in CoNLL format and extracts semantic role tags
        for each token.
        '''
        lines = []
        with open(filename, 'rb') as f:
            for line in f:
                line = line.decode('utf-8').strip()
                lines.append(line)
        
        self.sentences = []
        self.predicates = []
        tokens = []
        sent_predicates = []
        sent_tags = []
        token_number = 0
        
        for line in lines:
            line = line.strip()
            
            if line == '':
                # blank line between sentences
                if len(tokens) > 0:
                    sentence = (tokens, sent_tags)
                    self.sentences.append(sentence)
                    self.predicates.append(np.array(sent_predicates))
                    tokens = []
                    sent_predicates = []
                    sent_tags = []
                    token_number = 0
                
                continue
            
            fields = line.split()
            word = fields[ConllPos.word]
            lemma = fields[ConllPos.lemma]
            pos = fields[ConllPos.pos].lower()
            is_predicate = fields[ConllPos.pred] != '-'
            tags = fields[ConllPos.semantic_role:]
            
            # if this is the first token in the sentence, find out how many
            # predicates there are and initialize a tag list for each of them
            if sent_tags == []:
                expected_roles = []
                for tag in tags:
                    tag, expected_role = self._read_role(tag, 'O', True)
                    sent_tags.append([tag])
                    expected_roles.append(expected_role)
            else:
                for i, tag in enumerate(tags):
                    expected_role = expected_roles[i]
                    tag, expected_role = self._read_role(tag, expected_role,
                                                         True)
                    sent_tags[i].append(tag)
                    expected_roles[i] = expected_role
            
            token = attributes.Token(word, lemma, pos)
            tokens.append(token)
            if is_predicate:
                sent_predicates.append(token_number)
            
            token_number += 1
        
        if len(tokens) > 0:
            # last sentence
            sentence = (tokens, sent_tags)
            self.sentences.append(sentence)
            self.predicates.append(np.array(sent_predicates))
    
    @classmethod
    def _read_role(cls, role, expected_role, remove_continuation):
        """
        Reads the next semantic role from a CoNLL-style file.
        
        :param role: what is read from the conll file (something like
            *, (A0* or *)
        :param expected_role: the expected role if a * is found
        :param remove_continuation: if True, remove the C- prefix from
            non-continuous arguments, so C-A0 becomes A0.
        :return: a tuple (role, expected next role)
        """
        if role == '*':
            # signals continuation of the last block
            role = expected_role
        elif role == '*)':
            # finishes block
            role = expected_role
            expected_role = 'O'
        else:
            # verifies if it is a single argument
            match = re.search(r'\(([-\w]+)\*\)', role)
            if match:
                role = match.group(1)
                expected_role = 'O'
            else:
                # verifies if it opens an argument
                match = re.search(r'\(([-\w]+)\*', role)
                if match:
                    role = match.group(1)
                    expected_role = role
                else:
                    raise ValueError('Unexpected role data: %s' % role)
        
        if role.startswith('C-') and remove_continuation:
            # removes C-
            role = role[2:]
            
        return role, expected_role

    def extend(self, data):
        """
        Adds more data to the reader.
        :param data: a list of tuples in the format (tokens, tags, predicates), 
        one for each sentence.
        """
        self.sentences.extend([(sent, tags) for sent, tags, _ in data])
        self.predicates.extend([np.array(preds) for _, _, preds in data])
    
    def load_or_create_tag_dict(self):
        """
        In the case of SRL argument classification or one step SRL, try to 
        load the tag dictionary. If the file with the tags is not present,
        a new one is created from the available sentences. 
        
        In the case of argument detection or predicate detection, 
        this function does nothing.
        """
        if self.task == 'srl_predicates' or self.task == 'srl_boundary':
            return
        
        # only one-step SRL uses IOB tags
        iob = self.task == 'srl'
        if os.path.isfile(self.md.paths['srl_tags']):
            self.load_tag_dict(iob=iob)
            return
        
        self._create_tag_dict(iob)
        logger = logging.getLogger('Logger')
        logger.info('Created SRL tag dictionary')
    
    def _create_tag_dict(self, iob=False):
        """
        Examine the available sentences and create a tag dictionary.
        
        :param iob: If True, this function will generate an entry for B-[tag] 
            and one for I-[tag], except for the tag 'O'.
        """
        logger = logging.getLogger("Logger")
        tags = {tag
                for _, tag_groups in self.sentences
                for tags in tag_groups
                for tag in tags}
        
        # create a dictionary now even when using IOB, so that it is saved
        # in a deterministic order
        self.tag_dict = {tag: code for code, tag in enumerate(tags)}
        reader.save_tag_dict(self.md.paths['srl_tags'], self.tag_dict)
        logger.debug("Saved SRL tag dictionary.")
        if not iob:
            return
        
        # insert I- and B- preserving the ordering
        new_dict = {}
        code = 0
        for tag in sorted(self.tag_dict, key=self.tag_dict.get):
            if tag == 'O':
                new_dict[tag] = code
            else:
                new_dict['B-%s' % tag] = code
                code += 1
                new_dict['I-%s' % tag] = code
                
            code += 1
        
        self.tag_dict = new_dict
    
    def load_tag_dict(self, filename=None, iob=False):
        """
        Loads the tag dictionary from the default file. The dictionary file
        should have one tag per line.
        
        :param iob: If True, this function will generate an entry for B-[tag] 
            and one for I-[tag], except for the tag 'O'.
        """
        if filename is None:
            filename = self.md.paths['srl_tags']
        
        if not iob:
            super(SRLReader, self).load_tag_dict(filename)
            return
            
        self.tag_dict = {}
        code = 0
        with open(filename, 'rb') as f:
            for tag in f:
                tag = tag.decode('utf-8').strip()
                if tag == '':
                    continue
                
                if tag == 'O':
                    self.tag_dict[tag] = code
                else:
                    self.tag_dict['B-%s' % tag] = code
                    code += 1
                    self.tag_dict['I-%s' % tag] = code
                
                code += 1
        
        if 'O' not in self.tag_dict:
            self.tag_dict['O'] = code
    
    def _generate_iobes_dictionary(self):
        """
        Generate the reader's tag dictionary mapping the IOBES tags to numeric
        codes.
        """
        self.tag_dict = {tag: code for code, tag in enumerate('IOBES')}
    
    def _generate_predicate_id_dictionary(self):
        """
        Generate a tag dictionary for identifying predicates.
        It has two tags: V for predicates and O for others.
        """
        self.tag_dict = {'O': 0, 'V': 1}
    
    def generate_dictionary(self, dict_size=None, minimum_occurrences=2):
        """
        Generates a token dictionary based on the given sentences.
        
        :param dict_size: Max number of tokens to be included in the dictionary.
        :param minimum_occurrences: Minimum number of times that a token must
            appear in the text in order to be included in the dictionary.
        """
        logger = logging.getLogger("Logger")
        all_tokens = [token.word
                      for tokens, _ in self.sentences
                      for token in tokens]
        self.word_dict = WordDictionary(all_tokens, dict_size,
                                        minimum_occurrences)
        logger.info("Created dictionary with %d tokens" %
                    self.word_dict.num_tokens)
    
    def _clean_text(self):
        """
        Cleans the sentence text: replaces numbers with a keyword, normalizes
        different kinds of quotation marks to a single one, etc.
        """
        for sent, _ in self.sentences:
            for i, token in enumerate(sent):
                new_word = utils.clean_text(token.word, correct=False)
                new_lemma = utils.clean_text(token.lemma, correct=False) 
                token.word = new_word
                token.lemma = new_lemma
                sent[i] = token

    def create_converter(self):
        """
        This function overrides the TextReader's one in order to deal with Token
        objects instead of raw strings.
        """
        self.converter = attributes.TokenConverter()
        
        if self.md.use_lemma:
            # look up word lemmas 
            word_lookup = lambda t: self.word_dict.get(t.lemma)
        else:
            # look up the word itself
            word_lookup = lambda t: self.word_dict.get(t.word)
             
        self.converter.add_extractor(word_lookup)
        
        if self.md.use_caps:
            caps_lookup = lambda t: attributes.get_capitalization(t.word)
            self.converter.add_extractor(caps_lookup)
        
        if self.md.use_pos:
            with open(self.md.paths['pos_tag_dict']) as f:
                pos_dict = cPickle.load(f)
                
            pos_def_dict = defaultdict(lambda: pos_dict['other'])
            pos_def_dict.update(pos_dict)
            pos_lookup = lambda t: pos_def_dict[t.pos]
            self.converter.add_extractor(pos_lookup)
        
        if self.md.use_chunk:
            with open(self.md.paths['chunk_tag_dict']) as f:
                chunk_dict = cPickle.load(f)
            
            chunk_def_dict = defaultdict(lambda: chunk_dict['O'])
            chunk_def_dict.update(chunk_dict)
            chunk_lookup = lambda t: chunk_def_dict[t.chunk]
            self.converter.add_extractor(chunk_lookup)
    
    def generate_tag_dict(self):
        """
        Generates a tag dictionary that converts the tag itself
        to an index to be used in the neural network.
        """
        self.tagset = set(tag
                          for _, props in self.sentences
                          for prop in props
                          for tag in prop)
        
        self.tag_dict = dict(zip(self.tagset, range(len(self.tagset))))
    
    def _remove_tag_names(self):
        """
        Removes the actual tag names, leaving only IOB or IOBES block
        delimiters.
        """
        for _, propositions in self.sentences:
            for tags in propositions:
                for i, tag in enumerate(tags):
                    tags[i] = tag[0]
    
    def _codify_sentences(self):
        """Internal helper function."""
        new_sentences = []
        self.tags = []
        
        for (sent, props), preds in zip(self.sentences, self.predicates):
            new_sent = []
            sentence_tags = []
            
            for token in sent:
                new_token = self.converter.convert(token)
                new_sent.append(new_token)
            
            if self.task == 'srl_predicates':    
                sentence_tags = np.zeros(len(sent), dtype=int)
                if len(preds) > 0:
                    sentence_tags[preds] = 1
            else:
                for prop in props:
                    # for classifying arguments, leave the names. they will be
                    # changed later
                    if self.task == 'srl_classify':
                        prop_tags = prop
                    else:
                        prop_tags = np.array([self.tag_dict[tag]
                                              for tag in prop])
                    sentence_tags.append(prop_tags)
            
            new_sentences.append(np.array(new_sent))
            self.tags.append(sentence_tags)
        
        self.sentences = new_sentences
        self.codified = True
    
    def codify_sentences(self):
        """
        Converts each token in each sentence into indices into the feature
        matrices. After this call, the original sentence text is no longer
        accessible. Tags are also encoded. This function takes care of the
        case of classifying pre-delimited arguments.
        """
        if self.converter is None:
            self.create_converter()
        
        self._codify_sentences()
        self.arg_limits = []
        
        if self.task == 'srl_classify':
            # generate the tags for each argument
            start = 0
            
            for i, propositions in enumerate(self.tags):
                new_sent_tags = []
                sent_args = []
                
                for prop_tags in propositions:
                    
                    new_prop_tags = []
                    prop_args = []
                    last_tag = 'O'
                    
                    for j, tag in enumerate(prop_tags):
                        if tag != last_tag:
                            # if we were inside an argument, it ended
                            # we may have started a new one
                            if last_tag != 'O':
                                end = j - 1
                                prop_args.append(np.array([start, end]))
                            
                            if tag != 'O':
                                start = j
                                new_prop_tags.append(self.tag_dict[tag])
                            
                        last_tag = tag
                    else:
                        # after last iteration, check the last tag
                        if last_tag != 'O':
                            end = j
                            prop_args.append(np.array([start, end]))
                    
                    sent_args.append(np.array(prop_args))
                    new_sent_tags.append(np.array(new_prop_tags))
                
                self.arg_limits.append(sent_args)
                self.tags[i] = new_sent_tags

    def convert_tags(self, scheme, update_tag_dict=True, only_boundaries=False):
        """
        Replaces each word label with an IOB or IOBES version, appending a
        prefix to them.
        
        :param scheme: IOB or IOBES (Inside, Outside, Begin, End, Single).
        :param update_tag_dict: whether or not to update the tag dictionary
            after converting the tags.
        :param only_boundaries: if True, keep only the IOB/IOBES boundary tags
            and remove the actual tag names. Also avoids updating the tag
            dictionary.
        """
        scheme = scheme.lower()
        if scheme not in ('iob', 'iobes'):
            raise ValueError("Unknown tagging scheme: %s" % scheme)
        
        for _, props in self.sentences:
            for prop in props:
                
                last_tag = None
                for i, tag in enumerate(prop):
                    
                    if tag == 'O':
                        # the O tag is independent of the IOB/IOBES scheme
                        last_tag = tag 
                        continue
                    
                    try:
                        next_tag = prop[i + 1]
                    except IndexError:
                        # last word already
                        next_tag = None
                     
                    if tag != last_tag:
                        # a new block starts here. 
                        last_tag = tag
                        if scheme == 'iob' or next_tag == tag:
                            prop[i] = 'B-%s' % tag
                        else:
                            prop[i] = 'S-%s' % tag
                    else:
                        # the block continues. 
                        if scheme == 'iob' or next_tag == tag:
                            prop[i] = 'I-%s' % tag
                        else:
                            prop[i] = 'E-%s' % tag
            
        if only_boundaries:
            self._remove_tag_names()
        elif update_tag_dict:
            self.generate_tag_dict()
        else:
            # treat any tag not appearing in the tag dictionary as O
            actual_tagset = {tag for _, props in self.sentences
                             for prop in props for tag in prop}
            for tag in actual_tagset:
                if tag not in self.tag_dict:
                    self.tag_dict[tag] = self.tag_dict[self.rare_tag]
Example #10
def convertPolyglot(polyglotFile, types_feats_file, word_dict_file):
    words, embeddings = pickle.load(open(polyglotFile, 'rb'))
    numpy.save(types_feats_file, embeddings)
    out = open(word_dict_file, 'wb')
    pickle.dump(WordDictionary(None, wordlist=words, variant='polyglot'), out)
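As with the converters above, a hypothetical invocation; the Polyglot pickle is assumed to contain a (words, embeddings) tuple, as the function unpacks it:

convertPolyglot('polyglot-en.pkl',     # pickled (words, embeddings) tuple
                'types-features.npy',  # output: embedding matrix
                'word-dict.pickle')    # output: pickled WordDictionary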