def generate_dictionary(self, dict_size=None, minimum_occurrences=2):
    """
    Generates a token dictionary based on the given sentences.

    :param dict_size: Max number of tokens to be included in the dictionary.
    :param minimum_occurrences: Minimum number of times that a token must
        appear in the text in order to be included in the dictionary.
    """
    logger = logging.getLogger("Logger")
    tokens = [token for sent in self.sentences for token, _ in sent]
    self.word_dict = WordDictionary(tokens, dict_size, minimum_occurrences)
    logger.info("Created dictionary with %d types" % self.word_dict.num_tokens)
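# Illustrative sketch (not part of the original module): how a dictionary built
# this way is typically queried. The token list is made up, and the assumption
# that unseen words fall back to a rare/<UNK> index reflects how WordDictionary
# is used elsewhere in this code, not something verified here.
word_dict = WordDictionary(['the', 'cat', 'saw', 'the', 'dog'], None, 1)
index = word_dict.get('cat')        # row of the feature table for 'cat'
rare = word_dict.get('platypus')    # unseen word, expected to map to the rare index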
def convertWord2embedding(word2embeddingsFile, types_feats_file, word_dict_file,
                          vocabularyFile):
    m = pickle.load(open(word2embeddingsFile, 'rb'))
    numpy.save(types_feats_file, m.get_word_embeddings())

    words = ['<UNK>', '<S>', '</S>', '<PAD>']
    words.extend(
        open(vocabularyFile, 'rb').read().decode('utf-8').strip().splitlines())

    with open(word_dict_file, 'wb') as out:
        pickle.dump(WordDictionary(None, wordlist=words, variant='polyglot'), out)
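# Hypothetical usage of the converter above; the file names are illustrative and
# assume a pickled word2embeddings model plus a plain-text vocabulary file with
# one word per line.
convertWord2embedding('hidden-layer-model.pkl', 'types-features.npy',
                      'word-dict.pickle', 'vocabulary.txt')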
def convertSenna(sennaFile, types_feats_file, word_dict_file):
    words = []
    embeddings = []
    for line in open(sennaFile, 'rb'):
        items = line.split()
        words.append(items[0].decode('utf-8'))
        embeddings.append([float(x) for x in items[1:]])

    numpy.save(types_feats_file, embeddings)

    with open(word_dict_file, 'wb') as out:
        pickle.dump(WordDictionary(None, wordlist=words, variant='senna'), out)
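# Hypothetical usage: the function above expects a text file with one word per
# line followed by its embedding values. Paths are illustrative.
convertSenna('senna-embeddings.txt', 'types-features.npy', 'word-dict.pickle')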
def generate_dictionary(self, dict_size=None, minimum_occurrences=2):
    '''
    Generates a token dictionary based on the given sentences.

    :param dict_size: Max number of tokens to be included in the dictionary.
    :param minimum_occurrences: Minimum number of times that a token must
        appear in the text in order to be included in the dictionary.
    '''
    logger = logging.getLogger('Logger')
    all_tokens = [token.word for sent in self.sentences for token in sent]
    self.word_dict = WordDictionary(all_tokens, dict_size, minimum_occurrences)
    logger.info('Created dictionary with %d tokens' % self.word_dict.num_tokens)
def load_dictionary(self):
    """Read a file with a word list and create a dictionary."""
    logger = logging.getLogger("Logger")
    logger.info("Loading vocabulary")

    # try to load a vocabulary specific to the task
    key = 'vocabulary_%s' % self.task
    filename = self.md.paths[key]
    if not os.path.isfile(filename):
        # fall back to the generic vocabulary
        filename = self.md.paths['vocabulary']
        if not os.path.isfile(filename):
            raise FileNotFoundException()

    words = []
    with open(filename, 'rb') as f:
        for word in f:
            word = word.decode('utf-8').strip()
            if word:
                words.append(word)

    wd = WordDictionary.init_from_wordlist(words)
    self.word_dict = wd
    logger.debug("Done. Dictionary size is %d types" % wd.num_tokens)
def load_network():
    """
    Loads the network from the default file and returns it.
    """
    file = open(senna_dump)
    words, type_features = load_features(file)
    word_dict = WordDictionary(None, wordlist=words, variant='senna')
    tables = [type_features]

    # PADDING, allcaps, hascap, initcap, nocaps
    caps, caps_features = load_features(file)
    tables.append(caps_features)

    suff, suffix_features = load_features(file)
    tables.append(suffix_features)

    hidden_weights = load_weights(file)   # (hidden_size, input_size)
    hidden_bias = load_bias(file)
    output_weights = load_weights(file)   # (output_size, hidden_size)
    output_bias = load_bias(file)

    transition0 = load_bias(file)
    transitions = load_weights(file).T
    transitions = np.vstack((transitions, transition0))

    word_window_size = 5
    input_size = hidden_weights.shape[1]
    hidden_size = hidden_weights.shape[0]
    output_size = output_bias.shape[0]

    nn = Network(word_window_size, input_size, hidden_size, output_size,
                 hidden_weights, hidden_bias, output_weights, output_bias)
    nn.feature_tables = tables
    nn.transitions = transitions

    return nn, word_dict, suff
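# Illustrative sketch of consuming what load_network() returns; the token list
# is made up, and the exact lookup behaviour of WordDictionary.get (e.g. case
# handling for unseen words) is an assumption here, not verified.
nn, word_dict, suffixes = load_network()
indices = [word_dict.get(w) for w in ['The', 'cat', 'sat']]
word_vectors = nn.feature_tables[0][indices]   # rows of the SENNA type embeddings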
class TaggerReader(object):
    """
    Abstract class extending TextReader with useful functions
    for tagging tasks.
    """
    __metaclass__ = abc.ABCMeta

    def __init__(self, md=None, load_dictionaries=True):
        '''
        This class shouldn't be used directly. The constructor only
        provides method calls for subclasses. Subclasses should call
        this constructor after initializing the `task` attribute.
        '''
        self._set_metadata(md)
        self.codified = False
        self._converter = None

        if load_dictionaries:
            self.load_or_create_dictionary()
            self.load_or_create_tag_dict()

    @abc.abstractmethod
    def task(self):
        """
        The task the tagger reads data for.
        Must be defined in subclasses.
        """
        return None

    def load_or_create_dictionary(self):
        """
        Try to load the vocabulary from the default location. If the
        vocabulary file is not available, create a new one from the
        available sentences and save it.
        """
        try:
            self.load_dictionary()
        except FileNotFoundException:
            self.generate_dictionary(minimum_occurrences=2)
            self.save_dictionary()

    def load_or_create_tag_dict(self):
        """
        Try to load the tag dictionary from the default location. If the
        dictionary file is not available, scan the available sentences
        and create a new one.
        """
        key = '%s_tag_dict' % self.task
        filename = self.md.paths[key]
        if os.path.isfile(filename):
            self.load_tag_dict(filename)
            return

        tags = {tag for sent in self.sentences for _, tag in sent}
        self.tag_dict = {tag: code for code, tag in enumerate(tags)}
        self.save_tag_dict(filename)

    def generate_dictionary(self, dict_size=None, minimum_occurrences=2):
        """
        Generates a token dictionary based on the given sentences.

        :param dict_size: Max number of tokens to be included in the
            dictionary.
        :param minimum_occurrences: Minimum number of times that a token must
            appear in the text in order to be included in the dictionary.
        """
        logger = logging.getLogger("Logger")
        tokens = [token for sent in self.sentences for token, _ in sent]
        self.word_dict = WordDictionary(tokens, dict_size, minimum_occurrences)
        logger.info("Created dictionary with %d types" %
                    self.word_dict.num_tokens)

    def get_inverse_tag_dictionary(self):
        """
        Returns a version of the tag dictionary that maps numbers to tags.
        Used for consulting the meaning of the network's output.
        """
        tuples = [(x[1], x[0]) for x in self.tag_dict.items()]
        ret = dict(tuples)
        return ret

    def codify_sentence(self, sentence):
        """
        Converts a given sentence into the indices used by the neural network.

        :param sentence: a sequence of tokens, already tokenized
        """
        if self._converter is None:
            self.create_converter()
        return np.array([self.converter.convert(t) for t in sentence])

    def codify_sentences(self):
        """
        Converts each token in each sequence into indices to their feature
        vectors in feature matrices. The previous sentences as text are not
        accessible anymore.
        """
        if self._converter is None:
            self.create_converter()

        new_sentences = []
        self.tags = []
        rare_tag_value = self.tag_dict.get(self.rare_tag)

        for sent in self.sentences:
            new_sent = []
            sentence_tags = []

            for token, tag in sent:
                new_token = self.converter.convert(token)
                new_sent.append(new_token)
                sentence_tags.append(self.tag_dict.get(tag, rare_tag_value))

            new_sentences.append(np.array(new_sent))
            self.tags.append(np.array(sentence_tags))

        self.sentences = new_sentences
        self.codified = True

    def get_word_counter(self):
        """
        Returns a Counter object with word type occurrences.
        """
        c = Counter(token.lower()
                    for sent in self.sentences
                    for token, _ in sent)
        return c

    def get_tag_counter(self):
        """
        Returns a Counter object with tag occurrences.
        """
        c = Counter(tag for sent in self.sentences for _, tag in sent)
        return c

    def save_tag_dict(self, filename=None, tag_dict=None):
        """
        Saves a tag dictionary to a file as a list of tags.

        :param tag_dict: the dictionary to save. If None, the default
            tag_dict for the class will be saved.
        :param filename: the file where the dictionary should be saved.
            If None, the class default tag_dict filename will be used.
        """
        if tag_dict is None:
            tag_dict = self.tag_dict
        if filename is None:
            key = '%s_tag_dict' % self.task
            filename = self.md.paths[key]

        save_tag_dict(filename, tag_dict)

    def load_tag_dict(self, filename=None):
        """
        Load the tag dictionary from the default file and assign
        it to the tag_dict attribute.
        """
        if filename is None:
            key = '%s_tag_dict' % self.task
            filename = self.md.paths[key]

        self.tag_dict = load_tag_dict(filename)

    def _set_metadata(self, md):
        if md is None:
            # metadata not provided: use the global data_dir for files
            self.md = metadata.Metadata(self.task, config.FILES)
        else:
            self.md = md

    def add_text(self, text):
        """
        Adds more text to the reader. The text must be a sequence of
        sequences of tokens.
        """
        self.sentences.extend(text)

    def load_dictionary(self):
        """Read a file with a word list and create a dictionary."""
        logger = logging.getLogger("Logger")
        logger.info("Loading vocabulary")

        # try to load a vocabulary specific to the task
        key = 'vocabulary_%s' % self.task
        filename = self.md.paths[key]
        if not os.path.isfile(filename):
            # fall back to the generic vocabulary
            filename = self.md.paths['vocabulary']
            if not os.path.isfile(filename):
                raise FileNotFoundException()

        words = []
        with open(filename, 'rb') as f:
            for word in f:
                word = word.decode('utf-8').strip()
                if word:
                    words.append(word)

        wd = WordDictionary.init_from_wordlist(words)
        self.word_dict = wd
        logger.debug("Done. Dictionary size is %d types" % wd.num_tokens)

    def save_dictionary(self, filename=None):
        """
        Saves the reader's word dictionary as a list of words.

        :param filename: path to the file to save the dictionary. If not
            given, it will be saved in the default nlpnet data directory.
        """
        logger = logging.getLogger("Logger")
        if filename is None:
            key = 'vocabulary_%s' % self.task
            filename = self.md.paths[key]

        self.word_dict.save(filename)
        logger.info("Dictionary saved in %s" % filename)

    def create_affix_list(self, prefix_or_suffix, max_size, min_occurrences):
        """
        Handle the creation of suffix and prefix lists.

        Check if there exists an affix list in the data directory. If there
        isn't, create a new one based on the training sentences.

        :param prefix_or_suffix: string 'prefix' or 'suffix'
        """
        affix_type = prefix_or_suffix.lower()
        assert affix_type == 'suffix' or affix_type == 'prefix'

        filename = self.md.paths['%ses' % affix_type]
        if os.path.isfile(filename):
            return

        logger = logging.getLogger("Logger")
        affixes_all_lengths = []

        # only take affixes of size n from words of length at least n + 1
        types = {re.sub(r'\d', '9', token.lower())
                 for sent in self.sentences
                 for token, _ in sent}

        for length in range(1, max_size + 1):
            if affix_type == 'suffix':
                c = Counter(type_[-length:] for type_ in types
                            if len(type_) > length)
            else:
                c = Counter(type_[:length] for type_ in types
                            if len(type_) > length)
            affixes_this_length = [affix for affix in c
                                   if c[affix] >= min_occurrences]
            affixes_all_lengths.extend(affixes_this_length)

        logger.info('Created a list of %d %ses.'
                    % (len(affixes_all_lengths), affix_type))

        text = '\n'.join(affixes_all_lengths)
        with open(filename, 'wb') as f:
            f.write(text.encode('utf-8'))

    @property
    def converter(self):
        """
        Return the token converter, which transforms tokens into their feature
        vector indices. If it doesn't exist, one is created.
        """
        if self._converter is None:
            self.create_converter()
        return self._converter

    @converter.setter
    def converter(self, value):
        self._converter = value

    def create_converter(self):
        """
        Sets up the token converter, which is responsible for transforming
        tokens into their feature vector indices.
        """
        def add_affix_extractors(affix):
            """
            Helper function that works for both suffixes and prefixes.
            The parameter affix should be 'suffix' or 'prefix'.
            """
            loader_function = getattr(attributes.Affix, 'load_%ses' % affix)
            loader_function(self.md)

            # deal with gaps between sizes (i.e., if there are sizes 2, 3
            # and 5)
            codes = getattr(attributes.Affix, '%s_codes' % affix)
            sizes = sorted(codes)

            getter = getattr(attributes.Affix, 'get_%s' % affix)
            for size in sizes:
                # size=size because if we don't use it, lambda sticks to the
                # last value of the loop iterator size
                def f(word, size=size):
                    return getter(re.sub(r'\d', '9', word), size)

                self.converter.add_extractor(f)

        self._converter = attributes.TokenConverter()
        self.converter.add_extractor(self.word_dict.get)
        if self.md.use_caps:
            self.converter.add_extractor(get_capitalization)
        if self.md.use_prefix:
            add_affix_extractors('prefix')
        if self.md.use_suffix:
            add_affix_extractors('suffix')
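# Minimal illustrative subclass of TaggerReader (hypothetical; the real readers
# define their own parsing logic). It mirrors the pattern used by SRLReader
# below: set the attributes the base constructor relies on, then expose `task`
# as a property.
class ToyReader(TaggerReader):

    def __init__(self, sentences, md=None):
        # sentences: list of lists of (token, tag) pairs
        self.sentences = sentences
        self.rare_tag = None
        super(ToyReader, self).__init__(md)

    @property
    def task(self):
        return 'toy'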
class SRLReader(reader.TaggerReader):

    def __init__(self, md=None, filename=None, only_boundaries=False,
                 only_classify=False, only_predicates=False):
        """
        The reader will read sentences from a given file. This file must
        be in the correct format (one token per line, columns indicating
        which tokens are predicates and their argument structure).

        :param filename: a file with CoNLL-like format data. If it is None,
            the reader will be created with no data.
        :param only_boundaries: train to identify only argument boundaries
        :param only_classify: train to classify pre-determined arguments
        :param only_predicates: train to identify only predicates
        """
        if only_boundaries:
            self.taskname = 'srl_boundary'
            self._generate_iobes_dictionary()
        elif only_classify:
            self.taskname = 'srl_classify'
        elif only_predicates:
            self.taskname = 'srl_predicates'
            self._generate_predicate_id_dictionary()
        else:
            self.taskname = 'srl'

        self.rare_tag = 'O'

        if filename is not None:
            self._read_conll(filename)
            self._clean_text()

        super(SRLReader, self).__init__(md)

    @property
    def task(self):
        """
        Abstract Base Class (ABC) attribute.
        """
        return self.taskname

    def _read_conll(self, filename):
        '''
        Read a file in CoNLL format and extract semantic role tags
        for each token.
        '''
        lines = []
        with open(filename, 'rb') as f:
            for line in f:
                line = line.decode('utf-8').strip()
                lines.append(line)

        self.sentences = []
        self.predicates = []
        tokens = []
        sent_predicates = []
        sent_tags = []
        token_number = 0

        for line in lines:
            line = line.strip()

            if line == '':
                # blank line between sentences
                if len(tokens) > 0:
                    sentence = (tokens, sent_tags)
                    self.sentences.append(sentence)
                    self.predicates.append(np.array(sent_predicates))

                tokens = []
                sent_predicates = []
                sent_tags = []
                token_number = 0
                continue

            fields = line.split()
            word = fields[ConllPos.word]
            lemma = fields[ConllPos.lemma]
            pos = fields[ConllPos.pos].lower()
            is_predicate = fields[ConllPos.pred] != '-'
            tags = fields[ConllPos.semantic_role:]

            # if this is the first token in the sentence, find out how many
            # predicates there are and initialize a tag list for each of them
            if sent_tags == []:
                expected_roles = []
                for tag in tags:
                    tag, expected_role = self._read_role(tag, 'O', True)
                    sent_tags.append([tag])
                    expected_roles.append(expected_role)
            else:
                for i, tag in enumerate(tags):
                    expected_role = expected_roles[i]
                    tag, expected_role = self._read_role(tag, expected_role,
                                                         True)
                    sent_tags[i].append(tag)
                    expected_roles[i] = expected_role

            token = attributes.Token(word, lemma, pos)
            tokens.append(token)
            if is_predicate:
                sent_predicates.append(token_number)

            token_number += 1

        if len(tokens) > 0:
            # last sentence
            sentence = (tokens, sent_tags)
            self.sentences.append(sentence)
            self.predicates.append(np.array(sent_predicates))

    @classmethod
    def _read_role(cls, role, expected_role, remove_continuation):
        """
        Reads the next semantic role from a CoNLL-style file.

        :param role: what is read from the CoNLL file (something like
            *, (A0* or *)
        :param expected_role: the expected role if a * is found
        :param remove_continuation: removes the C- from non-continuous
            arguments. C-A0 becomes A0.
        :return: a tuple (role, expected next role)
        """
        if role == '*':
            # signals continuation of the last block
            role = expected_role
        elif role == '*)':
            # finishes block
            role = expected_role
            expected_role = 'O'
        else:
            # verifies if it is a single argument
            match = re.search(r'\(([-\w]+)\*\)', role)
            if match:
                role = match.group(1)
                expected_role = 'O'
            else:
                # verifies if it opens an argument
                match = re.search(r'\(([-\w]+)\*', role)
                if match:
                    role = match.group(1)
                    expected_role = role
                else:
                    raise ValueError('Unexpected role data: %s' % role)

        if role.startswith('C-') and remove_continuation:
            # removes C-
            role = role[2:]

        return role, expected_role

    def extend(self, data):
        """
        Adds more data to the reader.

        :param data: a list of tuples in the format (tokens, tags,
            predicates), one for each sentence.
        """
        self.sentences.extend([(sent, tags) for sent, tags, _ in data])
        self.predicates.extend([np.array(preds) for _, _, preds in data])

    def load_or_create_tag_dict(self):
        """
        In the case of SRL argument classification or one-step SRL, try to
        load the tag dictionary. If the file with the tags is not present,
        a new one is created from the available sentences.

        In the case of argument detection or predicate detection, this
        function does nothing.
        """
        if self.task == 'srl_predicates' or self.task == 'srl_boundary':
            return

        # only SRL as one step uses IOB tags
        iob = self.task == 'srl'
        if os.path.isfile(self.md.paths['srl_tags']):
            self.load_tag_dict(iob=iob)
            return

        self._create_tag_dict(iob)
        logger = logging.getLogger('Logger')
        logger.info('Created SRL tag dictionary')

    def _create_tag_dict(self, iob=False):
        """
        Examine the available sentences and create a tag dictionary.

        :param iob: If True, this function will generate an entry for B-[tag]
            and one for I-[tag], except for the tag 'O'.
        """
        logger = logging.getLogger("Logger")
        tags = {tag
                for _, tag_groups in self.sentences
                for tags in tag_groups
                for tag in tags}

        # create a dictionary now even if using IOB, in order to save it in
        # a deterministic order
        self.tag_dict = {tag: code for code, tag in enumerate(tags)}
        reader.save_tag_dict(self.md.paths['srl_tags'], self.tag_dict)
        logger.debug("Saved SRL tag dictionary.")

        if not iob:
            return

        # insert I- and B- preserving the ordering
        new_dict = {}
        code = 0
        for tag in sorted(self.tag_dict, key=self.tag_dict.get):
            if tag == 'O':
                new_dict[tag] = code
            else:
                new_dict['B-%s' % tag] = code
                code += 1
                new_dict['I-%s' % tag] = code

            code += 1

        self.tag_dict = new_dict

    def load_tag_dict(self, filename=None, iob=False):
        """
        Loads the tag dictionary from the default file. The dictionary file
        should have one tag per line.

        :param iob: If True, this function will generate an entry for B-[tag]
            and one for I-[tag], except for the tag 'O'.
        """
        if filename is None:
            filename = self.md.paths['srl_tags']

        if not iob:
            super(SRLReader, self).load_tag_dict(filename)
            return

        self.tag_dict = {}
        code = 0
        with open(filename, 'rb') as f:
            for tag in f:
                tag = tag.decode('utf-8').strip()
                if tag == '':
                    continue

                if tag == 'O':
                    self.tag_dict[tag] = code
                else:
                    self.tag_dict['B-%s' % tag] = code
                    code += 1
                    self.tag_dict['I-%s' % tag] = code

                code += 1

        if 'O' not in self.tag_dict:
            self.tag_dict['O'] = code

    def _generate_iobes_dictionary(self):
        """
        Generate the reader's tag dictionary mapping the IOBES tags to
        numeric codes.
        """
        self.tag_dict = {tag: code for code, tag in enumerate('IOBES')}

    def _generate_predicate_id_dictionary(self):
        """
        Generate a tag dictionary for identifying predicates.
        It has two tags: V for predicates and O for others.
        """
        self.tag_dict = {'O': 0, 'V': 1}

    def generate_dictionary(self, dict_size=None, minimum_occurrences=2):
        """
        Generates a token dictionary based on the given sentences.

        :param dict_size: Max number of tokens to be included in the
            dictionary.
        :param minimum_occurrences: Minimum number of times that a token must
            appear in the text in order to be included in the dictionary.
        """
        logger = logging.getLogger("Logger")
        all_tokens = [token.word
                      for tokens, _ in self.sentences
                      for token in tokens]
        self.word_dict = WordDictionary(all_tokens, dict_size,
                                        minimum_occurrences)
        logger.info("Created dictionary with %d tokens" %
                    self.word_dict.num_tokens)

    def _clean_text(self):
        """
        Cleans the sentences' text, replacing numbers with a keyword,
        different kinds of quotation marks with a single one, etc.
        """
        for sent, _ in self.sentences:
            for i, token in enumerate(sent):
                new_word = utils.clean_text(token.word, correct=False)
                new_lemma = utils.clean_text(token.lemma, correct=False)
                token.word = new_word
                token.lemma = new_lemma
                sent[i] = token

    def create_converter(self):
        """
        This function overrides the TextReader's one in order to deal with
        Token objects instead of raw strings.
        """
        self.converter = attributes.TokenConverter()

        if self.md.use_lemma:
            # look up word lemmas
            word_lookup = lambda t: self.word_dict.get(t.lemma)
        else:
            # look up the word itself
            word_lookup = lambda t: self.word_dict.get(t.word)

        self.converter.add_extractor(word_lookup)

        if self.md.use_caps:
            caps_lookup = lambda t: attributes.get_capitalization(t.word)
            self.converter.add_extractor(caps_lookup)

        if self.md.use_pos:
            with open(self.md.paths['pos_tag_dict']) as f:
                pos_dict = cPickle.load(f)

            pos_def_dict = defaultdict(lambda: pos_dict['other'])
            pos_def_dict.update(pos_dict)
            pos_lookup = lambda t: pos_def_dict[t.pos]
            self.converter.add_extractor(pos_lookup)

        if self.md.use_chunk:
            with open(self.md.paths['chunk_tag_dict']) as f:
                chunk_dict = cPickle.load(f)

            chunk_def_dict = defaultdict(lambda: chunk_dict['O'])
            chunk_def_dict.update(chunk_dict)
            chunk_lookup = lambda t: chunk_def_dict[t.chunk]
            self.converter.add_extractor(chunk_lookup)

    def generate_tag_dict(self):
        """
        Generates a tag dictionary that converts the tag itself
        to an index to be used in the neural network.
        """
        self.tagset = set(tag
                          for _, props in self.sentences
                          for prop in props
                          for tag in prop)
        self.tag_dict = dict(zip(self.tagset, range(len(self.tagset))))

    def _remove_tag_names(self):
        """
        Removes the actual tag names, leaving only IOB or IOBES block
        delimiters.
        """
        for _, propositions in self.sentences:
            for tags in propositions:
                for i, tag in enumerate(tags):
                    tags[i] = tag[0]

    def _codify_sentences(self):
        """Internal helper function."""
        new_sentences = []
        self.tags = []

        for (sent, props), preds in zip(self.sentences, self.predicates):
            new_sent = []
            sentence_tags = []

            for token in sent:
                new_token = self.converter.convert(token)
                new_sent.append(new_token)

            if self.task == 'srl_predicates':
                sentence_tags = np.zeros(len(sent), np.int)
                if len(preds) > 0:
                    sentence_tags[preds] = 1
            else:
                for prop in props:
                    # for classifying arguments, leave the names; they will
                    # be changed later
                    if self.task == 'srl_classify':
                        prop_tags = prop
                    else:
                        prop_tags = np.array([self.tag_dict[tag]
                                              for tag in prop])
                    sentence_tags.append(prop_tags)

            new_sentences.append(np.array(new_sent))
            self.tags.append(sentence_tags)

        self.sentences = new_sentences
        self.codified = True

    def codify_sentences(self):
        """
        Converts each token in each sequence into indices to their feature
        vectors in feature matrices. The previous sentences as text are not
        accessible anymore. Tags are also encoded. This function takes care
        of the case of classifying pre-delimited arguments.
        """
        if self.converter is None:
            self.create_converter()

        self._codify_sentences()
        self.arg_limits = []

        if self.task == 'srl_classify':
            # generate the tags for each argument
            start = 0
            for i, propositions in enumerate(self.tags):
                new_sent_tags = []
                sent_args = []

                for prop_tags in propositions:
                    new_prop_tags = []
                    prop_args = []
                    last_tag = 'O'

                    for j, tag in enumerate(prop_tags):
                        if tag != last_tag:
                            # if we were inside an argument, it ended;
                            # we may have started a new one
                            if last_tag != 'O':
                                end = j - 1
                                prop_args.append(np.array([start, end]))

                            if tag != 'O':
                                start = j
                                new_prop_tags.append(self.tag_dict[tag])

                        last_tag = tag
                    else:
                        # after the last iteration, check the last tag
                        if last_tag != 'O':
                            end = j
                            prop_args.append(np.array([start, end]))

                    sent_args.append(np.array(prop_args))
                    new_sent_tags.append(np.array(new_prop_tags))

                self.arg_limits.append(sent_args)
                self.tags[i] = new_sent_tags

    def convert_tags(self, scheme, update_tag_dict=True,
                     only_boundaries=False):
        """
        Replaces each word label with an IOB or IOBES version, appending a
        prefix to them.

        :param scheme: IOB or IOBES (In, Other, Begin, End, Single).
        :param update_tag_dict: whether or not to update the tag dictionary
            after converting the tags.
        :param only_boundaries: if True, only leaves the IOBES tags and
            removes the actual tags. Also avoids updating the tag dict.
        """
        scheme = scheme.lower()
        if scheme not in ('iob', 'iobes'):
            raise ValueError("Unknown tagging scheme: %s" % scheme)

        for _, props in self.sentences:
            for prop in props:
                last_tag = None
                for i, tag in enumerate(prop):
                    if tag == 'O':
                        # O tag is independent from IOBES
                        last_tag = tag
                        continue

                    try:
                        next_tag = prop[i + 1]
                    except IndexError:
                        # last word already
                        next_tag = None

                    if tag != last_tag:
                        # a new block starts here
                        last_tag = tag
                        if scheme == 'iob' or next_tag == tag:
                            prop[i] = 'B-%s' % tag
                        else:
                            prop[i] = 'S-%s' % tag
                    else:
                        # the block continues
                        if scheme == 'iob' or next_tag == tag:
                            prop[i] = 'I-%s' % tag
                        else:
                            prop[i] = 'E-%s' % tag

        if only_boundaries:
            self._remove_tag_names()
        elif update_tag_dict:
            self.generate_tag_dict()
        else:
            # treat any tag not appearing in the tag dictionary as O
            actual_tagset = {tag
                             for _, props in self.sentences
                             for prop in props
                             for tag in prop}
            for tag in actual_tagset:
                if tag not in self.tag_dict:
                    self.tag_dict[tag] = self.tag_dict[self.rare_tag]
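# Worked examples (illustrative) of how _read_role interprets the CoNLL-style
# role column; the return values follow directly from the code above.
SRLReader._read_role('(A0*', 'O', True)     # -> ('A0', 'A0')  opens an A0 span
SRLReader._read_role('*', 'A0', True)       # -> ('A0', 'A0')  continues the open span
SRLReader._read_role('*)', 'A0', True)      # -> ('A0', 'O')   closes it
SRLReader._read_role('(C-A0*)', 'O', True)  # -> ('A0', 'O')   single token, C- stripped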
def convertPolyglot(polyglotFile, types_feats_file, word_dict_file):
    words, embeddings = pickle.load(open(polyglotFile, 'rb'))
    numpy.save(types_feats_file, embeddings)

    with open(word_dict_file, 'wb') as out:
        pickle.dump(WordDictionary(None, wordlist=words, variant='polyglot'), out)
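# Hypothetical usage: the function above expects a pickle holding a
# (words, embeddings) pair, which is how it unpacks the file. The paths are
# illustrative.
convertPolyglot('polyglot-en.pkl', 'types-features.npy', 'word-dict.pickle')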