def write_spans(self, label_type: str, tag_type: str, instance: NLPInstance):
    if self.is_tag():
        span_type = tag_type
    else:
        span_type = label_type
    instance.add_span(self.get_from(), self.get_to(), self.label, span_type)
    for tree in self.children:
        tree.write_spans(label_type, tag_type, instance)
def load(self, file_name: str, from_sent_nr: int, to_sent_nr: int):
    with open(file_name, encoding='UTF-8') as reader:
        token_preds = self._extract_predicates_from_string(self.tokens)
        dep_preds = self._extract_predicates_from_string(self.deps)
        span_preds = self._extract_predicates_from_string(self.spans)
        instance_nr = 0
        instance = NLPInstance()
        as_token = None
        as_dep = None
        as_span = None
        result = []  # [NLPInstance]
        rows = {}    # {str: [[str]]}
        self._init_rows(rows, token_preds, span_preds, dep_preds)
        while instance_nr < to_sent_nr:
            try:
                line = check_eof(reader.readline()).strip()
                if line.startswith('>>'):
                    # monitor.progressed(instanceNr)
                    instance_nr += 1
                    if instance_nr > from_sent_nr and instance_nr > 1:
                        self._add_edges(instance, rows, token_preds, dep_preds, span_preds)
                        result.append(instance)
                        instance = NLPInstance()
                        rows.clear()
                        self._init_rows(rows, token_preds, span_preds, dep_preds)
                elif line.startswith('>') and instance_nr > from_sent_nr:
                    pred = line[1:]
                    as_token = token_preds.get(pred)
                    as_dep = dep_preds.get(pred)
                    as_span = span_preds.get(pred)
                else:
                    line = line.strip()
                    if line != '' and instance_nr > from_sent_nr:
                        row = line.split('\t')
                        # rows maps predicate names to lists of rows, so append (not add)
                        if as_token is not None:
                            rows[as_token].append(row)
                        if as_dep is not None:
                            rows[as_dep].append(row)
                        if as_span is not None:
                            rows[as_span].append(row)
            except EOFError:
                break
        self._add_edges(instance, rows, token_preds, dep_preds, span_preds)
        result.append(instance)
    return result
def create(self, rows):
    instance = NLPInstance()
    instance.add_token().add_property('Word', '-Root-')
    for row in rows:
        instance.add_token().\
            add_property(name='Word', value=row[1]).\
            add_property(name='Index', value=row[0]).\
            add_property(name='Lemma', value=row[2]).\
            add_property(name='CPos', value=row[3]).\
            add_property(name='Pos', value=row[4]).\
            add_property(name='Feats', value=row[5])
    for row in rows:
        # dependency
        mod = int(row[0])
        try:
            instance.add_dependency(start=int(row[6]), end=mod, label=row[7], edge_type='dep')
        except (ValueError, IndexError, KeyError):
            print('Can\'t parse dependency', file=sys.stderr)
            instance.tokens[mod].add_property('DepMissing', 'missing')
    # role
    return instance
def _extract_span04_05(rows: list, column: int, field_type: str, prefix: str, instance: NLPInstance):
    begin = 0
    current_chunk = ''
    for index, row in enumerate(rows):
        chunk = row[column]
        if chunk.startswith('('):
            end = chunk.index('*')  # To get ValueError when not found instead of find's -1
            current_chunk = chunk[1:end]
            begin = index
        if chunk.endswith(')'):
            instance.add_span(begin, index, prefix + current_chunk, field_type)
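# Illustrative sketch (not part of the loader): the CoNLL-2004/2005 style bracket notation
# handled by _extract_span04_05 above, reduced to plain tuples so it can be run in isolation.
# The helper name _bracket_spans_demo and its return format are assumptions made for this example.
def _bracket_spans_demo(column_values):
    """Return (begin, end, label) tuples for one argument column, e.g. ['(A0*', '*', '*)', '(V*)']."""
    spans, begin, current_chunk = [], 0, ''
    for index, chunk in enumerate(column_values):
        if chunk.startswith('('):
            current_chunk = chunk[1:chunk.index('*')]
            begin = index
        if chunk.endswith(')'):
            spans.append((begin, index, current_chunk))
    return spans

# _bracket_spans_demo(['(A0*', '*', '*)', '(V*)']) == [(0, 2, 'A0'), (3, 3, 'V')]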
def _extract_span00_02(rows: list, column: int, field_type: str, instance: NLPInstance):
    in_chunk = False
    begin = 0
    current_chunk = ''
    for index, row in enumerate(rows):
        chunk = row[column]
        minus = chunk.find('-')
        if minus != -1:
            bio = chunk[0:minus]
            label = chunk[minus + 1:]
            if 'B' == bio:
                if in_chunk:
                    instance.add_span(begin, index - 1, current_chunk, field_type)
                begin = index
                current_chunk = label
                in_chunk = True
        elif in_chunk:
            instance.add_span(begin, index - 1, current_chunk, field_type)
            in_chunk = False
def create(self, rows):
    instance = NLPInstance()
    for index, row in enumerate(rows):
        instance.add_token().\
            add_property(name='Word', value=row[0]).\
            add_property(name='Index', value=str(index))
        instance.add_span(index, index, row[1], 'pos')
        instance.add_span(index, index, row[2], 'chunk (BIO)')
        instance.add_span(index, index, row[3], 'ner (BIO)')
    self._extract_span03(rows=rows, column=2, field_type='chunk', instance=instance)
    self._extract_span03(rows=rows, column=3, field_type='ner', instance=instance)
    return instance
def create(self, rows):
    instance = NLPInstance()
    sentence = rows[0]
    # Skip <s> and dep count
    for i in range(2, len(sentence)):
        w_t_c = sentence[i].split('|')
        instance.add_token().\
            add_property(name='Word', value=w_t_c[0]).\
            add_property(name='Tag', value=w_t_c[1]).\
            add_property(name='Category', value=w_t_c[2]).\
            add_property(name='Index', value=str(i - 1))
    # instance.add_token().add_property('Word', '-Root-')
    mod = 1
    for row in rows:
        if row[0] != '<s>' and row[0] != '<\\s>':
            # dependency
            try:
                instance.add_dependency(start=int(row[1]), end=int(row[0]),
                                        label=row[2] + '_' + row[3], edge_type='dep')
            except (ValueError, IndexError, KeyError):
                print('Can\'t parse dependency', file=sys.stderr)
                instance.tokens[mod].add_property('DepMissing', 'missing')
            mod += 1
    return instance
def create(self, rows):
    instance = NLPInstance()
    for index, row in enumerate(rows):
        instance.add_token().\
            add_property(name='Word', value=row[0]).\
            add_property(name='Index', value=str(index))
    predicate_count = 0
    for index, row in enumerate(rows):
        try:
            if row[9] != '-':
                sense = row[10] + '.' + row[9]
                instance.add_span(index, index, sense, 'sense')
                self._extract_span04_05(rows, 11 + predicate_count, 'role', sense + ':', instance)
                predicate_count += 1
        except IndexError:
            print('Can\'t parse file: not enough columns in row {0}'.format(row), file=sys.stderr)
            sys.exit(1)
    return instance
def _extract_span03(rows: list, column: int, field_type: str, instance: NLPInstance):
    in_chunk = False
    begin = 0
    current_chunk = ''
    index = 0
    for index, row in enumerate(rows):
        chunk = row[column]
        minus = chunk.find('-')
        if minus != -1:
            bio = chunk[0:minus]
            label = chunk[minus + 1:]
            if in_chunk:
                # start a new chunk and finish old one
                if 'B' == bio or 'I' == bio and label != current_chunk:
                    instance.add_span(begin, index - 1, current_chunk, field_type)
                    begin = index
                    current_chunk = label
            else:
                in_chunk = True
                begin = index
                current_chunk = label
        elif in_chunk:
            instance.add_span(begin, index - 1, current_chunk, field_type)
            in_chunk = False
    if in_chunk:
        # flush a chunk that reaches the last row: it ends at index, not index - 1
        instance.add_span(begin, index, current_chunk, field_type)
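# Illustrative sketch (not part of the loader): the BIO-to-span conversion performed by the
# extractors above, reduced to plain tuples so it can be run in isolation. The helper name
# _bio_to_spans_demo and its return format are assumptions made for this example.
def _bio_to_spans_demo(tags):
    """Return (begin, end, label) tuples for a flat list of BIO tags."""
    spans, begin, current_chunk = [], 0, None
    for index, tag in enumerate(tags):
        if '-' in tag:
            bio, label = tag.split('-', 1)
            if current_chunk is not None and (bio == 'B' or label != current_chunk):
                spans.append((begin, index - 1, current_chunk))
                begin, current_chunk = index, label
            elif current_chunk is None:
                begin, current_chunk = index, label
        elif current_chunk is not None:
            spans.append((begin, index - 1, current_chunk))
            current_chunk = None
    if current_chunk is not None:
        spans.append((begin, len(tags) - 1, current_chunk))
    return spans

# _bio_to_spans_demo(['B-NP', 'I-NP', 'O', 'B-VP']) == [(0, 1, 'NP'), (3, 3, 'VP')]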
def create_open(rows):
    instance = NLPInstance()
    instance.add_token()
    for row in rows:
        instance.add_token(). \
            add_property('Named Entity', row[0], 10). \
            add_property('NamedEntity BBN', row[1], 11). \
            add_property('WordNet', row[2], 12)
    for index, row in enumerate(rows, start=1):
        # dependency
        instance.add_edge(start=int(row[3]), end=index, label=row[4], edge_type='malt')
    return instance
def create(self, rows):
    instance = NLPInstance()
    instance.add_token().add_property(name='Word', value='-Root-')
    for index, row in enumerate(rows, start=1):
        instance.add_token().\
            add_property(name='Word', value=row[0]).\
            add_property(name='Index', value=str(index)).\
            add_property(name='Pos', value=row[1])
    for mod, row in enumerate(rows, start=1):
        # dependency
        try:
            instance.add_dependency(start=int(row[2]), end=mod, label=row[3], edge_type='dep')
        except (ValueError, IndexError, KeyError):
            print('Can\'t parse dependency', file=sys.stderr)
            instance.tokens[mod].add_property('DepMissing', 'missing')
    # role
    return instance
def load(self, file_name: str, _, __):
    """Load a corpus from a Gale-style alignment file.

    The start and end instance indices are ignored by this loader: every
    segment in the file is read and converted into an NLPInstance.

    Args:
        file_name (str): The file to load the corpus from.

    Returns:
        list: A list of NLPInstance objects loaded from the given file.

    Raises:
        OSError: If I/O goes wrong.
    """
    result = []
    with open(file_name, encoding='UTF-8') as reader:
        instance = None
        source_length = -1
        target_length = -1
        for line in reader:
            line = line.strip()
            if line.startswith('<source>'):
                content = line[8: len(line) - 9]  # strip <source> and </source>
                for token in content.split():
                    instance.add_token().add_property('word', token)
                source_length = len(instance.tokens)
                instance.split_point = source_length
            elif line.startswith('<seg'):
                instance = NLPInstance(render_type=RenderType.alignment)
            elif line.startswith('<translation>'):
                content = line[13: len(line) - 14]  # strip <translation> and </translation>
                for token in content.split():
                    instance.add_token().add_property('word', token)
                target_length = len(instance.tokens) - source_length
            elif line.startswith('<matrix>'):
                check_eof(reader.readline())  # skip the header row of the matrix
                for tgt in range(target_length):
                    line = check_eof(reader.readline()).strip()
                    col = line.split()
                    for src in range(1, len(col)):  # column 0 is skipped
                        if col[src] == '1':
                            instance.add_edge(src - 1, tgt + source_length, 'align', 'align')
                result.append(instance)
    return result
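# For reference, a sketch of the segment layout this loader expects, reconstructed from the
# parsing code above (the attribute on <seg>, the closing tags and the exact content of the
# skipped matrix header row are assumptions):
#
#   <seg id="1">
#   <source>der kleine Hund</source>
#   <translation>the small dog</translation>
#   <matrix>
#   <one header row, skipped by the loader>
#   0 1 0 0
#   0 0 1 0
#   0 0 0 1
#   </matrix>
#
# Each matrix row corresponds to one target token; column 0 is skipped, and a '1' in
# column j aligns source token j-1 to that target token.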
def load(self, file_name: str, from_sent_nr: int, to_sent_nr: int):
    result = []
    instance_nr = 0
    with open(file_name, encoding='UTF-8') as reader:
        for line in reader:
            line = line.strip()
            if line != '':
                if instance_nr >= from_sent_nr:
                    tree = Tree('[root]')
                    tree.consume(tree, line)
                    tree = tree.children[0]
                    instance = NLPInstance()
                    tree.write_tokens(self.word, self.tag, instance)
                    tree.write_spans(self.phrase, self.tag, instance)
                    result.append(instance)
                instance_nr += 1
                if instance_nr >= to_sent_nr:
                    break
    return result
def create(self, rows):
    instance = NLPInstance()
    for index, row in enumerate(rows):
        instance.add_token().\
            add_property(name='Word', value=row[0]).\
            add_property(name='Index', value=str(index))
    predicate_count = 0
    for index, row in enumerate(rows):
        if row[1] != '-':
            sense = row[1]
            instance.add_span(index, index, sense, 'sense')
            self._extract_span04_05(rows, 2 + predicate_count, 'role', sense + ':', instance)
            predicate_count += 1
    return instance
def filter(self, original: NLPInstance) -> NLPInstance:
    """Filter an NLP instance.

    Filters the tokens and then removes edges that have tokens which were filtered out.
    If self.collapse is True, filters the edges first and then filters out tokens that
    are not covered by any remaining edge.
    Filters out all edges that don't have an allowed prefix and postfix type.
    Filters out all edges that don't have a label that contains one of the allowed label
    substrings. If the set of allowed substrings is empty then the original set of edges
    is returned as is.

    Note on types:
        tokens ({Token})
        old2new ({Token: Token})
        new2old ({Token: Token})
        updated_tokens ([Token])
        updated_edges ({Edge})
        updated_split_points ([int])

    Args:
        original (NLPInstance): The original NLP instance.

    Returns:
        NLPInstance: The filtered NLPInstance.
    """
    # Filter edges by connecting token properties, edge label, edge type, edge property
    edges = {edge for edge in original.get_edges() if self._is_edge_allowed(edge)}

    # Only allow edges on the path of tokens having allowed props
    if self.use_path:
        edges = self._calculate_paths(edges)

    # Unless collapse is True, all tokens are shown!
    tokens = original.tokens
    # Filter tokens for edges
    if self.collapse:
        # Collapse tokens to the allowed edges
        tokens = set()
        for edge in edges:
            if edge.render_type == EdgeRenderType.dependency:
                tokens.add(edge.start)
                tokens.add(edge.end)
            elif edge.render_type == EdgeRenderType.span:
                for i in range(edge.start.index, edge.end.index + 1):
                    tokens.add(original.get_token(i))

    # Token filter: reduce the list of tokens to explicitly allowed ones (or keep all remaining)
    tokens = {token for token in tokens
              if self._token_has_allowed_prop(token, self.tok_allowed_token_propvals,
                                              self.tok_propvals_whole_word)}

    # XXX Why do we need to create new tokens?
    # Compute bidirectional mapping between the new and old indexes and create new tokens
    old2new, new2old, updated_tokens = {}, {}, []
    for i, token in enumerate(sorted(tokens, key=attrgetter('index'))):  # This should be the non-capital index!
        new_tok = Token(i)
        new_tok.merge(original.tokens[token.index],
                      forbidden_token_properties=self.forbidden_token_properties)
        old2new[token] = new_tok
        new2old[new_tok] = token
        updated_tokens.append(new_tok)

    # XXX Why do we need to create new edges?
    # Update edges and remove those that have vertices not in the new vertex set
    updated_edges = set()
    for edge in (e for e in edges if e.start in old2new and e.end in old2new):
        updated_edges.add(Edge(start=old2new[edge.start], end=old2new[edge.end],
                               label=edge.label, note=edge.note,
                               edge_type=edge.edge_type, render_type=edge.render_type,
                               description=edge.description, properties=edge.properties))

    # Find new split points (have to be changed because the instance has a new token sequence)
    new_token_index = 0
    old_split_point = original.split_point
    new_tok = updated_tokens[new_token_index]
    old_token = new2old[new_tok]
    max_index_of_updated_tokens = len(updated_tokens) - 1
    while new_token_index < max_index_of_updated_tokens and old_token.index < old_split_point:
        new_token_index += 1
        new_tok = updated_tokens[new_token_index]
        old_token = new2old[new_tok]
    updated_split_point = new_token_index

    return NLPInstance(tokens=updated_tokens, edges=updated_edges,
                       render_type=original.render_type, split_point=updated_split_point)
def create(self, rows):
    instance = NLPInstance()
    instance.add_token().add_property(name='Word', value='-Root-')
    predicates = []
    for row in rows:
        instance.add_token().\
            add_property(name='Word', value=row[1]).\
            add_property(name='Index', value=row[0]).\
            add_property(name='Lemma', value=row[2]).\
            add_property(name='PLemma', value=row[3]).\
            add_property(name='PoS', value=row[4]).\
            add_property(name='PPoS', value=row[5]).\
            add_property(name='Feat', value=row[6]).\
            add_property(name='PFeat', value=row[7])
        if row[13] != '_':
            index = int(row[0])
            predicates.append(index)
            instance.add_span(index, index, row[13], 'sense')
    for row in rows:
        # dependency
        if row[8] != '_':
            instance.add_dependency(start=int(row[8]), end=int(row[0]), label=row[10],
                                    edge_type='dep')
        if row[9] != '_':
            instance.add_dependency(start=int(row[9]), end=int(row[0]), label=row[11],
                                    edge_type='pdep')
        # role
        for col in range(14, len(row)):
            label = row[col]
            if label != '_':
                pred = predicates[col - 14]
                arg = int(row[0])
                # if arg != pred:
                instance.add_dependency(start=pred, end=arg, label=label, edge_type='role')
    return instance
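# For reference, the CoNLL-2009 column layout assumed by the 0-based row indices above:
#   row[0] ID, row[1] FORM, row[2] LEMMA, row[3] PLEMMA, row[4] POS, row[5] PPOS,
#   row[6] FEAT, row[7] PFEAT, row[8] HEAD, row[9] PHEAD, row[10] DEPREL, row[11] PDEPREL,
#   row[12] FILLPRED, row[13] PRED, row[14:] one APRED column per predicate.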
def create(self, rows):
    instance = NLPInstance()
    instance.add_token().add_property('Word', '-Root-')
    predicates = []  # [int]
    for row in rows:
        instance.add_token().\
            add_property(name='Word', value=row[1]).\
            add_property(name='Index', value=row[0]).\
            add_property(name='Lemma', value=row[2]).\
            add_property(name='Pos', value=row[3]).\
            add_property(name='Split Form', value=row[5]).\
            add_property(name='Split Lemma', value=row[6]).\
            add_property(name='Split PoS', value=row[7])
        if row[10] != '_':
            index = int(row[0])
            predicates.append(index)
            instance.add_span(index, index, row[10], 'sense')
    for row in rows:
        # dependency
        if row[8] != '_':
            instance.add_dependency(int(row[8]), int(row[0]), row[9], 'dep')
        # role
        for col in range(11, len(row)):
            label = row[col]
            if label != '_':
                pred = predicates[col - 11]
                arg = int(row[0])
                # if arg != pred:
                instance.add_edge(start=pred, end=arg, label=label, edge_type='role')
    return instance
def load(self, file_name: str, from_sentence_nr: int, to_sentence_nr: int):
    with open(file_name, encoding='UTF-8') as reader:
        # Skip past the aligned segment pairs that come before from_sentence_nr.
        # There are three lines per segment pair.
        for _ in range(3 * from_sentence_nr):
            try:
                check_eof(reader.readline())
            except EOFError:
                break
        result = []  # [NLPInstance]
        for i in range(from_sentence_nr, to_sentence_nr):
            try:
                # Load the next aligned segment pair, or stop if we're already at the end of
                # the reader. There are three lines per segment pair. The first line gives the
                # segment index, source and target lengths (which we can count ourselves), and
                # an alignment score. Skip this line (or stop if there are no more lines).
                check_eof(reader.readline())
                tokens = []
                # a list of one-based (source-token-index, target-token-index) pairs
                alignment_edges = []  # [(int, int)]

                # The second line contains the source segment, tokenized, with no adornment.
                tokens.append(check_eof(reader.readline()).strip().split())
                tokens.append([])

                # The third line contains the tokens of the target segment, starting with the
                # pseudo-token "NULL", with each token followed by a whitespace-delimited list
                # (in curly braces nested in parentheses) of the 1-based indices of the source
                # tokens aligned to it, e.g.:
                #
                #   NULL ({ 2 }) customization ({ 1 }) of ({ }) tasks ({ 3 4 })

                # Strip the newline and trailing space and reappend a space for later splitting
                line = check_eof(reader.readline()).rstrip() + ' '
                # start from index 1 to skip the NULL token and the empty string at the EOL
                for ind, token_with_aligned_indices in enumerate(line.split(' }) ')[1:-1], start=1):
                    splitted1, splitted2 = token_with_aligned_indices.split(' ({')
                    tokens[1].append(splitted1)
                    aligned_index_list_as_string = splitted2.strip()

                    # We need to handle the empty list specially, because splitting the empty
                    # string returns a singleton list containing the empty string, but here an
                    # empty list is what we want.
                    aligned_indices_as_strings = []
                    if len(aligned_index_list_as_string) > 0:
                        aligned_indices_as_strings = aligned_index_list_as_string.split(' ')

                    for aligned_index_as_string in aligned_indices_as_strings:
                        alignment_edges.append((int(aligned_index_as_string), ind))

                # now we're ready to make the NLPInstance
                instance = NLPInstance(render_type=RenderType.alignment)
                if self._reverseCheckBox:
                    self.make_instance(instance, tokens[1], tokens[0],
                                       ((e2, e1) for e1, e2 in alignment_edges))
                else:
                    self.make_instance(instance, tokens[0], tokens[1], alignment_edges)
                result.append(instance)
            except EOFError:
                break
    return result
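# Illustrative sketch (not part of the loader): how the third line of a segment pair is taken
# apart above, reduced to plain lists and tuples so it can be run in isolation. The helper name
# _parse_giza_target_demo is an assumption made for this example.
def _parse_giza_target_demo(line):
    """Return (target_tokens, [(source_index, target_index), ...]) for one target line."""
    tokens, edges = [], []
    # reappend a trailing space so the final ' }) ' separator is matched, as the loader does
    for ind, chunk in enumerate((line.rstrip() + ' ').split(' }) ')[1:-1], start=1):
        token, indices = chunk.split(' ({')
        tokens.append(token)
        for src in indices.split():
            edges.append((int(src), ind))
    return tokens, edges

# _parse_giza_target_demo('NULL ({ 2 }) customization ({ 1 }) of ({ }) tasks ({ 3 4 })')
# == (['customization', 'of', 'tasks'], [(1, 1), (3, 3), (4, 3)])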
def write_tokens(self, word_type: str, tag_type: str, instance: NLPInstance):
    instance.add_token().add_property(word_type, self.label).add_property('Index', str(self.index))
def update_canvas(self, curr_sent_index: int):
    """Update the canvas based on the current state of the navigator."""
    if self._selected_gold is not None:
        if self._selected_guess is not None:
            instance = nlp_diff(self._gold_corpora[self._selected_gold][curr_sent_index],
                                self._guess_corpora[self._selected_guess][curr_sent_index],
                                'eval_status_Match', 'eval_status_FN', 'eval_status_FP')
        else:
            instance = self._gold_corpora[self._selected_gold][curr_sent_index]
        self.canvas.set_nlp_instance(instance)
    else:
        example = NLPInstance()
        example.add_token().add_property('Word', '[root]').add_property('Index', '0')
        example.add_token().add_property('Word', 'Add').add_property('Index', '1')
        example.add_token().add_property('Word', 'a').add_property('Index', '2')
        example.add_token().add_property('Word', 'gold').add_property('Index', '3')
        example.add_token().add_property('Word', 'corpus').add_property('Index', '4')
        example.add_token().add_property('Word', '!').add_property('Index', '5')
        example.add_dependency(0, 1, 'ROOT', 'dep')
        example.add_dependency(0, 5, 'PUNC', 'dep')
        example.add_dependency(1, 4, 'OBJ', 'dep')
        example.add_dependency(4, 2, 'DET', 'dep')
        example.add_dependency(4, 3, 'MOD', 'dep')
        example.add_dependency(1, 4, 'A1', 'role')
        self.canvas.set_nlp_instance(example)
        self.canvas.filter.allowed_edge_types = set()
        self.canvas.filter.allowed_edge_types.add('dep')
        self.canvas.filter.allowed_edge_types.add('role')
        self.canvas.filter.allowed_edge_types.add('sense')
        self.canvas.filter.allowed_edge_types.add('ner')
        self.canvas.filter.allowed_edge_types.add('chunk')
        self.canvas.filter.allowed_edge_types.add('pos')
        self.canvas.filter.allowed_edge_types.add('align')
        self.canvas.filter.allowed_edge_properties.add('eval_status_FP')
        self.canvas.filter.allowed_edge_properties.add('eval_status_FN')
        self.canvas.filter.allowed_edge_properties.add('eval_status_Match')
    self.canvas.renderer.params['span.orders'] = {'pos': 0,
                                                  'chunk (BIO)': 1,
                                                  'chunk': 2,
                                                  'ner (BIO)': 2,
                                                  'ner': 3,
                                                  'sense': 4,
                                                  'role': 5,
                                                  'phrase': 5}
    self.canvas.fire_instance_changed()
def load(self, file_name: str, _, __):
    """Load files from the given directory with the extensions specified by the text fields of the accessory.

    One NLPInstance is created per text file; the start and end instance indices are
    ignored by this loader.

    Args:
        file_name (str): The directory to load the corpus from.

    Returns:
        list: A list of NLP instances loaded from the given directory.

    Raises:
        OSError: If I/O goes wrong.
    """
    result = []
    for txt_file_name in glob.glob(os.path.join(file_name, '*.' + self.txtExtensionField.strip())):
        filename = os.path.abspath(txt_file_name)
        prefix = filename.rsplit('.', maxsplit=1)[0]
        protein_file_name = '{0}.{1}'.format(prefix, self.proteinExtensionField.strip())
        event_file_name = '{0}.{1}'.format(prefix, self.eventExtensionField.strip())
        if os.path.exists(protein_file_name) and os.path.exists(event_file_name):
            # Load all annotations in the specified files and create one NLPInstance that
            # represents the text (txt_file_name), the protein annotations
            # (protein_file_name) and the event annotations (event_file_name).
            char_to_token = {}
            instance = NLPInstance()
            with open(txt_file_name, encoding='UTF-8') as reader:
                current_token = instance.add_token()
                current_token_content = ''
                for current_index, character in enumerate(iter(functools.partial(reader.read, 1), '')):
                    char_to_token[current_index] = current_token
                    if character == ' ' or character == '\n':
                        if len(current_token_content) > 0:
                            current_token.add_property('Word', current_token_content)
                            current_token.add_property('Index', str(len(instance.tokens) - 1))
                            current_token_content = ''
                            current_token = instance.add_token()
                    else:
                        current_token_content += character
            id2token = {}
            with open(protein_file_name, encoding='UTF-8') as reader:
                for line in reader.readlines():
                    split = line.strip().split()
                    if split[0].startswith('T'):
                        elem_id = split[0]
                        elem_type = split[1]
                        elem_from = int(split[2])
                        elem_to = int(split[3])
                        from_token = char_to_token[elem_from]
                        to_token = char_to_token[elem_to]
                        instance.add_edge(from_token.index, to_token.index, elem_type, 'protein',
                                          EdgeRenderType.span)
                        id2token[elem_id] = to_token
            with open(event_file_name, encoding='UTF-8') as reader:
                # get event mentions and locations etc.
                for line in reader.readlines():
                    split = line.strip().split()
                    elem_id = split[0]
                    if elem_id.startswith('T'):
                        elem_type = split[1]
                        elem_from = int(split[2])
                        elem_to = int(split[3])
                        from_token = char_to_token[elem_from]
                        to_token = char_to_token[elem_to]
                        if elem_type == 'Entity':
                            term_class = 'entity'
                        else:
                            term_class = 'event'
                        instance.add_edge(from_token.index, to_token.index, elem_type, term_class,
                                          EdgeRenderType.span)
                        id2token[elem_id] = to_token
                    elif elem_id.startswith('E'):
                        type_and_mention_id = split[1].split(':')
                        event_token = id2token[type_and_mention_id[1]]
                        id2token[elem_id] = event_token
            with open(event_file_name, encoding='UTF-8') as reader:
                # now create the event roles
                for line in reader.readlines():
                    split = line.split()
                    elem_id = split[0]
                    if elem_id.startswith('E'):
                        event_token = id2token[elem_id]
                        for elem in split[2:]:
                            role_and_id = elem.split(':')
                            arg_token = id2token.get(role_and_id[1])
                            if arg_token is None:
                                raise RuntimeError(
                                    'There seems to be no mention associated with id {0} for event {1} in'
                                    ' file {2}'.format(role_and_id[1], elem_id, event_file_name))
                            instance.add_edge(event_token.index, arg_token.index, role_and_id[0], 'role',
                                              EdgeRenderType.dependency, note=elem_id)
            result.append(instance)
    return result
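# For reference, a sketch of the annotation lines this loader expects, in the style of the
# BioNLP'09 shared task format (the concrete ids, offsets and strings below are made up):
#
#   protein file:   T1  Protein 0 5  BMP-6
#   event file:     T9  Gene_expression 17 27  expression
#                   E1  Gene_expression:T9 Theme:T1
#
# 'T' lines are text-bound mentions (id, type, character offsets, surface string); 'E' lines
# anchor an event to its trigger mention (Type:Tid) followed by Role:Id argument pairs, which
# the loops above turn into span and role edges respectively.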