def _parse_token(self, line: str) -> Token:
    """Parse one data line into a Token, attaching all mapped column labels.

    The line is split on ``self.column_delimiter``; the text column becomes
    the token surface form, every other mapped column becomes a label, and
    a '-' in the space-after column clears ``whitespace_after``.
    """
    columns: List[str] = re.split(self.column_delimiter, line)
    token = Token(columns[self.text_column])

    for column_idx, label_type in self.column_name_map.items():
        # a line may carry fewer columns than the map declares; skip those
        if column_idx >= len(columns):
            continue
        value = columns[column_idx]
        is_space_after_column = label_type == self.SPACE_AFTER_KEY
        if column_idx != self.text_column and not is_space_after_column:
            token.add_label(label_type, value)
        if is_space_after_column and value == '-':
            token.whitespace_after = False

    return token
def __getitem__(self, index: int = 0) -> Sentence:
    """Return the sentence at ``index``.

    In-memory mode serves the pre-parsed Sentence; otherwise the stored
    byte offset is sought and the sentence is re-parsed from disk, with
    space inference and (optionally) BIOES tag-scheme conversion applied.
    """
    if self.in_memory:
        sentence = self.sentences[index]
    else:
        with open(str(self.path_to_column_file), encoding=self.encoding) as file:
            file.seek(self.indices[index])
            line = file.readline()
            sentence: Sentence = Sentence()
            while line:
                # skip comment lines entirely
                if self.comment_symbol is not None and line.startswith(self.comment_symbol):
                    line = file.readline()
                    continue

                if self.__line_completes_sentence(line):
                    if len(sentence) > 0:
                        sentence.infer_space_after()
                        if self.tag_to_bioes is not None:
                            sentence.convert_tag_scheme(
                                tag_type=self.tag_to_bioes, target_scheme="iobes"
                            )
                        return sentence
                else:
                    # BUGFIX: delegate to _parse_token instead of duplicating
                    # the parse inline — the old inline code added the
                    # space-after column as a regular label and never set
                    # token.whitespace_after, diverging from the in-memory path.
                    token = self._parse_token(line)
                    if not line.isspace():
                        sentence.add_token(token)

                line = file.readline()
    return sentence
def token_list_to_sentence(self, token_list: conllu.TokenList) -> Sentence:
    """Convert a conllu TokenList into a flair Sentence.

    Copies ner/ner-2/lemma token annotations, honors ``SpaceAfter=No``,
    transfers sentence-level metadata (sentence_id, relations), and promotes
    every "ner*" token layer to sentence-level span labels.
    """
    sentence: Sentence = Sentence()

    # build tokens with their per-token annotation layers
    for raw_token in token_list:
        token = Token(raw_token["form"])

        for layer in ("ner", "ner-2", "lemma"):
            if layer in raw_token:
                token.add_label(layer, raw_token[layer])

        if "misc" in raw_token and raw_token["misc"] is not None:
            if raw_token["misc"].get("SpaceAfter") == "No":
                token.whitespace_after = False

        sentence.add_token(token)

    metadata = token_list.metadata

    if "sentence_id" in metadata:
        sentence.add_label("sentence_id", metadata["sentence_id"])

    if "relations" in metadata:
        for head_start, head_end, tail_start, tail_end, label in metadata["relations"]:
            # head and tail span indices are 1-indexed and end index is inclusive
            head = Span(sentence.tokens[head_start - 1 : head_end])
            tail = Span(sentence.tokens[tail_start - 1 : tail_end])
            sentence.add_complex_label("relation", RelationLabel(value=label, head=head, tail=tail))

    # determine all NER label types in sentence and add all NER spans as sentence-level labels
    ner_layers: List[str] = []
    for tok in sentence.tokens:
        for layer_name in tok.annotation_layers:
            if layer_name.startswith("ner") and layer_name not in ner_layers:
                ner_layers.append(layer_name)

    for layer_name in ner_layers:
        for span in sentence.get_spans(layer_name):
            sentence.add_complex_label(
                "entity",
                label=SpanLabel(span=span, value=span.tag, score=span.score),
            )

    return sentence
def __init__(self, path_to_conll_file: Union[str, Path], in_memory: bool = True):
    """
    Instantiates a column dataset in CoNLL-U format.
    :param path_to_conll_file: Path to the CoNLL-U formatted file
    :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
    """
    if type(path_to_conll_file) is str:
        path_to_conll_file = Path(path_to_conll_file)
    assert path_to_conll_file.exists()

    self.in_memory = in_memory
    self.path_to_conll_file = path_to_conll_file
    self.total_sentence_count: int = 0

    # in-memory mode stores parsed Sentence objects; otherwise only byte
    # offsets into the file are kept and sentences are re-parsed on access
    if self.in_memory:
        self.sentences: List[Sentence] = []
    else:
        self.indices: List[int] = []

    with open(str(self.path_to_conll_file), encoding="utf-8") as file:
        line = file.readline()
        position = 0  # byte offset of the start of the current sentence
        sentence: Sentence = Sentence()
        while line:
            line = line.strip()
            fields: List[str] = re.split("\t+", line)
            if line == "":
                # blank line terminates the current sentence
                if len(sentence) > 0:
                    self.total_sentence_count += 1
                    if self.in_memory:
                        self.sentences.append(sentence)
                    else:
                        self.indices.append(position)
                        position = file.tell()
                sentence: Sentence = Sentence()
            elif line.startswith("#"):
                # comment / metadata lines are skipped
                line = file.readline()
                continue
            elif "." in fields[0]:
                # decimal IDs mark empty nodes (enhanced dependencies) — skipped
                line = file.readline()
                continue
            elif "-" in fields[0]:
                # range IDs mark multi-word tokens — skipped
                line = file.readline()
                continue
            else:
                # regular token line: FORM with HEAD, plus lemma/upos/pos/deprel labels
                token = Token(fields[1], head_id=int(fields[6]))
                token.add_label("lemma", str(fields[2]))
                token.add_label("upos", str(fields[3]))
                token.add_label("pos", str(fields[4]))
                token.add_label("dependency", str(fields[7]))

                if len(fields) > 9 and 'SpaceAfter=No' in fields[9]:
                    token.whitespace_after = False

                # FEATS column: pipe-separated key=value morphological features
                for morph in str(fields[5]).split("|"):
                    if "=" not in morph:
                        continue
                    token.add_label(morph.split("=")[0].lower(), morph.split("=")[1])

                # columns 10/11 look like a frame annotation extension — TODO confirm format
                if len(fields) > 10 and str(fields[10]) == "Y":
                    token.add_label("frame", str(fields[11]))

                sentence.add_token(token)
            line = file.readline()

        # flush a trailing sentence not followed by a blank line
        if len(sentence.tokens) > 0:
            self.total_sentence_count += 1
            if self.in_memory:
                self.sentences.append(sentence)
            else:
                self.indices.append(position)
def __getitem__(self, index: int = 0) -> Sentence:
    """Return the sentence at the given index.

    In-memory mode serves the cached Sentence; otherwise the stored byte
    offset is sought and the CoNLL-U lines are re-parsed from disk.
    """
    if self.in_memory:
        return self.sentences[index]

    with open(str(self.path_to_conll_file), encoding="utf-8") as conll_file:
        conll_file.seek(self.indices[index])
        sentence: Sentence = Sentence()
        raw = conll_file.readline()
        while raw:
            raw = raw.strip()
            columns: List[str] = re.split("\t+", raw)

            if raw == "":
                # blank line: sentence is complete (unless nothing parsed yet)
                if len(sentence) > 0:
                    break
            elif raw.startswith("#") or "." in columns[0] or "-" in columns[0]:
                # comments, empty nodes (decimal IDs) and multi-word
                # ranges (hyphenated IDs) are all skipped
                raw = conll_file.readline()
                continue
            else:
                token = Token(columns[1], head_id=int(columns[6]))
                token.add_label("lemma", str(columns[2]))
                token.add_label("upos", str(columns[3]))
                token.add_label("pos", str(columns[4]))
                token.add_label("dependency", str(columns[7]))

                if len(columns) > 9 and 'SpaceAfter=No' in columns[9]:
                    token.whitespace_after = False

                # FEATS column: pipe-separated key=value morphological features
                for feature in str(columns[5]).split("|"):
                    if "=" not in feature:
                        continue
                    parts = feature.split("=")
                    token.add_label(parts[0].lower(), parts[1])

                if len(columns) > 10 and str(columns[10]) == "Y":
                    token.add_label("frame", str(columns[11]))

                sentence.add_token(token)

            raw = conll_file.readline()

    return sentence
def __init__(
    self,
    path_to_column_file: Path,
    column_name_map: Dict[int, str],
    tag_to_bioes: str = None,
    column_delimiter: str = r"\s+",
    comment_symbol: str = None,
    in_memory: bool = True,
    document_separator_token: str = None,
    encoding: str = "utf-8",
    skip_first_line: bool = False,
):
    r"""
    Instantiates a column dataset (typically used for sequence labeling or word-level prediction).

    :param path_to_column_file: path to the file with the column-formatted data
    :param column_name_map: a map specifying the column format
    :param tag_to_bioes: whether to convert to BIOES tagging scheme
    :param column_delimiter: default is to split on any separator, but you can overwrite
        for instance with "\t" to split only on tabs
    :param comment_symbol: if set, lines that begin with this symbol are treated as comments
    :param in_memory: If set to True, the dataset is kept in memory as Sentence objects,
        otherwise does disk reads
    :param document_separator_token: If provided, multiple sentences are read into one object.
        Provide the string token that indicates that a new document begins
    :param skip_first_line: set to True if your dataset has a header line
    """
    assert path_to_column_file.exists()
    self.path_to_column_file = path_to_column_file
    self.tag_to_bioes = tag_to_bioes
    self.column_name_map = column_name_map
    self.column_delimiter = column_delimiter
    self.comment_symbol = comment_symbol
    self.document_separator_token = document_separator_token

    # store either Sentence objects in memory, or only file offsets
    self.in_memory = in_memory
    if self.in_memory:
        self.sentences: List[Sentence] = []
    else:
        self.indices: List[int] = []

    self.total_sentence_count: int = 0

    # most data sets have the token text in the first column, if not, pass 'text' as column
    self.text_column: int = 0
    for column in self.column_name_map:
        if column_name_map[column] == "text":
            self.text_column = column

    # determine encoding of text file
    self.encoding = encoding

    sentence: Sentence = Sentence()
    sentence_started: bool = False
    with open(str(self.path_to_column_file), encoding=self.encoding) as f:

        # skip first line if selected (e.g. a header line)
        if skip_first_line:
            f.readline()

        line = f.readline()
        position = 0  # byte offset of the start of the current sentence

        while line:
            # comment lines are skipped entirely
            if self.comment_symbol is not None and line.startswith(comment_symbol):
                line = f.readline()
                continue

            if self.__line_completes_sentence(line):
                # separator line: finalize the sentence collected so far
                if sentence_started:
                    sentence.infer_space_after()
                    if self.in_memory:
                        if self.tag_to_bioes is not None:
                            sentence.convert_tag_scheme(
                                tag_type=self.tag_to_bioes, target_scheme="iobes"
                            )
                        self.sentences.append(sentence)
                    else:
                        self.indices.append(position)
                        position = f.tell()
                    self.total_sentence_count += 1
                sentence: Sentence = Sentence()
                sentence_started = False

            elif self.in_memory:
                # BUGFIX: delegate to _parse_token instead of duplicating the
                # parse inline — the old inline code added the space-after
                # column as a regular label and never set token.whitespace_after.
                token = self._parse_token(line)
                if not line.isspace():
                    sentence.add_token(token)
                    sentence_started = True

            elif not line.isspace():
                # offset-only mode: just track that a sentence has content
                sentence_started = True

            line = f.readline()

    # flush the last sentence if the file does not end with a separator line
    if sentence_started:
        sentence.infer_space_after()
        if self.in_memory:
            self.sentences.append(sentence)
        else:
            self.indices.append(position)
        self.total_sentence_count += 1
def token_list_to_sentence(self, token_list: conllu.TokenList) -> Sentence:
    """Convert a parsed conllu TokenList into a flair Sentence.

    Copies the configured token-level annotation fields onto each Token
    (dict-valued fields contribute one label per key), honors
    ``SpaceAfter=No``, transfers sentence-level metadata (sentence_id,
    relations), and promotes every "ner*" token layer to span labels.
    """
    sentence: Sentence = Sentence()

    # build tokens and attach their annotation layers
    for raw_token in token_list:
        token = Token(raw_token["form"])

        for field in self.token_annotation_fields:
            field_value: Any = raw_token[field]
            if isinstance(field_value, dict):
                # key-value annotation fields: key becomes the label type,
                # value becomes the label value
                for key, value in field_value.items():
                    token.add_label(typename=key, value=str(value))
            else:
                token.add_label(typename=field, value=str(field_value))

        if raw_token.get("misc") is not None:
            if raw_token["misc"].get("SpaceAfter") == "No":
                token.whitespace_after = False

        sentence.add_token(token)

    metadata = token_list.metadata

    if "sentence_id" in metadata:
        sentence.add_label("sentence_id", metadata["sentence_id"])

    if "relations" in metadata:
        for head_start, head_end, tail_start, tail_end, label in metadata["relations"]:
            # head and tail span indices are 1-indexed and end index is inclusive
            head = Span(sentence.tokens[head_start - 1 : head_end])
            tail = Span(sentence.tokens[tail_start - 1 : tail_end])
            sentence.add_complex_label(
                "relation", RelationLabel(value=label, head=head, tail=tail)
            )

    # determine all NER label types in sentence and add all NER spans as sentence-level labels
    ner_layers: List[str] = []
    for tok in sentence.tokens:
        for layer in tok.annotation_layers:
            if layer.startswith("ner") and layer not in ner_layers:
                ner_layers.append(layer)

    for layer in ner_layers:
        for span in sentence.get_spans(layer):
            sentence.add_complex_label(
                "entity",
                label=SpanLabel(span=span, value=span.tag, score=span.score),
            )

    return sentence