def custom_tokenizer(text: str) -> List[Token]:
    # Treat the whole input string as a single token at index 0.
    return [Token(text, 0)]
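# A minimal usage sketch (not from the original code): this tokenizer keeps the
# whole input string as a single Token, which is useful when the text is already
# one unit (e.g. a label or an ID) and should not be split further.
if __name__ == "__main__":
    toks = custom_tokenizer("New York City")
    assert len(toks) == 1 and toks[0].text == "New York City"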
def __init__(self, path_to_conll_file: Union[str, Path], in_memory: bool = True):
    """
    Instantiates a column dataset in CoNLL-U format.

    :param path_to_conll_file: Path to the CoNLL-U formatted file
    :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
    """
    if type(path_to_conll_file) is str:
        path_to_conll_file = Path(path_to_conll_file)
    assert path_to_conll_file.exists()

    self.in_memory = in_memory
    self.path_to_conll_file = path_to_conll_file
    self.total_sentence_count: int = 0

    if self.in_memory:
        self.sentences: List[Sentence] = []
    else:
        self.indices: List[int] = []

    with open(str(self.path_to_conll_file), encoding="utf-8") as file:

        line = file.readline()
        position = 0
        sentence: Sentence = Sentence()
        while line:

            line = line.strip()
            fields: List[str] = re.split(r"\t+", line)

            if line == "":
                if len(sentence) > 0:
                    self.total_sentence_count += 1
                    if self.in_memory:
                        self.sentences.append(sentence)
                    else:
                        self.indices.append(position)
                        position = file.tell()
                    sentence: Sentence = Sentence()

            elif line.startswith("#"):
                line = file.readline()
                continue
            elif "." in fields[0]:
                line = file.readline()
                continue
            elif "-" in fields[0]:
                line = file.readline()
                continue
            else:
                token = Token(fields[1], head_id=int(fields[6]))
                token.add_label("lemma", str(fields[2]))
                token.add_label("upos", str(fields[3]))
                token.add_label("pos", str(fields[4]))
                token.add_label("dependency", str(fields[7]))

                if len(fields) > 9 and "SpaceAfter=No" in fields[9]:
                    token.whitespace_after = False

                for morph in str(fields[5]).split("|"):
                    if "=" not in morph:
                        continue
                    token.add_label(morph.split("=")[0].lower(), morph.split("=")[1])

                if len(fields) > 10 and str(fields[10]) == "Y":
                    token.add_label("frame", str(fields[11]))

                sentence.add_token(token)

            line = file.readline()

    # flush the last sentence if the file does not end with a blank line
    if len(sentence.tokens) > 0:
        self.total_sentence_count += 1
        if self.in_memory:
            self.sentences.append(sentence)
        else:
            self.indices.append(position)
def read_conll_ud(path_to_conll_file: str) -> List[Sentence]:
    sentences: List[Sentence] = []

    lines: List[str] = open(path_to_conll_file, encoding='utf-8'). \
        read().strip().split('\n')

    sentence: Sentence = Sentence()
    for line in lines:

        fields: List[str] = re.split(r"\s+", line)
        if line == '':
            if len(sentence) > 0:
                sentences.append(sentence)
            sentence: Sentence = Sentence()

        elif line.startswith('#'):
            continue
        elif '.' in fields[0]:
            continue
        elif '-' in fields[0]:
            continue
        else:
            token = Token(fields[1], head_id=int(fields[6]))
            token.add_tag('lemma', str(fields[2]))
            token.add_tag('upos', str(fields[3]))
            token.add_tag('pos', str(fields[4]))
            token.add_tag('dependency', str(fields[7]))

            for morph in str(fields[5]).split('|'):
                if '=' not in morph:
                    continue
                token.add_tag(morph.split('=')[0].lower(), morph.split('=')[1])

            if len(fields) > 10 and str(fields[10]) == 'Y':
                token.add_tag('frame', str(fields[11]))

            sentence.add_token(token)

    if len(sentence.tokens) > 0:
        sentences.append(sentence)

    return sentences
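# Usage sketch ("sample.conllu" is a placeholder path, not from the original code):
if __name__ == "__main__":
    parsed = read_conll_ud("sample.conllu")
    print(f"read {len(parsed)} sentences")
    if parsed:
        print(parsed[0].to_tagged_string())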
def form_sentence(tokens):
    # Build a flair Sentence from a list of already-tokenized words.
    s = Sentence()
    for w in tokens:
        s.add_token(Token(w))
    return s
def __getitem__(self, index: int = 0) -> Sentence:

    if self.in_memory:
        sentence = self.sentences[index]

    else:
        # seek to the stored file offset of this sentence and parse it on the fly
        with open(str(self.path_to_conll_file), encoding="utf-8") as file:
            file.seek(self.indices[index])

            line = file.readline()
            sentence: Sentence = Sentence()
            while line:

                line = line.strip()
                fields: List[str] = re.split(r"\t+", line)

                if line == "":
                    if len(sentence) > 0:
                        break

                elif line.startswith("#"):
                    line = file.readline()
                    continue
                elif "." in fields[0]:
                    line = file.readline()
                    continue
                elif "-" in fields[0]:
                    line = file.readline()
                    continue
                else:
                    token = Token(fields[1], head_id=int(fields[6]))
                    token.add_label("lemma", str(fields[2]))
                    token.add_label("upos", str(fields[3]))
                    token.add_label("pos", str(fields[4]))
                    token.add_label("dependency", str(fields[7]))

                    if len(fields) > 9 and "SpaceAfter=No" in fields[9]:
                        token.whitespace_after = False

                    for morph in str(fields[5]).split("|"):
                        if "=" not in morph:
                            continue
                        token.add_label(
                            morph.split("=")[0].lower(), morph.split("=")[1]
                        )

                    if len(fields) > 10 and str(fields[10]) == "Y":
                        token.add_label("frame", str(fields[11]))

                    sentence.add_token(token)

                line = file.readline()

    return sentence
def train(self, intent_fst) -> None:
    from flair.data import Sentence, Token
    from flair.models import SequenceTagger, TextClassifier
    from flair.embeddings import (
        FlairEmbeddings,
        StackedEmbeddings,
        DocumentRNNEmbeddings,
    )
    from flair.data import TaggedCorpus
    from flair.trainers import ModelTrainer

    # Directory to look for downloaded embeddings
    cache_dir = self.profile.read_path(
        self.profile.get("intent.flair.cache_dir", "flair/cache")
    )
    os.makedirs(cache_dir, exist_ok=True)

    # Directory to store generated models
    data_dir = self.profile.write_path(
        self.profile.get("intent.flair.data_dir", "flair/data")
    )
    if os.path.exists(data_dir):
        shutil.rmtree(data_dir)

    self.embeddings = self.profile.get("intent.flair.embeddings", [])
    assert len(self.embeddings) > 0, "No word embeddings"

    # Create directories to write training data to
    class_data_dir = os.path.join(data_dir, "classification")
    ner_data_dir = os.path.join(data_dir, "ner")
    os.makedirs(class_data_dir, exist_ok=True)
    os.makedirs(ner_data_dir, exist_ok=True)

    # Convert FST to training data
    class_data_path = os.path.join(class_data_dir, "train.txt")
    ner_data_path = os.path.join(ner_data_dir, "train.txt")

    # { intent: [ { 'text': ..., 'entities': { ... } }, ... ] }
    sentences_by_intent: Dict[str, Any] = {}

    # Get sentences for training
    do_sampling = self.profile.get("intent.flair.do_sampling", True)
    start_time = time.time()

    if do_sampling:
        # Sample from each intent FST
        num_samples = int(self.profile.get("intent.flair.num_samples", 10000))
        intent_map_path = self.profile.read_path(
            self.profile.get("training.intent.intent_map", "intent_map.json")
        )

        with open(intent_map_path, "r") as intent_map_file:
            intent_map = json.load(intent_map_file)

        # Gather FSTs for all known intents
        fsts_dir = self.profile.write_dir(
            self.profile.get("speech_to_text.fsts_dir")
        )
        intent_fst_paths = {
            intent_id: os.path.join(fsts_dir, f"{intent_id}.fst")
            for intent_id in intent_map.keys()
        }

        # Generate samples
        self._logger.debug(
            f"Generating {num_samples} sample(s) from {len(intent_fst_paths)} intent(s)"
        )
        sentences_by_intent = sample_sentences_by_intent(
            intent_fst_paths, num_samples
        )
    else:
        # Exhaustively generate all sentences
        self._logger.debug(
            "Generating all possible sentences (may take a long time)"
        )
        sentences_by_intent = make_sentences_by_intent(intent_fst)

    sentence_time = time.time() - start_time
    self._logger.debug(f"Generated sentences in {sentence_time} second(s)")

    # Get least common multiple in order to balance sentences by intent
    lcm_sentences = lcm(*(len(sents) for sents in sentences_by_intent.values()))

    # Generate examples
    class_sentences = []
    ner_sentences: Dict[str, List[Sentence]] = defaultdict(list)
    for intent_name, intent_sents in sentences_by_intent.items():
        num_repeats = max(1, lcm_sentences // len(intent_sents))
        for intent_sent in intent_sents:
            # Only train an intent classifier if there's more than one intent
            if len(sentences_by_intent) > 1:
                # Add balanced copies
                for i in range(num_repeats):
                    class_sent = Sentence(labels=[intent_name])
                    for word in intent_sent["tokens"]:
                        class_sent.add_token(Token(word))
                    class_sentences.append(class_sent)

            if len(intent_sent["entities"]) == 0:
                continue  # no entities, no sequence tagger

            # Named entity recognition (NER) example
            token_idx = 0
            entity_start = {ev["start"]: ev for ev in intent_sent["entities"]}
            entity_end = {ev["end"]: ev for ev in intent_sent["entities"]}
            entity = None

            word_tags = []
            for word in intent_sent["tokens"]:
                # Determine tag label
                tag = "O" if not entity else f"I-{entity}"
                if token_idx in entity_start:
                    entity = entity_start[token_idx]["entity"]
                    tag = f"B-{entity}"

                word_tags.append((word, tag))  # word ner

                token_idx += len(word) + 1

                if (token_idx - 1) in entity_end:
                    entity = None

            # Add balanced copies
            for i in range(num_repeats):
                ner_sent = Sentence()
                for word, tag in word_tags:
                    token = Token(word)
                    token.add_tag("ner", tag)
                    ner_sent.add_token(token)
                ner_sentences[intent_name].append(ner_sent)

    # Start training
    max_epochs = int(self.profile.get("intent.flair.max_epochs", 100))

    # Load word embeddings
    self._logger.debug(f"Loading word embeddings from {cache_dir}")
    word_embeddings = [
        FlairEmbeddings(os.path.join(cache_dir, "embeddings", e))
        for e in self.embeddings
    ]

    if len(class_sentences) > 0:
        self._logger.debug("Training intent classifier")

        # Random 80/10/10 split
        class_train, class_dev, class_test = self._split_data(class_sentences)
        class_corpus = TaggedCorpus(class_train, class_dev, class_test)

        # Intent classification
        doc_embeddings = DocumentRNNEmbeddings(
            word_embeddings,
            hidden_size=512,
            reproject_words=True,
            reproject_words_dimension=256,
        )

        classifier = TextClassifier(
            doc_embeddings,
            label_dictionary=class_corpus.make_label_dictionary(),
            multi_label=False,
        )

        self._logger.debug(
            f"Intent classifier has {len(class_sentences)} example(s)"
        )
        trainer = ModelTrainer(classifier, class_corpus)
        trainer.train(class_data_dir, max_epochs=max_epochs)
    else:
        self._logger.info("Skipping intent classifier training")

    if len(ner_sentences) > 0:
        self._logger.debug(f"Training {len(ner_sentences)} NER sequence tagger(s)")

        # Named entity recognition
        stacked_embeddings = StackedEmbeddings(word_embeddings)

        for intent_name, intent_ner_sents in ner_sentences.items():
            ner_train, ner_dev, ner_test = self._split_data(intent_ner_sents)
            ner_corpus = TaggedCorpus(ner_train, ner_dev, ner_test)

            tagger = SequenceTagger(
                hidden_size=256,
                embeddings=stacked_embeddings,
                tag_dictionary=ner_corpus.make_tag_dictionary(tag_type="ner"),
                tag_type="ner",
                use_crf=True,
            )

            ner_intent_dir = os.path.join(ner_data_dir, intent_name)
            os.makedirs(ner_intent_dir, exist_ok=True)

            self._logger.debug(
                f"NER tagger for {intent_name} has {len(intent_ner_sents)} example(s)"
            )
            trainer = ModelTrainer(tagger, ner_corpus)
            trainer.train(ner_intent_dir, max_epochs=max_epochs)
    else:
        self._logger.info("Skipping NER sequence tagger training")
def plain_tokenizer(text: str) -> Iterable[Token]:
    # Whitespace-split the text and wrap each piece in a Token.
    res = []
    for tok in text.split():
        res.append(Token(tok))
    return res
def no_op_tokenizer(text: str) -> List[Token]:
    # Return the entire text as a single Token starting at position 0.
    return [Token(text, idx=0, start_position=0)]
def __init__(self, model, fine_tune: bool = False, chars_per_chunk: int = 512):
    """
    Initializes contextual string embeddings using a character-level language model.

    :param model: model string, one of 'news-forward', 'news-backward', 'news-forward-fast',
        'news-backward-fast', 'mix-forward', 'mix-backward', 'german-forward', 'german-backward',
        'polish-backward', 'polish-forward' depending on which character language model is desired.
    :param fine_tune: if set to True, the gradient will propagate into the language model.
        This dramatically slows down training and often leads to overfitting, so use with caution.
    :param chars_per_chunk: max number of chars per rnn pass to control speed/memory tradeoff.
        Higher means faster but requires more memory. Lower means slower but less memory.
    """
    super().__init__()

    cache_dir = Path("embeddings")

    aws_path: str = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources"

    self.PRETRAINED_MODEL_ARCHIVE_MAP = {
        # multilingual models
        "multi-forward": f"{aws_path}/embeddings-v0.4.3/lm-jw300-forward-v0.1.pt",
        "multi-backward": f"{aws_path}/embeddings-v0.4.3/lm-jw300-backward-v0.1.pt",
        "multi-v0-forward": f"{aws_path}/embeddings-v0.4/lm-multi-forward-v0.1.pt",
        "multi-v0-backward": f"{aws_path}/embeddings-v0.4/lm-multi-backward-v0.1.pt",
        "multi-v0-forward-fast": f"{aws_path}/embeddings-v0.4/lm-multi-forward-fast-v0.1.pt",
        "multi-v0-backward-fast": f"{aws_path}/embeddings-v0.4/lm-multi-backward-fast-v0.1.pt",
        # English models
        "en-forward": f"{aws_path}/embeddings-v0.4.1/big-news-forward--h2048-l1-d0.05-lr30-0.25-20/news-forward-0.4.1.pt",
        "en-backward": f"{aws_path}/embeddings-v0.4.1/big-news-backward--h2048-l1-d0.05-lr30-0.25-20/news-backward-0.4.1.pt",
        "en-forward-fast": f"{aws_path}/embeddings/lm-news-english-forward-1024-v0.2rc.pt",
        "en-backward-fast": f"{aws_path}/embeddings/lm-news-english-backward-1024-v0.2rc.pt",
        "news-forward": f"{aws_path}/embeddings-v0.4.1/big-news-forward--h2048-l1-d0.05-lr30-0.25-20/news-forward-0.4.1.pt",
        "news-backward": f"{aws_path}/embeddings-v0.4.1/big-news-backward--h2048-l1-d0.05-lr30-0.25-20/news-backward-0.4.1.pt",
        "news-forward-fast": f"{aws_path}/embeddings/lm-news-english-forward-1024-v0.2rc.pt",
        "news-backward-fast": f"{aws_path}/embeddings/lm-news-english-backward-1024-v0.2rc.pt",
        "mix-forward": f"{aws_path}/embeddings/lm-mix-english-forward-v0.2rc.pt",
        "mix-backward": f"{aws_path}/embeddings/lm-mix-english-backward-v0.2rc.pt",
        # Arabic
        "ar-forward": f"{aws_path}/embeddings-stefan-it/lm-ar-opus-large-forward-v0.1.pt",
        "ar-backward": f"{aws_path}/embeddings-stefan-it/lm-ar-opus-large-backward-v0.1.pt",
        # Bulgarian
        "bg-forward-fast": f"{aws_path}/embeddings-v0.3/lm-bg-small-forward-v0.1.pt",
        "bg-backward-fast": f"{aws_path}/embeddings-v0.3/lm-bg-small-backward-v0.1.pt",
        "bg-forward": f"{aws_path}/embeddings-stefan-it/lm-bg-opus-large-forward-v0.1.pt",
        "bg-backward": f"{aws_path}/embeddings-stefan-it/lm-bg-opus-large-backward-v0.1.pt",
        # Czech
        "cs-forward": f"{aws_path}/embeddings-stefan-it/lm-cs-opus-large-forward-v0.1.pt",
        "cs-backward": f"{aws_path}/embeddings-stefan-it/lm-cs-opus-large-backward-v0.1.pt",
        "cs-v0-forward": f"{aws_path}/embeddings-v0.4/lm-cs-large-forward-v0.1.pt",
        "cs-v0-backward": f"{aws_path}/embeddings-v0.4/lm-cs-large-backward-v0.1.pt",
        # Danish
        "da-forward": f"{aws_path}/embeddings-stefan-it/lm-da-opus-large-forward-v0.1.pt",
        "da-backward": f"{aws_path}/embeddings-stefan-it/lm-da-opus-large-backward-v0.1.pt",
        # German
        "de-forward": f"{aws_path}/embeddings/lm-mix-german-forward-v0.2rc.pt",
        "de-backward": f"{aws_path}/embeddings/lm-mix-german-backward-v0.2rc.pt",
        "de-historic-ha-forward": f"{aws_path}/embeddings-stefan-it/lm-historic-hamburger-anzeiger-forward-v0.1.pt",
        "de-historic-ha-backward": f"{aws_path}/embeddings-stefan-it/lm-historic-hamburger-anzeiger-backward-v0.1.pt",
        "de-historic-wz-forward": f"{aws_path}/embeddings-stefan-it/lm-historic-wiener-zeitung-forward-v0.1.pt",
        "de-historic-wz-backward": f"{aws_path}/embeddings-stefan-it/lm-historic-wiener-zeitung-backward-v0.1.pt",
        # Spanish
        "es-forward": f"{aws_path}/embeddings-v0.4/language_model_es_forward_long/lm-es-forward.pt",
        "es-backward": f"{aws_path}/embeddings-v0.4/language_model_es_backward_long/lm-es-backward.pt",
        "es-forward-fast": f"{aws_path}/embeddings-v0.4/language_model_es_forward/lm-es-forward-fast.pt",
        "es-backward-fast": f"{aws_path}/embeddings-v0.4/language_model_es_backward/lm-es-backward-fast.pt",
        # Basque
        "eu-forward": f"{aws_path}/embeddings-stefan-it/lm-eu-opus-large-forward-v0.2.pt",
        "eu-backward": f"{aws_path}/embeddings-stefan-it/lm-eu-opus-large-backward-v0.2.pt",
        "eu-v1-forward": f"{aws_path}/embeddings-stefan-it/lm-eu-opus-large-forward-v0.1.pt",
        "eu-v1-backward": f"{aws_path}/embeddings-stefan-it/lm-eu-opus-large-backward-v0.1.pt",
        "eu-v0-forward": f"{aws_path}/embeddings-v0.4/lm-eu-large-forward-v0.1.pt",
        "eu-v0-backward": f"{aws_path}/embeddings-v0.4/lm-eu-large-backward-v0.1.pt",
        # Persian
        "fa-forward": f"{aws_path}/embeddings-stefan-it/lm-fa-opus-large-forward-v0.1.pt",
        "fa-backward": f"{aws_path}/embeddings-stefan-it/lm-fa-opus-large-backward-v0.1.pt",
        # Finnish
        "fi-forward": f"{aws_path}/embeddings-stefan-it/lm-fi-opus-large-forward-v0.1.pt",
        "fi-backward": f"{aws_path}/embeddings-stefan-it/lm-fi-opus-large-backward-v0.1.pt",
        # French
        "fr-forward": f"{aws_path}/embeddings/lm-fr-charlm-forward.pt",
        "fr-backward": f"{aws_path}/embeddings/lm-fr-charlm-backward.pt",
        # Hebrew
        "he-forward": f"{aws_path}/embeddings-stefan-it/lm-he-opus-large-forward-v0.1.pt",
        "he-backward": f"{aws_path}/embeddings-stefan-it/lm-he-opus-large-backward-v0.1.pt",
        # Hindi
        "hi-forward": f"{aws_path}/embeddings-stefan-it/lm-hi-opus-large-forward-v0.1.pt",
        "hi-backward": f"{aws_path}/embeddings-stefan-it/lm-hi-opus-large-backward-v0.1.pt",
        # Croatian
        "hr-forward": f"{aws_path}/embeddings-stefan-it/lm-hr-opus-large-forward-v0.1.pt",
        "hr-backward": f"{aws_path}/embeddings-stefan-it/lm-hr-opus-large-backward-v0.1.pt",
        # Indonesian
        "id-forward": f"{aws_path}/embeddings-stefan-it/lm-id-opus-large-forward-v0.1.pt",
        "id-backward": f"{aws_path}/embeddings-stefan-it/lm-id-opus-large-backward-v0.1.pt",
        # Italian
        "it-forward": f"{aws_path}/embeddings-stefan-it/lm-it-opus-large-forward-v0.1.pt",
        "it-backward": f"{aws_path}/embeddings-stefan-it/lm-it-opus-large-backward-v0.1.pt",
        # Japanese
        "ja-forward": f"{aws_path}/embeddings-v0.4.1/lm__char-forward__ja-wikipedia-3GB/japanese-forward.pt",
        "ja-backward": f"{aws_path}/embeddings-v0.4.1/lm__char-backward__ja-wikipedia-3GB/japanese-backward.pt",
        # Dutch
        "nl-forward": f"{aws_path}/embeddings-stefan-it/lm-nl-opus-large-forward-v0.1.pt",
        "nl-backward": f"{aws_path}/embeddings-stefan-it/lm-nl-opus-large-backward-v0.1.pt",
        "nl-v0-forward": f"{aws_path}/embeddings-v0.4/lm-nl-large-forward-v0.1.pt",
        "nl-v0-backward": f"{aws_path}/embeddings-v0.4/lm-nl-large-backward-v0.1.pt",
        # Norwegian
        "no-forward": f"{aws_path}/embeddings-stefan-it/lm-no-opus-large-forward-v0.1.pt",
        "no-backward": f"{aws_path}/embeddings-stefan-it/lm-no-opus-large-backward-v0.1.pt",
        # Polish
        "pl-forward": f"{aws_path}/embeddings/lm-polish-forward-v0.2.pt",
        "pl-backward": f"{aws_path}/embeddings/lm-polish-backward-v0.2.pt",
        "pl-opus-forward": f"{aws_path}/embeddings-stefan-it/lm-pl-opus-large-forward-v0.1.pt",
        "pl-opus-backward": f"{aws_path}/embeddings-stefan-it/lm-pl-opus-large-backward-v0.1.pt",
        # Portuguese
        "pt-forward": f"{aws_path}/embeddings-v0.4/lm-pt-forward.pt",
        "pt-backward": f"{aws_path}/embeddings-v0.4/lm-pt-backward.pt",
        # Pubmed
        "pubmed-forward": f"{aws_path}/embeddings-v0.4.1/pubmed-2015-fw-lm.pt",
        "pubmed-backward": f"{aws_path}/embeddings-v0.4.1/pubmed-2015-bw-lm.pt",
        # Slovenian
        "sl-forward": f"{aws_path}/embeddings-stefan-it/lm-sl-opus-large-forward-v0.1.pt",
        "sl-backward": f"{aws_path}/embeddings-stefan-it/lm-sl-opus-large-backward-v0.1.pt",
        "sl-v0-forward": f"{aws_path}/embeddings-v0.3/lm-sl-large-forward-v0.1.pt",
        "sl-v0-backward": f"{aws_path}/embeddings-v0.3/lm-sl-large-backward-v0.1.pt",
        # Swedish
        "sv-forward": f"{aws_path}/embeddings-stefan-it/lm-sv-opus-large-forward-v0.1.pt",
        "sv-backward": f"{aws_path}/embeddings-stefan-it/lm-sv-opus-large-backward-v0.1.pt",
        "sv-v0-forward": f"{aws_path}/embeddings-v0.4/lm-sv-large-forward-v0.1.pt",
        "sv-v0-backward": f"{aws_path}/embeddings-v0.4/lm-sv-large-backward-v0.1.pt",
        # Tamil
        "ta-forward": f"{aws_path}/embeddings-stefan-it/lm-ta-opus-large-forward-v0.1.pt",
        "ta-backward": f"{aws_path}/embeddings-stefan-it/lm-ta-opus-large-backward-v0.1.pt",
    }

    if type(model) == str:

        # load model if in pretrained model map
        if model.lower() in self.PRETRAINED_MODEL_ARCHIVE_MAP:
            base_path = self.PRETRAINED_MODEL_ARCHIVE_MAP[model.lower()]
            model = cached_path(base_path, cache_dir=cache_dir)

        elif replace_with_language_code(model) in self.PRETRAINED_MODEL_ARCHIVE_MAP:
            base_path = self.PRETRAINED_MODEL_ARCHIVE_MAP[
                replace_with_language_code(model)
            ]
            model = cached_path(base_path, cache_dir=cache_dir)

        elif not Path(model).exists():
            raise ValueError(
                f'The given model "{model}" is not available or is not a valid path.'
            )

    from flair.models import LanguageModel

    if type(model) == LanguageModel:
        self.lm: LanguageModel = model
        self.name = f"Task-LSTM-{self.lm.hidden_size}-{self.lm.nlayers}-{self.lm.is_forward_lm}"
    else:
        self.lm: LanguageModel = LanguageModel.load_language_model(model)
        self.name = str(model)

    # embeddings are static if we don't do finetuning
    self.fine_tune = fine_tune
    self.static_embeddings = not fine_tune

    self.is_forward_lm: bool = self.lm.is_forward_lm
    self.chars_per_chunk: int = chars_per_chunk

    # embed a dummy sentence to determine embedding_length
    dummy_sentence: Sentence = Sentence()
    dummy_sentence.add_token(Token("hello"))
    embedded_dummy = self.embed(dummy_sentence)
    self.__embedding_length: int = len(
        embedded_dummy[0].get_token(1).get_embedding()
    )

    # set to eval mode
    self.eval()
def __init__(
    self,
    path_to_column_file: Path,
    column_name_map: Dict[int, str],
    tags_to_bioes: List[str] = None,
    comment_symbol: str = '#',
    in_memory: bool = True,
    document_separator_token: str = None,
    encoding: str = "utf-8",
):
    """
    Instantiates a column dataset (typically used for sequence labeling or word-level prediction).

    :param path_to_column_file: path to the file with the column-formatted data
    :param column_name_map: a map specifying the column format
    :param tags_to_bioes: whether to convert to BIOES tagging scheme
    :param comment_symbol: if set, lines that begin with this symbol are treated as comments
    :param in_memory: If set to True, the dataset is kept in memory as Sentence objects, otherwise does disk reads
    :param document_separator_token: If provided, multiple sentences are read into one object.
        Provide the string token that indicates that a new document begins
    """
    assert path_to_column_file.exists()
    self.path_to_column_file = path_to_column_file
    self.tags_to_bioes = tags_to_bioes
    self.column_name_map = column_name_map
    self.comment_symbol = comment_symbol
    self.document_separator_token = document_separator_token

    # store either Sentence objects in memory, or only file offsets
    self.in_memory = in_memory
    if self.in_memory:
        self.sentences: List[Sentence] = []
    else:
        self.indices: List[int] = []

    self.total_sentence_count: int = 0

    # most data sets have the token text in the first column, if not, pass 'text' as column
    self.text_column: int = 0
    for column in self.column_name_map:
        if column_name_map[column] == "text":
            self.text_column = column

    # determine encoding of text file
    self.encoding = encoding

    sentence: Sentence = Sentence()
    with open(str(self.path_to_column_file), encoding=self.encoding) as f:

        line = f.readline()
        position = 0

        while line:

            if self.comment_symbol is not None and line.startswith(comment_symbol):
                line = f.readline()
                continue

            if self.__line_completes_sentence(line):

                if len(sentence) > 0:
                    sentence.infer_space_after()
                    if self.in_memory:
                        if self.tags_to_bioes is not None:
                            for tag in self.tags_to_bioes:
                                sentence.convert_tag_scheme(
                                    tag_type=tag, target_scheme="iobes"
                                )
                        self.sentences.append(sentence)
                    else:
                        self.indices.append(position)
                        position = f.tell()
                    self.total_sentence_count += 1
                sentence: Sentence = Sentence()

            else:
                fields: List[str] = re.split(r"\s+", line)
                token = Token(fields[self.text_column])
                for column in column_name_map:
                    if len(fields) > column:
                        if column != self.text_column:
                            token.add_tag(self.column_name_map[column], fields[column])

                if not line.isspace():
                    sentence.add_token(token)

            line = f.readline()

    # flush the last sentence if the file does not end with a sentence boundary
    if len(sentence.tokens) > 0:
        sentence.infer_space_after()
        if self.in_memory:
            self.sentences.append(sentence)
        else:
            self.indices.append(position)
        self.total_sentence_count += 1
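# A usage sketch (hypothetical names, not from the original code): assuming the
# __init__ above belongs to a ColumnDataset-style class, a CoNLL-style file with
# the token text in column 0 and NER tags in column 1 could be loaded like this.
if __name__ == "__main__":
    from pathlib import Path

    dataset = ColumnDataset(                       # class name assumed for illustration
        path_to_column_file=Path("train.txt"),     # placeholder path
        column_name_map={0: "text", 1: "ner"},     # column 0 = token, column 1 = NER tag
        tags_to_bioes=["ner"],                     # convert the NER column to BIOES
    )
    print(dataset.total_sentence_count)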
def token_list_to_sentence(self, token_list: conllu.TokenList) -> Sentence:
    sentence: Sentence = Sentence()

    # Build the sentence tokens and add the annotations.
    for conllu_token in token_list:
        token = Token(conllu_token["form"])

        for field in self.token_annotation_fields:
            field_value: Any = conllu_token[field]
            if isinstance(field_value, dict):
                # For fields that contain key-value annotations,
                # we add the key as label type-name and the value as the label value.
                for key, value in field_value.items():
                    token.add_label(typename=key, value=str(value))
            else:
                token.add_label(typename=field, value=str(field_value))

        if conllu_token.get("misc") is not None:
            space_after: Optional[str] = conllu_token["misc"].get("SpaceAfter")
            if space_after == "No":
                token.whitespace_after = False

        sentence.add_token(token)

    if "sentence_id" in token_list.metadata:
        sentence.add_label("sentence_id", token_list.metadata["sentence_id"])

    if "relations" in token_list.metadata:
        for (
            head_start,
            head_end,
            tail_start,
            tail_end,
            label,
        ) in token_list.metadata["relations"]:
            # head and tail span indices are 1-indexed and end index is inclusive
            head = Span(sentence.tokens[head_start - 1:head_end])
            tail = Span(sentence.tokens[tail_start - 1:tail_end])

            sentence.add_complex_label(
                "relation", RelationLabel(value=label, head=head, tail=tail)
            )

    # determine all NER label types in sentence and add all NER spans as sentence-level labels
    ner_label_types = []
    for token in sentence.tokens:
        for annotation in token.annotation_layers.keys():
            if annotation.startswith("ner") and annotation not in ner_label_types:
                ner_label_types.append(annotation)

    for label_type in ner_label_types:
        spans = sentence.get_spans(label_type)
        for span in spans:
            sentence.add_complex_label(
                "entity",
                label=SpanLabel(span=span, value=span.tag, score=span.score),
            )

    return sentence
def create_sentlist_from_file_batchmax(self, data, maxlen=64, compare_column="cat"):
    """
    Takes a pandas dataframe with columns 'tok' and 'sentstart' and creates a list of
    flair Sentence objects with tags.

    Each flair Sentence object may contain several real sentences, but at most maxlen
    tokens. The Sentence object stops at a sentence boundary, so it is often shorter
    than maxlen. Sentences longer than maxlen are split!
    If a line with token value "EOF" is encountered, a shorter flair Sentence object
    is returned, so no file boundaries are crossed.

    :param data: pandas dataframe with columns 'tok' and 'sentstart'
    :param maxlen: maximum number of tokens per flair Sentence object
    :param compare_column: column holding the gold tag; if "NaN", a placeholder tag "-" is used
    :return: list of flair Sentence objects
    """
    sent_list = []
    toklist = []
    catlist = []

    # the len_last_token is needed to add proper start/end pos for each sentence token
    len_last_token = 0

    # track the sentence that is currently being processed
    curr_sentence_tok = []
    curr_sentence_cat = []

    for index, row in data.iterrows():
        tok = str(row["tok"])
        if compare_column != "NaN":
            cat = str(row[compare_column])
        else:
            cat = "-"

        # if the current token is "EOF" this marks the end of sample file
        # chunks may not cross file boundaries, therefore end the sentence here in any case
        if tok == "EOF":
            # do not add this token to any list
            # merge toklist and curr_sentence_tok list to get all current tokens
            # and create a flair sentence
            toklist.extend(curr_sentence_tok)
            catlist.extend(curr_sentence_cat)

            self.logger.debug("create chunk at EOF with (len: {}): {}".format(len(toklist), toklist))
            self.logger.debug("catlist with (len: {}): {}".format(len(catlist), catlist))

            sent = Sentence()
            for i, tok in enumerate(toklist):
                flair_tok = Token(str(tok), start_position=len_last_token)
                len_last_token += len(tok) + 1
                flair_tok.add_tag("cat", catlist[i])
                sent.add_token(flair_tok)
            if len(sent.tokens) > 0:
                sent_list.append(sent)

            len_last_token = 0
            toklist = []
            catlist = []
            # reset the curr sent lists as well
            curr_sentence_tok = []
            curr_sentence_cat = []

        else:
            # if we are at the start of a new sentence, add the contents of curr_sentence_tok
            # and curr_sentence_cat to the main lists and start a new curr_sentence
            if row["sentstart"] == "yes":
                toklist.extend(curr_sentence_tok)
                catlist.extend(curr_sentence_cat)
                curr_sentence_tok = [tok]
                curr_sentence_cat = [cat]
            else:
                curr_sentence_tok.append(tok)
                curr_sentence_cat.append(cat)

            # if the combined length of toklist and curr_sentence_tok is > maxlen now,
            # create a flair sentence with the tokens in toklist and reset it
            # the remaining tokens in curr_sentence_tok are saved for the next chunk
            if len(toklist) + len(curr_sentence_tok) > maxlen:
                # if toklist is empty at this point, we have a sentence > maxlen
                # and must split it. The last token currently in curr_sentence will
                # be preserved for later so that the chunk is not too long
                if len(toklist) == 0:
                    toklist.extend(curr_sentence_tok[0:-1])
                    catlist.extend(curr_sentence_cat[0:-1])
                    curr_sentence_tok = [curr_sentence_tok[-1]]
                    curr_sentence_cat = [curr_sentence_cat[-1]]
                    self.logger.debug("Sentence is split (len: {}): {}".format(len(toklist), toklist))

                self.logger.debug("create chunk with (len: {}): {}".format(len(toklist), toklist))
                self.logger.debug("catlist with (len: {}): {}".format(len(catlist), catlist))

                sent = Sentence()
                for i, tok in enumerate(toklist):
                    flair_tok = Token(str(tok), start_position=len_last_token)
                    len_last_token += len(tok) + 1
                    flair_tok.add_tag("cat", str(catlist[i]))
                    sent.add_token(flair_tok)
                if len(sent.tokens) > 0:
                    sent_list.append(sent)

                len_last_token = 0
                toklist = []
                catlist = []

    self.logger.debug("toklist: {}, curr_sent_tok: {}".format(len(toklist), len(curr_sentence_tok)))

    # if the loop is complete, empty the buffers and add them to the list
    if len(curr_sentence_tok) > 0:
        toklist.extend(curr_sentence_tok)
        catlist.extend(curr_sentence_cat)

        sent = Sentence()
        for i, tok in enumerate(toklist):
            flair_tok = Token(str(tok), start_position=len_last_token)
            len_last_token += len(tok) + 1
            flair_tok.add_tag("cat", str(catlist[i]))
            sent.add_token(flair_tok)
        if len(sent.tokens) > 0:
            sent_list.append(sent)

        len_last_token = 0

    return sent_list
def bert_embeddings(sentences, tokenized_contents, output_file=None):
    # Use bert_tokenizer to check whether a sequence has more than 512 wordpiece tokens
    bert_tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

    if output_file:
        f = open(output_file, 'w')

    # init embedding: BERT large uncased
    bert_embedding = TransformerWordEmbeddings('bert-large-uncased')

    long_sent = False

    for i, (sent, sent_tokens) in enumerate(zip(sentences, tokenized_contents)):
        print("Encoding the {}th input sentence for BERT embedding!".format(i))

        # getting the length of the bert tokenized sentence after wordpiece tokenization;
        # if it is too long for BERT, set the first half aside and embed it separately below
        if len(bert_tokenizer.tokenize(sent[0])) >= 510:
            long_sent = True
            truncated_tokens = sent_tokens[:len(sent_tokens) // 2]
            sent_tokens = sent_tokens[len(sent_tokens) // 2:]

        # Using our own tokens (our own tokenization)
        tokens: List[Token] = [Token(token) for token in sent_tokens]

        # create an empty sentence and add tokens from our own tokenization
        sentence = Sentence()
        sentence.tokens = tokens

        bert_embedding.embed(sentence)
        for j, (token, st) in enumerate(zip(sentence, sent_tokens)):
            if token.text != st:
                raise ValueError("Invalid token text")
            if output_file:
                f.write(
                    token.text + " "
                    + " ".join([str(num) for num in token.embedding.tolist()])
                    + '\n')
            else:
                print(
                    token.text + " "
                    + " ".join([str(num) for num in token.embedding.tolist()])
                    + '\n')

        if long_sent:
            # tokenization for the rest of the sentence
            truncated_tokens: List[Token] = [
                Token(token) for token in truncated_tokens
            ]

            # create an empty sentence and add tokens from our own tokenization
            truncated_sentence = Sentence()
            truncated_sentence.tokens = truncated_tokens

            bert_embedding.embed(truncated_sentence)
            for token in truncated_sentence:
                if output_file:
                    f.write(
                        token.text + " "
                        + " ".join([str(num) for num in token.embedding.tolist()])
                        + '\n')
                else:
                    print(
                        token.text + " "
                        + " ".join([str(num) for num in token.embedding.tolist()])
                        + '\n')
            long_sent = False

        # blank line between sentences; only write when an output file is open
        if output_file:
            f.write('\n')
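# A usage sketch with toy data (not from the original code): each entry in
# `sentences` is assumed to be an indexable object whose first element is the raw
# sentence string, and `tokenized_contents` holds the matching token lists.
if __name__ == "__main__":
    raw_sentences = [["The cat sat on the mat ."]]
    token_lists = [["The", "cat", "sat", "on", "the", "mat", "."]]
    # writes one "<token> <embedding values>" line per token to the output file
    bert_embeddings(raw_sentences, token_lists, output_file="bert_vectors.txt")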
def mock_ner_span(text, tag, start, end):
    # Build a labeled Span with explicit character offsets for testing.
    span = Span([]).set_label("class", tag)
    span.start_pos = start
    span.end_pos = end
    span.tokens = [Token(text[start:end])]
    return span
def read_column_data(
    path_to_column_file: Path,
    column_name_map: Dict[int, str],
    infer_whitespace_after: bool = True,
):
    """
    Reads a file in column format and produces a list of Sentence with token-level
    annotation as specified in the column_name_map. For instance, by passing
    "{0: 'text', 1: 'pos', 2: 'np', 3: 'ner'}" as column_name_map you specify that the
    first column is the text (lexical value) of the token, the second the PoS tag,
    the third the chunk and the fourth the NER tag.

    :param path_to_column_file: the path to the column file
    :param column_name_map: a map of column number to token annotation name
    :param infer_whitespace_after: if True, tries to infer whitespace_after field for Token
    :return: list of sentences
    """
    sentences: List[Sentence] = []

    try:
        lines: List[str] = open(
            str(path_to_column_file), encoding="utf-8"
        ).read().strip().split("\n")
    except:
        log.info(
            'UTF-8 can\'t read: {} ... using "latin-1" instead.'.format(
                path_to_column_file
            )
        )
        lines: List[str] = open(
            str(path_to_column_file), encoding="latin1"
        ).read().strip().split("\n")

    # most data sets have the token text in the first column, if not, pass 'text' as column
    text_column: int = 0
    for column in column_name_map:
        if column_name_map[column] == "text":
            text_column = column

    sentence: Sentence = Sentence()
    for line in lines:

        if line.startswith("#"):
            continue

        # an empty line (ignoring a possible byte-order mark) ends the current sentence
        if line.strip().replace("\ufeff", "") == "":
            if len(sentence) > 0:
                sentence.infer_space_after()
                sentences.append(sentence)
            sentence: Sentence = Sentence()

        else:
            fields: List[str] = re.split(r"\s+", line)
            token = Token(fields[text_column])
            for column in column_name_map:
                if len(fields) > column:
                    if column != text_column:
                        token.add_tag(column_name_map[column], fields[column])
            sentence.add_token(token)

    if len(sentence.tokens) > 0:
        sentence.infer_space_after()
        sentences.append(sentence)

    return sentences
def read_conll_ud(path_to_conll_file: Path) -> List[Sentence]:
    """
    Reads a file in CoNLL-U format and produces a list of Sentence with full
    morphosyntactic annotation.

    :param path_to_conll_file: the path to the conll-u file
    :return: list of sentences
    """
    sentences: List[Sentence] = []

    lines: List[str] = open(path_to_conll_file, encoding='utf-8'). \
        read().strip().split('\n')

    sentence: Sentence = Sentence()
    for line in lines:

        fields: List[str] = re.split(r"\t+", line)
        if line == '':
            if len(sentence) > 0:
                sentences.append(sentence)
            sentence: Sentence = Sentence()

        elif line.startswith('#'):
            continue
        elif '.' in fields[0]:
            continue
        elif '-' in fields[0]:
            continue
        else:
            token = Token(fields[1], head_id=int(fields[6]))
            token.add_tag('lemma', str(fields[2]))
            token.add_tag('upos', str(fields[3]))
            token.add_tag('pos', str(fields[4]))
            token.add_tag('dependency', str(fields[7]))

            for morph in str(fields[5]).split('|'):
                if '=' not in morph:
                    continue
                token.add_tag(morph.split('=')[0].lower(), morph.split('=')[1])

            if len(fields) > 10 and str(fields[10]) == 'Y':
                token.add_tag('frame', str(fields[11]))

            sentence.add_token(token)

    if len(sentence.tokens) > 0:
        sentences.append(sentence)

    return sentences
def custom_tokenizer(text: str) -> List[Token]:
    # Look up a pre-computed tokenization for this exact text and wrap it in Tokens.
    global text_tokens_map
    tokens = text_tokens_map[text]
    tokens: List[Token] = [Token(token) for token in tokens]
    return tokens