def _parse_line_to_sentence(
    self, line: str, label_prefix: str, tokenizer: Callable[[str], List[Token]]
):
    words = line.split()

    # collect all leading labels (FastText-style "__label__xyz" prefixes)
    labels = []
    l_len = 0
    for i in range(len(words)):
        if words[i].startswith(label_prefix):
            l_len += len(words[i]) + 1
            label = words[i].replace(label_prefix, "")
            labels.append(label)
        else:
            break

    # everything after the label prefixes is the text
    text = line[l_len:].strip()

    if self.truncate_to_max_chars > 0:
        text = text[: self.truncate_to_max_chars]

    if text and labels:
        sentence = Sentence(text, use_tokenizer=tokenizer)

        for label in labels:
            sentence.add_label(self.label_type, label)

        if 0 < self.truncate_to_max_tokens < len(sentence):
            sentence.tokens = sentence.tokens[: self.truncate_to_max_tokens]

        return sentence
    return None
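# Illustrative, self-contained sketch (not from the original code) of how the
# label-prefix parsing above splits a FastText-style line into labels and text.
line = "__label__positive __label__opinion I love Berlin ."
label_prefix = "__label__"

labels, l_len = [], 0
for word in line.split():
    if not word.startswith(label_prefix):
        break
    l_len += len(word) + 1  # word plus the trailing space
    labels.append(word.replace(label_prefix, ""))

print(labels)        # ['positive', 'opinion']
print(line[l_len:])  # 'I love Berlin .'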
def test_tagged_corpus_statistics_multi_label():
    train_sentence = Sentence("I love Berlin.", use_tokenizer=True).add_label(
        'label', 'class_1'
    )
    dev_sentence = Sentence("The sun is shining.", use_tokenizer=True).add_label(
        'label', 'class_2'
    )
    test_sentence = Sentence("Berlin is sunny.", use_tokenizer=True)
    test_sentence.add_label('label', 'class_1')
    test_sentence.add_label('label', 'class_2')

    class_to_count_dict = Corpus._count_sentence_labels(
        [train_sentence, dev_sentence, test_sentence]
    )

    assert "class_1" in class_to_count_dict
    assert "class_2" in class_to_count_dict
    assert 2 == class_to_count_dict["class_1"]
    assert 2 == class_to_count_dict["class_2"]

    tokens_in_sentences = Corpus._get_tokens_per_sentence(
        [train_sentence, dev_sentence, test_sentence]
    )

    assert 3 == len(tokens_in_sentences)
    assert 4 == tokens_in_sentences[0]
    assert 5 == tokens_in_sentences[1]
    assert 4 == tokens_in_sentences[2]
def _get_tars_formatted_sentence(self, label, original_text, tars_label=None):
    # TARS reformulates classification as a binary decision over
    # "<label> <sep> <text>" pairs
    label_text_pair = " ".join(
        [
            self._get_cleaned_up_label(label),
            self.tars_model.document_embeddings.tokenizer.sep_token,
            original_text,
        ]
    )
    label_text_pair_sentence = Sentence(label_text_pair)
    if tars_label is not None:
        if tars_label:
            label_text_pair_sentence.add_label(
                self.tars_model.label_type, TARSClassifier.static_label_yes
            )
        else:
            label_text_pair_sentence.add_label(
                self.tars_model.label_type, TARSClassifier.static_label_no
            )
    return label_text_pair_sentence
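# Illustrative sketch of the pair format produced above. The "[SEP]" string is
# a placeholder; the real separator token comes from the model's tokenizer.
label = "sports"
sep_token = "[SEP]"  # assumption: BERT-style separator
original_text = "France is the current world cup winner."

label_text_pair = " ".join([label, sep_token, original_text])
print(label_text_pair)  # 'sports [SEP] France is the current world cup winner.'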
def iterate_dataframe(dataset: pd.DataFrame) -> Iterable[Sentence]:
    for _, row in dataset.iterrows():
        res = encoder(row.smiles)
        if not res:
            continue
        # insert whitespace so that bracket atoms and the "DOT" molecule
        # separator become individual tokens
        res = res.replace("]", "] ").replace(".", "DOT ")
        sent = Sentence(res.strip(), use_tokenizer=plain_tokenizer)
        # turn each binary target column into a positive/negative label
        for col, val in row.items():
            if isinstance(val, float):
                if val == 1.0:
                    sent.add_label(None, col.replace(" ", "_") + "_P ")
                if val == 0.0:
                    sent.add_label(None, col.replace(" ", "_") + "_N ")
        yield sent
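# Hedged usage sketch: `encoder` and `plain_tokenizer` are defined in the
# surrounding module; the stand-ins below are placeholders only.
import pandas as pd
from flair.tokenization import SpaceTokenizer

encoder = lambda smiles: smiles     # placeholder for the real SMILES encoder
plain_tokenizer = SpaceTokenizer()  # assumption: plain whitespace tokenization

df = pd.DataFrame({"smiles": ["[Na+].[Cl-]"], "toxic": [1.0]})
for sentence in iterate_dataframe(df):
    print(sentence, sentence.labels)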
def _parse_document_to_sentence(
    self,
    text: str,
    labels: List[str],
    tokenizer: Union[Callable[[str], List[Token]], Tokenizer],
):
    if self.max_chars_per_doc > 0:
        text = text[: self.max_chars_per_doc]

    if text and labels:
        sentence = Sentence(text, use_tokenizer=tokenizer)

        for label in labels:
            sentence.add_label(self.tag_type, label)

        if self.max_tokens_per_doc > 0:
            sentence.tokens = sentence.tokens[
                : min(len(sentence), self.max_tokens_per_doc)
            ]

        return sentence
    return None
def test_mixed_labels():
    # example sentence
    sentence = Sentence("I love New York")

    # has sentiment value
    sentence.add_label("sentiment", "positive")

    # has 4 part of speech tags
    sentence[1].add_label("pos", "verb")
    sentence[2].add_label("pos", "proper noun")
    sentence[3].add_label("pos", "proper noun")
    sentence[0].add_label("pos", "pronoun")

    # has 1 NER tag
    sentence[2:4].add_label("ner", "City")

    # should be 6 labels in total
    assert 6 == len(sentence.labels)
    assert 4 == len(sentence.get_labels("pos"))
    assert 1 == len(sentence.get_labels("sentiment"))
    assert 1 == len(sentence.get_labels("ner"))
def __getitem__(self, index: int = 0) -> Sentence:
    if self.in_memory:
        return self.sentences[index]

    # streaming mode: build the Sentence from the raw CSV row on the fly
    row = self.raw_data[index]

    text = " ".join([row[text_column] for text_column in self.text_columns])

    if self.max_chars_per_doc > 0:
        text = text[: self.max_chars_per_doc]

    sentence = Sentence(text, use_tokenizer=self.tokenizer)
    for column in self.column_name_map:
        if self.column_name_map[column].startswith("label") and row[column]:
            sentence.add_label(self.label_type, row[column])

    if 0 < self.max_tokens_per_doc < len(sentence):
        sentence.tokens = sentence.tokens[: self.max_tokens_per_doc]
    return sentence
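# Design note: with in_memory=False the dataset stores only raw CSV rows and
# rebuilds each Sentence on access, trading per-item CPU cost for a much
# smaller memory footprint on large corpora.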
def test_sentence_labels():
    # example sentence
    sentence = Sentence("I love Berlin")
    sentence.add_label("sentiment", "positive")
    sentence.add_label("topic", "travelling")

    assert 2 == len(sentence.labels)
    assert 1 == len(sentence.get_labels("sentiment"))
    assert 1 == len(sentence.get_labels("topic"))

    # add another topic label
    sentence.add_label("topic", "travelling")

    assert 3 == len(sentence.labels)
    assert 1 == len(sentence.get_labels("sentiment"))
    assert 2 == len(sentence.get_labels("topic"))

    sentence.remove_labels("topic")

    assert 1 == len(sentence.labels)
    assert 1 == len(sentence.get_labels("sentiment"))
    assert 0 == len(sentence.get_labels("topic"))
def predict(
    self,
    text: Union[List[Sentence], Sentence, List[str], str],
    mini_batch_size: int = 32,
    **kwargs,
) -> List[Sentence]:
    """Predict method for running inference using the pre-trained sequence classifier model.

    * **text** - String, list of strings, Sentence, or list of Sentences to run inference on
    * **mini_batch_size** - Mini batch size
    * ****kwargs** - (Optional) additional arguments for the Transformers classifier
    """
    id2label = self.model.config.id2label
    sentences = text
    results: List[Sentence] = []

    with torch.no_grad():
        if not sentences:
            return sentences

        if isinstance(sentences, (DataPoint, str)):
            sentences = [sentences]

        # filter empty sentences
        if isinstance(sentences[0], Sentence):
            sentences = [sentence for sentence in sentences if len(sentence) > 0]
        if len(sentences) == 0:
            return sentences

        # reverse sort all sequences by their length
        rev_order_len_index = sorted(
            range(len(sentences)), key=lambda k: len(sentences[k]), reverse=True
        )
        original_order_index = sorted(
            range(len(rev_order_len_index)), key=lambda k: rev_order_len_index[k]
        )

        reordered_sentences: List[Union[DataPoint, str]] = [
            sentences[index] for index in rev_order_len_index
        ]

        # turn all Sentence objects into strings, in the reordered order,
        # so that predictions line up with str_reordered_sentences below
        if isinstance(reordered_sentences[0], Sentence):
            str_reordered_sentences = [
                sentence.to_original_text() for sentence in reordered_sentences
            ]
        else:
            str_reordered_sentences = reordered_sentences

        # tokenize and get dataset
        dataset = self._tokenize(str_reordered_sentences)
        dataloader = DataLoader(dataset, batch_size=mini_batch_size)
        predictions: List[List[float]] = []

        logger.info(f"Running prediction on {len(dataset)} text sequences")
        logger.info(f"Batch size = {mini_batch_size}")

        self.model.eval()
        for batch in tqdm(dataloader, desc="Predicting text"):
            batch = tuple(t.to(self.device) for t in batch)

            if len(batch) == 3:
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                }
            else:
                inputs = {"input_ids": batch[0], "attention_mask": batch[1]}

            outputs = self.model(**inputs)
            logits = outputs[0]
            preds = torch.softmax(logits, dim=1).tolist()
            predictions += preds

        for text, pred in zip(str_reordered_sentences, predictions):
            # initialize a Sentence and attach a score for every class
            text_sent = Sentence(text)
            for k, v in id2label.items():
                text_sent.add_label(label_type="sc", value=v, score=pred[k])
            results.append(text_sent)

        # order results back into original order
        results = [results[index] for index in original_order_index]

    return results
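# Standalone sketch (not from the original code) of the sort-by-length /
# restore-order index trick used in predict() above.
texts = ["bb", "a", "cccc"]

rev_order = sorted(range(len(texts)), key=lambda k: len(texts[k]), reverse=True)
original_order = sorted(range(len(rev_order)), key=lambda k: rev_order[k])

reordered = [texts[i] for i in rev_order]  # ['cccc', 'bb', 'a']
# ... batched inference would run on `reordered` here ...
restored = [reordered[i] for i in original_order]
assert restored == texts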
# print the sentence with all tags of this type
print(sentence.to_tagged_string())

###

from flair.data import Label

tag: Label = sentence[3].get_tag('ner')

print(
    f'"{sentence[3]}" is tagged as "{tag.value}" with confidence score "{tag.score}"'
)

###

sentence = Sentence('France is the current world cup winner.')

# add a label to a sentence
sentence.add_label('topic', 'sports')

# a sentence can also belong to multiple classes
sentence.add_label('topic', 'world cup')

print(sentence)
for label in sentence.labels:
    print(label)
def __init__(
    self,
    path_to_file: Union[str, Path],
    column_name_map: Dict[int, str],
    label_type: str = "class",
    max_tokens_per_doc: int = -1,
    max_chars_per_doc: int = -1,
    tokenizer=segtok_tokenizer,
    in_memory: bool = True,
    skip_header: bool = False,
    encoding: str = 'utf-8',
    **fmtparams,
):
    """
    Instantiates a Dataset for text classification from CSV column formatted data

    :param path_to_file: path to the file with the CSV data
    :param column_name_map: a column name map that indicates which column is text and which the label(s)
    :param label_type: name of the label type under which labels are added to each Sentence
    :param max_tokens_per_doc: If set, truncates each Sentence to a maximum number of Tokens
    :param max_chars_per_doc: If set, truncates each Sentence to a maximum number of chars
    :param tokenizer: tokenizer used to split text into Tokens (defaults to segtok)
    :param in_memory: If True, keeps dataset as Sentences in memory, otherwise only keeps strings
    :param skip_header: If True, skips the first line because it is a header
    :param encoding: encoding of the CSV file
    :param fmtparams: additional parameters for the CSV file reader
    """
    if isinstance(path_to_file, str):
        path_to_file = Path(path_to_file)

    assert path_to_file.exists()

    # variables
    self.path_to_file = path_to_file
    self.in_memory = in_memory
    self.tokenizer = tokenizer
    self.column_name_map = column_name_map
    self.max_tokens_per_doc = max_tokens_per_doc
    self.max_chars_per_doc = max_chars_per_doc
    self.label_type = label_type

    # different handling of in_memory data than streaming data
    if self.in_memory:
        self.sentences = []
    else:
        self.raw_data = []

    self.total_sentence_count: int = 0

    # most data sets have the token text in the first column, if not, pass 'text' as column
    self.text_columns: List[int] = []
    for column in column_name_map:
        if column_name_map[column] == "text":
            self.text_columns.append(column)

    with open(self.path_to_file, encoding=encoding) as csv_file:

        csv_reader = csv.reader(csv_file, **fmtparams)

        if skip_header:
            next(csv_reader, None)  # skip the header row

        for row in csv_reader:

            # test if format is OK
            wrong_format = False
            for text_column in self.text_columns:
                if text_column >= len(row):
                    wrong_format = True

            if wrong_format:
                continue

            # test if at least one label is given
            has_label = False
            for column in self.column_name_map:
                if self.column_name_map[column].startswith("label") and row[column]:
                    has_label = True
                    break

            if not has_label:
                continue

            if self.in_memory:

                text = " ".join(
                    [row[text_column] for text_column in self.text_columns]
                )

                if self.max_chars_per_doc > 0:
                    text = text[: self.max_chars_per_doc]

                sentence = Sentence(text, use_tokenizer=self.tokenizer)

                for column in self.column_name_map:
                    if (
                        self.column_name_map[column].startswith("label")
                        and row[column]
                    ):
                        sentence.add_label(label_type, row[column])

                if 0 < self.max_tokens_per_doc < len(sentence):
                    sentence.tokens = sentence.tokens[: self.max_tokens_per_doc]

                self.sentences.append(sentence)

            else:
                self.raw_data.append(row)

            self.total_sentence_count += 1
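# Hedged usage sketch, assuming this class is Flair's CSVClassificationDataset
# and that "data/train.csv" (a placeholder path) holds "label,text" rows with
# a header line.
from flair.datasets import CSVClassificationDataset

dataset = CSVClassificationDataset(
    "data/train.csv",
    column_name_map={0: "label", 1: "text"},
    skip_header=True,
    delimiter=",",  # forwarded to csv.reader via **fmtparams
)
print(len(dataset), dataset[0])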