def setup_class(cls):
    """Build a small two-part sample dataset used by the tests in this class."""
    cls.dataset = Dataset()

    # Part 1: a sentence containing a DNA-level (c.A100G) and a
    # protein-level (p.V100Q) mutation mention.
    part = Part('some text c.A100G p.V100Q some text')
    token_specs = [
        ('some', 0), ('text', 5), ('c', 10), ('.', 11), ('A', 12),
        ('100', 13), ('G', 16), ('p', 18), ('.', 19), ('V', 20),
        ('100', 21), ('Q', 24), ('some', 26), ('text', 31),
    ]
    part.sentences = [[Token(word, offset) for word, offset in token_specs]]
    predicted = ['O', 'O', 'B', 'I', 'I', 'I', 'E',
                 'A', 'I', 'I', 'I', 'E', 'O', 'O']
    for token, value in zip(part.sentences[0], predicted):
        token.predicted_labels = [Label(value)]
    cls.dataset.documents['doc_1'] = Document()
    cls.dataset.documents['doc_1'].parts['p1'] = part

    # Part 2: an edge case where the mention (A927B) sits next to a keyword.
    part = Part('test edge case DNA A927B test')
    token_specs = [
        ('test', 0), ('edge', 5), ('case', 10), ('DNA', 15),
        ('A', 19), ('927', 20), ('B', 23), ('test', 25),
    ]
    part.sentences = [[Token(word, offset) for word, offset in token_specs]]
    predicted = ['O', 'O', 'O', 'O', 'M', 'P', 'M', 'O']
    for token, value in zip(part.sentences[0], predicted):
        token.predicted_labels = [Label(value)]
    cls.dataset.documents['doc_1'].parts['p2'] = part
def tag(data, model_file, class_id):
    """
    Tag every sentence in ``data`` with a CRF model loaded from ``model_file``,
    storing per-token predictions (label + marginal probability) and then
    forming mention-level predicted annotations for ``class_id``.

    :type data: nalaf.structures.data.Dataset
    :type model_file: str
    :type class_id: str

    .. deprecated:: use the non-static ``annotate`` method instead.
    """
    # BUG FIX: this string was previously placed *after* the warnings.warn()
    # call, which made it a dead string literal rather than a docstring.
    warnings.warn('Use non-static `annotate` instead', DeprecationWarning)

    tagger = pycrfsuite.Tagger()
    try:
        tagger.open(model_file)
        for sentence in data.sentences():
            labels = tagger.tag(
                pycrfsuite.ItemSequence(token.features for token in sentence))
            for token_index in range(len(sentence)):
                label = labels[token_index]
                sentence[token_index].predicted_labels = [
                    Label(label, tagger.marginal(label, token_index))
                ]
        data.form_predicted_annotations(class_id)
    finally:
        # Always release the model handle, even if tagging fails.
        tagger.close()
def read_predictions(self, dataset, class_id, prediction_file='output.txt'):
    """
    :type dataset: nalaf.structures.data.Dataset

    Reads in the predictions made by our model for each token
    and stores them into token.predicted_labels[].

    Requires a dataset object and the output prediction file.
    The default output prediction file is 'output.txt'. The format is:

    * [predicted label]:[marginal probability]
    * in new line for each token
    * followed by a blank line for the end of the sentence

    IMPORTANT NOTE:
    Assumes a call to the test() function was made previously with the '-i'
    option included. Furthermore, it assumes we are calling it with the same
    dataset object used to create the test file.

    For example first we would call:
    * crf.create_input_file(dataset=test, mode='test')
    * crf.test(options='-m example_entity_model -i test > output.txt')
    Then we would call:
    * crf.read_predictions(dataset=test)
    """
    # FIX: open the file via its full path instead of os.chdir(), which
    # mutated the process-global working directory as a side effect.
    with open(os.path.join(self.directory, prediction_file)) as file:
        for sentence in dataset.sentences():
            for token in sentence:
                # FIX: rsplit with maxsplit=1 so a ':' inside the label
                # text itself cannot break the unpacking.
                label, probability = file.readline().rsplit(':', 1)
                token.predicted_labels = [Label(label, float(probability))]

            file.readline()  # skip the empty line signifying new sentence

    # call form_predicted_annotations() to populate the mention level predictions
    dataset.form_predicted_annotations(class_id)
def annotate(self, corpus, class_id):
    """
    Tag every sentence of the corpus with this instance's CRF tagger and
    form the mention-level predicted annotations.

    :type corpus: nalaf.structures.data.Dataset
    :type class_id: str ~ to annotate with
    """
    for sentence in corpus.sentences():
        features = pycrfsuite.ItemSequence(tok.features for tok in sentence)
        for index, predicted in enumerate(self.tagger.tag(features)):
            confidence = self.tagger.marginal(predicted, index)
            sentence[index].predicted_labels = [Label(predicted, confidence)]

    corpus.form_predicted_annotations(class_id)
def label(self, dataset):
    """
    Assign gold-standard labels to every token: 'O' by default, or
    'I-<class_id>' when the token lies fully inside an annotation span.

    :type dataset: nalaf.structures.data.Dataset
    """
    for part in dataset.parts():
        for sentence in part.sentences:
            for token in sentence:
                token.original_labels = [Label('O')]

                for ann in part.annotations:
                    ann_start = ann.offset
                    ann_end = ann.offset + len(ann.text)
                    inside = (ann_start <= token.start
                              and token.start < token.end
                              and token.end <= ann_end)
                    if inside:
                        token.original_labels[0].value = 'I-{}'.format(ann.class_id)
def annotate(self, corpus, class_id):
    """
    Tag every sentence of the corpus with this instance's CRF tagger and
    form the mention-level predicted annotations.

    :type corpus: nalaf.structures.data.Dataset
    :type class_id: str ~ to annotate with
    """
    for sentence in corpus.sentences():
        labels = self.tagger.tag(
            pycrfsuite.ItemSequence(token.features for token in sentence))
        for token_index in range(len(sentence)):
            label = labels[token_index]
            try:
                sentence[token_index].predicted_labels = [
                    Label(label, self.tagger.marginal(label, token_index))
                ]
            except Exception as e:
                # FIX: corrected typo ("assining") and chained the original
                # exception with `from e` so the root-cause traceback survives.
                raise Exception(
                    "Exception when assigning the predicted labels; "
                    "likely a Multi-Thread problem", e) from e

    corpus.form_predicted_annotations(class_id)
def label(self, dataset):
    """
    :type dataset: nalaf.structures.data.Dataset

    Assigns TmVar-style gold labels to every token. Each token defaults to
    'O'; tokens overlapping a mutation annotation (class == self.mut_class_id)
    are labeled via self._match_regex_label, with two post-passes:
    1. tokens left with the temporary '*' label are alternated W/M,
       resetting to W at each annotation boundary;
    2. a 3-token sliding window rewrites the pattern 'P I P' to 'P P P'.
    """
    for part in dataset.parts():
        previous_token = None
        for sentence in part.sentences:
            # alternation state for '*' placeholders; starts fresh per sentence
            alternate = 'W'
            for token in sentence:
                token.original_labels = [Label('O')]
                for ann in part.annotations:
                    start = ann.offset
                    end = ann.offset + len(ann.text)
                    # token begins exactly at, or strictly inside, the annotation
                    if start == token.start or start < token.start < end:
                        if ann.class_id == self.mut_class_id:
                            # delegate regex-based labeling; may set a temporary '*'
                            self._match_regex_label(previous_token, token)
                            previous_token = token
                            # replace temporary label with W or M
                            if token.original_labels[0].value == '*':
                                token.original_labels[0].value = alternate
                                alternate = 'W' if alternate == 'M' else 'M'
                            # reset the alternation to W since we reached end
                            if token.end == end:
                                alternate = 'W'
                        # NOTE(review): break sits at this level, so only the
                        # first overlapping annotation is considered per token
                        break
            # iterate a sliding window of 3
            # when you find 'P I P' labels replace them with 'P P P'
            # (NOTE: `next` shadows the builtin here; kept as-is)
            for previous, current, next in zip(sentence, sentence[1:], sentence[2:]):
                if previous.original_labels[0].value == 'P' and next.original_labels[0].value == 'P':
                    if current.original_labels[0].value == 'I':
                        current.original_labels[0].value = 'P'