def json2crf(training_data):
    """Convert JSON-annotated training data to CRFSuite training representation.

    :param training_data: iterable of dicts, each with a "text" string and an
        "entities" list; every entity dict carries character offsets "begin"
        and "end" plus the entity "name".
    :return labeled_examples: list of POS-tagged token tables (one per
        example) where column 2 holds the BIO label ("O" by default,
        "B-<name>" / "I-<name>" over annotated spans).
    """
    from app.nlu.tasks import sentence_tokenize, pos_tag_and_label

    labeled_examples = []
    for example in training_data:
        text = example.get("text")
        # POS tag and initialize the BIO label as 'O' for all tokens.
        tagged_example = pos_tag_and_label(text)
        # Guard against a missing/None "entities" key instead of raising.
        for entity in example.get("entities") or []:
            try:
                begin_index = entity.get("begin")
                end_index = entity.get("end")
                # Count the words that precede the entity so we know which
                # token rows to relabel.
                # NOTE(review): the `- 1` assumes a separator character sits
                # directly before the entity — confirm against the offset
                # convention of the annotation tool.
                prefix = text[0:begin_index - 1]
                prefix_tokens = sentence_tokenize(prefix).split(" ")
                inverse_word_count = len(prefix_tokens)

                # Extract the annotated span and tokenize it the same way.
                selection = text[begin_index:end_index]
                tokens = sentence_tokenize(selection).split(" ")

                # BIO tagging: first entity token gets B-, the rest get I-.
                for i, _token in enumerate(tokens, start=1):
                    tag_prefix = "B-" if i == 1 else "I-"
                    bio = tag_prefix + entity.get("name")
                    tagged_example[(inverse_word_count + i) - 1][2] = bio
            except Exception:
                # Deliberately best-effort: skip entities whose offsets do
                # not line up with the tokenization rather than failing the
                # whole conversion. (Was a bare `except:`, which would also
                # have swallowed SystemExit/KeyboardInterrupt.)
                continue
        labeled_examples.append(tagged_example)
    return labeled_examples
def json2crf(training_data):
    """Convert JSON-annotated training data to CRFSuite training representation.

    NOTE(review): this re-defines ``json2crf`` — an earlier definition with
    the same logic exists above in this file and is shadowed by this one;
    the duplicate should probably be removed.

    :param training_data: iterable of dicts, each with a "text" string and an
        "entities" list; every entity dict carries character offsets "begin"
        and "end" plus the entity "name".
    :return labeled_examples: list of POS-tagged token tables (one per
        example) where column 2 holds the BIO label ("O" by default,
        "B-<name>" / "I-<name>" over annotated spans).
    """
    from app.nlu.tasks import sentence_tokenize, pos_tag_and_label

    labeled_examples = []
    for example in training_data:
        text = example.get("text")
        # POS tag and initialize the BIO label as 'O' for all tokens.
        tagged_example = pos_tag_and_label(text)
        # Guard against a missing/None "entities" key instead of raising.
        for entity in example.get("entities") or []:
            try:
                begin = entity.get("begin")
                end = entity.get("end")
                # Number of words before the entity; determines which token
                # rows receive BIO labels.
                # NOTE(review): the `- 1` assumes a separator character sits
                # directly before the entity — confirm against the offset
                # convention of the annotation tool.
                before = sentence_tokenize(text[0:begin - 1])
                inverse_word_count = len(before.split(" "))

                # Entity value taken from the selection, tokenized the same way.
                selection_tokens = sentence_tokenize(text[begin:end]).split(" ")
                selection_word_count = len(selection_tokens)

                # BIO tagging: B- on the first entity token, I- on the rest.
                for i in range(1, selection_word_count + 1):
                    bio = ("B-" if i == 1 else "I-") + entity.get("name")
                    tagged_example[(inverse_word_count + i) - 1][2] = bio
            except Exception:
                # Deliberately best-effort: skip entities whose offsets do
                # not line up with the tokenization rather than failing the
                # whole conversion. (Was a bare `except:`, which would also
                # have swallowed SystemExit/KeyboardInterrupt.)
                continue
        labeled_examples.append(tagged_example)
    return labeled_examples