def load_data(self):
     # Dispatch on the file suffix; fall back to plain JSON parsing with a warning.
     if self.filename.endswith('.json'):
         data = read_json(self.filename)
     elif self.filename.endswith('.jsonl'):
         data = read_jsonline(self.filename)
     else:
         self.logger.warning(
             'File suffix is neither .json nor .jsonl; parsing %s as JSON.',
             self.filename)
         data = read_json(self.filename)
     return data
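
# The read_json / read_jsonline helpers used throughout these snippets are
# project-specific utilities that are not shown in this excerpt. A minimal
# sketch, assuming read_json loads a whole JSON document and read_jsonline
# reads one JSON object per line, could look like this:
import json

def read_json(filename):
    # Load the entire file as a single JSON value (list or dict).
    with open(filename, encoding='utf-8') as f:
        return json.load(f)

def read_jsonline(filename):
    # Load one JSON object per non-empty line (.jsonl format).
    with open(filename, encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]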
 def __transform(self):
     sents = []
     labels = []
     seq_lengths = []
     input_sents = read_json(self.__filename)
     # A single shuffle already yields a uniformly random sentence order.
     random.shuffle(input_sents)
     for sent in input_sents:
         sent_words = [t['text'] for t in sent['tokens']]
         sent_labels = sent['labels']
         mapped_words = [self.__word2id_mapper[word] for word in sent_words]
         mapped_labels = [
             self.__label2id_mapper[label] for label in sent_labels
         ]
         if len(mapped_words) >= self.__sent_padding_length:
             # Truncate sentences longer than the fixed padding length.
             mapped_words = mapped_words[:self.__sent_padding_length]
             mapped_labels = mapped_labels[:self.__sent_padding_length]
         else:
             # Pad shorter sentences up to the fixed length; labels are padded with 0.
             pad_idx = self.__word2id_mapper[BATCH_PAD]
             mapped_words += [pad_idx] * (self.__sent_padding_length -
                                          len(mapped_words))
             mapped_labels += [0] * (self.__sent_padding_length -
                                     len(mapped_labels))
         if self.__is_skip_window:
             sents.append(self.__indices2index_windows(mapped_words))
         else:
             sents.append(mapped_words)
         labels.append(mapped_labels)
         # Record the true length, capped at the padding length so it never
         # exceeds the padded arrays.
         seq_lengths.append(min(len(sent_labels), self.__sent_padding_length))
         self.__sent_count += 1
     return np.array(sents), np.array(labels), np.array(seq_lengths)
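
# __indices2index_windows is not part of this excerpt; from its use above it
# appears to expand each padded sentence into per-token context windows. A
# minimal stand-in under that assumption (the window size and pad index are
# hypothetical, not taken from the original code):
def indices2index_windows(mapped_words, half_window=1, pad_idx=0):
    # For token i, collect the indices at positions i-half_window .. i+half_window,
    # substituting pad_idx for positions that fall outside the sentence.
    windows = []
    for i in range(len(mapped_words)):
        window = [mapped_words[i + offset]
                  if 0 <= i + offset < len(mapped_words) else pad_idx
                  for offset in range(-half_window, half_window + 1)]
        windows.append(window)
    return windows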
 def evaluate(self, pred_filename):
     pred_data = read_json(pred_filename)
     kfold_counter = {}
     for i in range(self.k):
         single_fold_pred_data = pred_data[i]
         ret = self.__evaluators[i].evaluate(single_fold_pred_data,
                                             is_percentage=False)
         for metrics, e_counter in ret.items():
             for e_type, val in e_counter.items():
                 # One column per 'metric-entity_type', one row per fold.
                 kfold_counter.setdefault(metrics + '-' + e_type,
                                          {})[str(i)] = val
     kfold_counter = pd.DataFrame(kfold_counter)
     # Sum each metric over the k folds (rows are folds, columns are metrics).
     counter_sum = kfold_counter.sum(axis=0)
     macro_row = {}
     micro_row = {}
     for e in self.__entity_types:
         micro_precision = counter_sum['true_positive_count-' +
                                       e] / counter_sum['pred_count-' + e]
         micro_recall = counter_sum['true_positive_count-' +
                                    e] / counter_sum['true_count-' + e]
         micro_f1 = 2 * micro_precision * micro_recall / (micro_precision +
                                                          micro_recall)
         micro_row['precision-' + e] = micro_precision
         micro_row['recall-' + e] = micro_recall
         micro_row['f1-' + e] = micro_f1
         macro_row['precision-' +
                   e] = counter_sum['precision-' + e] / self.k
         macro_row['recall-' + e] = counter_sum['recall-' + e] / self.k
         macro_row['f1-' + e] = counter_sum['f1-' + e] / self.k
     # Attach the macro- and micro-averaged rows; DataFrame.append is not
     # in-place (and has been removed in recent pandas), so use pd.concat.
     macro_micro = pd.DataFrame([macro_row, micro_row], index=['macro', 'micro'])
     kfold_counter = pd.concat([kfold_counter, macro_micro])
     for e in self.__entity_types:
         col_names = ['true_positive_count-' + e, 'pred_count-' + e]
         kfold_counter.drop(col_names, axis='columns', inplace=True)
     return kfold_counter
 def __get_data(self, data):
     if data is None:
         return data
     elif isinstance(data, str):
         return read_json(data)
     elif isinstance(data, list):
         return data
     else:
         raise TypeError(
             'input data type {} is invalid in evaluation'.format(
                 type(data)))
 def __init__(self, k, true_filename, entity_types=ENTITY_TYPES):
     self.k = k
     self.__entity_types = entity_types
     evaluators = []
     true_data = read_json(true_filename)
     if k != len(true_data):
         raise ValueError('k does not match the number of folds in the true data.')
     for single_fold_true_data in true_data:
         evaluator = EntityEvaluator(single_fold_true_data,
                                     entity_types=entity_types)
         evaluators.append(evaluator)
     self.__evaluators = evaluators
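
# Hedged usage sketch: the class name KFoldEntityEvaluator and the file names
# below are assumptions, not taken from the original source. The true file is
# expected to hold one list of gold sentences per fold, and the prediction
# file one list of predicted sentences per fold.
def example_kfold_evaluation():
    evaluator = KFoldEntityEvaluator(k=5, true_filename='true_folds.json')
    report = evaluator.evaluate('pred_folds.json')
    print(report)  # per-fold counters plus the macro- and micro-averaged rows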
 @classmethod
 def from_json(cls, filename):
     # Alternative constructor: build an instance from keyword arguments
     # stored in a JSON file.
     return cls(**read_json(filename))
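
# Hedged usage sketch: assuming the JSON config simply stores the constructor's
# keyword arguments, e.g. {"k": 5, "true_filename": "true_folds.json"}, the
# alternative constructor can be used as follows (class name and path are
# hypothetical):
def example_from_json():
    return KFoldEntityEvaluator.from_json('eval_config.json')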