def evaluate_block(self, data_iter: DataLoader, docs: Sequence[Document]) -> float:
    """
    Decode ``docs`` and score the predictions against the gold labels.

    :param data_iter: batches forwarded to ``self.decode_block``.
    :param docs: documents whose sentences hold gold labels under
        ``to_gold(self.key)`` and predictions under ``self.key``
        (presumably written by ``self.decode_block`` -- confirm there).
    :return: the second element of the metric's ``get()`` result; the metric
        is ``ChunkF1`` when ``self.chunking`` is truthy, ``Accuracy`` otherwise.
    """
    self.decode_block(data_iter=data_iter, docs=docs)

    def all_sentences():
        # Lazily walk every sentence of every document, in order.
        for document in docs:
            yield from document.sentences

    if self.chunking:
        # The chunk metric consumes the raw label sequences.
        metric = ChunkF1()
        for sentence in all_sentences():
            metric.update(labels=sentence[to_gold(self.key)],
                          preds=sentence[self.key])
    else:
        # The accuracy metric consumes arrays of class ids.
        metric = Accuracy()

        def to_ids(tags):
            return nd.array([self.label_map.cid(tag) for tag in tags])

        for sentence in all_sentences():
            gold_ids = to_ids(sentence[to_gold(self.key)])
            pred_ids = to_ids(sentence[self.key])
            metric.update(labels=gold_ids, preds=pred_ids)

    return metric.get()[1]
def json_reader(filepath: str, cols: Any = None, key: str = None) -> Tuple[List[Document], LabelMap]:
    """
    Read documents from a JSON file and collect their gold labels.

    The file must hold a list of documents; each document is a mapping with a
    ``'sens'`` list, and each sentence is a mapping with at least a ``'tok'``
    entry.

    :param filepath: path to an existing file whose name ends with ``.json``.
    :param cols: not used by this reader.
    :param key: name of the label field; when given, the field is renamed to
        ``to_gold(key)`` in a copy of every sentence.
    :return: the documents (each tagged with its index under ``DOC_ID``) and
        the label map filled with every label found under ``to_gold(key)``.
    :raises ValueError: if ``filepath`` is not an existing ``.json`` file.
    """
    # TODO: update this to accept any format (see tsv_reader)
    documents = []
    wc = sc = 0
    label_map = LabelMap()

    logging.info('Reading json file from: ')
    # The path must be BOTH an existing file and a .json file. The previous
    # check (`not isfile and endswith`) only rejected missing .json paths.
    if not (os.path.isfile(filepath) and filepath.endswith('.json')):
        raise ValueError("{} is not a valid format".format(filepath))
    logging.info('- filepath: %s', filepath)

    with open(filepath) as f:
        docs = json.load(f)

    for doc_id, doc in enumerate(docs):
        sentences = []
        for sen in doc['sens']:
            wc += len(sen['tok'])
            if key is not None:
                # Copy first so the parsed JSON mapping is not mutated.
                sen = sen.copy()
                sen[to_gold(key)] = sen.pop(key)
            sentences.append(Sentence(sen))
        sc += len(sentences)

        # NOTE(review): labels are read under to_gold(key) even when key is
        # None -- confirm callers always pass a key.
        for sentence in sentences:
            for label in sentence[to_gold(key)]:
                label_map.add(label)

        document = Document(sens=sentences)
        document[DOC_ID] = doc_id
        documents.append(document)

    logging.info('- dc = %d, sc = %d, wc = %d', len(documents), sc, wc)
    return documents, label_map
def tsv_reader(tsv_directory: str, cols: Dict[str, int], key: str = None) -> Tuple[List[Document], LabelMap]:
    """
    Read every regular file directly under ``tsv_directory`` as one document.

    Each non-empty row contributes one value per configured column to the
    current sentence; a whitespace-only row closes the sentence. Rows that
    start with ``#`` are skipped.

    :param tsv_directory: directory whose files are read, in sorted path
        order so that document ids are reproducible.
    :param cols: maps a field name to its column index; must contain ``TOK``.
    :param key: name of the label field in ``cols``; when given, the field is
        stored under ``to_gold(key)`` instead (``cols`` itself is not mutated).
    :return: the documents (sentences tagged with ``SID``, documents tagged
        with ``DOC_ID``) and the label map filled from ``to_gold(key)``.
    :raises ValueError: if ``TOK`` is missing from ``cols``, or ``key`` is
        given but is not a key of ``cols``.
    """
    documents = []
    wc = sc = 0
    label_map = LabelMap()

    if TOK not in cols:
        raise ValueError('The column index of "%s" must be specified' % TOK)

    if key is not None:
        if key not in cols:
            raise ValueError('Key mismatch: %s is not a key in %s' % (key, str(cols)))
        # Work on a copy so the caller's mapping keeps its original key.
        cols = cols.copy()
        cols[to_gold(key)] = cols.pop(key)

    logging.info('Reading tsv from:')
    logging.info('- directory: %s', tsv_directory)

    # glob returns paths in arbitrary, file-system dependent order; sort them
    # so DOC_ID assignment is the same on every run and platform.
    for filename in sorted(glob.glob('{}/*'.format(tsv_directory))):
        # avoid reading unexpected files, such as hidden files.
        if not os.path.isfile(filename):
            continue
        logging.info(' - file: %s', filename)

        sentences = []
        fields = {name: [] for name in cols}

        with open(filename) as fin:
            for line in fin:
                # NOTE(review): this also skips a data row whose first column
                # begins with '#' -- confirm that cannot occur in the data.
                if line.startswith('#'):
                    continue
                values = line.split()

                if values:
                    for name, column in fields.items():
                        column.append(values[cols[name]])
                elif fields[TOK]:
                    # Blank row: close the sentence and start a new one.
                    wc += len(fields[TOK])
                    sentences.append(Sentence(fields))
                    fields = {name: [] for name in cols}

        # Flush the last sentence when the file does not end with a blank row.
        if fields[TOK]:
            wc += len(fields[TOK])
            sentences.append(Sentence(fields))

        # NOTE(review): labels are read under to_gold(key) even when key is
        # None -- confirm callers always pass a key.
        for sentence in sentences:
            for label in sentence[to_gold(key)]:
                label_map.add(label)

        for sid, sentence in enumerate(sentences):
            sentence.update({SID: sid})

        sc += len(sentences)
        documents.append(Document(sens=sentences))

    for doc_id, document in enumerate(documents):
        document.update({DOC_ID: doc_id})

    logging.info('- dc = %d, sc = %d, wc = %d', len(documents), sc, wc)
    return documents, label_map
def extract(self, did, sid, sen, **kwargs):
    """
    Append one ``(x, y)`` pair per position of ``sen`` to ``self._data``.

    :param did: not used by this method.
    :param sid: not used by this method.
    :param sen: the sentence to extract from.
    :param kwargs: not used by this method.

    ``x`` is ``self.extract_x(position, features)`` where ``features`` is the
    result of ``self.extract_sen(sen)``. When ``self.label`` is truthy, ``y``
    is ``self.extract_y`` applied to each gold label under
    ``to_gold(self.key)``; otherwise ``y`` is ``-1`` for every position.
    """
    features = self.extract_sen(sen)

    if not self.label:
        # Unlabeled data: -1 stands in for the missing target.
        for position in range(len(sen)):
            self._data.append((self.extract_x(position, features), -1))
        return

    for position, gold in enumerate(sen[to_gold(self.key)]):
        self._data.append(
            (self.extract_x(position, features), self.extract_y(gold)))
def extract_labels(self, sen):
    """
    Build the label array for ``sen``.

    :param sen: the sentence to read labels from.
    :return: an ``nd.array`` of ``self.label_map.cid`` ids for the labels
        under ``to_gold(self.key)`` when ``self.label`` is truthy; otherwise
        an ``nd.array`` holding ``-1`` once per position of ``sen``.
    """
    if not self.label:
        # Unlabeled data: one -1 placeholder per position.
        return nd.array([-1] * len(sen))

    gold_labels = sen[to_gold(self.key)]
    class_ids = [self.label_map.cid(gold) for gold in gold_labels]
    return nd.array(class_ids)