Beispiel #1
0
    def evaluate_block(self, data_iter: DataLoader,
                       docs: Sequence[Document]) -> float:
        """
        Decode the given documents, then score predictions against gold labels.

        :param data_iter: data loader handed through to ``decode_block``.
        :param docs: documents whose sentences carry gold labels under
                     ``to_gold(self.key)`` and predictions under ``self.key``
                     (presumably written by ``decode_block`` -- confirm).
        :return: element 1 of the metric's ``get()`` result; the metric is
                 ``ChunkF1`` when ``self.chunking`` is truthy, otherwise
                 ``Accuracy`` computed over label ids.
        """
        self.decode_block(data_iter=data_iter, docs=docs)

        # Lazily flatten documents into one stream of sentences.
        sentences = (sen for doc in docs for sen in doc.sentences)

        if not self.chunking:
            metric = Accuracy()

            def encode(tags):
                # Accuracy is fed label ids, so map each label through the
                # label map before wrapping it in an array.
                return nd.array([self.label_map.cid(tag) for tag in tags])

            for sen in sentences:
                gold_ids = encode(sen[to_gold(self.key)])
                pred_ids = encode(sen[self.key])
                metric.update(labels=gold_ids, preds=pred_ids)
        else:
            # ChunkF1 receives the raw label sequences, not ids.
            metric = ChunkF1()
            for sen in sentences:
                metric.update(labels=sen[to_gold(self.key)],
                              preds=sen[self.key])

        result = metric.get()
        return result[1]
Beispiel #2
0
def json_reader(filepath: str,
                cols: Any = None,
                key: str = None) -> Tuple[List[Document], LabelMap]:
    """
    Read documents from a JSON file and collect their gold labels.

    The file must contain a list of documents; each document is a mapping
    with a ``'sens'`` list, and each sentence is a mapping that has at least
    a ``'tok'`` entry.

    :param filepath: path to an existing file whose name ends in ``.json``.
    :param cols: not used by this reader (presumably kept so the signature
                 matches ``tsv_reader`` -- confirm).
    :param key: name of the sentence field holding the labels. When given,
                the field is renamed to ``to_gold(key)`` in a copy of the
                sentence and its values are added to the label map. When
                ``None``, sentences are left as-is and no labels are
                collected.
    :return: the documents (each tagged with ``DOC_ID`` set to its index in
             the file) and the label map built from the gold labels.
    :raises ValueError: if ``filepath`` is not an existing ``.json`` file.
    """
    # TODO: update this to accept any format (see tsv_reader)
    logging.info('Reading json file from: ')
    # Both conditions must hold; the parentheses matter because `not` binds
    # tighter than `and`.
    if not (os.path.isfile(filepath) and filepath.endswith('.json')):
        raise ValueError("{} is not a valid format".format(filepath))
    logging.info('- filepath: %s', filepath)

    documents = []
    wc = sc = 0
    label_map = LabelMap()
    gold_key = to_gold(key) if key is not None else None

    # JSON text is UTF-8 by specification; do not rely on the locale default.
    with open(filepath, encoding='utf-8') as f:
        docs = json.load(f)

    for doc_id, doc in enumerate(docs):
        sentences = []
        for sen in doc['sens']:
            wc += len(sen['tok'])
            if gold_key is not None:
                # Copy first so the parsed JSON structure is not mutated.
                sen = sen.copy()
                sen[gold_key] = sen.pop(key)
            sentences.append(Sentence(sen))
        sc += len(sentences)

        # Without a key there is no gold field to harvest labels from.
        if gold_key is not None:
            for sent in sentences:
                for label in sent[gold_key]:
                    label_map.add(label)

        document = Document(sens=sentences)
        document[DOC_ID] = doc_id
        documents.append(document)

    # Count documents once at the end; accumulating len(documents) per
    # iteration would report 1 + 2 + ... + n instead of n.
    dc = len(documents)
    logging.info('- dc = %d, sc = %d, wc = %d', dc, sc, wc)
    return documents, label_map
Beispiel #3
0
def tsv_reader(tsv_directory: str,
               cols: Dict[str, int],
               key: str = None) -> Tuple[List[Document], LabelMap]:
    """
    Read every regular file in a directory as a column-formatted document.

    Each non-empty line is split on whitespace and contributes one value per
    configured column; a blank line ends the current sentence. Lines starting
    with ``#`` are skipped. Each file becomes one ``Document``.

    :param tsv_directory: directory whose direct entries are read.
    :param cols: maps field name to column index; must contain ``TOK``.
    :param key: name of the field in ``cols`` holding the labels. When given,
                the field is stored under ``to_gold(key)`` and its values are
                added to the label map. When ``None``, no labels are
                collected.
    :return: the documents (each tagged with ``DOC_ID``, their sentences
             tagged with ``SID``) and the label map built from gold labels.
    :raises ValueError: if ``TOK`` is missing from ``cols`` or ``key`` is
                        given but is not a key of ``cols``.
    """
    documents = []
    wc = sc = 0
    label_map = LabelMap()

    if TOK not in cols:
        raise ValueError('The column index of "%s" must be specified' % TOK)

    gold_key = None
    if key is not None:
        if key not in cols:
            raise ValueError('Key mismatch: %s is not a key in %s' % (key, str(cols)))
        gold_key = to_gold(key)
        # Work on a copy so the caller's mapping is not modified.
        cols = cols.copy()
        cols[gold_key] = cols.pop(key)

    logging.info('Reading tsv from:')
    logging.info('- directory: %s', tsv_directory)

    # glob returns entries in arbitrary order; sorting keeps DOC_ID and the
    # label insertion order reproducible across runs and platforms.
    for filename in sorted(glob.glob('{}/*'.format(tsv_directory))):
        # The pattern also matches sub-directories; only regular files are read.
        if not os.path.isfile(filename):
            continue
        logging.info('  - file: %s', filename)

        sentences = []
        fields = {k: [] for k in cols}

        with open(filename) as fin:
            # Stream the file line by line instead of loading it whole.
            for line in fin:
                if line.startswith('#'):
                    continue
                values = line.split()

                if values:
                    for k, column in fields.items():
                        column.append(values[cols[k]])
                elif fields[TOK]:
                    # Blank line after at least one token: sentence boundary.
                    wc += len(fields[TOK])
                    sentences.append(Sentence(fields))
                    fields = {k: [] for k in cols}

            # Flush the last sentence when the file lacks a trailing blank line.
            if fields[TOK]:
                wc += len(fields[TOK])
                sentences.append(Sentence(fields))

        for sid, sent in enumerate(sentences):
            # Without a key there is no gold field to harvest labels from.
            if gold_key is not None:
                for label in sent[gold_key]:
                    label_map.add(label)
            sent.update({SID: sid})

        sc += len(sentences)
        documents.append(Document(sens=sentences))

    for doc_id, document in enumerate(documents):
        document.update({DOC_ID: doc_id})

    logging.info('- dc = %d, sc = %d, wc = %d', len(documents), sc, wc)
    return documents, label_map
Beispiel #4
0
 def extract(self, did, sid, sen, **kwargs):
     """
     Append one ``(x, y)`` pair per position of ``sen`` to ``self._data``.

     :param did: not used by this method.
     :param sid: not used by this method.
     :param sen: sentence passed to ``extract_sen``; in labeled mode it must
                 also provide the gold labels under ``to_gold(self.key)``.
     :param kwargs: ignored.
     """
     features = self.extract_sen(sen)

     if not self.label:
         # Unlabeled mode: one entry per position, with -1 as the label.
         for idx in range(len(sen)):
             self._data.append((self.extract_x(idx, features), -1))
         return

     gold_labels = sen[to_gold(self.key)]
     for idx, gold in enumerate(gold_labels):
         pair = (self.extract_x(idx, features), self.extract_y(gold))
         self._data.append(pair)
Beispiel #5
0
 def extract_labels(self, sen):
     """
     Build the label array for ``sen``.

     :param sen: sentence; in labeled mode it must provide the gold labels
                 under ``to_gold(self.key)``.
     :return: ``nd.array`` of label ids looked up through
              ``self.label_map.cid`` when ``self.label`` is truthy,
              otherwise an ``nd.array`` holding ``len(sen)`` copies of -1.
     """
     if not self.label:
         # No gold labels available: fill every position with -1.
         return nd.array([-1] * len(sen))

     gold_labels = sen[to_gold(self.key)]
     label_ids = [self.label_map.cid(tag) for tag in gold_labels]
     return nd.array(label_ids)