Example #1
    def lookup(keyword, cache=None):
        if cache:
            if cache.contains(keyword):
                return cache.get(keyword)
        logger.info('request ' + keyword)
        url = "https://vlsp.hpda.vn/demo/?page=vcl"
        payload = {"word": keyword}
        r = requests.post(url, data=payload)
        soup = BeautifulSoup(r.content, "html.parser")
        senses_data = soup.select("#vcl_content table .sense")
        if len(senses_data) == 0:
            if cache:
                cache.add(keyword, None)
            return None

        w = Word(keyword, senses=[])
        for sense_data in senses_data:
            tags = sense_data.select(".word_description li")
            # morpho_tag = tags[0]
            syntax_tag = tags[1]
            semantic_tag = tags[2]
            syntax_tag = syntax_tag.select("font")
            tag, sub_tag = syntax_tag[0].text, syntax_tag[1].text
            description = " / ".join(
                [item.text for item in semantic_tag.select("font")])
            sense = Sense(tag, sub_tag, description)
            w.add_sense(sense)
        # cache the fully built word once all senses have been collected
        if cache:
            cache.add(keyword, w)
        return w
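The lookup function above only assumes that cache exposes contains, get, and add. Below is a minimal sketch of a dict-backed cache satisfying that interface; the class name DictCache and the example keyword are hypothetical and not part of the original example.

    class DictCache:
        """Hypothetical in-memory cache with the interface lookup() expects."""

        def __init__(self):
            self._store = {}

        def contains(self, keyword):
            return keyword in self._store

        def get(self, keyword):
            return self._store.get(keyword)

        def add(self, keyword, word):
            self._store[keyword] = word

    # usage sketch: repeated lookups for the same keyword are served from the cache
    cache = DictCache()
    word = lookup("nhà", cache=cache)
    word_again = lookup("nhà", cache=cache)  # no second HTTP request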
Example #2
def load_big_file(f: str) -> mmap.mmap:
    r"""
    Workaround for loading a big pickle file. Files over 2GB cause pickle errors on certain Mac and Windows distributions.
    Source: flairNLP
    """
    logger.info(f"loading file {f}")
    with open(f, "rb") as f_in:
        # memory-map the file; mmap is far more memory efficient than reading
        # the whole file, and the mapping stays valid after the file is closed
        bf = mmap.mmap(f_in.fileno(), 0, access=mmap.ACCESS_READ)
    return bf
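Since mmap objects support read and readline, the returned mapping can be passed directly to pickle.load. A minimal usage sketch, assuming a hypothetical pickle file path:

    import pickle

    # unpickle a large object through the memory-mapped file
    state = pickle.load(load_big_file("big_model.pkl"))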
Example #3
    def evaluate(self, data, buckets=8, batch_size=5000, **kwargs):
        args = self.args.update(locals())

        self.transform.train()
        logger.info("Loading the data")
        dataset = Dataset(self.transform, data)
        dataset.build(args.batch_size, args.buckets)
        logger.info(f"\n{dataset}")

        logger.info("Evaluating the dataset")
        start = datetime.now()
        loss, metric = self._evaluate(dataset.loader)
        elapsed = datetime.now() - start
        logger.info(f"loss: {loss:.4f} - {metric}")
        logger.info(
            f"{elapsed}s elapsed, {len(dataset) / elapsed.total_seconds():.2f} Sents/s"
        )

        return loss, metric
Example #4
    def predict(self,
                data,
                pred=None,
                buckets=8,
                batch_size=5000,
                prob=False,
                **kwargs):
        args = self.args.update(locals())
        if not args:
            # fallback: build a simple attribute container from the call
            # arguments when no shared config object is available
            params = dict(locals())
            params.update(kwargs)
            args = type('Args', (object,), params)

        self.transform.eval()
        if args.prob:
            self.transform.append(Field('probs'))

        logger.info("Loading the data")
        dataset = Dataset(self.transform, data)
        dataset.build(args.batch_size, args.buckets)
        logger.info(f"\n{dataset}")

        logger.info("Making predictions on the dataset")
        start = datetime.now()
        preds = self._predict(dataset.loader)
        elapsed = datetime.now() - start

        for name, value in preds.items():
            setattr(dataset, name, value)
        if pred is not None:
            logger.info(f"Saving predicted results to {pred}")
            self.transform.save(pred, dataset.sentences)
        logger.info(
            f"{elapsed}s elapsed, {len(dataset) / elapsed.total_seconds():.2f} Sents/s"
        )

        return dataset
Example #5
def get_from_cache(url: str, cache_dir: Path = None) -> Path:
    """
    Given a URL, look for the corresponding dataset in the local cache.
    If it's not there, download it. Then return the path to the cached file.
    """
    cache_dir.mkdir(parents=True, exist_ok=True)

    filename = re.sub(r'.+/', '', url)
    # get cache path to put the file
    cache_path = cache_dir / filename
    if cache_path.exists():
        return cache_path

    # make HEAD request to check ETag
    response = requests.head(url)

    # (anhv: 27/12/2020) github release assets return 302
    if response.status_code not in [200, 302]:
        if "www.dropbox.com" in url:
            # Dropbox returns code 301, so we ignore this error
            pass
        else:
            raise IOError("HEAD request failed for url {}".format(url))

    # add ETag to filename if it exists
    # etag = response.headers.get("ETag")

    if not cache_path.exists():
        # Download to temporary file, then copy to cache dir once finished.
        # Otherwise you get corrupt cache entries if the download gets interrupted.
        fd, temp_filename = tempfile.mkstemp()
        logger.info("%s not found in cache, downloading to %s", url,
                    temp_filename)

        # GET file object
        req = requests.get(url, stream=True)
        content_length = req.headers.get('Content-Length')
        total = int(content_length) if content_length is not None else None
        progress = Tqdm.tqdm(unit="B", total=total)
        with open(temp_filename, 'wb') as temp_file:
            for chunk in req.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    progress.update(len(chunk))
                    temp_file.write(chunk)

        progress.close()

        logger.info("copying %s to cache at %s", temp_filename, cache_path)
        shutil.copyfile(temp_filename, str(cache_path))
        logger.info("removing temp file %s", temp_filename)
        os.close(fd)
        os.remove(temp_filename)

    return cache_path
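A minimal usage sketch for get_from_cache, assuming a hypothetical download URL and a cache directory under the user's home directory:

    from pathlib import Path

    # the file is downloaded on the first call and reused from the cache afterwards
    cache_dir = Path.home() / ".cache" / "demo_models"
    model_path = get_from_cache("https://example.com/models/model.zip", cache_dir=cache_dir)
    print(model_path)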
Example #6
def make_ud_file(file):
    logger.info(msg=file)
    sentences = []
    # read the cleaned file line by line and close it when done
    with open(join(CLEANED_FOLDER, file)) as f_in:
        for i, line in enumerate(f_in, start=1):
            s = UDSentence.load_from_raw_text(line)
            sentences.append(s)
            if i % 200 == 0:
                logger.info(f"{file}:{i}")
    ud_dataset = UDDataset(sentences)
    ud_dataset.write(join(UD_FOLDER, file))
    logger.info(f"Finish {file}")
Example #7
    def train(self,
              base_path: Union[Path, str],
              fix_len=20,
              min_freq=2,
              buckets=32,
              batch_size=5000,
              lr=2e-3,
              mu=.9,
              nu=.9,
              epsilon=1e-12,
              clip=5.0,
              decay=.75,
              decay_steps=5000,
              patience=100,
              max_epochs=10):
        r"""
        Train any class that implement model interface

        Args:
            base_path (object): Main path to which all output during training is logged and models are saved
            max_epochs: Maximum number of epochs to train. Terminates training if this number is surpassed.
            patience:
            decay_steps:
            decay:
            clip:
            epsilon:
            nu:
            mu:
            lr:
            proj:
            tree:
            batch_size:
            buckets:
            min_freq:
            fix_len:


        """
        ################################################################################################################
        # BUILD
        ################################################################################################################
        args = Config()
        args.feat = self.parser.feat
        args.embed = self.parser.embed
        os.makedirs(os.path.dirname(base_path), exist_ok=True)
        logger.info("Building the fields")
        WORD = Field('words', pad=pad, unk=unk, bos=bos, lower=True)
        if args.feat == 'char':
            FEAT = SubwordField('chars',
                                pad=pad,
                                unk=unk,
                                bos=bos,
                                fix_len=fix_len)
        elif args.feat == 'bert':
            from transformers import AutoTokenizer
            tokenizer = AutoTokenizer.from_pretrained(args.bert)
            args.max_len = min(args.max_len or tokenizer.max_len,
                               tokenizer.max_len)
            FEAT = SubwordField('bert',
                                pad=tokenizer.pad_token,
                                unk=tokenizer.unk_token,
                                bos=tokenizer.bos_token or tokenizer.cls_token,
                                fix_len=fix_len,
                                tokenize=tokenizer.tokenize)
            FEAT.vocab = tokenizer.get_vocab()
        else:
            FEAT = Field('tags', bos=bos)

        ARC = Field('arcs', bos=bos, use_vocab=False, fn=CoNLL.get_arcs)
        REL = Field('rels', bos=bos)
        if args.feat in ('char', 'bert'):
            transform = CoNLL(FORM=(WORD, FEAT), HEAD=ARC, DEPREL=REL)
        else:
            transform = CoNLL(FORM=WORD, CPOS=FEAT, HEAD=ARC, DEPREL=REL)

        train = Dataset(transform, self.corpus.train)
        WORD.build(
            train, min_freq,
            (Embedding.load(args.embed, unk) if self.parser.embed else None))
        FEAT.build(train)
        REL.build(train)
        args.update({
            'n_words': WORD.vocab.n_init,
            'n_feats': len(FEAT.vocab),
            'n_rels': len(REL.vocab),
            'pad_index': WORD.pad_index,
            'unk_index': WORD.unk_index,
            'bos_index': WORD.bos_index,
            'feat_pad_index': FEAT.pad_index,
        })
        parser = DependencyParser(
            n_words=args.n_words,
            n_feats=args.n_feats,
            n_rels=args.n_rels,
            pad_index=args.pad_index,
            unk_index=args.unk_index,
            # bos_index=args.bos_index,
            feat_pad_index=args.feat_pad_index,
            transform=transform)
        word_field_embeddings = self.parser.embeddings[0]
        word_field_embeddings.n_vocab = 1000
        parser.embeddings = self.parser.embeddings
        parser.embeddings[0] = word_field_embeddings
        parser.load_pretrained(WORD.embed).to(device)

        ################################################################################################################
        # TRAIN
        ################################################################################################################
        args = Config()
        parser.transform.train()
        if dist.is_initialized():
            batch_size = batch_size // dist.get_world_size()
        logger.info('Loading the data')
        train = Dataset(parser.transform, self.corpus.train, **args)
        dev = Dataset(parser.transform, self.corpus.dev)
        test = Dataset(parser.transform, self.corpus.test)
        train.build(batch_size, buckets, True, dist.is_initialized())
        dev.build(batch_size, buckets)
        test.build(batch_size, buckets)
        logger.info(
            f"\n{'train:':6} {train}\n{'dev:':6} {dev}\n{'test:':6} {test}\n")
        logger.info(f'{parser}')
        if dist.is_initialized():
            parser = DDP(parser,
                         device_ids=[dist.get_rank()],
                         find_unused_parameters=True)

        optimizer = Adam(parser.parameters(), lr, (mu, nu), epsilon)
        scheduler = ExponentialLR(optimizer, decay**(1 / decay_steps))

        elapsed = timedelta()
        best_e, best_metric = 1, Metric()

        for epoch in range(1, max_epochs + 1):
            start = datetime.now()
            logger.info(f'Epoch {epoch} / {max_epochs}:')

            parser.train()

            loader = train.loader
            bar, metric = progress_bar(loader), AttachmentMetric()
            for words, feats, arcs, rels in bar:
                optimizer.zero_grad()

                mask = words.ne(parser.WORD.pad_index)
                # ignore the first token of each sentence
                mask[:, 0] = 0
                s_arc, s_rel = parser.forward(words, feats)
                loss = parser.forward_loss(s_arc, s_rel, arcs, rels, mask)
                loss.backward()
                nn.utils.clip_grad_norm_(parser.parameters(), clip)
                optimizer.step()
                scheduler.step()

                arc_preds, rel_preds = parser.decode(s_arc, s_rel, mask)
                # ignore all punctuation if not specified
                if not self.parser.args['punct']:
                    mask &= words.unsqueeze(-1).ne(parser.puncts).all(-1)
                metric(arc_preds, rel_preds, arcs, rels, mask)
                bar.set_postfix_str(
                    f'lr: {scheduler.get_last_lr()[0]:.4e} - loss: {loss:.4f} - {metric}'
                )

            loss, dev_metric = parser.evaluate(dev.loader)
            logger.info(f"{'dev:':6} - loss: {loss:.4f} - {dev_metric}")
            loss, test_metric = parser.evaluate(test.loader)
            logger.info(f"{'test:':6} - loss: {loss:.4f} - {test_metric}")

            t = datetime.now() - start
            # save the model if it is the best so far
            if dev_metric > best_metric:
                best_e, best_metric = epoch, dev_metric
                if is_master():
                    parser.save(base_path)
                logger.info(f'{t}s elapsed (saved)\n')
            else:
                logger.info(f'{t}s elapsed\n')
            elapsed += t
            if epoch - best_e >= patience:
                break
        loss, metric = parser.load(base_path).evaluate(test.loader)

        logger.info(f'Epoch {best_e} saved')
        logger.info(f"{'dev:':6} - {best_metric}")
        logger.info(f"{'test:':6} - {metric}")
        logger.info(f'{elapsed}s elapsed, {elapsed / epoch}s/epoch')
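A usage sketch for the train method above, assuming trainer is an instance of the class that defines it and that its corpus and parser attributes have already been set; the path and hyperparameters are illustrative only:

    # hypothetical call with mostly default hyperparameters
    trainer.train(
        base_path="models/vi_dependency_parser/model",
        max_epochs=10,
        batch_size=5000,
        patience=100,
    )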
Example #8
    def predict(
        self,
        data,
        buckets=8,
        batch_size=5000,
        pred=None,
        prob=False,
        tree=True,
        proj=False,
    ):
        r"""
        Args:
            data (list[list] or str):
                The data for prediction, both a list of instances and filename are allowed.
            pred (str):
                If specified, the predicted results will be saved to the file. Default: ``None``.
            buckets (int):
                The number of buckets that sentences are assigned to. Default: 8.
            batch_size (int):
                The number of tokens in each batch. Default: 5000.
            prob (bool):
                If ``True``, outputs the probabilities. Default: ``False``.
            tree (bool):
                If ``True``, ensures that the output trees are well-formed. Default: ``True``.
            proj (bool):
                If ``True``, ensures that the output trees are projective. Default: ``False``.

        Returns:
            A :class:`~underthesea.utils.Dataset` object that stores the predicted results.
        """
        self.transform.eval()
        if prob:
            self.transform.append(Field('probs'))

        logger.info('Loading the data')
        dataset = Dataset(self.transform, data)
        dataset.build(batch_size, buckets)
        logger.info(f'\n{dataset}')

        logger.info('Making predictions on the dataset')
        start = datetime.now()
        loader = dataset.loader
        self.eval()

        arcs, rels, probs = [], [], []
        for words, feats in progress_bar(loader):
            mask = words.ne(self.WORD.pad_index)
            # ignore the first token of each sentence
            mask[:, 0] = 0
            lens = mask.sum(1).tolist()
            s_arc, s_rel = self.forward(words, feats)
            arc_preds, rel_preds = self.decode(s_arc, s_rel, mask, tree, proj)
            arcs.extend(arc_preds[mask].split(lens))
            rels.extend(rel_preds[mask].split(lens))
            if prob:
                arc_probs = s_arc.softmax(-1)
                probs.extend([
                    prob[1:i + 1, :i + 1].cpu()
                    for i, prob in zip(lens, arc_probs.unbind())
                ])
        arcs = [seq.tolist() for seq in arcs]
        rels = [self.REL.vocab[seq.tolist()] for seq in rels]
        preds = {'arcs': arcs, 'rels': rels}
        if prob:
            preds['probs'] = probs

        elapsed = datetime.now() - start

        for name, value in preds.items():
            setattr(dataset, name, value)
        if pred is not None:
            logger.info(f'Saving predicted results to {pred}')
            self.transform.save(pred, dataset.sentences)
        logger.info(
            f'{elapsed}s elapsed, {len(dataset) / elapsed.total_seconds():.2f} Sents/s'
        )

        return dataset
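A usage sketch for predict, assuming parser is an instance of the class that defines it; the input and output file names are hypothetical:

    # predict on a CoNLL-style file and save the annotated sentences
    dataset = parser.predict("input.conllu", pred="output.conllu", prob=True)
    print(dataset.arcs[0], dataset.rels[0])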
Example #9
    def train(self, base_path, params):
        base_path = join(UNDERTHESEA_FOLDER, base_path)
        if exists(base_path):
            rmtree(base_path)
        makedirs(base_path)
        features = self.tagger.features
        print(features)
        transformer = TaggedTransformer(features)
        logger.info("Start feature extraction")
        train_samples = self.corpus.train
        test_samples = self.corpus.test
        X_train, y_train = transformer.transform(train_samples,
                                                 contain_labels=True)
        X_test, y_test = transformer.transform(test_samples,
                                               contain_labels=True)
        logger.info("Finish feature extraction")

        # Train
        logger.info("Start train")
        trainer = pycrfsuite.Trainer(verbose=True)
        for xseq, yseq in zip(X_train, y_train):
            trainer.append(xseq, yseq)
        trainer.set_params(params)
        filename = join(base_path, 'model.bin')
        trainer.train(filename)
        logger.info("Finish train")

        # Tagger
        logger.info("Start tagger")
        tagger = pycrfsuite.Tagger()
        tagger.open(filename)
        y_pred = [tagger.tag(x_seq) for x_seq in X_test]
        y_true = y_test
        print(classification_report(y_true, y_pred, digits=4))
        print(f'Accuracy: {accuracy_score(y_true, y_pred):.4f}')
        sentences = [[item[0] for item in sentence]
                     for sentence in self.corpus.test]
        sentences = zip(sentences, y_test, y_pred)
        texts = []
        for s in sentences:
            tokens, true_tags, pred_tags = s
            tokens_ = ["\t".join(item) for item in zip(tokens, true_tags, pred_tags)]
            text = "\n".join(tokens_)
            texts.append(text)
        text = "\n\n".join(texts)
        with open(join(base_path, "output.txt"), "w") as f_out:
            f_out.write(text)

        with open(join(base_path, "model.metadata"), "w") as f:
            f.write("model: CRFSequenceTagger")
        self.tagger.save(join(base_path, "features.bin"))

        logger.info("Finish tagger")