def lookup(keyword, cache=None):
    """Look up ``keyword`` on the VLSP VCL demo page and parse its senses.

    Results are memoised in ``cache`` (if provided); a missing word is cached as ``None``.
    """
    if cache:
        if cache.contains(keyword):
            return cache.get(keyword)
    logger.info('request ' + keyword)
    url = "https://vlsp.hpda.vn/demo/?page=vcl"
    payload = {"word": keyword}
    r = requests.post(url, data=payload)
    soup = BeautifulSoup(r.content, "html.parser")
    senses_data = soup.select("#vcl_content table .sense")
    if len(senses_data) == 0:
        if cache:
            cache.add(keyword, None)
        return None
    w = Word(keyword, senses=[])
    for sense_data in senses_data:
        tags = sense_data.select(".word_description li")
        # morpho_tag = tags[0]
        syntax_tag = tags[1]
        semantic_tag = tags[2]
        syntax_tag = syntax_tag.select("font")
        tag, sub_tag = syntax_tag[0].text, syntax_tag[1].text
        description = " / ".join(
            [item.text for item in semantic_tag.select("font")])
        sense = Sense(tag, sub_tag, description)
        w.add_sense(sense)
    if cache:
        cache.add(keyword, w)
    return w
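
# A minimal usage sketch for lookup(). It issues a live request to the VLSP
# demo page; the Word/Sense attribute names are assumed from the constructors
# above, and the cache argument may be any object exposing contains/get/add.
def _example_lookup():
    word = lookup("ăn")
    if word is None:
        return
    for sense in word.senses:
        print(sense.tag, sense.sub_tag, sense.description)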
def load_big_file(f: str) -> mmap.mmap:
    r"""
    Workaround for loading a big pickle file. Files over 2GB cause pickle
    errors on certain Mac and Windows distributions.

    Source: flairNLP
    """
    logger.info(f"loading file {f}")
    with open(f, "rb") as f_in:
        # mmap seems to be much more memory efficient
        bf = mmap.mmap(f_in.fileno(), 0, access=mmap.ACCESS_READ)
    return bf
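
# A minimal sketch of consuming the memory-mapped buffer (assumes torch is
# installed; mmap objects are file-like, so torch.load can read them directly):
def _example_load_big_file(path="model.pt"):
    import torch  # local import: only needed for this sketch
    buffer = load_big_file(path)
    return torch.load(buffer, map_location="cpu")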
def evaluate(self, data, buckets=8, batch_size=5000, **kwargs):
    args = self.args.update(locals())
    self.transform.train()

    logger.info("Loading the data")
    dataset = Dataset(self.transform, data)
    dataset.build(args.batch_size, args.buckets)
    logger.info(f"\n{dataset}")

    logger.info("Evaluating the dataset")
    start = datetime.now()
    loss, metric = self._evaluate(dataset.loader)
    elapsed = datetime.now() - start
    logger.info(f"loss: {loss:.4f} - {metric}")
    logger.info(
        f"{elapsed}s elapsed, {len(dataset) / elapsed.total_seconds():.2f} Sents/s"
    )
    return loss, metric
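
# Usage sketch (assumes a trained parser-like object exposing this method;
# the file name is illustrative, any CoNLL-U file on disk would do):
#
#     loss, metric = parser.evaluate("vi-ud-dev.conllu", batch_size=5000)
#     print(f"{loss:.4f}", metric)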
def predict(self, data, pred=None, buckets=8, batch_size=5000, prob=False, **kwargs):
    args = self.args.update(locals())
    if not args:
        # fall back to a simple namespace built from the local arguments
        args = locals()
        args.update(kwargs)
        args = type('Args', (object,), locals())
    self.transform.eval()
    if args.prob:
        self.transform.append(Field('probs'))

    logger.info("Loading the data")
    dataset = Dataset(self.transform, data)
    dataset.build(args.batch_size, args.buckets)
    logger.info(f"\n{dataset}")

    logger.info("Making predictions on the dataset")
    start = datetime.now()
    preds = self._predict(dataset.loader)
    elapsed = datetime.now() - start

    for name, value in preds.items():
        setattr(dataset, name, value)
    if pred is not None:
        logger.info(f"Saving predicted results to {pred}")
        self.transform.save(pred, dataset.sentences)
    logger.info(
        f"{elapsed}s elapsed, {len(dataset) / elapsed.total_seconds():.2f} Sents/s"
    )
    return dataset
def get_from_cache(url: str, cache_dir: Path = None) -> Path:
    """
    Given a URL, look for the corresponding dataset in the local cache.
    If it's not there, download it. Then return the path to the cached file.
    """
    cache_dir.mkdir(parents=True, exist_ok=True)
    filename = re.sub(r'.+/', '', url)
    # get cache path to put the file
    cache_path = cache_dir / filename
    if cache_path.exists():
        return cache_path

    # make HEAD request to check ETag
    response = requests.head(url)
    # (anhv: 27/12/2020) github release assets return 302
    if response.status_code not in [200, 302]:
        if "www.dropbox.com" in url:
            # Dropbox returns code 301, so we ignore this error
            pass
        else:
            raise IOError("HEAD request failed for url {}".format(url))

    # add ETag to filename if it exists
    # etag = response.headers.get("ETag")

    if not cache_path.exists():
        # Download to temporary file, then copy to cache dir once finished.
        # Otherwise you get corrupt cache entries if the download gets interrupted.
        fd, temp_filename = tempfile.mkstemp()
        logger.info("%s not found in cache, downloading to %s", url, temp_filename)

        # GET file object
        req = requests.get(url, stream=True)
        content_length = req.headers.get('Content-Length')
        total = int(content_length) if content_length is not None else None
        progress = Tqdm.tqdm(unit="B", total=total)
        with open(temp_filename, 'wb') as temp_file:
            for chunk in req.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    progress.update(len(chunk))
                    temp_file.write(chunk)
        progress.close()

        logger.info("copying %s to cache at %s", temp_filename, cache_path)
        shutil.copyfile(temp_filename, str(cache_path))
        logger.info("removing temp file %s", temp_filename)
        os.close(fd)
        os.remove(temp_filename)

    return cache_path
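
# Usage sketch (the URL and cache directory below are illustrative only; any
# direct-download link works, and the cache directory is created if missing):
def _example_get_from_cache():
    cache_dir = Path.home() / ".underthesea" / "datasets"
    return get_from_cache("https://example.com/datasets/sample.zip", cache_dir)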
def make_ud_file(file):
    logger.info(msg=file)
    sentences = []
    i = 0
    with open(join(CLEANED_FOLDER, file)) as f:
        for line in f:
            s = UDSentence.load_from_raw_text(line)
            sentences.append(s)
            i += 1
            if i % 200 == 0:
                logger.info(f"{file}:{i}")
    ud_dataset = UDDataset(sentences)
    ud_dataset.write(join(UD_FOLDER, file))
    logger.info(f"Finish {file}")
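
# A small driver sketch that converts every cleaned file into UD format
# (assumes the module-level CLEANED_FOLDER/UD_FOLDER constants used above):
def _example_make_all_ud_files():
    import os  # local import to keep this sketch self-contained
    for file in sorted(os.listdir(CLEANED_FOLDER)):
        make_ud_file(file)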
def train(self,
          base_path: Union[Path, str],
          fix_len=20,
          min_freq=2,
          buckets=32,
          batch_size=5000,
          lr=2e-3,
          mu=.9,
          nu=.9,
          epsilon=1e-12,
          clip=5.0,
          decay=.75,
          decay_steps=5000,
          patience=100,
          max_epochs=10):
    r"""
    Train any class that implements the model interface

    Args:
        base_path (Union[Path, str]): Main path to which all output during training is logged and models are saved
        fix_len: Maximum number of subword pieces kept per token.
        min_freq: Minimum frequency for a word to be kept in the vocabulary.
        buckets: Number of length buckets that sentences are assigned to.
        batch_size: Number of tokens in each batch.
        lr: Learning rate for the Adam optimizer.
        mu: First Adam beta coefficient.
        nu: Second Adam beta coefficient.
        epsilon: Adam eps term.
        clip: Gradient clipping threshold.
        decay: Exponential learning-rate decay factor.
        decay_steps: Number of steps over which the decay factor is applied.
        patience: Number of epochs without dev improvement before training stops.
        max_epochs: Maximum number of epochs to train. Terminates training if this number is surpassed.
    """
    ################################################################################################################
    # BUILD
    ################################################################################################################
    args = Config()
    args.feat = self.parser.feat
    args.embed = self.parser.embed
    os.makedirs(os.path.dirname(base_path), exist_ok=True)

    logger.info("Building the fields")
    WORD = Field('words', pad=pad, unk=unk, bos=bos, lower=True)
    if args.feat == 'char':
        FEAT = SubwordField('chars', pad=pad, unk=unk, bos=bos, fix_len=fix_len)
    elif args.feat == 'bert':
        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(args.bert)
        args.max_len = min(args.max_len or tokenizer.max_len, tokenizer.max_len)
        FEAT = SubwordField('bert',
                            pad=tokenizer.pad_token,
                            unk=tokenizer.unk_token,
                            bos=tokenizer.bos_token or tokenizer.cls_token,
                            fix_len=fix_len,
                            tokenize=tokenizer.tokenize)
        FEAT.vocab = tokenizer.get_vocab()
    else:
        FEAT = Field('tags', bos=bos)
    ARC = Field('arcs', bos=bos, use_vocab=False, fn=CoNLL.get_arcs)
    REL = Field('rels', bos=bos)
    if args.feat in ('char', 'bert'):
        transform = CoNLL(FORM=(WORD, FEAT), HEAD=ARC, DEPREL=REL)
    else:
        transform = CoNLL(FORM=WORD, CPOS=FEAT, HEAD=ARC, DEPREL=REL)

    train = Dataset(transform, self.corpus.train)
    WORD.build(
        train, min_freq,
        (Embedding.load(args.embed, unk) if self.parser.embed else None))
    FEAT.build(train)
    REL.build(train)
    args.update({
        'n_words': WORD.vocab.n_init,
        'n_feats': len(FEAT.vocab),
        'n_rels': len(REL.vocab),
        'pad_index': WORD.pad_index,
        'unk_index': WORD.unk_index,
        'bos_index': WORD.bos_index,
        'feat_pad_index': FEAT.pad_index,
    })
    parser = DependencyParser(
        n_words=args.n_words,
        n_feats=args.n_feats,
        n_rels=args.n_rels,
        pad_index=args.pad_index,
        unk_index=args.unk_index,
        # bos_index=args.bos_index,
        feat_pad_index=args.feat_pad_index,
        transform=transform)
    word_field_embeddings = self.parser.embeddings[0]
    word_field_embeddings.n_vocab = 1000
    parser.embeddings = self.parser.embeddings
    parser.embeddings[0] = word_field_embeddings
    parser.load_pretrained(WORD.embed).to(device)

    ################################################################################################################
    # TRAIN
    ################################################################################################################
    args = Config()
    parser.transform.train()
    if dist.is_initialized():
        batch_size = batch_size // dist.get_world_size()
    logger.info('Loading the data')
    train = Dataset(parser.transform, self.corpus.train, **args)
    dev = Dataset(parser.transform, self.corpus.dev)
    test = Dataset(parser.transform, self.corpus.test)
    train.build(batch_size, buckets, True, dist.is_initialized())
    dev.build(batch_size, buckets)
    test.build(batch_size, buckets)
    logger.info(
        f"\n{'train:':6} {train}\n{'dev:':6} {dev}\n{'test:':6} {test}\n")

    logger.info(f'{parser}')
    if dist.is_initialized():
        parser = DDP(parser,
                     device_ids=[dist.get_rank()],
                     find_unused_parameters=True)
    optimizer = Adam(parser.parameters(), lr, (mu, nu), epsilon)
    scheduler = ExponentialLR(optimizer, decay**(1 / decay_steps))

    elapsed = timedelta()
    best_e, best_metric = 1, Metric()
    for epoch in range(1, max_epochs + 1):
        start = datetime.now()
        logger.info(f'Epoch {epoch} / {max_epochs}:')

        parser.train()
        loader = train.loader
        bar, metric = progress_bar(loader), AttachmentMetric()
        for words, feats, arcs, rels in bar:
            optimizer.zero_grad()
            mask = words.ne(parser.WORD.pad_index)
            # ignore the first token of each sentence
            mask[:, 0] = 0
            s_arc, s_rel = parser.forward(words, feats)
            loss = parser.forward_loss(s_arc, s_rel, arcs, rels, mask)
            loss.backward()
            nn.utils.clip_grad_norm_(parser.parameters(), clip)
            optimizer.step()
            scheduler.step()
            arc_preds, rel_preds = parser.decode(s_arc, s_rel, mask)
            # ignore all punctuation if not specified
            if not self.parser.args['punct']:
                mask &= words.unsqueeze(-1).ne(parser.puncts).all(-1)
            metric(arc_preds, rel_preds, arcs, rels, mask)
            bar.set_postfix_str(
                f'lr: {scheduler.get_last_lr()[0]:.4e} - loss: {loss:.4f} - {metric}'
            )

        loss, dev_metric = parser.evaluate(dev.loader)
        logger.info(f"{'dev:':6} - loss: {loss:.4f} - {dev_metric}")
        loss, test_metric = parser.evaluate(test.loader)
        logger.info(f"{'test:':6} - loss: {loss:.4f} - {test_metric}")

        t = datetime.now() - start
        # save the model if it is the best so far
        if dev_metric > best_metric:
            best_e, best_metric = epoch, dev_metric
            if is_master():
                parser.save(base_path)
            logger.info(f'{t}s elapsed (saved)\n')
        else:
            logger.info(f'{t}s elapsed\n')
        elapsed += t
        if epoch - best_e >= patience:
            break

    loss, metric = parser.load(base_path).evaluate(test.loader)
    logger.info(f'Epoch {best_e} saved')
    logger.info(f"{'dev:':6} - {best_metric}")
    logger.info(f"{'test:':6} - {metric}")
    logger.info(f'{elapsed}s elapsed, {elapsed / epoch}s/epoch')
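
# Hypothetical invocation sketch (assumes a trainer object constructed with a
# `parser` and a `corpus`, as the method body above expects; the path and
# hyperparameter values are illustrative):
#
#     trainer.train(base_path="tmp/vi-dp-parser",
#                   max_epochs=10, batch_size=5000, lr=2e-3)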
def predict(
    self,
    data,
    buckets=8,
    batch_size=5000,
    pred=None,
    prob=False,
    tree=True,
    proj=False,
):
    r"""
    Args:
        data (list[list] or str):
            The data for prediction, both a list of instances and filename are allowed.
        pred (str):
            If specified, the predicted results will be saved to the file. Default: ``None``.
        buckets (int):
            The number of buckets that sentences are assigned to. Default: 8.
        batch_size (int):
            The number of tokens in each batch. Default: 5000.
        prob (bool):
            If ``True``, outputs the probabilities. Default: ``False``.
        tree (bool):
            If ``True``, ensures the output trees are well-formed. Default: ``True``.
        proj (bool):
            If ``True``, ensures the output trees are projective. Default: ``False``.

    Returns:
        A :class:`~underthesea.utils.Dataset` object that stores the predicted results.
    """
    self.transform.eval()
    if prob:
        self.transform.append(Field('probs'))

    logger.info('Loading the data')
    dataset = Dataset(self.transform, data)
    dataset.build(batch_size, buckets)
    logger.info(f'\n{dataset}')

    logger.info('Making predictions on the dataset')
    start = datetime.now()
    loader = dataset.loader
    self.eval()

    arcs, rels, probs = [], [], []
    for words, feats in progress_bar(loader):
        mask = words.ne(self.WORD.pad_index)
        # ignore the first token of each sentence
        mask[:, 0] = 0
        lens = mask.sum(1).tolist()
        s_arc, s_rel = self.forward(words, feats)
        arc_preds, rel_preds = self.decode(s_arc, s_rel, mask, tree, proj)
        arcs.extend(arc_preds[mask].split(lens))
        rels.extend(rel_preds[mask].split(lens))
        if prob:
            arc_probs = s_arc.softmax(-1)
            probs.extend([
                prob[1:i + 1, :i + 1].cpu()
                for i, prob in zip(lens, arc_probs.unbind())
            ])
    arcs = [seq.tolist() for seq in arcs]
    rels = [self.REL.vocab[seq.tolist()] for seq in rels]
    preds = {'arcs': arcs, 'rels': rels}
    if prob:
        preds['probs'] = probs
    elapsed = datetime.now() - start

    for name, value in preds.items():
        setattr(dataset, name, value)
    if pred is not None:
        logger.info(f'Saving predicted results to {pred}')
        self.transform.save(pred, dataset.sentences)
    logger.info(
        f'{elapsed}s elapsed, {len(dataset) / elapsed.total_seconds():.2f} Sents/s'
    )
    return dataset
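
# Usage sketch (the sentence below is an illustrative token list; per the
# docstring above, a path to a CoNLL-U file would also be accepted). The
# predicted heads and labels are attached to the returned dataset:
#
#     dataset = parser.predict([["Tôi", "yêu", "Việt", "Nam"]], tree=True)
#     print(dataset.arcs[0], dataset.rels[0])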
def train(self, base_path, params):
    base_path = join(UNDERTHESEA_FOLDER, base_path)
    if exists(base_path):
        rmtree(base_path)
    makedirs(base_path)
    features = self.tagger.features
    print(features)
    transformer = TaggedTransformer(features)

    logger.info("Start feature extraction")
    train_samples = self.corpus.train
    test_samples = self.corpus.test
    X_train, y_train = transformer.transform(train_samples, contain_labels=True)
    X_test, y_test = transformer.transform(test_samples, contain_labels=True)
    logger.info("Finish feature extraction")

    # Train
    logger.info("Start train")
    trainer = pycrfsuite.Trainer(verbose=True)
    for xseq, yseq in zip(X_train, y_train):
        trainer.append(xseq, yseq)
    trainer.set_params(params)
    filename = join(base_path, 'model.bin')
    trainer.train(filename)
    logger.info("Finish train")

    # Tagger
    logger.info("Start tagger")
    tagger = pycrfsuite.Tagger()
    tagger.open(filename)
    y_pred = [tagger.tag(x_seq) for x_seq in X_test]
    y_true = y_test
    print(classification_report(y_true, y_pred, digits=4))
    print(f'Accuracy: {accuracy_score(y_true, y_pred):.4f}')

    sentences = [[item[0] for item in sentence] for sentence in self.corpus.test]
    sentences = zip(sentences, y_test, y_pred)
    texts = []
    for s in sentences:
        tokens, y_true, y_pred = s
        tokens_ = ["\t".join(item) for item in zip(tokens, y_true, y_pred)]
        text = "\n".join(tokens_)
        texts.append(text)
    text = "\n\n".join(texts)
    with open(join(base_path, "output.txt"), "w") as f:
        f.write(text)
    with open(join(base_path, "model.metadata"), "w") as f:
        f.write("model: CRFSequenceTagger")
    self.tagger.save(join(base_path, "features.bin"))
    logger.info("Finish tagger")
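
# Usage sketch with typical CRFsuite hyperparameters (the trainer object and
# the exact parameter values here are illustrative; the keys are standard
# crfsuite options accepted by pycrfsuite.Trainer.set_params):
#
#     params = {
#         'c1': 1.0,                            # L1 regularization
#         'c2': 1e-3,                           # L2 regularization
#         'max_iterations': 100,
#         'feature.possible_transitions': True,
#     }
#     trainer.train("models/vi-pos-crf", params)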