# Expected imports for this snippet (the original file header is not shown):
#   import numpy as np
#   import sentencepiece as spm
#   from sacremoses import MosesTokenizer
#   from fairseq import options
#   from embed import Embedder
# `cosine` is presumably scipy.spatial.distance.cosine or a project-local helper.
class SentencePairScorer(object):

    def __init__(self):
        parser = options.get_generation_parser(interactive=True)
        self.args = options.parse_args_and_arch(parser)
        self.embedder = Embedder(self.args)
        self.similarity = lambda s1, s2: np.nan_to_num(
            cosine(np.nan_to_num(s1), np.nan_to_num(s2)))

    def score_sentences(self, pairs):
        sp = spm.SentencePieceProcessor()
        sp.Load(self.args.sentencepiece)
        entok = MosesTokenizer(lang='en')

        def process_example(i):
            # Moses-tokenize, lowercase, then segment into SentencePiece pieces.
            tok = entok.tokenize(i, escape=False)
            p = " ".join(tok).lower()
            p = sp.EncodeAsPieces(p)
            p = " ".join(p)
            return p

        embeddings_1 = []
        embeddings_2 = []
        sentences_1 = []
        sentences_2 = []
        for i in pairs:
            p1, p2 = process_example(i[0]), process_example(i[1])
            sentences_1.append(p1)
            sentences_2.append(p2)
            # Embed in batches of 32 sentence pairs.
            if len(sentences_1) == 32:
                vecs = self.embedder.embed(sentences_1, self.args.eval_encoder)
                embeddings_1.append(vecs)
                vecs = self.embedder.embed(sentences_2, self.args.eval_encoder)
                embeddings_2.append(vecs)
                sentences_1 = []
                sentences_2 = []
        # Embed the final partial batch, if any.
        if len(sentences_1) > 0:
            vecs = self.embedder.embed(sentences_1, self.args.eval_encoder)
            embeddings_1.append(vecs)
            vecs = self.embedder.embed(sentences_2, self.args.eval_encoder)
            embeddings_2.append(vecs)
        embeddings_1 = np.vstack(embeddings_1)
        embeddings_2 = np.vstack(embeddings_2)

        scores = []
        for i in range(embeddings_1.shape[0]):
            s = self.similarity(embeddings_1[i], embeddings_2[i])
            scores.append(s)
        return scores
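A minimal usage sketch for SentencePairScorer, not part of the original snippet: because __init__ parses fairseq generation arguments from the command line, a driver script would be launched with the usual fairseq options (data directory, --path, --sentencepiece, --eval-encoder, and so on); the sentence pairs below are purely illustrative.

if __name__ == '__main__':
    # Hypothetical driver: the constructor reads all model options from sys.argv.
    scorer = SentencePairScorer()
    pairs = [
        ("A man is playing a guitar.", "Someone is playing an instrument."),
        ("A dog runs across the field.", "The stock market fell sharply today."),
    ]
    scores = scorer.score_sentences(pairs)
    for (s1, s2), score in zip(pairs, scores):
        print("{:.4f}\t{}\t{}".format(score, s1, s2))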
def __init__(self, vocab_size, d_model, N, heads, dropout):
    super(Decoder, self).__init__()
    self.N = N
    self.embed = Embedder(vocab_size, d_model)
    self.pe = PositionalEncoder(d_model, dropout=dropout)
    self.layers = get_clones(DecoderLayer(d_model, heads, dropout), N)
    self.norm = Norm(d_model)
def __init__(self, vocab_size, hidden_size, N, heads, dropout):
    super(Encoder, self).__init__()
    self.N = N
    self.embed = Embedder(vocab_size, hidden_size)
    self.pe = PositionalEncoder(hidden_size, dropout=dropout)
    self.layers = get_clones(EncoderLayer(hidden_size, heads, dropout), N)
    self.norm = Norm(hidden_size)
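Both the Decoder and Encoder constructors above rely on a get_clones helper that is not shown here. A minimal sketch, assuming it follows the common pattern of deep-copying one layer N times into an nn.ModuleList so each clone has independent parameters (the project's actual helper may differ):

import copy
import torch.nn as nn

def get_clones(module, N):
    # Deep-copy the layer N times; copies do not share weights.
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])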
    # `total`, `all`, and `cross` are presumably initialized earlier in evaluate(),
    # whose beginning is not shown here.
    foreign = []
    for i in results:
        if "STS" in i and "all" not in i and "SemEval17" not in i:
            total.append(results[i]["pearson"][0])
        if "STS" in i and "all" in i:
            all.append(results[i]["pearson"]["mean"])
        if i == "SemEval17.STS.input.track2.ar-en.txt" or i == "SemEval17.STS.input.track4a.es-en.txt" \
                or i == "SemEval17.STS.input.track6.tr-en.txt":
            cross.append(results[i]["pearson"][0])
        if i == "SemEval17.STS.input.track1.ar-ar.txt" or i == "SemEval17.STS.input.track3.es-es.txt":
            foreign.append(results[i]["pearson"][0])
    print("Average (cross): {0}".format(np.mean(cross)))
    print("Average (foreign): {0}".format(np.mean(foreign)))
    print("Average (datasets): {0}".format(np.mean(total)))
    print("Average (comps): {0}".format(np.mean(all)), flush=True)
    return np.mean(all)


if __name__ == '__main__':
    from embed import Embedder
    from fairseq import options

    parser = options.get_generation_parser(interactive=True)
    args = options.parse_args_and_arch(parser)
    embedder = Embedder(args)
    evaluate(embedder, args)
def main(args, init_distributed=False):
    utils.import_user_module(args)

    assert args.max_tokens is not None or args.max_sentences is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'

    # Initialize CUDA and distributed training
    if torch.cuda.is_available() and not args.cpu:
        torch.cuda.set_device(args.device_id)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_utils.distributed_init(args)

    if distributed_utils.is_master(args):
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)

    # Print args
    print(args)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load valid dataset (we load training data below, based on the latest checkpoint)
    for valid_sub_split in args.valid_subset.split(','):
        task.load_dataset(valid_sub_split, combine=False, epoch=0)

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print(model)
    print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
    print('| num. model params: {} (num. trained: {})'.format(
        sum(p.numel() for p in model.parameters()),
        sum(p.numel() for p in model.parameters() if p.requires_grad),
    ))

    # Build trainer
    trainer = Trainer(args, task, model, criterion)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    # Load the latest checkpoint if one is available and restore the
    # corresponding train iterator
    extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer)
    model.epoch_iter = epoch_itr

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_subsets = args.valid_subset.split(',')

    if args.load_pretrained_encoder:
        from collections import OrderedDict
        new_state_dict = OrderedDict()
        if not args.cpu:
            _model = torch.load(args.load_pretrained_encoder)
        else:
            _model = torch.load(args.load_pretrained_encoder, map_location='cpu')
        state_dict = _model['model']
        # Copy over only the encoder parameters from the pretrained checkpoint
        for i in state_dict:
            if "encoder_" in i:
                new_state_dict[i] = state_dict[i]
        model.load_state_dict(new_state_dict, strict=True)

    while (
        lr > args.min_lr
        and epoch_itr.epoch < max_epoch
        and trainer.get_num_updates() < max_update
    ):
        # train for one epoch
        train(args, trainer, task, epoch_itr)

        if not args.disable_validation and epoch_itr.epoch % args.validate_interval == 0:
            valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets)
        else:
            valid_losses = [None]

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        embedder = Embedder(args, model, task)
        sts = evaluate(embedder, args)

        # save checkpoint
        if epoch_itr.epoch % args.save_interval == 0:
            if args.save_sts:
                checkpoint_utils.save_checkpoint(args, trainer, epoch_itr, -sts)
            else:
                checkpoint_utils.save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

        reload_dataset = ':' in getattr(args, 'data', '')
        # sharded data: get train iterator for next epoch
        epoch_itr = trainer.get_train_iterator(epoch_itr.epoch, load_dataset=reload_dataset)
    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))
from art import text2art
from embed import Embedder
from es import ElasticSearcher
from elasticsearch import Elasticsearch

print(text2art('COVID-19 Browser'))

embedder = Embedder()
es_searcher = ElasticSearcher()

while True:
    query = input('Type your question:')
    query_emb = embedder([query])[0].tolist()
    res = es_searcher(query_emb)
    print(res)
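The ElasticSearcher wrapper imported from es above is not shown. A rough sketch of what such a dense-vector search helper could look like, using elasticsearch-py's script_score query with cosineSimilarity; the index name, vector field, and returned fields here are assumptions and would need to match the project's actual mapping:

from elasticsearch import Elasticsearch

class ElasticSearcher:
    def __init__(self, index_name='covid_papers', vector_field='embed',
                 host='http://localhost:9200', top_k=5):
        # Index and field names are placeholders; adjust to the real schema.
        self.client = Elasticsearch(host)
        self.index_name = index_name
        self.vector_field = vector_field
        self.top_k = top_k

    def __call__(self, query_emb):
        # Rank documents by cosine similarity between the query vector and a
        # dense_vector field, shifted by +1.0 so scores stay non-negative.
        body = {
            "size": self.top_k,
            "query": {
                "script_score": {
                    "query": {"match_all": {}},
                    "script": {
                        "source": "cosineSimilarity(params.query_vector, '" + self.vector_field + "') + 1.0",
                        "params": {"query_vector": query_emb},
                    },
                }
            },
        }
        res = self.client.search(index=self.index_name, body=body)
        return [(hit["_score"], hit["_source"].get("title")) for hit in res["hits"]["hits"]]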
# `parser` (an argparse.ArgumentParser) is set up earlier in the script, omitted here.
args = parser.parse_args()

metadata_path = Path(args.metadata)
pr = Project()
# lucene-index-cord19-2020-05-26-bm25
# tommaso-index-cord19-2020-05-26-bm25

# prepare the data
name = metadata_path.stem
print(f'Creating embedding from {name}')
ds = CovidPapersDataset.from_path(metadata_path)
dl = DataLoader(ds, batch_size=128, num_workers=4, collate_fn=lambda x: x)

with open(pr.base_dir / 'es_index.json', 'r') as f:
    index_file = json.load(f)

es_provider = ElasticSearchProvider(index_file, index_name=name)
# see all at http://localhost:9200/lucene-index-cord19-2020-05-26-bm25/_search?pretty=true&q=*:*

# create the adapter for the data
es_adapter = CovidPapersEmbeddedAdapter()
# drop the existing index
es_provider.drop()
# create a new one
es_provider.create_index()

embedder = Embedder(model_type='electra')

for batch in tqdm(dl):
    x = [b['title_abstract'] for b in batch]
    embs = embedder(x)
    es_provider(es_adapter(batch, embs))