Example #1
import numpy as np
import sentencepiece as spm
from scipy.spatial.distance import cosine
from sacremoses import MosesTokenizer  # tokenizer implementation assumed
from fairseq import options

from embed import Embedder


class SentencePairScorer(object):

    def __init__(self):
        parser = options.get_generation_parser(interactive=True)
        self.args = options.parse_args_and_arch(parser)

        self.embedder = Embedder(self.args)
        # Note: scipy's cosine() returns cosine *distance* (1 - similarity);
        # NaNs are zeroed out defensively on the inputs and the result.
        self.similarity = lambda s1, s2: np.nan_to_num(cosine(np.nan_to_num(s1), np.nan_to_num(s2)))

    def score_sentences(self, pairs):

        # Load the sentencepiece model and an English Moses tokenizer for preprocessing.
        sp = spm.SentencePieceProcessor()
        sp.Load(self.args.sentencepiece)

        entok = MosesTokenizer(lang='en')

        def process_example(i):
            # Moses-tokenize, lowercase, then segment into sentencepiece pieces.
            tok = entok.tokenize(i, escape=False)
            p = " ".join(tok).lower()
            p = sp.EncodeAsPieces(p)
            p = " ".join(p)
            return p

        embeddings_1 = []
        embeddings_2 = []
        sentences_1 = []
        sentences_2 = []
        for i in pairs:
            p1, p2 = process_example(i[0]), process_example(i[1])
            sentences_1.append(p1)
            sentences_2.append(p2)

            # Embed the sentences in batches of 32.
            if len(sentences_1) == 32:
                vecs = self.embedder.embed(sentences_1, self.args.eval_encoder)
                embeddings_1.append(vecs)

                vecs = self.embedder.embed(sentences_2, self.args.eval_encoder)
                embeddings_2.append(vecs)

                sentences_1 = []
                sentences_2 = []

        # Embed whatever is left in the final partial batch.
        if len(sentences_1) > 0:
            vecs = self.embedder.embed(sentences_1, self.args.eval_encoder)
            embeddings_1.append(vecs)

            vecs = self.embedder.embed(sentences_2, self.args.eval_encoder)
            embeddings_2.append(vecs)

        embeddings_1 = np.vstack(embeddings_1)
        embeddings_2 = np.vstack(embeddings_2)

        scores = []
        for i in range(embeddings_1.shape[0]):
            s = self.similarity(embeddings_1[i], embeddings_2[i])
            scores.append(s)

        return scores
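
# Minimal usage sketch, assuming the fairseq command-line arguments the scorer
# expects (e.g. the sentencepiece model path and evaluation encoder) are
# provided on the command line; the sentence pairs below are illustrative only.
if __name__ == '__main__':
    scorer = SentencePairScorer()
    pairs = [("A man is playing a guitar.", "Someone is playing an instrument."),
             ("A dog runs in the park.", "Stock prices fell sharply today.")]
    # Each score is a scipy cosine distance, so lower means more similar.
    print(scorer.score_sentences(pairs))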
Example #2
class Decoder(nn.Module):  # nn.Module base class assumed

    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super(Decoder, self).__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)                           # token embeddings
        self.pe = PositionalEncoder(d_model, dropout=dropout)                # positional encoding
        self.layers = get_clones(DecoderLayer(d_model, heads, dropout), N)   # N stacked decoder layers
        self.norm = Norm(d_model)                                            # final layer norm
Example #3
class Encoder(nn.Module):  # nn.Module base class assumed

    def __init__(self, vocab_size, hidden_size, N, heads, dropout):
        super(Encoder, self).__init__()
        self.N = N
        self.embed = Embedder(vocab_size, hidden_size)                            # token embeddings
        self.pe = PositionalEncoder(hidden_size, dropout=dropout)                 # positional encoding
        self.layers = get_clones(EncoderLayer(hidden_size, heads, dropout), N)    # N stacked encoder layers
        self.norm = Norm(hidden_size)                                             # final layer norm
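
    # Hypothetical forward pass for this Encoder (a sketch only; the mask
    # argument and the per-layer call signature are assumptions, not implied by
    # the constructor above): embed the tokens, add positional encodings, run
    # the N cloned layers, then apply the final norm.
    def forward(self, src, mask):
        x = self.embed(src)
        x = self.pe(x)
        for i in range(self.N):
            x = self.layers[i](x, mask)
        return self.norm(x)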
Example #4
    # Accumulators for Pearson correlations over the STS result groups:
    #   total   - individual English STS datasets
    #   all     - the per-year "all" aggregates
    #   cross   - cross-lingual SemEval-2017 tracks (ar-en, es-en, tr-en)
    #   foreign - non-English monolingual SemEval-2017 tracks (ar-ar, es-es)
    total = []
    all = []
    cross = []
    foreign = []
    for i in results:
        if "STS" in i and "all" not in i and "SemEval17" not in i:
            total.append(results[i]["pearson"][0])
        if "STS" in i and "all" in i:
            all.append(results[i]["pearson"]["mean"])
        if i == "SemEval17.STS.input.track2.ar-en.txt" or i == "SemEval17.STS.input.track4a.es-en.txt" \
                or i == "SemEval17.STS.input.track6.tr-en.txt":
            cross.append(results[i]["pearson"][0])
        if i == "SemEval17.STS.input.track1.ar-ar.txt" or i == "SemEval17.STS.input.track3.es-es.txt":
            foreign.append(results[i]["pearson"][0])

    print("Average (cross): {0}".format(np.mean(cross)))
    print("Average (foreign): {0}".format(np.mean(foreign)))
    print("Average (datasets): {0}".format(np.mean(total)))
    print("Average (comps): {0}".format(np.mean(all)), flush=True)
    return np.mean(all)


if __name__ == '__main__':

    from embed import Embedder
    from fairseq import options

    parser = options.get_generation_parser(interactive=True)
    args = options.parse_args_and_arch(parser)

    embedder = Embedder(args)

    evaluate(embedder, args)
Example #5
def main(args, init_distributed=False):
    utils.import_user_module(args)

    assert args.max_tokens is not None or args.max_sentences is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'

    # Initialize CUDA and distributed training
    if torch.cuda.is_available() and not args.cpu:
        torch.cuda.set_device(args.device_id)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_utils.distributed_init(args)

    if distributed_utils.is_master(args):
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)

    # Print args
    print(args)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load valid dataset (we load training data below, based on the latest checkpoint)
    for valid_sub_split in args.valid_subset.split(','):
        task.load_dataset(valid_sub_split, combine=False, epoch=0)

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print(model)
    print('| model {}, criterion {}'.format(args.arch,
                                            criterion.__class__.__name__))
    print('| num. model params: {} (num. trained: {})'.format(
        sum(p.numel() for p in model.parameters()),
        sum(p.numel() for p in model.parameters() if p.requires_grad),
    ))

    # Build trainer
    trainer = Trainer(args, task, model, criterion)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    # Load the latest checkpoint if one is available and restore the
    # corresponding train iterator
    extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer)

    model.epoch_iter = epoch_itr
    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_subsets = args.valid_subset.split(',')

    # Optionally initialize the model from a pretrained checkpoint, copying
    # only parameters whose names contain "encoder_".
    if args.load_pretrained_encoder:
        from collections import OrderedDict
        new_state_dict = OrderedDict()
        if not args.cpu:
            _model = torch.load(args.load_pretrained_encoder)
        else:
            _model = torch.load(args.load_pretrained_encoder,
                                map_location='cpu')
        state_dict = _model['model']
        for i in state_dict:
            if "encoder_" in i:
                new_state_dict[i] = state_dict[i]
        model.load_state_dict(new_state_dict, strict=True)

    while (lr > args.min_lr and epoch_itr.epoch < max_epoch
           and trainer.get_num_updates() < max_update):
        # train for one epoch
        train(args, trainer, task, epoch_itr)

        if not args.disable_validation and epoch_itr.epoch % args.validate_interval == 0:
            valid_losses = validate(args, trainer, task, epoch_itr,
                                    valid_subsets)
        else:
            valid_losses = [None]

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # Evaluate STS correlation with the current model after each epoch.
        embedder = Embedder(args, model, task)
        sts = evaluate(embedder, args)

        # save checkpoint (ranked by the negated STS score when args.save_sts is
        # set, so that a higher STS counts as better; otherwise by validation loss)
        if epoch_itr.epoch % args.save_interval == 0:
            if args.save_sts:
                checkpoint_utils.save_checkpoint(args, trainer, epoch_itr,
                                                 -sts)
            else:
                checkpoint_utils.save_checkpoint(args, trainer, epoch_itr,
                                                 valid_losses[0])

        reload_dataset = ':' in getattr(args, 'data', '')
        # sharded data: get train iterator for next epoch
        epoch_itr = trainer.get_train_iterator(epoch_itr.epoch,
                                               load_dataset=reload_dataset)
    train_meter.stop()
    print('| done training in {:.1f} seconds'.format(train_meter.sum))
Example #6
from art import text2art
from embed import Embedder
from es import ElasticSearcher
from elasticsearch import Elasticsearch

print(text2art('COVID-19 Browser'))
embedder = Embedder()
es_searcher = ElasticSearcher()

while True:
    query = input('Type your question:')
    query_emb = embedder([query])[0].tolist()
    res = es_searcher(query_emb)
    print(res)
Example #7
    def __init__(self):
        parser = options.get_generation_parser(interactive=True)
        self.args = options.parse_args_and_arch(parser)

        self.embedder = Embedder(self.args)
        self.similarity = lambda s1, s2: np.nan_to_num(cosine(np.nan_to_num(s1), np.nan_to_num(s2)))
Example #8
args = parser.parse_args()

metadata_path = Path(args.metadata)

pr = Project()
# lucene-index-cord19-2020-05-26-bm25
# tommaso-index-cord19-2020-05-26-bm25
# prepare the data
name = metadata_path.stem
print(f'Creating embedding from {name}')

ds = CovidPapersDataset.from_path(metadata_path)
dl = DataLoader(ds, batch_size=128, num_workers=4, collate_fn=lambda x: x)

with open(pr.base_dir / 'es_index.json', 'r') as f:
    index_file = json.load(f)
    es_provider = ElasticSearchProvider(index_file, index_name=name)
    # see all at http://localhost:9200/lucene-index-cord19-2020-05-26-bm25/_search?pretty=true&q=*:*
# create the adapter for the data
es_adapter = CovidPapersEmbeddedAdapter()
# drop any existing index
es_provider.drop()
# create a new one
es_provider.create_index()

embedder = Embedder(model_type='electra')

for batch in tqdm(dl):
    x = [b['title_abstract'] for b in batch]
    embs = embedder(x)
    es_provider(es_adapter(batch, embs))