Code example #1
File: processor.py  Project: leiframming/FARM
def _create_dataset(self):
    # flatten the features of all samples across all baskets into one list
    baskets = self.baskets
    features_flat = []
    for basket in baskets:
        for sample in basket.samples:
            features_flat.extend(sample.features)
    dataset, tensor_names = convert_features_to_dataset(features=features_flat)
    return dataset, tensor_names
Code example #2
def _create_dataset(self, keep_baskets=False):
    features_flat = []
    for basket in self.baskets:
        for sample in basket.samples:
            features_flat.extend(sample.features)
    if not keep_baskets:
        # free up some RAM, we don't need baskets from here on
        self.baskets = None
    dataset, tensor_names = convert_features_to_dataset(features=features_flat)
    return dataset, tensor_names
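
For context, a minimal usage sketch for the method above. Everything in it is an assumption layered on top of the snippet: it presumes the processor's baskets were already filled during preprocessing, that NamedDataLoader lives in farm.data_handler.dataloader as in FARM's layout, and that the batch size and tensor-name lookup are illustrative only.

from farm.data_handler.dataloader import NamedDataLoader

# assuming `processor.baskets` was populated earlier in the preprocessing step
dataset, tensor_names = processor._create_dataset(keep_baskets=False)

# NamedDataLoader keeps the tensor names attached, so each batch is a dict of name -> tensor
data_loader = NamedDataLoader(dataset=dataset, batch_size=32, tensor_names=tensor_names)
for batch in data_loader:
    input_ids = batch["input_ids"]  # hypothetical tensor name; real names depend on the processor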
Code example #3
import logging

import torch

# The imports below follow FARM's module layout; the exact paths are assumed and may differ between versions.
from farm.data_handler.dataset import convert_features_to_dataset
from farm.data_handler.processor import TextSimilarityProcessor
from farm.modeling.biadaptive_model import BiAdaptiveModel
from farm.modeling.language_model import LanguageModel, DPRContextEncoder, DPRQuestionEncoder
from farm.modeling.prediction_head import TextSimilarityHead
from farm.modeling.tokenization import Tokenizer
from farm.utils import set_all_seeds, initialize_device_settings


def test_dpr_modules(caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)

    # 1. Create question and passage tokenizers
    query_tokenizer = Tokenizer.load(pretrained_model_name_or_path="facebook/dpr-question_encoder-single-nq-base",
                                     do_lower_case=True, use_fast=True)
    passage_tokenizer = Tokenizer.load(pretrained_model_name_or_path="facebook/dpr-ctx_encoder-single-nq-base",
                                       do_lower_case=True, use_fast=True)

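    # Set up the processor that turns query/passage dicts into tensors (one positive and one hard-negative passage per query)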
    processor = TextSimilarityProcessor(
        tokenizer=query_tokenizer,
        passage_tokenizer=passage_tokenizer,
        max_seq_len_query=256,
        max_seq_len_passage=256,
        label_list=["hard_negative", "positive"],
        metric="text_similarity_metric",
        data_dir="data/retriever",
        train_filename="nq-train.json",
        dev_filename="nq-dev.json",
        test_filename="nq-dev.json",
        embed_title=True,
        num_hard_negatives=1
    )

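    # Load the two DPR encoders on top of bert-base-uncased weights (one for questions, one for passages), with dropout disabled for deterministic outputs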
    question_language_model = LanguageModel.load(pretrained_model_name_or_path="bert-base-uncased",
                                                 language_model_class="DPRQuestionEncoder",
                                                 hidden_dropout_prob=0, attention_probs_dropout_prob=0)
    passage_language_model = LanguageModel.load(pretrained_model_name_or_path="bert-base-uncased",
                                                language_model_class="DPRContextEncoder",
                                                hidden_dropout_prob=0, attention_probs_dropout_prob=0)

    prediction_head = TextSimilarityHead(similarity_function="dot_product")

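    # Combine both encoders in a bi-encoder model whose single head scores query/passage similarity via dot product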
    model = BiAdaptiveModel(
        language_model1=question_language_model,
        language_model2=passage_language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.0,
        lm1_output_types=["per_sequence"],
        lm2_output_types=["per_sequence"],
        device=device,
    )

    model.connect_heads_with_processor(processor.tasks)

    assert type(model) == BiAdaptiveModel
    assert type(processor) == TextSimilarityProcessor
    assert type(question_language_model) == DPRQuestionEncoder
    assert type(passage_language_model) == DPRContextEncoder

    # check embedding layer weights
    assert abs(list(model.named_parameters())[0][1][0, 0].item() - (-0.010200000368058681)) < 0.0001

    d = {'query': 'big little lies season 2 how many episodes',
         'passages': [
                         {'title': 'Big Little Lies (TV series)',
                          'text': 'series garnered several accolades. It received 16 Emmy Award nominations and won eight, including Outstanding Limited Series and acting awards for Kidman, Skarsgård, and Dern. The trio also won Golden Globe Awards in addition to a Golden Globe Award for Best Miniseries or Television Film win for the series. Kidman and Skarsgård also received Screen Actors Guild Awards for their performances. Despite originally being billed as a miniseries, HBO renewed the series for a second season. Production on the second season began in March 2018 and is set to premiere in 2019. All seven episodes are being written by Kelley',
                          'label': 'positive',
                          'external_id': '18768923'},
                         {'title': 'Little People, Big World',
                          'text': 'final minutes of the season two-A finale, "Farm Overload". A crowd had gathered around Jacob, who was lying on the ground near the trebuchet. The first two episodes of season two-B focus on the accident, and how the local media reacted to it. The first season of "Little People, Big World" generated solid ratings for TLC (especially in the important 18–49 demographic), leading to the show\'s renewal for a second season. Critical reviews of the series have been generally positive, citing the show\'s positive portrayal of little people. Conversely, other reviews have claimed that the show has a voyeuristic bend',
                          'label': 'hard_negative',
                          'external_id': '7459116'},
                         {'title': 'Cormac McCarthy',
                          'text': 'chores of the house, Lee was asked by Cormac to also get a day job so he could focus on his novel writing. Dismayed with the situation, she moved to Wyoming, where she filed for divorce and landed her first job teaching. Cormac McCarthy is fluent in Spanish and lived in Ibiza, Spain, in the 1960s and later settled in El Paso, Texas, where he lived for nearly 20 years. In an interview with Richard B. Woodward from "The New York Times", "McCarthy doesn\'t drink anymore – he quit 16 years ago in El Paso, with one of his young',
                          'label': 'negative',
                          'passage_id': '2145653'}
                     ]
         }

    sample = processor._dict_to_samples(d)
    feats = processor._sample_to_features(sample[0])
    dataset, tensor_names = convert_features_to_dataset(feats)
    features = {key: val.unsqueeze(0).to(device) for key, val in zip(tensor_names, dataset[0])}

    # test features
    assert torch.all(torch.eq(features["query_input_ids"][0][:10].cpu(),
                              torch.tensor([101, 2502, 2210, 3658, 2161, 1016, 2129, 2116, 4178, 102])))
    assert torch.all(torch.eq(features["passage_input_ids"][0][0][:10].cpu(),
                              torch.tensor([101,  2502,  2210,  3658,  1006,  2694,  2186,  1007,   102,  2186])))
    assert len(features["query_segment_ids"][0].nonzero()) == 0
    assert len(features["passage_segment_ids"][0].nonzero()) == 0
    assert torch.all(torch.eq(features["query_attention_mask"].nonzero()[:, 1].cpu(), torch.tensor(list(range(10)))))
    assert torch.all(torch.eq(features["passage_attention_mask"][0][0].nonzero().cpu().squeeze(), torch.tensor(list(range(127)))))
    assert torch.all(torch.eq(features["passage_attention_mask"][0][1].nonzero().cpu().squeeze(), torch.tensor(list(range(143)))))

    # test model encodings
    query_vector = model.language_model1(**features)[0]
    passage_vector = model.language_model2(**features)[0]
    assert torch.all(torch.le(torch.abs(query_vector[0, :10].cpu() - torch.tensor([-0.2135, -0.4748, 0.0501, -0.0430, -0.1747, -0.0441, 0.5638, 0.1405,
                                                                                    0.2285, 0.0893])), torch.ones((1, 10)) * 0.0001))
    assert torch.all(torch.le(torch.abs(passage_vector[0, :10].cpu() - torch.tensor([0.0557, -0.6836, -0.3645, -0.5566, 0.2034, -0.3656, 0.2969, -0.0555,
                                                                                     0.3405, -0.8691])), torch.ones((1, 10)) * 0.0001))
    assert torch.all(torch.le(torch.abs(passage_vector[1, :10].cpu() - torch.tensor([-0.2006, -1.5002, -0.1897, -0.3421, -0.0405, -0.0471, -0.0306, 0.1156,
                                                                                     0.3350, -0.3412])), torch.ones((1, 10)) * 0.0001))

    # test logits and loss
    embeddings = model(**features)
    query_emb, passage_emb = embeddings[0]
    assert torch.all(torch.eq(query_emb.cpu(), query_vector.cpu()))
    assert torch.all(torch.eq(passage_emb.cpu(), passage_vector.cpu()))

    loss = model.logits_to_loss_per_head(embeddings, **features)
    similarity_scores = model.prediction_heads[0]._embeddings_to_scores(query_emb, passage_emb).cpu()
    assert torch.all(torch.le(torch.abs(similarity_scores - torch.tensor([[-1.8311e-03, -6.3016e+00]])), torch.ones((1, 2)) * 0.0001))
    assert abs(loss[0].item() - 0.0018) <= 0.0001
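
A minimal sketch for running this check outside of pytest, following the pattern FARM's own test files use; the guard below is an addition rather than part of the original snippet.

if __name__ == "__main__":
    test_dpr_modules()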