Example #1
# Imports required by this FARM example (not included in the original excerpt)
import logging
from pathlib import Path

from farm.data_handler.processor import InferenceProcessor
from farm.infer import Inferencer
from farm.modeling.adaptive_model import AdaptiveModel
from farm.modeling.language_model import LanguageModel
from farm.modeling.tokenization import Tokenizer
from farm.utils import set_all_seeds, initialize_device_settings


def embedding_extraction():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    # load from a local path:
    # lang_model = Path("../saved_models/glove-german-uncased")
    # or through S3:
    lang_model = "glove-german-uncased"  # only glove, word2vec, or converted fastText (fixed vocab) embeddings are supported
    do_lower_case = True
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)

    # Create an InferenceProcessor
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)
    processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=128)

    # Create an AdaptiveModel
    language_model = LanguageModel.load(lang_model)
    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    # Create Inferencer for embedding extraction
    inferencer = Inferencer(model=model,
                            processor=processor,
                            task_type="embeddings")

    # Extract vectors
    basic_texts = [
        {
            "text":
            "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"
        },
        {
            "text": "Martin Müller spielt Handball in Berlin"
        },
    ]

    result = inferencer.extract_vectors(dicts=basic_texts,
                                        extraction_strategy="cls_token",
                                        extraction_layer=-1)
    print(result)
    inferencer.close_multiprocessing_pool()
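

# A minimal entry point for running the example above (a sketch; it assumes only the
# imports listed at the top of this example).
if __name__ == "__main__":
    embedding_extraction()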
Example #2
def embeddings_extraction():
    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    batch_size = 32
    use_gpu = True
    device, n_gpu = initialize_device_settings(use_cuda=use_gpu)
    lang_model = "bert-base-german-cased"

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    # 2. Create a lightweight Processor only for inference (no labels, minimal preprocessing)
    processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=128)

    # 3. Create an AdaptiveModel with a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)

    adaptive_model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[],
        embeds_dropout_prob=0,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    # 4. Extract embeddings with the model in inference mode
    basic_texts = [
        {
            "text":
            "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot ist"
        },
        {
            "text": "Martin Müller spielt Fussball"
        },
    ]

    model = Inferencer(adaptive_model, processor, gpu=use_gpu)
    result = model.extract_vectors(dicts=basic_texts)
    print(result)
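
    # Illustrative variation (a sketch, not part of the original example): because the
    # model above requests "per_token" as well as "per_sequence" outputs, token-level
    # vectors could be extracted too. Passing "per_token" as an extraction_strategy is
    # an assumption about FARM's Inferencer API, not something shown in this snippet.
    token_result = model.extract_vectors(dicts=basic_texts,
                                         extraction_strategy="per_token",
                                         extraction_layer=-1)
    print(token_result)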
Example #3
tokenizer = Tokenizer.from_pretrained(pretrained_model_name_or_path=lang_model,
                                      do_lower_case=False)

# 2. Create a lightweight Processor only for inference (no labels, minimal preprocessing)
processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=128)

# 3. Create an AdaptiveModel with a pretrained language model as a basis
language_model = LanguageModel.load(lang_model)

adaptive_model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[],
    embeds_dropout_prob=0,
    lm_output_types=["per_token", "per_sequence"],
    device=device,
)

# 4. Extract embeddings with the model in inference mode
basic_texts = [
    {
        "text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot ist"
    },
    {
        "text": "Martin Müller spielt Fussball"
    },
]

model = Inferencer(adaptive_model, processor, gpu=use_gpu)
result = model.extract_vectors(dicts=basic_texts)
print(result)
Example #4
lang_model = "bert-base-german-cased"

# 1. Create a tokenizer
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path=lang_model, do_lower_case=False
)

# 2. Create a lightweight Processor only for inference (no labels, minimal preprocessing)
processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=128)

# 3. Create an AdaptiveModel with a pretrained language model as a basis
language_model = LanguageModel.load(lang_model)

adaptive_model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[],
    embeds_dropout_prob=0,
    lm_output_types=["per_token", "per_sequence"],
    device=device,
)

# 4. Extract embeddings with the model in inference mode
basic_texts = [
    {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot ist"},
    {"text": "Martin Müller spielt Fussball"},
]

model = Inferencer(adaptive_model, processor, gpu=use_gpu)
result = model.extract_vectors(dicts=basic_texts, extraction_strategy="reduce_mean", extraction_layer=-1)
print(result)
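
# Illustrative follow-up (a sketch, not part of the original example): compare the two
# "reduce_mean" sentence vectors with cosine similarity. This assumes each item in
# `result` exposes its embedding under a "vec" key, which is how FARM's extract_vectors
# typically returns vectors; the key name is an assumption here, not shown above.
import numpy as np

vec_a = np.asarray(result[0]["vec"])
vec_b = np.asarray(result[1]["vec"])
cosine = float(np.dot(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b)))
print(f"Cosine similarity between the two texts: {cosine:.4f}")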