Example #1
 def embed_sentence(sentence: str,
                    pooling_operation,
                    layers: str = '1',
                    use_scalar_mix: bool = False) -> Sentence:
     embeddings = XLMEmbeddings(pretrained_model_name_or_path=xlm_model,
                                layers=layers,
                                pooling_operation=pooling_operation,
                                use_scalar_mix=use_scalar_mix)
     flair_sentence = Sentence(sentence)
     embeddings.embed(flair_sentence)
     return flair_sentence
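A minimal usage sketch for this helper, assuming `xlm_model` points at an XLM checkpoint such as "xlm-mlm-en-2048" and that an older Flair release providing `XLMEmbeddings` is installed (newer releases replace it with `TransformerWordEmbeddings`):

# Hypothetical usage; xlm_model and the imports are assumptions, not part of the example.
from flair.data import Sentence
from flair.embeddings import XLMEmbeddings

xlm_model = "xlm-mlm-en-2048"  # assumed checkpoint name

embedded = embed_sentence("I love Berlin.", pooling_operation="first")
for token in embedded:
    print(token.text, token.embedding.shape)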
Example #2
    def embed_sentence(
        sentence: str,
        pooling_operation,
        layers: str = "1",
        use_scalar_mix: bool = False,
    ) -> Sentence:
        embeddings = XLMEmbeddings(
            model=xlm_model,
            layers=layers,
            pooling_operation=pooling_operation,
            use_scalar_mix=use_scalar_mix,
        )
        flair_sentence = Sentence(sentence)
        embeddings.embed(flair_sentence)

        return flair_sentence
Example #3
    def __init__(self, *embeddings: str):
        print("May need a couple moments to instantiate...")
        self.embedding_stack = []

        # Load correct Embeddings module
        for model_name_or_path in embeddings:
            if "bert" in model_name_or_path and "roberta" not in model_name_or_path:
                self.embedding_stack.append(BertEmbeddings(model_name_or_path))
            elif "roberta" in model_name_or_path:
                self.embedding_stack.append(
                    RoBERTaEmbeddings(model_name_or_path))
            elif "gpt2" in model_name_or_path:
                self.embedding_stack.append(
                    OpenAIGPT2Embeddings(model_name_or_path))
            elif "xlnet" in model_name_or_path:
                self.embedding_stack.append(
                    XLNetEmbeddings(model_name_or_path))
            elif "xlm" in model_name_or_path:
                self.embedding_stack.append(XLMEmbeddings(model_name_or_path))
            elif ("flair" in model_name_or_path
                  or model_name_or_path in FLAIR_PRETRAINED_MODEL_NAMES):
                self.embedding_stack.append(
                    FlairEmbeddings(model_name_or_path))
            else:
                try:
                    self.embedding_stack.append(
                        WordEmbeddings(model_name_or_path))
                except ValueError:
                    raise ValueError(
                        f"Embeddings not found for the model key: {model_name_or_path}, check documentation or custom model path to verify specified model"
                    )

        assert len(self.embedding_stack) != 0
        self.stacked_embeddings = StackedEmbeddings(
            embeddings=self.embedding_stack)
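The class around this `__init__` is not shown; a hedged usage sketch, using `EasyStackedEmbeddings` purely as a placeholder name for whatever class owns it, with example model keys:

# Hypothetical usage; the class name is a stand-in and the model keys are examples only.
from flair.data import Sentence

stacker = EasyStackedEmbeddings("bert-base-cased", "news-forward")

sentence = Sentence("Stacked embeddings concatenate several models.")
stacker.stacked_embeddings.embed(sentence)
print(sentence[0].embedding.shape)  # size = sum of the individual embedding sizes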
Example #4
    def __init__(self, *embeddings: str):
        print("May need a couple moments to instantiate...")
        self.embedding_stack = []

        # Load correct Embeddings module
        for model_name_or_path in embeddings:
            if "bert" in model_name_or_path and "roberta" not in model_name_or_path:
                self.embedding_stack.append(BertEmbeddings(model_name_or_path))
            elif "roberta" in model_name_or_path:
                self.embedding_stack.append(
                    RoBERTaEmbeddings(model_name_or_path))
            elif "gpt2" in model_name_or_path:
                self.embedding_stack.append(
                    OpenAIGPT2Embeddings(model_name_or_path))
            elif "xlnet" in model_name_or_path:
                self.embedding_stack.append(
                    XLNetEmbeddings(model_name_or_path))
            elif "xlm" in model_name_or_path:
                self.embedding_stack.append(XLMEmbeddings(model_name_or_path))
            elif ("flair" in model_name_or_path
                  or model_name_or_path in FLAIR_PRETRAINED_MODEL_NAMES):
                self.embedding_stack.append(
                    FlairEmbeddings(model_name_or_path))
            else:
                print(
                    f"Corresponding flair embedding module not found for {model_name_or_path}"
                )

        assert len(self.embedding_stack) != 0
        self.stacked_embeddings = StackedEmbeddings(
            embeddings=self.embedding_stack)
Example #5
    def _get_stacked_embeddings(self) -> StackedEmbeddings:
        layers = ",".join(str(layer) for layer in self.experiment.layers)
        pooling_operation = self.experiment.pooling_operation

        token_embeddings = []

        for embedding in self.experiment.embeddings:
            if embedding.startswith("roberta"):
                token_embeddings.append(
                    RoBERTaEmbeddings(
                        pretrained_model_name_or_path=embedding,
                        pooling_operation=pooling_operation,
                        layers=layers,
                        use_scalar_mix=self.experiment.use_scalar_mix,
                    ))
            elif (embedding.startswith("bert")
                  or embedding.startswith("distilbert")
                  or embedding.startswith("spanbert")):
                token_embeddings.append(
                    BertEmbeddings(
                        bert_model_or_path=embedding,
                        pooling_operation=pooling_operation,
                        layers=layers,
                        use_scalar_mix=self.experiment.use_scalar_mix,
                    ))
            elif embedding.startswith("elmo"):
                model_name = embedding.split("-")[-1]
                token_embeddings.append(ELMoEmbeddings(model=model_name))
            elif embedding.startswith("gpt2"):
                token_embeddings.append(
                    OpenAIGPT2Embeddings(
                        pretrained_model_name_or_path=embedding,
                        pooling_operation=pooling_operation,
                        layers=layers,
                        use_scalar_mix=self.experiment.use_scalar_mix,
                    ))
            elif embedding.startswith("xlm"):
                token_embeddings.append(
                    XLMEmbeddings(
                        pretrained_model_name_or_path=embedding,
                        pooling_operation=pooling_operation,
                        layers=layers,
                        use_scalar_mix=self.experiment.use_scalar_mix,
                    ))
            elif embedding.startswith("xlnet"):
                token_embeddings.append(
                    XLNetEmbeddings(
                        pretrained_model_name_or_path=embedding,
                        pooling_operation=pooling_operation,
                        layers=layers,
                        use_scalar_mix=self.experiment.use_scalar_mix,
                    ))

        return StackedEmbeddings(embeddings=token_embeddings)
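This method pulls all of its settings from `self.experiment`, which is not defined in the example; a minimal sketch of a container with the attributes the method reads (the `Experiment` dataclass below is an assumption, not part of the original project):

from dataclasses import dataclass, field
from typing import List

@dataclass
class Experiment:  # hypothetical stand-in for the object stored in self.experiment
    embeddings: List[str] = field(default_factory=lambda: ["bert-base-cased"])
    layers: List[int] = field(default_factory=lambda: [-1])
    pooling_operation: str = "first"
    use_scalar_mix: bool = False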
Example #6
    def embed_text(
        self,
        text: Union[List[Sentence], Sentence, List[str], str],
        model_name_or_path: str = "bert-base-cased",
    ) -> List[Sentence]:
        """ Produces embeddings for text

        * **text** - Text input, it can be a string or any of Flair's `Sentence` input formats
        * **model_name_or_path** - The hosted model name key or model path
        **return** - A list of Flair's `Sentence`s
        """
        # Convert into sentences
        if isinstance(text, str):
            sentences = Sentence(text)
        elif isinstance(text, list) and all(isinstance(t, str) for t in text):
            sentences = [Sentence(t) for t in text]
        else:
            sentences = text

        # Load correct Embeddings module
        if not self.models[model_name_or_path]:
            if "bert" in model_name_or_path and "roberta" not in model_name_or_path:
                self.models[model_name_or_path] = BertEmbeddings(
                    model_name_or_path)
            elif "roberta" in model_name_or_path:
                self.models[model_name_or_path] = RoBERTaEmbeddings(
                    model_name_or_path)
            elif "gpt2" in model_name_or_path:
                self.models[model_name_or_path] = OpenAIGPT2Embeddings(
                    model_name_or_path)
            elif "xlnet" in model_name_or_path:
                self.models[model_name_or_path] = XLNetEmbeddings(
                    model_name_or_path)
            elif "xlm" in model_name_or_path:
                self.models[model_name_or_path] = XLMEmbeddings(
                    model_name_or_path)
            elif ("flair" in model_name_or_path
                  or model_name_or_path in FLAIR_PRETRAINED_MODEL_NAMES):
                self.models[model_name_or_path] = FlairEmbeddings(
                    model_name_or_path)
            else:
                try:
                    self.models[model_name_or_path] = WordEmbeddings(
                        model_name_or_path)
                except ValueError:
                    raise ValueError(
                        f"Embeddings not found for the model key: {model_name_or_path}, check documentation or custom model path to verify specified model"
                    )
        embedding = self.models[model_name_or_path]
        return embedding.embed(sentences)
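A usage sketch, assuming an instance of the enclosing class called `embedder` and that `self.models` is a `defaultdict` (otherwise the `if not self.models[model_name_or_path]` lookup would raise a `KeyError` for unseen keys):

# Hypothetical usage; `embedder` is an instance of the class that defines embed_text.
sentences = embedder.embed_text(
    ["Hello world.", "Flair produces contextual embeddings."],
    model_name_or_path="bert-base-cased",
)
for sentence in sentences:
    print(sentence[0].embedding.shape)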
Example #7
    def __init__(
        self,
        *embeddings: str,
        methods: List[str] = ["rnn", "pool"],
        configs: Dict = {
            "pool_configs": {
                "fine_tune_mode": "linear",
                "pooling": "mean"
            },
            "rnn_configs": {
                "hidden_size": 512,
                "rnn_layers": 1,
                "reproject_words": True,
                "reproject_words_dimension": 256,
                "bidirectional": False,
                "dropout": 0.5,
                "word_dropout": 0.0,
                "locked_dropout": 0.0,
                "rnn_type": "GRU",
                "fine_tune": True,
            },
        },
    ):
        print("May need a couple moments to instantiate...")
        self.embedding_stack = []

        # Check methods
        for m in methods:
            assert m in self.__class__.__allowed_methods

        # Set configs for pooling and rnn parameters
        for k, v in configs.items():
            assert k in self.__class__.__allowed_configs
            setattr(self, k, v)

        # Load correct Embeddings module
        for model_name_or_path in embeddings:
            if "bert" in model_name_or_path and "roberta" not in model_name_or_path:
                self.embedding_stack.append(BertEmbeddings(model_name_or_path))
            elif "roberta" in model_name_or_path:
                self.embedding_stack.append(
                    RoBERTaEmbeddings(model_name_or_path))
            elif "gpt2" in model_name_or_path:
                self.embedding_stack.append(
                    OpenAIGPT2Embeddings(model_name_or_path))
            elif "xlnet" in model_name_or_path:
                self.embedding_stack.append(
                    XLNetEmbeddings(model_name_or_path))
            elif "xlm" in model_name_or_path:
                self.embedding_stack.append(XLMEmbeddings(model_name_or_path))
            elif ("flair" in model_name_or_path
                  or model_name_or_path in FLAIR_PRETRAINED_MODEL_NAMES):
                self.embedding_stack.append(
                    FlairEmbeddings(model_name_or_path))
            else:
                try:
                    self.embedding_stack.append(
                        WordEmbeddings(model_name_or_path))
                except ValueError:
                    raise ValueError(
                        f"Embeddings not found for the model key: {model_name_or_path}, check documentation or custom model path to verify specified model"
                    )

        assert len(self.embedding_stack) != 0
        if "pool" in methods:
            self.pool_embeddings = DocumentPoolEmbeddings(
                self.embedding_stack, **self.pool_configs)
            print("Pooled embedding loaded")
        if "rnn" in methods:
            self.rnn_embeddings = DocumentRNNEmbeddings(
                self.embedding_stack, **self.rnn_configs)
            print("RNN embeddings loaded")
Example #8
def get_xlm(model_name):
    return XLMEmbeddings(model_name)
def main():
    argparser = argparse.ArgumentParser(
        description="download embeddings for models")
    argparser.add_argument("-bert",
                           "--bert",
                           action='store_true',
                           default=False,
                           help="bert embeddings (12 layers)")
    argparser.add_argument("-roberta",
                           "--roberta",
                           action='store_true',
                           default=False,
                           help="roberta embeddings (12 layers)")
    argparser.add_argument("-gpt2",
                           "--gpt2",
                           action='store_true',
                           default=False,
                           help="gpt2 embeddings (12 layers)")
    argparser.add_argument("-xlm",
                           "--xlm",
                           action='store_true',
                           default=False,
                           help="xlm embeddings (24 layers)")
    argparser.add_argument("-local",
                           "--local",
                           action='store_true',
                           default=False,
                           help="if local")
    args = argparser.parse_args()

    # verify arguments: exactly one model flag must be selected
    selected = [args.bert, args.roberta, args.xlm, args.gpt2]
    if sum(selected) > 1:
        print("select only one flag for model type from (bert, roberta, xlm, gpt2)")
        exit()
    if not any(selected):
        print("select at least one flag for model type from (bert, roberta, xlm, gpt2)")
        exit()

    if args.bert or args.roberta or args.gpt2:
        num_layers = 12
    if args.xlm:
        num_layers = 24

    # read sentences (one per line)
    with open("cleaned_sentencesGLM.txt", "r") as f:
        file = f.read().splitlines()

    # specify model
    print("uploading model...")
    for layer in tqdm(range(num_layers)):
        print(layer)
        if args.bert:
            embeddings = BertEmbeddings("bert-base-multilingual-cased",
                                        layers="-{}".format(layer))
            model_type = "bert"
        elif args.roberta:
            embeddings = RoBERTaEmbeddings("roberta-base",
                                           layers="-{}".format(layer))
            model_type = "roberta"
        elif args.xlm:
            embeddings = XLMEmbeddings("xlm-mlm-en-2048",
                                       layers="-{}".format(layer))
            model_type = "xlm"
        elif args.gpt2:
            embeddings = TransformerWordEmbeddings("gpt2",
                                                   layers="-{}".format(layer))
            model_type = "gpt2"
        else:
            print("error on calling embeddings")
            exit()

        embed_matrix = get_embeddings(file, embeddings)

        print("aggregating types...")
        avg_sentence = process_sentence(embed_matrix, "avg")
        max_sentence = process_sentence(embed_matrix, "max")
        min_sentence = process_sentence(embed_matrix, "min")
        last_sentence = process_sentence(embed_matrix, "last")

        methods = ['avg', 'max', 'min', 'last']
        mats = [avg_sentence, max_sentence, min_sentence, last_sentence]

        bool_labels = [1] * len(file)

        print("saving files...")
        if args.local:
            file_path = '../embeddings/{}/layer{}/'.format(model_type, layer)
        else:
            file_path = "/n/shieber_lab/Lab/users/cjou/embeddings/{}/layer{}/".format(
                model_type, layer)

        if not os.path.exists(file_path):
            os.makedirs(file_path)

        for i in range(len(methods)):
            print("saving file: " + file_path + str(methods[i]) + ".p")
            pickle.dump(mats[i], open(file_path + str(methods[i]) + ".p",
                                      "wb"))

    print("done.")
Example #10

if __name__ == "__main__":

    from test_textsim import *
    from flair.embeddings import XLMRobertaEmbeddings, BertEmbeddings, XLNetEmbeddings, XLMEmbeddings, RoBERTaEmbeddings

    measures = {}

    SAME, DIFF = load_data("./data/test_STS2017en-en.txt")

    MODELS = {
        "xlmr": XLMRobertaEmbeddings(),
        "bert": BertEmbeddings(),
        "xlnet": XLNetEmbeddings(),
        "xlm": XLMEmbeddings(),
        "roberta": RoBERTaEmbeddings(),
    }

    for model in MODELS:

        print(model)

        results = run_experiment(SAME,
                                 DIFF,
                                 lambda x: flair_embed_dict(x, MODELS[model]),
                                 wmdistance,
                                 inverse=True)
        measures['{}-wmdist'.format(model)] = results
        print(score(results[0], results[1]))
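`flair_embed_dict` comes from `test_textsim` and is not shown; a plausible sketch, assuming it maps each token of an input string to its embedding vector (the form `wmdistance` would need):

import numpy as np
from flair.data import Sentence

def flair_embed_dict(text, embeddings):
    # Assumed helper: return {token_text: vector} for one input string.
    sentence = Sentence(text)
    embeddings.embed(sentence)
    return {token.text: token.embedding.detach().cpu().numpy() for token in sentence}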