Exemple #1
0
 def _download_if_required(self):
     """Download the model weights and data to the client machine.

     Hands the fixed ParsCit LSTM-CRF archive URL to ``cached_path`` together
     with ``self.final_model_dir`` as the local path.
     """
     model_archive_url = "https://parsect-models.s3-ap-southeast-1.amazonaws.com/lstm_crf_parscit_final.zip"
     cached_path(path=self.final_model_dir, url=model_archive_url)
Exemple #2
0
 def _download_if_required(self):
     """Download the model weights and data to the client machine.

     The local path is ``self.final_model_dir`` with a ``.zip`` suffix
     appended; ``cached_path`` is asked to unzip the i2b2 archive.
     """
     local_archive = f"{self.final_model_dir}.zip"
     remote_archive = "https://parsect-models.s3-ap-southeast-1.amazonaws.com/i2b2.zip"
     cached_path(path=local_archive, url=remote_archive, unzip=True)
Exemple #3
0
    def __init__(
            self,
            dropout_value: float = 0.5,
            datasets_manager: DatasetsManager = None,
            word_tokens_namespace: str = "tokens",
            device: torch.device = torch.device("cpu"),
            fine_tune: bool = False,
    ):
        """Store the configuration and load the ELMo module.

        Parameters
        ----------
        dropout_value : float
            Passed to ``Elmo`` as its ``dropout`` argument.
        datasets_manager : DatasetsManager
            Stored on the instance as ``self.datasets_manager``.
        word_tokens_namespace : str
            Stored on the instance as ``self.word_tokens_namespace``.
        device : torch.device
            Either a ``torch.device`` or a string; a string is converted
            with ``torch.device(device)``.
        fine_tune : bool
            Passed to ``Elmo`` as ``requires_grad``.
        """
        super(ElmoEmbedder, self).__init__()

        # Sometimes you need two different tensors that are
        # two different linear combination of representations
        # TODO: change this in-case you need 2 representations
        self.num_output_representations = 1
        self.dropout_value = dropout_value
        self.datasets_manager = datasets_manager
        self.device = torch.device(device) if isinstance(device, str) else device
        self.msg_printer = wasabi.Printer()
        self.word_tokens_namespace = word_tokens_namespace
        self.fine_tune = fine_tune
        self.embedder_name = "ElmoEmbedder"

        self.elmo_options_file = pathlib.Path(ELMO_OPTIONS_FILE)
        self.elmo_weights_file = pathlib.Path(ELMO_WEIGHTS_FILE)

        # Check the two files independently. Gating both downloads on the
        # options file alone would leave the weights missing whenever the
        # options file is present but the weights file is not (for example
        # after an interrupted download).
        if not self.elmo_options_file.is_file():
            self.elmo_options_file = cached_path(
                url=EMBEDDING_FILE_URLS["ELMO_OPTIONS_FILE"],
                path=self.elmo_options_file,
                unzip=False,
            )
        if not self.elmo_weights_file.is_file():
            self.elmo_weights_file = cached_path(
                url=EMBEDDING_FILE_URLS["ELMO_WEIGHTS_FILE"],
                path=self.elmo_weights_file,
                unzip=False,
            )

        with self.msg_printer.loading("Loading Elmo Object"):
            self.elmo: nn.Module = Elmo(
                options_file=self.elmo_options_file,
                weight_file=self.elmo_weights_file,
                num_output_representations=self.num_output_representations,
                dropout=self.dropout_value,
                requires_grad=fine_tune,
            )

        self.msg_printer.good("Finished Loading ELMO object")
 def _get_data(self):
     """Fetch the ParsCit train/dev/test files and wrap them in a manager.

     Each split is resolved through ``cached_path`` (no unzipping) under
     ``self.data_dir`` and then handed to ``SeqLabellingDatasetManager``.
     """
     train_file = cached_path(
         path=self.data_dir.joinpath("parscit.train"),
         url=self.train_data_file_url,
         unzip=False,
     )
     dev_file = cached_path(
         path=self.data_dir.joinpath("parscit.dev"),
         url=self.dev_data_file_url,
         unzip=False,
     )
     test_file = cached_path(
         path=self.data_dir.joinpath("parscit.test"),
         url=self.test_data_file_url,
         unzip=False,
     )

     return SeqLabellingDatasetManager(
         train_filename=train_file,
         dev_filename=dev_file,
         test_filename=test_file,
     )
Exemple #5
0
    def _get_data(self):
        """Fetch the SciCite train/dev/test files and wrap them in a manager.

        Each split is resolved through ``cached_path`` (no unzipping) under
        ``self.data_dir``; the results feed ``TextClassificationDatasetManager``.
        """
        return TextClassificationDatasetManager(
            train_filename=cached_path(
                path=self.data_dir.joinpath("scicite.train"),
                url=self.train_data_url,
                unzip=False,
            ),
            dev_filename=cached_path(
                path=self.data_dir.joinpath("scicite.dev"),
                url=self.dev_data_url,
                unzip=False,
            ),
            test_filename=cached_path(
                path=self.data_dir.joinpath("scicite.test"),
                url=self.test_data_url,
                unzip=False,
            ),
        )
Exemple #6
0
    def _get_data(self):
        """Fetch the i2b2 files and wrap them in a ``CoNLLDatasetManager``.

        Each split is resolved through ``cached_path`` (no unzipping) under
        ``self.data_dir``. The manager is built with three ``"NER"`` columns
        and ``train_only="ner"``.
        """
        train_split = cached_path(
            path=self.data_dir.joinpath("i2b2.train"),
            url=self.train_data_url,
            unzip=False,
        )
        dev_split = cached_path(
            path=self.data_dir.joinpath("i2b2.dev"),
            url=self.dev_data_url,
            unzip=False,
        )
        # NOTE(review): the test split is fetched from the dev file and dev
        # URL, exactly like the dev split above -- confirm this is intended.
        test_split = cached_path(
            path=self.data_dir.joinpath("i2b2.dev"),
            url=self.dev_data_url,
            unzip=False,
        )

        return CoNLLDatasetManager(
            train_filename=train_split,
            dev_filename=dev_split,
            test_filename=test_split,
            column_names=["NER", "NER", "NER"],
            train_only="ner",
        )
Exemple #7
0
    def _get_data(self):
        """Fetch the SectLabel train/dev/test files and wrap them in a manager.

        The three target paths under ``self.data_dir`` are built first, then
        each is resolved through ``cached_path`` (no unzipping) and passed to
        ``TextClassificationDatasetManager``.
        """
        train_path, dev_path, test_path = [
            self.data_dir.joinpath(name)
            for name in ("sectLabel.train", "sectLabel.dev", "sectLabel.test")
        ]

        train_file = cached_path(path=train_path, url=self.train_data_url, unzip=False)
        dev_file = cached_path(path=dev_path, url=self.dev_data_url, unzip=False)
        test_file = cached_path(path=test_path, url=self.test_data_url, unzip=False)

        return TextClassificationDatasetManager(
            train_filename=train_file,
            dev_filename=dev_file,
            test_filename=test_file,
        )
Exemple #8
0
    def get_preloaded_filename(self):
        """Return the cached embedding file path for ``self.embedding_type``.

        The embedding type selects a file name (joined onto
        ``EMBEDDING_CACHE_DIR``) and a key into ``EMBEDDING_FILE_URLS``. The
        URL is then passed to ``cached_path`` with ``unzip=True``, targeting
        ``self.embedding_cache_dir`` joined with the URL's last path component.

        Raises
        ------
        ValueError
            If ``self.embedding_type`` matches none of the known types.
        """
        # (embedding type, file name inside the cache dir, URL key)
        known_embeddings = (
            ("glove_6B_50", "glove.6B.50d.txt", "GLOVE_FILE"),
            ("glove_6B_100", "glove.6B.100d.txt", "GLOVE_FILE"),
            ("glove_6B_200", "glove.6B.200d.txt", "GLOVE_FILE"),
            ("glove_6B_300", "glove.6B.300d.txt", "GLOVE_FILE"),
            ("parscit", "vectors_with_unk.kv", "PARSCIT_EMBEDDINGS"),
            ("lample_conll", "lample_conll", "LAMPLE_CONLL"),
        )

        requested = self.embedding_type
        for type_name, basename, url_key in known_embeddings:
            if requested == type_name:
                break
        else:
            raise ValueError(
                f"Check the embedding type. It has to be one of {self.allowed_embedding_types}"
            )

        filename = os.path.join(EMBEDDING_CACHE_DIR, basename)
        url = EMBEDDING_FILE_URLS[url_key]

        # The download target is named after the last component of the URL.
        archive_name = pathlib.Path(url).parts[-1]
        download_target = self.embedding_cache_dir.joinpath(archive_name)
        cached_path(url=url, unzip=True, path=download_target)

        return filename
Exemple #9
0
 def _download_if_required(self):
     """Hand the SectLabel ELMo BiLSTM archive URL to ``cached_path``.

     ``self.final_model_dir`` is passed as the local path.
     """
     model_archive_url = "https://parsect-models.s3-ap-southeast-1.amazonaws.com/sectlabel_elmo_bilstm.zip"
     cached_path(path=self.final_model_dir, url=model_archive_url)
Exemple #10
0
 def _download_if_required(self):
     """Hand the GenericSect BOW ELMo archive URL to ``cached_path``.

     ``self.final_model_dir`` is passed as the local path.
     """
     model_archive_url = "https://parsect-models.s3-ap-southeast-1.amazonaws.com/genericsect_bow_elmo.zip"
     cached_path(path=self.final_model_dir, url=model_archive_url)