Example 1
0
 def setUp(self):
     """Load the shared test DataFrame and read a POS training set from the ANC test corpus."""
     from sparknlp.training import POS

     self.data = SparkContextForTest.data
     # Training file lives under the JVM test resources, resolved relative to the cwd.
     corpus_path = os.getcwd() + "/../src/test/resources/anc-pos-corpus-small/test-training.txt"
     self.train_pos = POS().readDataset(
         SparkContextForTest.spark,
         corpus_path,
         delimiter="|",
         outputPosCol="tags",
         outputDocumentCol="document",
         outputTextCol="text",
     )
Example 2
0
 def setUp(self):
     """Build a one-row text DataFrame and resolve corpus/CoNLL-U paths plus POS training data."""
     from sparknlp.training import POS

     cwd = os.getcwd()
     spark = SparkContextForTest.spark
     self.data = spark.createDataFrame([["I saw a girl with a telescope"]]).toDF("text")
     self.corpus = cwd + "/../src/test/resources/anc-pos-corpus-small/"
     self.conllu = cwd + "/../src/test/resources/parser/unlabeled/conll-u/train_small.conllu.txt"
     # Pipe-delimited word|tag training file from the ANC test corpus.
     self.train_pos = POS().readDataset(
         spark,
         cwd + "/../src/test/resources/anc-pos-corpus-small/test-training.txt",
         delimiter="|",
         outputPosCol="tags",
         outputDocumentCol="document",
         outputTextCol="text",
     )
Example 3
0
    def setUpClass(cls):
        """One-time setup: read a POS training set, assemble an NER training
        pipeline, and create an offline Comet logger for the test run."""
        from sparknlp.training import POS

        # POS-annotated training data read from the pipe-delimited ANC test corpus.
        cls.trainDataset = POS().readDataset(
            SparkContextForTest.spark,
            "../src/test/resources/anc-pos-corpus-small/test-training.txt",
            delimiter="|",
            outputPosCol="tags",
            outputDocumentCol="document",
            outputTextCol="text",
        )

        document_assembler = (
            DocumentAssembler().setInputCol("text").setOutputCol("document"))

        tokenizer = Tokenizer().setInputCols("document").setOutputCol("token")

        # 100-dim embeddings loaded from a local text file; storage ref pins
        # the lookup name to "glove_100d".
        embds = (WordEmbeddings().setStoragePath(
            "../src/test/resources/ner-corpus/embeddings.100d.test.txt",
            ReadAs.TEXT).setDimension(100).setStorageRef(
                "glove_100d").setInputCols("document",
                                           "token").setOutputCol("embeddings"))

        # Single-epoch NER trainer that writes training logs to
        # cls.OUTPUT_LOG_PATH (class attribute — presumably defined outside
        # this view; confirm).
        cls.classifier = (
            NerDLApproach().setInputCols(
                "document", "token",
                "embeddings").setLabelColumn("tags").setOutputCol("out").
            setMaxEpochs(1).setEnableOutputLogs(True).setOutputLogsPath(
                cls.OUTPUT_LOG_PATH))

        cls.pipeline = Pipeline(
            stages=[document_assembler, tokenizer, embds, cls.classifier])

        # TODO: Does not work for SentenceDetectorDLApproach due to log naming scheme.
        # cls.trainDataset = SparkContextForTest.spark.createDataFrame(
        #     [["This is a sentence."]], ["text"]
        # )
        #
        # document_assembler = (
        #     DocumentAssembler().setInputCol("text").setOutputCol("document")
        # )
        # cls.classifier = (
        #     SentenceDetectorDLApproach()
        #     .setInputCols("document")
        #     .setOutputCol("sentence")
        #     .setOutputLogsPath(cls.OUTPUT_LOG_PATH)
        #     .setEpochsNumber(1)
        # )
        #
        # cls.pipeline = Pipeline(stages=[document_assembler, cls.classifier])

        # Offline Comet mode: experiments are written locally, so the tests
        # need no network access or API key.
        comet_ml.init(project_name="sparknlp-testing",
                      offline_directory="/tmp")
        cls.logger = CometLogger(comet_mode="offline",
                                 offline_directory=cls.OUTPUT_LOG_PATH)