def setUp(self):
    from sparknlp.training import POS
    self.data = SparkContextForTest.data
    self.train_pos = POS().readDataset(
        SparkContextForTest.spark,
        os.getcwd() + "/../src/test/resources/anc-pos-corpus-small/test-training.txt",
        delimiter="|",
        outputPosCol="tags",
        outputDocumentCol="document",
        outputTextCol="text")
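# Hedged sketch, not part of the original suite: `train_pos` is typically fed to a
# PerceptronApproach through a short pipeline, with setPosColumn matching the
# "tags" column produced by POS.readDataset above. The helper name and the
# iteration count are assumptions.
def _pos_training_sketch(self):
    from pyspark.ml import Pipeline
    from sparknlp.base import DocumentAssembler
    from sparknlp.annotator import SentenceDetector, Tokenizer, PerceptronApproach
    document_assembler = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("document")
    sentence_detector = SentenceDetector() \
        .setInputCols(["document"]) \
        .setOutputCol("sentence")
    tokenizer = Tokenizer() \
        .setInputCols(["sentence"]) \
        .setOutputCol("token")
    pos_tagger = PerceptronApproach() \
        .setInputCols(["token", "sentence"]) \
        .setOutputCol("pos") \
        .setPosColumn("tags") \
        .setNIterations(1)
    pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, pos_tagger])
    return pipeline.fit(self.train_pos)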
def setUp(self):
    from sparknlp.training import POS
    self.data = SparkContextForTest.spark \
        .createDataFrame([["I saw a girl with a telescope"]]).toDF("text")
    self.corpus = os.getcwd() + "/../src/test/resources/anc-pos-corpus-small/"
    self.conllu = os.getcwd() + "/../src/test/resources/parser/unlabeled/conll-u/train_small.conllu.txt"
    self.train_pos = POS().readDataset(
        SparkContextForTest.spark,
        os.getcwd() + "/../src/test/resources/anc-pos-corpus-small/test-training.txt",
        delimiter="|",
        outputPosCol="tags",
        outputDocumentCol="document",
        outputTextCol="text")
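# Hedged sketch, not part of the original suite: the `conllu` path set up above is
# the kind of input DependencyParserApproach consumes via setConllU. The helper
# name and the iteration count are assumptions.
def _dependency_parser_sketch(self):
    from sparknlp.annotator import DependencyParserApproach
    # The approach expects sentence, pos, and token annotations at transform time;
    # training data comes from the CoNLL-U file.
    return DependencyParserApproach() \
        .setInputCols(["sentence", "pos", "token"]) \
        .setOutputCol("dependency") \
        .setConllU(self.conllu) \
        .setNumberOfIterations(10)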
@classmethod
def setUpClass(cls):
    from sparknlp.training import POS
    cls.trainDataset = POS().readDataset(
        SparkContextForTest.spark,
        "../src/test/resources/anc-pos-corpus-small/test-training.txt",
        delimiter="|",
        outputPosCol="tags",
        outputDocumentCol="document",
        outputTextCol="text")
    document_assembler = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("document")
    tokenizer = Tokenizer() \
        .setInputCols("document") \
        .setOutputCol("token")
    embds = WordEmbeddings() \
        .setStoragePath("../src/test/resources/ner-corpus/embeddings.100d.test.txt", ReadAs.TEXT) \
        .setDimension(100) \
        .setStorageRef("glove_100d") \
        .setInputCols("document", "token") \
        .setOutputCol("embeddings")
    cls.classifier = NerDLApproach() \
        .setInputCols("document", "token", "embeddings") \
        .setLabelColumn("tags") \
        .setOutputCol("out") \
        .setMaxEpochs(1) \
        .setEnableOutputLogs(True) \
        .setOutputLogsPath(cls.OUTPUT_LOG_PATH)
    cls.pipeline = Pipeline(
        stages=[document_assembler, tokenizer, embds, cls.classifier])

    # TODO: Does not work for SentenceDetectorDLApproach due to log naming scheme.
    # cls.trainDataset = SparkContextForTest.spark.createDataFrame(
    #     [["This is a sentence."]], ["text"]
    # )
    #
    # document_assembler = (
    #     DocumentAssembler().setInputCol("text").setOutputCol("document")
    # )
    # cls.classifier = (
    #     SentenceDetectorDLApproach()
    #     .setInputCols("document")
    #     .setOutputCol("sentence")
    #     .setOutputLogsPath(cls.OUTPUT_LOG_PATH)
    #     .setEpochsNumber(1)
    # )
    #
    # cls.pipeline = Pipeline(stages=[document_assembler, cls.classifier])

    comet_ml.init(project_name="sparknlp-testing", offline_directory="/tmp")
    cls.logger = CometLogger(comet_mode="offline",
                             offline_directory=cls.OUTPUT_LOG_PATH)
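# Hedged sketch, not part of the original setup: a typical way this fixture gets
# exercised. CometLogger.monitor() and end() come from sparknlp.logging.comet;
# the test-method name is an assumption.
def test_monitor_training_sketch(self):
    # Stream the NerDLApproach output logs to Comet while the pipeline fits.
    self.logger.monitor(logdir=self.OUTPUT_LOG_PATH, model=self.classifier)
    self.pipeline.fit(self.trainDataset)
    self.logger.end()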