def main():
    spark, sc = init_spark()
    training_data = CoNLL().readDataset(spark, './spark_nlp/eng.train.txt')
    # training_data.show()

    bert = BertEmbeddings.pretrained('bert_base_cased', 'en') \
        .setInputCols(["sentence", "token"]) \
        .setOutputCol("bert") \
        .setCaseSensitive(False) \
        .setPoolingLayer(0)

    nerTagger = NerDLApproach() \
        .setInputCols(["sentence", "token", "bert"]) \
        .setLabelColumn("label") \
        .setOutputCol("ner") \
        .setMaxEpochs(1) \
        .setRandomSeed(0) \
        .setVerbose(1) \
        .setValidationSplit(0.2) \
        .setEvaluationLogExtended(True) \
        .setEnableOutputLogs(True) \
        .setIncludeConfidence(True)
        # .setTestDataset("test_withEmbeds.parquet")

    '''
    test_data = CoNLL().readDataset(spark, './spark_nlp/eng.testa.txt')
    test_data = bert.transform(test_data)
    test_data.write.parquet("test_withEmbeds.parquet")
    '''

    ner_pipeline = Pipeline(stages=[bert, nerTagger])
    ner_model = ner_pipeline.fit(training_data)
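# init_spark() is not defined in the snippet above. A minimal sketch of what it
# could look like, assuming the same local Spark NLP 2.x configuration used by
# the SparkNLPModule snippet later in this document (package version and memory
# settings are assumptions, not taken from the original main()):
from pyspark.sql import SparkSession

def init_spark():
    # Start (or reuse) a local SparkSession with the Spark NLP package on the classpath.
    spark = (SparkSession.builder
             .appName("DL-NER")
             .master("local[*]")
             .config("spark.driver.memory", "8G")
             .config("spark.jars.packages", "JohnSnowLabs:spark-nlp:2.1.0")
             .config("spark.kryoserializer.buffer.max", "500m")
             .getOrCreate())
    return spark, spark.sparkContext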
def setUp(self):
    from sparknlp.training import CoNLL
    self.data = SparkContextForTest.data
    self.embeddings = os.getcwd() + \
        "/../src/test/resources/ner-corpus/embeddings.100d.test.txt"
    external_dataset = os.getcwd() + \
        "/../src/test/resources/ner-corpus/sentence-detector/unpunctuated_dataset.txt"
    self.training_set = CoNLL().readDataset(SparkContextForTest.spark,
                                            external_dataset)
def read_data(dir_path, filenames):
    dfs = []
    print(dir_path)
    # directory = os.fsencode(dir_path)
    with open(filenames) as names:
        files = names.readlines()
    for filename in files:
        # list.append mutates in place and returns None, so do not reassign dfs
        dfs.append(CoNLL().readDataset(spark, dir_path + filename.strip()))
    return unionAll(dfs)
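# unionAll is not defined in the snippet above. A minimal sketch, assuming it
# simply folds a DataFrame union over the list built by read_data (the helper
# name comes from the snippet; the implementation below is an assumption):
from functools import reduce
from pyspark.sql import DataFrame

def unionAll(dfs):
    # Merge DataFrames that share the same CoNLL schema into a single DataFrame.
    return reduce(DataFrame.unionByName, dfs)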
# spark = sparknlp.start()
print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

"""Create some data for testing purposes"""
from pyspark.sql import Row
R = Row('sentence', 'start', 'end')
test_data = spark.createDataFrame([R('Peter is a good person, and he was working at IBM', 0, 1)])

"""Create a custom pipeline"""
!ls
from sparknlp.training import CoNLL
training_data = CoNLL().readDataset(spark, 'con_rest_train.bio')
training_data.show()

import time

documentAssembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

embeddings = WordEmbeddings() \
    .setInputCols(["document", "token"]) \
    .setOutputCol("embeddings")
class SparkNLPModule:
    def __init__(self):
        self.spark = (
            SparkSession.builder.appName("DL-NER")
            .master("local[*]")
            .config("spark.driver.memory", "8G")
            .config("spark.jars.packages", "JohnSnowLabs:spark-nlp:2.1.0")
            .config("spark.kryoserializer.buffer.max", "500m")
            .getOrCreate())
        self.sc = self.spark.sparkContext
        self.ner_model = None
        self.conll = CoNLL(
            documentCol="document",
            sentenceCol="sentence",
            tokenCol="token",
            posCol="pos",
        )

    def train(self, training_data_path, glove_path):
        training_data = self.conll.readDataset(self.spark, training_data_path)
        glove = (WordEmbeddings()
                 .setInputCols(["sentence", "token"])
                 .setOutputCol("glove")
                 .setEmbeddingsSource(glove_path, 300, 2))
        nerTagger = (NerDLApproach()
                     .setInputCols(["sentence", "token", "glove"])
                     .setLabelColumn("label")
                     .setOutputCol("ner")
                     .setMaxEpochs(1)
                     .setRandomSeed(0)
                     .setVerbose(0))
        converter = (NerConverter()
                     .setInputCols(["sentence", "token", "ner"])
                     .setOutputCol("ner_span"))
        finisher = (Finisher()
                    .setInputCols(["sentence", "token", "ner", "ner_span"])
                    .setIncludeMetadata(True))
        ner_pipeline = Pipeline(stages=[glove, nerTagger, converter, finisher])

        print("Start fitting")
        started = time.time()
        self.ner_model = ner_pipeline.fit(training_data)
        traintime = time.time() - started
        print("Fitting finished in", traintime, "s")
        return self.ner_model
        # print(ner_model)

    def predict(self, model, txt):
        if not self.ner_model:
            self.ner_model = model
        # run the prediction as a DataFrame (test the text against the existing model)
        prediction_data = self.spark.createDataFrame([[txt]]).toDF("text")  # testing

        document = DocumentAssembler().setInputCol("text").setOutputCol("document")
        sentence = (SentenceDetector()
                    .setInputCols(["document"])
                    .setOutputCol("sentence"))
        token = Tokenizer().setInputCols(["sentence"]).setOutputCol("token")

        prediction_pipeline = Pipeline(stages=[
            document,
            sentence,
            token,
            self.ner_model,  # the NER model trained earlier
        ])
        prediction_model = prediction_pipeline.fit(prediction_data)
        predicted_res = prediction_model.transform(prediction_data)
        # show the results
        # predicted_res.show()

        # take the 'finished_ner' column for comparison and accuracy measurement
        finished_ner = predicted_res.select("finished_token", "finished_ner")
        rddl = finished_ner.rdd
        predicted = []
        # collect the 'finished_ner' RDD rows into a list of lists of strings
        for x in rddl.collect():
            predicted.append(list(x))
        print(predicted)
        self.convert_to_rasa(predicted[0])

    def load_model(self, model_path):
        if model_path:
            sameModel = PipelineModel.read().load(model_path)
            return sameModel
        else:
            return None

    def convert_to_rasa(self, pred):
        """
        Clinton was the President of United States of America

        The output still looks like this:
        [['Clinton'], ['Clinton'], [],
         ['United', 'States', 'of', 'America'], ['United', 'States', 'of', 'America'],
         ['United', 'States', 'of', 'America'], ['United', 'States', 'of', 'America'],
         ['United', 'States', 'of', 'America']]

        It should be:
        [['Clinton', 'person'], ['President', 'role'],
         ['United', 'States', 'of', 'America', 'location']]
        person, role, location => the entities we want

        Rasa expects something like this:
        [{"value": "Clinton", "start": 0, "end": 6, "entity": "person",
          "confidence": null, "extractor": "SparkNLPEntityExtractor"},
         {"value": "President", "start": 16, "end": 24, "entity": "person",
          "confidence": null, "extractor": "SparkNLPEntityExtractor"},
         ...etc.]
        """
        tokens = pred[0]
        entities = pred[1]
        prev_ent = None
        hasil = []
        value = []
        for i in range(len(tokens)):
            if entities[i] != "O":
                if prev_ent is None or prev_ent == entities[i]:
                    value.append(tokens[i])
                elif prev_ent != entities[i]:
                    prev_ent = entities[i]
                    value = []
            else:
                prev_ent = None
                value = []
            if value and value not in hasil:
                hasil.append(value)
        print("DEBUGGGG\n==============")
        print(hasil)
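# A minimal usage sketch tying SparkNLPModule.train() and predict() together;
# both file paths are placeholders, not paths taken from the original code:
if __name__ == "__main__":
    module = SparkNLPModule()
    ner_model = module.train("./data/eng.train.txt", "./data/glove.6B.300d.txt")
    module.predict(ner_model, "Clinton was the President of United States of America")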