import sparknlp
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql.functions import udf, col
from pyspark.sql.types import IntegerType, StringType
from sparknlp.pretrained import PretrainedPipeline

# 1. Setup
sparknlp.start()
conf = SparkConf().setAppName('parallel-project')
sc = SparkContext.getOrCreate(conf)
spark = SQLContext(sc)
pipeline = PretrainedPipeline('analyze_sentiment', 'en')

# 2. Data Cleansing
# Read the data into a DataFrame
comments = spark.read.json('RC_2019-02-28-one-day')
# dummy_data = [["Hello, world!", "/r/soccer"],
#               ["Wow. Simply wow. What an unbelievable pass, inch perfect.", "/r/nba"]]
# comments = sc.parallelize(dummy_data).toDF(['body', 'subreddit'])
comments.printSchema()

# Rename 'body' to 'text' for spark-nlp
comments = comments.withColumnRenamed('body', 'text')

# Keep only the columns we're interested in
commentsCleaned = comments.select('subreddit', 'text')

# Filter out deleted and removed comments
commentsCleaned = commentsCleaned.filter(commentsCleaned.text != '[deleted]') \
    .filter(commentsCleaned.text != '[removed]')
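# 3. Sentiment Analysis (a minimal sketch of the step this file builds toward;
# not part of the original snippet). PretrainedPipeline.transform() appends the
# pipeline's annotation columns, and 'analyze_sentiment' emits a 'sentiment'
# column whose annotations carry the label in their 'result' field.
result = pipeline.transform(commentsCleaned)
result.select('subreddit', col('sentiment.result').alias('sentiment')).show(5, truncate=False)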
import os

# Install Java 8 for Spark
! apt-get update -qq
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

# Install pyspark
! pip install --ignore-installed pyspark==2.4.4
# Install Spark NLP
! pip install --ignore-installed spark-nlp==2.5.0

import sparknlp

spark = sparknlp.start()
print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

from google.colab import drive
drive.mount('/content/drive')

Dataset = spark.read.option("header", True).csv('drive/My Drive/bbc-text.csv')
Dataset.show(10)

df_train, df_test = Dataset.randomSplit([.7, .3])
df_train.show(5)
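# A minimal sketch of the training step this notebook typically continues with
# (not in the original cell; it assumes bbc-text.csv has 'text' and 'category'
# columns): embed each article with the pretrained Universal Sentence Encoder,
# then fit ClassifierDL on the 70% split and inspect predictions on the rest.
from pyspark.ml import Pipeline
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import UniversalSentenceEncoder, ClassifierDLApproach

document = DocumentAssembler().setInputCol("text").setOutputCol("document")
use = UniversalSentenceEncoder.pretrained() \
    .setInputCols(["document"]).setOutputCol("sentence_embeddings")
classifier = ClassifierDLApproach() \
    .setInputCols(["sentence_embeddings"]).setOutputCol("class") \
    .setLabelColumn("category").setMaxEpochs(5)

model = Pipeline(stages=[document, use, classifier]).fit(df_train)
model.transform(df_test).select("category", "class.result").show(5)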
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import base64
from sklearn.metrics import classification_report

# import findspark
# findspark.init()

import sparknlp
spark = sparknlp.start(gpu=True)

import pyspark
from pyspark.ml import Pipeline
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *
from pyspark.sql.types import StringType
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from sparknlp.pretrained import PretrainedPipeline

import json
import time
import warnings
import os

warnings.filterwarnings("ignore")
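# A minimal sketch of how an app built on these imports might wire Spark NLP
# into Streamlit (the pipeline name and widget layout are assumptions, not part
# of the original file): load a pretrained pipeline, annotate the user's text,
# and render the token-level annotations as a DataFrame.
pipeline = PretrainedPipeline('explain_document_dl', 'en')

user_text = st.text_input('Enter text to annotate')
if user_text:
    annotations = pipeline.annotate(user_text)
    st.write(pd.DataFrame({'token': annotations['token'],
                           'pos': annotations['pos']}))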
import nlu
import sparknlp


def get_sample_sdf():
    # Start a Spark session through Spark NLP, hand it to nlu, and wrap the
    # sample pandas DataFrame in a Spark DataFrame.
    nlu.spark = sparknlp.start()
    nlu.spark_started = True
    return nlu.spark.createDataFrame(get_sample_pdf())
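# get_sample_pdf() lives elsewhere in the same test utilities; a hypothetical
# stand-in, assuming it only needs to return a small pandas DataFrame of text:
import pandas as pd


def get_sample_pdf():
    return pd.DataFrame({'text': ['I love Spark NLP!', 'Streaming data is fun.']})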
from PredictionAlgorithms.SentimentAnalysis.SAMachineLearning import SAMachineLearning
from PredictionAlgorithms.PredictiveConstants import PredictiveConstants as pc
from PredictionAlgorithms.PredictiveUtilities import PredictiveUtilities as pu
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.sql import SparkSession
import sparknlp

# sparkTest = \
#     SparkSession.builder.appName('DMXPredictiveAnalytics').master('local[*]').getOrCreate()
# sparkTest.sparkContext.setLogLevel('ERROR')
sparkTest = sparknlp.start()


class SADecisionTreeClassifier(SAMachineLearning):
    def sentimentData(self, sentimentDataInfo):
        sentimentDataInfo = self.sentimentAnalysis(sentimentDataInfo)
        sentimentDataInfo = self.trainModel(sentimentDataInfo)
        sentimentDataInfo = self.invertIndexColm(sentimentDataInfo)

        modelName = sentimentDataInfo.get(pc.MODELSHEETNAME)
        storagePath = sentimentDataInfo.get(pc.STORAGELOCATION)
        jsonStorageLocation = storagePath + modelName

        # --sahil: store the data in JSON format --> write a separate method for this.
        # Drop non-serializable entries before persisting the info dict.
        sentimentDataInfo.pop(pc.SPARK, "None")
        sentimentDataInfo.pop(pc.DATASET, "None")
        sentimentDataInfo.pop(pc.TESTDATA, "None")
        sentimentDataInfo.pop(pc.TRAINDATA, "None")
        sentimentDataInfo.pop(pc.MODEL, "None")
        # json.dump(sentimentDataInfo, open(storagePath + modelName + ".json", 'w'))
        pu.writeToJson(jsonStorageLocation, sentimentDataInfo)
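# A hypothetical usage sketch (not part of the original module): the keys below
# come from PredictiveConstants as used above, but the exact contents of the
# info dict that SAMachineLearning.sentimentAnalysis expects are assumptions.
if __name__ == '__main__':
    sentimentDataInfo = {
        pc.SPARK: sparkTest,                      # popped again before persisting
        pc.MODELSHEETNAME: 'demoSentimentModel',  # hypothetical model name
        pc.STORAGELOCATION: '/tmp/models/',       # hypothetical storage path
        # ...plus whatever dataset/config keys the parent class requires
    }
    SADecisionTreeClassifier().sentimentData(sentimentDataInfo)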