def fit_kmeans(spark, products_df):
    step = 0

    step += 1
    tokenizer = Tokenizer(inputCol="title", outputCol=str(step) + "_tokenizer")

    step += 1
    stopwords = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol=str(step) + "_stopwords")

    step += 1
    tf = HashingTF(inputCol=stopwords.getOutputCol(), outputCol=str(step) + "_tf", numFeatures=16)

    step += 1
    idf = IDF(inputCol=tf.getOutputCol(), outputCol=str(step) + "_idf")

    step += 1
    normalizer = Normalizer(inputCol=idf.getOutputCol(), outputCol=str(step) + "_normalizer")

    step += 1
    kmeans = KMeans(featuresCol=normalizer.getOutputCol(), predictionCol=str(step) + "_kmeans", k=2, seed=20)

    kmeans_pipeline = Pipeline(stages=[tokenizer, stopwords, tf, idf, normalizer, kmeans])

    model = kmeans_pipeline.fit(products_df)
    words_prediction = model.transform(products_df)
    model.save("./kmeans")  # the whole machine learning instance is saved in a folder
    return model, words_prediction
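
# --- Hedged usage sketch (not in the original listing). It assumes the
# pyspark.ml imports used by fit_kmeans (Tokenizer, StopWordsRemover, HashingTF,
# IDF, Normalizer, KMeans, Pipeline) are in scope; the toy titles below are
# illustrative. The prediction column is "6_kmeans" because of the step counter.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("fit_kmeans_demo").getOrCreate()
toy_products = spark.createDataFrame(
    [("apple iphone 6 16gb t mobile",),
     ("samsung galaxy s7 32gb",),
     ("nike running shoes size 10",),
     ("adidas soccer ball",)],
    ["title"])
demo_model, demo_clusters = fit_kmeans(spark, toy_products)
demo_clusters.select("title", "6_kmeans").show(truncate=False)
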
def train_lg(training_data, collection):
    # Configure an ML pipeline consisting of three stages: hashingTF, idf, and lr.
    # Keeping the unfitted stages in one pipeline lets the parameter grid over
    # hashingTF.numFeatures actually take effect during cross-validation.
    hashingTF = HashingTF(inputCol="filtered", outputCol="TF_features")
    idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
    pipeline = Pipeline(stages=[hashingTF, idf, lr])

    paramGrid = ParamGridBuilder() \
        .addGrid(hashingTF.numFeatures, [10, 100, 1000, 10000]) \
        .addGrid(lr.regParam, [0.1, 0.01]) \
        .build()

    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=BinaryClassificationEvaluator(),
                              numFolds=5)

    # Run cross-validation and choose the best set of parameters.
    cvModel = crossval.fit(training_data)

#     model_path = os.path.join(models_dir , time.strftime("%Y%m%d-%H%M%S") + '_'
#                             + collection["Id"] + '_'
#                             + collection["name"])
#     cvModel.save(sc, model_path)
    return cvModel
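
# --- Hedged usage sketch (not in the original listing). It assumes a DataFrame
# `training_data` with an array<string> column "filtered" and a binary "label"
# column, plus ParamGridBuilder, CrossValidator and BinaryClassificationEvaluator
# imported from pyspark.ml.tuning / pyspark.ml.evaluation.
# cv_model = train_lg(training_data, collection={"Id": "demo", "name": "demo"})
# print(cv_model.avgMetrics)           # mean AUC for each parameter combination
# best_pipeline = cv_model.bestModel   # PipelineModel refit with the best params
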
def tf_idf_feature(wordsData):
    hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    for features_label in rescaledData.select("features", "id").take(3):
        print(features_label)
def textPredict(request):
    """6.文本聚类,热度预测"""
    label = request.POST['label']
    title = request.POST['title']

    conf = SparkConf().setAppName('textPredict').setMaster('spark://HP-Pavilion:7077')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    """处理数据集,生成特征向量"""
    dfTitles = sqlContext.read.parquet('data/roll_news_sina_com_cn.parquet')
    print(dfTitles.dtypes)
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(dfTitles)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    rescaledData.show()
    for features_label in rescaledData.select("features", "rawFeatures").take(3):
        print(features_label)
    """决策树模型培训"""
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(rescaledData)
    featureIndexer =\
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(rescaledData)
    (trainingData, testData) = rescaledData.randomSplit([0.7, 0.3])
    dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
    model = pipeline.fit(trainingData)
    """模型测试"""
    predictions = model.transform(testData)
    predictions.show()
    predictions.select("prediction", "indexedLabel", "features").show(5)
    """用户数据测试,单个新闻测试"""
    sentenceData = sqlContext.createDataFrame([
        (label,title),
    ],['label',"title"])
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    rescaledData = idfModel.transform(featurizedData)
    myprediction = model.transform(rescaledData)
    print("==================================================")
    myprediction.show()
    resultList = convertDfToList(myprediction)

    """模型评估"""
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g " % (1.0 - accuracy))

    treeModel = model.stages[2]
    print(treeModel)

    sc.stop()
    return render(request,{'resultList':resultList})
Example #5
def extract_tf_features(p_df, input_col, output_col):
    """
    Extracts TF features.
    :param p_df: A DataFrame.
    :param input_col: Name of the input column.
    :param output_col: Name of the output column.
    :return: A DataFrame.    
    """
    hashingTF = HashingTF(inputCol=input_col, outputCol=output_col, numFeatures=3000)
    return hashingTF.transform(p_df)
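
# --- Hedged usage sketch (not in the original listing). HashingTF hashes an
# array of tokens, so the toy DataFrame is built from token lists directly; the
# HashingTF import used inside extract_tf_features is assumed to be in scope.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
toy_df = spark.createDataFrame(
    [(0, ["spark", "hashing", "tf"]), (1, ["tf", "idf", "features"])],
    ["id", "tokens"])
extract_tf_features(toy_df, "tokens", "tf_features").show(truncate=False)
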
Example #6
def term_frequency(df, column):
    """
    Compute term-frequency of a token contained in a column.
    Transformation: array<string> --> vector
    """ 
    tf = HashingTF(inputCol=column, outputCol='_'+column)
    df = tf.transform(df)
    
    df = replace(df, column, '_'+column)
    return df
Example #7
def run_tf_idf_spark_ml(df, numFeatures=1 << 20):
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    wordsData = tokenizer.transform(df)

    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numFeatures)
    featurizedData = hashingTF.transform(wordsData)

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)

    return idfModel.transform(featurizedData)
def tfidf(dataframe, in_col1, out_col1, in_col2, out_col2, n):

    global idfModel
    
    hashingTF = HashingTF(inputCol=in_col1, outputCol=out_col1, numFeatures=n)
    featurizedData = hashingTF.transform(dataframe)
    idf = IDF(inputCol=in_col2, outputCol=out_col2)
    idfModel = idf.fit(featurizedData)
    dataframe = idfModel.transform(featurizedData)
    
    return dataframe
Example #9
    def test_apply_binary_term_freqs(self):

        df = self.spark.createDataFrame([(0, ["a", "a", "b", "c", "c", "c"])], ["id", "words"])
        n = 10
        hashingTF = HashingTF()
        hashingTF.setInputCol("words").setOutputCol("features").setNumFeatures(n).setBinary(True)
        output = hashingTF.transform(df)
        features = output.select("features").first().features.toArray()
        expected = Vectors.dense([1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]).toArray()
        for i in range(0, n):
            self.assertAlmostEqual(features[i], expected[i], 14, "Error at " + str(i) +
                                   ": expected " + str(expected[i]) + ", got " + str(features[i]))
def predictLabel(label,title,model):
    """预测新闻的标签"""
    sentenceData = sqlContext.createDataFrame([
        (label,title),
    ],['label',"title"])
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    rescaledData = idfModel.transform(featurizedData)
    myprediction = model.transform(rescaledData)
    return myprediction
def create_features(raw_data):
    #Create DataFrame
    data_df = sqlContext.createDataFrame(raw_data.map(lambda r : Row(appid=r[0], price=r[1], sentence=r[2])))
    #Transform sentence into words
    tokenizer = Tokenizer(inputCol='sentence', outputCol='words')
    words_df = tokenizer.transform(data_df)
    #Calculate term frequency
    hashingTF = HashingTF(inputCol='words', outputCol='rawFeatures', numFeatures=5)
    featurized_df = hashingTF.transform(words_df)
    #Calculate inverse document frequency
    idf = IDF(inputCol='rawFeatures', outputCol='features')
    idfModel = idf.fit(featurized_df)
    return idfModel.transform(featurized_df)
def tf_feature_vectorizer(df,no_of_features,ip_col):
    #from pyspark.sql.functions import udf
    #from pyspark.sql.types import *
    output_raw_col = ip_col+"raw_features"
    output_col = ip_col+"features"
    hashingTF = HashingTF(inputCol=ip_col, outputCol=output_raw_col, numFeatures=no_of_features)
    featurizedData = hashingTF.transform(df)
    idf = IDF(inputCol=output_raw_col, outputCol=output_col)
    idfModel = idf.fit(featurizedData)
    rescaled_data = idfModel.transform(featurizedData)
    rescaled_data.show(5)
    print(rescaled_data.count())
    return rescaled_data
Example #13
def makeTFIDF(sc, spark, reviews):
    # count vectorizer and tfidf
    # cv = CountVectorizer(inputCol='words_clean', outputCol='tf')
    # cvModel = cv.fit(reviews)
    # reviews = cvModel.transform(reviews)

    # HashingTF for fewer dimensions:
    hashingtf = HashingTF(inputCol='words_clean', outputCol='tf', numFeatures=1000)
    reviews = hashingtf.transform(reviews)

    # create TF-IDF matrix
    idf = IDF().setInputCol('tf').setOutputCol('tfidf')
    tfidfModel = idf.fit(reviews)
    reviews = tfidfModel.transform(reviews)
    def _build_stages(self):
        self.bs_parser = BeautifulSoupParser(inputCol="review", outputCol="parsed")
        self.tokenizer = Tokenizer(inputCol=self.bs_parser.getOutputCol(), outputCol="words")
        self.hashing_tf = HashingTF(inputCol=self.tokenizer.getOutputCol(), outputCol="raw_features")
        self.idf_model = IDF(inputCol=self.hashing_tf.getOutputCol(), outputCol="features")
        self.lr = LogisticRegression(maxIter=10, regParam=0.01)
        return [self.bs_parser, self.tokenizer, self.hashing_tf, self.idf_model, self.lr]
Example #15
    def append_tf_idf(self, df):
        """
        Calculate term frequency and inverse document frequency, based in this
        case on tokens that appear at least once per hour. Compares how often a
        token appears per hour relative to other tokens. Not used for the main
        purpose of the project.
        Args:
            :param df: Dataframe parameter.
        Returns:
            :return: Dataframe with term frequency and inverse document frequency
                     added in the columns 'rawFeatures' and 'features' respectively.
        """
        # Create TF column.
        hashingTF = HashingTF(inputCol="tokens", outputCol="rawFeatures", numFeatures=100000)
        tf = hashingTF.transform(df)
        tf.persist(StorageLevel.MEMORY_AND_DISK)
        # Create IDF column.
        idf = IDF(inputCol="rawFeatures", outputCol="features")
        idfModel = idf.fit(tf)
        tfidf = idfModel.transform(tf)
        return tfidf
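# Note added for clarity (not in the original): `tf` is persisted above because
# IDF.fit makes one pass over the hashed features to compute document
# frequencies and idfModel.transform then reads them a second time; caching the
# TF output avoids re-running the HashingTF stage for that second pass.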
def create_pipeline(model_type, num_features=10000):
    """
    Defines pipeline from BOW to prediction.
    """

    remover = StopWordsRemover(inputCol="bow", outputCol="words")
    hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="word_counts", numFeatures=num_features)
    tfidf = IDF(inputCol=hashingTF.getOutputCol(),
                outputCol="features")

    if model_type == 'log_reg':
        model = LogisticRegression()
    elif model_type == 'gbt':
        model = GBTClassifier()
    elif model_type == 'naive_bayes':
        model = NaiveBayes()
    elif model_type == 'rf':
        model = RandomForestClassifier()
    else:
        raise ValueError("Unknown model_type: " + str(model_type))

    return Pipeline(stages=[remover, hashingTF, tfidf, model])
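
# --- Hedged usage sketch (not in the original listing). It assumes a labeled
# DataFrame with an array<string> column "bow" and a numeric "label" column,
# and that the classifier classes used inside create_pipeline are imported.
# pipeline = create_pipeline("log_reg", num_features=1 << 12)
# model = pipeline.fit(train_df)                      # train_df is assumed
# model.transform(test_df).select("label", "prediction").show(5)
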
Example #17
def kmeansresults():
    df1 = sqlContext.read.format("csv").option("header", "true").option("mode", "DROPMALFORMED").load \
        ("canadatweets.csv")
    df2 = sqlContext.read.format("csv").option("header", "true").option(
        "mode", "DROPMALFORMED").load("products.csv")
    df3 = sqlContext.read.format("csv").option("header", "true").option(
        "mode", "DROPMALFORMED").load("products.csv")
    df4 = sqlContext.read.format("csv").option("header", "true").option(
        "mode", "DROPMALFORMED").load("claritin.csv")
    df = df1.unionAll(df2)
    df = df.unionAll(df3)
    df = df.unionAll(df4)
    df.show()
    # df2.show()

    tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
    remover = StopWordsRemover(inputCol="tokens",
                               outputCol="stopWordsRemovedTokens")
    hashingTF = HashingTF(inputCol="stopWordsRemovedTokens",
                          outputCol="rawFeatures",
                          numFeatures=2**20)
    idf = IDF(inputCol="rawFeatures", outputCol="features")

    kmeans = KMeans(k=8,
                    seed=1,
                    featuresCol='rawFeatures',
                    maxIter=10,
                    initMode='random')
    pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, kmeans])
    pipeline.save("KMeansPipeline")
    model = pipeline.fit(df)
    results = model.transform(df)
    results.cache()
    results.groupBy("prediction").count().show(
    )  # Note "display" is for Databricks; use show() for OSS Apache Spark
    # results.filter(results.prediction == 1).show(200,False)
    results.show()
    results.toPandas().to_csv(
        'kmeansresultsCanadaAndProductsAndDisastersAndClaritin.csv')
    model.stages[-1].save("KMeansModel")
def test_get_params_to_log(spark_session):  # pylint: disable=unused-argument
    lor = LogisticRegression(maxIter=3, standardization=False)
    lor_params = get_params_to_log(lor)
    assert (
        lor_params["maxIter"] == 3
        and not lor_params["standardization"]
        and lor_params["family"] == lor.getOrDefault(lor.family)
    )

    ova = OneVsRest(classifier=lor, labelCol="abcd")
    ova_params = get_params_to_log(ova)
    assert (
        ova_params["classifier"] == "LogisticRegression"
        and ova_params["labelCol"] == "abcd"
        and ova_params["LogisticRegression.maxIter"] == 3
        and ova_params["LogisticRegression.family"] == lor.getOrDefault(lor.family)
    )

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    pipeline = Pipeline(stages=[tokenizer, hashingTF, ova])
    inner_pipeline = Pipeline(stages=[hashingTF, ova])
    nested_pipeline = Pipeline(stages=[tokenizer, inner_pipeline])

    pipeline_params = get_params_to_log(pipeline)
    nested_pipeline_params = get_params_to_log(nested_pipeline)

    assert pipeline_params["stages"] == ["Tokenizer", "HashingTF", "OneVsRest"]
    assert nested_pipeline_params["stages"] == ["Tokenizer", "Pipeline_2"]
    assert nested_pipeline_params["Pipeline_2.stages"] == ["HashingTF", "OneVsRest"]
    assert nested_pipeline_params["OneVsRest.classifier"] == "LogisticRegression"

    for params_to_test in [pipeline_params, nested_pipeline_params]:
        assert (
            params_to_test["Tokenizer.inputCol"] == "text"
            and params_to_test["Tokenizer.outputCol"] == "words"
        )
        assert params_to_test["HashingTF.outputCol"] == "features"
        assert params_to_test["OneVsRest.classifier"] == "LogisticRegression"
        assert params_to_test["LogisticRegression.maxIter"] == 3
Example #19
def compute_clusters(addons_df, num_clusters, random_seed):
    """ Performs user clustering by using add-on ids as features.
    """

    # Build the stages of the pipeline. We need hashing to make the next
    # steps work.
    hashing_stage = HashingTF(inputCol="addon_ids", outputCol="hashed_features")
    idf_stage = IDF(inputCol="hashed_features", outputCol="features", minDocFreq=1)
    # As a future improvement, we may add a sane value for the minimum cluster size
    # to BisectingKMeans (e.g. minDivisibleClusterSize). For now, just make sure
    # to pass along the random seed if needed for tests.
    kmeans_kwargs = {"seed": random_seed} if random_seed else {}
    bkmeans_stage = BisectingKMeans(k=num_clusters, **kmeans_kwargs)
    pipeline = Pipeline(stages=[hashing_stage, idf_stage, bkmeans_stage])

    # Run the pipeline and compute the results.
    model = pipeline.fit(addons_df)
    return (
        model
        .transform(addons_df)
        .select(["client_id", "prediction"])
    )
Example #20
def RF_Model(train_dataframe, test_dataframe):
    '''
    Takes a train_dataframe and a test_dataframe, implements the pipeline of
    RegexTokenizer, NGram (n=3), HashingTF, IDF and a Random Forest classifier,
    and predicts the label from the features of test_dataframe.

    The RegexTokenizer pattern is set to "\\W|\\b(00|CC)\\b" because it removes
    all non-word characters (extra spaces, punctuation such as '??') and drops
    '00' and 'CC', which are the most frequent tokens; removing them improves
    accuracy significantly.
    Args:
        dataframe:
            -The train_dataframe should consist of the columns 'label'
            and 'text'.
            -The test_dataframe should consist of the column 'text'.
    Returns:
        DataFrame['prediction': double, given_order: bigint, label: string]
        if the data read initially is a small dataset,
        else DataFrame['prediction': double, given_order: bigint]
        if the data read initially is a big dataset.
    '''
    train_dataframe = train_dataframe.repartition(96)\
        .withColumn('label', train_dataframe['label'].cast(IntegerType()))
    regexTokenizer = RegexTokenizer(inputCol="text",
                                    outputCol="words",
                                    pattern="\\W|\\b(00|CC)\\b")
    ngram = NGram(n=3, inputCol="words", outputCol="ngrams")
    hashingTF = HashingTF(inputCol="ngrams", outputCol="TF")
    idf = IDF(inputCol="TF", outputCol="features")
    rf = RandomForestClassifier(labelCol="label",
                                featuresCol="features",
                                numTrees=30)
    pipeline = Pipeline(stages=[regexTokenizer, ngram, hashingTF, idf, rf])
    model = pipeline.fit(train_dataframe)
    predictions_df = model.transform(test_dataframe)
    return predictions_df\
        .drop('rawPrediction', 'probability', 'ngrams', 'TF', 'text', 'words', 'features')
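
# --- Hedged usage sketch (not in the original listing). Both DataFrames are
# assumed to exist: train_df with string columns "text" and "label", test_df
# with at least a "text" column; the pyspark.ml feature/classifier imports used
# inside RF_Model are assumed to be in scope.
# predictions = RF_Model(train_df, test_df)
# predictions.select("prediction").show(5)
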
class BaselinePipelineEngine(PipelineEngine):
    @keyword_only
    def __init__(self, cv):
        super(BaselinePipelineEngine, self).__init__(cv)
        self.hashing_tf_map = [pow(2, 20)]
        self.lr_map = [0.1, 0.01]
        self.stages = self._build_stages()
        self.pipeline = Pipeline(stages=[self.bs_parser, self.tokenizer, self.hashing_tf, self.idf_model, self.lr])
        self.param_grid = self._build_param_grid()

    def _build_stages(self):
        self.bs_parser = BeautifulSoupParser(inputCol="review", outputCol="parsed")
        self.tokenizer = Tokenizer(inputCol=self.bs_parser.getOutputCol(), outputCol="words")
        self.hashing_tf = HashingTF(inputCol=self.tokenizer.getOutputCol(), outputCol="raw_features")
        self.idf_model = IDF(inputCol=self.hashing_tf.getOutputCol(), outputCol="features")
        self.lr = LogisticRegression(maxIter=10, regParam=0.01)
        return [self.bs_parser, self.tokenizer, self.hashing_tf, self.idf_model, self.lr]

    def _build_param_grid(self):
        param_grid_builder = ParamGridBuilder()
        param_grid_builder.addGrid(self.hashing_tf.numFeatures, self.hashing_tf_map)
        param_grid_builder.addGrid(self.lr.regParam, self.lr_map)
        return param_grid_builder.build()
    def __init__(self, sc, sql_context, dataset_path):
        """Init the recommendation engine given a Spark context and a dataset path
        """

        logger.info("Starting up the Sentiment Analyser Engine:")

        self.sc = sc
        self.sql_context = sql_context
        # Load sentiment data for later use
        logger.info("Loading Sentiment data...")
        sentiment_file_path = os.path.join(dataset_path,
                                           'arabic_tweets_labeled.csv')
        sentiment_RDD = self.sql_context.read.format(
            'com.databricks.spark.csv').options(
                header=True, inferSchema='true').load(sentiment_file_path)
        sentiment_RDD = sentiment_RDD.dropna()

        tokenizer = Tokenizer(inputCol="tweet", outputCol="words")
        hashtf = HashingTF(numFeatures=2**16, inputCol="words", outputCol='tf')
        idf = IDF(inputCol='tf', outputCol="features",
                  minDocFreq=5)  # minDocFreq: remove sparse terms
        label_stringIdx = StringIndexer(inputCol="target", outputCol="label")
        pipeline = Pipeline(stages=[tokenizer, hashtf, idf, label_stringIdx])
        sentiment_RDD.show()
        pipelineFit = pipeline.fit(sentiment_RDD)
        data = pipelineFit.transform(sentiment_RDD)
        self.pipelineFit = pipelineFit
        self.data = data
        (train_set, test_set) = data.randomSplit([0.8, 0.2], seed=2000)

        self.train_set = train_set
        self.test_set = test_set

        # Train the model
        self.seed = 1245
        self.iterations = 100
        self.__train_model()
def test_get_instance_param_map(spark_session):  # pylint: disable=unused-argument
    lor = LogisticRegression(maxIter=3, standardization=False)
    lor_params = _get_instance_param_map(lor)
    assert (lor_params["maxIter"] == 3 and not lor_params["standardization"]
            and lor_params["family"] == lor.getOrDefault(lor.family))

    ova = OneVsRest(classifier=lor, labelCol="abcd")
    ova_params = _get_instance_param_map(ova)
    assert (ova_params["classifier"] == lor.uid
            and ova_params["labelCol"] == "abcd"
            and ova_params[f"{lor.uid}.maxIter"] == 3 and
            ova_params[f"{lor.uid}.family"] == lor.getOrDefault(lor.family))

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                          outputCol="features")
    pipeline = Pipeline(stages=[tokenizer, hashingTF, ova])
    inner_pipeline = Pipeline(stages=[hashingTF, ova])
    nested_pipeline = Pipeline(stages=[tokenizer, inner_pipeline])

    pipeline_params = _get_instance_param_map(pipeline)
    nested_pipeline_params = _get_instance_param_map(nested_pipeline)

    assert pipeline_params["stages"] == [tokenizer.uid, hashingTF.uid, ova.uid]
    assert nested_pipeline_params["stages"] == [
        tokenizer.uid,
        {
            inner_pipeline.uid: [hashingTF.uid, ova.uid]
        },
    ]

    for params_to_test in [pipeline_params, nested_pipeline_params]:
        assert (params_to_test[f"{tokenizer.uid}.inputCol"] == "text"
                and params_to_test[f"{tokenizer.uid}.outputCol"] == "words")
        assert params_to_test[f"{hashingTF.uid}.outputCol"] == "features"
        assert params_to_test[f"{ova.uid}.classifier"] == lor.uid
        assert params_to_test[f"{lor.uid}.maxIter"] == 3
Example #24
def ldaresults():
    df1 = sqlContext.read.format("csv").option("header", "true").option("mode", "DROPMALFORMED").load \
        ("canadatweets.csv")
    df2 = sqlContext.read.format("csv").option("header", "true").option(
        "mode", "DROPMALFORMED").load("products.csv")
    df3 = sqlContext.read.format("csv").option("header", "true").option(
        "mode", "DROPMALFORMED").load("products.csv")
    df4 = sqlContext.read.format("csv").option("header", "true").option(
        "mode", "DROPMALFORMED").load("claritin.csv")
    df = df1.unionAll(df2)
    df = df.unionAll(df3)
    df = df.unionAll(df4)
    df.show()
    # df2.show()

    tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
    remover = StopWordsRemover(inputCol="tokens",
                               outputCol="stopWordsRemovedTokens")
    hashingTF = HashingTF(inputCol="stopWordsRemovedTokens",
                          outputCol="rawFeatures",
                          numFeatures=2**18)
    idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)

    lda = LDA(k=8, seed=1, optimizer="em", featuresCol='features')

    pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, lda])
    model = pipeline.fit(df)
    topics = model.stages[-1].describeTopics()

    # topics.show(truncate=False)
    transformed = model.transform(df)

    # transformed.sort('topicDistribution').show(20000,truncate=False)
    # transformed.toPandas().to_csv('ldaresultsCanadaAndProductsAndDisastersAndClaritin.csv')
    transformed.rdd \
        .map(lambda row: (row['text'], row['features'], row['topicDistribution'],
                          int(np.argmax(np.asarray([float(x) for x in row['topicDistribution']]))))) \
        .toDF() \
        .toPandas().to_csv('ldaresultsCanadaAndProductsAndDisastersAndClaritin.csv')
Example #25
def kmeans_from_csv2(file, outfile, k=8):
    df = sqlContext.read.format("csv").option("header", "true").option("mode", "DROPMALFORMED").load \
        (file)
    df.show()
    # df2.show()

    tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
    remover = StopWordsRemover(inputCol="tokens",
                               outputCol="stopWordsRemovedTokens")
    hashingTF = HashingTF(inputCol="stopWordsRemovedTokens",
                          outputCol="rawFeatures",
                          numFeatures=2**20)
    idf = IDF(inputCol="rawFeatures", outputCol="features")

    pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf])
    model = pipeline.fit(df)
    results = model.transform(df)
    results.cache()
    #results.groupBy("prediction").count().show()  # Note "display" is for Databricks; use show() for OSS Apache Spark
    # results.filter(results.prediction == 1).show(200,False)
    results.show()
    #results.toPandas().to_csv(outfile)
    # Trains a k-means model.
    xaxis = []
    yaxis = []
    for k in range(2, 11):
        xaxis.append(k)
        kmeans = KMeans().setK(k).setSeed(1)
        model = kmeans.fit(results)

        # Evaluate clustering by computing Within Set Sum of Squared Errors.
        wssse = model.computeCost(results)
        yaxis.append(wssse)
        print("Within   Sum of Squared Errors  for k= " + str(k) + "is " +
              str(wssse))
    plt.plot(xaxis, yaxis)
    plt.show()
    def train_validate(self, df):
        # Split the data into training and test sets (30% held out for testing)
        (training, test) = df.randomSplit([0.7, 0.3])

        # Configure an ML pipeline consisting of the stages: tokenizer, stop-words remover, hashingTF, and a classifier.
        tokenizer = Tokenizer(inputCol="text", outputCol="words")
        remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                   outputCol="filtered")
        hashingTF = HashingTF(numFeatures=10000,
                              inputCol=remover.getOutputCol(),
                              outputCol="features")

        ####################
        # lr = LogisticRegression(maxIter=10, regParam=0.001)
        # pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, lr])
        ####################

        # instantiate the base classifier.
        lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)
        # instantiate the One Vs Rest Classifier.
        ovr = OneVsRest(classifier=lr)
        pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, ovr])
        #####################

        # Fit the pipeline to training documents.
        model = pipeline.fit(training)

        # Make predictions on test documents and print columns of interest.
        prediction = model.transform(test)

        # obtain evaluator.
        evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

        # compute the classification error on test data.
        accuracy = evaluator.evaluate(prediction)
        print("Test Error : " + str(1 - accuracy))
        return model
def preprocess(spark_session, data_file):
    raw_data = spark_session.read.format('json').load(data_file)

    regexTokenizer = RegexTokenizer(inputCol='text',
                                    outputCol='words',
                                    pattern='\\w+',
                                    gaps=False,
                                    toLowercase=True)

    stopWordsRemover = StopWordsRemover(inputCol='words',
                                        outputCol='filtered_words')

    hashingTF = HashingTF(inputCol='filtered_words',
                          outputCol='tf_features',
                          numFeatures=20)

    idf = IDF(inputCol='tf_features', outputCol='features')

    pipeline = Pipeline(
        stages=[regexTokenizer, stopWordsRemover, hashingTF, idf])
    pipeline_model = pipeline.fit(raw_data)
    data = pipeline_model.transform(raw_data)

    return data
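
# --- Hedged usage sketch (not in the original listing); the JSON path is
# illustrative only and each record is assumed to contain a "text" field.
# from pyspark.sql import SparkSession
# spark = SparkSession.builder.getOrCreate()
# features_df = preprocess(spark, "documents.json")
# features_df.select("filtered_words", "features").show(3, truncate=False)
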
def build_model_pipeline():
    """
    TF (term frequency): number of times the word occurs in a sepcific document
    DF (document frequency): number of times a word coccurs in collection of documents
    TF-IDF (TF - inverse DF): measures the significace of a word in a document
    """

    # 1. tokenize words, convert word to lowercase
    tokenizer = RegexTokenizer(inputCol='review',
                               outputCol='review_tokens_uf',
                               pattern='\\s+|[(),.!?\";]',
                               toLowercase=True)

    # 2. remove stopwords
    stopwords_remover = StopWordsRemover(
        stopWords=StopWordsRemover.loadDefaultStopWords('english'),
        inputCol='review_tokens_uf',
        outputCol='review_tokens')

    # 3. TF
    # cv = CountVectorizer(
    #     inputCol='review_tokens',
    #     outputCol='tf',
    #     vocabSize=200000
    # )
    cv = HashingTF(inputCol='review_tokens', outputCol='tf')

    # 4. IDF
    idf = IDF(inputCol='tf', outputCol='features')

    # 5. NB
    nb = NaiveBayes()

    pipeline = Pipeline(stages=[tokenizer, stopwords_remover, cv, idf, nb])

    return pipeline
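
# --- Added note and hedged usage sketch (not in the original listing). Spark's
# IDF uses the smoothed formula idf(t) = log((N + 1) / (df(t) + 1)): with N = 3
# reviews and a token that appears in only one of them the weight is
# log(4 / 2) ~= 0.693, while a token present in every review gets log(4 / 4) = 0.
# The pipeline is fit on a DataFrame with a string "review" column and a numeric
# "label" column (assumed to exist here as reviews_df):
# sentiment_model = build_model_pipeline().fit(reviews_df)
# sentiment_model.transform(reviews_df).select("label", "prediction").show(5)
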
Example #29
def getOrCreateNBModel(sc):

    # Load the pipeline from disk.
    loaded = PipelineModel.load('./nbmodel')

    # Returned the model loaded from the disk if found
#     if loaded:
#         return loaded

    # Else create the model/PipelineModel, save and return it.

    (df, spark) = loadSentiment140(sc, SENTIMENT140_DATA)

#     tokenizer = Tokenizer(inputCol='status', outputCol='barewords')

#     remover = StopWordsRemover(inputCol='barewords', outputCol='filtered')# , stopWords=removeStopWords())
#     print('Remover', remover.transform(df).head())

    tokenizer = TweetSanitizer(inputCol='status', outputCol='filtered')

    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol='features')

    # Defined model parameters
    nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

    # Defined Pipeline
    pipeline = Pipeline(stages=[tokenizer, hashingTF, nb])

    # Train the data
    model = pipeline.fit(df)

    # Save the pipeline, overwrite if already present.
    # This won't work with PySpark's custom transformer
#     model.write().overwrite().save('./nbmodel')

    return model
Example #30
def postreview():
    form = PostReviewForm()
    if form.validate_on_submit():
        if form.review.data is not None:
            Text = [form.review.data]
            df = pd.DataFrame({'Text': Text})
            df2 = sqlContext.createDataFrame(df)
            df2 = df2.dropna()
            text = "Text"
            target = "useful"
            tokenizer = Tokenizer(inputCol=text, outputCol="words")
            hashtf = HashingTF(numFeatures=2**16,
                               inputCol="words",
                               outputCol='tf')
            idf = IDF(inputCol='tf', outputCol="features")
            label_stringIdx = StringIndexer(inputCol=target, outputCol="label")
            pipeline = Pipeline(
                stages=[tokenizer, hashtf, idf, label_stringIdx])
            transform_df = pipelineFit.transform(df2)
            predictions = lrModel_final.transform(transform_df)
            prediction_value = predictions.withColumn(
                "value", predictions["prediction"].cast(IntegerType()))
            output = prediction_value.select('value').take(1)[0][0]

            if output == 1:
                flash(
                    f'Thanks for posting your review, this was really useful!',
                    'success')
                return redirect(url_for('postreview', _anchor='review_form'))
            else:
                flash(
                    'Thanks for posting, really appreciate if you can share more details..',
                    'danger')
                return redirect(url_for('postreview', _anchor='review_form'))

    return render_template('index.html', form=form)
def main():

    processed_path = sys.argv[1]
    output_file = sys.argv[2]

    comm = spark.read.csv(processed_path,
                          schema=comments_schema).repartition(2000).cache()

    tokenizer = Tokenizer(inputCol="comments", outputCol="words")
    # wordsDF = tokenizer.transform(comm)

    hashing = HashingTF(inputCol="words", outputCol="features")
    # count_vect = CountVectorizer(inputCol="words", outputCol="features")
    # cv_model = count_vect.fit(wordsDF)
    # df_features = cv_model.transform(wordsDF)

    # corpus = df_features.select(col('id'), col('features')).cache()

    lda = LDA(k=10, maxIter=10, optimizer='online')

    # lda_model = lda.fit(corpus)
    pipeline = Pipeline(stages=[tokenizer, hashing, lda])
    model = pipeline.fit(comm)

    transformed = model.transform(comm).selectExpr('id', 'topicDistribution')

    topic_text = udf(to_text)
    topics_df = transformed.select(
        transformed['id'],
        topic_text(
            transformed['topicDistribution']).alias('topicDistribution'))
    # topics_df.show(truncate=False)

    topics_df.write.option('sep', ',').save(output_file,
                                            format='csv',
                                            mode='overwrite')
    def sample_tf_idf(self, mergeRDD, nl_idfModel, ece_idfModel):
        dataDF = mergeRDD.map(
            lambda p: Row(**{
                'lable': p[0],
                'edu_city_exp': p[1],
                'leibie and name': p[2]
            })).toDF()
        nl_hashingTF = HashingTF(inputCol='leibie and name',
                                 outputCol='nlFeatures',
                                 numFeatures=256)
        featuresData = nl_hashingTF.transform(dataDF)
        ece_hashingTF = HashingTF(inputCol='edu_city_exp',
                                  outputCol='eceFeatures',
                                  numFeatures=64)
        featuresData = ece_hashingTF.transform(featuresData)
        rescaled = nl_idfModel.transform(featuresData)
        rescaled = ece_idfModel.transform(rescaled)
        featuresRDD = rescaled.rdd.map(
            lambda i: (i.lable, i.nlfeatures.toArray().tolist() + i.ecefeatures.toArray().tolist()))
        return featuresRDD
Example #33
    def tf_idf(self, dataRDD):
        dataDF = dataRDD.map(lambda i: Row(
            **{
                'name_and_desp': desp_text_division(i.name + ',' + i.work_desp
                                                    ),
                'salary': i.mon_wa,
                'education': [i.education],
                'city': [i.work_area],
                'work_lable': [i.work_lable],
                'work_exp': [i.work_exp]
            })).map(lambda i: Row(
                **{
                    'name_and_desp': i.name_and_desp,
                    'salary': i.salary,
                    'agg': i.education + i.city + i.work_lable + i.work_exp
                })).toDF()
        dataDF.show()
        nd_hashingTF = HashingTF(inputCol='name_and_desp',
                                 outputCol='ndFeatures',
                                 numFeatures=10240)
        f_hashingTF = HashingTF(inputCol='agg',
                                outputCol='Features_agg',
                                numFeatures=256)
        tfdata = nd_hashingTF.transform(dataDF)

        tfdata = f_hashingTF.transform(tfdata)
        nd_idf = IDF(inputCol='ndFeatures', outputCol='ndfeatures')
        f_idf = IDF(inputCol='Features_agg', outputCol='features_agg')
        nd_idf_model = nd_idf.fit(tfdata)
        f_idf_model = f_idf.fit(tfdata)
        nd_idf_model.save('hdfs://localhost:9000/nd_idf')
        f_idf_model.save('hdfs://localhost:9000/agg_idf')
        tf_idfdata = nd_idf_model.transform(tfdata)
        tf_idfdata = f_idf_model.transform(tf_idfdata)
        featuresRDD = tf_idfdata.select('salary', 'ndfeatures',
                                        'features_agg').rdd
        featuresRDD = featuresRDD.map(
            lambda i: (int(i.salary), i.ndfeatures.toArray().tolist() + i.
                       features_agg.toArray().tolist()))
        return featuresRDD
    def tf_idf(self, mergeRDD):
        fields = [
            StructField('lable', IntegerType(), nullable=True),
            StructField('edu_city_exp',
                        ArrayType(elementType=StringType()),
                        nullable=True),
            StructField('leibie_name',
                        ArrayType(elementType=StringType()),
                        nullable=True)
        ]
        schema = StructType(fields)
        rowRDD = mergeRDD.map(lambda p: Row(p[0], p[1], p[2]))
        info_df = self.spark.createDataFrame(schema=schema, data=rowRDD).toDF(
            'lable', 'edu_city_exp', 'leibie and name')
        info_df.show()
        name_df = info_df.select('lable', 'edu_city_exp', 'leibie and name')
        nl_hashingTF = HashingTF(inputCol='leibie and name',
                                 outputCol='nlFeatures',
                                 numFeatures=256)
        featurizeData = nl_hashingTF.transform(name_df)
        ece_hashingTF = HashingTF(inputCol='edu_city_exp',
                                  outputCol='eceFeatures',
                                  numFeatures=64)
        featurizeData = ece_hashingTF.transform(featurizeData)
        nl_idf = IDF(inputCol='nlFeatures', outputCol='nlfeatures')
        ece_idf = IDF(inputCol='eceFeatures', outputCol='ecefeatures')
        nl_idfModel = nl_idf.fit(featurizeData)
        ece_idfModel = ece_idf.fit(featurizeData)
        rescaledData = nl_idfModel.transform(featurizeData)
        rescaledData = ece_idfModel.transform(rescaledData)
        tf_idfmerge = []
        for i in rescaledData.select('lable', 'nlfeatures',
                                     'ecefeatures').collect():
            ele_lst = i.nlfeatures.toArray().tolist() + i.ecefeatures.toArray(
            ).tolist()
            tf_idfmerge.append((int(i.lable), ele_lst))
        print(tf_idfmerge)

        featuresRDD = self.sc.parallelize(tf_idfmerge)
        return featuresRDD
    train = train.na.drop()
    test = test.na.drop()

    for col in train.columns:
        if col in [
                'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
                'identity_hate'
        ]:
            train = train.withColumn(col, train[col].cast(T.FloatType()))

    #Main code
    out_cols = [i for i in train.columns if i not in ["id", "comment_text"]]
    tokenizer = Tokenizer(inputCol="comment_text", outputCol="words")
    wordsData = tokenizer.transform(train)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
    tf = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(tf)
    tfidf = idfModel.transform(tf)
    REG = 0.1
    lr = LogisticRegression(featuresCol="features",
                            labelCol='toxic',
                            regParam=REG)
    lrModel = lr.fit(tfidf.limit(5000))
    res_train = lrModel.transform(tfidf)
    res_train.select("id", "toxic", "probability", "prediction").show(20)
    res_train.show(5)
    extract_prob = F.udf(lambda x: float(x[1]), T.FloatType())
    (res_train.withColumn("proba", extract_prob("probability")).select(
        "proba", "prediction").show())
Example #36
    return tweet.strip()

data = sc.textFile(trainingFile)

header = data.first()
rdd = data.filter(lambda row: row != header)

r = rdd.mapPartitions(lambda x : csv.reader(x))
r2 = r.map(lambda x: (processTweetText(x[3]), int(x[1])))

parts = r2.map(lambda x: Row(sentence=x[0], label=int(x[1])))
partsDF = spark.createDataFrame(parts).orderBy(rand()).limit(maxLines)

tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="base_words")
hashingTF = HashingTF(numFeatures=6000, inputCol="base_words", outputCol="features")
lr = LogisticRegression(maxIter=10, regParam=0.05, elasticNetParam=0.025, family="binomial")

pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, lr])

(trainSet, testSet) = partsDF.randomSplit([trainPercent, testPercent], 1291)

lrModel = pipeline.fit(trainSet)
lrResult = lrModel.transform(testSet)

avg = lrResult.where('label == prediction').count() / (maxLines * testPercent)
print(avg)

#Adjust maxIter to the number of iterations needed to reach convergence (check if it decreases less than pow(10,-3))
#import matplotlib.pyplot as plt
#a = lrModel.stages[-1].summary.objectiveHistory
## Tokenize the messages
tokenizer = RegexTokenizer(inputCol="text",
                           outputCol="words",
                           minTokenLength=3,
                           gaps=False,
                           pattern="[a-zA-Z]+")

## Remove ignored words
stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                    outputCol="filtered",
                                    stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"],
                                    caseSensitive=False)

## Hash the words
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex",
                      numFeatures=1 << 10)

## Create inverse document frequencies model
idf = IDF(inputCol=hashingTF.getOutputCol(),
          outputCol="tf_idf",
          minDocFreq=4)

## Create H2OAutoML model
automl = H2OAutoML(convertUnknownCategoricalLevelsToNa=False,
                   seed=1,
                   maxRuntimeSecs=300, # 5 minutes
                   predictionCol="label")

## Remove all helper columns
colPruner = ColumnPruner(columns=[idf.getOutputCol(), hashingTF.getOutputCol(), stopWordsRemover.getOutputCol(), tokenizer.getOutputCol()])
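## Hedged sketch (not in the original excerpt): these stages are typically
## assembled into a single Spark Pipeline, with the ColumnPruner placed after
## the model stage so the helper columns are dropped from the final output;
## the exact ordering here is an assumption.
## pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, hashingTF, idf, automl, colPruner])
## model = pipeline.fit(data)  # `data` is an assumed training DataFrame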
Example #38
tokenizer = RegexTokenizer(inputCol="text",
                           outputCol="words",
                           minTokenLength=3,
                           gaps=False,
                           pattern="[a-zA-Z]+")

## Remove ignored words
stopWordsRemover = StopWordsRemover(
    inputCol=tokenizer.getOutputCol(),
    outputCol="filtered",
    stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"],
    caseSensitive=False)

## Hash the words
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex",
                      numFeatures=1 << 10)

## Create inverse document frequencies model
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="tf_idf", minDocFreq=4)

## Create H2ODeepLearning model
dl = H2ODeepLearning(epochs=10,
                     l1=0.001,
                     l2=0.0,
                     hidden=[200, 200],
                     featuresCols=[idf.getOutputCol()],
                     predictionCol="label")

## Remove all helper columns
colPruner = ColumnPruner(columns=[idf.getOutputCol(), hashingTF.getOutputCol(), stopWordsRemover.getOutputCol(), tokenizer.getOutputCol()])
Example #39
f = parts.map(lambda p: Row(tindex=int(p[0]),packageName=p[1],packagePermissions=p[2], label= int(float(p[3])),training=1))


linest = sc.textFile("/Users/admin/Desktop/KBSApp/KBSApp/permissionsData/dataSets/SVMDataGroundTruth.txt")
partst = linest.map(lambda l: l.split(","))
ft = partst.map(lambda p: Row(tindex=int(p[0]),packageName=p[1],packagePermissions=p[2],label= int(float(p[3])),training=0))
alldata = f.union(ft)

schemaApp = sqlContext.createDataFrame(alldata)

schemaApp.registerTempTable("data")

tokenizer = Tokenizer(inputCol="packagePermissions", outputCol="perms")
permsData = tokenizer.transform(schemaApp)

hashingTF = HashingTF(inputCol="perms", outputCol="rawFeatures")
featurizedData = hashingTF.transform(permsData)


idf = IDF(inputCol="rawFeatures", outputCol="features")


idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

wordsvectors = rescaledData["label","features"].map(lambda row: LabeledPoint(row[0], row[1]))
model = LogisticRegressionWithLBFGS.train(wordsvectors, iterations=100)

labelsAndPreds = wordsvectors.map(lambda p: (p.label, model.predict(p.features)))

trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(wordsvectors.count())
Example #40
from pyspark.sql import SQLContext
from pyspark.ml.feature import RegexTokenizer, HashingTF
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
from pyspark.mllib.tree import RandomForest

## Load Dataset
df_pandas = pd.read_csv('sample.csv')

## Convert to Spark Dataframe
sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(df_pandas)

## Tokenizer and Hashing 
tokenizer = RegexTokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(numFeatures=10000, inputCol="words", outputCol="features")
df_feat = hashingTF.transform(tokenizer.transform(df))

## Create LabeledPoint and Features for Prediction (predict the 1s observations)
lp = df_feat.map(lambda x: LabeledPoint(x.label, x.features))
predict_feat = df_feat.where(df_feat.label == 1).map(lambda x: x.features)


## Compare predictions from Different Models


## Logistic Regression
lrm = LogisticRegressionWithSGD.train(lp, iterations=10)
logit_predict = lrm.predict(predict_feat)
logit_predict.sum()
#9112
Example #41
	(20,"apple iphone 6 16gb t mobile"),
	(20,"Apple iPhone Apple iPhone 6 16GB 412 2 cell 2895"),
	(20,"iPhone 6 T Mobile 16 GB"),
	(20,"Apple 6 16gb T Mobile")
], ["label","text"])

# Learn a mapping from words to Vectors.
#word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="textVec")
#model = word2Vec.fit(documentDF)
#result = model.transform(documentDF)
#print result.take(2)

tokenizer = Tokenizer(inputCol="text", outputCol="tokenizedText")
tokenizedTextData = tokenizer.transform(documentDF)

hashingTF = HashingTF(inputCol="tokenizedText", outputCol="rawFeatures")
featurizedData = hashingTF.transform(tokenizedTextData)

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
result1 = idfModel.transform(featurizedData)


for features_label in result1.select("label", "features").take(10):
  print(features_label)


wordsvectors = result1["label", "features"].map(lambda row: LabeledPoint(row[0], row[1]))


Example #42
    # 2. English word segmentation with Tokenizer
    tokenizer = Tokenizer(inputCol="MANUFACTURER_NAME_EN_STANDARD",
                          outputCol="MANUFACTURER_NAME_EN_WORDS")
    df_standard = tokenizer.transform(df_standard)

    # 3. Chinese word segmentation with jieba
    df_standard = df_standard.withColumn(
        "MANUFACTURER_NAME_WORDS",
        manifacture_name_pseg_cut(df_standard.MANUFACTURER_NAME_STANDARD))
    df_standard.select("MANUFACTURER_NAME_STANDARD", "MANUFACTURER_NAME_WORDS",
                       "MANUFACTURER_NAME_EN_STANDARD",
                       "MANUFACTURER_NAME_EN_WORDS").show(truncate=False)

    # 4. Build the machine-learning features
    hashingTF_en = HashingTF(inputCol="MANUFACTURER_NAME_EN_WORDS",
                             outputCol="raw_features_mnf_en",
                             numFeatures=1000)
    man_en_idf = IDF(inputCol="raw_features_mnf_en",
                     outputCol="features_mnf_en")

    hashingTF_cn = HashingTF(inputCol="MANUFACTURER_NAME_WORDS",
                             outputCol="raw_features_mnf_cn",
                             numFeatures=1000)
    man_cn_idf = IDF(inputCol="raw_features_mnf_cn",
                     outputCol="features_mnf_cn")

    pipeline = Pipeline(
        stages=[hashingTF_en, man_en_idf, hashingTF_cn, man_cn_idf])
    idf_model = pipeline.fit(df_standard)
    idf_model.write().overwrite().save(
        "s3a://ph-max-auto/2020-08-11/BPBatchDAG/refactor/alfred/idf_model")
Example #43
def main():

    # read data
    yahoo = spark.read.csv(f'{BUILDDIR}/yahoo.csv', header=True)
    data = yahoo.select(['sector', 'description']).dropna()

    # tokenize texts based on regular expression
    tokenize = RegexTokenizer(inputCol='description', outputCol='words_all', pattern='\\W')

    # remove stop words
    stopwords = '\n'.join((DATADIR/'stopwords'/f).read_text().strip() for f in ('mysql.txt', 'nltk.txt')).splitlines()
    remove_stopwords = StopWordsRemover(inputCol='words_all', outputCol='words_clean').setStopWords(stopwords)

    # get words frequency using simple count (bag of words)
    add_wordcount = CountVectorizer(inputCol='words_clean', outputCol='words_count', vocabSize=10000, minDF=5)

    # get tf-idf words frequencies

    add_wordtf = HashingTF(inputCol='words_clean', outputCol='words_tf', numFeatures=10000)
    add_wordidf = IDF(inputCol='words_tf', outputCol='words_tfidf', minDocFreq=5)

    # prepare output values
    index_target = StringIndexer(inputCol='sector', outputCol='label')

    # data preparation pipeline
    pipeline_wordcount = Pipeline(stages=[
        tokenize,
        remove_stopwords,
        add_wordcount,
        add_wordtf,
        add_wordidf,
        index_target,
        ])
    # apply data preparation pipeline
    model_wordcount = pipeline_wordcount.fit(data)
    prepared = model_wordcount.transform(data)

    # split to training and testing
    training, testing = prepared.randomSplit([0.8, 0.2], seed=100500)

    # fit logistic regression models

    logistic_wordcount = LogisticRegression(regParam=0.3, elasticNetParam=0,
        featuresCol='words_count', labelCol='label', predictionCol='prediction', probabilityCol='probability')

    logistic_tfidf = LogisticRegression(regParam=0.3, elasticNetParam=0,
        featuresCol='words_tfidf', labelCol='label', predictionCol='prediction', probabilityCol='probability')

    evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', metricName='accuracy')
    for model, name in (
            (logistic_wordcount, 'Word count + Logistic regression'),
            (logistic_tfidf, 'TF-IDF + Logistic regression')):
        predicted = model.fit(training).transform(testing)
        print(f'{name} model accuracy = {evaluator.evaluate(predicted)}')

    # fit hyperparameters
    grid = (ParamGridBuilder()
        .addGrid(logistic_wordcount.regParam, [0.1, 0.2, 0.3, 0.4])
        .addGrid(logistic_wordcount.elasticNetParam, [0.0, 0.1, 0.2, 0.3])
        .build()
        )
    evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', metricName='accuracy')
    cv = CrossValidator(
        estimator=logistic_wordcount,
        estimatorParamMaps=grid,
        numFolds=5,
        evaluator=evaluator,
        seed=100500,
        )
    if not FINAL_MODEL.exists():
        model_cv = cv.fit(prepared)
        model_cv.save(str(FINAL_MODEL))
    else:
        model_cv = CrossValidatorModel.load(str(FINAL_MODEL))
    breakpoint()
def main(sc, sqlContext):

    #start = timer()

    #print '---Fetching user, posts, tokens and categories from MongoDB---'
    #start_i = timer()
    user = findUserById(iduser)
    posts = findPosts(user) 
    
    tokens, category, categoryAndSubcategory = getTokensAndCategories()
    postsRDD = (sc.parallelize(posts).map(lambda s: (s[0], word_tokenize(s[1].lower()), s[2], s[3]))
                    .map(lambda p: (p[0], [x for x in p[1] if x in tokens] ,p[2], p[3]))
                    .cache())

    

    #print '####took %d seconds' % (timer() - start_i)

    #print '---Fetching products from MongoDB---'
    #start_i = timer()

    #print '####took %d seconds' % (timer() - start_i)
    
    #print '---Creating corpusRDD---'
    #start_i = timer()
    stpwrds = stopwords.words('portuguese')
    corpusRDD = (postsRDD.map(lambda s: (s[0], [PorterStemmer().stem(x) for x in s[1] if x not in stpwrds], s[2], s[3]))
                         .filter(lambda x: len(x[1]) >= 20 or (x[2] == u'Post' and len(x[1])>0))
                         .cache())
    #print '####took %d seconds' % (timer() - start_i)

    #print '---Computing TF-IDF---'
    #start_i = timer()
    wordsData = corpusRDD.map(lambda s: Row(label=int(s[0]), words=s[1], type=s[2]))
    wordsDataDF = sqlContext.createDataFrame(wordsData).unionAll(sqlContext.read.parquet("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet"))


    numTokens = len(tokens)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numTokens)
    idf = IDF(inputCol="rawFeatures", outputCol="features")

    featurizedData = hashingTF.transform(wordsDataDF)

    idfModel = idf.fit(featurizedData)
    tfIDF = idfModel.transform(featurizedData).cache()

    postTFIDF = (tfIDF
                    .filter(tfIDF.type==u'Post')
                    #.map(lambda s: Row(label=s[0], type=s[1], words=s[2], rawFeatures=s[3], features=s[4], sentiment=SVM.predict(s[4])))
                    .cache())

    #postTFIDF = postTFIDF.filter(lambda p: p.sentiment == 1)
    #print '####took %d seconds' % (timer() - start_i)

    #print '---Loading model---'
    #start_i = timer()
    NB = NaiveBayesModel.load(sc, '/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria')
    SVM = SVMModel.load(sc, "/home/ubuntu/recsys-tcc-ml/models/svm")
    #print '####took %d seconds' % (timer() - start_i)

    #print '---Using the model---'
    #start_i = timer()
    predictions = (postTFIDF
                        .map(lambda p: (NB.predict(p.features), p[0], SVM.predict(p.features)))
                        .filter(lambda p: p[2]==1)
                        .map(lambda p: (p[0], p[1]))
                        .groupByKey()
                        .mapValues(list)
                        .collect())

    #print '####took %d seconds' % (timer() - start_i)
    #print '---Computing similarities---'
    #start_i = timer()
    suggestions = []

    for prediction in predictions:
        category_to_use = category[int(prediction[0])]
        #print ' Computing similarities for category: {}'.format(category_to_use)
        tf = tfIDF.filter(tfIDF.type==category_to_use).cache()
        for post in prediction[1]:
            postVector = postTFIDF.filter(postTFIDF.label == post).map(lambda x: x.features).collect()[0]
            sim = (tf
                    .map(lambda x: (post, x.label, cossine(x.features, postVector)))
                    .filter(lambda x: x[2]>=threshold)
                    .collect())
            if len(sim) > 0:
                suggestions.append(sim)

    #print '####took %d seconds' % (timer() - start_i)

    if len(suggestions) > 0:
        #print '---Inserting recommendations into MongoDB---'
        #start_i = timer()
        insertSuggestions(suggestions, iduser, posts)
Example #45
df = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('clean_tweet.csv')
##

(train_set, val_set, test_set) = df.randomSplit([0.98, 0.01, 0.01], seed = 2000)

##
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, CountVectorizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

##
# Try to fit the model using Pyspark's HashingTF
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashtf = HashingTF(numFeatures=2**16, inputCol="words", outputCol='tf')
idf = IDF(inputCol='tf', outputCol="features", minDocFreq=5) # minDocFreq removes sparse terms
label_stringIdx = StringIndexer(inputCol="target", outputCol="label")
pipeline = Pipeline(stages = [tokenizer, hashtf, idf, label_stringIdx])

pipelineFit = pipeline.fit(train_set)
train_df = pipelineFit.transform(train_set)
val_df = pipelineFit.transform(val_set)
# train_df.show(5)

##
lr = LogisticRegression(maxIter=100)
lrModel = lr.fit(train_df)
predictions = lrModel.transform(val_df)

##
    sm = SparkModel(sc, conn, rdd_path='rdd.pkl')


    bow_rdd = sm.RDD.join(sm.target).map(lambda (key, (bow, label)): (label, bow)) \
            .sample(withReplacement=False, fraction=.5, seed=1)
    df = sqc.createDataFrame(bow_rdd, ['string_label', 'raw'])
    train_rdd, test_rdd = df.randomSplit([.8, .2], seed=1)
    results = []

    num_features = 5000
    min_doc_freq = 20
    layers = [[5000, 2056, 512, 128, 2], [5000, 1000, 128, 2], [5000, 100, 2], [5000, 5000, 2]]

    for l in layers:
        remover = StopWordsRemover(inputCol="raw", outputCol="words")
        hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="word_counts",
                              numFeatures=num_features)
        tfidf = IDF(inputCol=hashingTF.getOutputCol(),
                    outputCol="features", minDocFreq=min_doc_freq)
        indexer = StringIndexer(inputCol="string_label", outputCol="label")

        mlpc = MultilayerPerceptronClassifier(maxIter=100,
                                              layers=l,
                                              blockSize=128)

        pipeline = Pipeline(stages=[remover, hashingTF, tfidf,
                                    indexer, mlpc])

        model = pipeline.fit(train_rdd)
        df_output = model.transform(train_rdd)
        test_output = model.transform(test_rdd).select("label", "prediction")
        score = test_output.rdd.map(lambda row: row.label == row.prediction).mean()
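        # (hedged completion) `results` is initialized above but never filled in this
        # snippet; presumably each layer configuration's test accuracy is collected:
        results.append((l, score))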
##reading csv file
data = pd.read_csv("sms_spam.csv")
#print(data.head(5))
    
##creating rdd file
sc = SparkContext("local", "app")
sqc = SQLContext(sc)
df = sqc.createDataFrame(data, ['type', 'text'])

#NEW VARIABLE GENERATION
dataCleaned = df.rdd.map(lambda x: (1 if x['type'] == 'spam' else 0, tokenize(x['text'])))
dataClean = dataCleaned.map(lambda x: (float(x[0]), x[1]))
dfClean = sqc.createDataFrame(dataClean, ['label', 'words'])
dfClean.show(5)

hashingTF = HashingTF(inputCol="words", outputCol="rawtf-idf", numFeatures=1000)
tf = hashingTF.transform(dfClean)
idf = IDF(inputCol="rawtf-idf", outputCol="features").fit(tf)
dfFinal = idf.transform(tf)

# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(dfFinal)
# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(dfFinal)

# Split the data into training and test sets (20% held out for testing)
(trainingData, testData) = dfFinal.randomSplit([0.8, 0.2])


# Train the model.
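# (The original snippet is cut off here; below is a hedged sketch of how training
# might continue with the indexers defined above. RandomForestClassifier is an
# assumption: any Spark ML classifier would fit the same pattern.)
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=20)
rfPipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf])
rfModel = rfPipeline.fit(trainingData)
rfPredictions = rfModel.transform(testData)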
## Tokenize the messages
tokenizer = RegexTokenizer(inputCol="text",
                           outputCol="words",
                           minTokenLength=3,
                           gaps=False,
                           pattern="[a-zA-Z]+")

## Remove ignored words
stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                    outputCol="filtered",
                                    stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"],
                                    caseSensitive=False)

## Hash the words
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex",
                      numFeatures=1 << 10)

## Create inverse document frequencies model
idf = IDF(inputCol=hashingTF.getOutputCol(),
          outputCol="tf_idf",
          minDocFreq=4)


if algo == "gbm":
    ## Create GBM model
    algoStage = H2OGBM(ratio=0.8,
                 seed=1,
                 featuresCols=[idf.getOutputCol()],
                 predictionCol="label")
elif algo == "dl":
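    ## Create Deep Learning model
    ## (Hedged: the snippet is cut off here; this mirrors the Sparkling Water
    ## ham-or-spam example it appears to come from, and the exact H2ODeepLearning
    ## parameters may differ between Sparkling Water versions.)
    algoStage = H2ODeepLearning(epochs=10,
                                seed=1,
                                l1=0.001,
                                l2=0.0,
                                hidden=[200, 200],
                                featuresCols=[idf.getOutputCol()],
                                predictionCol="label")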
Example #49
spark = SparkSession.builder.master("local").appName("Word Count").config(
    "spark.some.config.option", "some-value").getOrCreate()

df = spark.read.csv('file:///home/zfar/Sentiment Analysis Dataset.csv',
                    header=True)

df = df.select(df['ItemID'], df['SentimentText'], df['label'])

training = df.selectExpr("cast(itemID as int) id", "SentimentText",
                         "cast(label as int) label")

tokenizer = Tokenizer(inputCol="SentimentText", outputCol="words")
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                           outputCol="filtered")
ngrams = NGram(n=2, inputCol=remover.getOutputCol(), outputCol="ngrams")
hashingTF = HashingTF(inputCol=ngrams.getOutputCol(), outputCol="rawfeatures")
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="idffeatures")
normalizer = Normalizer(inputCol=idf.getOutputCol(),
                        outputCol="features",
                        p=1.0)

#lr = LogisticRegression(maxIter=10, regParam=0.001)
nb = NaiveBayes(smoothing=1.0)
pipeline = Pipeline(
    stages=[tokenizer, remover, ngrams, hashingTF, idf, normalizer, nb])
model = pipeline.fit(training)
"""
paramGrid = ParamGridBuilder().addGrid(hashingTF.numFeatures, [10, 100, 1000]).addGrid(lr.regParam, [0.1, 0.01]).build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
def main(sc, sqlContext):
    start = timer()

    stpwrds = stopwords.words('english')
    tbl_translate = dict.fromkeys(i for i in xrange(sys.maxunicode) if unicodedata.category(unichr(i)).startswith('S') or unicodedata.category(unichr(i)).startswith('P') or unicodedata.category(unichr(i)).startswith('N'))

    print '---Fetching products---'
    start_i = timer()
    productRDD = sc.parallelize(findProductsByCategory([]))
    print '#### took %d seconds' % (timer()-start_i)

    print '---Building the corpus---'
    start_i = timer()
    corpusRDD = (productRDD.map(lambda s: (s[0], word_tokenize(s[1].translate(tbl_translate).lower()), s[2], s[3]))
                           .map(lambda s: (s[0], [PorterStemmer().stem(x) for x in s[1] if x not in stpwrds], s[2], s[3] ))
                           .map(lambda s: (s[0], [x[0] for x in pos_tag(s[1]) if x[1] == 'NN' or x[1] == 'NNP'], s[2], s[3]))
                           .cache())
    print '#### took %d seconds' % (timer()-start_i)

    print '---Fetching and persisting category and token data---'
    start_i = timer()
    tokens = corpusRDD.flatMap(lambda x: x[1]).distinct().collect()
    numTokens = len(tokens)
    category = productRDD.map(lambda x: x[2]).distinct().collect()
    categoryAndSubcategory = productRDD.map(lambda x: (x[2], x[3])).distinct().collect()
    insertTokensAndCategories(tokens, category, categoryAndSubcategory)
    print '#### took %d seconds' % (timer()-start_i)

    print '---Computing TF-IDF for the products---'
    start_i = timer()
    wordsData = corpusRDD.map(lambda s: Row(label=s[0], words=s[1], category=s[2], subcategory=s[3]))
    # persist this so it does not have to be recomputed at prediction time
    wordsDataDF = sqlContext.createDataFrame(wordsData)   

    # persisting for the prediction step
    wordsDataForPrediction = corpusRDD.map(lambda s: Row(label=s[0], words=s[1], type=s[2]))
    # persist this so it does not have to be recomputed at prediction time
    wordsDataForPredictionDF = sqlContext.createDataFrame(wordsDataForPrediction)   

    if os.path.exists("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet"):
        shutil.rmtree("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet")

    wordsDataForPredictionDF.write.parquet("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet") 

    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numTokens)
    idf = IDF(inputCol="rawFeatures", outputCol="features")

    featurizedData = hashingTF.transform(wordsDataDF)
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    #VSM = rescaledData.map(lambda t: LabeledPoint(categoryAndSubcategory.index((t.category, t.subcategory)), t.features))
    VSM = rescaledData.map(lambda t: LabeledPoint(category.index(t.category), t.features))

    VSMTrain, VSMTest = VSM.randomSplit([8, 2], seed=0L)
    print '#### took %d seconds' % (timer()-start_i)


    print '---Building the Naive Bayes model---'
    start_i = timer()
    model = NaiveBayes.train(VSMTrain)

    if os.path.exists("/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria"):
        shutil.rmtree("/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria")

    model.save(sc, '/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria')
    print '#### took %d seconds' % (timer()-start_i)

    print '---Testing the Naive Bayes model---'
    start_i = timer()
    # the model was trained with labels drawn from `category`, so predictions index
    # into `category` (the original indexed `categoryAndSubcategory`, a leftover bug)
    prediction = VSMTest.map(lambda p: (category[int(model.predict(p.features))], category[int(p.label)]))
    accuracy = float(prediction.filter(lambda (x, v): x == v).count())/float(prediction.count())
    print 'accuracy: %f' % accuracy
    print '#### took %d seconds' % (timer()-start_i)
    
    print '---Fetching the posts---'

    start_i = timer()
    posts = list()
    wb = load_workbook(filename = '/home/ubuntu/recsys-tcc-ml/base_sentimentos.xlsx')
    sheet = wb['Menes']
    for row in sheet.iter_rows(row_offset=1):
        post = list()
        for cell in row:
            if cell.value is None:
                break
            post.append(1 if cell.value == 'Positive' or cell.value == 'Neutral' else 0 if cell.value == 'Negative' else removeAccents(cell.value))

        if len(post) > 0:            
            posts.append(tuple(post))

    print '#### took %d seconds' % (timer()-start_i)

    print '---Building the corpus---'
    start_i = timer()
    postsRDD = sc.parallelize(posts)
    postCorpusRDD = (postsRDD.map(lambda s: (s[1], word_tokenize(s[0].translate(tbl_translate).lower())))
                           .map(lambda s: (s[0], [PorterStemmer().stem(x) for x in s[1] if x not in stpwrds]))
                           .map(lambda s: (s[0], [x[0] for x in pos_tag(s[1]) if x[1] == 'NN' or x[1] == 'NNP']))
                           .cache())

    print '#### took %d seconds' % (timer()-start_i)

    print '---Computing TF-IDF for the posts---'
    start_i = timer()
    wordsData = postCorpusRDD.map(lambda s: Row(label=s[0], words=s[1]))
    wordsDataDF = sqlContext.createDataFrame(wordsData)

    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numTokens)
    idf = IDF(inputCol="rawFeatures", outputCol="features")

    featurizedData = hashingTF.transform(wordsDataDF)
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)   

    VSM = rescaledData.map(lambda t: LabeledPoint(t.label, t.features))
    VSMTrain, VSMTest = VSM.randomSplit([8, 2], seed=0L)
    print '#### took %d seconds' % (timer()-start_i)

    print '---Building the SVM model---'
    start_i = timer()
    model = SVMWithSGD.train(VSMTrain, iterations=100)
    
    if os.path.exists("/home/ubuntu/recsys-tcc-ml/models/svm"):
        shutil.rmtree("/home/ubuntu/recsys-tcc-ml/models/svm")

    model.save(sc, "/home/ubuntu/recsys-tcc-ml/models/svm")

    print '---Testing the SVM model---'
    start_i = timer()
    prediction = VSMTest.map(lambda p: (p.label, model.predict(p.features)))
    # count matching predictions (the original counted mismatches, i.e. the error rate,
    # while printing it as accuracy)
    accuracy = prediction.filter(lambda (v, p): v == p).count() / float(prediction.count())

    print 'accuracy: %f' % accuracy

    print '#### took %d seconds' % (timer()-start_i)

    print 'The whole process took %d seconds' % (timer()-start)
Example #51
    def test_save_load_pipeline_estimator(self):
        temp_path = tempfile.mkdtemp()
        training = self.spark.createDataFrame([
            (0, "a b c d e spark", 1.0),
            (1, "b d", 0.0),
            (2, "spark f g h", 1.0),
            (3, "hadoop mapreduce", 0.0),
            (4, "b spark who", 1.0),
            (5, "g d a y", 0.0),
            (6, "spark fly", 1.0),
            (7, "was mapreduce", 0.0),
        ], ["id", "text", "label"])

        # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and ova.
        tokenizer = Tokenizer(inputCol="text", outputCol="words")
        hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")

        ova = OneVsRest(classifier=LogisticRegression())
        lr1 = LogisticRegression().setMaxIter(5)
        lr2 = LogisticRegression().setMaxIter(10)

        pipeline = Pipeline(stages=[tokenizer, hashingTF, ova])

        paramGrid = ParamGridBuilder() \
            .addGrid(hashingTF.numFeatures, [10, 100]) \
            .addGrid(ova.classifier, [lr1, lr2]) \
            .build()

        tvs = TrainValidationSplit(estimator=pipeline,
                                   estimatorParamMaps=paramGrid,
                                   evaluator=MulticlassClassificationEvaluator())
        tvsPath = temp_path + "/tvs"
        tvs.save(tvsPath)
        loadedTvs = TrainValidationSplit.load(tvsPath)
        self.assert_param_maps_equal(loadedTvs.getEstimatorParamMaps(), paramGrid)
        self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid)

        # Run train validation split, and choose the best set of parameters.
        tvsModel = tvs.fit(training)

        # test save/load of TrainValidationSplitModel
        tvsModelPath = temp_path + "/tvsModel"
        tvsModel.save(tvsModelPath)
        loadedModel = TrainValidationSplitModel.load(tvsModelPath)
        self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
        self.assertEqual(len(loadedModel.bestModel.stages), len(tvsModel.bestModel.stages))
        for loadedStage, originalStage in zip(loadedModel.bestModel.stages,
                                              tvsModel.bestModel.stages):
            self.assertEqual(loadedStage.uid, originalStage.uid)

        # Test nested pipeline
        nested_pipeline = Pipeline(stages=[tokenizer, Pipeline(stages=[hashingTF, ova])])
        tvs2 = TrainValidationSplit(estimator=nested_pipeline,
                                    estimatorParamMaps=paramGrid,
                                    evaluator=MulticlassClassificationEvaluator())
        tvs2Path = temp_path + "/tvs2"
        tvs2.save(tvs2Path)
        loadedTvs2 = TrainValidationSplit.load(tvs2Path)
        self.assert_param_maps_equal(loadedTvs2.getEstimatorParamMaps(), paramGrid)
        self.assertEqual(loadedTvs2.getEstimator().uid, tvs2.getEstimator().uid)

        # Run train validation split, and choose the best set of parameters.
        tvsModel2 = tvs2.fit(training)
        # test save/load of TrainValidationSplitModel
        tvsModelPath2 = temp_path + "/tvsModel2"
        tvsModel2.save(tvsModelPath2)
        loadedModel2 = TrainValidationSplitModel.load(tvsModelPath2)
        self.assertEqual(loadedModel2.bestModel.uid, tvsModel2.bestModel.uid)
        loaded_nested_pipeline_model = loadedModel2.bestModel.stages[1]
        original_nested_pipeline_model = tvsModel2.bestModel.stages[1]
        self.assertEqual(loaded_nested_pipeline_model.uid, original_nested_pipeline_model.uid)
        self.assertEqual(len(loaded_nested_pipeline_model.stages),
                         len(original_nested_pipeline_model.stages))
        for loadedStage, originalStage in zip(loaded_nested_pipeline_model.stages,
                                              original_nested_pipeline_model.stages):
            self.assertEqual(loadedStage.uid, originalStage.uid)

# COMMAND ----------

tfIdfIn = tokenized\
  .where("array_contains(DescOut, 'red')")\
  .select("DescOut")\
  .limit(10)
tfIdfIn.show(10, False)


# COMMAND ----------

from pyspark.ml.feature import HashingTF, IDF
tf = HashingTF()\
  .setInputCol("DescOut")\
  .setOutputCol("TFOut")\
  .setNumFeatures(10000)
idf = IDF()\
  .setInputCol("TFOut")\
  .setOutputCol("IDFOut")\
  .setMinDocFreq(2)


# COMMAND ----------

idf.fit(tf.transform(tfIdfIn)).transform(tf.transform(tfIdfIn)).show(10, False)
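
# Equivalent, but computing the term-frequency DataFrame only once instead of calling
# tf.transform(tfIdfIn) twice (a minor refactor, not part of the original notebook):
tfOut = tf.transform(tfIdfIn)
idf.fit(tfOut).transform(tfOut).show(10, False)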


# COMMAND ----------

from pyspark.ml.feature import Word2Vec
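
# The notebook cell is cut off after this import; below is a hedged sketch of typical
# Word2Vec usage (the toy DataFrame is illustrative, not part of the original).
documentDF = spark.createDataFrame([
    ("Hi I heard about Spark".split(" "),),
    ("I wish Java could use case classes".split(" "),),
    ("Logistic regression models are neat".split(" "),)
], ["text"])
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result")
w2vModel = word2Vec.fit(documentDF)
w2vModel.getVectors().show(truncate=False)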
Example #53
concat_string_arrays = concat(StringType())

df = df.withColumn(
    'joined_tokens',
    concat_string_arrays(col('filtered_title_tokens'),
                         col('filtered_sterm_tokens'),
                         col('filtered_attr_tokens')))
joined_ngram = NGram(n=2, inputCol="joined_tokens", outputCol="joined_ngrams")

df = joined_ngram.transform(df)
'''
stemmingUdf = udf(stemming, ArrayType(StringType()))
df = df.withColumn('stemmed_tokens', stemmingUdf('joined_tokens'))
'''
joined_hashingTF = HashingTF(inputCol="joined_ngrams",
                             outputCol="joined_rawFeatures",
                             numFeatures=30000)

df = joined_hashingTF.transform(df)

joined_idf = IDF(inputCol="joined_rawFeatures", outputCol="features")

joined_idfModel = joined_idf.fit(df)

df = joined_idfModel.transform(df)
'''
assembler = VectorAssembler(
	inputCols=['title_features','sterm_features','attr_features'],
	outputCol='features')

df = assembler.transform(df)
Example #54
from pyspark.sql import Row
from pyspark.ml.feature import HashingTF, IDF, Tokenizer


df = spark.read.load('/home/manh/Documents/data/result_pre.parquet')
df = df.select('id', 'stemmed')
rdd =  df.select('stemmed').rdd
pre_idf = rdd.map(lambda x: set(x[0])).flatMap(lambda x: x).map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)
pre_idf_collect =  pre_idf.collect()

rdd_words = pre_idf.map(lambda x: Row(word=[x[0]]))

df_words = spark.createDataFrame(rdd_words)

hashingTF = HashingTF(inputCol="word", outputCol="rawFeatures", numFeatures=100000)

featurizedData = hashingTF.transform(df_words)

featurizedData.rdd.map(lambda x: (x.word[0], x['rawFeatures'].indices[0])).map(lambda x: '%s  %s' % (x)).collect()
def main():

    spark = SQLContext(SparkContext.getOrCreate())

    # read data
    yahoo = spark.read.csv(f'{BUILDDIR}/yahoo.csv', header=True)
    data = yahoo.select(['sector', 'description']).dropna()
    breakpoint()

    # tokenize texts based on regular expression
    tokenize = RegexTokenizer(inputCol='description',
                              outputCol='words_all',
                              pattern=r'\W')
    breakpoint()

    # remove stop words
    stopwords = '\n'.join((DATADIR / 'stopwords' / f).read_text().strip()
                          for f in ('mysql.txt', 'nltk.txt')).splitlines()
    remove_stopwords = StopWordsRemover(
        inputCol='words_all', outputCol='words_clean').setStopWords(stopwords)
    breakpoint()

    # get words frequency using simple count (bag of words)
    add_wordcount = CountVectorizer(inputCol='words_clean',
                                    outputCol='words_count',
                                    vocabSize=1000,
                                    minDF=2)
    breakpoint()

    # get tf-idf words frequencies
    add_wordtf = HashingTF(inputCol='words_clean',
                           outputCol='words_tf',
                           numFeatures=10000)
    add_wordidf = IDF(inputCol='words_tf',
                      outputCol='words_tfidf',
                      minDocFreq=2)
    breakpoint()

    # prepare output values
    index_target = StringIndexer(inputCol='sector', outputCol='label')
    breakpoint()

    # data preparation pipeline
    pipeline_wordcount = Pipeline(stages=[
        tokenize,
        remove_stopwords,
        add_wordcount,
        add_wordtf,
        add_wordidf,
        index_target,
    ])
    # apply data preparation pipeline
    model_wordcount = pipeline_wordcount.fit(data)
    prepared = model_wordcount.transform(data)
    breakpoint()

    # split to training and testing
    training, testing = prepared.randomSplit([0.8, 0.2], seed=100500)
    breakpoint()

    # fit logistic regression models

    logistic_wordcount = LogisticRegression(regParam=0.3,
                                            elasticNetParam=0,
                                            featuresCol='words_count',
                                            labelCol='label',
                                            predictionCol='prediction',
                                            probabilityCol='probability')

    logistic_tfidf = LogisticRegression(regParam=0.3,
                                        elasticNetParam=0,
                                        featuresCol='words_tfidf',
                                        labelCol='label',
                                        predictionCol='prediction',
                                        probabilityCol='probability')

    breakpoint()

    evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                                  metricName='accuracy')
    for model, name in ((logistic_wordcount,
                         'Word count + Logistic regression'),
                        (logistic_tfidf, 'TF-IDF + Logistic regression')):
        predicted = model.fit(training).transform(testing)
        print(f'{name} model accuracy = {evaluator.evaluate(predicted)}')

    breakpoint()
    conn = S3Connection()
    sc = set_spark_context()
    sqc = SQLContext(sc)
    sm = SparkModel(sc, conn, rdd_path='meta_rdd.pkl')

    logging.basicConfig(format='%(asctime)s %(message)s')
    grid_search = logging.getLogger('main')
    grid_search.setLevel(logging.DEBUG)
    handler = logging.FileHandler('../logs/grid_search.txt')
    grid_search.addHandler(handler)

    bow_rdd = sm.RDD.map(lambda (key, (bow, meta)): (key, bow))
    bow_rdd = sm.RDD.join(sm.target).map(lambda (key, (bow, label)): (label, bow))

    remover = StopWordsRemover(inputCol="raw", outputCol="words")
    hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="word_counts",
                numFeatures=10000)
    tfidf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features",
                minDocFreq=20)
    indexer = StringIndexer(inputCol="string_label", outputCol="label")

    for model in [GBTClassifier(), RandomForestClassifier(), MultilayerPerceptronClassifier()]:

        if type(model) == MultilayerPerceptronClassifier:
            layers = [10000, 100, 2]
            model = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128)

        pipeline = Pipeline(stages=[remover, hashingTF, tfidf, # scaler,
                                    indexer, model])
        scores = cross_val_score(pipeline, bow_rdd)
        grid_search.debug('Model: %s\nscores: %s\nAverage: %s' \
                % (type(model), scores, scores.mean()))
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer

# In[17]:

# Prepare training documents from a list of (id, text, label) tuples.
training = spark.createDataFrame([(0, "a b c d e spark", 1.0), (1, "b d", 0.0),
                                  (2, "spark f g h", 1.0),
                                  (3, "hadoop mapreduce", 0.0)],
                                 ["id", "text", "label"])

# In[18]:

# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=10, regParam=0.001)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# In[19]:

# Fit the pipeline to training documents.
model = pipeline.fit(training)

# In[20]:

# Prepare test documents, which are unlabeled (id, text) tuples.
test = spark.createDataFrame([(4, "spark i j k"), (5, "l m n"),
                              (6, "spark hadoop spark"), (7, "apache hadoop")],
                             ["id", "text"])
Example #58

rdd = labeledRdd.map(lambda doc: (cleanLower(doc[0]), doc[1]))

print "Text is cleaned"

sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(rdd, ["review", "label"])
dfTrain, dfTest = df.randomSplit([0.8, 0.2])

print "Random split is done"

tokenizerNoSw = tr.NLTKWordPunctTokenizer(
    inputCol="review", outputCol="wordsNoSw", stopwords=set(nltk.corpus.stopwords.words("english"))
)
hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(), outputCol="reviews_tf")
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol="label", outputCol="target_indexed")
dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(), maxDepth=10)

pipeline = Pipeline(stages=[tokenizerNoSw, hashing_tf, idf, string_indexer, dt])


# ****************************************************************
# *********************CROSS VALIDATION: 80%/20%******************
# *******************Model: DecisionTreeClassifier*****************
# *****************************************************************

evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction", labelCol="target_indexed", metricName="precision"
)
Example #59
score = data.map(lambda s: 1.0
                 if s[1].isdigit() and float(s[1]) == 1.0 else 0.0)
comment = data.map(lambda s: s[3])
split_neg_data2 = score.zip(comment)
tranform_data = split_neg_data2.map(
    lambda p: (p[0], p[1]))  #.toDF()#.withColumnRenamed('_1','label')
#tranform_data.show()
#sentenceData = spark.createDataFrame([(0, "I heard about Spark and I love Spark"),(0, "I wish Java could use case classes"),(1, "Logistic regression models are neat")]).toDF("label", "sentence")

sentenceData = spark.createDataFrame(tranform_data, ["label", "sentence"])
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(sentenceData)

# Compute TF-IDF
hashingTF = HashingTF(inputCol="words",
                      outputCol="rawFeatures",
                      numFeatures=3000)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.select("label", "features").show()
forData = StringIndexer().setInputCol("label").setOutputCol("indexed").fit(
    rescaledData).transform(rescaledData)
(trainingData, testData) = forData.randomSplit([0.8, 0.2], seed=0)
print(trainingData.take(1))

nb = NaiveBayes(smoothing=1.0, modelType="multinomial", labelCol="indexed")
start_time = time.time()
modelClassifier = nb.fit(trainingData)
end_time = time.time()
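
# (Hedged sketch, not in the original snippet) evaluate the Naive Bayes model on the
# held-out split using the "indexed" label column created above.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

nbPredictions = modelClassifier.transform(testData)
evaluator = MulticlassClassificationEvaluator(labelCol="indexed", predictionCol="prediction",
                                              metricName="accuracy")
print("test accuracy = %f, training time = %.1f s" % (evaluator.evaluate(nbPredictions),
                                                      end_time - start_time))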
def review_to_words(raw_review):
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review).text
    #
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", review_text) 
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                                            
    # 
    # 4. Remove stop words
    meaningful_words =  [w for w in words if not w in stops]   
    #
    # 5. Join the words back into one string separated by space, 
    # and return the result.
    return " ".join( meaningful_words)   

stops = set(stopwords.words("english")) 
lines = sc.textFile("s3://spark-project-data/labeledTrainData.tsv")
rows = lines.zipWithIndex().filter(lambda (row,index): index > 0).keys()
parts = rows.map(lambda l: l.split("\t"))

review = parts.map(lambda p: Row(id=p[0], label=float(p[1]), 
	review=review_to_words(p[2])))
schemeReview = sqlContext.createDataFrame(review)
tokenizer = Tokenizer(inputCol="review", outputCol="words")
wordsData = tokenizer.transform(schemeReview)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=300)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
selectData = rescaledData.select("label","features")
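
# (Hedged continuation, not in the original) split the prepared data and fit a simple
# classifier on the TF-IDF features; LogisticRegression here is an assumption.
from pyspark.ml.classification import LogisticRegression

(trainData, testData) = selectData.randomSplit([0.8, 0.2], seed=42)
lr = LogisticRegression(maxIter=10, regParam=0.01)
lrModel = lr.fit(trainData)
lrModel.transform(testData).select("label", "prediction").show(5)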