Example no. 1
    def test_count_vectorizer_from_vocab(self):
        model = CountVectorizerModel.from_vocabulary(["a", "b", "c"], inputCol="words",
                                                     outputCol="features", minTF=2)
        self.assertEqual(model.vocabulary, ["a", "b", "c"])
        self.assertEqual(model.getMinTF(), 2)

        dataset = self.spark.createDataFrame([
            (0, "a a a b b c".split(' '), SparseVector(3, {0: 3.0, 1: 2.0}),),
            (1, "a a".split(' '), SparseVector(3, {0: 2.0}),),
            (2, "a b".split(' '), SparseVector(3, {}),)], ["id", "words", "expected"])

        transformed_list = model.transform(dataset).select("features", "expected").collect()

        for r in transformed_list:
            feature, expected = r
            self.assertEqual(feature, expected)

        # Test an empty vocabulary
        with QuietTest(self.sc):
            with self.assertRaisesRegexp(Exception, "vocabSize.*invalid.*0"):
                CountVectorizerModel.from_vocabulary([], inputCol="words")

        # Test model with default settings can transform
        model_default = CountVectorizerModel.from_vocabulary(["a", "b", "c"], inputCol="words")
        transformed_list = model_default.transform(dataset) \
            .select(model_default.getOrDefault(model_default.outputCol)).collect()
        self.assertEqual(len(transformed_list), 3)
Example no. 2
    def test_java_params(self):
        import pyspark.ml.feature
        import pyspark.ml.classification
        import pyspark.ml.clustering
        import pyspark.ml.evaluation
        import pyspark.ml.pipeline
        import pyspark.ml.recommendation
        import pyspark.ml.regression

        modules = [pyspark.ml.feature, pyspark.ml.classification, pyspark.ml.clustering,
                   pyspark.ml.evaluation, pyspark.ml.pipeline, pyspark.ml.recommendation,
                   pyspark.ml.regression]
        for module in modules:
            for name, cls in inspect.getmembers(module, inspect.isclass):
                if not name.endswith('Model') and not name.endswith('Params') \
                        and issubclass(cls, JavaParams) and not inspect.isabstract(cls):
                    # NOTE: disable check_params_exist until there is parity with Scala API
                    check_params(self, cls(), check_params_exist=False)

        # Additional classes that need explicit construction
        from pyspark.ml.feature import CountVectorizerModel, StringIndexerModel
        check_params(self, CountVectorizerModel.from_vocabulary(['a'], 'input'),
                     check_params_exist=False)
        check_params(self, StringIndexerModel.from_labels(['a', 'b'], 'input'),
                     check_params_exist=False)
Example no. 3
    def test_java_params(self):
        import pyspark.ml.feature
        import pyspark.ml.classification
        import pyspark.ml.clustering
        import pyspark.ml.evaluation
        import pyspark.ml.pipeline
        import pyspark.ml.recommendation
        import pyspark.ml.regression

        modules = [
            pyspark.ml.feature, pyspark.ml.classification,
            pyspark.ml.clustering, pyspark.ml.evaluation, pyspark.ml.pipeline,
            pyspark.ml.recommendation, pyspark.ml.regression
        ]
        for module in modules:
            for name, cls in inspect.getmembers(module, inspect.isclass):
                if not name.endswith('Model') and not name.endswith('Params') \
                        and issubclass(cls, JavaParams) and not inspect.isabstract(cls):
                    # NOTE: disable check_params_exist until there is parity with Scala API
                    check_params(self, cls(), check_params_exist=False)

        # Additional classes that need explicit construction
        from pyspark.ml.feature import CountVectorizerModel, StringIndexerModel
        check_params(self,
                     CountVectorizerModel.from_vocabulary(['a'], 'input'),
                     check_params_exist=False)
        check_params(self,
                     StringIndexerModel.from_labels(['a', 'b'], 'input'),
                     check_params_exist=False)
Example no. 4
def transform_model(sqlContext, modelDataframe):
    # Load the CV model
    model = CountVectorizerModel.load("models/cvModel")
    # Transform the data frame
    transformedDf = model.transform(modelDataframe)

    return transformedDf
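A minimal usage sketch for the helper above; the SparkSession, column name, and sample rows are assumptions, not part of the original example.

# Hypothetical usage of transform_model: the "words" column must match the
# inputCol that the CountVectorizerModel saved at models/cvModel expects.
# The sqlContext argument is not used inside transform_model, so a
# SparkSession works just as well.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sample_df = spark.createDataFrame(
    [(0, ["a", "b", "a"]), (1, ["b", "c"])], ["id", "words"])
transformed = transform_model(spark, sample_df)
transformed.show(truncate=False)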
Example no. 5
    def get_cv_model(self):
        if self.has_cv:
            from pyspark.ml.feature import CountVectorizerModel
            cv_model = CountVectorizerModel.load(
                os.path.join(model_pth, model_name))
        else:
            from pyspark.ml.feature import CountVectorizer
            data = self._fit()
            cv = CountVectorizer(inputCol='item_seq',
                                 outputCol='item_seq_enc',
                                 vocabSize=1 << 20,
                                 minTF=0,
                                 minDF=0)
            cv_model = cv.fit(data)
            cv_model.write().overwrite().save(
                os.path.join(model_pth, model_name))

        corpora = cv_model.vocabulary  # 579012
        action_corpora = [
            'clickout item', 'interaction item deals',
            'interaction item image', 'interaction item info',
            'search for item', 'interaction item rating'
        ]
        item2id = dict(zip(corpora, range(1, len(corpora) + 1)))
        action2id = dict(zip(action_corpora, range(1, len(action_corpora) + 1)))
        sc = self.sqlContext.sparkContext
        bitem2id = sc.broadcast(item2id)
        baction2id = sc.broadcast(action2id)
        print("Item size:", len(item2id))
        return bitem2id, baction2id
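A hedged sketch of how the returned broadcast dictionaries might be consumed; loader, sessions_rdd and the row fields are hypothetical names.

# Hypothetical consumption of the broadcast lookup tables: map raw item and
# action strings to integer ids inside an RDD transformation.
bitem2id, baction2id = loader.get_cv_model()   # loader: instance of the class above

def encode(row):
    # row is assumed to expose string fields `item` and `action`
    return (bitem2id.value.get(row.item, 0),
            baction2id.value.get(row.action, 0))

encoded_rdd = sessions_rdd.map(encode)         # sessions_rdd is a placeholder RDD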
Example no. 6
    def test_count_vectorizer_from_vocab(self):
        model = CountVectorizerModel.from_vocabulary(
            ["a", "b", "c"], inputCol="words", outputCol="features", minTF=2
        )
        self.assertEqual(model.vocabulary, ["a", "b", "c"])
        self.assertEqual(model.getMinTF(), 2)

        dataset = self.spark.createDataFrame(
            [
                (
                    0,
                    "a a a b b c".split(" "),
                    SparseVector(3, {0: 3.0, 1: 2.0}),
                ),
                (
                    1,
                    "a a".split(" "),
                    SparseVector(3, {0: 2.0}),
                ),
                (
                    2,
                    "a b".split(" "),
                    SparseVector(3, {}),
                ),
            ],
            ["id", "words", "expected"],
        )

        transformed_list = model.transform(dataset).select("features", "expected").collect()

        for r in transformed_list:
            feature, expected = r
            self.assertEqual(feature, expected)

        # Test an empty vocabulary
        with QuietTest(self.sc):
            with self.assertRaisesRegex(Exception, "vocabSize.*invalid.*0"):
                CountVectorizerModel.from_vocabulary([], inputCol="words")

        # Test model with default settings can transform
        model_default = CountVectorizerModel.from_vocabulary(["a", "b", "c"], inputCol="words")
        transformed_list = (
            model_default.transform(dataset)
            .select(model_default.getOrDefault(model_default.outputCol))
            .collect()
        )
        self.assertEqual(len(transformed_list), 3)
Example no. 7
def loadModel(path):
    '''
    Load Count Vectorizer model
        input : - path
        output: - model [Count Vectorizer model data frame]
    '''
    
    model = CountVectorizerModel.load(path)
    return model
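As a quick, hedged illustration of how the loaded model might then be used; the path, tokens_df and its column name are assumptions.

# Hypothetical follow-up: load a saved model, inspect its vocabulary and
# transform a tokenized DataFrame whose column matches the model's inputCol.
cv_model = loadModel("models/cvModel")              # path is an assumption
print(cv_model.vocabulary[:10])                     # first few learned terms
transformed = cv_model.transform(tokens_df)         # tokens_df is a placeholder
transformed.select(cv_model.getOutputCol()).show(5, truncate=False)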
Example no. 8
def createCV(df, sc, col_name='text'):
    cv = CountVectorizer(inputCol=col_name, outputCol="features", minDF=5)
    try:
        df_r = sc.read.parquet("data/vectorized.parquet")
        cv = CountVectorizer.load("data/vectorized_cv.parquet")
        model = CountVectorizerModel.load("data/vectorized_model.parquet")
    except Exception:
        model = cv.fit(df)
        df_r = model.transform(df)
        df_r.write.parquet("data/vectorized.parquet")
        cv.save("data/vectorized_cv.parquet")
        model.save("data/vectorized_model.parquet")
    return (df_r, model, cv)
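A hedged usage sketch; the SparkSession and the toy corpus are assumptions (note that minDF=5 means a term must appear in at least 5 documents to enter the vocabulary).

# Hypothetical call: createCV expects an array-of-strings column named by
# col_name and a SparkSession (used for reading the cached parquet).
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
docs = spark.createDataFrame(
    [(i, ["spark", "count", "vectorizer"]) for i in range(6)], ["id", "text"])
df_vec, cv_model, cv = createCV(docs, spark, col_name="text")
df_vec.select("features").show(truncate=False)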
Example no. 9
    def __init__(self):
        self.NUM_TOPICS = 3

        db_path = "/home/hadoop/csce678-project/LDA/db.csv"
        df = pd.read_csv(db_path, lineterminator='\n').drop_duplicates(['ID'])

        self.db = spark.createDataFrame(df)

        ldaModel_path = "/home/hadoop/csce678-project/LDA/lda_model"
        self.ldaModel = LocalLDAModel.load(ldaModel_path)

        model_path = "/home/hadoop/csce678-project/LDA/token"
        self.model = CountVectorizerModel.load(model_path)
Example no. 10
def main():
    review_topics = spark.read.parquet("topic_modelling/review_topics_pos")
    cv_model = CountVectorizerModel.load("topic_modelling/cvmodel_pos")
    ldamodel = LocalLDAModel.load("topic_modelling/ldamodel_pos")
    f1out = open("topic_modelling/postive_topics", "w+")
    topics = ldamodel.describeTopics(
        maxTermsPerTopic=10).rdd.map(lambda x: list(x)).collect()
    vocabulary = cv_model.vocabulary
    for topic in range(len(topics)):
        towrite = "topic {} : \n".format(topic)
        f1out.write(towrite)
        words = topics[topic][1]
        scores = topics[topic][2]
        stri = ''
        for word in range(len(words)):
            stri += str(scores[word]) + "*" + vocabulary[words[word]] + " + "
        f1out.write(stri[:-3] + "\n")
    f1out.close()

    review_topics = spark.read.parquet("topic_modelling/review_topics_neg")
    cv_model = CountVectorizerModel.load("topic_modelling/cvmodel_neg")
    ldamodel = LocalLDAModel.load("topic_modelling/ldamodel_neg")
    f2out = open("topic_modelling/negative_topics", "w+")
    topics = ldamodel.describeTopics(
        maxTermsPerTopic=10).rdd.map(lambda x: list(x)).collect()
    vocabulary = cv_model.vocabulary
    for topic in range(len(topics)):
        towrite = "topic {} : \n".format(topic)
        f2out.write(towrite)
        words = topics[topic][1]
        scores = topics[topic][2]
        stri = ''
        for word in range(len(words)):
            stri += str(scores[word]) + "*" + vocabulary[words[word]] + " + "
        f2out.write(stri[:-3] + "\n")
    f2out.close()
Example no. 11
    def countVectorizer(self, infoData):
        originalColName = infoData.get(pc.ORIGINALCOLMNAME)
        dataset = infoData.get(pc.DATASET)
        oneHotEncoderMapping = infoData.get(pc.ONEHOTENCODERPATHMAPPING)
        countVectorizerPath = oneHotEncoderMapping.get(originalColName)
        countVectorizer = CountVectorizerModel.load(countVectorizerPath)
        encodedColmName = infoData.get(pc.ENCODEDCOLM)
        dataset = dataset.drop(encodedColmName)

        dataset = countVectorizer.transform(dataset)
        infoData.update({pc.DATASET: dataset})

        infoData = pu.featureAssembler(infoData)

        return infoData
Example no. 12
def topicPredict(inputs):
    #output_path = "/user/llbui/bigdata45_500"
    output_path = "C:/Users/linhb/bigdata45_500"
    query = inputs
    n = 10  #number of similar document to return
    feature = "abstract"  #feature to compare

    df = sc.parallelize([(0, query)]).toDF(["id", feature])

    tokenizer = RegexTokenizer(inputCol=feature,
                               outputCol="words",
                               pattern="\\P{Alpha}+")
    df2 = tokenizer.transform(df)

    remover = StopWordsRemover(inputCol="words", outputCol="words2")
    df3 = remover.transform(df2)

    udf_remove_words = udf(lambda x: remove_words(x), ArrayType(StringType()))
    df4 = df3.withColumn("words3", udf_remove_words(df3.words2))

    # text to feature vector - TF_IDF
    countTF_model = CountVectorizerModel.load(output_path + "/tf_model")
    df_countTF = countTF_model.transform(df4)

    idf_model = IDFModel.load(output_path + "/idf_model")
    df_IDF = idf_model.transform(df_countTF)

    # LDA Model
    lda_model = LocalLDAModel.load(output_path + "/lda_model")

    #output topics for document -> topicDistribution
    df_Feature = lda_model.transform(df_IDF)
    feature_vector = df_Feature.select("id",
                                       "topicDistribution").collect()[0][1]
    print("Feature Vector:", feature_vector)

    #Load existing document
    df_Document = sqlCt.read.load(output_path + "/topicDistribution.parquet")
    udf_cosineSimilarity = udf(
        lambda x_vector: cosineSimilarity(x_vector, feature_vector),
        FloatType())
    df_Similarity = df_Document.withColumn(
        "similarity", udf_cosineSimilarity("topicDistribution"))
    df_Similarity_Sorted = df_Similarity.sort(desc("similarity"))
    return df_Similarity_Sorted.limit(n).select("_id", "title", "abstract",
                                                "url",
                                                "topicDistribution").collect()
Example no. 13
def run(sc, args):
    ds_path    = args[0]
    output_path  = args[1]
    
    model_path   = args[2]
    model_name   = args[3]

    original_dim = int(args[4])
    hidden_dim   = int(args[5])
    latent_dim   = int(args[6])

    cv_model_path  = 'text-reuse/pipeline/vsh-hashing/cv_model'
    cv_model  = CountVectorizerModel.load(cv_model_path)

    threshold = None
    if latent_dim == 8:
        threshold = np.array([ 0.1457837 ,  0.061413  , -0.03391605,  0.04686656, -0.14745404,
       -0.08641829, -0.04190724, -0.05972087])

    elif latent_dim == 16:
        threshold = np.array([ 0.00231892, -0.00791987,  0.00027306,  0.07018767, -0.07945273,
        0.01763633,  0.01450929,  0.04488222, -0.0289745 ,  0.02851318,
        0.01496754,  0.00133035, -0.00523619, -0.10513094,  0.07906742,
       -0.07930097])

    else:
       threshold = np.array([-0.01227623, -0.00382998, -0.00029179, -0.04484864, -0.02657753,
        0.01505825,  0.00319679, -0.01186464, -0.03057225,  0.02324941,
        0.01272652, -0.01289577, -0.02995954,  0.04656317, -0.01781761,
       -0.01934269,  0.1332021 ,  0.00064231,  0.01289176, -0.00131864,
        0.02279386, -0.06245026, -0.02096441,  0.01817522,  0.02722896,
        0.0211685 ,  0.01392594, -0.06448705,  0.00062385,  0.02365676,
       -0.01207885,  0.02566718])


    vdsh_loader = VSH.VDSHLoader(model_path, model_name, threshold, original_dim, hidden_dim, latent_dim)

    df = sc.pickleFile(ds_path).toDF()

    tfidf_df  = tfidf(df, cv_model, 'paragraph', 'tfidf')
    tfidf_rdd = tfidf_df.rdd.repartition(8000)

    tfidf_rdd = tfidf_rdd.mapPartitions(lambda p: hash_partition(p, vdsh_loader))
    tfidf_rdd.saveAsPickleFile(output_path)
    
Example no. 14
def main(context):
    """Main function takes a Spark SQL context."""
    comments_df = context.read.parquet("comments.parquet")
    submissions_df = context.read.parquet("submissions.parquet")
    labeled_data_df = context.read.parquet("labeled_data.parquet")

    if path.exists("df_label.parquet"):
        labeled_df = context.read.parquet("df_label.parquet")
        comments_df = cleanedCommentsDF(comments_df)

    else:
        labeled_df = createLabeledDF(comments_df, labeled_data_df)
        comments_df = cleanedCommentsDF(comments_df)

    if path.exists("cvModel"):
        cvModel = CountVectorizerModel.load("cvModel")
        posModel = CrossValidatorModel.load("pos.model")
        negModel = CrossValidatorModel.load("neg.model")
    else:
        cvModel, posModel, negModel = train(labeled_df)

    # the "final join" without actually joining!
    output = cvModel.transform(comments_df)
    output = output.drop('score')
    output = output.drop('ngrams_combined')
    output = output.drop('link_id_cleaned')
    posResult = posModel.transform(output)
    posResult = posResult.drop('rawPrediction')
    posResult = posResult.drop('prediction')
    posResult = posResult.withColumnRenamed('probability', 'pos_prob')
    fullResult = negModel.transform(posResult)
    fullResult = fullResult.withColumnRenamed('probability', 'neg_prob')
    fullResult = fullResult.drop('rawPrediction')
    fullResult = fullResult.drop('prediction')
    fullResult = fullResult.withColumn(
        'neg',
        when(get_probability_udf(fullResult.neg_prob) > 0.25, 1).otherwise(0))
    fullResult = fullResult.withColumn(
        'pos',
        when(get_probability_udf(fullResult.pos_prob) > 0.2, 1).otherwise(0))
    fullResult.write.parquet("resulting_df.parquet")
    #fullResult_df = context.read.parquet("resulting_df.parquet")

    print(fullResult.count())
Example no. 15
    def test_java_params(self):
        import re

        import pyspark.ml.feature
        import pyspark.ml.classification
        import pyspark.ml.clustering
        import pyspark.ml.evaluation
        import pyspark.ml.pipeline
        import pyspark.ml.recommendation
        import pyspark.ml.regression

        modules = [
            pyspark.ml.feature,
            pyspark.ml.classification,
            pyspark.ml.clustering,
            pyspark.ml.evaluation,
            pyspark.ml.pipeline,
            pyspark.ml.recommendation,
            pyspark.ml.regression,
        ]
        for module in modules:
            for name, cls in inspect.getmembers(module, inspect.isclass):
                if (not name.endswith("Model") and not name.endswith("Params")
                        and issubclass(cls, JavaParams)
                        and not inspect.isabstract(cls)
                        and not re.match("_?Java", name) and name != "_LSH"
                        and name != "_Selector"):
                    check_params(self, cls(), check_params_exist=True)

        # Additional classes that need explicit construction
        from pyspark.ml.feature import CountVectorizerModel, StringIndexerModel

        check_params(self,
                     CountVectorizerModel.from_vocabulary(["a"], "input"),
                     check_params_exist=True)
        check_params(self,
                     StringIndexerModel.from_labels(["a", "b"], "input"),
                     check_params_exist=True)
Example no. 16
from pyspark.sql import SQLContext, Row
from pyspark.ml.feature import CountVectorizer, CountVectorizerModel
from pyspark.ml.linalg import Vector, Vectors
import numpy as np

from pyspark.sql.types import DoubleType
from pyspark.sql import functions as F

from pyspark.ml.clustering import LDA, LocalLDAModel
from pyspark.sql.types import *
from pyspark.sql.functions import udf

ldaModel_path = "lda_model"
ldaModel = LocalLDAModel.load(ldaModel_path)
model_path = "token"
model = CountVectorizerModel.load(model_path)

l_test = [(1, "I f*****g hate covid-19")]


def test(text, ldaModel, model):
    rdd_ = sc.parallelize(text)
    data = rdd_.map(lambda kv: Row(idd=kv[0], Text=kv[1].split(" ")))
    docDF = spark.createDataFrame(data)
    result = model.transform(docDF)

    corpus = result.select("idd", "vectors").rdd.map(lambda xy: [
        xy[0], Vectors.sparse(xy[1].size, xy[1].indices, xy[1].values)
    ]).cache()
    columns = ['id', 'features']
    corpus = corpus.toDF(columns)
Example no. 17
def main(sqlContext):
    """Main function takes a Spark SQL context."""
    # YOUR CODE HERE
    # YOU MAY ADD OTHER FUNCTIONS AS NEEDED

    # load files
    label = sqlContext.read.load("labeled_data.csv",
                                 format="csv",
                                 sep=",",
                                 inferSchema="true",
                                 header="true")
    if (flag):
        comments = sqlContext.read.json("comments-minimal.json.bz2")
        submissions = sqlContext.read.json("submissions.json.bz2")
        print("loading done")
        comments.write.parquet("comments_data")
        submissions.write.parquet("submissions_data")
        print("writing done")
    else:
        comments = sqlContext.read.parquet("comments")
        submissions = sqlContext.read.parquet("submissions")
        print("loading done")
    comments.show()
    exit()
    if (save):
        # task 7 starts here
        associated = join(comments, label)
        withngrams = associated.withColumn("ngrams",
                                           makeNgrams_udf(associated['body']))
        withplabels = withngrams.withColumn("poslabel",
                                            pLabel_udf(withngrams['labeldjt']))
        withpnlabels = withplabels.withColumn(
            "neglabel", nLabel_udf(withplabels['labeldjt'])).select(
                "id", "ngrams", "poslabel", "neglabel")
        # withpnlabels.show()
        cv = CountVectorizer(binary=True,
                             inputCol="ngrams",
                             outputCol="features")
        model = cv.fit(withpnlabels)
        model.save("cv.model")
        # model.transform(withpnlabels).show()
        pos = model.transform(withpnlabels).select(
            "id",
            col("poslabel").alias("label"), "features")
        neg = model.transform(withpnlabels).select(
            "id",
            col("neglabel").alias("label"), "features")
        # pos.show()
        # neg.show()
        poslr = LogisticRegression(labelCol="label",
                                   featuresCol="features",
                                   maxIter=10)
        neglr = LogisticRegression(labelCol="label",
                                   featuresCol="features",
                                   maxIter=10)
        posEvaluator = BinaryClassificationEvaluator()
        negEvaluator = BinaryClassificationEvaluator()
        posParamGrid = ParamGridBuilder().addGrid(poslr.regParam,
                                                  [1.0]).build()
        negParamGrid = ParamGridBuilder().addGrid(neglr.regParam,
                                                  [1.0]).build()
        posCrossval = CrossValidator(estimator=poslr,
                                     evaluator=posEvaluator,
                                     estimatorParamMaps=posParamGrid,
                                     numFolds=2)  # for test
        negCrossval = CrossValidator(estimator=neglr,
                                     evaluator=negEvaluator,
                                     estimatorParamMaps=negParamGrid,
                                     numFolds=2)  # for test
        posTrain, posTest = pos.randomSplit([0.5, 0.5])
        negTrain, negTest = neg.randomSplit([0.5, 0.5])
        print("Training positive classifier...")
        posModel = posCrossval.fit(posTrain)
        print("Training negative classifier...")
        negModel = negCrossval.fit(negTrain)
        posModel.save("pos.model")
        negModel.save("neg.model")
        print("trained")
    else:
        # comments.show()
        # submissions.show()
        posModel = CrossValidatorModel.load("pos.model")
        negModel = CrossValidatorModel.load("neg.model")
        model = CountVectorizerModel.load("cv.model")
        # withngrams = comments.withColumn("ngrams", makeNgrams_udf(comments['body']))
        # cv = CountVectorizer(binary=True, inputCol="ngrams", outputCol="features")
        # model = cv.fit(withngrams)
        print("model loaded")

        if (predict == 0):
            # task 8 starts here
            temp_comments = comments.select("id", "link_id",
                                            "author_flair_text", "created_utc",
                                            "body")
            clean_comments = temp_comments.withColumn(
                "true_id", getLinkid_udf(temp_comments['link_id']))
            # print(clean_comments.count())
            clean_submissions = submissions.select(
                col("id").alias("sub_id"), "title")
            # clean_comments.show()
            # clean_submissions.show()
            com_sub = clean_comments.join(
                clean_submissions,
                clean_comments.true_id == clean_submissions.sub_id, "inner")
            com_sub.write.parquet("com_sub")
        else:
            # task 9 starts here
            com_sub = sqlContext.read.parquet("com_sub")
            com_sub = com_sub.sample(False, 0.0001, None)
            filtered = com_sub.filter(
                "body NOT LIKE '%/s%' and body NOT LIKE '&gt;%'")
            # print(filtered.count())
            filtered_ngrams = filtered.withColumn(
                "ngrams", makeNgrams_udf(filtered['body']))
            # filtered_ngrams = filtered_ngrams.sample(False, 0.01, None)
            print("prepared")
            featuredata = model.transform(filtered_ngrams).select(
                "id", "author_flair_text", "created_utc", "sub_id", "title",
                "features")
            posResult = posModel.transform(featuredata)
            negResult = negModel.transform(featuredata)
            # posResult.show()
            # negResult.show()
            poslabel = posResult.withColumn(
                "positive", posTh_udf(posResult['probability'])
            )  # .select("id", "author_flair_text", "created_utc", "title", "positive")
            neglabel = negResult.withColumn(
                "negtive", negTh_udf(negResult['probability'])
            )  # .select(col("id").alias("nid"), "author_flair_text", "created_utc", "title", "negtive")
            print("predict done")
            # poslabel.show()
            # neglabel.show()
            # how to combine these 2 tables???

            # task 10 starts here
            # c_all = poslabel.count()
            all_day = poslabel.withColumn(
                "date",
                from_unixtime('created_utc').cast(
                    DateType())).groupby("date").count()
            pos_posts = poslabel.filter("positive = 1")
            # c_pos_posts = pos_posts.count()
            # p_pos_posts = c_pos_posts/c_all
            # print(p_pos_posts)
            # neg_posts = neglabel.filter("negtive = 1")
            # c_neg_posts = neg_posts.count()
            # p_neg_posts = c_neg_posts/c_all
            # print(p_neg_posts)
            pos_day = pos_posts.withColumn(
                "pos_date",
                from_unixtime('created_utc').cast(
                    DateType())).groupby("pos_date").count().withColumnRenamed(
                        "count", "pos_count")
            p_pos_day = all_day.join(pos_day, all_day.date == pos_day.pos_date,
                                     "left").withColumn(
                                         "pos_per", col("pos_count") / col("count"))
            p_pos_day.show()

            print("end")
Example no. 18
def read_parquet(parquet_path):
    parquet_df = spark.read.parquet(parquet_path)

    parquet_df = parquet_df.drop('id')
    parquet_df = parquet_df.drop('one_area_price')
    parquet_df = parquet_df.drop('agency_nameVec')
    parquet_df = parquet_df.drop('districtVec')
    parquet_df = parquet_df.drop('room_type')
    parquet_df.show(truncate=False)
    print('parquet_df.count()==========11', parquet_df.count(),
          parquet_df.columns)
    for i in parquet_df.columns:
        if ('Vec' not in i) & ('facilities_vectors' not in i):

            if parquet_df.filter(parquet_df[i].isNull()).count() > 0:

                parquet_df = parquet_df.na.fill(0, i)
            elif parquet_df.filter(parquet_df[i] == 'NULL').count() > 0:

                parquet_df = parquet_df.filter(parquet_df[i] != 'NULL')

            parquet_df = parquet_df.select(
                '*', parquet_df[i].cast('float').alias('tmp_name')).drop(i)
            parquet_df = parquet_df.withColumnRenamed('tmp_name', i)
        parquet_df = parquet_df.filter(parquet_df[i].isNotNull())
        print('parquet_df.count()==========22', i, parquet_df.count())

    columns = parquet_df.columns
    columns.remove('price')
    from pyspark.ml.feature import OneHotEncoder, StringIndexer, StringIndexerModel
    from pyspark.ml.feature import CountVectorizer, CountVectorizerModel
    model_path = "/user/limeng/ganji_daxing_save_models/"
    columns_list = []
    for i in columns:
        if i == 'facilities_vectors':
            loadedCountVectorizerModel = CountVectorizerModel.load(
                model_path + 'count-vectorizer-model')
            temp = loadedCountVectorizerModel.vocabulary
            columns_list.extend(temp)
        elif i == 'rent_typeVec':
            loadedStringIndexerModel = StringIndexerModel.load(
                model_path + 'stringIndexer_modelrent_type')
            temp = loadedStringIndexerModel.labels
            columns_list.extend(temp)
        elif i == 'agency_nameVec':
            loadedStringIndexerModel = StringIndexerModel.load(
                model_path + 'stringIndexer_modelagency_name')
            temp = loadedStringIndexerModel.labels
            columns_list.extend(temp)
        elif i == 'directionVec':
            loadedStringIndexerModel = StringIndexerModel.load(
                model_path + 'stringIndexer_modeldirection')
            temp = loadedStringIndexerModel.labels
            columns_list.extend(temp)
        elif i == 'zoneVec':
            loadedStringIndexerModel = StringIndexerModel.load(
                model_path + 'stringIndexer_modelzone')
            temp = loadedStringIndexerModel.labels
            columns_list.extend(temp)
        elif i == 'pay_typeVec':
            loadedStringIndexerModel = StringIndexerModel.load(
                model_path + 'stringIndexer_modelpay_type')
            temp = loadedStringIndexerModel.labels
            columns_list.extend(temp)
        elif i == 'districtVec':
            loadedStringIndexerModel = StringIndexerModel.load(
                model_path + 'stringIndexer_modeldistrict')
            temp = loadedStringIndexerModel.labels
            columns_list.extend(temp)
        else:
            columns_list.append(i)

    vecAssembler = VectorAssembler(inputCols=columns, outputCol="features")
    parquet_df = vecAssembler.transform(parquet_df).select('features', 'price')

    parquet_df = parquet_df.withColumnRenamed('price', 'label')

    return parquet_df, columns_list
Example no. 19
# TF-IDF
# term frequency (TF)
from pyspark.ml.feature import CountVectorizer

# vocabSize is the maximum vocabulary size; minDF is the minimum number of documents a term must appear in
cv = CountVectorizer(inputCol="words",
                     outputCol="countFeatures",
                     vocabSize=200 * 10000,
                     minDF=1.0)
# Train the term-frequency model
cv_model = cv.fit(words_df)
cv_model.write().overwrite().save("models/CV.model")

from pyspark.ml.feature import CountVectorizerModel

cv_model = CountVectorizerModel.load("models/CV.model")
# Produce the term-frequency vector result
cv_result = cv_model.transform(words_df)

# idf
from pyspark.ml.feature import IDF

idf = IDF(inputCol="countFeatures", outputCol="idfFeatures")
idf_model = idf.fit(cv_result)
idf_model.write().overwrite().save("models/IDF.model")

# tf-idf
from pyspark.ml.feature import IDFModel

idf_model = IDFModel.load("models/IDF.model")
tfidf_result = idf_model.transform(cv_result)
Example no. 20
    for word in s:
        if word not in stopwords and word != '\r':
            wordlist.append(word)
    texts[len(texts) - 1].append(count)
    texts[len(texts) - 1].append(wordlist)
    count += 1

spark = SparkSession.builder.appName("dataFrame").getOrCreate()
df = spark.createDataFrame(texts, ["id", "words"])

# sc =SparkContext()
# dataset = sc.parallelize(texts)
# dataset = dataset.zipWithIndex()
#cv=CountVectorizer(inputCol="words",outputCol="features",vocabSize=1000,minDF=2.0)

model = CountVectorizerModel.load("hdfs:/User/" + user + "/" + user +
                                  "_cv.model")
result = model.transform(df)
#result.show()
voclist = model.vocabulary

# for x in df.collect():
# 	print("#######################################################")
# 	print(x)
#lda = LDA(k=3,maxIter=10)
#lda.save("hdfs:/"+user+"_text.model")
# print("ss")
#ldamodel =lda.fit(result)
ldamodel = LocalLDAModel.load("hdfs:/User/" + user + "/" + user + "_lda.model")
# ll=model.logLikelihood(dataset)
# lp=model.logPerplexity(dataset)
# print("ll"+str(ll))
Example no. 21
def main(context):
    """Main function takes a Spark SQL context."""
    # YOUR CODE HERE
    # YOU MAY ADD OTHER FUNCTIONS AS NEEDED

    start = time.time()
    # task 1
    if(read_raw):
        comments = sqlContext.read.json('comments-minimal.json.bz2')
        submissions = sqlContext.read.json('submissions.json.bz2')
        label = sqlContext.read.load('labeled_data.csv', format = 'csv', sep = ',',header="true")
        print("load done")
        comments.write.parquet('comments')
        submissions.write.parquet('submissions')
        label.write.parquet('label')
    else:
        comments = context.read.load('comments')
        submissions = context.read.load('submissions')
        label = context.read.load('label')
    print("task 1 complete: read data")
    #result.show()

    if(training):
        # task 2
        associate = associated(comments, label).select(col('id'), col('body'), col('labeldjt'))
        print("task 2 complete: associate data")

        # task 4, 5
        newColumn = associate.withColumn('ngrams', sanitize_udf(associate['body']))
        print("task 4, 5 complete: generate unigrams")

        # task 6A
        cv = CountVectorizer(inputCol = 'ngrams', outputCol = "features", binary = True)
        model = cv.fit(newColumn)
        tmp = model.transform(newColumn)
        print("task 6A complete: cv model")

        # task 6B
        result = tmp.withColumn('poslabel', F.when(col('labeldjt') == 1, 1).otherwise(0))
        result = result.withColumn('neglabel', F.when(col('labeldjt') == -1, 1).otherwise(0))
        pos = result.select(col('poslabel').alias('label'), col('features'))
        neg = result.select(col('neglabel').alias('label'), col('features'))
        print("task 6B complete: relabel data")

        # task 7
        # Initialize two logistic regression models.
        # Replace labelCol with the column containing the label, and featuresCol with the column containing the features.
        poslr = LogisticRegression(labelCol = "label", featuresCol = "features", maxIter = 10)
        neglr = LogisticRegression(labelCol = "label", featuresCol = "features", maxIter = 10)
        # This is a binary classifier so we need an evaluator that knows how to deal with binary classifiers.
        posEvaluator = BinaryClassificationEvaluator()
        negEvaluator = BinaryClassificationEvaluator()
        # There are a few parameters associated with logistic regression. We do not know what they are a priori.
        # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try.
        # We will assume the parameter is 1.0. Grid search takes forever.
        posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
        negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
        # We initialize a 5 fold cross-validation pipeline.
        posCrossval = CrossValidator(
            estimator = poslr,
            evaluator = posEvaluator,
            estimatorParamMaps = posParamGrid,
            numFolds = 5)
        negCrossval = CrossValidator(
            estimator = neglr,
            evaluator = negEvaluator,
            estimatorParamMaps = negParamGrid,
            numFolds = 5)
        # Although crossvalidation creates its own train/test sets for
        # tuning, we still need a labeled test set, because it is not
        # accessible from the crossvalidator (argh!)
        # Split the data 50/50
        posTrain, posTest = pos.randomSplit([0.5, 0.5])
        negTrain, negTest = neg.randomSplit([0.5, 0.5])

        # Train the models
        print("Training positive classifier...")
        posModel = posCrossval.fit(posTrain)
        print("Training negative classifier...")
        negModel = negCrossval.fit(negTrain)
        # Once we train the models, we don't want to do it again. We can save the models and load them again later.
        posModel.save("pos.model")
        negModel.save("neg.model")
        model.save("cv.model")
        print("task 7 complete: training")

        # posModel = CrossValidatorModel.load('pos.model')
        # negModel = CrossValidatorModel.load('neg.model')

        # point 7
        pos_trans = posModel.transform(posTest)
        neg_trans = negModel.transform(negTest)

        pos_results = pos_trans.select(['probability', 'label'])
        pos_trans_collect = pos_results.collect()
        pos_trans_results_list = [(float(i[0][0]), 1.0-float(i[1])) for i in pos_trans_collect]
        pos_scoreAndLabels = sc.parallelize(pos_trans_results_list)

        pos_metrics = metric(pos_scoreAndLabels)
        print("The ROC score of positive results is: ", pos_metrics.areaUnderROC)

        neg_results = neg_trans.select(['probability', 'label'])
        neg_trans_collect = neg_results.collect()
        neg_trans_results_list = [(float(i[0][0]), 1.0-float(i[1])) for i in neg_trans_collect]
        neg_scoreAndLabels = sc.parallelize(neg_trans_results_list)

        neg_metrics = metric(neg_scoreAndLabels)
        print("The ROC score of negative results is: ", neg_metrics.areaUnderROC)

        plot_ROC(pos_trans_results_list, 'positive_results')
        plot_ROC(neg_trans_results_list, 'negative_results')
        print("point 7 complete: ROC")

    else:
        model = CountVectorizerModel.load('cv.model')
        posModel = CrossValidatorModel.load('pos.model')
        negModel = CrossValidatorModel.load('neg.model')
        print("model loaded")

        # task 8
        comments_tmp = comments.select(col('id'), col('link_id'), col('created_utc'), col('body'), col('author_flair_text'), col('score').alias('com_score'))
        comments_full = comments_tmp.withColumn('link_id', process_id_udf(comments_tmp['link_id']))
        submissions_full = submissions.select(col('id').alias('sub_id'), col('title'), col('score').alias('sub_score'))

        if(joinFull):
            com_sub = comments_full.join(submissions_full, comments_full.link_id == submissions_full.sub_id, 'inner')
            com_sub = com_sub.select(col('id'), col('title'), col('link_id'), col('created_utc'), col('body'), col('author_flair_text'), col('com_score'), col('sub_score'))
            com_sub.write.parquet('com_sub')
        else:
            com_sub = context.read.load('com_sub')# .sample(False, 0.01, None)
        print('task 8 complete: comment with submission')

        # task 9
        filtered = com_sub.filter("body NOT LIKE '%/s%' and body NOT LIKE '&gt;%'")
        filtered_result = filtered.withColumn('ngrams', sanitize_udf(filtered['body']))
        feaResult = model.transform(filtered_result).select(col('id'), col('link_id'), col('created_utc'), \
                                    col('features'), col('author_flair_text'), col('com_score'), col('sub_score'), col('title'))
        posResult = posModel.transform(feaResult)
        negResult = negModel.transform(feaResult)
        print("transformed")

        pos = posResult.withColumn('pos', threshold_pos_udf(posResult['probability'])).select('id', 'created_utc', 'author_flair_text', 'pos', 'com_score', 'sub_score', 'title')
        neg = negResult.withColumn('neg', threshold_neg_udf(negResult['probability'])).select('id', 'created_utc', 'author_flair_text', 'neg', 'com_score', 'sub_score', 'title')
        #final_probs = pos.join(neg, pos.id == neg.id_neg, 'inner').select('id', 'created_utc', 'author_flair_text', 'title', 'pos', 'neg')
        #final_probs.show()
        #pos.write.parquet('pos')
        #neg.write.parquet('neg')
        print('task 9 complete: predict')

        # task 10
        # compute 1
        num_rows = pos.count()
        pos_filtered = pos.filter(pos.pos == 1)
        neg_filtered = neg.filter(neg.neg == 1)
        num_pos = pos_filtered.count()
        num_neg = neg_filtered.count()

        print('Percentage of positive comments: {}'.format(num_pos / num_rows))
        print('Percentage of negative comments: {}'.format(num_neg / num_rows))
        print('finish compute 1')

        # compute 2
        pos_time = pos.withColumn('time', F.from_unixtime(col('created_utc')).cast(DateType()))
        neg_time = neg.withColumn('time', F.from_unixtime(col('created_utc')).cast(DateType()))

        num_pos_time = pos_time.groupBy('time').agg((F.sum('pos') / F.count('pos')).alias('Percentage of positive')).orderBy('time')
        num_neg_time = neg_time.groupBy('time').agg((F.sum('neg') / F.count('neg')).alias('Percentage of negative')).orderBy('time')

        num_pos_time.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('num_pos_time')
        num_neg_time.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('num_neg_time')
        print('finish compute 2')

        # compute 3
        state = sqlContext.createDataFrame(states, StringType())
        pos_state = pos.groupBy('author_flair_text').agg((F.sum('pos') / F.count('pos')).alias('Percentage of positive'))
        neg_state = neg.groupBy('author_flair_text').agg((F.sum('neg') / F.count('neg')).alias('Percentage of negative'))

        pos_state = pos_state.join(state, pos_state.author_flair_text == state.value, 'inner')
        pos_state = pos_state.na.drop(subset=['value'])
        pos_state = pos_state.select(col('author_flair_text').alias('state'), col('Percentage of positive').alias('Positive'))

        neg_state = neg_state.join(state, neg_state.author_flair_text == state.value, 'inner')
        neg_state = neg_state.na.drop(subset=['value'])
        neg_state = neg_state.select(col('author_flair_text').alias('state'), col('Percentage of negative').alias('Negative'))

        pos_state.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('pos_state')
        neg_state.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('neg_state')
        print('finish compute 3')

        # compute 4
        pos_com_score = pos.groupBy('com_score').agg((F.sum('pos') / F.count('pos')).alias('Percentage of positive')).orderBy('com_score')
        pos_sub_score = pos.groupBy('sub_score').agg((F.sum('pos') / F.count('pos')).alias('Percentage of positive')).orderBy('sub_score')
        neg_com_score = neg.groupBy('com_score').agg((F.sum('neg') / F.count('neg')).alias('Percentage of negative')).orderBy('com_score')
        neg_sub_score = neg.groupBy('sub_score').agg((F.sum('neg') / F.count('neg')).alias('Percentage of negative')).orderBy('sub_score')

        pos_com_score.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('pos_com_score')
        pos_sub_score.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('pos_sub_score')
        neg_com_score.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('neg_com_score')
        neg_sub_score.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('neg_sub_score')
        print('finish compute 4')

        # compute 5
        pos_story = pos.groupBy('title').agg((F.sum('pos') / F.count('pos')).alias('Percentage of positive')).orderBy(F.desc('Percentage of positive')).limit(10)
        neg_story = neg.groupBy('title').agg((F.sum('neg') / F.count('neg')).alias('Percentage of negative')).orderBy(F.desc('Percentage of negative')).limit(10)

        pos_story.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('pos_story')
        neg_story.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('neg_story')
        print('finish compute 5')

        end = time.time()
        print('time consumed: {}'.format(end - start))
Example no. 22
def loadData(data_ingestion,
             train_cv=1,
             binarize=True,
             minDF=3,
             TFIDF_b=False,
             PCA_b=False,
             PCA_k=1000):
    if train_cv:
        # we train cv...
        cv_model = CountVectorizer(inputCol='words',
                                   outputCol='X',
                                   minDF=minDF)
    else:
        # we load cv !
        cv = CountVectorizerModel.load(cvModelPath)

    tokenizer = Tokenizer(inputCol="comment", outputCol="words")

    # Creation of an empty DataFrame
    field1 = StructField('score', IntegerType(), True)
    field2 = StructField('X', VectorUDT(), True)

    fields = []
    fields.append(field1)
    fields.append(field2)

    schema = StructType(fields)

    X = spark.createDataFrame(sc.emptyRDD(), schema)

    # Ingestion par fichier
    for filePath in data_ingestion:
        file = sc.textFile(filePath)
        data = file.map(lambda line: line.split("\t")).toDF()
        data = data.withColumnRenamed('_2', 'comment')

        data = data.withColumnRenamed('_1', 'score')
        data = data.withColumn('score', data['score'].cast(IntegerType()))

        data = tokenizer.transform(data)

        if train_cv:
            cv = cv_model.fit(data)

        data = cv.transform(data)

        X = X.union(data.select('score', 'X'))

    try:
        shutil.rmtree(cvModelPath, ignore_errors=True)
    except:
        pass

    cv.save(cvModelPath)

    if binarize:
        X_1 = X.where((X.score == 4) | (X.score == 5)).withColumn(
            'score', lit(1))
        X_0 = X.where((X.score == 0) | (X.score == 1) | (X.score == 2)
                      | (X.score == 3)).withColumn('score', lit(0))
        X = X_1.union(X_0)

    if TFIDF_b:
        idf = IDF(inputCol="X", outputCol="X_TFIDF")
        model = idf.fit(X)
        X = model.transform(X)
        X = X.select('score', 'X_TFIDF')
        X = X.withColumnRenamed('X_TFIDF', 'X')

    if PCA_b:
        pca = PCA(k=PCA_k, inputCol="X", outputCol="X_PCA")
        model = pca.fit(X)
        X = model.transform(X)
        X = X.select('score', 'X_PCA')
        X = X.withColumnRenamed('X_PCA', 'X')

    return (X)
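A hedged usage sketch; the file path is hypothetical and cvModelPath, spark and sc are assumed to be defined globally as in the rest of the script.

# Hypothetical call: builds the (score, X) training DataFrame from a list of
# tab-separated files, binarizes the scores and applies TF-IDF weighting.
X = loadData(["data/reviews_part1.tsv"],
             train_cv=1, binarize=True, minDF=3, TFIDF_b=True)
X.show(5)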
Example no. 23
    for row in partition:
        sentence = re.sub("<.*?>", "", row.sentence)  # strip out tag markup
        words = cut_sentence(sentence)
        yield row.article_id, row.channel_id, words


# word segmentation
sqlContext.sql("use article")
articleDF = sqlContext.sql("select * from article_data")

# words_df = article_dataframe.rdd.mapPartitions(segmentation).toDF(["article_id", "channel_id", "words"])
wordsDF = articleDF.rdd.mapPartitions(segmentation, 5).toDF(
    ["article_id", "channel_id", "words"])

cv_model = CountVectorizerModel.load(
    "D:/WorkSpace/ToutiaoRecommenderWorkSpace/toutiao_project/reco_sys/output/CV.model"
)
idf_model = IDFModel.load(
    "D:/WorkSpace/ToutiaoRecommenderWorkSpace/toutiao_project/reco_sys/output/IDF.model"
)

cv_result = cv_model.transform(wordsDF)
tfidf_result = idf_model.transform(cv_result)


def func(partition):
    TOPK = 20
    for row in partition:
        # pair the vector indices with their IDF values and sort by weight
        _ = list(zip(row.idfFeatures.indices, row.idfFeatures.values))
        _ = sorted(_, key=lambda x: x[1], reverse=True)
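The snippet is cut off here; below is a hedged sketch of how such a partition function might be completed and applied, with keywords_by_tfidf and the output column names as hypothetical choices.

# Hypothetical completion: keep the TOPK highest-weight terms per article and
# map the vector indices back to words through the CountVectorizer vocabulary.
def keywords_by_tfidf(partition, vocabulary, topk=20):
    for row in partition:
        pairs = list(zip(row.idfFeatures.indices, row.idfFeatures.values))
        pairs = sorted(pairs, key=lambda x: x[1], reverse=True)[:topk]
        for idx, weight in pairs:
            yield row.article_id, row.channel_id, vocabulary[int(idx)], float(weight)

vocab = cv_model.vocabulary
keywords_df = tfidf_result.rdd.mapPartitions(
    lambda p: keywords_by_tfidf(p, vocab)).toDF(
        ["article_id", "channel_id", "keyword", "weight"])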
Example no. 24
    def multionehot(self, df, column):
        """
        // Prepare training documents from a list of (id, text, label) tuples.
        val data = spark.createDataFrame(Seq(
          (0L, Seq("A", "B")),
          (1L, Seq("B")),
          (2L, Seq.empty),
          (3L, Seq("D", "E"))
        )).toDF("id", "categories")

        // Get distinct tags array
        val tags = data
          .flatMap(r ⇒ r.getAs[Seq[String]]("categories"))
          .distinct()
          .collect()
          .sortWith(_ < _)

        val cvmData = new CountVectorizerModel(tags)
          .setInputCol("categories")
          .setOutputCol("sparseFeatures")
          .transform(data)

        val asDense = udf((v: Vector) ⇒ v.toDense)

        cvmData
          .withColumn("features", asDense($"sparseFeatures"))
          .select("id", "categories", "features")
          .show()

        :param df:
        :param column:
        :return:
        """
        df.select(column).show()
        # Collect the distinct category values to use as the model vocabulary
        categories = list(
            set(
                df.select(column).distinct().rdd.flatMap(
                    lambda x: x[0] if x is not None else None).collect()))
        categories.sort(reverse=False)
        print(categories)
        cvm = CountVectorizerModel.from_vocabulary(categories,
                                                   inputCol=column,
                                                   outputCol=column +
                                                   "_sparse_vec").transform(df)
        cvm.show()

        @udf(ArrayType(IntegerType()))
        def toDense(v):
            # Densify the sparse multi-hot vector into a plain list of ints
            return [int(x) for x in v.toArray()]

        result = cvm.withColumn('features_vec',
                                toDense(column + "_sparse_vec"))
        result = result.drop(column + "_sparse_vec")

        return result
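A short, hedged sketch of how multionehot might be invoked, mirroring the Scala snippet quoted in the docstring; encoder stands for an instance of the class that defines the method, and spark for an active SparkSession.

# Hypothetical invocation: each row carries a list of categories; the result
# gains a dense integer multi-hot column named "features_vec".
data = spark.createDataFrame(
    [(0, ["A", "B"]), (1, ["B"]), (2, []), (3, ["D", "E"])],
    ["id", "categories"])
encoded = encoder.multionehot(data, "categories")
encoded.select("id", "categories", "features_vec").show(truncate=False)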
Example no. 25
def main(context):

    # dem(context)
    # gop(context)

    # SAVED PARQUETS
    # comments is the comments-minimal.json
    # submissions is the submissions.json
    # task7 is the result of the count vectorizer
    # commentsFull is the comments-minimal.json joined with submissions with the sarcasm removed and the &gt removed

    #TASK 1

    # Read from JSON
    #comments = sqlContext.read.json("comments-minimal.json.bz2")
    #comments.registerTempTable("commentsTable")
    #submissions = sqlContext.read.json("submissions.json.bz2")
    #submissions.registerTempTable("submissionsTable")

    # Write the Parquets
    #comments.write.parquet("comments.parquet")
    #submissions.write.parquet("submissions.parquet")

    # Read the parquets
    comments = sqlContext.read.parquet("comments.parquet")
    comments.registerTempTable("commentsTable")
    submissions = sqlContext.read.parquet("submissions.parquet")
    submissions.registerTempTable("submissionsTable")

    # Read the CSV
    labels = sqlContext.read.format('csv').options(header='true', inferSchema='true').load("labeled_data.csv")
    labels.registerTempTable("labelsTable")

    #TASK 2
    dfTask2 = sqlContext.sql("SELECT commentsTable.* FROM commentsTable INNER JOIN labelsTable ON commentsTable.id = labelsTable.Input_id")

    #TASK 4 and TASK 5
    def do_something(text):
        return parser.sanitize(text)

    udf_func = udf(do_something, ArrayType(StringType()))
    dfTask4 = dfTask2.withColumn("udf_results", udf_func(col("body")))

    #TASK 6A and Task 6B
    if(not os.path.exists("cvModel")):
        cv = CountVectorizer(inputCol="udf_results", outputCol="features", binary=True, minDF=5.0)
        model = cv.fit(dfTask4)
        model.write().overwrite().save("cvModel")

    model = CountVectorizerModel.load("cvModel")
    dfTask6A = model.transform(dfTask4)
    dfTask6A.registerTempTable("dfTask6ATable")
    dfTask6B = sqlContext.sql("SELECT dfTask6ATable.*, IF(labelsTable.labeldjt=1, 1, 0) AS pos_label, if(labelsTable.labeldjt=-1, 1, 0) AS neg_label FROM dfTask6ATable INNER JOIN labelsTable ON dfTask6ATable.id = labelsTable.Input_id")
    dfTask6B.registerTempTable("dfTask6BTable")

    pos = sqlContext.sql('select pos_label as label, features from dfTask6BTable')
    neg = sqlContext.sql('select neg_label as label, features from dfTask6BTable')

    if(not os.path.exists("www/neg.model") or not os.path.exists("www/pos.model")):
        # Initialize two logistic regression models.
        # Replace labelCol with the column containing the label, and featuresCol with the column containing the features.
        poslr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10).setThreshold(0.2)
        neglr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10).setThreshold(0.25)
        # This is a binary classifier so we need an evaluator that knows how to deal with binary classifiers.
        posEvaluator = BinaryClassificationEvaluator()
        negEvaluator = BinaryClassificationEvaluator()
        # There are a few parameters associated with logistic regression. We do not know what they are a priori.
        # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try.
        # We will assume the parameter is 1.0. Grid search takes forever.
        posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
        negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
        # We initialize a 5 fold cross-validation pipeline.
        posCrossval = CrossValidator(
            estimator=poslr,
            evaluator=posEvaluator,
            estimatorParamMaps=posParamGrid,
            numFolds=2)
        negCrossval = CrossValidator(
            estimator=neglr,
            evaluator=negEvaluator,
            estimatorParamMaps=negParamGrid,
            numFolds=2)
        # Although crossvalidation creates its own train/test sets for
        # tuning, we still need a labeled test set, because it is not
        # accessible from the crossvalidator (argh!)
        # Split the data 50/50
        posTrain, posTest = pos.randomSplit([0.5, 0.5])
        negTrain, negTest = neg.randomSplit([0.5, 0.5])
        # Train the models
        print("Training positive classifier...")
        posModel = posCrossval.fit(posTrain)
        print("Training negative classifier...")
        negModel = negCrossval.fit(negTrain)

        # Once we train the models, we don't want to do it again. We can save the models and load them again later.
        posModel.write().overwrite().save("www/pos.model")
        negModel.write().overwrite().save("www/neg.model")

    # TO LOAD BACK IN
    posModel = CrossValidatorModel.load("www/pos.model")
    negModel = CrossValidatorModel.load("www/neg.model")

    # Task 8
    dfTask8 = sqlContext.sql('SELECT commentsTable.id, commentsTable.body, commentsTable.created_utc, commentsTable.author_flair_text, submissionsTable.title, commentsTable.score AS comment_score, submissionsTable.score AS story_score FROM commentsTable INNER JOIN submissionsTable ON RIGHT(commentsTable.link_id, 6)=submissionsTable.id')
    dfTask8 = dfTask8.sample(False, 0.1, None)

    #TASK 4 and TASK 5
    def do_something(text):
        return parser.sanitize(text)

    udf_func = udf(do_something, ArrayType(StringType()))
    dfTask9_1 = dfTask8.withColumn("udf_results", udf_func(col("body")))

    #TASK 6A and Task 6B
    model = CountVectorizerModel.load("cvModel")
    dfTask9_2 = model.transform(dfTask9_1)
    dfTask9_2.registerTempTable("dfTask9_2Table")

    # Task 9
    dfTask9_3 = sqlContext.sql("SELECT * FROM dfTask9_2Table WHERE dfTask9_2Table.body NOT LIKE '%/s%' AND dfTask9_2Table.body NOT LIKE '&gt%'")
    dfTask9_3.registerTempTable("dfTask9_3Table")

    posResult_1 = posModel.transform(dfTask9_3)
    posResult_1.registerTempTable("posResult_1Table")
    posResult_2 = sqlContext.sql("SELECT posResult_1Table.id, posResult_1Table.body, posResult_1Table.author_flair_text, posResult_1Table.created_utc, posResult_1Table.title, posResult_1Table.comment_score, posResult_1Table.story_score, posResult_1Table.features, posResult_1Table.prediction AS pos FROM posResult_1Table")
    finalResult_1 = negModel.transform(posResult_2)
    finalResult_1.registerTempTable("finalResult_1Table")
    finalResult_2 = sqlContext.sql("SELECT finalResult_1Table.id, finalResult_1Table.body, finalResult_1Table.created_utc, finalResult_1Table.author_flair_text, finalResult_1Table.title, finalResult_1Table.comment_score, finalResult_1Table.story_score, finalResult_1Table.pos, finalResult_1Table.prediction AS neg FROM finalResult_1Table")
    finalResult_2.registerTempTable("finalResult_2Table")

    if(not os.path.exists("final.parquet")):
        finalResult_2.write.parquet("final.parquet")

    final = sqlContext.read.parquet("final.parquet")
    final.registerTempTable("finalTable")

    # Task 10
    if(not os.path.exists("question1.csv")):
        question1 = sqlContext.sql("SELECT (100 * sum(pos) / COUNT(*)) AS percent_pos, (100 * sum(neg) / COUNT(*)) AS percent_neg FROM finalTable")
        question1.repartition(1).write.format("com.databricks.spark.csv").option("header", "true").save("question1.csv")

    if(not os.path.exists("question2.csv")):
        question2 = sqlContext.sql("SELECT DATE(from_unixtime(finalTable.created_utc)) AS date, 100*SUM(finalTable.pos)/COUNT(*) AS percent_pos, 100*SUM(finalTable.neg)/COUNT(*) AS percent_neg FROM finalTable GROUP BY date ORDER BY date")
        question2.repartition(1).write.format("com.databricks.spark.csv").option("header", "true").save("question2.csv")

    if(not os.path.exists("question3.csv")):
        question3 = sqlContext.sql("SELECT finalTable.author_flair_text AS place, 100*SUM(finalTable.pos)/COUNT(*) AS percent_pos, 100*SUM(finalTable.neg)/COUNT(*) AS percent_neg FROM finalTable GROUP BY place ORDER BY place")
        question3.repartition(1).write.format("com.databricks.spark.csv").option("header", "true").save("question3.csv")

    if(not os.path.exists("question4_comment.csv")):
        question4_comment = sqlContext.sql("SELECT finalTable.comment_score AS comment_score, 100*SUM(finalTable.pos)/COUNT(*) AS percent_pos, 100*SUM(finalTable.neg)/COUNT(*) AS percent_neg FROM finalTable GROUP BY comment_score ORDER BY comment_score")
        question4_comment.repartition(1).write.format("com.databricks.spark.csv").option("header", "true").save("question4_comment.csv")

    if(not os.path.exists("question4_story.csv")):
        question4_story = sqlContext.sql("SELECT finalTable.story_score AS story_score, 100*SUM(finalTable.pos)/COUNT(*) AS percent_pos, 100*SUM(finalTable.neg)/COUNT(*) AS percent_neg FROM finalTable GROUP BY story_score ORDER BY story_score")
        question4_story.repartition(1).write.format("com.databricks.spark.csv").option("header", "true").save("question4_story.csv")
Esempio n. 26
0
def main(context):

    # Read from JSON
    comments = sqlContext.read.json("comments-minimal.json.bz2")
    comments.registerTempTable("commentsTable")
    submissions = sqlContext.read.json("submissions.json.bz2")
    submissions.registerTempTable("submissionsTable")
    
    # Read the CSV
    labels = sqlContext.read.format('csv').options(header='true', inferSchema='true').load("labeled_data.csv")
    labels.registerTempTable("labelsTable")

    df = sqlContext.sql("SELECT commentsTable.* FROM commentsTable INNER JOIN labelsTable ON commentsTable.id = labelsTable.Input_id")

    # unigrams, bigrams, trigrams
    def unigrams_bigrams_trigrams(text):
        return parsetext.clean_up(text)

    udf_function = udf(unigrams_bigrams_trigrams, ArrayType(StringType()))
    df_2 = df.withColumn("udf_results", udf_function(col("body")))

    # countVectorizer
    if(not os.path.exists("cvModel")):
        cv = CountVectorizer(inputCol="udf_results", outputCol="features", binary=True, minDF=5.0)
        model = cv.fit(df_2)
        model.write().overwrite().save("cvModel")

    model = CountVectorizerModel.load("cvModel")
    df_3A = model.transform(df_2)
    df_3A.registerTempTable("df_3ATable")
    df_3B = sqlContext.sql("SELECT df_3ATable.*, IF(labelsTable.labeldjt=1, 1, 0) AS pos_label, if(labelsTable.labeldjt=-1, 1, 0) AS neg_label FROM df_3ATable INNER JOIN labelsTable ON df_3ATable.id = labelsTable.Input_id")
    df_3B.registerTempTable("df_3BTable")

    pos = sqlContext.sql('select pos_label as label, features from df_3BTable')
    neg = sqlContext.sql('select neg_label as label, features from df_3BTable')

    if(not os.path.exists("www/neg.model") or not os.path.exists("www/pos.model")):
        # Initialize two logistic regression models.
        poslr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10).setThreshold(0.2)
        neglr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10).setThreshold(0.25)
        
        # Binary classifier
        posEvaluator = BinaryClassificationEvaluator()
        negEvaluator = BinaryClassificationEvaluator()

        posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
        negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
        # 2-fold cross-validation pipelines.
        posCrossval = CrossValidator(
            estimator=poslr,
            evaluator=posEvaluator,
            estimatorParamMaps=posParamGrid,
            numFolds=2)
        negCrossval = CrossValidator(
            estimator=neglr,
            evaluator=negEvaluator,
            estimatorParamMaps=negParamGrid,
            numFolds=2)
        
        # Split the data 50/50
        posTrain, posTest = pos.randomSplit([0.5, 0.5])
        negTrain, negTest = neg.randomSplit([0.5, 0.5])
        
        # Train the models
        print("Training positive classifier...")
        posModel = posCrossval.fit(posTrain)
        print("Training negative classifier...")
        negModel = negCrossval.fit(negTrain)

        # Save the models and load them again later.
        posModel.write().overwrite().save("www/pos.model")
        negModel.write().overwrite().save("www/neg.model")

    # TO LOAD BACK IN
    posModel = CrossValidatorModel.load("www/pos.model")
    negModel = CrossValidatorModel.load("www/neg.model")

    
    df_4 = sqlContext.sql('SELECT commentsTable.id, commentsTable.body, commentsTable.created_utc, commentsTable.author_flair_text, submissionsTable.title, submissionsTable.pinned, commentsTable.score AS comment_score, submissionsTable.score AS story_score FROM commentsTable INNER JOIN submissionsTable ON RIGHT(commentsTable.link_id, 6)=submissionsTable.id')
    df_4 = df_4.sample(False, 0.05, None)

    # unigrams, bigrams, trigrams
    def unigrams_bigrams_trigrams(text):
        return parsetext.clean_up(text)

    udf_function = udf(unigrams_bigrams_trigrams, ArrayType(StringType()))
    df_5_1 = df_4.withColumn("udf_results", udf_function(col("body")))

    # countVectorizer
    model = CountVectorizerModel.load("cvModel")
    df_5_2 = model.transform(df_5_1)
    df_5_2.registerTempTable("df_5_2Table")

    
    df_5_3 = sqlContext.sql("SELECT * FROM df_5_2Table WHERE df_5_2Table.body NOT LIKE '%/s%' AND df_5_2Table.body NOT LIKE '&gt%'")
    df_5_3.registerTempTable("df_5_3Table")

    posResult_1 = posModel.transform(df_5_3)
    posResult_1.registerTempTable("posResult_1Table")
    posResult_2 = sqlContext.sql("SELECT posResult_1Table.id, posResult_1Table.body, posResult_1Table.author_flair_text, posResult_1Table.created_utc, posResult_1Table.title, posResult_1Table.comment_score, posResult_1Table.story_score, posResult_1Table.features, posResult_1Table.pinned, posResult_1Table.prediction AS pos FROM posResult_1Table")
    finalResult_1 = negModel.transform(posResult_2)
    finalResult_1.registerTempTable("finalResult_1Table")
    finalResult_2 = sqlContext.sql("SELECT finalResult_1Table.id, finalResult_1Table.body, finalResult_1Table.created_utc, finalResult_1Table.author_flair_text, finalResult_1Table.title, finalResult_1Table.comment_score, finalResult_1Table.story_score, finalResult_1Table.pos, finalResult_1Table.pinned, finalResult_1Table.prediction AS neg FROM finalResult_1Table")
    finalResult_2.registerTempTable("finalResult_2Table")

    if(not os.path.exists("final.parquet")):
        finalResult_2.write.parquet("final.parquet")

    final = sqlContext.read.parquet("final.parquet")
    final.registerTempTable("finalTable")

    # computations
    if(not os.path.exists("submissions.csv")):
        question1 = sqlContext.sql("SELECT (100 * sum(pos) / COUNT(*)) AS percent_pos, (100 * sum(neg) / COUNT(*)) AS percent_neg FROM finalTable")
        question1.repartition(1).write.format("com.databricks.spark.csv").option("header", "true").save("submissions.csv")

    if(not os.path.exists("days.csv")):
        question2 = sqlContext.sql("SELECT DATE(from_unixtime(finalTable.created_utc)) AS date, 100*SUM(finalTable.pos)/COUNT(*) AS percent_pos, 100*SUM(finalTable.neg)/COUNT(*) AS percent_neg FROM finalTable GROUP BY date ORDER BY date")
        question2.repartition(1).write.format("com.databricks.spark.csv").option("header", "true").save("days.csv")

    if(not os.path.exists("states.csv")):
        question3 = sqlContext.sql("SELECT finalTable.author_flair_text AS place, 100*SUM(finalTable.pos)/COUNT(*) AS percent_pos, 100*SUM(finalTable.neg)/COUNT(*) AS percent_neg FROM finalTable GROUP BY place ORDER BY place")
        question3.repartition(1).write.format("com.databricks.spark.csv").option("header", "true").save("states.csv")

    if(not os.path.exists("comment.csv")):
        question4_comment = sqlContext.sql("SELECT finalTable.comment_score AS comment_score, 100*SUM(finalTable.pos)/COUNT(*) AS percent_pos, 100*SUM(finalTable.neg)/COUNT(*) AS percent_neg FROM finalTable GROUP BY comment_score ORDER BY comment_score")
        question4_comment.repartition(1).write.format("com.databricks.spark.csv").option("header", "true").save("comment.csv")

    if(not os.path.exists("story.csv")):
        question4_story = sqlContext.sql("SELECT finalTable.story_score AS story_score, 100*SUM(finalTable.pos)/COUNT(*) AS percent_pos, 100*SUM(finalTable.neg)/COUNT(*) AS percent_neg FROM finalTable GROUP BY story_score ORDER BY story_score")
        question4_story.repartition(1).write.format("com.databricks.spark.csv").option("header", "true").save("story.csv")
Esempio n. 27
0
    def get_cv_model(self):
        from pyspark.ml.feature import CountVectorizerModel
        cv_model = CountVectorizerModel.load(self.cv_path)
        return cv_model
        queries = idf_model.transform(queries)
        queries = scalerModel.transform(queries)
        preds = model.transform(queries)
        preds.select('payload', 'prediction').show()

    except:
        print('No data')


APP_NAME = "BigData"
conf = pyspark.SparkConf().setAll([('spark.app.name', APP_NAME),
                                   ('spark.executor.memory', '8g'),
                                   ('spark.cores.max', '2'),
                                   ('spark.driver.memory', '8g')])
sc = SparkContext(conf=conf)
sqlc = SQLContext(sc)

ngrams = udf(to_ngram, StringType())
tokenizer = Tokenizer.load('models/Tokenizer')
vectorizer = CountVectorizerModel.load('models/Vectorizer')
idf_model = IDFModel.load('models/idf')
scalerModel = StandardScalerModel.load('models/scalerModel')
model = LogisticRegressionModel.load('models/Logistic_Regression_Model')
ssc = StreamingContext(sc, batchDuration=3)
lines = ssc.socketTextStream("localhost", 9999)

lines.foreachRDD(get_prediction)

ssc.start()
ssc.awaitTermination()
'''
4. Compute the TF-IDF values for the N articles
Steps:
4.1. Read the relevant parameters from the two models, then compute and save the idf value and
vocabulary index for every keyword across all ~130,000 articles.
Why save these values, and why store them in a database?
They are needed later when computing the tfidf profiles; persisting them avoids holding too much
in memory and lets them be reused.

Create the table in Hive: idf_keywords_values
CREATE TABLE idf_keywords_values(
keyword STRING comment "keyword",
idf DOUBLE comment "idf",
index INT comment "index");
'''
from pyspark.ml.feature import CountVectorizerModel
# cv_model = CountVectorizerModel.load("hdfs://hadoop-master:9000/headlines/models/countVectorizerOfArticleWords.model")
cv_model = CountVectorizerModel.load("hdfs://hadoop-master:9000/headlines/models/CV.model")

from pyspark.ml.feature import IDFModel
# idf_model = IDFModel.load("hdfs://hadoop-master:9000/headlines/models/IDFOfArticleWords.model")
idf_model = IDFModel.load("hdfs://hadoop-master:9000/headlines/models/IDF.model")

keywords_list_with_idf = list(zip(cv_model.vocabulary, idf_model.idf.toArray()))

def func(data):
    # Turn each (keyword, idf) tuple into a mutable [keyword, idf, index] row:
    # append the keyword's vocabulary index and cast the numpy idf value to a plain float.
    for index in range(len(data)):
        data[index] = list(data[index])
        data[index].append(index)
        data[index][1] = float(data[index][1])

print(len(keywords_list_with_idf))
func(keywords_list_with_idf)
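# Added sketch (not in the original code): persist the [keyword, idf, index] rows into the
# idf_keywords_values table created above. The `spark` session name and the target database
# are assumptions for illustration; the Hive table must already exist.
idf_keywords_df = spark.createDataFrame(keywords_list_with_idf, ["keyword", "idf", "index"])
idf_keywords_df.write.insertInto("idf_keywords_values", overwrite=True)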
Esempio n. 30
0
    item_info = sqlContext.sql("select * from cmp_tmp_rec_item_info_feature ")
    item_info = item_info.filter("impression_freqs>%d" % min_TF)
    # a NULL reference_list means the row has no context; we only keep rows with context information

    train_data = sqlContext.sql(
        "select * from cmp_tmp_rec_train_agg where reference_list is not NULL "
    )
    test_data = sqlContext.sql(
        "select * from cmp_tmp_rec_test_agg where action_type=='clickout item'  and  reference_list is not  NULL"
    )

    has_cv = True
    if has_cv:
        from pyspark.ml.feature import CountVectorizerModel

        cv_model = CountVectorizerModel.load(os.path.join(model_pth, model_name))

    else:
        cv = CountVectorizer(inputCol="item_seq", outputCol='item_seq_enc')
        cv_model = cv.fit(train_data)

    vocab = cv_model.vocabulary
    import numpy as np
    import pandas as pd

    vocab = pd.DataFrame(np.array([range(len(vocab)), vocab]).T,
                         columns=['impression_id', 'impression'])
    df_vocab = sqlContext.createDataFrame(vocab)

    test_data = data_transform(test_data, cv_model)
    train_data = data_transform(train_data, cv_model)
        print(
            '********************* after preprocessing testing files *****************'
        )
        print(
            '********************* after preprocessing testing files *****************'
        )

        #converting the testing rdd to df
        testing_data = rddToDf_testing(testing_data)

        print('********************* after converting to df *****************')
        print('********************* after converting to df *****************')
        print('********************* after converting to df *****************')

        #reading the saved countvector model
        cv = CountVectorizerModel.load(args.model_path + '/countvector_model')
        #transforming test data to count vector
        testing_data = cv.transform(testing_data)
        #saving the transformed data as parquet file
        testing_data.write.parquet(args.model_path + '/testingdata.parquet')

        print(
            '********************* after cv transformation *****************')
        print(
            '********************* after cv transformation *****************')
        print(
            '********************* after cv transformation  *****************')

        #reading the saved random forest model
        rfModel = RandomForestClassificationModel.load(args.model_path +
                                                       '/rfmodel')
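        # Added sketch (not in the original snippet): score the count-vectorized test data with
        # the loaded random forest, assuming its feature column matches the one used at training.
        predictions = rfModel.transform(testing_data)
        predictions.select('prediction').show()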
Esempio n. 32
0
words_df = article_dataframe.rdd.mapPartitions(segmentation).toDF(['article_id','channel_id','words'])
# print(words_df.collect())
try:
    print('Checking whether the CV model exists')
    ktt.textFile('hdfs://master:9000/headlines/model/CV.model')
    # print(cv_model)
    print('Model exists')
except Exception as e:
    print(e)
    print('No model found, starting training')
    cv = CountVectorizer(inputCol='words', outputCol='countFeatures', vocabSize=200 * 10000, minDF=1.0)
    # cv_model = cv.fit(words_df)
    # cv_model.write().overwrite().save('hdfs://master:9000/headlines/model/CV.model')
finally:
    print('Loading the model...')
    cv_model = CountVectorizerModel.load('hdfs://master:9000/headlines/model/CV.model')
    print('Transforming the word-frequency counts into word vectors')
    cv_result = cv_model.transform(words_df)
    print('Training the IDF model on the computed word vectors and saving it')
    idf = IDF(inputCol='countFeatures', outputCol='idfFeatures')
    idfmode = idf.fit(cv_result)
    idfmode.write().overwrite().save('hdfs://master:9000/headlines/model/IDF.model')
    print('IDF model trained and saved successfully')


print('Inspect the cv_model vocabulary:')
print(cv_model.vocabulary)
print('Inspect the IDF model weights:')
print(idfmode.idf.toArray()[:20])