Code example #1
 def _transform(self, dataset):
     t = StringType()
     out_col = self.getOutputCol()
     in_col = dataset[self.getInputCol()]
     return dataset.withColumn(
         out_col,
         udf(lambda x: LabeledPoint(1, Vectors.fromML(x)), t)(in_col))
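A quick aside on the API itself (a minimal standalone sketch, assuming only that pyspark is installed; it is not part of any example on this page): pyspark.mllib.linalg.Vectors.fromML converts a pyspark.ml.linalg vector into its mllib counterpart, and asML converts back.

# Minimal round trip between the ml and mllib vector types.
from pyspark.ml.linalg import Vectors as MLVectors
from pyspark.mllib.linalg import Vectors as MLLibVectors

ml_vec = MLVectors.sparse(4, {1: 1.0, 3: 5.5})   # pyspark.ml representation
mllib_vec = MLLibVectors.fromML(ml_vec)          # pyspark.mllib representation
assert mllib_vec.asML() == ml_vec                # round trip preserves the values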
Code example #2
File: model.py  Project: brettbevers/miner
    def __init__(self,
                 training_data,
                 max_iterations=200,
                 spark_session=None,
                 ll_sample_size=5,
                 ll_sample_fraction=0.99,
                 fit_model_retries=10):
        if hasattr(training_data, 'rdd'):
            self.ml_training_data = training_data  # type: DataFrame
            self.mllib_training_data = training_data.rdd\
                .map(lambda r: Vectors.fromML(r.features)).persist()  # type: RDD
        else:
            if spark_session is None:
                raise Exception(
                    "Spark session must be provided if training data is not a dataframe."
                )
            self.mllib_training_data = training_data  # type: RDD
            self.ml_training_data = spark_session.createDataFrame(
                training_data.map(lambda v: (MlVectors.dense(v), )),
                ['features'])  # type: DataFrame

        self.max_iterations = max_iterations
        self.ll_sample_size = ll_sample_size
        self.ll_sample_fraction = ll_sample_fraction
        self.ll_samples = {}
        self.fit_model_retries = fit_model_retries
Code example #3
File: analyze_data.py  Project: rubcuadra/midterm2018
def getKeywordsInDataRange(sDF,
                           oldestTime,
                           newestTime,
                           topics=1,
                           wordsPerTopic=20):  #yyyy-MM-dd
    #Filter
    oldestTime = datetime.strptime(oldestTime, '%Y-%m-%d')
    newestTime = datetime.strptime(newestTime, '%Y-%m-%d')

    filteredText = sDF\
                    .select( "id", date_format('day','yyyy-MM-dd').alias('time'), col("title").alias("text") )\
                    .where( (col("time") >= oldestTime) & (col("time") <= newestTime) )

    #Start pipeline for preparing data
    textToWords = RegexTokenizer(
        inputCol="text", outputCol="splitted",
        pattern="[\\P{L}]+")  #Remove signs and split by spaces
    stopRemover = StopWordsRemover(
        inputCol="splitted",
        outputCol="words",
        stopWords=StopWordsRemover.loadDefaultStopWords("english"))
    countVectorizer = CountVectorizer(inputCol="words", outputCol="features")
    pipeline = Pipeline(stages=[textToWords, stopRemover, countVectorizer])

    #Get corpus for LDA
    try:
        model = pipeline.fit(filteredText)
    except IllegalArgumentException:
        return []
    result = model.transform(filteredText)
    corpus = result.select("id", "features").rdd.map(
        lambda r: [mhash(r.id) % 10**8,
                   Vectors.fromML(r.features)]).cache()

    # Cluster the documents into k topics using LDA
    ldaModel = LDA.train(corpus,
                         k=topics,
                         maxIterations=100,
                         optimizer='online')
    topics = ldaModel.topicsMatrix()
    vocabArray = model.stages[2].vocabulary  #CountVectorizer
    topicIndices = spark.sparkContext.parallelize(
        ldaModel.describeTopics(maxTermsPerTopic=wordsPerTopic))

    def topic_render(topic):  # specify vector id of words to actual words
        terms = topic[0]
        result = []
        for i in range(wordsPerTopic):
            term = vocabArray[terms[i]]
            result.append(term)
        return result

    # topics_final = topicIndices.map(lambda topic: topic_render(topic)).collect()
    # for topic in range(len(topics_final)):
    #     print ("Topic" + str(topic) + ":")
    #     for term in topics_final[topic]:
    #         print (term)
    #     print ('\n')
    return topicIndices.map(lambda topic: topic_render(topic)).collect()
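Most examples on this page follow the same pattern: CountVectorizer produces pyspark.ml vectors, which are converted with Vectors.fromML because the RDD-based LDA.train expects an RDD of [document_id, mllib Vector] pairs. Below is a toy sketch of just that corpus format (an illustration, not code from the project above; it assumes an active SparkContext named sc):

from pyspark.mllib.clustering import LDA
from pyspark.mllib.linalg import Vectors

toy_corpus = sc.parallelize([
    [0, Vectors.dense([1.0, 0.0, 2.0])],   # [doc id, term-count vector]
    [1, Vectors.dense([0.0, 3.0, 1.0])],
])
toy_model = LDA.train(toy_corpus, k=2, maxIterations=20)
print(toy_model.describeTopics(maxTermsPerTopic=2))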
Code example #4
    def train_SVM(idf_df, iterations=50, regress_param=0.3):
        """
        Train the SVM on the feature vectors prepared above.
        Note: this must be a static method; otherwise a SparkContext broadcast error occurs
        (the SparkContext can only be used by the global driver).
        :param idf_df:
        :param iterations:
        :param regress_param:
        :return:
        """
        splits = idf_df.select(['idf_output', 'label']).randomSplit([0.8, 0.2],
                                                                    seed=100)
        train = splits[0].cache()
        test = splits[1].cache()

        train_lb = train.rdd.map(
            lambda row: LabeledPoint(row[1], MLLibVectors.fromML(row[0])))
        # SVM model
        svm = SVMWithSGD.train(train_lb, iterations, regParam=regress_param)

        test_lb = test.rdd.map(
            lambda row: LabeledPoint(row[1], MLLibVectors.fromML(row[0])))
        scoreAndLabels_test = test_lb.map(
            lambda x: (float(svm.predict(x.features)), x.label))
        spark = SparkSession \
            .builder \
            .appName("Python Spark SQL") \
            .config("spark.some.config.option", "some-value") \
            .getOrCreate()
        score_label_test = spark.createDataFrame(scoreAndLabels_test,
                                                 ["prediction", "label"])

        # F1 score
        f1_eval = MulticlassClassificationEvaluator(labelCol="label",
                                                    predictionCol="prediction",
                                                    metricName="f1")
        svm_f1 = f1_eval.evaluate(score_label_test)
        print("F1 score: %.4f" % svm_f1)
        return svm
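The same conversion requirement drives the code above: SVMWithSGD is an RDD-based (mllib) algorithm, so the ml vectors coming out of the TF-IDF pipeline have to be rebuilt as mllib vectors inside LabeledPoint. A standalone toy sketch of that pattern (assumes an active SparkContext named sc; it is not part of the class above):

from pyspark.mllib.classification import SVMWithSGD
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

toy_points = sc.parallelize([
    LabeledPoint(0.0, Vectors.dense([0.0, 1.0])),
    LabeledPoint(1.0, Vectors.dense([2.0, 0.0])),
])
toy_svm = SVMWithSGD.train(toy_points, iterations=10)
print(toy_svm.predict(Vectors.dense([1.5, 0.0])))  # predicted class label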
Code example #5
 def test_ml_mllib_vector_conversion(self):
     # to ml
     # dense
     mllibDV = Vectors.dense([1, 2, 3])
     mlDV1 = newlinalg.Vectors.dense([1, 2, 3])
     mlDV2 = mllibDV.asML()
     self.assertEqual(mlDV2, mlDV1)
     # sparse
     mllibSV = Vectors.sparse(4, {1: 1.0, 3: 5.5})
     mlSV1 = newlinalg.Vectors.sparse(4, {1: 1.0, 3: 5.5})
     mlSV2 = mllibSV.asML()
     self.assertEqual(mlSV2, mlSV1)
     # from ml
     # dense
     mllibDV1 = Vectors.dense([1, 2, 3])
     mlDV = newlinalg.Vectors.dense([1, 2, 3])
     mllibDV2 = Vectors.fromML(mlDV)
     self.assertEqual(mllibDV1, mllibDV2)
     # sparse
     mllibSV1 = Vectors.sparse(4, {1: 1.0, 3: 5.5})
     mlSV = newlinalg.Vectors.sparse(4, {1: 1.0, 3: 5.5})
     mllibSV2 = Vectors.fromML(mlSV)
     self.assertEqual(mllibSV1, mllibSV2)
Code example #6
def df_to_simple_rdd(df, categorical=False, nb_classes=None, features_col='features', label_col='label'):
    """Convert DataFrame into RDD of pairs
    """
    sql_context = df.sql_ctx
    sql_context.registerDataFrameAsTable(df, "temp_table")
    selected_df = sql_context.sql(
        "SELECT {0} AS features, {1} as label from temp_table".format(features_col, label_col))
    if isinstance(selected_df.first().features, MLLibVector):
        lp_rdd = selected_df.rdd.map(
            lambda row: LabeledPoint(row.label, row.features))
    else:
        lp_rdd = selected_df.rdd.map(lambda row: LabeledPoint(
            row.label, MLLibVectors.fromML(row.features)))
    rdd = lp_to_simple_rdd(lp_rdd, categorical, nb_classes)
    return rdd
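A hypothetical usage sketch for the helper above (the SparkSession setup, toy data, and class count are assumptions, not code from the original repository):

from pyspark.ml.linalg import Vectors as MLVectors
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("df_to_simple_rdd_demo").getOrCreate()
toy_df = spark.createDataFrame(
    [(MLVectors.dense([0.0, 1.0]), 0.0), (MLVectors.dense([1.0, 0.0]), 1.0)],
    ["features", "label"])
simple_rdd = df_to_simple_rdd(toy_df, categorical=True, nb_classes=2)
print(simple_rdd.first())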
Code example #7
File: utils.py  Project: markkod/tweet-sentiment
def tfidf(row_df):
    hashingTF = HashingTF(inputCol='bigrams',
                          outputCol='TF',
                          numFeatures=20000)
    tf_df = hashingTF.transform(row_df)

    idf = IDF(inputCol='TF', outputCol='TF-IDF')
    idfModel = idf.fit(tf_df)
    idf_df = idfModel.transform(tf_df)

    # Convert labels to sparse vectors, which are needed by the classifier
    coordinates = tf_df.select("coordinates").rdd.flatMap(
        lambda x: x).collect()
    tweets = tf_df.select('sentence').rdd.flatMap(lambda x: x).collect()
    return tweets, coordinates, tf_df.rdd.map(
        lambda row: LabeledPoint(0.0, Vectors.fromML(row.TF)))
Code example #8
File: 2ndTask.py  Project: rubcuadra/BigDataTasks
def A1():  #1) apply LDA and find topics in user's posts (including reposts)
    textToWords = RegexTokenizer(
        inputCol="text", outputCol="splitted",
        pattern="[\\P{L}]+")  #Remove signs and split by spaces
    stopRemover = StopWordsRemover(
        inputCol="splitted",
        outputCol="words",
        stopWords=StopWordsRemover.loadDefaultStopWords("russian") +
        StopWordsRemover.loadDefaultStopWords("english"))
    countVectorizer = CountVectorizer(inputCol="words", outputCol="features")

    #Filter out empty posts; keep only id and text
    data = uWallP\
        .filter( uWallP.text != "" )\
        .select("id","text")\
        .limit(10)

    pipeline = Pipeline(stages=[textToWords, stopRemover, countVectorizer])
    model = pipeline.fit(data)
    result = model.transform(data)
    corpus = result.select("id", "features").rdd.map(
        lambda r: [r.id, Vectors.fromML(r.features)]).cache()

    # Cluster the documents into k topics using LDA
    ldaModel = LDA.train(corpus, k=8, maxIterations=100, optimizer='online')
    topics = ldaModel.topicsMatrix()
    vocabArray = model.stages[2].vocabulary  #CountVectorizer
    wordNumbers = 20  # number of words per topic
    topicIndices = spark.sparkContext.parallelize(
        ldaModel.describeTopics(maxTermsPerTopic=wordNumbers))

    def topic_render(topic):  # specify vector id of words to actual words
        terms = topic[0]
        result = []
        for i in range(wordNumbers):
            term = vocabArray[terms[i]]
            result.append(term)
        return result

    topics_final = topicIndices.map(
        lambda topic: topic_render(topic)).collect()

    for topic in range(len(topics_final)):
        print("Topic" + str(topic) + ":")
        for term in topics_final[topic]:
            print(term)
        print('\n')
Code example #9
File: word_topics.py  Project: tehblasian/aita
def word_topics(num_topics=NUM_TOPICS, num_words_per_topics=NUM_WORDS_PER_TOPICS):
    """Generates topics from word clusters.

    Arguments:
        num_topics {integer} -- Number of topics to infer
        num_words_per_topics {integer} -- Number of terms to collect for each topic

    Returns:
        None
    """
    spark = init_spark(AITA_CLEANED_COLLECTION)
    data_rdd = spark.read.format('mongo').load().rdd

    preprocessed_rdd = data_rdd\
        .flatMap(lambda row: [row['header'].lower().split(' ') + row['content'].lower().split(' ')]) \
        .zipWithIndex() \
        .map(lambda x: Row(index=x[1], words=x[0]))

    preprocessed_df = spark.createDataFrame(preprocessed_rdd)

    cv = CountVectorizer(inputCol='words', outputCol='vectors')
    model = cv.fit(preprocessed_df)
    vector_df = model.transform(preprocessed_df)

    corpus = vector_df.select('index', 'vectors').rdd.map(lambda x: [x[0], Vectors.fromML(x[1])]).cache()

    lda_model = LDA.train(corpus, k=num_topics, maxIterations=100, optimizer='online')
    vocab_array = model.vocabulary

    topic_indices = spark.sparkContext.parallelize(lda_model.describeTopics(maxTermsPerTopic=num_words_per_topics))

    def vector_id_to_word(topic):
        terms = topic[0]
        weights = topic[1]
        result = []
        for i in range(num_words_per_topics):
            result.append((vocab_array[terms[i]], weights[i]))
        return result

    topics = topic_indices.map(lambda topic: vector_id_to_word(topic)).collect()

    for i in range(len(topics)):
        print('Topic {}:'.format(i))
        for item in topics[i]:
            print(item)
        print('\n')
Code example #10
File: LDA_pyspark.py  Project: bz10bis/FinalProject
def LDA_Treatment(str):
    finalTopics = []
    txt = wordTokenize(str)
    data = sc.parallelize([txt]).zipWithIndex().map(lambda val: Row(idd=val[1], _words=val[0].split(" ")))
    docDF = spark.createDataFrame(data, ["_words"])
    Vector = CountVectorizer(inputCol="_words", outputCol="vectors")
    model = Vector.fit(docDF)
    result = model.transform(docDF)
    corpus = result.select("idd", "vectors").rdd.map(lambda val: [val[0], Vectors.fromML(val[1])]).cache()
    ldaModel = LDA.train(corpus, k=nbTopics, maxIterations=1000, optimizer='online')
    topics = ldaModel.topicsMatrix()
    vocabArray = model.vocabulary
    topicIndices = sc.parallelize(ldaModel.describeTopics(maxTermsPerTopic=wordNumbers))
    topics_final = topicIndices.map(lambda topic: topic_render(topic, vocabArray)).collect()

    for topic in range(len(topics_final)):
        for term in topics_final[topic]:
            term = unidecode.unidecode(term)
            finalTopics.append(term)

    return finalTopics
Code example #11
def mllib_linear_regression(s_file, r_file, iter_ ):

    def data_process(s_file, r_file):
        table_s = spark.read.csv(s_file, inferSchema = True, header = True, sep = ",")
        table_r = spark.read.csv(r_file, inferSchema = True, header = True, sep = ",")

        table_r = table_r.withColumn("default", lit(1))
        table_s = table_s.select(*(col(c).cast("float").alias(c) for c in table_s.columns))
        table_r = table_r.select(*(col(c).cast("float").alias(c) for c in table_r.columns))
        table_s.registerTempTable("table_s")
        table_r.registerTempTable("table_r")


        table_joint = spark.sql("SELECT * FROM table_s LEFT JOIN table_r ON table_s.fk = table_r.rid")
        table_joint.registerTempTable("table_joint")
        table_joint = table_joint.select(*(col(c).cast("float").alias(c) for c in table_joint.columns))

        # make joint data
        col_size_s = len(table_s.columns)
        col_size_r = len(table_r.columns)
        feature_cols = table_joint.columns[3:col_size_s]+table_joint.columns[col_size_s+1:]

        vectorAssembler = VectorAssembler(inputCols = feature_cols, outputCol = 'X')
        table_assemble = vectorAssembler.transform(table_joint)
        exprs = [col(column).alias(column.replace(' ', '_')) for column in table_assemble.columns]
        #R(RID, X_R)
        Tdata = table_assemble.select(*exprs).selectExpr("y as y", "X as X")

        return Tdata.rdd
    #processing data
    #Sdata_rdd, Rdata_rdd, feat_size_s, feat_size_r,Sdata_size = data_pre_process(s_file, r_file)

    #processing data
    Tdata_rdd = data_process(s_file, r_file)
    trainingData = Tdata_rdd.map(lambda row: LabeledPoint(row.y, MLLibVectors.fromML(row.X)))

    lr_model = LinearRegressionWithSGD.train(trainingData, iterations=iter_, step=0.01, miniBatchFraction=1.0)
    W = list(lr_model.weights)
    return np.array(W)
Code example #12
    sc = pyspark.SparkContext.getOrCreate(conf=conf)
    sqlcontext = pyspark.SQLContext(sc)
    training_set = (sqlcontext.read.format("parquet").option(
        "header", True).load(data_dir))

    # TF
    cv = sf.CountVectorizer(inputCol="text",
                            outputCol="tf_features",
                            vocabSize=input_dim)
    # IDF
    idf = sf.IDF(inputCol="tf_features", outputCol="features")
    label_string = sf.StringIndexer(inputCol="first_label", outputCol="label")
    pipeline_dl = Pipeline(stages=[cv, idf, label_string])
    df = pipeline_dl.fit(training_set).transform(training_set)
    df = df.rdd.map(lambda x: (LabeledPoint(x[
        'label'], MLLibVectors.fromML(x['features']))))
    logger.info("Pipeline created ...")
    logger.info("Transforms the text into tf idf RDD ...")
    model = create_keras_model(input_dim, output_dim)

    logger.info("Starts Training ...")
    spark_model = SparkMLlibModel(model=model,
                                  frequency='epoch',
                                  mode='asynchronous',
                                  parameter_server_mode='socket')
    spark_model.fit(df,
                    epochs=epochs,
                    batch_size=132,
                    verbose=1,
                    validation_split=0.2,
                    categorical=True,
Code example #13
def do_query(issues, input_file, _log):
    """
    Get the Latent Dirichlet Allocation topics for this group of articles
    """

    # Extract parameters from input_rules
    with open(input_file, 'r') as infile:
        keys = load(infile)
        keyword = keys['keyword']
        optimizer = keys['optimizer']
        if optimizer != 'online' and optimizer != 'em':
            raise ValueError(
                "Optmizer must be 'online' or 'em' but is '{}'".format(
                    optimizer))
        max_iterations = keys['max_iterations']
        if max_iterations < 1:
            raise ValueError('max_iterations must be at least 1')
        ntopics = keys['ntopics']
        if ntopics < 1:
            raise ValueError('ntopics must be at least 1')
        topic_words = keys['topic_words']
        if topic_words < 1:
            raise ValueError('topic_words must be at least 1')
    keyword_pattern = comp(r'\b{}\b'.format(keyword), U | I)

    # Map each article in each issue to a year of publication
    min_year, max_year = issues \
        .filter(lambda issue: issue.date) \
        .map(lambda issue: (issue.date.year, issue.date.year)) \
        .reduce(find_min_and_max)

    articles_rdd = issues.flatMap(lambda issue: issue.articles) \
        .filter(contains_keyword(keyword_pattern)) \
        .zipWithIndex() \
        .map(to_row_with_words)

    spark = SparkSession \
        .builder \
        .appName('lda') \
        .getOrCreate()

    articles_df = spark.createDataFrame(articles_rdd)

    remover = StopWordsRemover(inputCol='words', outputCol='filtered')
    articles_df = remover.transform(articles_df)

    vectortoriser = CountVectorizer(inputCol='filtered', outputCol='vectors')
    model = vectortoriser.fit(articles_df)
    vocab_array = model.vocabulary
    articles_df = model.transform(articles_df)

    corpus = articles_df \
        .select('idx', 'vectors') \
        .rdd \
        .map(lambda a: [a[0], Vectors.fromML(a[1])]) \
        .cache()

    # Cluster the documents into n topics using LDA
    lda_model = LDA.train(corpus,
                          k=ntopics,
                          maxIterations=max_iterations,
                          optimizer=optimizer)
    # topics = lda_model.topicsMatrix()
    # _log.error(topics)
    topics_final = [
        topic_render(topic, topic_words, vocab_array)
        for topic in lda_model.describeTopics(maxTermsPerTopic=topic_words)
    ]

    topics = [('Years', [min_year, max_year])]
    for i, topic in enumerate(topics_final):
        t_words = []
        for term in topic:
            t_words.append(term)
        topics.append((str(i), t_words))

    return topics
Code example #14
        for i in range(1, k):
            if 'f:'+str(i) in line:
                indexList.append(i)
                valList.append(line['f:'+str(i)])
        label = int(line['l:'+str(col)])
        if label == -1:
            label = 0
        features.append((Vectors.sparse(k, indexList, valList),label))
    features = sc.parallelize(features)
    #sclines = sc.parallelize(lines)
    #features = sclines.map(featuresToSparseVecFromLine)
    featureDataFrame = spark.createDataFrame(features, ["features", "label"])
    pca = PCA(k=100, inputCol="features", outputCol="pcaFeatures")
    model = pca.fit(featureDataFrame)
    #pcaresult = model.transform(featureDataFrame).select("pcaFeatures").collect()
    #lp = []
    #c = 0
    #for com in pcaresult:
    #    lp.append(LabeledPoint(lines[c]['l:' + str(col)], mllibVectors.fromML(com.pcaFeatures)))
    #    c += 1
    #lp = sc.parallelize(lp)
    pcaresult = model.transform(featureDataFrame).rdd
    lp = pcaresult.map(lambda r: LabeledPoint(r.label, mllibVectors.fromML(r.pcaFeatures)))
    model = SVMWithSGD.train(lp)
    model.save(sc, "svm/SVM" + str(col))
    labelsAndPreds = lp.map(lambda p: (p.label, model.predict(p.features)))
    err = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(parsedData.count())
    print("err at node " + str(col) + " = " + str(err))

sc.stop()
Code example #15
File: lda.py  Project: madhav-datt/spark-mines
conf = (SparkConf().setMaster("local").setAppName("My").set(
    "spark.executor.memory", "1g"))
sc = SparkContext(conf=conf)
sc.setLogLevel("OFF")

sqlContext = SQLContext(sc)
path = 'clean_test.txt'  # path of the txt file

data = sc.textFile(path).zipWithIndex().map(
    lambda line: Row(idd=line[1], words=line[0].split(" ")))
os.system('rm -f metastore_db/dbex.lck')
docDF = sqlContext.createDataFrame(data)

Vector = CountVectorizer(inputCol="words", outputCol="vectors")
model = Vector.fit(docDF)
result = model.transform(docDF)

corpus_size = result.count()  # number of documents
corpus = result.select(
    "idd", "vectors").rdd.map(lambda line: [line[0], Vectors.fromML(line[1])]).cache()

# Cluster the documents into three topics using LDA
ldaModel = LDA.train(corpus, k=3, maxIterations=100, optimizer='online')
topics = ldaModel.topicsMatrix()
vocabArray = model.vocabulary

wordNumbers = 10  # number of words per topic
topicIndices = sc.parallelize(
    ldaModel.describeTopics(maxTermsPerTopic=wordNumbers))
Code example #16
def main(sc):

    train_id = utils.load("data_id/train.p")
    test_id = utils.load("data_id/test.p")

    meta(train_id)

    train_id = [[idx] for idx in train_id]
    test_id = [[idx] for idx in test_id]

    sqlContext = SQLContext(sc)
    train_f = sqlContext.createDataFrame(train_id, ['biz_id'])
    test_f = sqlContext.createDataFrame(test_id, ['biz_id'])

    # Register user defined functions
    # city = udf(lambda b_id: get_city(b_id), StringType())
    state = udf(lambda b_id: MLVectors.dense(get_state(b_id)), VectorUDT())
    stars = udf(lambda b_id: get_stars(b_id), FloatType())
    popularity = udf(lambda b_id: get_popularity(b_id), IntegerType())
    name_size = udf(lambda b_id: get_name_size(b_id), IntegerType())
    name_polar = udf(lambda b_id: get_name_polar(b_id), FloatType())
    pos_neg_score = udf(lambda b_id: MLVectors.dense(get_PosNeg_score(b_id)),
                        VectorUDT())
    # clarity = udf(lambda b_id: get_clarity(b_id), ArrayType(FloatType()))
    elite_cnt = udf(lambda b_id: get_elite_cnt(b_id), IntegerType())
    label = udf(lambda b_id: get_y(b_id), IntegerType())

    # Generate feature columns
    # data_f = data_f.withColumn("city", city(data_f['biz_id']))
    train_f = train_f.withColumn("state", state(train_f['biz_id']))
    train_f = train_f.withColumn("stars", stars(train_f['biz_id']))
    train_f = train_f.withColumn("popularity", popularity(train_f['biz_id']))
    train_f = train_f.withColumn("name_size", name_size(train_f['biz_id']))
    train_f = train_f.withColumn("name_polar", name_polar(train_f['biz_id']))
    train_f = train_f.withColumn("pos_neg_score",
                                 pos_neg_score(train_f['biz_id']))
    # data_f = data_f.withColumn("clarity", clarity(data_f['biz_id']))
    train_f = train_f.withColumn("elite_cnt", elite_cnt(train_f['biz_id']))
    train_f = train_f.withColumn("y", label(train_f['biz_id']))
    train_f.show(5)

    # Generate feature columns
    test_f = test_f.withColumn("state", state(test_f['biz_id']))
    test_f = test_f.withColumn("stars", stars(test_f['biz_id']))
    test_f = test_f.withColumn("popularity", popularity(test_f['biz_id']))
    test_f = test_f.withColumn("name_size", name_size(test_f['biz_id']))
    test_f = test_f.withColumn("name_polar", name_polar(test_f['biz_id']))
    test_f = test_f.withColumn("pos_neg_score",
                               pos_neg_score(test_f['biz_id']))
    test_f = test_f.withColumn("elite_cnt", elite_cnt(test_f['biz_id']))
    test_f = test_f.withColumn("y", label(test_f['biz_id']))
    test_f.show(5)

    # One-hot encoding
    # encoder = OneHotEncoder(inputCol="state", outputCol="stateVec")
    # train_f = encoder.transform(train_f)
    train_f.show(5)
    # test_f = encoder.transform(test_f)
    test_f.show(5)

    # Assemble columns to features
    assembler = VectorAssembler(inputCols=[
        "state", "stars", "popularity", "name_size", "name_polar",
        "pos_neg_score", "elite_cnt"
    ],
                                outputCol="features")

    train_f = assembler.transform(train_f)
    train_f.show(5)
    test_f = assembler.transform(test_f)
    test_f.show(5)

    train_f = train_f.filter(train_f.y != -1)
    test_f = test_f.filter(test_f.y != -1)


    train_d = (train_f.select(col("y"), col("features")) \
                .rdd \
                .map(lambda row: LabeledPoint(float(row.y), MLLibVectors.fromML(row.features))))
    m = SVMWithSGD.train(train_d)
    predictionAndLabels = test_f.rdd.map(lambda row: (float(
        m.predict(MLLibVectors.fromML(row.features))), float(row.y)))
    # Grid search for best params and model
    # scores = {}
    # max_score = 0
    # for m in model_list:
    #     print ('run', m)
    #     evaluator = BinaryClassificationEvaluator()
    #     cv = CrossValidator(estimator=model_list[m],
    #                 estimatorParamMaps=params_list[m],
    #                 evaluator=evaluator,
    #                 numFolds=3)
    #     cv.fit(train)
    #     scores[m] = cv.get_best_score()
    #     if scores[m] > max_score:
    #         op_params = params_list[m][cv.get_best_index()]
    #         op_model = cv.get_best_model()
    #         op_m_name = m

    # predictionAndLabels = test.map(lambda lp: (float(op_model.predict(lp.features)), lp.y))

    # Instantiate metrics object
    bi_metrics = BinaryClassificationMetrics(predictionAndLabels)
    mul_metrics = MulticlassMetrics(predictionAndLabels)

    # Area under precision-recall curve
    print("Area under PR = %s" % bi_metrics.areaUnderPR)
    # Area under ROC curve
    print("Area under ROC = %s" % bi_metrics.areaUnderROC)
    # Confusion Matrix
    print("Confusion Matrix")
    print(mul_metrics.confusionMatrix().toArray())

    # Overall statistics
    precision = mul_metrics.precision()
    recall = mul_metrics.recall()
    f1Score = mul_metrics.fMeasure()
    accuracy = mul_metrics.accuracy
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)
    print("Accuracy = %s" % accuracy)

    # Individual label stats
    labels = [0, 1]
    for label in labels:
        print("Class %s precision = %s" %
              (label, mul_metrics.precision(label)))
        print("Class %s recall = %s" % (label, mul_metrics.recall(label)))
Code example #17
        wordsFiltered.append(w)

txt = " ".join(wordsFiltered).lower()

data = sc.parallelize([
    txt
]).zipWithIndex().map(lambda val: Row(idd=val[1], words=val[0].split(" ")))

docDF = spark.createDataFrame(data)
Vector = CountVectorizer(inputCol="words", outputCol="vectors")
model = Vector.fit(docDF)
result = model.transform(docDF)

corpus = result.select(
    "idd",
    "vectors").rdd.map(lambda val: [val[0], Vectors.fromML(val[1])]).cache()

# Cluster the documents into three topics using LDA
ldaModel = LDA.train(corpus, k=3, maxIterations=700, optimizer='online')
topics = ldaModel.topicsMatrix()
vocabArray = model.vocabulary

wordNumbers = 5  # number of words per topic
topicIndices = sc.parallelize(
    ldaModel.describeTopics(maxTermsPerTopic=wordNumbers))


def topic_render(topic):  # specify vector id of words to actual words
    terms = topic[0]
    result = []
    for i in range(wordNumbers):
Code example #18
    'yyyy-MM-dd').alias('no_timestamp')).groupby('no_timestamp').count().sort(
        F.col('no_timestamp'))
print(dates.show(dates.count()))
dates.toPandas().plot(kind='line', x='no_timestamp', y='count')

dates.toPandas().plot(kind='bar', x='no_timestamp')

tokenizer = Tokenizer(inputCol="tweet", outputCol="words")
prep_df = tokenizer.transform(df)
cv_prep = CountVectorizer(inputCol="words", outputCol="prep")
cv_model = cv_prep.fit(prep_df)
ready_df = cv_model.transform(prep_df)
# stopWords = [word for word in cv_prep.vocabulary if any(char.isdigit() for char in word)]
# remover = StopWordsRemover(inputCol="words", outputCol="filtered", stopWords = stopwords)
# prep_df = remover.transform(prep_df)

trainable = ready_df.select(
    'tweet_id', 'prep').rdd.map(lambda x: [x[0], Vectors.fromML(x[1])]).cache()
print("Trainable")
print(trainable.take(10))
print("take")
model = LDA.train(trainable, k=5, seed=1, optimizer="online")
exit(0)
#Print the topics in the model
topics = model.describeTopics(maxTermsPerTopic=15)
for x, topic in enumerate(topics):
    print('topic nr: ' + str(x))
    words = topic[0]
    weights = topic[1]
    for n in range(len(words)):
        print(cv_prep.vocabulary[words[n]] + ' ' + str(weights[n]))
Code example #19
    #     tweets.append(new_tweet)
    # f.close()
    # fd = codecs.open('cleaned_example.txt', 'w', encoding = 'utf-8')
    # for tweet in tweets:
    #     fd.write(tweet+'\n')

    rdd = sc.textFile('opinion.txt').zipWithIndex().map(
        lambda pair: Row(idd=pair[1], words=pair[0].split(" ")))
    docDF = spark.createDataFrame(rdd)
    Vector = CountVectorizer(inputCol="words", outputCol="vectors")
    model = Vector.fit(docDF)
    result = model.transform(docDF)

    corpus = result.select(
        "idd",
        "vectors").rdd.map(lambda pair: [pair[0], Vectors.fromML(pair[1])]).cache()

    # Cluster the documents into five topics using LDA
    ldaModel = LDA.train(corpus, k=5, maxIterations=100, optimizer='online')
    topics = ldaModel.topicsMatrix()
    vocabArray = model.vocabulary

    wordNumbers = 5  # number of words per topic
    topicIndices = sc.parallelize(
        ldaModel.describeTopics(maxTermsPerTopic=wordNumbers))

    def topic_render(topic):  # specify vector id of words to actual words
        terms = topic[0]
        result = []
        for i in range(wordNumbers):
            term = vocabArray[terms[i]]
Code example #20
#import os
#import sys
#import io
#os.environ["PYSPARK_PYTHON"]="/usr/bin/python2"
#sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
path = "Adelaide Airport-adelaide-0-2016.txt"
sc = SparkContext()
spark = SparkSession.builder.appName("Python Spark SQL basic example").config("spark.some.config.option", "some-value").getOrCreate()
data = sc.textFile(path).zipWithIndex().map(lambda pair: Row(idd=pair[1], words=pair[0].split(" ")))
docDF = spark.createDataFrame(data)
Vector = CountVectorizer(inputCol="words", outputCol="vectors")
model = Vector.fit(docDF)
result = model.transform(docDF)

corpus = result.select("idd", "vectors").rdd.map(lambda (x,y): [x,Vectors.fromML(y)]).cache()
# Cluster the documents into three topics using LDA
ldaModel = LDA.train(corpus, k=5,maxIterations=100,optimizer='online')
topics = ldaModel.topicsMatrix()
vocabArray = model.vocabulary

wordNumbers = 5  # number of words per topic
topicIndices = sc.parallelize(ldaModel.describeTopics(maxTermsPerTopic = wordNumbers))

def topic_render(topic):  # specify vector id of words to actual words
    terms = topic[0]
    result = []
    for i in range(wordNumbers):
        term = vocabArray[terms[i]]
        result.append(term)
    return result
Code example #21
adjectifs = ["ABSOLU","ADMIRABLE","AGREABLE","AIMABLE","AMUSANT","APOCALYPTIQUE","APPROXIMATIF","ATTACHANT","BANAL","BAS","BAVAROIS","BIEN","BOF","BON","BOULEVERSANT","BOUTE EN TRAIN","CAPTIVANT","CARACTERIEL","CATACLYSMIQUE","CATASTROPHIQUE","CELESTE","CHARMANT","CHEF D'OEUVRE","CHOUETTE","COMMUN","CONVENABLE","CONVIVIAL","COQUET","CORRECT","CREDIBLE","CROQUANTE","CYNIQUE","DEGUEULASSE","DELECTABLE","DELICIEUSE","DISJONCTE","DIVIN","DOUCE","DOUE","DROLE","EBLOUISSANT","EBOURIFFE","EFFICACE","EMBALLANT","EMOUVANT","ENDIABLE","ENNUYANT","ENRAGE","ENTHOUSIASMANT","EPATANT","EPOUSTOUFLANT","EPOUVANTABLE","EQUITABLE","EXALTANT","EXCEPTIONNEL","EXCUSABLE","EXEMPLAIRE","EXTRA","FERU","FESTIF","FLAMBOYANTE","FORMIDABLE","GRANDIOSE","HARDI","HONNETE","HORRIBLE","IMPORTANT","IMPRESSIONNANT","INCONNU","INCREDULE","INDEPENDANT","INFERNAL","INNOMMABLE","INSIGNIFIANT","INSUFFISANT","INSUPPORTABLE","INTENABLE","INTERESSANT","IRRESISTIBLE","LIBIDINEUX","LOUABLE","MAJESTUEUX","MAGISTRAL","MAGNIFIQUE","MEDIOCRE","MERDIQUE","MERVEILLEUX","MIGNON","MINABLE","MIROBOLANTE","MORTEL","MOYEN","NEGLIGEABLE","NUL","ORDINAIRE","ORIGINAL","PARFAIT","PIRE","PASSABLE","PASSIONNANT","PERCUTANT","PERSEVERANT","PHENOMENAL","PLACIDE","PLAISANT","PRESTANT","PRODIGIEUX","PROVERBIAL","QUELCONQUE","RAVISSANT","RECYCLE","RELATIF","REMARQUABLE","RENVERSANT","REVENDICATRICE","REVOLUTIONNAIRE","ROCAMBOLESQUE","RUTILANT","SAINT","SATISFAISANT","SEDUISANT","SEXY","SOMPTUEUX","SPIRITUEUX","SPLENDIDE","SUAVE","SUBLIME","SULFUREUSE","SUPERBE","SUPREME","SUPPORTABLE","TALENTUEUX","TOLERABLE","TRAGIQUE","TREPIDANT","TRES","TROUBLANT","VALABLE","VALEUREUX","VENERABLE","VITAMINES","VIVABLE","VULGAIRE"]
articles = ['LE', 'LA', 'LES', 'UN', 'DES','COMME', 'A', 'QUE', 'PLUS', 'OUI', 'NON', 'PEUT', 'CES', 'CETTE', 'CET', 'MAIS', 'OU', 'ET', 'DONC', 'TOUS', 'TOUTE', 'LEUR', 'TOUTES', 'LEURS', 'AINSI', 'BIEN', 'MAL', 'ETRE', 'AVOIR', 'FAIRE', 'AVEC', 'SANS', 'PLUS', 'MOINS']

for w in words:
    if w not in stopWords and w.upper() not in articles and w.upper() not in adjectifs and len(w)>2:
        wordsFiltered.append(w)

txt = " ".join(wordsFiltered).lower()

data = sc.parallelize([txt]).zipWithIndex().map(lambda val: Row(idd=val[1], _words=val[0].split(" ")))
docDF = spark.createDataFrame(data, ["_words"])
Vector = CountVectorizer(inputCol="_words", outputCol="vectors")
model = Vector.fit(docDF)
result = model.transform(docDF)

corpus = result.select("idd", "vectors").rdd.map(lambda val: [val[0],Vectors.fromML(val[1])]).cache()

# Cluster the documents into three topics using LDA
ldaModel = LDA.train(corpus, k=1,maxIterations=700,optimizer='online')

topics = ldaModel.topicsMatrix()
vocabArray = model.vocabulary

wordNumbers = 6  # number of words per topic
topicIndices = sc.parallelize(ldaModel.describeTopics(maxTermsPerTopic = wordNumbers))

def topic_render(topic):  # specify vector id of words to actual words
    terms = topic[0]
    result = []
    for i in range(wordNumbers):
        term = vocabArray[terms[i]]
Code example #22
def buildTfIdfRddAllTopics(business, sports, politics, entertainment):
	business_df = buildTextRDD(business, BUSINESS_LABEL)
	politics_df = buildTextRDD(politics, POLITICS_LABEL)
	sports_df = buildTextRDD(sports, SPORTS_LABEL)
	entertainment_df = buildTextRDD(entertainment, ENTERTAINMENT_LABEL)

	# Union together all dataframes
	main_df = business_df.union(politics_df)
	main_df = main_df.union(sports_df)
	main_df = main_df.union(entertainment_df)
	main_df = main_df.withColumnRenamed('_1', 'label')
	main_df = main_df.withColumnRenamed('_2', 'content')
	tokenizer = Tokenizer(inputCol="content", outputCol="words")
	wordsData = tokenizer.transform(main_df)
	hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=8)
	featurizedData = hashingTF.transform(wordsData)
	idf = IDF(inputCol="rawFeatures", outputCol="features")
	idfModel = idf.fit(featurizedData)
	rescaledData = idfModel.transform(featurizedData)
	return rescaledData.select([c for c in rescaledData.columns if c in ['label', 'features']]).rdd.map(lambda x: LabeledPoint(x.label, MLLibVectors.fromML(x.features)))
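A hedged follow-on sketch (not from the original repository): the LabeledPoint RDD returned above can be fed straight into an RDD-based mllib classifier. The four dataset arguments are the same assumed inputs as in the function signature.

from pyspark.mllib.classification import NaiveBayes

labeled_rdd = buildTfIdfRddAllTopics(business, sports, politics, entertainment)
nb_model = NaiveBayes.train(labeled_rdd, 1.0)
print(nb_model.predict(labeled_rdd.first().features))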
Code example #23
valPredsRDD = valPreds.rdd
valuesAndPredsVal = valPredsRDD.map(lambda x: (x.label, x.prediction))
print('Validation RMSE: {}'.format(rootMeanSquaredError(valuesAndPredsVal)))

####################################################################################
## part 3
print('*' * 100)
print('Part 3 - Visualize the log of the training error\n')

# convert data sets
from pyspark.mllib.linalg import Vectors as MLLibVectors
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD

train_dataRDD = train_data.rdd
train_dataRDD = train_dataRDD.map(
    lambda x: LabeledPoint(x[0], MLLibVectors.fromML(x[1])))
train_dataRDD.persist()

numIters = 50
errors = []
for i in range(1, numIters + 1):
    model = LinearRegressionWithSGD.train(train_dataRDD,
                                          iterations=i,
                                          step=0.01)
    valuesAndPredsTrain = train_dataRDD.map(
        lambda x: (x.label, model.predict(x.features)))
    errors.append(rootMeanSquaredError(valuesAndPredsTrain))
    print(errors)

# visualize actual vs. prediction
x = np.arange(1, numIters + 1)
Code example #24
File: generate.py  Project: vpopil/Package-Picker
# Restructure the dataframe in preparation for one-hot encoding
grouped = df.groupBy("application_id").agg(collect_list("package_id"))
grouped = grouped.withColumnRenamed("collect_list(package_id)", "package_ids")
grouped = grouped.withColumn("package_ids",
                             col("package_ids").cast("array<string>"))

# One-hot encode the data (rows are applications, columns are packages)
vectorizer = CountVectorizer(inputCol="package_ids",
                             outputCol="packages_encoded")
vectorizer_model = vectorizer.fit(grouped)
transformedDf = vectorizer_model.transform(grouped)
transformedDf = transformedDf.drop(col("package_ids"))

# Extract vectors from the DataFrame in preparation for computing the similarity matrix
array = [
    Vectors.fromML(row.packages_encoded) for row in transformedDf.collect()
]

# Create a RowMatrix
matrix = RowMatrix(sc.parallelize(array))

# Compute column similarity matrix
similarity = matrix.columnSimilarities()

# Convert the matrix to a DataFrame
entries = similarity.entries.collect()
similarityDf = spark.createDataFrame(entries).toDF("package_a", "package_b",
                                                   "similarity")

# Write to the database
url_connect = f"jdbc:postgresql://{host}/"
Code example #25
File: naive2.py  Project: vyom1911/elinor-p2
data = indexer.fit(data).transform(data)
print('indexed')
data.write.parquet("gs://elinor/NBTrainData")
#------------------------------FIT RANDOM FOREST-----------------------------#
print("fittingRF")
rf = RandomForestClassifier(labelCol="indexedLabel",
                            featuresCol="features",
                            numTrees=10)
rfmodel = rf.fit(data)
print("fitted")
#------------------------------FIT NB ---------------------------------------#
print("fitting NB")

data2 = data.rdd\
            .map(lambda x: tuple(x))\
            .map(lambda x: LabeledPoint(x[3],MLLibVectors.fromML(x[1])))\
            .toDF()

data2 = data2.withColumn("features", as_ml("features"))

nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
nbmodel = nb.fit(data2)
print("fitted")
#------------------------------DEAL WITH TEST DATA-----------------------------#
print("loading testing data")
if DEBUG:
    wtf = sc.textFile(TEST_DATA)\
            .map(lambda x: "data/bytes/" + x + ".bytes")\
            .reduce(lambda accum,x: accum + "," + x)

else:
Code example #26
    tf_df = hashingtf.transform(gram_df)

    #tf-idf
    idf = IDF(inputCol="tf", outputCol="idftf")
    idfModel = idf.fit(tf_df)
    idf_df = idfModel.transform(tf_df)

    #convert the dataframe to an rdd so we can build LabeledPoint(label, feature vector) tuples for MLlib
    tf_rdd = tf_df.rdd

    from pyspark.mllib.regression import LabeledPoint
    from pyspark.mllib.linalg import Vectors as MLLibVectors

    #we also need to convert ml.SparseVector to mllib.SparseVector, because naive bayes only accepts the mllib vector type
    train_dataset = tf_rdd.map(
        lambda x: LabeledPoint(float(x.sentiment), MLLibVectors.fromML(x.tf)))

    #split dataset into train, test

    train, test = train_dataset.randomSplit([0.9, 0.1], seed=11)

    print(train.first())
    print(test.first())

    #create Model
    #now train and save the model
    from pyspark.mllib.classification import NaiveBayes
    import shutil

    #training
    print("************************TRAINIG*******************************")
Code example #27
    def calculate_distance(self, sdf1, sdf2):
        """
        This will calculate the distance between the vector-type columns of two spark dataframes

        :param sdf1: must have columns id1 (dtype int) and v1 (dtype Vector)
        :param sdf2: must have columns id2 (dtype int) and v2 (dtype Vector)
        :return:
        """

        cov = RowMatrix(
            sdf1.select(["v1"]).withColumnRenamed("v1", "v").union(
                sdf2.select(["v2"]).withColumnRenamed(
                    "v2", "v")).rdd.map(lambda row: Vectors.fromML(row.asDict(
                    )["v"]))).computeCovariance().toArray()

        x, v = np.linalg.eigh(cov)

        indices = 1e-10 <= x

        # we are trying to enforce the data types to be only python types
        n = int(v.shape[0])
        m = int(indices.sum())

        v_vals = [float(val) for val in v[:, indices].reshape(-1, ).tolist()]

        v_spark = DenseMatrix(n, m, v_vals)

        x_vals = [
            float(val)
            for val in np.diag(x[indices]**-0.5).reshape(-1, ).tolist()
        ]

        x_spark = DenseMatrix(m, m, x_vals)

        # we get the index to maintain the order
        _sdf1 = sdf1.rdd.zipWithIndex()\
            .map(lambda val_key: Row(id1=val_key[0].id1, v1=val_key[0].v1, index=val_key[1])).toDF()

        _sdf1.persist()

        _sdf2 = sdf2.rdd.zipWithIndex()\
            .map(lambda val_key: Row(id2=val_key[0].id2, v2=val_key[0].v2, index=val_key[1])).toDF()

        _sdf2.persist()

        # we get our indexed row matrix
        _sdf1_mat = IndexedRowMatrix(
            _sdf1.rdd.map(lambda row: IndexedRow(index=row.asDict()["index"],
                                                 vector=Vectors.fromML(
                                                     row.asDict()["v1"]))))

        _sdf2_mat = IndexedRowMatrix(
            _sdf2.rdd.map(lambda row: IndexedRow(index=row.asDict()["index"],
                                                 vector=Vectors.fromML(
                                                     row.asDict()["v2"]))))

        # we apply our transformation and then set it as our new variable
        _sdf1 = _sdf1.drop("v1").join(_sdf1_mat.multiply(v_spark).multiply(x_spark).rows\
                                      .map(lambda indexed_row: Row(index=indexed_row.index,
                                                                   v1=indexed_row.vector)).toDF(), "index")

        _sdf2 = _sdf2.drop("v2").join(_sdf2_mat.multiply(v_spark).multiply(x_spark).rows\
                                      .map(lambda indexed_row: Row(index=indexed_row.index,
                                                                   v2=indexed_row.vector)).toDF(), "index")

        @F.udf(DoubleType())
        def tmp(vec):
            return float(vec[0].squared_distance(vec[1]))**0.5

        all_sdf = _sdf1.crossJoin(_sdf2)

        dist_sdf = all_sdf.select("*", tmp(F.array('v1', 'v2')).alias('diff'))

        dist_sdf.persist()

        return dist_sdf
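A hypothetical usage sketch (the enclosing class name is not shown in this excerpt, so DistanceCalculator, the SparkSession named spark, and the toy data are assumptions):

from pyspark.ml.linalg import Vectors as MLVectors

sdf1 = spark.createDataFrame(
    [(0, MLVectors.dense([1.0, 0.0])), (1, MLVectors.dense([0.0, 1.0]))], ["id1", "v1"])
sdf2 = spark.createDataFrame(
    [(0, MLVectors.dense([1.0, 1.0]))], ["id2", "v2"])
# dist_sdf = DistanceCalculator().calculate_distance(sdf1, sdf2)  # hypothetical class name
# dist_sdf.select("id1", "id2", "diff").show()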
Code example #28
def do_query(issues, config_file=None, logger=None, context=None):
    """
    Gets the Latent Dirichlet Allocation (LDA) topics for words
    within articles.

    config_file must be the path to a LDA configuration file in YAML
    format. For example:

        keyword: <KEYWORD>
        optimizer: online|em
        max_iterations: <N>
        ntopics: <N>
        topic_words: <N>

    <N> must be >= 1 for each parameter.

    The keyword and words in documents are normalized, by removing all
    non-'a-z|A-Z' characters.

    Returns result of form:

        {
          <0>: [<WORD_0>, ..., <WORD_topicwords>],
          <1>: [<WORD_0>, ..., <WORD_topicwords>],
          <2>: [<WORD_0>, ..., <WORD_topicwords>],
          ...
          <ntopics>: [<WORD_0>, ..., <WORD_topicwords>],
          years:[<MIN_YEAR>, <MAX_YEAR>]
        }

    :param issues: RDD of defoe.papers.issue.Issue
    :type issues: pyspark.rdd.PipelinedRDD
    :param config_file: query configuration file
    :type config_file: str or unicode
    :param logger: logger (unused)
    :type logger: py4j.java_gateway.JavaObject
    :return: LDA topics
    :rtype: dict
    """
    with open(config_file, 'r') as f:
        config = load(f)
        keyword = config['keyword']
        optimizer = config['optimizer']
        if optimizer != 'online' and optimizer != 'em':
            raise ValueError("optmizer must be 'online' or 'em' but is '{}'"
                             .format(optimizer))
        max_iterations = config['max_iterations']
        if max_iterations < 1:
            raise ValueError('max_iterations must be at least 1')
        ntopics = config['ntopics']
        if ntopics < 1:
            raise ValueError('ntopics must be at least 1')
        topic_words = config['topic_words']
        if topic_words < 1:
            raise ValueError('topic_words must be at least 1')

    keyword = query_utils.normalize(keyword)

    # [date, ...]
    # =>
    # [(year, year), ...]
    # =>
    # (year, year)
    min_year, max_year = issues \
        .filter(lambda issue: issue.date) \
        .map(lambda issue: (issue.date.year, issue.date.year)) \
        .reduce(min_max_tuples)

    # [issue, issue, ...]
    # =>
    # [article, article, ...]
    # =>
    # [(article, 0), (article, 1), ...]
    # =>
    # [Row, Row, ...]
    articles_rdd = issues.flatMap(lambda issue: issue.articles) \
        .filter(lambda article:
                article_contains_word(article,
                                      keyword,
                                      PreprocessWordType.NORMALIZE)) \
        .zipWithIndex() \
        .map(article_idx_to_words_row)

    spark = SparkSession \
        .builder \
        .appName('lda') \
        .getOrCreate()

    articles_df = spark.createDataFrame(articles_rdd)

    remover = StopWordsRemover(inputCol='words', outputCol='filtered')
    articles_df = remover.transform(articles_df)

    vectortoriser = CountVectorizer(inputCol='filtered', outputCol='vectors')
    model = vectortoriser.fit(articles_df)

    vocabulary = model.vocabulary
    articles_df = model.transform(articles_df)

    corpus = articles_df \
        .select('idx', 'vectors') \
        .rdd \
        .map(lambda a: [a[0], Vectors.fromML(a[1])]) \
        .cache()

    # Cluster the documents into N topics using LDA.
    lda_model = LDA.train(corpus,
                          k=ntopics,
                          maxIterations=max_iterations,
                          optimizer=optimizer)
    topics_final = [topic_render(topic, topic_words, vocabulary)
                    for topic in lda_model.describeTopics(maxTermsPerTopic=topic_words)]

    topics = [('years', [min_year, max_year])]
    for i, topic in enumerate(topics_final):
        term_words = []
        for term in topic:
            term_words.append(term)
        topics.append((str(i), term_words))
    return topics
Code example #29
                                  'comments'))

# comm_lemm.show(truncate=False)

# Index the subreddit name so that we can train LDA
indexer = StringIndexer(inputCol="id", outputCol="index")
indexed = indexer.fit(comm_lemm).transform(comm_lemm)

# fit a CountVectorizerModel from the corpus.
cv = CountVectorizer(inputCol="comments", outputCol="vectors")
count_vectorizer_model = cv.fit(indexed)
result = count_vectorizer_model.transform(indexed)
# result.show(truncate=False)

corpus = result.select(result['index'].cast('long'), result['vectors']) \
    .rdd.map(lambda x: [x[0], Vectors.fromML(x[1])]).cache()

# # for x in corpus.collect():
# #     print(x)
#
ldaModel = LDA.train(corpus, k=10)
topics = ldaModel.topicsMatrix()

# vocabArray = count_vectorizer_model.vocabulary
print(topics)

# for topic in range(10):
#     print("Topic " + str(topic) + ":")
#     for word in range(0, ldaModel.vocabSize()):
#         print(" " + str(topics[word][topic]))
Code example #30
def main():
    SC = SparkContext("local[1]", "pkgpkr")

    # Connect to the database
    USER = os.environ.get("DB_USER")
    PASSWORD = os.environ.get("DB_PASSWORD")
    HOST = os.environ.get("DB_HOST")
    DB = psycopg2.connect(user=USER, password=PASSWORD, host=HOST)
    CUR = DB.cursor()

    # Load the raw data into Spark
    CUR.execute("SELECT * FROM dependencies")
    DEPENDENCIES = CUR.fetchall()
    SPARK = SparkSession.builder.master("local[1]").appName("pkgpkr").getOrCreate()
    DF = SPARK.createDataFrame(DEPENDENCIES).toDF("application_id", "package_id")

    # Close the database connection
    CUR.close()
    DB.close()

    # Restructure the dataframe in preparation for one-hot encoding
    GROUPED = DF.groupBy("application_id").agg(collect_list("package_id"))
    GROUPED = GROUPED.withColumnRenamed("collect_list(package_id)", "package_ids")
    GROUPED = GROUPED.withColumn("package_ids", col("package_ids").cast("array<string>"))

    # One-hot encode the data (rows are applications, columns are packages)
    VECTORIZER = CountVectorizer(inputCol="package_ids", outputCol="packages_encoded")
    VECTORIZER_MODEL = VECTORIZER.fit(GROUPED)
    TRANSFORMED_DF = VECTORIZER_MODEL.transform(GROUPED)
    TRANSFORMED_DF = TRANSFORMED_DF.drop(col("package_ids"))

    # Extract vectors from the DataFrame in preparation for computing the similarity matrix
    ARRAY = [Vectors.fromML(row.packages_encoded) for row in TRANSFORMED_DF.collect()]

    # Create a RowMatrix
    MATRIX = RowMatrix(SC.parallelize(ARRAY, numSlices=100))

    # Compute column similarity matrix
    SIMILARITY = MATRIX.columnSimilarities()

    # Convert the matrix to a DataFrame
    ENTRIES = SIMILARITY.entries.collect()
    SIMILARITY_DF = SPARK.createDataFrame(ENTRIES).toDF("a", "b", "similarity")

    # Map the package identifiers back to their pre-vectorized values
    MAPPING = create_map([lit(x) for x in chain(*enumerate(VECTORIZER_MODEL.vocabulary))])
    SIMILARITY_DF = SIMILARITY_DF.withColumn("package_a", MAPPING.getItem(col("a")).cast("integer")) \
                                 .withColumn("package_b", MAPPING.getItem(col("b")).cast("integer"))
    SIMILARITY_DF = SIMILARITY_DF.drop(col("a")).drop(col("b"))

    # Mirror the columns and append to the existing dataframe so we need only query the first column
    SIMILARITY_DF = SIMILARITY_DF.select('package_a', 'package_b', 'similarity') \
                                 .union(SIMILARITY_DF.select('package_b', 'package_a', 'similarity'))

    # Write similarity scores to the database
    URL_CONNECT = f"jdbc:postgresql://{HOST}/"
    TABLE = "similarity"
    MODE = "overwrite"
    PROPERTIES = {"user": USER, "password": PASSWORD, "driver": "org.postgresql.Driver"}
    SIMILARITY_DF.write.jdbc(URL_CONNECT, TABLE, MODE, PROPERTIES)

    #
    # Update popularity scores
    #

    POPULARITY_UPDATE = """
    UPDATE packages
    SET popularity = s.popularity
    FROM (
      SELECT package_b, COUNT(package_b) AS popularity
      FROM similarity
      GROUP BY package_b
    ) s
    WHERE packages.id = s.package_b;
    """

    POPULARITY_NULL_TO_ZERO = """
    UPDATE packages
    SET popularity = 0
    WHERE popularity IS NULL;
    """

    BOUNDED_POPULARITY_UPDATE = """
    UPDATE packages
    SET bounded_popularity = s.popularity
    FROM (
      SELECT id, WIDTH_BUCKET(LOG(popularity + 1), 0, (SELECT MAX(LOG(popularity + 1)) FROM packages), 9) AS popularity
      FROM packages
    ) s
    WHERE packages.id = s.id;
    """

    # Connect to the database
    DB = psycopg2.connect(user=USER, password=PASSWORD, host=HOST)
    CUR = DB.cursor()

    # Execute popularity updates
    CUR.execute(POPULARITY_UPDATE)
    CUR.execute(POPULARITY_NULL_TO_ZERO)
    CUR.execute(BOUNDED_POPULARITY_UPDATE)

    #
    # Update trending scores
    #

    MONTHLY_DOWNLOADS_LAST_MONTH_NULL_TO_ZERO = """
    UPDATE packages
    SET monthly_downloads_last_month = 0
    WHERE monthly_downloads_last_month IS NULL;
    """

    MONTHLY_DOWNLOADS_A_YEAR_AGO_NULL_TO_ZERO = """
    UPDATE packages
    SET monthly_downloads_a_year_ago = 0
    WHERE monthly_downloads_a_year_ago IS NULL;
    """

    ABSOLUTE_TREND_UPDATE = """
    UPDATE packages
    SET absolute_trend = s.absolute_trend
    FROM (
      SELECT id, WIDTH_BUCKET(
        LOG(monthly_downloads_last_month + 1) - LOG(monthly_downloads_a_year_ago + 1),
        (SELECT MIN(LOG(monthly_downloads_last_month + 1) - LOG(monthly_downloads_a_year_ago + 1)) FROM packages),
        (SELECT MAX(LOG(monthly_downloads_last_month + 1) - LOG(monthly_downloads_a_year_ago + 1)) FROM packages),
        9
      ) AS absolute_trend
      FROM packages
    ) s
    WHERE packages.id = s.id;
    """

    RELATIVE_TREND_UPDATE = """
    UPDATE packages
    SET relative_trend = s.relative_trend
    FROM (
      SELECT id, WIDTH_BUCKET(
        LOG(monthly_downloads_last_month + 1) / (LOG(monthly_downloads_a_year_ago + 1) + 1),
        (SELECT MIN(LOG(monthly_downloads_last_month + 1) / (LOG(monthly_downloads_a_year_ago + 1) + 1)) FROM packages),
        (SELECT MAX(LOG(monthly_downloads_last_month + 1) / (LOG(monthly_downloads_a_year_ago + 1) + 1)) FROM packages),
        9
      ) AS relative_trend
      FROM packages
    ) s
    WHERE packages.id = s.id;
    """

    # Execute trending updates
    CUR.execute(MONTHLY_DOWNLOADS_LAST_MONTH_NULL_TO_ZERO)
    CUR.execute(MONTHLY_DOWNLOADS_A_YEAR_AGO_NULL_TO_ZERO)
    CUR.execute(ABSOLUTE_TREND_UPDATE)
    CUR.execute(RELATIVE_TREND_UPDATE)

    # Commit changes and close the database connection
    DB.commit()
    CUR.close()
    DB.close()
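A standalone toy sketch of the similarity step used in the last two examples (assumes an active SparkContext named sc; it is not code from either project): RowMatrix.columnSimilarities() returns a CoordinateMatrix of upper-triangular cosine similarities between columns, which is why the package vectors are first converted to mllib vectors with Vectors.fromML.

from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import RowMatrix

rows = sc.parallelize([Vectors.dense([1.0, 0.0, 2.0]),
                       Vectors.dense([0.0, 1.0, 1.0])])
sims = RowMatrix(rows).columnSimilarities()
print(sims.entries.collect())  # MatrixEntry(i, j, cosine similarity)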