Example #1
 def sample_tf_idf(self, dataRDD, nd_idf, agg_idf):
     dataDF = dataRDD.map(lambda i: Row(
         **{
             'salary': int(i.salary),
             'agg': [i.education] + [i.city] + [i.work_lable] +
             [i.work_exp],
             'name_and_desp': desp_text_division(i.name + ',' + i.work_desp)
         })).toDF()
     dataDF.show()
     ndtf = HashingTF(inputCol='name_and_desp',
                      outputCol='ndFeatures',
                      numFeatures=10240)
     aggtf = HashingTF(inputCol='agg',
                       outputCol='Features_agg',
                       numFeatures=256)
     data = ndtf.transform(dataDF)
     data = aggtf.transform(data)
     data.show()
     idfdata = nd_idf.transform(data)
     idfdata = agg_idf.transform(idfdata)
     idfdata = idfdata.select('salary', 'ndfeatures', 'features_agg')
     RDD = idfdata.rdd
     featuresRDD = RDD.map(lambda i: (i.salary, i.ndfeatures.toArray(
     ).tolist() + i.features_agg.toArray().tolist()))
     return featuresRDD
Example #2
    def extract_featrues(self, train_rdd=None, test_rdd=None):
        """
        train_rdd: type rdd, the raw rdd of train data (text content, label)
        test_rdd: type rdd, the raw rdd of test data (text content, doc_id)
        return: a tuple of data frames (train_df, test_df) where each record contains the extracted features
        """
        print('****************************')
        print('Feature Extraction: TF-IDF\n')

        train_raw_df = train_rdd.map(lambda row:
                                     (self.convert(row[0]), row[1])).toDF(
                                         ['words', 'label'])
        test_raw_df = test_rdd.map(lambda row:
                                   (self.convert(row[0]), row[1])).toDF(
                                       ['words', 'doc_id'])

        ngram = NGram(n=2, inputCol="words", outputCol="ngrams")
        train_ngram_df = ngram.transform(train_raw_df).drop('words')
        test_ngram_df = ngram.transform(test_raw_df).drop('words')

        hashing_tf = HashingTF(inputCol='ngrams', outputCol='raw_features')
        train_raw_featured_data = hashing_tf.transform(train_ngram_df).drop(
            'ngrams')
        test_raw_featured_data = hashing_tf.transform(test_ngram_df).drop(
            'ngrams')

        idf = IDF(inputCol='raw_features', outputCol='features')
        idf_model = idf.fit(train_raw_featured_data)

        train_df = idf_model.transform(train_raw_featured_data).drop(
            'raw_features')
        test_df = idf_model.transform(test_raw_featured_data).drop(
            'raw_features')

        return (train_df, test_df)
Example #3
def vectorize(preprocessed_df, incl_idf=False):
    """ Generate Feature Vectors from the Pre-processed corpus using the
    hashingTF transformer on the filtered, stemmed and normalised list of Tokens
    """

    # Generate Term Frequency Feature Vectors by passing the sequence of tokens to the HashingTF Transformer.
    # Then fit an IDF Estimator to the Featurized Dataset to generate the IDFModel.
    # Finally pass the TF Feature Vectors to the IDFModel to scale based on frequency across the corpus
    if incl_idf:
        hashing_tf = HashingTF(inputCol="tokens",
                               outputCol="raw_features",
                               numFeatures=280)
        features_df = hashing_tf.transform(preprocessed_df)

        idf = IDF(inputCol="raw_features", outputCol="features")
        idf_model = idf.fit(features_df)
        scaled_features_df = idf_model.transform(features_df)

        return scaled_features_df
    else:
        hashing_tf = HashingTF(inputCol="tokens",
                               outputCol="features",
                               numFeatures=280)
        features_df = hashing_tf.transform(preprocessed_df)
        # Return the final vectorized DataFrame
        return features_df
def get_top_N(sc, major, minor, inputtext, N=5):
    # load TF-IDF feature
    #idfmodel = IDFModel.load("file:///Users/nileshbhoyar/Documents/Docker/idfmodel")
    idfmodel = load_model(sc, major, minor)
    df = load_data(sc, major, minor)
    raw_df = df.select("appl_doc_number", "claim_text")

    tokenizer = Tokenizer(inputCol="claim_text", outputCol="words")
    wordsData = tokenizer.transform(
        raw_df.dropna(how="any", subset="claim_text"))

    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
    featurizedData = hashingTF.transform(wordsData)
    rescaledData = idfmodel.transform(featurizedData)
    model = rescaledData.select("appl_doc_number", "features")
    # prepare candidate input text
    sqlContext = SQLContext(sc)
    candidate_raw = sqlContext.createDataFrame(
        [(9999999, inputtext)], ["appl_doc_number", "claim_text"])
    tokenizer = Tokenizer(inputCol="claim_text", outputCol="words")
    candidate = tokenizer.transform(candidate_raw)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
    candidateTf = hashingTF.transform(candidate)
    candidateTfIdf = idfmodel.transform(candidateTf)
    # load child models for similarity calculations.
    #model = load_model(sc,major, minor)
    # find similarities
    result = find_similar(model, candidateTfIdf, N)
    top = sorted(result, key=lambda x: -x[1])[0:N]

    return get_claim_text(sc, [int(i[0]) for i in top], major, minor, df)
Example #5
def processing1(request):
    if request.method!='POST':
        return HttpResponseRedirect('/sp/processing')
    else:
        progress=request.POST.get('progress')
        sc = SparkContext('local', 'test')
        spark = SparkSession.builder.getOrCreate()
        if progress=='1':
            city = request.POST.get('city')
            edu = request.POST.get('education')
            introduce = request.POST.get('introduce')
            position = request.POST.get('job')
            exp = request.POST.get('exp')
            print(city,edu,introduce,position,exp)
            explain1='Step 1: the submitted information is grouped by category into an RDD, then a map operation merges and converts it and tokenizes it by word frequency, producing a dataframe for the next step; the state of the information at this point is shown below'
            dataRDD = sc.parallelize([[edu, city, position, exp, introduce]])
            dataDF = dataRDD.map(lambda i: Row(**{
                'education': i[0],
                'work_area': i[1],
                'work_lable': i[2],
                'work_exp': i[3],
                'work_desp': i[4]
            })).map(lambda i: Row(**{
                'education': str(new_edu_trans(i.education)),
                'city': [i.work_area],
                'work_desp': i.work_desp,
                'work_lable': [i.work_lable],
                'work_exp': [i.work_exp]
            })).map(lambda i: Row(**{
                'agg': [i.education] + i.city + i.work_lable + i.work_exp,
                'name_and_desp': desp_text_division(i.work_desp)
            })).toDF()
            dct={}
            for i in dataDF.collect():
                dct['agg1']=i[0]
                dct['nd1']=i[1]
            # spark.stop()
            # sc.stop()
            agg_pro1='Education, city, position and experience merged into one row:'
            nd_pro1='Personal introduction as its own row:'
            agg_pro2='Vector formed from education, experience, position and city:'
            nd_pro2='Vector formed from the personal introduction:'
            explain2='Step 2: the resulting lists are turned into feature vectors via the TF-IDF algorithm in the Spark ML package, ready for use in the next machine-learning step'
            nd_idf = IDFModel.load('hdfs://localhost:9000/nd_idf_test')
            agg_idf = IDFModel.load('hdfs://localhost:9000/agg_idf_test')
            ndtf = HashingTF(inputCol='name_and_desp', outputCol='ndFeatures', numFeatures=10240)
            aggtf = HashingTF(inputCol='agg', outputCol='Features_agg', numFeatures=256)
            data = ndtf.transform(dataDF)
            data = aggtf.transform(data)
            idfdata = nd_idf.transform(data)
            idfdata = agg_idf.transform(idfdata)
            for i in idfdata.collect():
                dct['agg2']=i[3]
                dct['nd2']=i[2]
            spark.stop()
            sc.stop()
            return render(request,'processing1.html',{'data':dct,'explain1':explain1,'agg_pro1':agg_pro1,'nd_pro1':nd_pro1,
                                                      'agg_pro2':agg_pro2,'nd_pro2':nd_pro2,'explain2':explain2})
def textPredict(request):
    """6. Text clustering and popularity prediction"""
    label = request.POST['label']
    title = request.POST['title']

    conf = SparkConf().setAppName('textPredict').setMaster('spark://HP-Pavilion:7077')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    """Process the dataset and generate feature vectors"""
    dfTitles = sqlContext.read.parquet('data/roll_news_sina_com_cn.parquet')
    print(dfTitles.dtypes)
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(dfTitles)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    rescaledData.show()
    for features_label in rescaledData.select("features", "rawFeatures").take(3):
        print(features_label)
    """Train the decision tree model"""
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(rescaledData)
    featureIndexer =\
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(rescaledData)
    (trainingData, testData) = rescaledData.randomSplit([0.7, 0.3])
    dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
    model = pipeline.fit(trainingData)
    """Test the model"""
    predictions = model.transform(testData)
    predictions.show()
    predictions.select("prediction", "indexedLabel", "features").show(5)
    """Test on user data: a single news item"""
    sentenceData = sqlContext.createDataFrame([
        (label,title),
    ],['label',"title"])
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    rescaledData = idfModel.transform(featurizedData)
    myprediction = model.transform(rescaledData)
    print("==================================================")
    myprediction.show()
    resultList = convertDfToList(myprediction)

    """Evaluate the model"""
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g " % (1.0 - accuracy))

    treeModel = model.stages[2]
    print(treeModel)

    sc.stop()
    return render(request,{'resultList':resultList})
Example #7
    def get_product_similarity(self):
        """
        Calculate the similarity between items/users
        """
        product_taxonomy = self.data.select(self.productCol,
                                            self.taxonomyCol).distinct()
        product_taxonomy = self.__data_manipulation(product_taxonomy)

        hashingTF = HashingTF(inputCol=self.taxonomyCol, outputCol="tf")
        tf = hashingTF.transform(product_taxonomy)

        idf = IDF(inputCol="tf", outputCol="feature").fit(tf)
        tfidf = idf.transform(tf)

        normalizer = Normalizer(inputCol="feature", outputCol="norm")
        norma_data = normalizer.transform(tfidf)

        col1 = "i." + self.productCol
        col2 = "j." + self.productCol

        dot_udf = udf(lambda x, y: float(x.dot(y)), DoubleType())
        result = norma_data.alias("i").crossJoin(norma_data.alias("j"))\
            .select(
                col(col1).alias("i"),
                col(col2).alias("j"),
                dot_udf("i.norm", "j.norm").alias("dot"))\
            .sort("i", "j")

        result = result.filter((result.i < result.j) & (result.dot > 0.5))

        return result
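A note on the pattern above (an addition for this listing, not code from the source project): after L2 normalization, the dot product of two vectors equals their cosine similarity, which is why the crossJoin plus dot-product UDF yields pairwise cosine scores. A minimal plain-Python check of that identity, with a hypothetical l2_normalize helper:

import math

def l2_normalize(v):
    # illustrative helper, not from the source project
    norm = math.sqrt(sum(x * x for x in v))
    return [x / norm for x in v]

a, b = [1.0, 2.0, 3.0], [2.0, 0.0, 1.0]
dot_of_normalized = sum(x * y for x, y in zip(l2_normalize(a), l2_normalize(b)))
cosine = (sum(x * y for x, y in zip(a, b)) /
          (math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(x * x for x in b))))
assert abs(dot_of_normalized - cosine) < 1e-12  # identical up to rounding error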
Example #8
    def __data_manipulation(self, col):

        data = self.data.select(col, self.taxonomyCol).distinct()
        data = data.withColumn(self.taxonomyCol,
                               data[self.taxonomyCol].cast(StringType()))

        concat_list = udf(lambda lst: ", ".join(lst), StringType())
        data = data.groupby(col).agg(
            collect_list(self.taxonomyCol).alias(self.taxonomyCol))

        data = data.withColumn(self.taxonomyCol, concat_list(self.taxonomyCol))
        data = data.withColumn(
            self.taxonomyCol,
            split(regexp_replace(self.taxonomyCol, " ", ""), ','))

        hashingTF = HashingTF(inputCol=self.taxonomyCol, outputCol="tf")
        tf = hashingTF.transform(data)

        idf = IDF(inputCol="tf", outputCol="feature").fit(tf)
        tfidf = idf.transform(tf)

        normalizer = Normalizer(inputCol="feature", outputCol="norm")
        norma_data = normalizer.transform(tfidf)

        return norma_data
def tf_idf_usecase():
    spark = getSparkSession()
    sentenceData = spark.createDataFrame(
        [(0.0, "Hi I heard about Spark"),
         (0.0, "I wish Java could use case classes"),
         (1.0, "Logistic regression models are neat")], ["label", "sentence"])
    """
        Tokenizer: splits each sentence into words
    """
    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    wordsData.show(truncate=False)
    """
        HashingTF: converts the text in the words column into a bag-of-words representation
    """
    hashingTF = HashingTF(inputCol="words",
                          outputCol="rawFeatures",
                          numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    featurizedData.select("words", "rawFeatures").show(truncate=False)
    """
        TF-IDF: TF => how often a term occurs within a single document
                IDF => log((number of documents + 1) / (number of documents containing the term + 1));
                       since the number of documents is fixed, the more documents a term appears in,
                       the less important that term is
    """
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    rescaledData.select("label", "features").show(truncate=False)
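The docstring above states the smoothed IDF formula Spark ML uses. As a minimal illustration (an addition for this listing, not part of the original example), the same quantity can be computed for one term of the toy corpus in plain Python:

import math

# Sketch of Spark ML's IDF: idf(t) = log((m + 1) / (df(t) + 1)),
# where m is the number of documents and df(t) is the number of
# documents that contain term t.
docs = [
    "Hi I heard about Spark".lower().split(),
    "I wish Java could use case classes".lower().split(),
    "Logistic regression models are neat".lower().split(),
]
m = len(docs)
df_i = sum(1 for doc in docs if "i" in doc)  # "i" occurs in 2 of the 3 documents
idf_i = math.log((m + 1) / (df_i + 1))       # log(4 / 3), roughly 0.2877
print(df_i, idf_i)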
def run_minhash_lsh():
    df = util.read_all_json_from_bucket(sql_context,
                                        config.S3_BUCKET_BATCH_PREPROCESSED)

    mh = MinHashLSH(inputCol="text_body_vectorized",
                    outputCol="min_hash",
                    numHashTables=config.LSH_NUM_BANDS)

    # Vectorize so we can fit to MinHashLSH model

    htf = HashingTF(inputCol="text_body_stemmed",
                    outputCol="raw_features",
                    numFeatures=1000)
    htf_df = htf.transform(df)

    vectorizer = VectorAssembler(inputCols=["raw_features"],
                                 outputCol="text_body_vectorized")
    vdf = vectorizer.transform(htf_df)

    if (config.LOG_DEBUG):
        print(colored("[MLLIB BATCH]: Fitting MinHashLSH model...", "green"))
    model = mh.fit(vdf)

    # Compute pairwise LSH similarities for questions within tags
    if (config.LOG_DEBUG):
        print(
            colored(
                "[BATCH]: Fetching questions in same tag, comparing LSH and MinHash, uploading duplicate candidates back to Redis...",
                "cyan"))
    find_dup_cands_within_tags(model)
Example #11
def tfidf_lda(df):
    '''
    TF-IDF + LDA
    :param df: dataframe with a 'content' column holding a list of tokens per row
    :return: dataframe of (content, topicDistribution)
    '''
    # hashingTF
    hashingTF = HashingTF(inputCol="content", outputCol="features")
    df_TF = hashingTF.transform(df)
    print('df_TF')
    df_TF.show(truncate=False)
    # IDF
    idf = IDF(inputCol="features", outputCol="idf")
    model_idf = idf.fit(df_TF)
    df_idf = model_idf.transform(df_TF)
    print('df_idf')
    df_idf.cache()
    df_idf.show(truncate=False)
    # LDA
    lda = LDA(k=20, seed=1, optimizer="em")
    model_lda = lda.fit(df_idf)
    model_lda.describeTopics(maxTermsPerTopic=20)
    df_lda = model_lda.transform(df_idf)
    df_lda.select("content", "topicDistribution").show(truncate=False)
    return sparkEntrance.spark.createDataFrame(df_lda.rdd, ['content', 'topicDistribution'])
Example #12
 def vectorize(self, df, n_features=16):
     '''
     generates vectorized features from the self.traindf dataframe
     --------
     Parameters
     df: spark dataframe - object to be featurized
     n_features: int -  max number of words to be used as features
     --------
     Returns
     rescaledData: spark dataframe with vectorized and TF-IDF-rescaled features.
     '''
     self.spark.udf.register('listjoin', lambda x: ' '.join(x))
     remover = StopWordsRemover(inputCol="content", outputCol="filtered")
     df_lab_stopped = remover.transform(df)
     df_lab_stopped.registerTempTable('df_lab_stopped')
     stop_strings = self.spark.sql('''
                 SELECT listjoin(filtered) as filtered, content, label
                 FROM df_lab_stopped
                 ''')
     tokenizer = Tokenizer(inputCol="filtered", outputCol="words")
     wordsData = tokenizer.transform(stop_strings)
     hashingTF = HashingTF(inputCol="words",
                           outputCol="rawFeatures",
                           numFeatures=n_features)
     featurizedData = hashingTF.transform(wordsData)
     featurizedData.cache()
     idf = IDF(inputCol="rawFeatures", outputCol="features")
     idfModel = idf.fit(featurizedData)
     rescaledData = idfModel.transform(featurizedData)
     return rescaledData
Example #13
def test(spark):
    sc = spark.sparkContext

    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    hashingTF = HashingTF(inputCol="words",
                          outputCol="rawFeatures",
                          numFeatures=8000)
    idf = IDF(inputCol="rawFeatures", outputCol="features")

    srcdf = sc.textFile('predict.csv').map(parse_line)
    testing = srcdf.toDF()

    model = DecisionTreeClassificationModel.load('Bayes20000')

    testWordsData = tokenizer.transform(testing)
    testFeaturizedData = hashingTF.transform(testWordsData)
    testIDFModel = idf.fit(testFeaturizedData)
    testRescaledData = testIDFModel.transform(testFeaturizedData)
    testRescaledData.persist()

    testDF = testRescaledData.select("features", "label").rdd.map(
        lambda x: Row(label=float(x['label']),
                      features=Vectors.dense(x['features']))).toDF()
    predictions = model.transform(testDF)
    predictions.select('prediction').write.csv(path='submit',
                                               header=True,
                                               sep=',',
                                               mode='overwrite')

    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("The accuracy on test-set is " + str(accuracy))
Example #14
def pipeline(df):
    print(df.head())
    df = df.withColumn("length", length(df['Speech']))
    # Create the data processing pipeline functions here (note: StringIndexer will be used to encode
    # your target variable column. This column should be named 'label' so our model will recognize it later)
    review_data = Tokenizer(inputCol="Speech", outputCol="Words")
    reviewed = review_data.transform(df)
    #reviewed.show()
    remover = StopWordsRemover(inputCol="Words", outputCol="filtered")
    newFrame = remover.transform(reviewed)
    #newFrame.show()
    hashing = HashingTF(inputCol="filtered",
                        outputCol="hashedValues",
                        numFeatures=pow(2, 10))
    # Transform in a DF
    hashed_df = hashing.transform(newFrame)
    hashed_df.show(truncate=False)
    idf = IDF(inputCol="hashedValues", outputCol="feature")
    idfModel = idf.fit(hashed_df)
    rescaledData = idfModel.transform(hashed_df)
    rescaledData.select("words", "feature").show(truncate=False)
    # indexer = StringIndexer(inputCol="Party_Affliation", outputCol="label")

    # indexed = indexer.fit(rescaledData).transform(rescaledData)

    assembler = VectorAssembler(inputCols=["feature", "length"],
                                outputCol="features")

    return assembler.transform(rescaledData)
Example #15
def create_TFIDF_v0(trainData,
                    applyData,
                    inputCol="text",
                    outputCol="features",
                    minDocFreq=3,
                    numFeatures=20):
    tokenizer = RegexTokenizer(pattern=r"[.:\s]+",
                               inputCol=inputCol,
                               outputCol="z_words")
    wordsData1 = tokenizer.transform(trainData)
    wordsData2 = tokenizer.transform(applyData)

    remover = StopWordsRemover(inputCol="z_words",
                               outputCol="z_filtered",
                               stopWords=STOPWORDS_v0)
    wordsDataFiltered1 = remover.transform(wordsData1)
    wordsDataFiltered2 = remover.transform(wordsData2)

    hashingTF = HashingTF(inputCol="z_filtered",
                          outputCol="z_rawFeatures",
                          numFeatures=numFeatures)
    featurizedData1 = hashingTF.transform(wordsDataFiltered1)
    featurizedData2 = hashingTF.transform(wordsDataFiltered2)
    # alternatively, CountVectorizer can also be used to get term frequency vectors

    idf = IDF(inputCol="z_rawFeatures",
              outputCol=outputCol,
              minDocFreq=minDocFreq)
    idfModel = idf.fit(featurizedData1)

    rescaledData = idfModel.transform(featurizedData2)
    return rescaledData.drop("z_words", "z_filtered", "z_rawFeatures",
                             inputCol)
Example #16
def get_feature(dataframe=df_train_x, nFeature=200):
    # convert the input string to lowercase and then split it by regex pattern
    regexTokenizer = RegexTokenizer(inputCol="text",
                                    outputCol="words",
                                    pattern="\\W")
    words_data = regexTokenizer.transform(dataframe)
    #count_tokens = udf(lambda words: len(words), IntegerType()) # count the number of words in each review
    #words_data.select("words").withColumn("tokens", count_tokens(col("words"))).show(5,truncate=True)

    # remove stop words (e.g the, who, which, at, on, I)
    stopWordsRemover = StopWordsRemover(inputCol="words",
                                        outputCol="words_removed")
    words_removed_data = stopWordsRemover.transform(words_data)
    #count_tokens_new = udf(lambda words_removed: len(words_removed), IntegerType())
    #words_removed_data.select("words_removed").withColumn("tokens_new", count_tokens_new(col("words_removed"))).show(5,truncate=True)

    # transform input features into n-grams
    #nGram = NGram(n=2, inputCol="words_removed", outputCol="ngrams")
    #ngrams_data = nGram.transform(words_removed_data)

    # transform list of words to words frequency vectors
    hashingTF = HashingTF(inputCol="words_removed",
                          outputCol="words_freq",
                          numFeatures=nFeature)
    words_freq_data = hashingTF.transform(words_removed_data)
    #words_freq_data.select("words_freq").show(5,truncate=True)

    # compute the IDF vector and scale words frequencies by IDF
    idf = IDF(inputCol="words_freq", outputCol="features")
    idf_model = idf.fit(words_freq_data)
    feature_data = idf_model.transform(words_freq_data).select("features")

    return feature_data
def get_word(text):
    # Split the sentence into individual words
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    wordsData = tokenizer.transform(text)
    #wordsData.show()

    # # Remove stop words
    # remover = StopWordsRemover() \
    #         .setStopWords(mystopwords) \
    #         .setCaseSensitive(False) \
    #         .setInputCol("words") \
    #         .setOutputCol("filtered")
    # remover.transform(wordsData).show()

    # TF vectorization step - HashingTF to hash the sentence into a feature vector.
    hashingTF = HashingTF(inputCol="words",
                          outputCol="rawFeatures",
                          numFeatures=50)
    featurizedData = hashingTF.transform(wordsData)
    featurizedData.show()

    # IDF vectorization step - IDF to rescale the feature vectors
    # idf = IDF(inputCol="rawFeatures", outputCol="features")
    # idfModel = idf.fit(featurizedData)  # fit learns from the data held in the text variable

    # Final TF-IDF vectorization result
    # rescaledData = idfModel.transform(featurizedData)
    # rescaledData.show()
    result = featurizedData.select('words', 'rawFeatures').rdd.map(lambda x: x)
    for i in result.collect():
        print(i)
    return True
Example #18
 def calculate_hashingtf_idf(self, files_df):
     hashing_tf = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=262144)
     featurized_data = hashing_tf.transform(files_df)
     idf = IDF(inputCol="rawFeatures", outputCol="features")
     idf_model = idf.fit(featurized_data)
     rescaled_data = idf_model.transform(featurized_data)
     return rescaled_data
Example #19
def feature_engineering(class_balancedDf):
    # N-Gram
    ngram = NGram(n=2, inputCol="lemmatized", outputCol="ngrams")
    ngramDataFrame = ngram.transform(class_balancedDf)

    # Hashing TF
    hashingTF = HashingTF(inputCol="ngrams",
                          outputCol="rawFeatures",
                          numFeatures=20)
    featurizedData = hashingTF.transform(ngramDataFrame)

    # IDF
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    # K-Means
    kmeans = KMeans().setK(6).setSeed(1)
    kmodel = kmeans.fit(rescaledData).transform(rescaledData)

    #LDA
    lda = LDA(k=10, maxIter=10)
    ldamodel = lda.fit(kmodel).transform(kmodel)

    # changing label column to int
    data = ldamodel.withColumn(
        "label", ldamodel.label.cast("Integer")).drop("prediction")

    return data
def tf_idf(words):
    hashing_tf = HashingTF(numFeatures=1000, inputCol="words", outputCol="tf")
    tf = hashing_tf.transform(words)
    tf.cache()
    idf = IDF(minDocFreq=3, inputCol="tf", outputCol="features")
    model = idf.fit(tf)
    idf_res = model.transform(tf)
    return idf_res
 def __preprocess_tdfidf(self, df: DataFrame):
     hashingTF = HashingTF().setInputCol("preprocessedData").setOutputCol(
         "tf").setNumFeatures(1500000)
     idf = IDF().setInputCol("tf").setOutputCol("features")
     df = hashingTF.transform(df)
     df_model = idf.fit(df)
     df = df_model.transform(df)
     return df
def tf_idf_feature(wordsData):
    hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    for features_label in rescaledData.select("features", "id").take(3):
        print(features_label)
Example #23
 def ece_idf(self, mergeRDD):
     dataDF = mergeRDD.map(lambda p: Row(**{'edu_city_exp': p[1]})).toDF()
     ece_hashingTF = HashingTF(inputCol='edu_city_exp',
                               outputCol='eceFeatures',
                               numFeatures=64)
     featuresData = ece_hashingTF.transform(dataDF)
     ece_idf = IDF(inputCol='eceFeatures', outputCol='ecefeatures')
     ece_idfModel = ece_idf.fit(featuresData)
     return ece_idfModel
Example #24
def tf_idf_feature(wordsData):
    hashingTF = HashingTF(inputCol="filtered",
                          outputCol="rawFeatures",
                          numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    for features_label in rescaledData.select("features", "id").take(3):
        print(features_label)
Example #25
File: ml.py Project: ribonj/lsir
def term_frequency(df, column):
    """
    Compute term-frequency of a token contained in a column.
    Transformation: array<string> --> vector
    """ 
    tf = HashingTF(inputCol=column, outputCol='_'+column)
    df = tf.transform(df)
    
    df = replace(df, column, '_'+column)
    return df
Example #26
 def nl_idf(self, mergeRDD):
     dataDF = mergeRDD.map(
         lambda p: Row(**{'leibie and name': p[2]})).toDF()
     nl_hashingTF = HashingTF(inputCol='leibie and name',
                              outputCol='nlFeatures',
                              numFeatures=256)
     featuresData = nl_hashingTF.transform(dataDF)
     nl_idf = IDF(inputCol='nlFeatures', outputCol='nlfeatures')
     nl_idfModel = nl_idf.fit(featuresData)
     return nl_idfModel
Example #27
def extract_tf_features(p_df, input_col, output_col):
    """
    Extracts TF features.
    :param p_df: A DataFrame.
    :param input_col: Name of the input column.
    :param output_col: Name of the output column.
    :return: A DataFrame.    
    """
    hashingTF = HashingTF(inputCol=input_col, outputCol=output_col, numFeatures=3000)
    return hashingTF.transform(p_df)
Example #28
def tokenize(df):
    tokenizer = Tokenizer(inputCol="itemdesc", outputCol="tokenizedText")
    tokenizedData = tokenizer.transform(df)
    numFeatures = 1000
    hashingScheme = HashingTF(inputCol="tokenizedText",
                              outputCol="features",
                              numFeatures=numFeatures)
    featurizedData = hashingScheme.transform(tokenizedData)
    processedData = featurizedData.withColumn("label", featurizedData["label"]) \
                             .select(["features", "label"])
    return processedData
Example #29
def run_tf_idf_spark_ml(df, numFeatures=1 << 20):
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    wordsData = tokenizer.transform(df)

    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numFeatures)
    featurizedData = hashingTF.transform(wordsData)

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)

    return idfModel.transform(featurizedData)
def tfidf(dataframe, in_col1, out_col1, in_col2, out_col2, n):

    global idfModel
    
    hashingTF = HashingTF(inputCol=in_col1, outputCol=out_col1, numFeatures=n)
    featurizedData = hashingTF.transform(dataframe)
    idf = IDF(inputCol=in_col2, outputCol=out_col2)
    idfModel = idf.fit(featurizedData)
    dataframe = idfModel.transform(featurizedData)
    
    return dataframe
Example #31
def vectorizer_pipeline(preprocessed_df):
    
    """ Generate Feature Vectors from the Pre-processed corpus using the 
    hashingTF transformer on the filtered, stemmed and normalised list of Tokens
    """
    
    hashingTF = HashingTF(inputCol="tokens", outputCol="features", numFeatures=280)
    features_df = hashingTF.transform(preprocessed_df)
    
    # Return the final vectorized DataFrame
    return features_df
Example #32
def train(spark):
    sc = spark.sparkContext
    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    hashingTF = HashingTF(inputCol="words",
                          outputCol="rawFeatures",
                          numFeatures=8000)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    srcdf = sc.textFile('part.csv').map(parse_line)
    srcdf = srcdf.toDF()
    training, testing = srcdf.randomSplit([0.9, 0.1])

    wordsData = tokenizer.transform(training)
    featurizedData = hashingTF.transform(wordsData)
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    rescaledData.persist()

    trainDF = rescaledData.select("features", "label").rdd.map(
        lambda x: Row(label=float(x['label']),
                      features=Vectors.dense(x['features']))).toDF()
    naivebayes = NaiveBayes()
    model = naivebayes.fit(trainDF)

    testWordsData = tokenizer.transform(testing)
    testFeaturizedData = hashingTF.transform(testWordsData)
    testIDFModel = idf.fit(testFeaturizedData)
    testRescaledData = testIDFModel.transform(testFeaturizedData)
    testRescaledData.persist()

    testDF = testRescaledData.select("features", "label").rdd.map(
        lambda x: Row(label=float(x['label']),
                      features=Vectors.dense(x['features']))).toDF()
    predictions = model.transform(testDF)
    predictions.show()

    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("The accuracy on test-set is " + str(accuracy))
    model.save('Bayes20000')
Example #33
def nlpTransform(data):
    tokenizer = Tokenizer(inputCol="combi_text", outputCol="words")
    wordsData = tokenizer.transform(data)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
    featurizedData = hashingTF.transform(wordsData)
    scaler = StandardScaler(inputCol="rawFeatures",
                            outputCol="features",
                            withStd=True,
                            withMean=False)
    featureData = scaler.fit(featurizedData)
    featureD = featureData.transform(featurizedData)
    return featureD
Example #34
def extract_tf_features(p_df, input_col, output_col):
    """
    Extracts TF features.
    :param p_df: A DataFrame.
    :param input_col: Name of the input column.
    :param output_col: Name of the output column.
    :return: A DataFrame.    
    """
    hashingTF = HashingTF(inputCol=input_col,
                          outputCol=output_col,
                          numFeatures=3000)
    return hashingTF.transform(p_df)
Example #35
    def test_apply_binary_term_freqs(self):

        df = self.spark.createDataFrame([(0, ["a", "a", "b", "c", "c", "c"])], ["id", "words"])
        n = 10
        hashingTF = HashingTF()
        hashingTF.setInputCol("words").setOutputCol("features").setNumFeatures(n).setBinary(True)
        output = hashingTF.transform(df)
        features = output.select("features").first().features.toArray()
        expected = Vectors.dense([1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]).toArray()
        for i in range(0, n):
            self.assertAlmostEqual(features[i], expected[i], 14, "Error at " + str(i) +
                                   ": expected " + str(expected[i]) + ", got " + str(features[i]))
def classify_tweets(inbound_dataset):
    # Run the cleansing UDF for tweet column
    udf_cleansing = functions.udf(cleansing)
    inbound_dataset = inbound_dataset.withColumn(
        "tweet_cleansed", udf_cleansing(functions.col("tweet")))

    # Tokenizing
    from pyspark.ml.feature import Tokenizer
    tokenizer = Tokenizer(inputCol="tweet_cleansed", outputCol="words")
    inbound_dataset = tokenizer.transform(inbound_dataset)

    # Generating features
    from pyspark.ml.feature import HashingTF
    features_generator = HashingTF(inputCol="words", outputCol="features")
    inbound_dataset = features_generator.transform(inbound_dataset)

    model_folder = os.path.join(os.getcwd(), "saved_models")
    model_full_path = os.path.join(model_folder, "twitter_sentiment_spark")
    if not os.path.exists(model_folder):
        print("model does not exists")

    from pyspark.ml.classification import NaiveBayesModel
    loaded_model = NaiveBayesModel.load(model_full_path)

    # Classifying using saved model
    classified = loaded_model.transform(inbound_dataset)

    spark = getSparkSessionInstance(inbound_dataset.rdd.context.getConf())
    if files_source == "hdfs":
        labels = spark.read.load(os.path.join("file://" + model_folder,
                                              "labels.csv"),
                                 format="csv",
                                 header=True)
    else:
        labels = spark.read.load(os.path.join(model_folder, "labels.csv"),
                                 format="csv",
                                 header=True)

    classified = classified.join(labels,
                                 classified["NB_pred"] == labels["label_id"])

    udf_get_probability = functions.udf(get_probability)
    classified = classified.withColumn(
        "probability",
        udf_get_probability(functions.col("NB_prob"),
                            functions.col("NB_pred")))

    classified = classified.withColumn(
        "label_predicted",
        functions.when(classified.probability < probability_threshold,
                       "2").otherwise(classified.label_predicted))

    return classified
Example #37
def termFrequency(table):

    #calculates the term frequency of attributes
    hashingTF = HashingTF(inputCol='key_words', outputCol='hashing')
    tf = hashingTF.transform(table)
    tf.cache()

    #normalises the term frequency data
    normalizer = Normalizer(inputCol='hashing', outputCol='norm')
    term = normalizer.transform(tf)

    return term
def predictLabel(label,title,model):
    """Predict the label of a news item"""
    sentenceData = sqlContext.createDataFrame([
        (label,title),
    ],['label',"title"])
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    rescaledData = idfModel.transform(featurizedData)
    myprediction = model.transform(rescaledData)
    return myprediction
def create_features(raw_data):
    #Create DataFrame
    data_df = sqlContext.createDataFrame(raw_data.map(lambda r : Row(appid=r[0], price=r[1], sentence=r[2])))
    #Transform sentence into words
    tokenizer = Tokenizer(inputCol='sentence', outputCol='words')
    words_df = tokenizer.transform(data_df)
    #Calculate term frequency
    hashingTF = HashingTF(inputCol='words', outputCol='rawFeatures', numFeatures=5)
    featurized_df = hashingTF.transform(words_df)
    #Calculate inverse document frequency
    idf = IDF(inputCol='rawFeatures', outputCol='features')
    idfModel = idf.fit(featurized_df)
    return idfModel.transform(featurized_df)
def tf_feature_vectorizer(df,no_of_features,ip_col):
    #from pyspark.sql.functions import udf
    #from pyspark.sql.types import *
    output_raw_col = ip_col+"raw_features"
    output_col = ip_col+"features"
    hashingTF = HashingTF(inputCol=ip_col, outputCol=output_raw_col, numFeatures=no_of_features)
    featurizedData = hashingTF.transform(df)
    idf = IDF(inputCol=output_raw_col, outputCol=output_col)
    idfModel = idf.fit(featurizedData)
    rescaled_data = idfModel.transform(featurizedData)
    rescaled_data.show(5)
    print(rescaled_data.count())
    return rescaled_data
Example #41
def makeTFIDF(sc, spark, reviews):
    # count vectorizer and tfidf
    # cv = CountVectorizer(inputCol='words_clean', outputCol='tf')
    # cvModel = cv.fit(reviews)
    # reviews = cvModel.transform(reviews)

    # HashingTF for fewer dimensions:
    hashingtf = HashingTF(inputCol='words_clean', outputCol='tf', numFeatures=1000)
    reviews = hashingtf.transform(reviews)

    # create TF-IDF matrix
    idf = IDF().setInputCol('tf').setOutputCol('tfidf')
    tfidfModel = idf.fit(reviews)
    reviews = tfidfModel.transform(reviews)
    return reviews
Example #42
 def append_tf_idf(self, df):
     """
     Calculate term frequency and inverse document frequency,
      in this case based on at least one visit per hour. Compares how often the tokens appeared
      at least once per hour relative to other tokens. Not used for the main purpose of the project.
     Args:
         :param df: Dataframe parameter.
     Returns:
         :return:  Dataframe with term frequency and inverse document frequency added in the columns
                     'rawFeatures' and 'features' respectively.
     """
     #Create TF column.
     hashingTF = HashingTF(inputCol="tokens", outputCol="rawFeatures", numFeatures=100000)
     tf = hashingTF.transform(df)
     tf.persist(StorageLevel.MEMORY_AND_DISK)
     #Create IDF column.
     idf = IDF(inputCol="rawFeatures", outputCol="features")
     idfModel = idf.fit(tf)
     tfidf = idfModel.transform(tf)
     return tfidf
Example #43
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("tf_idf_sample") \
    .master("local[*]") \
    .getOrCreate()

df1 = spark.createDataFrame([
    (0, "a a a b b c"),
    (0, "a b c"),
    (1, "a c a a d")]).toDF("label", "sentence")

tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

# Split each sentence into words
df2 = tokenizer.transform(df1)

hashingTF = HashingTF(inputCol="words", outputCol="TF-Features", numFeatures=20)
df3 = hashingTF.transform(df2)

df3.cache()

idf = IDF(inputCol="TF-Features", outputCol="Final-Features")
idfModel = idf.fit(df3)

rescaledData = idfModel.transform(df3)
rescaledData.select("words", "TF-Features", "Final-Features").show()

spark.stop()
data = pd.read_csv("sms_spam.csv")
#print(data.head(5))
    
##creating rdd file
sc = SparkContext("local", "app")
sqc = SQLContext(sc)
df = sqc.createDataFrame(data, ['type', 'text'])

#NEW VARIABLE GENERATION
dataCleaned = df.map(lambda x: (1 if x['type'] == 'spam' else 0, tokenize(x['text'])))
dataClean = dataCleaned.map(lambda x: (float(x[0]), x[1]))
dfClean = sqc.createDataFrame(dataClean, ['label', 'words'])
dfClean.show(5)

hashingTF = HashingTF(inputCol="words", outputCol="rawtf-idf", numFeatures=1000)
tf = hashingTF.transform(dfClean)
idf = IDF(inputCol="rawtf-idf", outputCol="features").fit(tf)
dfFinal = idf.transform(tf)

# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(dfFinal)
# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(dfFinal)

# Split the data into training and test sets (20% held out for testing)
(trainingData, testData) = dfFinal.randomSplit([0.8, 0.2])


# Train the model.
#rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
def review_to_words(raw_review):
    # 1. Remove HTML markup
    review_text = BeautifulSoup(raw_review).text
    #
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", review_text) 
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                                            
    # 
    # 4. Remove stop words
    meaningful_words =  [w for w in words if not w in stops]   
    #
    # 5. Join the words back into one string separated by space, 
    # and return the result.
    return " ".join( meaningful_words)   

stops = set(stopwords.words("english")) 
lines = sc.textFile("s3://spark-project-data/labeledTrainData.tsv")
rows = lines.zipWithIndex().filter(lambda (row,index): index > 0).keys()
parts = rows.map(lambda l: l.split("\t"))

review = parts.map(lambda p: Row(id=p[0], label=float(p[1]), 
	review=review_to_words(p[2])))
schemeReview = sqlContext.createDataFrame(review)
tokenizer = Tokenizer(inputCol="review", outputCol="words")
wordsData = tokenizer.transform(schemeReview)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=300)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
selectData = rescaledData.select("label","features")
def main(sc, sqlContext):
    start = timer()

    stpwrds = stopwords.words('english')
    tbl_translate = dict.fromkeys(i for i in xrange(sys.maxunicode) if unicodedata.category(unichr(i)).startswith('S') or unicodedata.category(unichr(i)).startswith('P') or unicodedata.category(unichr(i)).startswith('N'))

    print '---Fetching products---'
    start_i = timer()
    productRDD = sc.parallelize(findProductsByCategory([]))
    print '#### took %d seconds' % (timer()-start_i)

    print '---Building the corpus---'
    start_i = timer()
    corpusRDD = (productRDD.map(lambda s: (s[0], word_tokenize(s[1].translate(tbl_translate).lower()), s[2], s[3]))
                           .map(lambda s: (s[0], [PorterStemmer().stem(x) for x in s[1] if x not in stpwrds], s[2], s[3] ))
                           .map(lambda s: (s[0], [x[0] for x in pos_tag(s[1]) if x[1] == 'NN' or x[1] == 'NNP'], s[2], s[3]))
                           .cache())
    print '#### took %d seconds' % (timer()-start_i)

    print '---Fetching and persisting category and token data---'
    start_i = timer()
    tokens = corpusRDD.flatMap(lambda x: x[1]).distinct().collect()
    numTokens = len(tokens)
    category = productRDD.map(lambda x: x[2]).distinct().collect()
    categoryAndSubcategory = productRDD.map(lambda x: (x[2], x[3])).distinct().collect()
    insertTokensAndCategories(tokens, category, categoryAndSubcategory)
    print '#### took %d seconds' % (timer()-start_i)

    print '---Computing TF-IDF for the products---'
    start_i = timer()
    wordsData = corpusRDD.map(lambda s: Row(label=s[0], words=s[1], category=s[2], subcategory=s[3]))
    # persist this so it does not have to be recomputed at prediction time
    wordsDataDF = sqlContext.createDataFrame(wordsData)

    # persisting for prediction
    wordsDataForPrediction = corpusRDD.map(lambda s: Row(label=s[0], words=s[1], type=s[2]))
    # persist this so it does not have to be recomputed at prediction time
    wordsDataForPredictionDF = sqlContext.createDataFrame(wordsDataForPrediction)

    if os.path.exists("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet"):
        shutil.rmtree("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet")

    wordsDataForPredictionDF.write.parquet("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet") 

    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numTokens)
    idf = IDF(inputCol="rawFeatures", outputCol="features")

    featurizedData = hashingTF.transform(wordsDataDF)
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    #VSM = rescaledData.map(lambda t: LabeledPoint(categoryAndSubcategory.index((t.category, t.subcategory)), t.features))
    VSM = rescaledData.map(lambda t: LabeledPoint(category.index(t.category), t.features))

    VSMTrain, VSMTest = VSM.randomSplit([8, 2], seed=0L)
    print '#### took %d seconds' % (timer()-start_i)


    print '--Building the Naive Bayes model---'
    start_i = timer()
    model = NaiveBayes.train(VSMTrain)

    if os.path.exists("/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria"):
        shutil.rmtree("/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria")

    model.save(sc, '/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria')
    print '#### took %d seconds' % (timer()-start_i)

    print '---Testing the Naive Bayes model---'
    start_i = timer()
    prediction = VSMTest.map(lambda p : (categoryAndSubcategory[int(model.predict(p.features))], categoryAndSubcategory[int(p.label)]))
    acuraccy = float(prediction.filter(lambda (x, v): x[0]==v[0]).count())/float(prediction.count())
    print 'accuracy of %f' % acuraccy
    print '#### took %d seconds' % (timer()-start_i)
    
    print '---Fetching the posts---'

    start_i = timer()
    posts = list()
    wb = load_workbook(filename = '/home/ubuntu/recsys-tcc-ml/base_sentimentos.xlsx')
    sheet = wb['Menes']
    for row in sheet.iter_rows(row_offset=1):
        post = list()
        for cell in row:
            if cell.value is None:
                break
            post.append(1 if cell.value == 'Positive' or cell.value == 'Neutral' else 0 if cell.value == 'Negative' else removeAccents(cell.value))

        if len(post) > 0:            
            posts.append(tuple(post))

    print '#### took %d seconds' % (timer()-start_i)

    print '---Building the corpus---'
    start_i = timer()
    postsRDD = sc.parallelize(posts)
    postCorpusRDD = (postsRDD.map(lambda s: (s[1], word_tokenize(s[0].translate(tbl_translate).lower())))
                           .map(lambda s: (s[0], [PorterStemmer().stem(x) for x in s[1] if x not in stpwrds]))
                           .map(lambda s: (s[0], [x[0] for x in pos_tag(s[1]) if x[1] == 'NN' or x[1] == 'NNP']))
                           .cache())

    print '#### took %d seconds' % (timer()-start_i)

    print '---Computing TF-IDF for the posts---'
    start_i = timer()
    wordsData = postCorpusRDD.map(lambda s: Row(label=s[0], words=s[1]))
    wordsDataDF = sqlContext.createDataFrame(wordsData)

    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numTokens)
    idf = IDF(inputCol="rawFeatures", outputCol="features")

    featurizedData = hashingTF.transform(wordsDataDF)
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)   

    VSM = rescaledData.map(lambda t: LabeledPoint(t.label, t.features))
    VSMTrain, VSMTest = VSM.randomSplit([8, 2], seed=0L)
    print '#### took %d seconds' % (timer()-start_i)

    print '--Building the SVM model---'
    start_i = timer()
    model = SVMWithSGD.train(VSMTrain, iterations=100)
    
    if os.path.exists("/home/ubuntu/recsys-tcc-ml/models/svm"):
        shutil.rmtree("/home/ubuntu/recsys-tcc-ml/models/svm")

    model.save(sc, "/home/ubuntu/recsys-tcc-ml/models/svm")

    print '---Testing the SVM model---'
    start_i = timer()
    prediction = VSMTest.map(lambda p: (p.label, model.predict(p.features)))
    acuraccy = prediction.filter(lambda (v, p): v != p).count() / float(prediction.count())
    
    print 'accuracy of %f' % acuraccy

    print '#### took %d seconds' % (timer()-start_i)

    print 'The whole process took %d seconds' % (timer()-start)
Example #47
from pyspark.sql import Row
from pyspark.ml.feature import HashingTF, IDF, Tokenizer


df = spark.read.load('/home/manh/Documents/data/result_pre.parquet')
df = df.select('id', 'stemmed')
rdd =  df.select('stemmed').rdd
pre_idf = rdd.map(lambda x: set(x[0])).flatMap(lambda x: x).map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)
pre_idf_collect =  pre_idf.collect()

rdd_words = pre_idf.map(lambda x: Row(word=[x[0]]))

df_words = spark.createDataFrame(rdd_words)

hashingTF = HashingTF(inputCol="word", outputCol="rawFeatures", numFeatures=100000)

featurizedData = hashingTF.transform(df_words)

featurizedData.rdd.map(lambda x: (x.word[0], x['rawFeatures'].indices[0])).map(lambda x: '%s  %s' % (x)).collect()
Example #48
	(20,"Apple iPhone Apple iPhone 6 16GB 412 2 cell 2895"),
	(20,"iPhone 6 T Mobile 16 GB"),
	(20,"Apple 6 16gb T Mobile")
], ["label","text"])

# Learn a mapping from words to Vectors.
#word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="textVec")
#model = word2Vec.fit(documentDF)
#result = model.transform(documentDF)
#print result.take(2)

tokenizer = Tokenizer(inputCol="text", outputCol="tokenizedText")
tokenizedTextData = tokenizer.transform(documentDF)

hashingTF = HashingTF(inputCol="tokenizedText", outputCol="rawFeatures")
featurizedData = hashingTF.transform(tokenizedTextData)

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
result1 = idfModel.transform(featurizedData)


for features_label in result1.select("label","features").take(10):
  print(features_label)


wordsvectors = result1["label","features"].map(lambda row: LabeledPoint(row[0], row[1]))



Example #49

linest = sc.textFile("/Users/admin/Desktop/KBSApp/KBSApp/permissionsData/dataSets/SVMDataGroundTruth.txt")
partst = linest.map(lambda l: l.split(","))
ft = partst.map(lambda p: Row(tindex=int(p[0]),packageName=p[1],packagePermissions=p[2],label= int(float(p[3])),training=0))
alldata = f.union(ft)

schemaApp = sqlContext.createDataFrame(alldata)

schemaApp.registerTempTable("data")

tokenizer = Tokenizer(inputCol="packagePermissions", outputCol="perms")
permsData = tokenizer.transform(schemaApp)

hashingTF = HashingTF(inputCol="perms", outputCol="rawFeatures")
featurizedData = hashingTF.transform(permsData)


idf = IDF(inputCol="rawFeatures", outputCol="features")


idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

wordsvectors = rescaledData["label","features"].map(lambda row: LabeledPoint(row[0], row[1]))
model = LogisticRegressionWithLBFGS.train(wordsvectors, iterations=100)

labelsAndPreds = wordsvectors.map(lambda p: (p.label, model.predict(p.features)))

trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(wordsvectors.count())
print("Training Error = " + str(trainErr))
Example #50
from pyspark.ml.feature import RegexTokenizer, HashingTF
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
from pyspark.mllib.tree import RandomForest

## Load Dataset
df_pandas = pd.read_csv('sample.csv')

## Convert to Spark Dataframe
sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(df_pandas)

## Tokenizer and Hashing 
tokenizer = RegexTokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(numFeatures=10000, inputCol="words", outputCol="features")
df_feat = hashingTF.transform(tokenizer.transform(df))

## Create LabeledPoint and Features for Prediction (predict the 1s observations)
lp = df_feat.map(lambda x: LabeledPoint(x.label, x.features))
predict_feat = df_feat.where(df_feat.label == 1).map(lambda x: x.features)


## Compare predictions from Different Models


## Logistic Regression
lrm = LogisticRegressionWithSGD.train(lp, iterations=10)
logit_predict = lrm.predict(predict_feat)
logit_predict.sum()
#9112
# COMMAND ----------

from pyspark.ml.feature import HashingTF, IDF
tf = HashingTF()\
  .setInputCol("DescOut")\
  .setOutputCol("TFOut")\
  .setNumFeatures(10000)
idf = IDF()\
  .setInputCol("TFOut")\
  .setOutputCol("IDFOut")\
  .setMinDocFreq(2)


# COMMAND ----------

idf.fit(tf.transform(tfIdfIn)).transform(tf.transform(tfIdfIn)).show(10, False)


# COMMAND ----------

from pyspark.ml.feature import Word2Vec
# Input data: Each row is a bag of words from a sentence or document.
documentDF = spark.createDataFrame([
    ("Hi I heard about Spark".split(" "), ),
    ("I wish Java could use case classes".split(" "), ),
    ("Logistic regression models are neat".split(" "), )
], ["text"])
# Learn a mapping from words to Vectors.
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text",
  outputCol="result")
model = word2Vec.fit(documentDF)
def main(sc, sqlContext):

    #start = timer()

    #print '---Fetching user, posts, tokens and categories from MongoDB---'
    #start_i = timer()
    user = findUserById(iduser)
    posts = findPosts(user) 
    
    tokens, category, categoryAndSubcategory = getTokensAndCategories()
    postsRDD = (sc.parallelize(posts).map(lambda s: (s[0], word_tokenize(s[1].lower()), s[2], s[3]))
                    .map(lambda p: (p[0], [x for x in p[1] if x in tokens] ,p[2], p[3]))
                    .cache())

    

    #print '#### took %d seconds' % (timer() - start_i)

    #print '---Fetching products from MongoDB---'
    #start_i = timer()

    #print '#### took %d seconds' % (timer() - start_i)

    #print '---Building the corpusRDD---'
    #start_i = timer()
    stpwrds = stopwords.words('portuguese')
    corpusRDD = (postsRDD.map(lambda s: (s[0], [PorterStemmer().stem(x) for x in s[1] if x not in stpwrds], s[2], s[3]))
                         .filter(lambda x: len(x[1]) >= 20 or (x[2] == u'Post' and len(x[1])>0))
                         .cache())
    #print '#### took %d seconds' % (timer() - start_i)

    #print '---Computing TF-IDF---'
    #start_i = timer()
    wordsData = corpusRDD.map(lambda s: Row(label=int(s[0]), words=s[1], type=s[2]))
    wordsDataDF = sqlContext.createDataFrame(wordsData).unionAll(sqlContext.read.parquet("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet"))


    numTokens = len(tokens)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numTokens)
    idf = IDF(inputCol="rawFeatures", outputCol="features")

    featurizedData = hashingTF.transform(wordsDataDF)

    idfModel = idf.fit(featurizedData)
    tfIDF = idfModel.transform(featurizedData).cache()

    postTFIDF = (tfIDF
                    .filter(tfIDF.type==u'Post')
                    #.map(lambda s: Row(label=s[0], type=s[1], words=s[2], rawFeatures=s[3], features=s[4], sentiment=SVM.predict(s[4])))
                    .cache())

    #postTFIDF = postTFIDF.filter(lambda p: p.sentiment == 1)
    #print '#### took %d seconds' % (timer() - start_i)

    #print '---Loading the model---'
    #start_i = timer()
    NB = NaiveBayesModel.load(sc, '/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria')
    SVM = SVMModel.load(sc, "/home/ubuntu/recsys-tcc-ml/models/svm")
    #print '#### took %d seconds' % (timer() - start_i)

    #print '---Using the model---'
    #start_i = timer()
    predictions = (postTFIDF
                        .map(lambda p: (NB.predict(p.features), p[0], SVM.predict(p.features)))
                        .filter(lambda p: p[2]==1)
                        .map(lambda p: (p[0], p[1]))
                        .groupByKey()
                        .mapValues(list)
                        .collect())

    #print '#### took %d seconds' % (timer() - start_i)
    #print '---Computing similarities---'
    #start_i = timer()
    suggestions = []

    for prediction in predictions:
        category_to_use = category[int(prediction[0])]
        #print ' Computing similarities for category: {}'.format(category_to_use)
        tf = tfIDF.filter(tfIDF.type==category_to_use).cache()
        for post in prediction[1]:
            postVector = postTFIDF.filter(postTFIDF.label == post).map(lambda x: x.features).collect()[0]
            sim = (tf
                    .map(lambda x: (post, x.label, cossine(x.features, postVector)))
                    .filter(lambda x: x[2]>=threshold)
                    .collect())
            if len(sim) > 0:
                suggestions.append(sim)

    #print '#### took %d seconds' % (timer() - start_i)

    if len(suggestions) > 0:
        #print '---Inserting recommendations into MongoDB---'
        #start_i = timer()
        insertSuggestions(suggestions, iduser, posts)
# the feature vectors: this is apparently what works best.
from pyspark.ml.feature import Normalizer
normalizerUni = Normalizer(inputCol='words',outputCol='normWords',p=2.0)
normalizerBi = Normalizer(inputCol="bigrams",outputCol='normBigrams',p=2.0)
dfNorm = normalizerUni.transform(dfVect2)
dfNorm2 = normalizerBi.transform(dfNorm)
print "DataFrame(bi-gram): normalized"
dfNorm2.select('words','normWords').show()
# The difference does not show up in the table because there is only room to display the indices
# of the non-zero elements, not their values
# On to TF-IDF
# Of course, by choosing the right dataframe among those above, these computations can be applied
# to any column (bigrams, with or without stop words...)
from pyspark.ml.feature import HashingTF
htf = HashingTF(inputCol='words',outputCol='wordsTF',numFeatures=10000)
dfTrainTF = htf.transform(dfTrainTokNoSw)
# Inverse document frequency
from pyspark.ml.feature import IDF
idf = IDF(inputCol=htf.getOutputCol(),outputCol="wordsTFIDF")
idfModel = idf.fit(dfTrainTF)
dfTrainTFIDF = idfModel.transform(dfTrainTF)
dfTrainTFIDF.select('review','wordsTF','wordsTFIDF').show()

# I know this step was useful to me once; here it does not seem to be
from pyspark.ml.feature import StringIndexer
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(dfTrainTFIDF)
dfTrainFinal = string_indexer_model.transform(dfTrainTFIDF)
dfTrainFinal.select('review','label','target_indexed').show()