Example #1
def shringles(x, fileName):
    # tokenize the file contents and build k-shingles (n-grams of size x);
    # read() and shringleList come from module-level context not shown here
    tokenizer = RegexTokenizer(inputCol="value",
                               outputCol="words",
                               pattern="\\W")
    ngrams = NGram(n=x, inputCol="words", outputCol="kshringles")
    shringleList.append(ngrams.transform(tokenizer.transform(read(fileName))))
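The snippet assumes module-level context that is not shown. A minimal sketch of that context, under the assumption that read loads a text file with one line per row and shringleList collects the shingled DataFrames (those two names come from the snippet; everything else is illustrative):

from pyspark.sql import SparkSession
from pyspark.ml.feature import RegexTokenizer, NGram

spark = SparkSession.builder.appName("shingling").getOrCreate()
shringleList = []  # accumulates one shingled DataFrame per input file


def read(fileName):
    # spark.read.text yields a DataFrame with a single "value" column,
    # which matches the tokenizer's inputCol above
    return spark.read.text(fileName)

# hypothetical call: build 5-shingles for one document
# shringles(5, "docs/sample.txt")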
Example #2
def aggregate_spark(data, features, args):
    import pyspark.sql.functions as F
    from pyspark.ml.feature import StopWordsRemover, RegexTokenizer

    input_data = data.withColumn(features["col"],
                                 F.lower(F.col(features["col"])))
    regexTokenizer = RegexTokenizer(inputCol=features["col"],
                                    outputCol="token_list",
                                    pattern="\\W")
    regexTokenized = regexTokenizer.transform(input_data)  # tokenize the lowercased column

    remover = StopWordsRemover(inputCol="token_list",
                               outputCol="filtered_word_list")
    vocab_rows = (remover.transform(regexTokenized).select(
        F.explode(F.col("filtered_word_list")).alias("word")).groupBy(
            "word").count().orderBy(F.col("count").desc()).limit(
                args["vocab_size"]).select("word").collect())

    vocab = [row["word"] for row in vocab_rows]
    reverse_dict = {
        word: idx + len(args["reserved_indices"])
        for idx, word in enumerate(vocab)
    }

    return {**reverse_dict, **args["reserved_indices"]}
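A hedged usage sketch for the aggregator above; the DataFrame, column name, and argument values are invented for illustration, but their shapes match what the function expects:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
reviews_df = spark.createDataFrame(
    [("A surprisingly good movie",), ("Not a good movie at all",)], ["review"])

vocab = aggregate_spark(
    data=reviews_df,
    features={"col": "review"},
    args={"vocab_size": 10000,
          "reserved_indices": {"<PAD>": 0, "<UNKNOWN>": 1}},
)
# vocab maps the most frequent filtered tokens to indices starting at
# len(reserved_indices) == 2 and merges in the reserved indices, e.g.
# {"movie": 2, "good": 3, ..., "<PAD>": 0, "<UNKNOWN>": 1}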
Example #3
    def df_to_words(logger, df: DataFrame, input_col: str, output_col: str = "words", pattern: str = "\\W+",
                    to_lowercase: bool = True,
                    case_sensitive: bool = False) -> DataFrame:
        """
        Tokenize each string in a column into a list of words and remove stop words.
        Args:
            logger: Logger instance used to log events
            df: Dataframe to transform
            input_col: Name of the input column
            output_col: Name of the output column
            pattern: The regex pattern used to tokenize
            to_lowercase: Whether all tokens should be lowercased
            case_sensitive: Whether stop-word matching should be case sensitive

        Returns: The modified dataframe

        """
        try:
            intermediate_output = output_col + "intermediate"
            regex_tokenizer = RegexTokenizer(inputCol=input_col, outputCol=intermediate_output, pattern=pattern,
                                             toLowercase=to_lowercase)
            remover = StopWordsRemover(inputCol=intermediate_output, outputCol=output_col, caseSensitive=case_sensitive)
            logger.info("Parsing to words the dataframe")
            return remover.transform(regex_tokenizer.transform(df)).drop(intermediate_output)
        except Exception as e:
            logger.error("Parsing to words failed: {}".format(e), traceback.format_exc())
            raise e
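A hedged usage sketch, assuming df_to_words is exposed as a static helper and that the surrounding module already imports RegexTokenizer, StopWordsRemover, and traceback:

import logging

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
logger = logging.getLogger("preprocessing")

df = spark.createDataFrame(
    [("The quick brown fox jumps over the lazy dog",)], ["text"])
words_df = df_to_words(logger, df, input_col="text")
words_df.select("words").show(truncate=False)
# roughly ["quick", "brown", "fox", "jumps", "lazy", "dog"] after stop-word removal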
def extract_tokens(df):
    reTokenizer = RegexTokenizer(inputCol="Text",
                                 outputCol="clean_text",
                                 toLowercase=True,
                                 minTokenLength=3)
    newdf = reTokenizer.transform(df)
    return newdf
Example #5
def frequency_vector_DataFrame(trainDF, cluster_count):
    regTokenizer = RegexTokenizer(inputCol="reviewText", outputCol="words", pattern="[^a-z]")
    dfTokenizer = regTokenizer.transform(trainDF)

    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    df_remover = remover.transform(dfTokenizer)

    # feature extraction using Word2vec
    word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="filtered", outputCol="word2vec")
    vectors = word2Vec.fit(df_remover).getVectors()
    vectors_DF = vectors.select(vectors.word, vectors.vector.alias("features"))

    #  DF as kmeans
    kmeans = KMeans().setK(cluster_count).setSeed(1)
    km_model = kmeans.fit(vectors_DF)

    # Broadcast operation after getting the words and predictions
    vocabDF = km_model.transform(vectors_DF).select("word", "prediction")
    vocabDict = dict(vocabDF.rdd.collect())
    vocab_dict = sc.broadcast(vocabDict)

    # Cluster vector is in RDD form
    reviewsDF = df_remover.select(df_remover.filtered, df_remover.label).rdd
    clusterVectorRdd = reviewsDF.map(partial(word_to_cluster, vocab_dict=vocab_dict))


    cluster_frequency_feature_Rdd = clusterVectorRdd.map(partial(cluster_frequency_vector, cluster_count=cluster_count))

    cluster_freqDF = cluster_frequency_feature_Rdd.map(lambda xy: Row(xy[0], xy[1])).toDF()  # tuple-unpacking lambdas are Python 2 only
    cluster_freq_featureDF = cluster_freqDF.select(cluster_freqDF._1.alias("features"), cluster_freqDF._2.alias("label"))

    return cluster_freq_featureDF
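The function above relies on two module-level helpers that are not shown, word_to_cluster and cluster_frequency_vector. A minimal sketch of what they might look like, assuming the first maps each word of a review to its KMeans cluster id and the second turns that list into a fixed-length count vector:

from pyspark.ml.linalg import Vectors


def word_to_cluster(row, vocab_dict):
    # row is (filtered_words, label); vocab_dict is the broadcast word -> cluster map
    filtered, label = row
    clusters = [vocab_dict.value[w] for w in filtered if w in vocab_dict.value]
    return (clusters, label)


def cluster_frequency_vector(row, cluster_count):
    # count how many words of the review fall into each cluster
    clusters, label = row
    counts = [0.0] * cluster_count
    for c in clusters:
        counts[c] += 1.0
    return (Vectors.dense(counts), label)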
Example #6
def get_feature(dataframe=df_train_x, nFeature=200):
    # convert the input string to lowercase and then split it by regex pattern
    regexTokenizer = RegexTokenizer(inputCol="text",
                                    outputCol="words",
                                    pattern="\\W")
    words_data = regexTokenizer.transform(dataframe)
    #count_tokens = udf(lambda words: len(words), IntegerType()) # count the number of words in each review
    #words_data.select("words").withColumn("tokens", count_tokens(col("words"))).show(5,truncate=True)

    # remove stop words (e.g the, who, which, at, on, I)
    stopWordsRemover = StopWordsRemover(inputCol="words",
                                        outputCol="words_removed")
    words_removed_data = stopWordsRemover.transform(words_data)
    #count_tokens_new = udf(lambda words_removed: len(words_removed), IntegerType())
    #words_removed_data.select("words_removed").withColumn("tokens_new", count_tokens_new(col("words_removed"))).show(5,truncate=True)

    # transform input features into n-grams
    #nGram = NGram(n=2, inputCol="words_removed", outputCol="ngrams")
    #ngrams_data = nGram.transform(words_removed_data)

    # transform list of words to words frequency vectors
    hashingTF = HashingTF(inputCol="words_removed",
                          outputCol="words_freq",
                          numFeatures=nFeature)
    words_freq_data = hashingTF.transform(words_removed_data)
    #words_freq_data.select("words_freq").show(5,truncate=True)

    # compute the IDF vector and scale words frequencies by IDF
    idf = IDF(inputCol="words_freq", outputCol="features")
    idf_model = idf.fit(words_freq_data)
    feature_data = idf_model.transform(words_freq_data).select("features")

    return feature_data
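An alternative (not from the source project): the same four stages wired into a pyspark.ml Pipeline, so the fitted IDF model can be reused on unseen data:

from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, HashingTF, IDF


def build_feature_pipeline(nFeature=200):
    return Pipeline(stages=[
        RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W"),
        StopWordsRemover(inputCol="words", outputCol="words_removed"),
        HashingTF(inputCol="words_removed", outputCol="words_freq", numFeatures=nFeature),
        IDF(inputCol="words_freq", outputCol="features"),
    ])

# hypothetical usage:
# model = build_feature_pipeline().fit(df_train_x)
# train_features = model.transform(df_train_x).select("features")
# test_features = model.transform(df_test_x).select("features")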
Example #7
    def preprocessDF(self, df, cols):
        df = df.withColumn('joined_columns', functions.lower(df[cols[0]]))

        regex_tokenizer = RegexTokenizer(inputCol="joined_columns", outputCol="joinKey", pattern=r'\W+')

        df = regex_tokenizer.transform(df)
        return df
Example #8
def dedup_min_hash(df, column, id_col, min_distance=0.1):
    """
    Deduplicates a dataset using MinHash on a token count basis.

    Removes all items with a distance smaller than min_distance.
    """
    @udf("long")
    def num_nonzeros(v):
        return v.numNonzeros()

    df.cache()
    tokenizer = RegexTokenizer(inputCol=column, outputCol="tokens")
    tokens = tokenizer.transform(df)
    cv = CountVectorizer(inputCol="tokens", outputCol="token_ids")
    vectorizer_model = cv.fit(tokens)
    with_token_ids = vectorizer_model.transform(tokens).drop("tokens", column)
    with_token_ids = with_token_ids.where(
        num_nonzeros(with_token_ids.token_ids) > 0).cache()
    mh = MinHashLSH(inputCol="token_ids",
                    outputCol="hashes",
                    seed=1,
                    numHashTables=10)
    dedup_model = mh.fit(with_token_ids)
    joined = dedup_model.approxSimilarityJoin(with_token_ids, with_token_ids, 1 - min_distance, distCol="dist")\
        .drop("token_ids", "hashes")\
        .filter(f"datasetA.{id_col} < datasetB.{id_col}")
    duplicate_ids = joined.rdd.flatMap(lambda row: (row.datasetA[id_col], row.datasetB[id_col]))\
        .distinct()\
        .map(lambda el: [el])\
        .toDF()
    # keep only the rows whose id does not appear in any near-duplicate pair
    return df.join(duplicate_ids, duplicate_ids._1 == df[id_col], "left_anti")
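A hedged usage sketch (the column and id names are invented; the module-level imports the function relies on, such as udf, RegexTokenizer, CountVectorizer and MinHashLSH, are assumed). With the default min_distance=0.1, the first two rows are near-duplicates of each other and should both be dropped:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
docs = spark.createDataFrame(
    [(1, "the quick brown fox"),
     (2, "the quick brown fox jumps"),
     (3, "an entirely different sentence")],
    ["doc_id", "body"],
)
dedup_min_hash(docs, column="body", id_col="doc_id").show(truncate=False)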
Example #9
def create_TFIDF_v0(trainData,
                    applyData,
                    inputCol="text",
                    outputCol="features",
                    minDocFreq=3,
                    numFeatures=20):
    tokenizer = RegexTokenizer(pattern="[.:\s]+",
                               inputCol=inputCol,
                               outputCol="z_words")
    wordsData1 = tokenizer.transform(trainData)
    wordsData2 = tokenizer.transform(applyData)

    remover = StopWordsRemover(inputCol="z_words",
                               outputCol="z_filtered",
                               stopWords=STOPWORDS_v0)
    wordsDataFiltered1 = remover.transform(wordsData1)
    wordsDataFiltered2 = remover.transform(wordsData2)

    hashingTF = HashingTF(inputCol="z_filtered",
                          outputCol="z_rawFeatures",
                          numFeatures=numFeatures)
    featurizedData1 = hashingTF.transform(wordsDataFiltered1)
    featurizedData2 = hashingTF.transform(wordsDataFiltered2)
    # alternatively, CountVectorizer can also be used to get term frequency vectors

    idf = IDF(inputCol="z_rawFeatures",
              outputCol=outputCol,
              minDocFreq=minDocFreq)
    idfModel = idf.fit(featurizedData1)

    rescaledData = idfModel.transform(featurizedData2)
    return rescaledData.drop("z_words", "z_filtered", "z_rawFeatures",
                             inputCol)
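A hedged sketch of the CountVectorizer variant mentioned in the comment above. CountVectorizer is an estimator, so the vocabulary is fit on the training side and reused on the apply side; the function name is invented and the inputs are the stop-word-filtered DataFrames produced earlier:

from pyspark.ml.feature import CountVectorizer, IDF


def create_TFIDF_with_cv(wordsDataFiltered1, wordsDataFiltered2,
                         outputCol="features", minDocFreq=3, vocabSize=1 << 18):
    cv = CountVectorizer(inputCol="z_filtered", outputCol="z_rawFeatures",
                         vocabSize=vocabSize)
    cvModel = cv.fit(wordsDataFiltered1)
    featurizedData1 = cvModel.transform(wordsDataFiltered1)
    featurizedData2 = cvModel.transform(wordsDataFiltered2)

    idf = IDF(inputCol="z_rawFeatures", outputCol=outputCol, minDocFreq=minDocFreq)
    idfModel = idf.fit(featurizedData1)
    return idfModel.transform(featurizedData2).drop("z_filtered", "z_rawFeatures")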
Example #10
def main():
    in_directory = sys.argv[1]
    out_directory = sys.argv[2]

    comments = spark.read.json(in_directory, schema=schema)
    
    comments.cache() 
   
    wordbreak = r'[%s\s]+' % (re.escape(string.punctuation + '0123456789'),)

    # NLP processing code adapted from https://spark.apache.org/docs/latest/ml-features.html
    regexTokenizer = RegexTokenizer(inputCol="body", outputCol="words", minTokenLength=3, pattern=wordbreak)
    # alternatively, pattern="\\w+", gaps(False)

    countTokens = udf(lambda words: len(words), IntegerType())

    regexTokenized = regexTokenizer.transform(comments)
    docs = regexTokenized.select("body", "words", "subreddit")

    docs.cache()

    #extra_stop_words = ["www","http","gt"]

    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    docs = remover.transform(docs).withColumn("tokens", countTokens(col("filtered")))

    docs = docs.drop("body")
    docs = docs.drop("words") 

    docs.groupBy("subreddit").agg(functions.avg("tokens")).show()

    # threshold for post length
    lthresh = 60
    uthresh = 100
    docs = docs.filter(docs['tokens'] > lthresh)
    docs = docs.filter(docs['tokens'] < uthresh)


    docs.groupBy("subreddit").agg(functions.count("*")).show()


    #adds rank per subreddit type into a new column called rank
    ranked = docs.withColumn("rank", rank().over(Window.partitionBy("subreddit").orderBy(desc("tokens"))))

    #ranked.cache()

    group_size = 230
    #take group_size biggest docs from each group type
    ranked = ranked.filter(ranked['rank'] <= group_size)
    
    #convert arrays to columns so we can write csv
    for i in range(uthresh):
        ranked = ranked.withColumn('{0}'.format(i), ranked.filtered.getItem(i))

    #drop filtered so we can write to csv
    ranked = ranked.drop('filtered')
    ranked = ranked.drop('rank')
    ranked.show()

    ranked.write.csv(out_directory, mode='overwrite')
    def createToken(self, dataset, colName):
        dataset = dataset.drop(pc.DMXTOKENIZED)
        sentimentTokenizer = RegexTokenizer(inputCol=colName,
                                            outputCol=pc.DMXTOKENIZED,
                                            toLowercase=True,
                                            pattern="\\W")
        dataset = sentimentTokenizer.transform(dataset)
        return dataset
    def createToken(self, dataset, colName):
        dataset = dataset.drop("SA_tokenized")
        sentimentTokenizer = RegexTokenizer(inputCol=colName,
                                            outputCol="SA_tokenized",
                                            toLowercase=True,
                                            pattern="\\W")
        dataset = sentimentTokenizer.transform(dataset)
        return dataset
Example #13
def wordTokenizer(data, columns):
    for c in columns:
        new_c = c + '_tokens'
        reTokenizer = RegexTokenizer(inputCol=c,
                                     outputCol=new_c,
                                     pattern='\\W',
                                     minTokenLength=2)
        data = reTokenizer.transform(data)
    return data
    def process(reviews):
        if (reviews.isEmpty()):
            pass
        else:
            start = time.time()

            #get reviews with overall rating > 3 and overall rating < 3
            pos_reviews = reviews.filter(lambda x: x[0] > 3.0)
            neg_reviews = reviews.filter(lambda x: x[0] < 3.0)

            #set label for each class. 0.0 is positive - 1.0 is negative
            review_labels = reviews.map(lambda x: 0.0 if x[0] > 3.0 else 1.0)

            Words = Row('label', 'words')
            words = reviews.map(lambda r: Words(*r))
            words_df = spark.createDataFrame(words)

            #reviews tokenization
            token = RegexTokenizer(minTokenLength=2,
                                   pattern="[^A-Za-z]+",
                                   inputCol="words",
                                   outputCol="token",
                                   toLowercase=True)
            token_filtered = token.transform(words_df)

            #stopwords elimination
            remover = StopWordsRemover(inputCol="token",
                                       outputCol="stopwords",
                                       caseSensitive=False)
            stopwords_filtered = remover.transform(token_filtered)

            prep_filtered = (
                stopwords_filtered.select('stopwords').rdd).map(lambda x: x[0])

            #tf-idf calculation
            tf = HashingTF(numFeatures=numFeatures).transform(
                prep_filtered.map(porter_stem, preservesPartitioning=True))
            idf = IDF().fit(tf)
            train_tfidf = idf.transform(tf)

            #set training dataset with label
            training = review_labels.zip(train_tfidf).map(
                lambda x: LabeledPoint(x[0], x[1]))

            #train the model classifier
            model = SVMWithSGD.train(training, iterations=100)
            model_name = "svm" + str(counter_model)
            #save model classifier to HDFS
            output_dir = "hdfs://VM10-1-0-14:9000/classifier/" + model_name
            model.save(sc, output_dir)

            counter_model.add(1)

            end = time.time()
            print("Model Name : ", model_name, ", Total Reviews : ",
                  reviews.count(), "Processing Time : ", (end - start))
Example #15
def tokenise_concat_field(df, spark):
    """
    Take the 'concat' field, which contains the whole record as a single string
    and split it into an array of tokens
    """
    tokenizer = RegexTokenizer(
        inputCol="concat", outputCol="tokens", pattern="[\s\-\.@]")
    df = tokenizer.transform(df)
    # df = df.drop('concat')  # Needed later for overall edit distance!
    return df
Example #16
def tokenize_sentences(sentences_df):
    """ Used Spark ML tokenizer to tokenize each sentence
    :param sentences_df: one sentence per row
    :returns: same data frame with added column with tokenized array
    """
    regexTokenizer = RegexTokenizer(inputCol="sentenceText",
                                    outputCol="words",
                                    pattern="\\W")
    tokenized = regexTokenizer.transform(sentences_df)
    return tokenized
Example #17
def tokenize(p_df, in_column, out_column):
    """
    Tokenizes a column in a DataFrame.
    :param p_df: A DataFrame.
    :param in_column: Name of the input column.
    :param out_column: Name of the output column.
    :return: A DataFrame.
    """
    tokenizer = RegexTokenizer(inputCol=in_column, outputCol=out_column, pattern="\\W")
    return tokenizer.transform(p_df)
Example #18
	def process(reviews):
		if(reviews.isEmpty()):
			pass
		else:
			model_name = "dt"
			updated_model = "dt0"
			model_path, data_path, metadata_path = '','',''
			
			#performing looping process to check the availability of new model classifier
			for i in range(25,-1,-1):
				model_path = "hdfs://VM10-1-0-14:9000/classifier/"+model_name+str(i)
				updated_model = model_name+str(i)
				data_path = model_path+"/data/part-r*"
				metadata_path = model_path+"/metadata/part-00000"
				if(patherror(data_path) == False and patherror(metadata_path) == False):
					break
			
			#load model classifier
			model = DecisionTreeModel.load(sc, model_path)

			start = time.time()
			reviews_label = reviews.map(lambda x: 0.0 if x[0] > 3.0 else 1.0)
			
			Words = Row('label', 'words')
			words = reviews.map(lambda r: Words(*r))
			words_df = spark.createDataFrame(words)
			
			#review tokenization
			token = RegexTokenizer(minTokenLength=2, pattern="[^A-Za-z]+", inputCol="words", outputCol="token", toLowercase=True)
			token_filtered = token.transform(words_df)
			
			#stopwords elimination
			remover = StopWordsRemover(inputCol="token", outputCol="stopwords", caseSensitive=False)
			stopwords_filtered = remover.transform(token_filtered)

			prep_filtered = (stopwords_filtered.select('stopwords').rdd).map(lambda x: x[0])
			
			#tf-idf calculation
			tf = HashingTF(numFeatures=numFeatures).transform(prep_filtered.map(porter_stem, preservesPartitioning=True))
			idf = IDF().fit(tf)
			tfidf = idf.transform(tf)
			
			prediction = model.predict(tfidf)
			
			labeled_prediction = reviews_label.zip(prediction).map(lambda x: (float(x[1]), x[0]))
			
			metrics = MulticlassMetrics(labeled_prediction)	
			
			output = reviews.zip(prediction)
				
			filename = "hdfs://VM10-1-0-14:9000/output/" + re.sub('[^0-9]','',str(datetime.now())) + ".out"
			output.saveAsTextFile(filename)
			
			end = time.time()	
			print(updated_model,';',reviews.count(),';',metrics.accuracy,';',metrics.precision(0.0),';',metrics.precision(1.0),';',metrics.recall(0.0),';',metrics.recall(1.0),';',metrics.fMeasure(0.0),';',metrics.fMeasure(1.0),';',(end-start))
Example #19
    def createToken(self, dataset, colName):
        sentimentTokenizer = RegexTokenizer(
            inputCol=colName,
            outputCol=self.dmxTokenized,
            toLowercase=True,
            pattern="\\W"
        )  # update the constant within the predictive constants class.
        dataset = sentimentTokenizer.transform(dataset)

        dataset = self.stopWordsRemover(dataset, self.dmxTokenized)
        return dataset
Example #20
    def preprocessDF(self):
        reTokenizer = RegexTokenizer(pattern=r'\W+', inputCol='reformat2', outputCol='tokenKey', toLowercase=True)
        df_token = reTokenizer.transform(self.df)
        remover = StopWordsRemover(inputCol='tokenKey', outputCol='tokens')
        remover.setStopWords(list(self.stopWords))
        df_token = remover.transform(df_token)

        df_token = df_token.select(
            'categories', 'created_at', 'favorite_count', 'quote_count', 'reply_count',
            'retweet_count', 'user.followers_count', 'user.favourites_count',
            'user.friends_count', 'tokens')

        self.df1.write.json(os.path.join(bdenv_loc, 'twitter_parse_reformat_2018_01-06.json'), mode='append')
        df_token.write.json(os.path.join(bdenv_loc, 'twitter_parse_tokens_2018_01-06.json'), mode='append')
Example #21
def tokenize(p_df, in_column, out_column):
    """
    Tokenizes a column in a DataFrame.
    :param p_df: A DataFrame.
    :param in_column: Name of the input column.
    :param out_column: Name of the output column.
    :return: A DataFrame.
    """
    tokenizer = RegexTokenizer(inputCol=in_column,
                               outputCol=out_column,
                               pattern="\\W")
    return tokenizer.transform(p_df)
Example #22
  def tokenize(self, df1, input_col, output_col='words', pattern=None):
    "Tokenize string -> https://spark.apache.org/docs/2.2.0/ml-features.html#tokenizer"
    from pyspark.ml.feature import Tokenizer, RegexTokenizer

    if pattern:
      tokenizer = RegexTokenizer(
        inputCol=input_col, outputCol=output_col, pattern=pattern)
    else:
      tokenizer = Tokenizer(inputCol=input_col, outputCol=output_col)

    tokenized_df = tokenizer.transform(df1)

    return tokenized_df
Example #23
def tokenize(df, string_cols):
    output = df
    for c in string_cols:
        output = output.withColumn('temp', f.coalesce(f.col(c), f.lit('')))
        tokenizer = RegexTokenizer(inputCol='temp',
                                   outputCol=c + "_tokens",
                                   pattern="\\W")
        remover = StopWordsRemover(inputCol=c + "_tokens",
                                   outputCol=c + "_swRemoved")
        output = tokenizer.transform(output)
        output = remover.transform(output)\
          .drop('temp', c+"_tokens")

    return output
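A hedged usage sketch (column names are invented). Because each column is coalesced to an empty string first, tokenization does not fail on null values; the sketch assumes the same module-level imports the function above relies on (pyspark.sql.functions as f, RegexTokenizer, StopWordsRemover):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("Great product, works well", "Fast shipping"),
     (None, "Arrived broken")],
    ["review", "comment"],
)
tokenize(df, ["review", "comment"]).show(truncate=False)
# adds review_swRemoved and comment_swRemoved array columns; the null review
# becomes an empty token list instead of raising an error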
Example #24
def aggregate_spark(data, input):
    from pyspark.ml.feature import RegexTokenizer
    import pyspark.sql.functions as F
    from pyspark.sql.types import IntegerType

    regexTokenizer = RegexTokenizer(inputCol=input,
                                    outputCol="token_list",
                                    pattern="\\W")
    regexTokenized = regexTokenizer.transform(data)

    max_review_length_row = (regexTokenized.select(
        F.size(F.col("token_list")).alias("word_count")).agg(
            F.max(F.col("word_count")).alias("max_review_length")).collect())

    return max_review_length_row[0]["max_review_length"]
Example #25
    def test_preprocessing(self, content, cv):

        ## Load file
        X_files = sc.textFile('gs://chatrath/files/X_test.txt')
        X_asm_files = X_files.map(lambda x:
                                  (("gs://chatrath/data/asm/" + x + ".asm")))
        X_asm_files = X_asm_files.reduce(lambda x, y: x + "," + y)
        X_asm = sc.wholeTextFiles(X_asm_files)
        X_test_asm = X_asm.mapValues(lambda x: re.sub("""[\t{Z}]""", "", x))
        X_test_asm = X_test_asm.mapValues(
            lambda x: re.sub("""[+{Z}]+""", "", x))
        X_test_asm = X_test_asm.mapValues(
            lambda x: re.sub("""[-{Z}]+""", "", x))
        X_test_asm = X_test_asm.mapValues(
            lambda x: re.sub("""[={Z}]+""", "", x))
        X_test_asm = X_test_asm.mapValues(
            lambda x: re.sub("""[\r|{Z}]+""", "", x))
        X_test_asm = X_test_asm.mapValues(
            lambda x: re.sub("""[;{Z}]+""", "", x))
        X_test_asm = X_test_asm.mapValues(
            lambda x: re.sub("""[\n{Z}]+""", "", x))
        X_test_asm = X_test_asm.mapValues(lambda x: x.split())

        ## Filter out opcodes
        X_test_asm = X_test_asm.mapValues(
            lambda x: list(filter(lambda y: y in content, x)))
        X_test_asm = X_test_asm.mapValues(lambda x: " ".join(map(str, x)))
        X_test_asm = X_test_asm.map(lambda x:
                                    (x[0].split("/")[-1].split(".")[0], x[1]))

        ## Create test dataframe
        testdata = X_test_asm.map(
            lambda x: Row(filename=x[0], data=x[1])).toDF()

        ## Tokenizing data
        regexToken = RegexTokenizer(inputCol="data",
                                    outputCol="words",
                                    pattern="\\W")
        asm_df = regexToken.transform(testdata)
        asm_df = asm_df.drop('data')

        ## Using CountVectorizer to extract features
        # countvector = CountVectorizer(inputCol="words", outputCol="features")
        # cv = countvector.fit(asm_df)
        testdata = cv.transform(asm_df)
        # traindata = asm_df.withColumn('label', resultantdf['label'].cast('int'))

        return testdata
Example #26
def topicPredict(inputs):
    #output_path = "/user/llbui/bigdata45_500"
    output_path = "C:/Users/linhb/bigdata45_500"
    query = inputs
    n = 10  #number of similar document to return
    feature = "abstract"  #feature to compare

    df = sc.parallelize([(0, query)]).toDF(["id", feature])

    tokenizer = RegexTokenizer(inputCol=feature,
                               outputCol="words",
                               pattern="\\P{Alpha}+")
    df2 = tokenizer.transform(df)

    remover = StopWordsRemover(inputCol="words", outputCol="words2")
    df3 = remover.transform(df2)

    udf_remove_words = udf(lambda x: remove_words(x), ArrayType(StringType()))
    df4 = df3.withColumn("words3", udf_remove_words(df3.words2))

    # text to feature vector - TF_IDF
    countTF_model = CountVectorizerModel.load(output_path + "/tf_model")
    df_countTF = countTF_model.transform(df4)

    idf_model = IDFModel.load(output_path + "/idf_model")
    df_IDF = idf_model.transform(df_countTF)

    # LDA Model
    lda_model = LocalLDAModel.load(output_path + "/lda_model")

    #output topics for document -> topicDistribution
    df_Feature = lda_model.transform(df_IDF)
    feature_vector = df_Feature.select("id",
                                       "topicDistribution").collect()[0][1]
    print("Feature Vector:", feature_vector)

    #Load existing document
    df_Document = sqlCt.read.load(output_path + "/topicDistribution.parquet")
    udf_cosineSimilarity = udf(
        lambda x_vector: cosineSimilarity(x_vector, feature_vector),
        FloatType())
    df_Similarity = df_Document.withColumn(
        "similarity", udf_cosineSimilarity("topicDistribution"))
    df_Similarity_Sorted = df_Similarity.sort(desc("similarity"))
    return df_Similarity_Sorted.limit(n).select("_id", "title", "abstract",
                                                "url",
                                                "topicDistribution").collect()
Example #27
def aggregate_spark(data, features, args):
    from pyspark.ml.feature import StopWordsRemover, RegexTokenizer
    import pyspark.sql.functions as F
    from pyspark.sql.types import IntegerType

    regexTokenizer = RegexTokenizer(inputCol=features["col"],
                                    outputCol="token_list",
                                    pattern="\\W")
    regexTokenized = regexTokenizer.transform(data)

    remover = StopWordsRemover(inputCol="token_list",
                               outputCol="filtered_word_list")
    max_review_length_row = (remover.transform(regexTokenized).select(
        F.size(F.col("filtered_word_list")).alias("word_count")).agg(
            F.max(F.col("word_count")).alias("max_review_length")).collect())

    return max_review_length_row[0]["max_review_length"]
Example #28
File: ml.py Project: ribonj/lsir
def tokenize(df, column):
    """
    Tokenize alpha-numeric words. Set all tokens to lower-case and 
    remove short terms having less than 3 characters.
    """
    # creates tokenizer based on regular expressions
    wordTokenizer = RegexTokenizer(
        inputCol=column,
        outputCol='_'+column,
        pattern=r'\w+',
        minTokenLength=3  # drop terms shorter than 3 characters, as documented above
    ).setGaps(False)  # match tokens rather than gaps
    
    # transform: string --> array<string>
    df = wordTokenizer.transform(df) 
    
    df = replace(df, column, '_'+column)
    return df
Example #29
    def preprocessing(self, resultantdf, cv):

        # removal of line numbers from each file's data using a regular expression
        resultantdf = resultantdf.withColumn(
            'data', F.regexp_replace('data', '\\b\\w{3,}\\s', ''))

        # use the built-in RegexTokenizer API to tokenize the data
        regexTokenizer = RegexTokenizer(inputCol="data",
                                        outputCol="words",
                                        pattern="\\W")
        resultantdf = regexTokenizer.transform(resultantdf)
        resultantdf = resultantdf.drop('data')  # drop the no-longer-needed raw column

        # bag-of-words counts using the fitted CountVectorizer
        resultantdf = cv.transform(resultantdf)

        return (resultantdf)
    def preprocessDF(self, df, cols):
        # concatenation
        df_concat = df.withColumn("concat", concat_ws(' ', *cols))

        # Split at whitespace and characters that are not letter
        tokenizer = RegexTokenizer(inputCol="concat",
                                   outputCol="words",
                                   pattern=r'\W+')
        df_tokenizer = tokenizer.transform(df_concat)

        # stopword remover
        remover = StopWordsRemover(inputCol="words",
                                   outputCol="joinKey",
                                   stopWords=self.stopWordsBC.value)
        df_remover = remover.transform(df_tokenizer) \
            .drop("concat").drop("words")
        return df_remover
Example #31
def aggregate_spark(data, input):
    import pyspark.sql.functions as F
    from pyspark.ml.feature import RegexTokenizer

    regexTokenizer = RegexTokenizer(inputCol=input["col"],
                                    outputCol="token_list",
                                    pattern="\\W")
    regexTokenized = regexTokenizer.transform(data)

    vocab_rows = (regexTokenized.select(
        F.explode(F.col("token_list")).alias("word")).groupBy(
            "word").count().orderBy(F.col("count").desc()).limit(
                input["vocab_size"]).select("word").collect())

    vocab = [row["word"] for row in vocab_rows]
    reverse_dict = {word: 2 + idx for idx, word in enumerate(vocab)}
    reverse_dict["<PAD>"] = 0
    reverse_dict["<UNKNOWN>"] = 1
    return reverse_dict
Example #32
def bayes_cv(business_id):
    """
    Crossvalidation of bayes model
    """
    spark = yelp_lib.spark
    review = yelp_lib.get_parq('review')
    business_df = review.filter(review['business_id'] == business_id)

    regexTokenizer = RegexTokenizer(inputCol="text",
                                    outputCol="words",
                                    pattern="\\W")
    wordsDataFrame = regexTokenizer.transform(business_df)

    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    cleaned = remover.transform(wordsDataFrame)

    star_mapping = {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 1.0, 5: 1.0}

    cleaned = cleaned.replace(star_mapping, 'stars')
    cleaned = cleaned.withColumn("stars", cleaned["stars"].cast("double"))

    cv = CountVectorizer(inputCol="filtered", outputCol="features")
    model = cv.fit(cleaned)
    vectorized = model.transform(cleaned)

    vectorized = vectorized.select(
        col('stars').alias('label'), col('features'))

    splits = vectorized.randomSplit([0.6, 0.4], 1234)
    train = splits[0]
    test = splits[1]

    # create the trainer and set its parameters
    nb = NaiveBayes(smoothing=1.0)
    # train the model
    nb_model = nb.fit(train)
    # compute accuracy on the test set
    result = nb_model.transform(test)

    predictionAndLabels = result.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    return "Accuracy: " + str(evaluator.evaluate(predictionAndLabels))
from pyspark.ml.feature import Tokenizer
tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
tokenized = tkn.transform(sales.select("Description"))
tokenized.show(20, False)


# COMMAND ----------

from pyspark.ml.feature import RegexTokenizer
rt = RegexTokenizer()\
  .setInputCol("Description")\
  .setOutputCol("DescOut")\
  .setPattern(" ")\
  .setToLowercase(True)
rt.transform(sales.select("Description")).show(20, False)


# COMMAND ----------

from pyspark.ml.feature import RegexTokenizer
rt = RegexTokenizer()\
  .setInputCol("Description")\
  .setOutputCol("DescOut")\
  .setPattern(" ")\
  .setGaps(False)\
  .setToLowercase(True)
rt.transform(sales.select("Description")).show(20, False)


# COMMAND ----------
Example #34
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("TokenizerExample")\
        .getOrCreate()

    # $example on$
    sentenceDataFrame = spark.createDataFrame([
        (0, "Hi I heard about Spark"),
        (1, "I wish Java could use case classes"),
        (2, "Logistic,regression,models,are,neat")
    ], ["label", "sentence"])

    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

    regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")
    # alternatively, pattern="\\w+", gaps(False)

    tokenized = tokenizer.transform(sentenceDataFrame)
    for words_label in tokenized.select("words", "label").take(3):
        print(words_label)

    regexTokenized = regexTokenizer.transform(sentenceDataFrame)
    for words_label in regexTokenized.select("words", "label").take(3):
        print(words_label)
    # $example off$

    spark.stop()
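The comment inside the example mentions an alternative configuration (pattern="\\w+", gaps=False). A self-contained hedged sketch of that variant, which matches the tokens themselves rather than the separators:

from pyspark.ml.feature import RegexTokenizer
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("RegexTokenizerGapsExample").getOrCreate()
sentenceDataFrame = spark.createDataFrame(
    [(2, "Logistic,regression,models,are,neat")], ["label", "sentence"])

# with gaps=False the regex describes the tokens, not the delimiters
regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words",
                                pattern="\\w+", gaps=False)
regexTokenizer.transform(sentenceDataFrame).select("words").show(truncate=False)
# -> [logistic, regression, models, are, neat]

spark.stop()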
Example #35
# MAGIC Split the Wikipedia text into sentences.

# COMMAND ----------

pattern = r"(\. |\n{2,})"
import re

matches = re.findall(pattern, "Wiki page. *More information*\n\n And a line\n that continues.")
print(matches)

# COMMAND ----------

from pyspark.ml.feature import RegexTokenizer

tokenizer = RegexTokenizer(inputCol="text", outputCol="sentences", pattern=pattern)
sentences = tokenizer.transform(parsed).select("sentences")
display(sentences)

# COMMAND ----------

from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, StringType

sentenceRDD = sentences.rdd.flatMap(lambda r: r[0]).map(lambda x: Row(sentence=x))

sentenceSchema = StructType([StructField("sentence", StringType())])
sentence = sqlContext.createDataFrame(sentenceRDD, sentenceSchema)

display(sentence)

# COMMAND ----------
Example #36
import pandas as pd

from pyspark.sql import SQLContext
from pyspark.ml.feature import RegexTokenizer, HashingTF
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
from pyspark.mllib.tree import RandomForest

## Load Dataset
df_pandas = pd.read_csv('sample.csv')

## Convert to Spark Dataframe
sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(df_pandas)

## Tokenizer and Hashing 
tokenizer = RegexTokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(numFeatures=10000, inputCol="words", outputCol="features")
df_feat = hashingTF.transform(tokenizer.transform(df))

## Create LabeledPoint and Features for Prediction (predict the 1s observations)
lp = df_feat.map(lambda x: LabeledPoint(x.label, x.features))
predict_feat = df_feat.where(df_feat.label == 1).map(lambda x: x.features)


## Compare predictions from Different Models


## Logistic Regression
lrm = LogisticRegressionWithSGD.train(lp, iterations=10)
logit_predict = lrm.predict(predict_feat)
logit_predict.sum()
#9112
Example #37
# MAGIC %md
# MAGIC Split the Wikipedia text into sentences.

# COMMAND ----------

pattern = r'(\. |\n{2,})'
import re
matches = re.findall(pattern, 'Wiki page. *More information*\n\n And a line\n that continues.')
print(matches)

# COMMAND ----------

from pyspark.ml.feature import RegexTokenizer

tokenizer = RegexTokenizer(inputCol='text', outputCol='sentences', pattern=pattern)
sentences = tokenizer.transform(parsed).select('sentences')
display(sentences)

# COMMAND ----------

from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, StringType

sentenceRDD = (sentences.rdd
               .flatMap(lambda r: r[0])
               .map(lambda x: Row(sentence=x)))

sentenceSchema = StructType([StructField('sentence', StringType())])
sentence = sqlContext.createDataFrame(sentenceRDD, sentenceSchema)

display(sentence)