Code example #1
File: spark_n.py  Project: sazzad1012/NLP_Project
import numpy as np

from pyspark.ml import Pipeline
from pyspark.ml import feature as spark_ft


def feature_extract(train_t):
    # Build two parallel branches (stop-word removal, Word2Vec embedding,
    # vector assembly) over the token columns 'ntokens1' and 'ntokens2',
    # then return both sets of embeddings as NumPy arrays.
    stopWords = spark_ft.StopWordsRemover.loadDefaultStopWords('english')

    sw_remover1 = spark_ft.StopWordsRemover(inputCol='ntokens1',
                                            outputCol='clean_tokens1',
                                            stopWords=stopWords)

    # Note: both Word2Vec stages read the raw 'ntokens1'/'ntokens2' columns,
    # so the cleaned 'clean_tokens*' outputs of the removers are not consumed.
    text2vec1 = spark_ft.Word2Vec(vectorSize=50,
                                  minCount=1,
                                  seed=123,
                                  inputCol='ntokens1',
                                  outputCol='text_vec1',
                                  windowSize=1,
                                  maxSentenceLength=100)

    assembler1 = spark_ft.VectorAssembler(inputCols=['text_vec1'],
                                          outputCol='features1')

    sw_remover2 = spark_ft.StopWordsRemover(inputCol='ntokens2',
                                            outputCol='clean_tokens2',
                                            stopWords=stopWords)

    text2vec2 = spark_ft.Word2Vec(vectorSize=50,
                                  minCount=1,
                                  seed=123,
                                  inputCol='ntokens2',
                                  outputCol='text_vec2',
                                  windowSize=1,
                                  maxSentenceLength=100)

    assembler2 = spark_ft.VectorAssembler(inputCols=['text_vec2'],
                                          outputCol='features2')

    feature_pipeline = Pipeline(stages=[
        sw_remover1, text2vec1, assembler1, sw_remover2, text2vec2, assembler2
    ])

    feature_model = feature_pipeline.fit(train_t)

    train_featurized = feature_model.transform(train_t).persist()
    tA = train_featurized.select('text_vec1').collect()
    tA_array = np.array(tA)
    tB = train_featurized.select('text_vec2').collect()
    tB_array = np.array(tB)

    return tA_array, tB_array
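A minimal usage sketch, assuming an active SparkSession and a DataFrame that already carries pre-tokenized text in 'ntokens1' and 'ntokens2' (the sample data and variable names below are illustrative, not part of the original project):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Illustrative input: each row holds two pre-tokenized text columns.
train_t = spark.createDataFrame(
    [(['spark', 'makes', 'feature', 'pipelines', 'easy'],
      ['word2vec', 'learns', 'dense', 'embeddings']),
     (['stop', 'words', 'are', 'removed'],
      ['vectors', 'are', 'assembled'])],
    ['ntokens1', 'ntokens2'])

tA_array, tB_array = feature_extract(train_t)
print(tA_array.shape, tB_array.shape)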
Code example #2
import pyspark.sql.functions as f
import pyspark.ml.feature as mlf


def canonicaltokens(df, inputColumn, outputColumn):
    """
    Turn an input column of strings into a canonical format: an output
    column of tokens added to the returned dataframe.
    """
    # Trim leading/trailing whitespace and collapse runs of spaced single
    # characters (e.g. initials such as "a b c") before tokenizing.
    newname = df.withColumn(
        "cleanname",
        f.regexp_replace(
            f.regexp_replace(f.rtrim(f.ltrim(f.col(inputColumn))),
                             r" (\w) (\w) ", "$1$2"),
            r"(\w) (\w) (\w)$", "$1$2$3"))

    newtokenizer = mlf.Tokenizer(inputCol="cleanname", outputCol="words")
    chtokenized = newtokenizer.transform(newname).drop("cleanname")

    stopwordremover = mlf.StopWordsRemover(inputCol="words", outputCol=outputColumn)
    canonicalname = stopwordremover.transform(chtokenized).drop("words")

    return canonicalname
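A quick usage sketch for canonicaltokens, assuming the imports above and an active SparkSession (the sample data is illustrative):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Illustrative input: raw name strings with stray whitespace and initials.
names = spark.createDataFrame(
    [(" A B C Holdings ",), ("The Example Company",)], ["rawname"])

canonical = canonicaltokens(names, "rawname", "nametokens")
canonical.show(truncate=False)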
Code example #3
File: encoding.py  Project: zhangyeejia/Optimus
def n_gram(df, input_col, n=2):
    """
    Converts the input array of strings inside of a Spark DF into an array of n-grams.
    :param df: Pyspark dataframe to analyze
    :param input_col: Column to analyzer.
    :param n: number of elements per n-gram >=1.
    :return: Spark DataFrame with n-grams calculated.
    """

    # is_dataframe is a validation helper defined elsewhere in this project.
    is_dataframe(df)

    # The `|` and tuple composition below relies on pipeline-combinator
    # helpers used by this project; it is not part of stock pyspark.ml.
    tokenizer = feature.Tokenizer().setInputCol(input_col) | feature.StopWordsRemover()
    count = feature.CountVectorizer()
    gram = feature.NGram(n=n) | feature.CountVectorizer()
    tf = tokenizer | (count, gram) | feature.VectorAssembler()
    tfidf = tf | feature.IDF().setOutputCol('features')

    tfidf_model = tfidf.fit(df)
    df_model = tfidf_model.transform(df)
    return df_model, tfidf_model
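A usage sketch, assuming the project's helpers are importable and an active SparkSession (the sample data is illustrative):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Illustrative input: one free-text column to turn into bigram TF-IDF features.
reviews = spark.createDataFrame(
    [('the food was great and the service was great',),
     ('the service was slow but the food was fine',)],
    ['text'])

df_model, tfidf_model = n_gram(reviews, input_col='text', n=2)
df_model.select('features').show(truncate=False)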
Code example #4
def test_unigram_and_bigram():
    df = SPARK_SESSION.sparkContext. \
        parallelize([['this is the best sentence ever'],
                     ['this is however the worst sentence available']]). \
        toDF(schema=types.StructType().add('sentence', types.StringType()))
    import requests
    stop_words = requests.get(
        'http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words'
    ).text.split()

    tokenizer = feature.Tokenizer().setInputCol(
        'sentence') | feature.StopWordsRemover(stopWords=stop_words)
    unigram = feature.CountVectorizer()
    bigram = feature.NGram() | feature.CountVectorizer()
    trigram = feature.NGram(n=3) | feature.CountVectorizer()
    tf = tokenizer | (unigram, bigram, trigram) | feature.VectorAssembler()
    tfidf = tf | feature.IDF().setOutputCol('features')

    tfidf_model = tfidf.fit(df)
    assert_equal(
        tfidf_model.transform(df).select('sentence', 'features').count(), 2)
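For readers more used to stock pyspark.ml, a rough equivalent of the composed unigram/bigram branches above (leaving out the trigram branch) can be written with an explicit Pipeline; the column names are illustrative:

from pyspark.ml import Pipeline
from pyspark.ml import feature

tokenizer = feature.Tokenizer(inputCol='sentence', outputCol='tokens')
remover = feature.StopWordsRemover(inputCol='tokens', outputCol='filtered',
                                   stopWords=stop_words)
unigram_cv = feature.CountVectorizer(inputCol='filtered', outputCol='uni_tf')
bigrams = feature.NGram(n=2, inputCol='filtered', outputCol='bigrams')
bigram_cv = feature.CountVectorizer(inputCol='bigrams', outputCol='bi_tf')
assembler = feature.VectorAssembler(inputCols=['uni_tf', 'bi_tf'], outputCol='tf')
idf = feature.IDF(inputCol='tf', outputCol='features')

pipeline = Pipeline(stages=[tokenizer, remover, unigram_cv,
                            bigrams, bigram_cv, assembler, idf])
model = pipeline.fit(df)
model.transform(df).select('sentence', 'features').show(truncate=False)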
Code example #5
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.ml import feature as ft
from pyspark.ml.clustering import LDA

# Assumes an existing SparkContext `sc` (for example, one created by the shell).
spark = SparkSession(sc)

schema = StructType([StructField('documents', StringType(), True)])
text_1 = spark.read.format('text').schema(schema).load(
    '20news-19997/20_newsgroups/alt.atheism/49960.txt')
text_2 = spark.read.format('text').schema(schema).load(
    '20news-19997/20_newsgroups/alt.atheism/51060.txt')

text_data = text_1.union(text_2)

tokenizer = ft.RegexTokenizer(inputCol='documents',
                              outputCol='input_arr',
                              pattern=r'\s+|[,.\"]')
df1 = tokenizer.transform(text_data)

stopwords = ft.StopWordsRemover(inputCol='input_arr', outputCol='input_stop')
df2 = stopwords.transform(df1)

stringIndex = ft.CountVectorizer(inputCol='input_stop',
                                 outputCol='input_indexed')
cv_model = stringIndex.fit(df2)

df3 = cv_model.transform(df2)
df3.select('input_stop', 'input_indexed').show(truncate=False)

lda = LDA(k=2, maxIter=10, optimizer='em', featuresCol='input_indexed')
model = lda.fit(df3)
print("vocal size", model.vocabSize())
print(model.topicsMatrix)

topics = model.describeTopics()
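To make the topics readable, the term indices from describeTopics() can be mapped back through the fitted CountVectorizer vocabulary; a small follow-up sketch:

# Print each topic's top terms by resolving term indices against the vocabulary.
vocab = cv_model.vocabulary
for row in topics.collect():
    top_terms = [vocab[i] for i in row.termIndices]
    print(row.topic, top_terms)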
Code example #6
train, test = processed.randomSplit(weights=[0.7, 0.3], seed=123)
print(train.count())
print(test.count())

# COMMAND ----------

# MAGIC %md #### Train Classifier

# COMMAND ----------

from pyspark.ml import Pipeline
from pyspark.ml import feature as spark_ft

stopWords = spark_ft.StopWordsRemover.loadDefaultStopWords('english')
sw_remover = spark_ft.StopWordsRemover(inputCol='ntokens',
                                       outputCol='clean_tokens',
                                       stopWords=stopWords)
tf = spark_ft.CountVectorizer(vocabSize=500,
                              inputCol='clean_tokens',
                              outputCol='tf')
idf = spark_ft.IDF(minDocFreq=5, inputCol='tf', outputCol='idf')

feature_pipeline = Pipeline(stages=[sw_remover, tf, idf])
feature_model = feature_pipeline.fit(train)

train_featurized = feature_model.transform(train).persist()

# COMMAND ----------

display(train_featurized.groupBy("label").count())
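The cell above stops at feature extraction; a hedged sketch of the "Train Classifier" step it leads into, assuming a numeric 'label' column and using logistic regression purely as an example model:

from pyspark.ml import classification as spark_cls

# Illustrative only: fit a simple classifier on the IDF features.
lr = spark_cls.LogisticRegression(featuresCol='idf', labelCol='label')
lr_model = lr.fit(train_featurized)

test_featurized = feature_model.transform(test)
predictions = lr_model.transform(test_featurized)
display(predictions.select('label', 'prediction', 'probability'))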
top_restaurants_list = [(i.name) for i in top_restaurants.collect()]
df_review_top_rest = df_yelp_review.filter(
    df_yelp_review["name"].isin(top_restaurants_list))

df_review_top_rest = df_review_top_rest.select("text").limit(10000)

tokenizer = ft.RegexTokenizer(inputCol='text',
                              outputCol='word',
                              pattern=r'\s+|[,.\"]')

tok = tokenizer \
    .transform(df_review_top_rest) \
    .select('word')

stopwords = ft.StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                outputCol='input_stop')

ngram = ft.NGram(n=2, inputCol=stopwords.getOutputCol(), outputCol="nGrams")

pipeline = Pipeline(stages=[tokenizer, stopwords, ngram])

data_ngram = pipeline \
    .fit(df_review_top_rest) \
    .transform(df_review_top_rest)

data_ngram = data_ngram.select('nGrams')

import operator

# `once` is a helper defined elsewhere in this project; it is expected to emit
# one (ngram, 1) pair per n-gram so the counts can be summed below.
FWords = data_ngram.rdd.flatMap(once)
WCount = FWords.reduceByKey(operator.add)
FreqWords = WCount.sortBy(lambda t: t[1], ascending=False).take(400)
FreqWordDict = dict(FreqWords)
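An alternative sketch of the same frequency count using the DataFrame API instead of the RDD helpers, assuming `once` emits one (ngram, 1) pair per bigram:

from pyspark.sql import functions as F

# Explode the n-gram arrays into one row per bigram, then count and rank.
bigram_counts = (data_ngram
                 .select(F.explode('nGrams').alias('bigram'))
                 .groupBy('bigram')
                 .count()
                 .orderBy(F.desc('count'))
                 .limit(400))
FreqWordDict = dict(bigram_counts.rdd.map(lambda r: (r['bigram'], r['count'])).collect())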