def extract_featrues(self, train_rdd=None, test_rdd=None):
    """
    train_rdd: type rdd, the raw rdd of train data (text content, label)
    test_rdd: type rdd, the raw rdd of test data (text content, doc_id)
    return: type data frame, a data frame where each record contains the extracted features
    """
    print('****************************')
    print('Feature Extraction: TF-IDF\n')

    train_raw_df = train_rdd.map(lambda row: (self.convert(row[0]), row[1])).toDF(
        ['words', 'label'])
    test_raw_df = test_rdd.map(lambda row: (self.convert(row[0]), row[1])).toDF(
        ['words', 'doc_id'])

    ngram = NGram(n=2, inputCol="words", outputCol="ngrams")
    train_ngram_df = ngram.transform(train_raw_df).drop('words')
    test_ngram_df = ngram.transform(test_raw_df).drop('words')

    hashing_tf = HashingTF(inputCol='ngrams', outputCol='raw_features')
    train_raw_featured_data = hashing_tf.transform(train_ngram_df).drop('ngrams')
    test_raw_featured_data = hashing_tf.transform(test_ngram_df).drop('ngrams')

    idf = IDF(inputCol='raw_features', outputCol='features')
    idf_model = idf.fit(train_raw_featured_data)

    train_df = idf_model.transform(train_raw_featured_data).drop('raw_features')
    test_df = idf_model.transform(test_raw_featured_data).drop('raw_features')

    return (train_df, test_df)
def generate_nlp_columns(input_dataset, target):
    udf_remove_punc = udf(lambda s: removePunctuation(s))

    # Remove punctuation
    input_dataset = input_dataset.withColumn(target, udf_remove_punc(target))

    # Tokenize title
    tokenizer = Tokenizer(inputCol=target, outputCol=target + "_words")
    input_dataset = tokenizer.transform(input_dataset)

    # Remove stop words
    remover = StopWordsRemover(inputCol=target + "_words", outputCol=target + "_cleanwords")
    input_dataset = remover.transform(input_dataset)

    # Generate n-grams
    ngram = NGram(n=2, inputCol=target + "_cleanwords", outputCol=target + "_bigrams")
    input_dataset = ngram.transform(input_dataset)
    trigram = NGram(n=3, inputCol=target + "_cleanwords", outputCol=target + "_trigrams")
    input_dataset = trigram.transform(input_dataset)

    # Drop extra columns - leave n-grams only.
    input_dataset = input_dataset.drop(target + "_words")
    input_dataset = input_dataset.drop(target + "_cleanwords")

    # Perform TF-IDF
    # hashingTF = HashingTF(inputCol=target + "_trigrams", outputCol=target + "_hashing", numFeatures=20)
    # input_dataset = hashingTF.transform(input_dataset)
    # idf = IDF(inputCol=target + "_hashing", outputCol=target + "_features")
    # idfModel = idf.fit(input_dataset)
    # input_dataset = idfModel.transform(input_dataset)

    return input_dataset
def test(opcodes, hashFiles, sc, sqlc, path, featureFitModel):
    asmFiles = hashFiles.map(
        lambda x: "gs://uga-dsp/project2/data/asm/" + x + ".asm")

    def fun(accum, x):
        return accum + ',' + x

    asmFileString = asmFiles.reduce(fun)
    rdd1 = sc.wholeTextFiles(asmFileString, 20)

    opcodesInDoc = rdd1.map(lambda x: x[1].split()).map(
        lambda x: [word for word in x if word in opcodes.value]).zipWithIndex(
        ).map(lambda x: (x[1], x[0]))

    ngramFrame = sqlc.createDataFrame(opcodesInDoc, ["docId", "opcodes"])

    twoGram = NGram(n=2, inputCol="opcodes", outputCol="2grams")
    ngramFrame = twoGram.transform(ngramFrame)

    threeGram = NGram(n=3, inputCol="opcodes", outputCol="3grams")
    ngramFrame = threeGram.transform(ngramFrame)

    fourGram = NGram(n=4, inputCol="opcodes", outputCol="4grams")
    ngramFrame = fourGram.transform(ngramFrame)

    def getSegment(x):
        templist = []
        for line in x:
            l = re.findall(r'\w+:?(?=:)', line)
            if l:
                templist.append(l[0])
        return templist

    segments = rdd1.zipWithIndex().map(lambda x: (x[1], x[0][1].splitlines()))\
        .map(lambda x: (x[0], getSegment(x[1]))).toDF(["docId", "segments"])

    featureFrame = ngramFrame.join(segments, "docId")

    featuresDF = featureFrame.rdd.map(
        lambda x: Row(did=x['docId'],
                      docFeatures=x['opcodes'] + x['2grams'] + x['3grams'] +
                      x['4grams'] + x['segments'])).toDF()

    featuresCV = featureFitModel.transform(featuresDF)
    testData = featuresCV.drop('docFeatures')

    testData.persist(StorageLevel(True, True, False, False, 1))
    saveData(testData, path)
    testData.show()
def ngrram(dataframe, column, x):
    tokens = Tokenizer(inputCol=column, outputCol='tokens')
    nn = NGram(n=x, inputCol='tokens', outputCol='ngrams')
    b = tokens.transform(dataframe)
    a = nn.transform(b)
    # show() returns None, so keep the selected DataFrame and display it separately
    final = a.select(['tokens', 'ngrams'])
    final.show(4)
    return final
def Ngram_feature(N, feature_rdd):
    '''
    Extract and count N-grams. Keep only the top 1000 n-gram features when N >= 2.
    Input:
        feature_rdd : [(<hash1>,<feature1>), (<hash1>,<feature2>), ..., (<hashN>,<featureK>)]
    Output:
        freq_ngram_count_rdd : [((<hash>,<ngram feature>),cnt), ...]
    '''
    feature_rdd = feature_rdd.groupByKey().map(lambda x: (x[0], list(x[1])))
    df = spark.createDataFrame(feature_rdd).toDF("file_names", "features")
    ngram = NGram(n=N, inputCol="features", outputCol="ngrams")
    ngramDataFrame = ngram.transform(df)
    ngram_rdd = ngramDataFrame.rdd.map(tuple).map(lambda x: (x[0], x[2])).flatMapValues(lambda x: x)
    ngram_count_rdd = ngram_rdd.map(lambda x: ((x), 1)).reduceByKey(add)
    freq_ngram_count_rdd = ngram_count_rdd
    if not N == 1:
        # [(<ngram feature>,cnt), ...]
        topN_ngram_count_rdd = freq_ngram_count_rdd.map(lambda x: (x[0][1], x[1])).reduceByKey(add)
        # [((<ngram feature>,cnt),index), ...]
        topN_ngram_count_rdd = topN_ngram_count_rdd.sortBy(lambda x: x[1], ascending=False).zipWithIndex()
        length = topN_ngram_count_rdd.count()
        # top [(<ngram feature>,cntSum), ...]
        topN_ngram_count_rdd = topN_ngram_count_rdd.filter(lambda x: x[1] < 1000).map(lambda x: x[0])
        # freq [(<ngram feature>,(<hash>,cnt)), ...]
        freq_ngram_count_rdd = freq_ngram_count_rdd.map(lambda x: (x[0][1], (x[0][0], x[1])))
        # [(<ngram feature>,(cntSum,(<hash>,cnt))), ...]
        freq_ngram_count_rdd = topN_ngram_count_rdd.join(freq_ngram_count_rdd).map(
            lambda x: ((x[1][1][0], x[0]), x[1][1][1]))
    return freq_ngram_count_rdd
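# A minimal usage sketch for Ngram_feature (hypothetical data; assumes the same
# globals the function relies on: an active SparkSession named `spark` and
# `from operator import add`).
from operator import add

pairs = spark.sparkContext.parallelize([
    ("hash1", "mov"), ("hash1", "push"), ("hash1", "call"),
    ("hash2", "mov"), ("hash2", "jmp"),
])
bigram_counts = Ngram_feature(2, pairs)
# e.g. [(('hash1', 'mov push'), 1), (('hash1', 'push call'), 1), (('hash2', 'mov jmp'), 1)]
print(bigram_counts.collect())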
def extract_collocations(records, num_collocations, collocation_window):
    """Extracts the most common collocations present in the records.

    Params:
    - records (pyspark.rdd.RDD): The tokenized and lemmatized records from the JSON file
    - num_collocations (int): The number of collocations to show
    - collocation_window (int): The text window within which to search for collocations.

    Returns:
    - best_collocations (list<tuple<str, int>>): The highest scored collocations present
      in the records, with their frequency of occurrence in the dataset.
    """
    # @see: https://spark.apache.org/docs/2.2.0/ml-features.html#n-gram
    from pyspark.ml.feature import NGram

    data_frame = records.map(lambda record: Row(record[constants.VALUE])).toDF(['words'])
    ngram_model = NGram(n=2, inputCol='words', outputCol='ngrams')
    ngram_data_frame = ngram_model.transform(data_frame)
    ngram_rdd = ngram_data_frame.select('ngrams').rdd
    ngram_rdd = ngram_rdd.flatMap(lambda row: row['ngrams'])\
        .map(lambda ngram: (ngram.encode('utf-8'), 1))\
        .reduceByKey(add)\
        .sortBy(lambda bigram_with_count: bigram_with_count[1], ascending=False)
    rdd_show(ngram_rdd)
    frequent_collocations = ngram_rdd.take(num_collocations)
    return frequent_collocations
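# Hypothetical usage sketch for extract_collocations: `records` is an RDD of dicts
# keyed by constants.VALUE, each holding a list of lemmatized tokens (the data here is
# illustrative only; `sc` stands in for the active SparkContext).
records = sc.parallelize([
    {constants.VALUE: ["machine", "learning", "is", "fun"]},
    {constants.VALUE: ["machine", "learning", "at", "scale"]},
])
top_bigrams = extract_collocations(records, num_collocations=3, collocation_window=2)
# e.g. [('machine learning', 2), ('learning is', 1), ('is fun', 1)]
print(top_bigrams)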
def test(allHex, hashFiles, sc, sqlc, path, featureFitModel):
    bytesFiles = hashFiles.map(lambda x: "gs://uga-dsp/project2/data/bytes/" + x + ".bytes")

    def fun(accum, x):
        return accum + ',' + x

    bytesFileString = bytesFiles.reduce(fun)
    rdd1 = sc.wholeTextFiles(bytesFileString, 20)

    bytesRdd = rdd1.map(lambda x: x[1].split()).map(
        lambda x: [str(int(word, 16)) for word in x if word in allHex.value]
    ).zipWithIndex().map(lambda x: (x[1], x[0]))

    Vec = bytesRdd.map(lambda x: (x[0], createVector(x[1])))
    sparseVec = Vec.map(lambda x: (x[0], SparseVector(256, numpy.nonzero(x[1])[0], x[1][x[1] > 0])))

    ngramFrame = sqlc.createDataFrame(sparseVec, ["did", "1grams"])

    twoGram = NGram(n=2, inputCol="1grams", outputCol="2grams")
    ngramFrame = twoGram.transform(ngramFrame)

    # the DataFrame column is named "did", and the fitted CountVectorizer model
    # expects the combined "docFeatures" column built here
    featuresDF = ngramFrame.rdd.map(
        lambda x: Row(did=x['did'], docFeatures=x['1grams'] + x['2grams'])).toDF()

    featuresCV = featureFitModel.transform(featuresDF)
    testData = featuresCV.drop('docFeatures')

    testData.persist(StorageLevel(True, True, False, False, 1))
    saveData(testData, path)
    testData.show()
def shringles(x, fileName):
    # tokenize and build n-gram shingles
    tokenizer = RegexTokenizer(inputCol="value", outputCol="words", pattern="\\W")
    ngrams = NGram(n=x, inputCol="words", outputCol="kshringles")
    shringleList.append(ngrams.transform(tokenizer.transform(read(fileName))))
def train(allHex, labels, hashFiles, sc, sqlc, path):
    bytesFiles = hashFiles.map(lambda x: "gs://uga-dsp/project2/data/bytes/" + x + ".bytes")

    def fun(accum, x):
        return accum + ',' + x

    bytesFileString = bytesFiles.reduce(fun)
    rdd1 = sc.wholeTextFiles(bytesFileString, 20)

    bytesRdd = rdd1.map(lambda x: x[1].split()).map(
        lambda x: [word for word in x if word in allHex.value]
    ).zipWithIndex().map(lambda x: (x[1], x[0]))

    ngramFrame = sqlc.createDataFrame(bytesRdd, ["did", "1grams"])

    twoGram = NGram(n=2, inputCol="1grams", outputCol="2grams")
    ngramFrame = twoGram.transform(ngramFrame)

    # the column is named "did"; combine unigrams and bigrams into one feature list
    featuresDF = ngramFrame.rdd.map(
        lambda x: Row(did=x['did'], docFeatures=x['1grams'] + x['2grams'])).toDF()

    # fit and apply the CountVectorizer on the frame that actually has "docFeatures"
    cv = CountVectorizer(inputCol="docFeatures", outputCol="features", vocabSize=1000)
    featureFitModel = cv.fit(featuresDF)
    featuresCV = featureFitModel.transform(featuresDF)

    labelRdd = labels.zipWithIndex().map(lambda x: (x[1], x[0]))
    labelFrame = labelRdd.toDF(["did", "label"])

    trainData = featuresCV.join(labelFrame, "did")
    trainData.persist(StorageLevel(True, True, False, False, 1))
    saveData(trainData, path)
    trainData.show()
    return featureFitModel
def feature_engineering(class_balancedDf):
    # N-Gram
    ngram = NGram(n=2, inputCol="lemmatized", outputCol="ngrams")
    ngramDataFrame = ngram.transform(class_balancedDf)

    # Hashing TF
    hashingTF = HashingTF(inputCol="ngrams", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(ngramDataFrame)

    # IDF
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    # K-Means
    kmeans = KMeans().setK(6).setSeed(1)
    kmodel = kmeans.fit(rescaledData).transform(rescaledData)

    # LDA
    lda = LDA(k=10, maxIter=10)
    ldamodel = lda.fit(kmodel).transform(kmodel)

    # cast the label column to integer
    data = ldamodel.withColumn("label", ldamodel.label.cast("Integer")).drop("prediction")

    return data
def create_ngram(self, df, n, input_col, output_col='ngrams'):
    """Generate N-Gram -> https://spark.apache.org/docs/2.2.0/ml-features.html#n-gram"""
    from pyspark.ml.feature import NGram
    ngram = NGram(n=n, inputCol=input_col, outputCol=output_col)
    ngram_df = ngram.transform(df)
    return ngram_df
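# Hypothetical usage sketch for create_ngram: `helper` stands in for an instance of
# whatever class defines the method, and `spark` for an active SparkSession.
tokens_df = spark.createDataFrame(
    [(0, ["spark", "ml", "ngram", "demo"])], ["id", "tokens"])
bigrams_df = helper.create_ngram(tokens_df, n=2, input_col="tokens")
# ngrams: [spark ml, ml ngram, ngram demo]
bigrams_df.select("ngrams").show(truncate=False)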
def test_ngram(self):
    dataset = self.spark.createDataFrame([Row(input=["a", "b", "c", "d", "e"])])
    ngram0 = NGram(n=4, inputCol="input", outputCol="output")
    self.assertEqual(ngram0.getN(), 4)
    self.assertEqual(ngram0.getInputCol(), "input")
    self.assertEqual(ngram0.getOutputCol(), "output")
    transformedDF = ngram0.transform(dataset)
    self.assertEqual(transformedDF.head().output, ["a b c d", "b c d e"])
def main():
    # basic cleaning and getting of files
    get_moby()
    sentences = get_sentences()

    # create spark app, for use in an iPython notebook OR as a standalone
    spark = SparkSession\
        .builder\
        .appName("NGramSample")\
        .getOrCreate()

    # build a distributed dataframe
    sentence_df = spark.createDataFrame(sentences, ['id', 'sentences'])

    # create a tokenizer and write a 'words' column to the DF
    tokenizer = Tokenizer(inputCol='sentences', outputCol='words')
    words = tokenizer.transform(sentence_df)

    # create ngram generators for bi-, tri-, and quad-grams
    bigram = NGram(n=2, inputCol='words', outputCol='bigrams')
    trigram = NGram(n=3, inputCol='words', outputCol='trigrams')
    quadgram = NGram(n=4, inputCol='words', outputCol='quadgrams')

    # add each one in turn to the df
    bigrams = bigram.transform(words)
    trigrams = trigram.transform(bigrams)
    final = quadgram.transform(trigrams)

    # write as traversable JSON
    if os.path.exists('ngrams'):
        shutil.rmtree('ngrams')
    final.coalesce(1).write.json('ngrams')

    # as an example, write out bigram counts to CSV
    if os.path.exists('bigrams'):
        shutil.rmtree('bigrams')

    # This tricky bit selects bigrams, explodes it, and regroups by unique
    # bigram, then adds a count, after filtering out extremely uncommon bigrams.
    # It finally writes to a CSV.
    final.select('bigrams')\
        .withColumn('bigrams', explode('bigrams'))\
        .groupBy('bigrams').count().orderBy('count', ascending=False)\
        .filter('count > 10')\
        .coalesce(1).write.csv('bigrams')
def calculate_vectors(data, n=2, binary=False):
    ngram = NGram(n=n, inputCol="sequence", outputCol="ngrams")
    ngramDataFrame = ngram.transform(data)
    ngrams = ngramDataFrame.select("ngrams")
    cvectorizer = CountVectorizer(inputCol="ngrams", outputCol="vec", binary=binary)
    model = cvectorizer.fit(ngrams)
    return model.transform(ngrams).select("vec")
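# Hypothetical usage sketch for calculate_vectors: the input DataFrame needs an array
# column named "sequence" (the column name the function hard-codes); `spark` stands in
# for an active SparkSession.
data = spark.createDataFrame(
    [(["A", "T", "G", "C"],), (["G", "C", "A", "T"],)], ["sequence"])
vectors = calculate_vectors(data, n=2, binary=True)
vectors.show(truncate=False)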
def opcode_ngram(df_opcode, N):
    """
    Generates n-gram opcodes from the opcode data frame.
    Returns n-gram opcodes as RDD((filename, n-gram), total_counts).
    """
    ngrams = NGram(n=N, inputCol="opcode", outputCol="ngrams")
    df_ngrams = ngrams.transform(df_opcode)
    rdd_ngrams = df_ngrams.select("filename", "ngrams").rdd.map(tuple).flatMapValues(lambda x: x)\
        .map(lambda x: ((x[0], x[1]), 1)).reduceByKey(add)
    return rdd_ngrams
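# Hypothetical usage sketch for opcode_ngram: one row per file with an array column of
# opcodes (column names follow the function above); assumes `spark` and
# `from operator import add` are available, as the function itself requires.
df_opcode = spark.createDataFrame(
    [("f1", ["mov", "push", "call", "mov", "push"])], ["filename", "opcode"])
rdd_bigrams = opcode_ngram(df_opcode, 2)
# e.g. [(('f1', 'mov push'), 2), (('f1', 'push call'), 1), (('f1', 'call mov'), 1)]
print(rdd_bigrams.collect())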
def transformData(df, parameter):
    '''
    Transform the dataframe based on the parameter
    Input  : - parameter
    Output : - transformed dataframe
    '''
    ngram = NGram(n=parameter["n"], inputCol=parameter["inputCol"], outputCol=parameter["outputCol"])
    temp = ''
    if len(ngram.transform(df).head().inputTokens) < ngram.getN():
        print('No element in ' + parameter["outputCol"])
    else:
        # keep the transformed DataFrame so the function returns it, as documented
        temp = ngram.transform(df)
        temp.show()
    return temp
def bytes_ngram(df_bytes, n):
    """
    Generates n-gram bytes from the bytes data frame.
    Returns n-gram bytes as RDD((hash, n-gram), total_counts).
    """
    ngrams = NGram(n=n, inputCol="bytes", outputCol="ngrams")
    df_ngrams = ngrams.transform(df_bytes)
    rdd_ngrams = df_ngrams.select("hash", "ngrams").rdd.map(tuple).flatMapValues(lambda x: x)\
        .map(lambda x: ((x[0], x[1]), 1)).reduceByKey(add)
    return rdd_ngrams
def make_ngrams(df, n=1):
    df = df.withColumn('normalized_text', processing(F.col('text')))
    tokenizer = Tokenizer(inputCol="normalized_text", outputCol="tokens")
    tokenized = tokenizer.transform(df).drop('normalized_text')
    ngram = NGram(n=n, inputCol="tokens", outputCol="n_gram")
    n_gram_df = ngram.transform(tokenized)
    n_gram_df = n_gram_df.withColumn('n_gram', F.explode('n_gram'))
    n_gram_df = n_gram_df.filter(F.length('n_gram') > 2)
    return n_gram_df
def ngram(dataframe, in_col, out_col, n):
    ngram = NGram(n=n, inputCol=in_col, outputCol=out_col)
    dataframe = ngram.transform(dataframe)

    # summarise top n-grams
    dataframe\
        .groupBy(out_col)\
        .count()\
        .sort(col("count").desc())\
        .show()

    return dataframe
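# Hypothetical usage sketch for the ngram helper above: compute and summarise bigrams
# of a tokenized column (assumes `spark` is an active SparkSession and `col` is imported
# from pyspark.sql.functions, as the function itself requires).
tokens_df = spark.createDataFrame(
    [(["to", "be", "or", "not", "to", "be"],),
     (["to", "be", "or", "not", "to", "be"],)],
    ["words"])
bigrams_df = ngram(tokens_df, in_col="words", out_col="bigrams", n=2)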
def learn(self, text_df):
    """Spark transformation to learn the adjacent terms of a given ngram"""
    ngram = NGram(n=self.n, inputCol='tokenized_text', outputCol='ngram')
    ngram_df = ngram.transform(text_df)

    # create the ngram to adjacent term mappings
    ngram_list = ngram_df.select("ngram").rdd.map(lambda r: r['ngram']).collect()
    self.ngram_model = ngram_df.rdd \
        .map(lambda x: PreProcess.generate_adjacent_terms(x.asDict()['ngram'])) \
        .flatMap(lambda xs: [x for x in xs]) \
        .map(lambda y: (y[0], [y[1]])) \
        .reduceByKey(lambda a, b: a + b).collect()

    # create a list of the keys in the model and store them
    # (ngram_model is a plain Python list after collect(), so iterate over it directly)
    self.model_keys = [x[0] for x in self.ngram_model]
def convert_ngrams(df, column):
    # convert tokens to n-grams
    n = 5
    # generate 1-grams through n-grams
    for i in range(1, n + 1):
        ngram = NGram(n=i, inputCol=column, outputCol='{}_{}'.format(column, i))
        df = ngram.transform(df)
    return df.withColumn(
        column,
        concat(*['{}_{}'.format(column, i) for i in range(1, n + 1)])).drop(
            *['{}_{}'.format(column, i) for i in range(1, n + 1)])
def n_gram_fingerprint(df, input_cols, n_size=2):
    """
    Calculate the n-gram for a fingerprinted string
    :param df: Dataframe to be processed
    :param input_cols: Columns to be processed
    :param n_size: Size of the n-grams
    :return:
    """

    def remote_white_spaces_remove_sort_join(value, args):
        # remove white spaces
        value = [x.replace(" ", "") for x in value]

        # sort and remove duplicates
        value = sorted(set(value))

        # join the tokens back together
        value = "".join(value)
        return value

    input_cols = parse_columns(df, input_cols)
    for input_col in input_cols:
        ngram_col = name_col(input_col, NGRAM_COL)
        ngram_fingerprint_col = name_col(input_col, NGRAM_FINGERPRINT_COL)

        df = (df.cols.copy(input_col, name_col(input_col, NGRAM_COL))
              .cols.lower(ngram_col)
              .cols.remove_white_spaces(ngram_col)
              .cols.remove_special_chars(ngram_col)
              .cols.remove_accents(ngram_col)
              # For creating n-grams we need an Array type column
              .cols.nest(input_cols=ngram_col, output_col=ngram_col, shape='array'))

        if Optimus.cache:
            df = df.cache()

        n_gram = NGram(n=n_size, inputCol=ngram_col, outputCol=ngram_fingerprint_col)
        df = n_gram.transform(df)
        df = df.cols.apply(ngram_fingerprint_col, remote_white_spaces_remove_sort_join, "string")

    return df
def get_ngrams(cases, region_path):
    if (debug):
        logging.info(region_path)
    for case_path in tqdm(cases):
        parsed = parse_file(case_path)
        text = get_case_text(parsed)
        date = get_decision_date(parsed).year
        state = parsed("case|court").attr('jurisdiction').strip()
        text = text.encode("ascii", "ignore")
        clean_word_list = alphanumeric.sub('', text).lower().split()
        text_df = spark.createDataFrame([Row(inputTokens=clean_word_list)])
        for n in range(1, 4):
            if n == 1:
                ngrams = clean_word_list
            else:
                ngram_prepared = NGram(n=n, inputCol="inputTokens", outputCol="nGrams")
                ngrams = ngram_prepared.transform(text_df).head().nGrams
            sc.parallelize(ngrams).map(lambda word: (word, 1)).reduceByKey(
                lambda v1, v2: v1 + v2).map(lambda word_tuple: write_to_file(
                    word_tuple, date, case_path, state, region_path, n=n)).collect()
def n_gram_fingerprint(df, columns, n_size):
    """
    Calculate the n-gram for a fingerprinted string
    :param df:
    :param columns:
    :param n_size:
    :return:
    """

    def remote_white_spaces_remove_sort_join(value, args):
        # remove white spaces
        value = [x.replace(" ", "") for x in value]

        # sort and remove duplicates
        value = sorted(set(value))

        # join the tokens back together
        value = "".join(value)
        return value

    columns = parse_columns(df, columns)
    for col_name in columns:
        output_col = col_name + "_NGRAM"
        n_gram_col = col_name + "_NGRAM_FINGERPRINT"

        df = (df
              .withColumn(output_col, F.col(col_name))
              .cols.lower(output_col)
              .cols.remove_white_spaces(output_col)
              .cols.remove_special_chars(output_col)
              .cols.remove_accents(output_col)
              # For creating n-grams we need an Array type column
              .cols.nest(output_col, output_col, 'array')
              .repartition(1)  # Needed for optimization in a single machine
              .cache()
              )

        n_gram = NGram(n=n_size, inputCol=output_col, outputCol=n_gram_col)
        df = n_gram.transform(df)
        df = df.cols.apply(n_gram_col, remote_white_spaces_remove_sort_join, "string")

    return df
def get_Ngram(text):
    # Split the sentences into words
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    wordsData = tokenizer.transform(text)

    # Remove stop words
    # remover = StopWordsRemover() \
    #     .setStopWords(mystopwords) \
    #     .setCaseSensitive(False) \
    #     .setInputCol("words") \
    #     .setOutputCol("filtered")
    # remover.transform(wordsData).show(truncate=15)

    # Build word combinations with N-grams
    ngram = NGram(n=3, inputCol="words", outputCol="ngrams")
    ngramDataFrame = ngram.transform(wordsData)

    result = ngramDataFrame.select("ngrams")
    result.show(truncate=False)

    # flatMap is an RDD method, so go through the DataFrame's rdd attribute
    ngr = result.rdd.flatMap(lambda x: x).collect()
    for i in ngr:
        print(i)

    return result
def functions_for_deal_with_texts(spark, resources_folder):
    send_df = spark.createDataFrame([
        (0, 'Hi I heard about Spark'),
        (1, 'I wish java could use case classes'),
        (2, 'Logistic,regression,models,are,neat'),
    ], ['id', 'sentence'])

    tokenizer = Tokenizer(inputCol='sentence', outputCol='words')
    regularTokenizer = RegexTokenizer(inputCol='sentence', outputCol='words', pattern='\\W')
    count_token = udf(lambda words: len(words), IntegerType())

    tokenize = tokenizer.transform(send_df)
    tokenize.show()
    tokenize.withColumn('tokens', count_token(col('words'))).show()

    rg_tokenize = regularTokenizer.transform(send_df)
    rg_tokenize.show()
    rg_tokenize.withColumn('tokens', count_token(col('words'))).show()

    # remove common (stop) words
    sentenceData = spark.createDataFrame([
        (0, ["I", "saw", "the", "red", "balloon"]),
        (1, ["Mary", "had", "a", "little", "lamb"])
    ], ["id", "raw"])

    remover = StopWordsRemover(inputCol="raw", outputCol="filtered")
    remover.transform(sentenceData).show(truncate=False)

    wordDataFrame = spark.createDataFrame([
        (0, ["Hi", "I", "heard", "about", "Spark"]),
        (1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
        (2, ["Logistic", "regression", "models", "are", "neat"])
    ], ["id", "words"])

    ngram = NGram(n=2, inputCol="words", outputCol="ngrams")
    ngramDataFrame = ngram.transform(wordDataFrame)
    ngramDataFrame.select("ngrams").show(truncate=False)
def LimpiarTextoTweets(df, Busqueda):
    # spark = SparkSession.builder.master('spark://192.168.55.3:7077').appName('LimpiaDatos').getOrCreate()
    spark = SparkSession.builder.appName('LimpiaDatos').getOrCreate()
    sdf = spark.createDataFrame(df)

    stopword_unidecode = [
        unidecode.unidecode(word) for word in stopwords.words('spanish')
    ]
    numeros = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0']
    stopwordList = list(numeros + stopword_unidecode +
                        stopwords.words('spanish') +
                        ['rt', 'https', 'co', 'http', 't', 'q', 'l', 'c'] +
                        Busqueda.lower().split())

    # dataTweet = spark.createDataFrame([(0, unidecode.unidecode(Texto))], ['id', 'sentence'])
    tokenizer = RegexTokenizer(inputCol='text', outputCol='tokens', pattern='\W+')
    tokenized = tokenizer.transform(sdf)
    # tokenized.show(truncate=False)

    remover = StopWordsRemover(inputCol='tokens', outputCol='removed', stopWords=stopwordList)
    removered = remover.transform(tokenized)
    # removered.show(truncate=False)

    ngram = NGram(n=2, inputCol='removed', outputCol='grams')
    ngramd = ngram.transform(removered)

    Tweets_Limpios = ngramd.toPandas()
    spark.stop()
    return Tweets_Limpios
def toNgramDF(df, nbGrams, inputColName, addNbGramsToOutputCol=False,
              removeInputCol=True, logger=None, verbose=True):
    """
    This function converts a dataframe to an ngram DF on the given inputColName
    """
    if addNbGramsToOutputCol:
        columnName = str(nbGrams) + "grams"
    else:
        columnName = "ngrams"
    ngram = NGram(n=nbGrams, inputCol=inputColName, outputCol=columnName)
    ngramDF = ngram.transform(df)
    # We drop the inputCol column:
    if removeInputCol:
        try:
            ngramDF = ngramDF.drop(inputColName)
        except Exception as e:
            logException(e, logger, verbose=verbose)
    return ngramDF
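# Hypothetical usage sketch for toNgramDF: build trigrams from a tokenized column while
# keeping the input column (assumes `spark` is an active SparkSession).
tokens_df = spark.createDataFrame([(["a", "b", "c", "d"],)], ["tokens"])
trigram_df = toNgramDF(tokens_df, nbGrams=3, inputColName="tokens",
                       addNbGramsToOutputCol=True, removeInputCol=False)
# tokens: [a, b, c, d]   3grams: [a b c, b c d]
trigram_df.show(truncate=False)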
import nltk

print "Start preprocessing all data"
t0 = time()

def preProcess(doc):
    clean = doc[0].replace("<br /><br />", " ")
    tok = nltk.tokenize.wordpunct_tokenize(clean)
    low = [word.lower() for word in tok]
    return low, doc[1]

bigram = NGram(inputCol="words", outputCol="bigrams")

dfPre = df.map(preProcess).toDF(['words', 'label']).cache()
dfTrain, dfValid = bigram.transform(dfPre).randomSplit([0.8, 0.2])
dfTrain.cache()
dfValid.cache()

lists = dfTrain.map(lambda r: r.bigrams).collect()
dictBigrams = list(set(itertools.chain(*lists)))
dictionaryBigrams = {}
for i, word in enumerate(dictBigrams):
    dictionaryBigrams[word] = i

dict_broad = sc.broadcast(dictionaryBigrams)
revDict_broad = sc.broadcast(dictBigrams)

tt = time() - t0
print "Data preprocessed in {} second".format(round(tt, 3))
df = sqlContext.createDataFrame(rdd, ['review', 'label'])
dfTrain, dfTest = df.randomSplit([0.8, 0.2])
print "Clean train and test set created"

from pyspark.ml.feature import Tokenizer
tokenizer = Tokenizer(inputCol='review', outputCol='words')
dfTrainTok = tokenizer.transform(dfTrain)
print "Tokens computed"

from pyspark.ml.feature import NGram
bigram = NGram(inputCol="words", outputCol="bigrams")
dfBigram = bigram.transform(dfTrainTok)
print "Bigrams computed"

import itertools
lists = dfBigram.map(lambda r: r.bigrams).collect()
dictBigrams = set(itertools.chain(*lists))
dictionaryBigrams = {}
for i, word in enumerate(dictBigrams):
    dictionaryBigrams[word] = i
print "Dictionary created"

dict_broad = sc.broadcast(dictionaryBigrams)
# COMMAND ----------

from pyspark.ml.feature import StopWordsRemover
englishStopWords = StopWordsRemover.loadDefaultStopWords("english")
stops = StopWordsRemover()\
    .setStopWords(englishStopWords)\
    .setInputCol("DescOut")
stops.transform(tokenized).show()


# COMMAND ----------

from pyspark.ml.feature import NGram
unigram = NGram().setInputCol("DescOut").setN(1)
bigram = NGram().setInputCol("DescOut").setN(2)
unigram.transform(tokenized.select("DescOut")).show(False)
bigram.transform(tokenized.select("DescOut")).show(False)


# COMMAND ----------

from pyspark.ml.feature import CountVectorizer
cv = CountVectorizer()\
    .setInputCol("DescOut")\
    .setOutputCol("countVec")\
    .setVocabSize(500)\
    .setMinTF(1)\
    .setMinDF(2)
fittedCV = cv.fit(tokenized)
fittedCV.transform(tokenized).show(False)
text2 = text.map(rm_junks).collect()
rawLabelTweetDataFrame = spark.createDataFrame(text2, ["label", "tweets"])

regexTokenizer = RegexTokenizer(inputCol="tweets", outputCol="words", pattern="\\W")
tokenized = regexTokenizer.transform(rawLabelTweetDataFrame)

remover = StopWordsRemover(inputCol="words", outputCol="filtered")
filteredDataFrame = remover.transform(tokenized).select("label", "filtered")

uningram = NGram(n=2, inputCol="filtered", outputCol="ngrams")
uningramDataFrame = uningram.transform(filteredDataFrame)
uningramDataFrame.select("label", "ngrams").show(truncate=False)
uningramData = uningramDataFrame.select("label", "ngrams")

hashingTF = HashingTF(inputCol="ngrams", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(uningramData)
# alternatively, CountVectorizer can also be used to get term frequency vectors

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.select("label", "features").show()
print "Start preprocessing all data" t0 = time() def preProcess(doc): clean = doc.review.replace("<br /><br />"," ") tok = nltk.tokenize.wordpunct_tokenize(clean) tags = nltk.pos_tag(tok,tagset='universal') low = [word.lower() for word in tok] return low,zip(*tags)[1],doc.label schema = StructType([StructField('words',ArrayType(StringType()),True), StructField('tags',ArrayType(StringType()),True), StructField('label',DoubleType())]) dfPre=df.map(preProcess).toDF(schema).cache() trigram = NGram(n=3,inputCol="tags", outputCol="tagTrigrams") dfTriAux = trigram.transform(dfPre).cache() trigram.setInputCol("words") trigram.setOutputCol("wordTrigrams") dfTri = trigram.transform(dfTriAux).cache() dfTrain, dfValid = dfTri.randomSplit([0.8,0.2]) lists=dfTrain.map(lambda r : r.words).collect() dictUnigrams=list(set(itertools.chain(*lists))) dictionaryUni={} for i,word in enumerate(dictUnigrams): dictionaryUni[word]=i dict_broad = sc.broadcast(dictionaryUni)
from __future__ import print_function

from pyspark import SparkContext
from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.feature import NGram
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="NGramExample")
    sqlContext = SQLContext(sc)

    # $example on$
    wordDataFrame = sqlContext.createDataFrame([
        (0, ["Hi", "I", "heard", "about", "Spark"]),
        (1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
        (2, ["Logistic", "regression", "models", "are", "neat"]),
    ], ["label", "words"])

    ngram = NGram(inputCol="words", outputCol="ngrams")
    ngramDataFrame = ngram.transform(wordDataFrame)
    for ngrams_label in ngramDataFrame.select("ngrams", "label").take(3):
        print(ngrams_label)
    # $example off$

    sc.stop()
posTagger = tr.NLTKPosTagger(inputCol="words", outputCol="tags")

print "Compute tags"
t0 = time()
dfTags = posTagger.transform(df)
dfTags.show()
tt = time() - t0
print "Tags computed in {} second".format(round(tt, 3))


# In[7]:

from pyspark.ml.feature import NGram
trigram = NGram(n=3, inputCol="tags", outputCol="tagTrigrams")

t0 = time()
dfTriAux = trigram.transform(dfTags)
trigram.setInputCol("words")
trigram.setOutputCol("wordTrigrams")
dfTri = trigram.transform(dfTriAux)
dfTri.show()
tt = time() - t0
print "Trigrams created in {} second".format(round(tt, 3))


# In[8]:

dfTrain, dfTest = dfTri.randomSplit([0.8, 0.2])


# In[9]:
remover = StopWordsRemover(inputCol="raw", outputCol="filtered") remover.transform(sentenceData).show(truncate=False) # ## n-grams from pyspark.ml.feature import NGram wordDataFrame = spark.createDataFrame( [(0, ["Hi", "I", "heard", "about", "Spark"]), (1, ["I", "wish", "Java", "could", "use", "case", "classes"]), (2, ["Logistic", "regression", "models", "are", "neat"])], ["id", "words"]) ngram = NGram(n=2, inputCol="words", outputCol="ngrams") ngramDataFrame = ngram.transform(wordDataFrame) ngramDataFrame.select("ngrams").show(truncate=False) # _______ # # Feature Extractors # _______ # <h2 id="tf-idf">TF-IDF</h2> # # <p><a href="http://en.wikipedia.org/wiki/Tf%E2%80%93idf">Term frequency-inverse document frequency (TF-IDF)</a> # is a feature vectorization method widely used in text mining to reflect the importance of a term # to a document in the corpus. Denote a term by <code>$t$</code>, a document by d , and the corpus by D. # Term frequency <code>$TF(t, d)$</code> is the number of times that term <code>$t$</code> appears in document <code>$d$</code>, while # document frequency <code>$DF(t, D)$</code> is the number of documents that contains term <code>$t$</code>. If we only use # term frequency to measure the importance, it is very easy to over-emphasize terms that appear very # often but carry little information about the document, e.g. “a”, “the”, and “of”. If a term appears
print
print "Dataframe created in {} second".format(round(tt, 3))


# In[314]:

from pyspark.ml.feature import Tokenizer
tokenizer = Tokenizer(inputCol='review', outputCol='words')
dfTok = tokenizer.transform(df)


# In[315]:

from pyspark.ml.feature import NGram
bigram = NGram(inputCol="words", outputCol="bigrams")
dfBigram = bigram.transform(dfTok)


# In[317]:

print "Start tokenizing, computing bigrams and splitting between test and train"
t0 = time()
dfTrain, dfTest = dfBigram.randomSplit([0.8, 0.2])
dfTrain.take(1)
dfTest.take(1)
tt = time() - t0
print "Done in {} second".format(round(tt, 3))


# In[318]:
# wrap the inner UDF in a factory so it can be parameterized by element type
def concat(type):
    def concat_(*args):
        return list(chain(*args))
    return udf(concat_, ArrayType(type))

concat_string_arrays = concat(StringType())
df = df.withColumn(
    'joined_tokens',
    concat_string_arrays(col('filtered_title_tokens'),
                         col('filtered_sterm_tokens'),
                         col('filtered_attr_tokens')))

joined_ngram = NGram(n=2, inputCol="joined_tokens", outputCol="joined_ngrams")
df = joined_ngram.transform(df)

'''
stemmingUdf = udf(stemming, ArrayType(StringType()))
df = df.withColumn('stemmed_tokens', stemmingUdf('joined_tokens'))
'''

joined_hashingTF = HashingTF(inputCol="joined_ngrams", outputCol="joined_rawFeatures",
                             numFeatures=30000)
df = joined_hashingTF.transform(df)

joined_idf = IDF(inputCol="joined_rawFeatures", outputCol="features")
joined_idfModel = joined_idf.fit(df)
df = joined_idfModel.transform(df)
        # tail of the custom transformer's _transform method
        in_col = dataset[self.getInputCol()]
        return dataset.withColumn(out_col, udf(f, t)(in_col))


posTagger = NLTKPosTagger(inputCol="words", outputCol="tagWords")
dfTagged = posTagger.transform(dfTrainTok)
# dfTagged.show()


# ----------------------------------------------------------------------
# ------------------------------Bigrams---------------------------------
# ----------------------------------------------------------------------

from pyspark.ml.feature import NGram
bigram = NGram(inputCol="words", outputCol="bigrams")
dfBigram = bigram.transform(dfTrainTokNoSw)
print "Bigram DataFrame: "
dfBigram.show()


# **********************************************************************
# ------------------------Feature selection-----------------------------
# **********************************************************************

# From here on we can choose between the encoding used by the professor (the word is
# present or not) or the seemingly more informative TF-IDF version. In practice, TF-IDF
# can be misleading, so I still build the unigram and bigram dictionaries so that the
# professor's sparse vectors can be computed.