def generate_nlp_columns(input_dataset, target):
    """Clean the `target` text column and append bigram/trigram columns.

    Punctuation is stripped in place, the text is tokenized, stop words
    are removed, and `<target>_bigrams` / `<target>_trigrams` columns are
    added. Intermediate token columns are dropped before returning.
    """
    # Strip punctuation via the project-level removePunctuation helper.
    strip_punct = udf(lambda s: removePunctuation(s))
    input_dataset = input_dataset.withColumn(target, strip_punct(target))

    # Tokenize the cleaned text.
    input_dataset = Tokenizer(
        inputCol=target, outputCol=target + "_words").transform(input_dataset)

    # Remove stop words from the token list.
    input_dataset = StopWordsRemover(
        inputCol=target + "_words",
        outputCol=target + "_cleanwords").transform(input_dataset)

    # Build bigram and trigram columns from the cleaned tokens.
    for size, suffix in ((2, "_bigrams"), (3, "_trigrams")):
        gram = NGram(n=size, inputCol=target + "_cleanwords",
                     outputCol=target + suffix)
        input_dataset = gram.transform(input_dataset)

    # Drop intermediate columns, leaving only the n-gram outputs.
    input_dataset = input_dataset.drop(target + "_words")
    input_dataset = input_dataset.drop(target + "_cleanwords")
    return input_dataset
def build_pipeline(classifier='rf', max_depth=7):
    """Create the ordered list of pipeline stages for the chosen classifier.

    Stages: RegexTokenizer -> StopWordsRemover -> NGram -> featurizer ->
    classifier; the n-gram size and featurizer depend on `classifier`.

    Args:
        classifier: 'rf' (random forest, 2-grams + hashing TF),
                    'nb' (naive Bayes, 3-grams + hashing TF), or
                    'lr' (logistic regression, 2-grams + word2vec).
        max_depth: maximum tree depth for the random forest.

    Returns:
        list of pyspark.ml stages ready to be wrapped in a Pipeline.

    Raises:
        ValueError: if `classifier` is not 'rf', 'nb', or 'lr'.
    """
    # FIX: raw string for the regex — '\w' and '\s' are invalid string
    # escapes in a plain literal (DeprecationWarning, future SyntaxError).
    tokenizer = RegexTokenizer(inputCol="text", outputCol="words",
                               pattern=r'\w{8}|\s')
    remover = StopWordsRemover(inputCol='words', outputCol='filtered',
                               stopWords=['??'])
    # Only construct the estimators actually used by the selected branch.
    if classifier == 'rf':
        stages = [tokenizer, remover,
                  NGram(n=2, inputCol='filtered', outputCol='ngrams'),
                  HashingTF(inputCol="ngrams", outputCol="features"),
                  RandomForestClassifier(maxDepth=max_depth)]
    elif classifier == 'nb':
        stages = [tokenizer, remover,
                  NGram(n=3, inputCol='filtered', outputCol='ngrams'),
                  HashingTF(inputCol="ngrams", outputCol="features"),
                  NaiveBayes(smoothing=1)]
    elif classifier == 'lr':
        stages = [tokenizer, remover,
                  NGram(n=2, inputCol='filtered', outputCol='ngrams'),
                  Word2Vec(inputCol='ngrams', outputCol='features'),
                  LogisticRegression()]
    else:
        raise ValueError("classifier must be 'rf', 'nb', or 'lr'.")
    return stages
def test(opcodes, hashFiles, sc, sqlc, path, featureFitModel):
    # Build the test feature set from the .asm files named in hashFiles:
    # keep known opcodes, add 2/3/4-gram and segment-name features,
    # vectorize with the already-fitted CountVectorizer model, persist,
    # save, and show the result. Relies on module-level re/saveData.
    asmFiles = hashFiles.map(
        lambda x: "gs://uga-dsp/project2/data/asm/" + x + ".asm")

    def fun(accum, x):
        # Fold file names into one comma-separated string for wholeTextFiles.
        return accum + ',' + x

    asmFileString = asmFiles.reduce(fun)
    rdd1 = sc.wholeTextFiles(asmFileString, 20)
    # Tokenize each document, keep only tokens that are known opcodes,
    # and key each document by its zipWithIndex position.
    opcodesInDoc = rdd1.map(lambda x: x[1].split()).map(
        lambda x: [word for word in x if word in opcodes.value]).zipWithIndex(
        ).map(lambda x: (x[1], x[0]))
    ngramFrame = sqlc.createDataFrame(opcodesInDoc, ["docId", "opcodes"])
    # Add 2-, 3- and 4-gram columns over the opcode sequence.
    twoGram = NGram(n=2, inputCol="opcodes", outputCol="2grams")
    ngramFrame = twoGram.transform(ngramFrame)
    threeGram = NGram(n=3, inputCol="opcodes", outputCol="3grams")
    ngramFrame = threeGram.transform(ngramFrame)
    fourGram = NGram(n=4, inputCol="opcodes", outputCol="4grams")
    ngramFrame = fourGram.transform(ngramFrame)

    def getSegment(x):
        # Collect the segment label (token preceding a ':') from each line.
        templist = []
        for line in x:
            l = re.findall(r'\w+:?(?=:)', line)
            if l:
                templist.append(l[0])
        return templist

    segments = rdd1.zipWithIndex().map(lambda x: (x[1], x[0][1].splitlines(
        ))).map(lambda x: (x[0], getSegment(x[1]))).toDF(["docId", "segments"])
    featureFrame = ngramFrame.join(segments, "docId")
    # Concatenate all token lists into a single docFeatures column.
    featuresDF = featureFrame.rdd.map(
        lambda x: Row(did=x['docId'],
                      docFeatures=x['opcodes'] + x['2grams'] + x['3grams'] + x[
                          '4grams'] + x['segments'])).toDF()
    featuresCV = featureFitModel.transform(featuresDF)
    testData = featuresCV.drop('docFeatures')
    # StorageLevel(useDisk, useMemory, offHeap, deserialized, replication).
    testData.persist(StorageLevel(True, True, False, False, 1))
    saveData(testData, path)
    testData.show()
def build_pipeline():
    """Assemble the tweet feature pipeline: tokens -> 1..5-grams ->
    per-size TF-IDF, plus a scaled tweet-count column, all merged into a
    final 'features' vector (column list comes from module-level
    `input_cols`)."""
    gram_sizes = list(range(1, 6))
    stages = [Tokenizer(inputCol='text', outputCol='words'),
              StopWordsRemover(inputCol="words", outputCol="stopped_words")]
    stages += [NGram(n=i, inputCol='stopped_words',
                     outputCol='{0}_grams'.format(i)) for i in gram_sizes]
    stages += [CountVectorizer(vocabSize=50000,
                               inputCol='{0}_grams'.format(i),
                               outputCol='{0}_tf'.format(i))
               for i in gram_sizes]
    stages += [IDF(inputCol='{0}_tf'.format(i),
                   outputCol='{0}_tfidf'.format(i),
                   minDocFreq=5) for i in gram_sizes]
    # Wrap the scalar tweet count in a vector so it can be standardized.
    stages.append(VectorAssembler(inputCols=["tweet_count"],
                                  outputCol="vec_tweet_count"))
    stages.append(StandardScaler(inputCol="vec_tweet_count",
                                 outputCol="ss_tweet_count"))
    stages.append(VectorAssembler(inputCols=input_cols,
                                  outputCol='features'))
    return Pipeline(stages=stages)
def preprocess(inputCol=["text", "label"], n=4):
    """Build a full text-classification pipeline: tokens -> stop-word
    removal -> 1..n-grams -> per-size TF-IDF -> assembled rawFeatures ->
    label indexing -> chi-squared feature selection -> logistic regression.

    Note: the `inputCol` parameter is not referenced in the body; the
    'text' and 'label' column names are hard-coded.
    """
    orders = list(range(1, n + 1))
    stages = [Tokenizer(inputCol="text", outputCol="words"),
              StopWordsRemover(inputCol="words", outputCol="filtered")]
    stages += [NGram(n=i, inputCol="filtered",
                     outputCol="{0}_grams".format(i)) for i in orders]
    stages += [CountVectorizer(vocabSize=2**14,
                               inputCol="{0}_grams".format(i),
                               outputCol="{0}_tf".format(i)) for i in orders]
    stages += [IDF(inputCol="{0}_tf".format(i),
                   outputCol="{0}_tfidf".format(i),
                   minDocFreq=2) for i in orders]
    stages.append(VectorAssembler(
        inputCols=["{0}_tfidf".format(i) for i in orders],
        outputCol="rawFeatures"))
    stages.append(StringIndexer(inputCol="label", outputCol="labels"))
    stages.append(ChiSqSelector(numTopFeatures=2**14,
                                featuresCol='rawFeatures',
                                outputCol="features"))
    stages.append(LogisticRegression(maxIter=1000))
    return Pipeline(stages=stages)
def extract_collocations(records, num_collocations, collocation_window): """Extracts the most common collocations present in the records. Params: - records (pyspark.rdd.RDD): The tokenized and lemmatized records from the JSON file - num_collocations (int): The number of collocations to show - collocation_window (int): The text window within which to search for collocations. Returns: - best_collocations (list<tuple<str, int>>): The highest scored collocations present in the records, with their frequency of occurrence in the dataset. """ # @see: https://spark.apache.org/docs/2.2.0/ml-features.html#n-gram from pyspark.ml.feature import NGram data_frame = records.map(lambda record: Row(record[constants.VALUE])).toDF( ['words']) ngram_model = NGram(n=2, inputCol='words', outputCol='ngrams') ngram_data_frame = ngram_model.transform(data_frame) ngram_rdd = ngram_data_frame.select('ngrams').rdd ngram_rdd = ngram_rdd.flatMap(lambda row: row['ngrams'])\ .map(lambda ngram: (ngram.encode('utf-8'), 1))\ .reduceByKey(add)\ .sortBy(lambda bigram_with_count: bigram_with_count[1], ascending=False) rdd_show(ngram_rdd) frequent_collocations = ngram_rdd.take(num_collocations) return frequent_collocations
def extract_featrues(self, train_rdd=None, test_rdd=None):
    """
    train_rdd: type rdd, the raw rdd of train data (text content, label)
    test_rdd: type rdd, the raw rdd of test data (text content, doc_id)
    return: a (train_df, test_df) pair of data frames whose 'features'
    column holds TF-IDF vectors over word bigrams; the IDF model is fit
    on the training set only.
    """
    print('****************************')
    print('Feature Extraction: TF-IDF\n')
    # Convert raw text to token lists via the instance's convert() helper.
    train_raw_df = train_rdd.map(
        lambda row: (self.convert(row[0]), row[1])).toDF(['words', 'label'])
    test_raw_df = test_rdd.map(
        lambda row: (self.convert(row[0]), row[1])).toDF(['words', 'doc_id'])
    # Replace the token column with word bigrams.
    bigrammer = NGram(n=2, inputCol="words", outputCol="ngrams")
    train_ngrams = bigrammer.transform(train_raw_df).drop('words')
    test_ngrams = bigrammer.transform(test_raw_df).drop('words')
    # Hash bigrams into sparse term-frequency vectors.
    tf = HashingTF(inputCol='ngrams', outputCol='raw_features')
    train_tf = tf.transform(train_ngrams).drop('ngrams')
    test_tf = tf.transform(test_ngrams).drop('ngrams')
    # Fit IDF on training data, then rescale both sets.
    idf_model = IDF(inputCol='raw_features',
                    outputCol='features').fit(train_tf)
    train_df = idf_model.transform(train_tf).drop('raw_features')
    test_df = idf_model.transform(test_tf).drop('raw_features')
    return (train_df, test_df)
def build_pipeline():
    """Tweet sentiment pipeline: tokens -> 1..3-grams -> per-size TF-IDF
    -> assembled features -> indexed 'sentiment' label -> logistic
    regression."""
    sizes = list(range(1, 4))
    stages = [Tokenizer(inputCol='tweet', outputCol='words')]
    stages += [NGram(n=i, inputCol='words', outputCol='{0}_grams'.format(i))
               for i in sizes]
    stages += [CountVectorizer(vocabSize=5460,
                               inputCol='{0}_grams'.format(i),
                               outputCol='{0}_tf'.format(i)) for i in sizes]
    stages += [IDF(inputCol='{0}_tf'.format(i),
                   outputCol='{0}_tfidf'.format(i),
                   minDocFreq=5) for i in sizes]
    stages.append(VectorAssembler(
        inputCols=['{0}_tfidf'.format(i) for i in sizes],
        outputCol='features'))
    stages.append(StringIndexer(inputCol='sentiment', outputCol='label'))
    stages.append(LogisticRegression(maxIter=100))
    return Pipeline(stages=stages)
def ngrram(dataframe, column, x):
    """Tokenize `column` and add an n-gram column of size `x`.

    Shows the first 4 rows for inspection and returns the DataFrame with
    the 'tokens' and 'ngrams' columns.

    FIX: the original assigned and returned the result of `.show(4)`,
    which is always None — the select and the show are now separate so
    the DataFrame itself is returned.
    """
    tokens = Tokenizer(inputCol=column, outputCol='tokens')
    nn = NGram(n=x, inputCol='tokens', outputCol='ngrams')
    b = tokens.transform(dataframe)
    a = nn.transform(b)
    final = a.select(['tokens', 'ngrams'])
    final.show(4)
    return final
def create_tfidf_model(sentenceDataFrame, ngrams=1, minDocFreq=0):
    """Tokenize 'text', build n-grams, compute TF-IDF, and normalize.

    Args:
        sentenceDataFrame: DataFrame with a 'text' column.
        ngrams: n-gram size for the NGram stage.
        minDocFreq: minimum document frequency for the IDF stage.

    Returns:
        DataFrame with a normalized 'scores' vector column added.
    """
    # FIX: original called `.transoform(...)` (typo) — AttributeError at
    # runtime; corrected to `.transform(...)`.
    tokenized = Tokenizer(inputCol="text",
                          outputCol="words").transform(sentenceDataFrame)
    ngramDataFrame = NGram(n=ngrams, inputCol="words",
                           outputCol="ngrams").transform(tokenized)
    countVect = CountVectorizer(inputCol="ngrams", outputCol="rawFeatures")
    countVectModel = countVect.fit(ngramDataFrame)
    featurizedData = countVectModel.transform(ngramDataFrame)
    idf = IDF(minDocFreq=minDocFreq, inputCol="rawFeatures",
              outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    # (Removed a no-op `rescaledData.select("label", "features")` whose
    # result was discarded.)
    normalizer = Normalizer(inputCol="features", outputCol='scores')
    X = normalizer.transform(rescaledData)
    return X
def LR_Model(train_dataframe, test_dataframe):
    '''
    Takes a train_dataframe and test_dataframe, builds a pipeline of
    RegexTokenizer, NGram(n=3), HashingTF, IDF and LogisticRegression,
    and predicts the label from the features of test_dataframe.

    The tokenizer pattern removes all non-word characters (extra spaces,
    punctuation, '??') and the over-represented tokens '00' and 'CC',
    which significantly improves accuracy.

    FIX: the pattern is now a raw string. The original literal
    "\\W|\b(00|CC)\b" embedded actual backspace characters (\b in a
    non-raw string), so the word-boundary anchors documented here never
    took effect.

    Args:
        train_dataframe: must contain 'label' and 'text' columns.
        test_dataframe: must contain a 'text' column.

    Returns:
        DataFrame['prediction': double, given_order: bigint, label: string]
        iff data read initially is a small dataset else
        DataFrame['prediction': double, given_order: bigint]
    '''
    train_dataframe = train_dataframe.repartition(96)\
        .withColumn('label', train_dataframe['label'].cast(IntegerType()))
    regexTokenizer = RegexTokenizer(inputCol="text", outputCol="words",
                                    pattern=r"\W|\b(00|CC)\b")
    ngram = NGram(n=3, inputCol="words", outputCol="ngrams")
    hashingTF = HashingTF(inputCol="ngrams", outputCol="TF")
    idf = IDF(inputCol="TF", outputCol="features")
    lr = LogisticRegression(maxIter=20, regParam=0.001)
    pipeline = Pipeline(stages=[regexTokenizer, ngram, hashingTF, idf, lr])
    model = pipeline.fit(train_dataframe)
    predictions_df = model.transform(test_dataframe)
    # FIX: the intermediate column is named 'ngrams', not 'n_grams'.
    return predictions_df\
        .drop('rawfeatures', 'ngrams', 'TF', 'text', 'words', 'features')
def process_df(df):
    # Compute pairwise file similarity over the 'instruments' column with
    # MinHash LSH and write matching pairs to PostgreSQL. Relies on
    # module-level `time_seq`, `f` (pyspark.sql.functions alias), `time`,
    # and write_df_to_pgsql().
    time_seq.append(['start process-df', time.time()])
    model = Pipeline(stages=[
        # Split the space-separated instrument string into tokens.
        RegexTokenizer(pattern=" ", inputCol="instruments",
                       outputCol="instruments_tokenized", minTokenLength=1),
        # n=1: unigrams only, effectively a pass-through of the tokens.
        NGram(n=1, inputCol="instruments_tokenized",
              outputCol="instruments_ngrams"),
        HashingTF(inputCol="instruments_ngrams",
                  outputCol="instruments_vectors"),
        MinHashLSH(inputCol="instruments_vectors",
                   outputCol="instruments_lsh", numHashTables=10)
    ]).fit(df)
    df_hashed = model.transform(df)
    # Self-join within Jaccard distance 0.5; the filename ordering filter
    # keeps each unordered pair exactly once and drops self-matches.
    df_matches = model.stages[-1].approxSimilarityJoin(df_hashed, df_hashed, 0.5, distCol="distance") \
        .filter("datasetA.filename != datasetB.filename AND datasetA.filename < datasetB.filename") \
        .select(f.col('datasetA.filename').alias('filename_A'),
                f.col('datasetB.filename').alias('filename_B'),
                f.col('distance'))
    time_seq.append(['process-df df_matches', time.time()])
    write_df_to_pgsql(df_matches, 'filepair_similarity_run3')
    time_seq.append(['write pgsql', time.time()])
    print('time_seq', time_seq)
def shringles(x, fileName):
    """Tokenize the file's 'value' column on non-word characters and
    append its x-shingles (x-grams) to the module-level shringleList."""
    splitter = RegexTokenizer(inputCol="value", outputCol="words",
                              pattern="\\W")
    grams = NGram(n=x, inputCol="words", outputCol="kshringles")
    tokenized = splitter.transform(read(fileName))
    shringleList.append(grams.transform(tokenized))
def train(allHex, labels, hashFiles, sc, sqlc, path):
    """Build training features from the .bytes files named in hashFiles,
    persist and save them joined with their labels, and return the fitted
    CountVectorizer model for reuse on the test set.

    Fixes vs original: `returm` (SyntaxError); Row built with x['docId']
    although the column is named 'did' (KeyError); the label join was
    written as `ngramFrame.featuresCV(labelFrame, "did")`, which is not a
    method call — it is now a join on featuresCV; CountVectorizer is fit
    on the frame that actually contains the 'docFeatures' column.
    """
    bytesFiles = hashFiles.map(
        lambda x: "gs://uga-dsp/project2/data/bytes/" + x + ".bytes")

    def fun(accum, x):
        # Fold file names into one comma-separated string for wholeTextFiles.
        return accum + ',' + x

    bytesFileString = bytesFiles.reduce(fun)
    rdd1 = sc.wholeTextFiles(bytesFileString, 20)
    # Keep only known hex tokens; key each document by its index.
    bytesRdd = rdd1.map(lambda x: x[1].split()).map(
        lambda x: [word for word in x if word in allHex.value]
    ).zipWithIndex().map(lambda x: (x[1], x[0]))
    ngramFrame = sqlc.createDataFrame(bytesRdd, ["did", "1grams"])
    twoGram = NGram(n=2, inputCol="1grams", outputCol="2grams")
    ngramFrame = twoGram.transform(ngramFrame)
    # Concatenate unigrams and bigrams into one feature list per document.
    featuresDF = ngramFrame.rdd.map(
        lambda x: Row(did=x['did'], docFeatures=x['1grams'] + x['2grams'])
    ).toDF()
    cv = CountVectorizer(inputCol="docFeatures", outputCol="features",
                         vocabSize=1000)
    featureFitModel = cv.fit(featuresDF)
    featuresCV = featureFitModel.transform(featuresDF)
    labelRdd = labels.zipWithIndex().map(lambda x: (x[1], x[0]))
    labelFrame = labelRdd.toDF(["did", "label"])
    trainData = featuresCV.join(labelFrame, "did")
    # StorageLevel(useDisk, useMemory, offHeap, deserialized, replication).
    trainData.persist(StorageLevel(True, True, False, False, 1))
    saveData(trainData, path)
    trainData.show()
    return featureFitModel
def test(allHex, hashFiles, sc, sqlc, path, featureFitModel):
    # Builds test-set byte features mirroring train(); several issues are
    # flagged below with NOTE(review) rather than changed, since the
    # intended pipeline (ngrams vs sparse byte-histogram vectors) is
    # ambiguous from this code alone.
    bytesFiles = hashFiles.map(lambda x: "gs://uga-dsp/project2/data/bytes/"+ x+".bytes")

    def fun(accum, x):
        # Fold file names into one comma-separated string for wholeTextFiles.
        return accum+','+x

    bytesFileString = bytesFiles.reduce(fun)
    rdd1 = sc.wholeTextFiles(bytesFileString, 20)
    # Keep known hex tokens converted to decimal strings; index documents.
    bytesRdd = rdd1.map(lambda x: x[1].split()).map(lambda x: [str(int(word,16)) for word in x if word in allHex.value]).zipWithIndex().map(lambda x: (x[1],x[0]))
    # Build a 256-bin count vector per document, then a SparseVector.
    Vec = bytesRdd.map(lambda x: (x[0],createVector(x[1])))
    sparseVec = Vec.map(lambda x: (x[0],SparseVector(256,numpy.nonzero(x[1])[0],x[1][x[1]>0])))
    # NOTE(review): '1grams' here holds SparseVectors, but NGram expects an
    # array-of-strings column — this transform should fail at runtime.
    ngramFrame = sqlc.createDataFrame(sparseVec, ["did","1grams"])
    twoGram = NGram(n=2, inputCol="1grams", outputCol="2grams")
    ngramFrame = twoGram.transform(ngramFrame)
    # NOTE(review): the column is named 'did', not 'docId' — x['docId']
    # will raise; also featuresDF is built but never used afterwards.
    featuresDF = ngramFrame.rdd.map(lambda x: Row(did=x['docId'],docFeatures=x['1grams']+x['2grams'])).toDF()
    # NOTE(review): featureFitModel was fit on a 'docFeatures' column;
    # ngramFrame has no such column — featuresDF was probably intended.
    featuresCV = featureFitModel.transform(ngramFrame)
    testData = featuresCV.drop('docFeatures')
    # StorageLevel(useDisk, useMemory, offHeap, deserialized, replication).
    testData.persist(StorageLevel(True, True, False, False, 1))
    # NOTE(review): saves ngramFrame; testData was probably intended.
    saveData(ngramFrame, path)
    testData.show()
def feature_engineering(class_balancedDf):
    """Derive model-ready features: bigrams over 'lemmatized' -> hashed
    TF -> IDF -> KMeans cluster assignment -> LDA topics; finally cast
    'label' to integer and drop the KMeans 'prediction' column."""
    # Bigrams over the lemmatized tokens.
    bigrams = NGram(n=2, inputCol="lemmatized",
                    outputCol="ngrams").transform(class_balancedDf)
    # Hash bigrams into a small (20-bucket) term-frequency vector.
    featurized = HashingTF(inputCol="ngrams", outputCol="rawFeatures",
                           numFeatures=20).transform(bigrams)
    # Rescale term frequencies by inverse document frequency.
    idf_model = IDF(inputCol="rawFeatures", outputCol="features").fit(featurized)
    rescaled = idf_model.transform(featurized)
    # Append a KMeans cluster assignment (k=6, fixed seed).
    clustered = KMeans().setK(6).setSeed(1).fit(rescaled).transform(rescaled)
    # Append LDA topic distributions (10 topics).
    with_topics = LDA(k=10, maxIter=10).fit(clustered).transform(clustered)
    # Cast label to int and drop the KMeans prediction column.
    return with_topics.withColumn(
        "label", with_topics.label.cast("Integer")).drop("prediction")
def build_ngrams(n=3):
    """Build a pipeline: tokens -> 1..n-grams -> per-size CountVectorizer
    + IDF -> assembled 'features' -> indexed 'class' label -> logistic
    regression."""
    tokenizer = [Tokenizer(inputCol="text", outputCol="tokens")]
    # NOTE(review): this remover is constructed but never added to the
    # pipeline stages, and the NGram stages read 'tokens', not
    # 'tokens_filtered' — stop-word removal is dead code here. Confirm
    # whether it should be wired into the pipeline.
    stopwordsRemover = [
        StopWordsRemover(inputCol='tokens', outputCol='tokens_filtered')
    ]
    ngrams = [
        NGram(n=i, inputCol="tokens", outputCol="{0}_grams".format(i))
        for i in range(1, n + 1)
    ]
    cv = [
        CountVectorizer(vocabSize=5460,
                        inputCol="{0}_grams".format(i),
                        outputCol="{0}_cv".format(i))
        for i in range(1, n + 1)
    ]
    idf = [
        IDF(inputCol="{0}_cv".format(i),
            outputCol="{0}_idf".format(i),
            minDocFreq=5) for i in range(1, n + 1)
    ]
    assembler = [
        VectorAssembler(
            inputCols=["{0}_idf".format(i) for i in range(1, n + 1)],
            outputCol="features")
    ]
    stringIndexer = [StringIndexer(inputCol="class", outputCol="label")]
    lr = [LogisticRegression(maxIter=100)]
    return Pipeline(stages=tokenizer + ngrams + cv + idf + assembler +
                    stringIndexer + lr)
def Ngram_feature(N, feature_rdd):
    '''
    Extract and count N-grams. Leaves only the top 1000 n-gram features
    when N >= 2 (for unigrams, all counts are returned).

    Input:
            feature_rdd : [(<hash1>,<feature1>), (<hash1>,<feature2>), ..., (<hashN>,<featureK>)]
    Output:
            freq_ngram_count_rdd : [((<hash>,<ngram feature>),cnt), ...]

    PERF FIX: removed an unused `length = topN_ngram_count_rdd.count()`,
    which forced a full, wasted evaluation of the ranked RDD.
    '''
    # Group features per file, then feed a DataFrame to the NGram stage.
    feature_rdd = feature_rdd.groupByKey().map(lambda x: (x[0], list(x[1])))
    df = spark.createDataFrame(feature_rdd).toDF("file_names", "features")
    ngram = NGram(n=N, inputCol="features", outputCol="ngrams")
    ngramDataFrame = ngram.transform(df)
    # Per-file counts: [((<hash>, <ngram>), cnt), ...]
    ngram_rdd = ngramDataFrame.rdd.map(tuple).map(
        lambda x: (x[0], x[2])).flatMapValues(lambda x: x)
    ngram_count_rdd = ngram_rdd.map(lambda x: (x, 1)).reduceByKey(add)
    freq_ngram_count_rdd = ngram_count_rdd
    if not N == 1:
        # Global counts per ngram: [(<ngram feature>, cntSum), ...]
        topN_ngram_count_rdd = freq_ngram_count_rdd.map(
            lambda x: (x[0][1], x[1])).reduceByKey(add)
        # Rank by total count: [((<ngram feature>, cntSum), index), ...]
        topN_ngram_count_rdd = topN_ngram_count_rdd.sortBy(
            lambda x: x[1], ascending=False).zipWithIndex()
        # Keep only the 1000 most frequent ngrams.
        topN_ngram_count_rdd = topN_ngram_count_rdd.filter(
            lambda x: x[1] < 1000).map(lambda x: x[0])
        # Re-key per-file counts by ngram: [(<ngram>, (<hash>, cnt)), ...]
        freq_ngram_count_rdd = freq_ngram_count_rdd.map(
            lambda x: (x[0][1], (x[0][0], x[1])))
        # Join with the top-1000 set and restore ((hash, ngram), cnt).
        freq_ngram_count_rdd = topN_ngram_count_rdd.join(
            freq_ngram_count_rdd).map(
                lambda x: ((x[1][1][0], x[0]), x[1][1][1]))
    return freq_ngram_count_rdd
def ngramFeatureExtractors(n, inputCol=["text", "target"]):
    """Pipeline: tokens -> 1..n-grams -> per-size TF-IDF -> assembled
    'features' -> indexed 'target' label -> logistic regression.

    Note: the `inputCol` parameter is not referenced in the body; the
    'text' and 'target' column names are hard-coded.
    """
    orders = list(range(1, n + 1))
    stages = [Tokenizer(inputCol="text", outputCol="words")]
    stages += [NGram(n=i, inputCol="words",
                     outputCol="{0}_grams".format(i)) for i in orders]
    stages += [CountVectorizer(vocabSize=5460,
                               inputCol="{0}_grams".format(i),
                               outputCol="{0}_tf".format(i)) for i in orders]
    stages += [IDF(inputCol="{0}_tf".format(i),
                   outputCol="{0}_tfidf".format(i),
                   minDocFreq=5) for i in orders]
    stages.append(VectorAssembler(
        inputCols=["{0}_tfidf".format(i) for i in orders],
        outputCol="features"))
    stages.append(StringIndexer(inputCol="target", outputCol="label"))
    stages.append(LogisticRegression(maxIter=100))
    return Pipeline(stages=stages)
def create_ngram(self, df, n, input_col, output_col='ngrams'):
    "Generate N-Gram -> https://spark.apache.org/docs/2.2.0/ml-features.html#n-gram"
    from pyspark.ml.feature import NGram
    # Transform the token column of `df` into an n-gram column.
    transformer = NGram(n=n, inputCol=input_col, outputCol=output_col)
    return transformer.transform(df)
def build_ngrams_part(inputCol="words", n=6):
    # Builds NGram + CountVectorizer stages for gram sizes 7..n.
    # NOTE(review): with the default n=6, range(7, n + 1) is empty, so the
    # returned Pipeline has no stages. Either the range should start lower
    # or callers are expected to pass n >= 7 — confirm intent.
    # NOTE(review): the inputCol parameter is ignored; "words" is hard-coded.
    ngrams = [
        NGram(n=i, inputCol="words", outputCol="ngrams_{0}".format(i))
        for i in range(7, n + 1)
    ]
    vectorizers = [
        CountVectorizer(inputCol="ngrams_{0}".format(i),
                        outputCol="ngramscounts_{0}".format(i))
        for i in range(7, n + 1)
    ]
    return Pipeline(stages=ngrams + vectorizers)
def test_ngram(self):
    """NGram: verify the param getters and the 4-gram transform output."""
    frame = self.spark.createDataFrame([Row(input=["a", "b", "c", "d", "e"])])
    gram = NGram(n=4, inputCol="input", outputCol="output")
    # Getter round-trip for every constructor parameter.
    for got, want in ((gram.getN(), 4),
                      (gram.getInputCol(), "input"),
                      (gram.getOutputCol(), "output")):
        self.assertEqual(got, want)
    # Five tokens yield exactly two 4-grams.
    transformedDF = gram.transform(frame)
    self.assertEqual(transformedDF.head().output, ["a b c d", "b c d e"])
def main(train_x, train_y, test_x, test_y=None, idf=False, ngram=1, base='gs', asm=False):
    """Train Naive Bayes on n-gram count features and evaluate or predict.

    When test labels are supplied, prints the mean accuracy; otherwise
    prints one predicted class per test document, ordered by id.
    """
    # Load : DF[id, url, features, label?]
    # The DataFrames only have a labels column if labels are given.
    # We drop the text, since Naive Bayes doesn't use it and we already have all the tokens
    kind = 'asm' if asm else 'bytes'
    train = elizabeth.load(train_x, train_y, base=base, kind=kind).drop('text')
    test = elizabeth.load(test_x, test_y, base=base, kind=kind).drop('text')

    # convert the string labels to numeric indices
    # the handleInvalid param allows the label indexer to deal with labels that weren't seen during fitting
    label_indexer = StringIndexer(inputCol='label', outputCol='indexedLabel', handleInvalid="skip")
    label_indexer = label_indexer.fit(train)
    train = label_indexer.transform(train)

    # the test set won't always have labels
    if test_y is not None:
        test = label_indexer.transform(test)

    # Maps numeric predictions back to the original class strings.
    index_labeller = IndexToString(inputCol='prediction', outputCol='predictedClass', labels=label_indexer.labels)

    # Train the preprocessor and transform the data.
    prep = elizabeth.Preprocessor()
    prep.add(NGram(n=int(ngram)))
    prep.add(CountVectorizer())
    if idf:
        prep.add(IDF())
    train = prep.fit(train)
    test = prep.transform(test)

    # Naive Bayes : DF[id, url, text, features, label?, rawPrediction, probability, prediction]
    nb = NaiveBayes(labelCol='indexedLabel').fit(train)
    test = nb.transform(test)
    test = index_labeller.transform(
        test)  # DF[id, url, ... prediction, predictedClass]

    # If labels are given for the test set, print a score.
    if test_y:
        test = test.orderBy(test.id)
        test = test.withColumn(
            'correct', (test.label == test.predictedClass).cast('double'))
        test = test.select(avg(test.correct))
        print(test.show())

    # If no labels are given for the test set, print predictions.
    else:
        test = test.orderBy(test.id).select(test.predictedClass)
        test = test.rdd.map(lambda row: int(row.predictedClass))
        test = test.toLocalIterator()
        print(*test, sep='\n')
def main():
    """Fetch Moby Dick, split it into sentences, and write bi/tri/quad-gram
    JSON output plus a bigram-frequency CSV using Spark."""
    # basic cleaning and getting of files
    get_moby()
    sentences = get_sentences()

    # create spark app, for use in iPython notebook OR as a standalone.
    spark = SparkSession\
        .builder\
        .appName("NGramSample")\
        .getOrCreate()

    # build a distributed dataframe
    sentence_df = spark.createDataFrame(sentences, ['id', 'sentences'])

    # create a tokenizer and write a 'words' column to DF
    tokenizer = Tokenizer(inputCol='sentences', outputCol='words')
    words = tokenizer.transform(sentence_df)

    # create ngram generators for bi, tri, and quad grams
    bigram = NGram(n=2, inputCol='words', outputCol='bigrams')
    trigram = NGram(n=3, inputCol='words', outputCol='trigrams')
    quadgram = NGram(n=4, inputCol='words', outputCol='quadgrams')

    # add each one in turn to the df
    bigrams = bigram.transform(words)
    trigrams = trigram.transform(bigrams)
    final = quadgram.transform(trigrams)

    # write as traversable JSON (clearing any previous output directory)
    if os.path.exists('ngrams'):
        shutil.rmtree('ngrams')
    final.coalesce(1).write.json('ngrams')

    # as an example, write out bigrams to CSV
    if os.path.exists('bigrams'):
        shutil.rmtree('bigrams')

    # This tricky bit selects bigrams, explodes it, and regroups by unique
    # bigram, then adds a count, after filtering out extremely uncommon bigrams
    # It finally writes to a CSV
    final.select('bigrams')\
        .withColumn('bigrams', explode('bigrams'))\
        .groupBy('bigrams').count().orderBy('count', ascending=False)\
        .filter('count > 10')\
        .coalesce(1).write.csv('bigrams')
def calculate_vectors(data, n=2, binary=False):
    """Turn the 'sequence' column into n-gram count vectors.

    Returns a DataFrame with a single 'vec' column; when `binary` is
    True the counts are presence/absence indicators.
    """
    gram_frame = NGram(n=n, inputCol="sequence",
                       outputCol="ngrams").transform(data)
    ngram_col = gram_frame.select("ngrams")
    vectorizer = CountVectorizer(inputCol="ngrams", outputCol="vec",
                                 binary=binary)
    fitted = vectorizer.fit(ngram_col)
    return fitted.transform(ngram_col).select("vec")
def initialize():
    """Fetch tweets matching 'barajas', train a per-user logistic
    regression over bigram TF-IDF features, save the fitted pipeline, and
    print predictions for a held-out 25% split."""
    spark = SparkSession \
        .builder \
        .appName("search-flight-spark-ml-model") \
        .getOrCreate()
    sc = spark.sparkContext
    # Twitter API client from module-level credentials.
    auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
    auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)
    api = tweepy.API(auth)
    important_fields = ['id', 'text', 'user']
    schema = StructType([
        StructField('id', LongType(), False),
        StructField('text', StringType(), False),
        StructField('username', StringType(), False)
    ])
    tweetsDf = spark.createDataFrame(sc.emptyRDD(), schema)
    # NOTE(review): unioning one single-row DataFrame per tweet builds a
    # long lineage and is quadratic; collecting rows into a list and
    # creating one DataFrame at the end would be cheaper.
    for tweet in tweepy.Cursor(api.search, q='barajas', rpp=100,
                               lang='en').items(MAX_TWEETS):
        json_tweet = {k: tweet._json[k] for k in important_fields}
        # Strip quotes and newlines from the tweet text.
        json_tweet['text'] = json_tweet['text'].replace("'", "").replace(
            "\"", "").replace("\n", "")
        tweetDf = spark.createDataFrame([
            (json_tweet['id'], json_tweet['text'], json_tweet['user']['name'])
        ], schema)
        tweetsDf = tweetsDf.union(tweetDf)
    # 75/25 train/test split (second argument is the random seed).
    tweets_df_splitted = tweetsDf.randomSplit([0.75, 0.25], MAX_TWEETS)
    training_set = tweets_df_splitted[0]
    test_set = tweets_df_splitted[1]
    # Pipeline: username label index -> tokens -> bigrams -> TF -> IDF -> LR.
    username_indexed = StringIndexer(inputCol="username",
                                     outputCol="username_indexed")
    tokenizer = Tokenizer(inputCol="text", outputCol="token_raw")
    ngram = NGram(inputCol="token_raw", outputCol="ngram", n=2)
    hashing_tf = HashingTF(inputCol="ngram", outputCol="tf", numFeatures=20)
    idf = IDF(inputCol="tf", outputCol="idf", minDocFreq=2)
    lr = LogisticRegression(featuresCol="idf", labelCol="username_indexed")
    pipeline = Pipeline(
        stages=[username_indexed, tokenizer, ngram, hashing_tf, idf, lr])
    pipeline_model = pipeline.fit(training_set)
    pipeline_model.write().overwrite().save("tweet_traveling_partners_model")
    tweet_traveling_partners_prediction = pipeline_model.transform(test_set)
    selected = tweet_traveling_partners_prediction.select(
        "username", "text", "probability", "prediction")
    for row in selected.collect():
        print(row)
    spark.stop()
def bytes_ngram(df_bytes, n):
    """
    Generates n-grams bytes by bytes data frame.
    Returns n-grams bytes in RDD((hash, n-gram), total_counts)
    """
    transformed = NGram(n=n, inputCol="bytes",
                        outputCol="ngrams").transform(df_bytes)
    # One (hash, ngram) pair per gram, then count occurrences globally.
    pairs = transformed.select("hash", "ngrams").rdd.map(tuple)\
        .flatMapValues(lambda grams: grams)
    return pairs.map(lambda kv: ((kv[0], kv[1]), 1)).reduceByKey(add)
def opcode_ngram(df_opcode, N):
    """
    Generates n-grams opcode by opcode data frame.
    Returns n-grams opcode in RDD((filename, n-gram), total_counts)
    """
    transformed = NGram(n=N, inputCol="opcode",
                        outputCol="ngrams").transform(df_opcode)
    # One (filename, ngram) pair per gram, then count occurrences globally.
    pairs = transformed.select("filename", "ngrams").rdd.map(tuple)\
        .flatMapValues(lambda grams: grams)
    return pairs.map(lambda kv: ((kv[0], kv[1]), 1)).reduceByKey(add)
def main():
    """Read (id, text) rows from the CSV given in argv[1], MinHash-LSH
    them over character trigrams, and show pairs within Jaccard distance
    0.70 along with the elapsed time.

    FIX: the final timing output used Python 2 `print` statement syntax,
    which is a SyntaxError under Python 3 (the rest of this file uses the
    print function); it now calls print().
    """
    input_dataset = sys.argv[1]
    output_dir = sys.argv[2]  # currently unused; kept for CLI compatibility
    start_time = time.time()
    stackoverflow_df = sqlContext.read.csv(input_dataset,
                                           header=True).toDF('id', 'text')
    # pattern="" splits the text into individual characters, so the NGram
    # stage below produces character trigrams.
    model = Pipeline(stages=[
        RegexTokenizer(
            pattern="", inputCol="text", outputCol="tokens",
            minTokenLength=1),
        NGram(n=3, inputCol="tokens", outputCol="ngrams"),
        HashingTF(inputCol="ngrams", outputCol="vectors"),
        MinHashLSH(inputCol="vectors", outputCol="lsh")
    ]).fit(stackoverflow_df)
    db_hashed = model.transform(stackoverflow_df)
    # Self-join within distance 0.70; drop identical pairs (distance 0).
    res = model.stages[-1].approxSimilarityJoin(
        db_hashed, db_hashed, 0.70).filter("distCol > 0")
    res.show()
    elapsed_time = time.time() - start_time
    print('Elapsed Time ==> ', elapsed_time)
def make_ngrams(df, n=1):
    """Normalize the 'text' column, tokenize it, explode its n-grams into
    one row per gram, and keep only grams longer than 2 characters."""
    normalized = df.withColumn('normalized_text', processing(F.col('text')))
    tokenized = Tokenizer(inputCol="normalized_text",
                          outputCol="tokens").transform(normalized)
    tokenized = tokenized.drop('normalized_text')
    grams = NGram(n=n, inputCol="tokens",
                  outputCol="n_gram").transform(tokenized)
    exploded = grams.withColumn('n_gram', F.explode('n_gram'))
    return exploded.filter(F.length('n_gram') > 2)