Example #1
def generate_nlp_columns(input_dataset,target):
    udf_remove_punc = udf(lambda s: removePunctuation(s))
    # Remove Punctuation
    input_dataset = input_dataset.withColumn(target,udf_remove_punc(target))
    # Tokenize Title
    tokenizer = Tokenizer(inputCol=target, outputCol=target+"_words")
    input_dataset = tokenizer.transform(input_dataset)
    # Remove Stop Words
    remover = StopWordsRemover(inputCol=target+"_words", outputCol=target+"_cleanwords")
    input_dataset = remover.transform(input_dataset)
    # Generate N-Grams 
    ngram = NGram(n=2, inputCol=target+"_cleanwords", outputCol=target+"_bigrams")
    input_dataset = ngram.transform(input_dataset)
    trigram = NGram(n=3, inputCol=target+"_cleanwords", outputCol=target+"_trigrams")
    input_dataset = trigram.transform(input_dataset)
    # Drop Extra Columns - Leave ngrams only.
    input_dataset = input_dataset.drop(target+"_words")
    input_dataset = input_dataset.drop(target+"_cleanwords")
    # Perform TFIDF
    #hashingTF = HashingTF(inputCol=target+"_trigrams", outputCol=target+"_hashing", numFeatures=20)
    #input_dataset = hashingTF.transform(input_dataset)
    #idf = IDF(inputCol=target+"_hashing", outputCol=target+"_features")
    #idfModel = idf.fit(input_dataset)
    #input_dataset = idfModel.transform(input_dataset) 
    return input_dataset
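
A minimal usage sketch (not part of the original example), assuming a local SparkSession and a simple stand-in for the removePunctuation helper that generate_nlp_columns expects to find in scope:

import string
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.ml.feature import Tokenizer, StopWordsRemover, NGram

spark = SparkSession.builder.appName("nlp-columns-demo").getOrCreate()

def removePunctuation(s):
    # hypothetical stand-in: strip punctuation and lowercase
    return s.translate(str.maketrans('', '', string.punctuation)).lower() if s else s

df = spark.createDataFrame(
    [(1, "Hello, Spark! NGrams are fun."), (2, "Another title, with punctuation...")],
    ["id", "title"])

result = generate_nlp_columns(df, "title")
result.select("title", "title_bigrams", "title_trigrams").show(truncate=False)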
Example #2
def build_pipeline(classifier='rf', max_depth=7):
    """
	creates a pipeline of functionalities to be applied on the training set
	"""

    # Training: tokenize, remove stop words, calculate n-grams, calculate term frequencies
    tokenizer = RegexTokenizer(inputCol="text",
                               outputCol="words",
                               pattern=r'\w{8}|\s')
    remover = StopWordsRemover(inputCol='words',
                               outputCol='filtered',
                               stopWords=['??'])
    ngram_2 = NGram(n=2, inputCol='filtered', outputCol='ngrams')
    ngram_3 = NGram(n=3, inputCol='filtered', outputCol='ngrams')
    hashingTF = HashingTF(inputCol="ngrams", outputCol="features")
    word2vec = Word2Vec(inputCol='ngrams', outputCol='features')

    if classifier == 'rf':
        clf = RandomForestClassifier(maxDepth=max_depth)
        stages = [tokenizer, remover, ngram_2, hashingTF, clf]
    elif classifier == 'nb':
        clf = NaiveBayes(smoothing=1)
        stages = [tokenizer, remover, ngram_3, hashingTF, clf]
    elif classifier == 'lr':
        clf = LogisticRegression()
        stages = [tokenizer, remover, ngram_2, word2vec, clf]
    else:
        raise ValueError("classifier must be 'rf', 'nb', or 'lr'.")
    return stages
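
Since build_pipeline returns a list of stages rather than a fitted model, a likely calling pattern (a sketch, assuming the stage classes used above are imported alongside it) is to wrap the stages in a Pipeline and fit it on a DataFrame with 'text' and 'label' columns:

from pyspark.ml import Pipeline
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
train_df = spark.createDataFrame(
    [("spark makes ngrams easy", 0.0), ("naive bayes needs counts", 1.0)],
    ["text", "label"])

stages = build_pipeline(classifier='rf', max_depth=5)
model = Pipeline(stages=stages).fit(train_df)
model.transform(train_df).select("text", "prediction").show(truncate=False)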
Example #3
def test(opcodes, hashFiles, sc, sqlc, path, featureFitModel):

    asmFiles = hashFiles.map(
        lambda x: "gs://uga-dsp/project2/data/asm/" + x + ".asm")

    def fun(accum, x):
        return accum + ',' + x

    asmFileString = asmFiles.reduce(fun)

    rdd1 = sc.wholeTextFiles(asmFileString, 20)

    opcodesInDoc = rdd1.map(lambda x: x[1].split()).map(
        lambda x: [word for word in x if word in opcodes.value]).zipWithIndex(
        ).map(lambda x: (x[1], x[0]))

    ngramFrame = sqlc.createDataFrame(opcodesInDoc, ["docId", "opcodes"])

    twoGram = NGram(n=2, inputCol="opcodes", outputCol="2grams")
    ngramFrame = twoGram.transform(ngramFrame)

    threeGram = NGram(n=3, inputCol="opcodes", outputCol="3grams")
    ngramFrame = threeGram.transform(ngramFrame)

    fourGram = NGram(n=4, inputCol="opcodes", outputCol="4grams")
    ngramFrame = fourGram.transform(ngramFrame)

    def getSegment(x):
        templist = []
        for line in x:
            l = re.findall(r'\w+:?(?=:)', line)
            if l:
                templist.append(l[0])
        return templist

    segments = rdd1.zipWithIndex().map(lambda x: (x[1], x[0][1].splitlines(
    ))).map(lambda x: (x[0], getSegment(x[1]))).toDF(["docId", "segments"])

    featureFrame = ngramFrame.join(segments, "docId")

    featuresDF = featureFrame.rdd.map(
        lambda x: Row(did=x['docId'],
                      docFeatures=x['opcodes'] + x['2grams'] + x['3grams'] + x[
                          '4grams'] + x['segments'])).toDF()

    featuresCV = featureFitModel.transform(featuresDF)

    testData = featuresCV.drop('docFeatures')
    testData.persist(StorageLevel(True, True, False, False, 1))
    saveData(testData, path)
    testData.show()
Example #4
def build_pipeline():
    tokenizer = [Tokenizer(inputCol='text', outputCol='words')]
    remover = [StopWordsRemover(inputCol="words", outputCol="stopped_words")]
    ngrams = [
        NGram(n=i, inputCol='stopped_words', outputCol='{0}_grams'.format(i))
        for i in range(1, 6)
    ]
    cv = [
        CountVectorizer(vocabSize=50000,
                        inputCol='{0}_grams'.format(i),
                        outputCol='{0}_tf'.format(i)) for i in range(1, 6)
    ]
    idf = [
        IDF(inputCol='{0}_tf'.format(i),
            outputCol='{0}_tfidf'.format(i),
            minDocFreq=5) for i in range(1, 6)
    ]
    tweetvect = [
        VectorAssembler(inputCols=["tweet_count"], outputCol="vec_tweet_count")
    ]
    ss = [
        StandardScaler(inputCol="vec_tweet_count", outputCol="ss_tweet_count")
    ]
    # `input_cols` is assumed to be defined elsewhere at module level
    # (e.g. the tf-idf and scaled tweet-count column names).
    assembler = [VectorAssembler(inputCols=input_cols, outputCol='features')]
    pipeline = Pipeline(stages=tokenizer + remover + ngrams + cv + idf +
                        tweetvect + ss + assembler)
    return pipeline
Example #5
def preprocess(inputCol=["text", "label"], n=4):
    tokenizer = [Tokenizer(inputCol="text", outputCol="words")]
    remover = [StopWordsRemover(inputCol="words", outputCol="filtered")]
    ngrams = [
        NGram(n=i, inputCol="filtered", outputCol="{0}_grams".format(i))
        for i in range(1, n + 1)
    ]

    cv = [
        CountVectorizer(vocabSize=2**14,
                        inputCol="{0}_grams".format(i),
                        outputCol="{0}_tf".format(i)) for i in range(1, n + 1)
    ]
    idf = [
        IDF(inputCol="{0}_tf".format(i),
            outputCol="{0}_tfidf".format(i),
            minDocFreq=2) for i in range(1, n + 1)
    ]

    assembler = [
        VectorAssembler(
            inputCols=["{0}_tfidf".format(i) for i in range(1, n + 1)],
            outputCol="rawFeatures")
    ]
    label_stringIdx = [StringIndexer(inputCol="label", outputCol="labels")]
    selector = [
        ChiSqSelector(numTopFeatures=2**14,
                      featuresCol='rawFeatures',
                      outputCol="features")
    ]
    lr = [LogisticRegression(maxIter=1000)]
    return Pipeline(stages=tokenizer + remover + ngrams + cv + idf +
                    assembler + label_stringIdx + selector + lr)
def extract_collocations(records, num_collocations, collocation_window):
    """Extracts the most common collocations present in the records.

    Params:
    - records (pyspark.rdd.RDD): The tokenized and lemmatized records from the JSON file
    - num_collocations (int): The number of collocations to show
    - collocation_window (int): The text window within which to search for collocations.

    Returns:
    - best_collocations (list<tuple<str, int>>): The highest scored collocations present in the records, with their
                                                 frequency of occurrence in the dataset.
    """
    # @see: https://spark.apache.org/docs/2.2.0/ml-features.html#n-gram
    from pyspark.ml.feature import NGram

    data_frame = records.map(lambda record: Row(record[constants.VALUE])).toDF(
        ['words'])
    ngram_model = NGram(n=2, inputCol='words', outputCol='ngrams')
    ngram_data_frame = ngram_model.transform(data_frame)

    ngram_rdd = ngram_data_frame.select('ngrams').rdd
    ngram_rdd = ngram_rdd.flatMap(lambda row: row['ngrams'])\
        .map(lambda ngram: (ngram.encode('utf-8'), 1))\
        .reduceByKey(add)\
        .sortBy(lambda bigram_with_count: bigram_with_count[1], ascending=False)
    rdd_show(ngram_rdd)

    frequent_collocations = ngram_rdd.take(num_collocations)

    return frequent_collocations
Example #7
    def extract_featrues(self, train_rdd=None, test_rdd=None):
        """
        train_rdd: type rdd, the raw rdd of train data (text content, label)
        test_rdd: type rdd, the raw rdd of test data (text content, doc_id)
        return: type data frame, a data frame where each record contains the extracted features
        """
        print('****************************')
        print('Feature Extraction: TF-IDF\n')

        train_raw_df = train_rdd.map(lambda row:
                                     (self.convert(row[0]), row[1])).toDF(
                                         ['words', 'label'])
        test_raw_df = test_rdd.map(lambda row:
                                   (self.convert(row[0]), row[1])).toDF(
                                       ['words', 'doc_id'])

        ngram = NGram(n=2, inputCol="words", outputCol="ngrams")
        train_ngram_df = ngram.transform(train_raw_df).drop('words')
        test_ngram_df = ngram.transform(test_raw_df).drop('words')

        hashing_tf = HashingTF(inputCol='ngrams', outputCol='raw_features')
        train_raw_featured_data = hashing_tf.transform(train_ngram_df).drop(
            'ngrams')
        test_raw_featured_data = hashing_tf.transform(test_ngram_df).drop(
            'ngrams')

        idf = IDF(inputCol='raw_features', outputCol='features')
        idf_model = idf.fit(train_raw_featured_data)

        train_df = idf_model.transform(train_raw_featured_data).drop(
            'raw_features')
        test_df = idf_model.transform(test_raw_featured_data).drop(
            'raw_features')

        return (train_df, test_df)
Example #8
def build_pipeline():
    tokenizer = [Tokenizer(inputCol='tweet', outputCol='words')]
    ngrams = [
        NGram(n=i, inputCol='words', outputCol='{0}_grams'.format(i))
        for i in range(1, 4)
    ]
    cv = [
        CountVectorizer(vocabSize=5460,
                        inputCol='{0}_grams'.format(i),
                        outputCol='{0}_tf'.format(i)) for i in range(1, 4)
    ]
    idf = [
        IDF(inputCol='{0}_tf'.format(i),
            outputCol='{0}_tfidf'.format(i),
            minDocFreq=5) for i in range(1, 4)
    ]
    assembler = [
        VectorAssembler(inputCols=['{0}_tfidf'.format(i) for i in range(1, 4)],
                        outputCol='features')
    ]
    label_stringIdx = [StringIndexer(inputCol='sentiment', outputCol='label')]
    lr = [LogisticRegression(maxIter=100)]
    pipeline = Pipeline(stages=tokenizer + ngrams + cv + idf + assembler +
                        label_stringIdx + lr)
    return pipeline
Example #9
def ngrram(dataframe, column, x):
    tokens = Tokenizer(inputCol=column, outputCol='tokens')
    nn = NGram(n=x, inputCol='tokens', outputCol='ngrams')
    b = tokens.transform(dataframe)
    a = nn.transform(b)
    final = a.select(['tokens', 'ngrams'])
    final.show(4)
    return final
Example #10
def create_tfidf_model(sentenceDataFrame, ngrams=1, minDocFreq=0):

    tokenized = Tokenizer(inputCol="text",
                          outputCol="words").transform(sentenceDataFrame)

    ngramDataFrame = NGram(n=ngrams, inputCol="words",
                           outputCol="ngrams").transform(tokenized)

    countVect = CountVectorizer(inputCol="ngrams", outputCol="rawFeatures")

    countVectModel = countVect.fit(ngramDataFrame)

    featurizedData = countVectModel.transform(ngramDataFrame)

    idf = IDF(minDocFreq=minDocFreq,
              inputCol="rawFeatures",
              outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    rescaledData.select("label", "features")

    normalizer = Normalizer(inputCol="features", outputCol='scores')
    X = normalizer.transform(rescaledData)

    return X
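
A usage sketch, under the assumption that the input DataFrame carries 'text' and 'label' columns (the function selects 'label' internally) and that the feature classes it uses are imported:

from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, NGram, CountVectorizer, IDF, Normalizer

spark = SparkSession.builder.getOrCreate()
sentenceDataFrame = spark.createDataFrame(
    [("the quick brown fox jumps", 0.0), ("the lazy dog sleeps soundly", 1.0)],
    ["text", "label"])

scored = create_tfidf_model(sentenceDataFrame, ngrams=2, minDocFreq=1)
scored.select("label", "scores").show(truncate=False)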
Example #11
def LR_Model(train_dataframe, test_dataframe):
    '''
    Takes a train_dataframe and a test_dataframe, applies a pipeline of
    RegexTokenizer, NGram (n=3), HashingTF, IDF and LogisticRegression, and
    predicts the label based on the features of test_dataframe.

    The RegexTokenizer pattern is set to r"\W|\b(00|CC)\b" because it removes
    all non-words, that is extra spaces and punctuation ('??' among them);
    '00' and 'CC' are removed as well because they are the most repeated
    tokens, which significantly improves accuracy.
    Args:
        dataframe:
            -The train_dataframe should consist of the columns 'label'
            and 'text'.
            -The test_dataframe should consist of the column 'text'.
    Returns:
        DataFrame['prediction': double, given_order: bigint, label: string]
        if the data read initially is a small dataset,
        else DataFrame['prediction': double, given_order: bigint]
        if the data read initially is a big dataset.
    '''
    train_dataframe = train_dataframe.repartition(96)\
        .withColumn('label', train_dataframe['label'].cast(IntegerType()))
    regexTokenizer = RegexTokenizer(inputCol="text", outputCol="words",
                                    pattern=r"\W|\b(00|CC)\b")
    ngram = NGram(n=3, inputCol="words", outputCol="ngrams")
    hashingTF = HashingTF(inputCol="ngrams", outputCol="TF")
    idf = IDF(inputCol="TF", outputCol="features")
    lr = LogisticRegression(maxIter=20, regParam=0.001)
    pipeline = Pipeline(stages=[regexTokenizer, ngram, hashingTF, idf, lr])
    model = pipeline.fit(train_dataframe)
    predictions_df = model.transform(test_dataframe)
    return predictions_df\
        .drop('ngrams', 'TF', 'text', 'words', 'features')
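
A usage sketch under assumptions: 'label' in the training frame is castable to an integer, and the test frame carries a 'given_order' index column that passes through to the returned predictions (as the docstring's return type suggests):

from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType
from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer, NGram, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression

spark = SparkSession.builder.getOrCreate()
train_dataframe = spark.createDataFrame(
    [("0", "push eax mov ebx pop ecx"), ("1", "jmp start call sub ret near")],
    ["label", "text"])
test_dataframe = spark.createDataFrame(
    [(0, "mov ebx push eax pop ecx")], ["given_order", "text"])

predictions_df = LR_Model(train_dataframe, test_dataframe)
predictions_df.show()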
Example #12
def process_df(df):
    time_seq.append(['start process-df', time.time()])
    model = Pipeline(stages=[
        RegexTokenizer(pattern=" ",
                       inputCol="instruments",
                       outputCol="instruments_tokenized",
                       minTokenLength=1),
        NGram(n=1,
              inputCol="instruments_tokenized",
              outputCol="instruments_ngrams"),
        HashingTF(inputCol="instruments_ngrams",
                  outputCol="instruments_vectors"),
        MinHashLSH(inputCol="instruments_vectors",
                   outputCol="instruments_lsh",
                   numHashTables=10)
    ]).fit(df)

    df_hashed = model.transform(df)
    df_matches = model.stages[-1].approxSimilarityJoin(df_hashed, df_hashed, 0.5, distCol="distance") \
        .filter("datasetA.filename != datasetB.filename AND datasetA.filename < datasetB.filename") \
        .select(f.col('datasetA.filename').alias('filename_A'),
                f.col('datasetB.filename').alias('filename_B'),
                f.col('distance'))
    time_seq.append(['process-df df_matches', time.time()])
    write_df_to_pgsql(df_matches, 'filepair_similarity_run3')
    time_seq.append(['write pgsql', time.time()])
    print('time_seq', time_seq)
def shringles(x, fileName):
    # tokenize and ngrams
    tokenizer = RegexTokenizer(inputCol="value",
                               outputCol="words",
                               pattern="\\W")
    ngrams = NGram(n=x, inputCol="words", outputCol="kshringles")
    shringleList.append(ngrams.transform(tokenizer.transform(read(fileName))))
Example #14
def train(allHex,labels,hashFiles,sc,sqlc,path):

    bytesFiles = hashFiles.map(lambda x: "gs://uga-dsp/project2/data/bytes/"+ x+".bytes")

    def fun(accum,x):
        return accum+','+x

    bytesFileString = bytesFiles.reduce(fun)
    rdd1= sc.wholeTextFiles(bytesFileString,20)

    bytesRdd = rdd1.map(lambda x: x[1].split()).map(lambda x: [word for word in x if word in allHex.value]).zipWithIndex().map(lambda x: (x[1],x[0]))

    ngramFrame = sqlc.createDataFrame(bytesRdd,["did","1grams"])

    twoGram = NGram(n=2, inputCol="1grams", outputCol="2grams")
    ngramFrame = twoGram.transform(ngramFrame)

    featuresDF = ngramFrame.rdd.map(lambda x: Row(did=x['did'], docFeatures=x['1grams']+x['2grams'])).toDF()

    cv = CountVectorizer(inputCol="docFeatures", outputCol="features",vocabSize=1000)

    featureFitModel = cv.fit(featuresDF)

    featuresCV = featureFitModel.transform(featuresDF)

    labelRdd = labels.zipWithIndex().map(lambda x: (x[1],x[0]))

    labelFrame = labelRdd.toDF(["did","label"])

    trainData = featuresCV.join(labelFrame, "did")
    trainData.persist(StorageLevel(True, True, False, False, 1))
    saveData(trainData,path)

    trainData.show()
    return featureFitModel
Example #15
def test(allHex,hashFiles,sc,sqlc,path,featureFitModel):

    bytesFiles = hashFiles.map(lambda x: "gs://uga-dsp/project2/data/bytes/"+ x+".bytes")
    def fun(accum,x):

        return accum+','+x

    bytesFileString = bytesFiles.reduce(fun)
    rdd1= sc.wholeTextFiles(bytesFileString,20)

    bytesRdd = rdd1.map(lambda x: x[1].split()).map(lambda x: [str(int(word,16)) for word in x if word in allHex.value]).zipWithIndex().map(lambda x: (x[1],x[0]))
    Vec= bytesRdd.map(lambda x: (x[0],createVector(x[1])))
    sparseVec = Vec.map(lambda x: (x[0],SparseVector(256,numpy.nonzero(x[1])[0],x[1][x[1]>0])))

    ngramFrame = sqlc.createDataFrame(sparseVec,["did","1grams"])

    twoGram = NGram(n=2, inputCol="1grams", outputCol="2grams")
    ngramFrame = twoGram.transform(ngramFrame)

    featuresDF = ngramFrame.rdd.map(lambda x: Row(did=x['did'], docFeatures=x['1grams']+x['2grams'])).toDF()

    featuresCV = featureFitModel.transform(featuresDF)

    testData = featuresCV.drop('docFeatures')
    testData.persist(StorageLevel(True, True, False, False, 1))
    saveData(testData, path)
    testData.show()
Example #16
def feature_engineering(class_balancedDf):
    # N-Gram
    ngram = NGram(n=2, inputCol="lemmatized", outputCol="ngrams")
    ngramDataFrame = ngram.transform(class_balancedDf)

    # Hashing TF
    hashingTF = HashingTF(inputCol="ngrams",
                          outputCol="rawFeatures",
                          numFeatures=20)
    featurizedData = hashingTF.transform(ngramDataFrame)

    # IDF
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    # K-Means
    kmeans = KMeans().setK(6).setSeed(1)
    kmodel = kmeans.fit(rescaledData).transform(rescaledData)

    #LDA
    lda = LDA(k=10, maxIter=10)
    ldamodel = lda.fit(kmodel).transform(kmodel)

    # changing label column to int
    data = ldamodel.withColumn(
        "label", ldamodel.label.cast("Integer")).drop("prediction")

    return data
Example #17
def build_ngrams(n=3):
    tokenizer = [Tokenizer(inputCol="text", outputCol="tokens")]
    stopwordsRemover = [
        StopWordsRemover(inputCol='tokens', outputCol='tokens_filtered')
    ]
    ngrams = [
        NGram(n=i, inputCol="tokens", outputCol="{0}_grams".format(i))
        for i in range(1, n + 1)
    ]
    cv = [
        CountVectorizer(vocabSize=5460,
                        inputCol="{0}_grams".format(i),
                        outputCol="{0}_cv".format(i)) for i in range(1, n + 1)
    ]
    idf = [
        IDF(inputCol="{0}_cv".format(i),
            outputCol="{0}_idf".format(i),
            minDocFreq=5) for i in range(1, n + 1)
    ]
    assembler = [
        VectorAssembler(
            inputCols=["{0}_idf".format(i) for i in range(1, n + 1)],
            outputCol="features")
    ]

    stringIndexer = [StringIndexer(inputCol="class", outputCol="label")]
    lr = [LogisticRegression(maxIter=100)]

    return Pipeline(stages=tokenizer + ngrams + cv + idf + assembler +
                    stringIndexer + lr)
def Ngram_feature(N, feature_rdd):
    '''
    Extract and count N-grams. Keep only the top 1000 n-gram features when n is 2 or more.

    Input:
        feature_rdd : [(<hash1>,<feature1>), (<hash1>,<feature2>), ..., (<hashN>,<featureK>)]

    Output:
        freq_ngram_count_rdd : [((<hash>,<ngram feature>),cnt), ...]
    '''
    feature_rdd = feature_rdd.groupByKey().map(lambda x: (x[0],list(x[1])))
    df = spark.createDataFrame(feature_rdd).toDF("file_names", "features")
    ngram = NGram(n=N, inputCol="features", outputCol="ngrams")
    ngramDataFrame = ngram.transform(df)
    ngram_rdd = ngramDataFrame.rdd.map(tuple).map(lambda x: (x[0],x[2])).flatMapValues(lambda x: x)
    ngram_count_rdd = ngram_rdd.map(lambda x: ((x),1)).reduceByKey(add)
    freq_ngram_count_rdd = ngram_count_rdd

    if not N == 1:
        #[(<ngram feature>,cnt), ...]
        topN_ngram_count_rdd = freq_ngram_count_rdd.map(lambda x: (x[0][1],x[1])).reduceByKey(add)
        #[((<ngram feature>,cnt),index), ...]
        topN_ngram_count_rdd = topN_ngram_count_rdd.sortBy(lambda x: x[1],ascending=False).zipWithIndex()
        length = topN_ngram_count_rdd.count()
        #top [(<ngram feature>,cntSum), ...]
        topN_ngram_count_rdd = topN_ngram_count_rdd.filter(lambda x: x[1]<1000).map(lambda x: x[0])
        #freq [(<ngram feature>,(<hash>,cnt)), ...]
        freq_ngram_count_rdd = freq_ngram_count_rdd.map(lambda x: (x[0][1],(x[0][0],x[1])))
        #[(<ngram feature>,(cntSum,(<hash>,cnt))), ...]
        freq_ngram_count_rdd = topN_ngram_count_rdd.join(freq_ngram_count_rdd).map(lambda x: ((x[1][1][0],x[0]),x[1][1][1]))
    
    return freq_ngram_count_rdd
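
A usage sketch (assumptions: a SparkSession bound to the global name `spark` and `add` imported from operator, both of which Ngram_feature relies on):

from operator import add
from pyspark.sql import SparkSession
from pyspark.ml.feature import NGram

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

feature_rdd = sc.parallelize([
    ("hash1", "push"), ("hash1", "mov"), ("hash1", "pop"),
    ("hash2", "jmp"), ("hash2", "call"), ("hash2", "ret"),
])

bigram_counts = Ngram_feature(2, feature_rdd)
print(bigram_counts.take(5))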
Example #19
def ngramFeatureExtractors(n, inputCol=["text", "target"]):
    tokenizer = [Tokenizer(inputCol="text", outputCol="words")]
    ngrams = [
        NGram(n=i, inputCol="words", outputCol="{0}_grams".format(i))
        for i in range(1, n + 1)
    ]

    count_vectorizer = [
        CountVectorizer(vocabSize=5460,
                        inputCol="{0}_grams".format(i),
                        outputCol="{0}_tf".format(i)) for i in range(1, n + 1)
    ]
    idf = [
        IDF(inputCol="{0}_tf".format(i),
            outputCol="{0}_tfidf".format(i),
            minDocFreq=5) for i in range(1, n + 1)
    ]

    assembler = [
        VectorAssembler(
            inputCols=["{0}_tfidf".format(i) for i in range(1, n + 1)],
            outputCol="features")
    ]
    label_stringIdx = [StringIndexer(inputCol="target", outputCol="label")]
    lr = [LogisticRegression(maxIter=100)]
    return Pipeline(stages=tokenizer + ngrams + count_vectorizer + idf +
                    assembler + label_stringIdx + lr)
Example #20
  def create_ngram(self, df, n, input_col, output_col='ngrams'):
    "Generate N-Gram -> https://spark.apache.org/docs/2.2.0/ml-features.html#n-gram"
    from pyspark.ml.feature import NGram

    ngram = NGram(n=n, inputCol=input_col, outputCol=output_col)

    ngram_df = ngram.transform(df)
    return ngram_df
Example #21
def build_ngrams_part(inputCol="words", n=6):
    # Note: both ranges start at 7, so no stages are produced unless n >= 7
    # (with the default n=6 the returned Pipeline is empty).
    ngrams = [
        NGram(n=i, inputCol="words", outputCol="ngrams_{0}".format(i))
        for i in range(7, n + 1)]
    vectorizers = [
        CountVectorizer(inputCol="ngrams_{0}".format(i), outputCol="ngramscounts_{0}".format(i))
        for i in range(7, n + 1)]
    return Pipeline(stages=ngrams + vectorizers)
Example #22
 def test_ngram(self):
     dataset = self.spark.createDataFrame([Row(input=["a", "b", "c", "d", "e"])])
     ngram0 = NGram(n=4, inputCol="input", outputCol="output")
     self.assertEqual(ngram0.getN(), 4)
     self.assertEqual(ngram0.getInputCol(), "input")
     self.assertEqual(ngram0.getOutputCol(), "output")
     transformedDF = ngram0.transform(dataset)
     self.assertEqual(transformedDF.head().output, ["a b c d", "b c d e"])
Example #23
def main(train_x,
         train_y,
         test_x,
         test_y=None,
         idf=False,
         ngram=1,
         base='gs',
         asm=False):
    # Load : DF[id, url, features, label?]
    # The DataFrames only have a labels column if labels are given.
    # We drop the text, since Naive Bayes doesn't use it and we already have all the tokens
    kind = 'asm' if asm else 'bytes'
    train = elizabeth.load(train_x, train_y, base=base, kind=kind).drop('text')
    test = elizabeth.load(test_x, test_y, base=base, kind=kind).drop('text')

    # convert the string labels to numeric indices
    # the handleInvalid param allows the label indexer to deal with labels that weren't seen during fitting
    label_indexer = StringIndexer(inputCol='label',
                                  outputCol='indexedLabel',
                                  handleInvalid="skip")
    label_indexer = label_indexer.fit(train)
    train = label_indexer.transform(train)
    # the test set won't always have labels
    if test_y is not None:
        test = label_indexer.transform(test)

    index_labeller = IndexToString(inputCol='prediction',
                                   outputCol='predictedClass',
                                   labels=label_indexer.labels)

    # Train the preprocessor and transform the data.
    prep = elizabeth.Preprocessor()
    prep.add(NGram(n=int(ngram)))
    prep.add(CountVectorizer())
    if idf: prep.add(IDF())
    train = prep.fit(train)
    test = prep.transform(test)

    # Naive Bayes : DF[id, url, text, features, label?, rawPrediction, probability, prediction]
    nb = NaiveBayes(labelCol='indexedLabel').fit(train)
    test = nb.transform(test)
    test = index_labeller.transform(
        test)  # DF[id, url, ... prediction, predictedClass]

    # If labels are given for the test set, print a score.
    if test_y:
        test = test.orderBy(test.id)
        test = test.withColumn(
            'correct', (test.label == test.predictedClass).cast('double'))
        test = test.select(avg(test.correct))
        print(test.show())

    # If no labels are given for the test set, print predictions.
    else:
        test = test.orderBy(test.id).select(test.predictedClass)
        test = test.rdd.map(lambda row: int(row.predictedClass))
        test = test.toLocalIterator()
        print(*test, sep='\n')
def main():
    # basic cleaning and getting of files
    get_moby()
    sentences = get_sentences()

    # create spark app, for use in iPython notebook OR as a standalone.
    spark = SparkSession\
        .builder\
        .appName("NGramSample")\
        .getOrCreate()

    # build a distributed dataframe
    sentence_df = spark.createDataFrame(sentences, ['id', 'sentences'])

    # create a tokenizer and write a 'words' column to DF
    tokenizer = Tokenizer(inputCol='sentences', outputCol='words')
    words = tokenizer.transform(sentence_df)

    # create ngram generators for bi, tri, and quad grams
    bigram = NGram(n=2, inputCol='words', outputCol='bigrams')
    trigram = NGram(n=3, inputCol='words', outputCol='trigrams')
    quadgram = NGram(n=4, inputCol='words', outputCol='quadgrams')

    # add each one in turn to the df
    bigrams = bigram.transform(words)
    trigrams = trigram.transform(bigrams)
    final = quadgram.transform(trigrams)

    # write as traversable JSON
    if os.path.exists('ngrams'):
        shutil.rmtree('ngrams')
    final.coalesce(1).write.json('ngrams')

    # as an example, write out bigrams to CSV
    if os.path.exists('bigrams'):
        shutil.rmtree('bigrams')

    # This tricky bit selects bigrams, explodes it, and regroups by unique
    # bigram, then adds a count, after filtering out extremely uncommon bigrams
    # It finally writes to a CSV
    final.select('bigrams')\
        .withColumn('bigrams', explode('bigrams'))\
        .groupBy('bigrams').count().orderBy('count', ascending=False)\
        .filter('count > 10')\
        .coalesce(1).write.csv('bigrams')
Example #25
def calculate_vectors(data, n=2, binary=False):
    ngram = NGram(n=n, inputCol="sequence", outputCol="ngrams")
    ngramDataFrame = ngram.transform(data)
    ngrams = ngramDataFrame.select("ngrams")
    cvectorizer = CountVectorizer(
        inputCol="ngrams", outputCol="vec", binary=binary
    )
    model = cvectorizer.fit(ngrams)
    return model.transform(ngrams).select("vec")
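
A usage sketch, assuming the input DataFrame's 'sequence' column is an array of string tokens (here, single characters of short sequences):

from pyspark.sql import SparkSession
from pyspark.ml.feature import NGram, CountVectorizer

spark = SparkSession.builder.getOrCreate()
data = spark.createDataFrame(
    [(list("ACGTAC"),), (list("TTGACA"),)], ["sequence"])

vectors = calculate_vectors(data, n=2, binary=True)
vectors.show(truncate=False)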
Example #26
def initialize():

    spark = SparkSession \
        .builder \
        .appName("search-flight-spark-ml-model") \
        .getOrCreate()
    sc = spark.sparkContext

    auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
    auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)
    api = tweepy.API(auth)
    important_fields = ['id', 'text', 'user']

    schema = StructType([
        StructField('id', LongType(), False),
        StructField('text', StringType(), False),
        StructField('username', StringType(), False)
    ])

    tweetsDf = spark.createDataFrame(sc.emptyRDD(), schema)

    for tweet in tweepy.Cursor(api.search, q='barajas', rpp=100,
                               lang='en').items(MAX_TWEETS):
        json_tweet = {k: tweet._json[k] for k in important_fields}
        json_tweet['text'] = json_tweet['text'].replace("'", "").replace(
            "\"", "").replace("\n", "")
        tweetDf = spark.createDataFrame([
            (json_tweet['id'], json_tweet['text'], json_tweet['user']['name'])
        ], schema)
        tweetsDf = tweetsDf.union(tweetDf)

    tweets_df_splitted = tweetsDf.randomSplit([0.75, 0.25], MAX_TWEETS)
    training_set = tweets_df_splitted[0]
    test_set = tweets_df_splitted[1]

    username_indexed = StringIndexer(inputCol="username",
                                     outputCol="username_indexed")
    tokenizer = Tokenizer(inputCol="text", outputCol="token_raw")
    ngram = NGram(inputCol="token_raw", outputCol="ngram", n=2)
    hashing_tf = HashingTF(inputCol="ngram", outputCol="tf", numFeatures=20)
    idf = IDF(inputCol="tf", outputCol="idf", minDocFreq=2)
    lr = LogisticRegression(featuresCol="idf", labelCol="username_indexed")
    pipeline = Pipeline(
        stages=[username_indexed, tokenizer, ngram, hashing_tf, idf, lr])

    pipeline_model = pipeline.fit(training_set)
    pipeline_model.write().overwrite().save("tweet_traveling_partners_model")

    tweet_traveling_partners_prediction = pipeline_model.transform(test_set)

    selected = tweet_traveling_partners_prediction.select(
        "username", "text", "probability", "prediction")
    for row in selected.collect():
        print(row)

    spark.stop()
def bytes_ngram(df_bytes, n):
    """
    Generates n-grams bytes by bytes data frame.
    Returns n-grams bytes in RDD((hash, n-gram), total_counts)
    """
    ngrams = NGram(n=n, inputCol="bytes", outputCol="ngrams")
    df_ngrams = ngrams.transform(df_bytes)
    rdd_ngrams = df_ngrams.select("hash", "ngrams").rdd.map(tuple).flatMapValues(lambda x: x)\
                    .map(lambda x: ((x[0], x[1]), 1)).reduceByKey(add)
    return rdd_ngrams
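
A usage sketch (assumptions: `add` is imported from operator, and df_bytes has a 'hash' column plus a 'bytes' column holding an array of byte tokens):

from operator import add
from pyspark.sql import SparkSession
from pyspark.ml.feature import NGram

spark = SparkSession.builder.getOrCreate()
df_bytes = spark.createDataFrame(
    [("hash1", ["00", "1F", "8B", "00"]), ("hash2", ["DE", "AD", "BE", "EF"])],
    ["hash", "bytes"])

print(bytes_ngram(df_bytes, 2).take(5))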
Example #28
def opcode_ngram(df_opcode, N):
    """
    Generates n-grams opcode by opcode data frame.
    Returns n-grams opcode in RDD((filename, n-gram), total_counts)
    """
    ngrams = NGram(n=N, inputCol="opcode", outputCol="ngrams")
    df_ngrams = ngrams.transform(df_opcode)
    rdd_ngrams = df_ngrams.select("filename", "ngrams").rdd.map(tuple).flatMapValues(lambda x: x)\
                    .map(lambda x: ((x[0], x[1]), 1)).reduceByKey(add)
    return rdd_ngrams
Example #29
def main():
    input_dataset = sys.argv[1]
    output_dir = sys.argv[2]

    start_time = time.time()

    #stackoverflow_df = sqlContext.read.csv("../Datasource/stackOverFlow_ID_Title_SMALL.csv", header=True).toDF('id', 'text')

    stackoverflow_df = sqlContext.read.csv(input_dataset,
                                           header=True).toDF('id', 'text')

    # stackoverflow_df.show()

    # stackoverflow_df.head(10).show()

    # stack_df = stack_rdd.toDF(['id','text'])

    # stackoverflow_df.show()

    # stackoverflow_df.printSchema()

    model = Pipeline(stages=[
        RegexTokenizer(
            pattern="", inputCol="text", outputCol="tokens", minTokenLength=1),
        NGram(n=3, inputCol="tokens", outputCol="ngrams"),
        HashingTF(inputCol="ngrams", outputCol="vectors"),
        MinHashLSH(
            inputCol="vectors", outputCol="lsh"
        )  #MinHashLSH(inputCol="vectors", outputCol="lsh", numHashTables=5)
    ]).fit(stackoverflow_df)

    db_hashed = model.transform(stackoverflow_df)

    # db_hashed.show()
    # query_hashed = model.transform(query)

    # db_hashed.show()
    # query_hashed.show()

    #res = model.stages[-1].approxSimilarityJoin(db_hashed, db_hashed, 0.90).filter("datasetA.id < datasetB.id")

    res = model.stages[-1].approxSimilarityJoin(db_hashed, db_hashed,
                                                0.70).filter("distCol > 0")

    #print res

    #print res.count()

    res.show()

    elapsed_time = time.time() - start_time

    print('Elapsed Time ==> ', elapsed_time)
def make_ngrams(df, n=1):

    df = df.withColumn('normalized_text', processing(F.col('text')))
    tokenizer = Tokenizer(inputCol="normalized_text", outputCol="tokens")
    tokenized = tokenizer.transform(df).drop('normalized_text')
    
    ngram = NGram(n=n, inputCol="tokens", outputCol="n_gram")
    n_gram_df = ngram.transform(tokenized)
    n_gram_df = n_gram_df.withColumn('n_gram', F.explode('n_gram'))
    n_gram_df = n_gram_df.filter(F.length('n_gram')>2)
    
    return n_gram_df
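
A usage sketch: `processing` is not shown above, so this supplies a hypothetical stand-in that lowercases the text and strips punctuation before calling make_ngrams:

import string
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.feature import Tokenizer, NGram

spark = SparkSession.builder.getOrCreate()

def _normalize(text):
    # hypothetical stand-in for the missing `processing` helper
    return text.lower().translate(str.maketrans('', '', string.punctuation)) if text else text

processing = F.udf(_normalize, "string")

df = spark.createDataFrame(
    [("The quick brown fox jumps!",), ("A lazy dog sleeps soundly.",)], ["text"])

trigrams = make_ngrams(df, n=3)
trigrams.groupBy("n_gram").count().orderBy("count", ascending=False).show(truncate=False)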