# Bigram features from the tokenized words.
grammer = NGram(n=2, inputCol="words", outputCol="grams")
data = grammer.transform(data).drop('doc')
print("grammed\ncvectorizing1")
#data = data.rdd.map(lambda x: Row(x['did'], x['words'] + x['grams'])).toDF(['did', 'features'])
data = CountVectorizer(inputCol="words", outputCol="wordCounts", vocabSize=1000).fit(data).transform(data)
print("cvectorized1\ncvectorizing2")
data = CountVectorizer(inputCol="grams", outputCol="gramCounts", vocabSize=1000).fit(data).transform(data)
print("cvectorized2\nvectorassembling")
# Concatenate the unigram and bigram count vectors into a single feature column.
data = VectorAssembler(inputCols=['wordCounts', 'gramCounts'], outputCol='features')\
    .transform(data)\
    .drop('words', 'grams', 'wordCounts', 'gramCounts', 'doc')
data = data.join(labels, ['did'])
print('joined\nindexing')
indexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
data = indexer.fit(data).transform(data)
print('indexed')
data.write.parquet("gs://elinor/NBTrainData")

#------------------------------FIT RANDOM FOREST-----------------------------#
print("fittingRF")
rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="features", numTrees=10)
rfmodel = rf.fit(data)
print("fitted")

#------------------------------FIT NB ---------------------------------------#
print("fitting NB")
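# The fit announced by the print above does not appear in this fragment. A
# minimal sketch of what it would likely look like, reusing the same
# indexedLabel/features columns; the NaiveBayes estimator, its parameters, and
# the name nbmodel are assumptions, not taken from the original script.
from pyspark.ml.classification import NaiveBayes  # may already be imported at the top of the file
nb = NaiveBayes(labelCol="indexedLabel", featuresCol="features",
                smoothing=1.0, modelType="multinomial")
nbmodel = nb.fit(data)
print("fitted NB")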
toker = Tokenizer(inputCol="doc", outputCol="words")
data = toker.transform(data)
print("tokenized\ngramming")
grammer = NGram(n=2, inputCol="words", outputCol="grams")
data = grammer.transform(data).drop('doc')
print("grammed\ncvectorizing1")
#data = data.rdd.map(lambda x: Row(x['did'], x['words'] + x['grams'])).toDF(['did', 'features'])
data = CountVectorizer(inputCol="words", outputCol="wordCounts").fit(data).transform(data)
print("cvectorized1\ncvectorizing2")
data = CountVectorizer(inputCol="grams", outputCol="gramCounts").fit(data).transform(data)
print("cvectorized2\nvectorassembling")
data = VectorAssembler(inputCols=['wordCounts', 'gramCounts'], outputCol='features')\
    .transform(data)\
    .drop('words', 'grams', 'wordCounts', 'gramCounts', 'doc')
data = data.join(labels, ['did'])
print('joined\nindexing')
indexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
data = indexer.fit(data).transform(data)
print('indexed')

#------------------------------DEAL WITH TEST DATA---------------------------#
print("loading test data")
# Build a single comma-separated string of bytes-file paths so Spark can read
# them all in one call.
if DEBUG:
    wtf = sc.textFile(TEST_DATA)\
        .map(lambda x: "data/bytes/" + x + ".bytes")\
        .reduce(lambda accum, x: accum + "," + x)
else:
    wtf = sc.textFile(TEST_DATA)\
        .map(lambda x: "gs://uga-dsp/project2/data/bytes/" + x + ".bytes")\
        .reduce(lambda accum, x: accum + "," + x)
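# The comma-joined string wtf is the form SparkContext readers accept for
# multiple input paths. A possible next step, mirroring the did/doc columns
# used on the training side; the use of wholeTextFiles, the filename-to-id
# parsing, and the name testData are assumptions, not taken from the original
# script.
from pyspark.sql import Row  # likely already imported for the commented-out Row usage above
testData = sc.wholeTextFiles(wtf)\
    .map(lambda x: Row(x[0].split('/')[-1].split('.')[0], x[1]))\
    .toDF(['did', 'doc'])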