def test_rbm():
    rbm.test_rbm(training_epochs=1, batch_size=300, n_chains=1, n_samples=1,
                 n_hidden=20, output_folder='tmp_rbm_plots')
def test_rbm():
    t0 = time.time()
    rbm.test_rbm(training_epochs=1, batch_size=300, n_chains=1, n_samples=1,
                 output_folder='tmp_rbm_plots')
    print >> sys.stderr, "test_rbm took %.3fs expected ??s in our buildbot" % (
        time.time() - t0)
def main():
    whiten = False
    if len(sys.argv) > 1 and sys.argv[1] == '--whiten':
        whiten = True
        del sys.argv[1]
    if len(sys.argv) <= 3:
        print 'Usage: %s pcaDims n_hidden learningRate' % sys.argv[0]
        sys.exit(1)

    # loads data like datasets = ((train_x, train_y), ([], None), (test_x, None))
    datasets = loadUpsonData('../data/upson_rovio_1/train_15_50000.pkl.gz',
                             '../data/upson_rovio_1/test_15_50000.pkl.gz')
    img_dim = 15   # must match actual size of training data
    print 'done loading.'

    pcaDims = int(sys.argv[1])
    pca = PCA(datasets[0][0])  # train
    datasets[0][0] = pca.toPC(datasets[0][0], pcaDims, whiten=whiten)  # train
    datasets[1][0] = pca.toPC(datasets[1][0], pcaDims, whiten=whiten) if len(datasets[1][0]) > 0 else array([])  # valid
    datasets[2][0] = pca.toPC(datasets[2][0], pcaDims, whiten=whiten)  # test
    print 'reduced by PCA to'
    print('(%d, %d, %d) %d dimensional examples in (train, valid, test)' %
          (datasets[0][0].shape[0], datasets[1][0].shape[0],
           datasets[2][0].shape[0], datasets[0][0].shape[1]))

    # plot mean and principal components
    image = Image.fromarray(
        tile_raster_images(X=pca.meanAndPc(pcaDims).T,
                           img_shape=(img_dim, img_dim),
                           tile_shape=(10, 10),
                           tile_spacing=(1, 1)))
    image.save(os.path.join(resman.rundir, 'meanAndPc.png'))

    # plot fractional stddev in PCA dimensions
    pyplot.semilogy(pca.fracStd, 'bo-')
    if pcaDims is not None:
        pyplot.axvline(pcaDims)
    pyplot.savefig(os.path.join(resman.rundir, 'fracStd.png'))
    pyplot.clf()

    test_rbm(datasets=datasets,
             training_epochs=45,
             img_dim=img_dim,
             n_input=pcaDims if pcaDims else img_dim * img_dim,
             n_hidden=int(sys.argv[2]),
             learning_rate=float(sys.argv[3]),
             output_dir=resman.rundir,
             quickHack=False,
             visibleModel='real',
             initWfactor=.01,
             imgPlotFunction=lambda xx: pca.fromPC(xx, unwhiten=whiten))
def main():
    # Load both cubes and spheres datasets
    img_dim = 10    # 2, 4, 10, 15, 28
    cubeDatasets = loadPickledData('../data/cubes/train_%d_50000.pkl.gz' % img_dim,
                                   '../data/cubes/test_%d_50000.pkl.gz' % img_dim)
    sphereDatasets = loadPickledData('../data/spheres/train_%d_50000.pkl.gz' % img_dim,
                                     '../data/spheres/test_%d_50000.pkl.gz' % img_dim)

    # If necessary, reduce to 20000 rows to prevent memory error
    #cubeDatasets = ((cubeDatasets[0][0][:20000,:], None),
    #                (None, None),
    #                (cubeDatasets[2][0][:20000,:], None))
    #sphereDatasets = ((sphereDatasets[0][0][:20000,:], None),
    #                  (None, None),
    #                  (sphereDatasets[2][0][:20000,:], None))

    # make different size datasets
    sizes = [10, 20, 40, 100, 200, 400, 1000, 2000, 4000, 10000, 20000, 40000]
    #sizes = [10, 20, 40, 100, 200, 400, 1000, 2000, 4000, 10000]
    #sizes = [20000]
    #sizes = [40000]
    #sizes = [10, 20]

    sizedDatasetsX = {}
    sizedDatasetsXY = {}
    for size in sizes:
        sizedDatasetsX[size] = makeSizedDataset(size, cubeDatasets, sphereDatasets, appendClass=False)
        sizedDatasetsXY[size] = makeSizedDataset(size, cubeDatasets, sphereDatasets, appendClass=True)
    testDatasetX = makeSizedDataset(40000, cubeDatasets, sphereDatasets, appendClass=False)
    testDatasetXY = makeSizedDataset(40000, cubeDatasets, sphereDatasets, appendClass=True)
    print 'done loading.'

    for useXY in [False, True]:
        for size in sizes:
            print 'useXY', useXY, ', Size:', size
            thisDir = os.path.join(resman.rundir, '%s_size_%05d' % ('xy' if useXY else 'x', size))
            os.mkdir(thisDir)
            if useXY:
                thisDataset = (sizedDatasetsXY[size], (array([]), None), testDatasetXY)
            else:
                thisDataset = (sizedDatasetsX[size], (array([]), None), testDatasetX)

            # this automatically saves the RBM to the given directory
            rbm, meanCosts = test_rbm(datasets=thisDataset,
                                      training_epochs=45,
                                      img_dim=img_dim,
                                      n_hidden=400,
                                      learning_rate=.002,
                                      output_dir=thisDir,
                                      quickHack=False,
                                      initWfactor=.02,
                                      imgPlotFunction=lambda xx: xx[:, 0:img_dim*img_dim],  # HACK: plot first slice
                                      )
def __init__(self, vsize=None, hsizes=[], lr=None, bsize=10, seed=123):
    assert vsize and hsizes and lr

    #input = T.dmatrix('global_input')
    self.layers = []
    for hsize in hsizes:
        r = rbm.test_rbm(learning_rate=lr, output_folder='dbn_rbm_plots')
        # configure inputs for subsequent layer
        input = self.layers[-1].hid
        vsize = hsize
with gzip.open('mnist.pkl.gz', 'rb') as f:
    try:
        train_set, valid_set, test_set = pickle.load(f, encoding='latin1')
    except:
        train_set, valid_set, test_set = pickle.load(f)

mnist_x, data_y = train_set
test_x, test_y = test_set

for i in range(10):
    res = test_rbm(dataset=mnist_x[data_y == i],
                   neg_dataset=mnist_x[data_y != i],
                   learning_rate=0.1,
                   training_epochs=10,
                   batch_size=20,
                   output_folder="disscd_digit_%d" % i,
                   n_hidden=500,
                   k=10,
                   pcd=False)

    input = T.matrix('input')
    en = res.get_energy(input)
    ef = theano.function(inputs=[input], outputs=[en])

    persistent_vis_chain = theano.shared(
        numpy.asarray(
            test_x[test_y == i],
            dtype=theano.config.floatX
        )
import rbm, DBN
from rbm import test_rbm
from DBN import test_DBN
'''
A simple script to run some of the default tests to determine the speed of
training different algorithms.

I found that my result was slower for the RBM test (151.38 compared to 122.47),
but faster than the DBN result quoted on deeplearning.net by the LISA lab
(on my ThinkPad T430 I get ~1.65 mins/epoch for the DBN compared to
2.2 mins/epoch there). The pretraining ran for 871.49 m and the fine-tuning ran
for 226.5 m, giving a best validation and test error of 1.49% (compared to
1.27% and 1.34% respectively for the LISA lab) at iteration 110000. This is
slower than the 615 minutes for pretraining and 101 minutes for fine-tuning
reported by the LISA lab. Running this and lrn2 at the same time may have
slowed it down at the end.

Started testing the default mnist_pretrain algorithm from lrn2 (the one with
RBMs). Each epoch takes about 97 seconds. Full training took 590.5 m.
'''
if (sys.argv[1] == 'rbm'):
    test_rbm()
elif (sys.argv[1] == 'dbn'):
    test_DBN()
elif (sys.argv[3] == 'mnist_pretrain'):
    parser = argparse.ArgumentParser(
        description="Run a complete lrn2 work flow")
    parser.add_argument("run_keyword", metavar="run_keyword",
                        help="Keyword for the current test")
    parser.add_argument("modelconfig", help="model config file")
    parser.add_argument("nettype", help="Type of net I am using")
    parser.add_argument(
        "--re-train",
def summarize(text):
    # SPLIT TO PARAGRAPHS
    pre_paragraphs = text.split('\n')
    paragraphs = []
    for i, p in enumerate(pre_paragraphs):
        if not re.match(r'^\s*$', p) and (i == len(pre_paragraphs) - 1
                                          or re.match(r'^\s*$', pre_paragraphs[i + 1])):
            paragraphs.append(p)
    # print(f'Num of paragraphs: {len(paragraphs)}')
    # for i, p in enumerate(paragraphs):
    #     print(f'par#{i+1}: {p}')

    # SPLIT TO SENTENCES
    sentences = separator.separate(text)
    print(f'Num of sentences: {len(sentences)}')
    for i, s in enumerate(sentences):
        print(f'#{i+1}: {s}')

    # TOKENIZE
    stem = False
    if stem:
        tokenized_sentences = [[
            czech_stemmer.cz_stem(word, aggressive=False) for word in sentence
        ] for sentence in tokenize(sentences)]
    else:
        tokenized_sentences = tokenize(sentences)

    # REMOVE STOPWORDS
    tokenized_sentences_without_stopwords = remove_stop_words(
        tokenized_sentences, keep_case=False)
    sentences_without_stopwords_case = remove_stop_words(
        sentences, keep_case=True, is_tokenized=False, return_tokenized=False)
    print('===Sentences without stopwords===')
    for i, s in enumerate(tokenized_sentences_without_stopwords):
        print(f'''#{i+1}: {' '.join(s)}''')
    print('===Sentences without stopwords CASE===')
    for i, s in enumerate(sentences_without_stopwords_case):
        print(f'''#{i+1}: {s}''')

    # POS-TAG
    tagged_sentences = pos_tag(sentences_without_stopwords_case)
    print('=====Tagged_sentences=====')
    for i, s in enumerate(tagged_sentences):
        print(f'''#{i+1}: {s}''')

    # 1. THEMATICITY FEATURE
    thematicity_feature_scores = thematicity_feature(
        tokenized_sentences_without_stopwords)
    # 2. SENTENCE POSITION FEATURE - NOTE: shitty!
    sentence_position_scores = sentence_position_feature(len(sentences))
    # 3. SENTENCE LENGTH FEATURE
    sentence_length_scores = sentence_length_feature(tokenized_sentences)
    # 4. SENTENCE PARAGRAPH POSITION FEATURE
    # 5. PROPER_NOUN FEATURE
    proper_noun_scores = proper_noun_feature(tagged_sentences)
    # 6. NUMERALS FEATURE
    numerals_scores = numerals_feature(tokenized_sentences)
    # 7. NAMED ENTITIES FEATURE - very similar to PROPER_NOUN FEATURE
    # 8. TF_ISF FEATURE - NOTE: TextRank instead of TS_ISF ??? ts_isf_orig is meh
    tf_isf_scores = tf_isf_orig_feature(tokenized_sentences_without_stopwords)
    # 9. CENTROID SIMILARITY FEATURE
    centroid_similarity_scores = centroid_similarity_feature(
        sentences, tf_isf_scores)
    # 10. UPPER-CASE FEATURE (not in the paper)
    upper_case_scores = upper_case_feature(tokenized_sentences)
    # 11. QUOTES FEATURE (not in the paper)
    quotes_scores = quotes_feature(sentences)
    # 12. REFERENCES FEATURE (not in the paper)
    references_scores = references_feature(tokenized_sentences)
    # 13. TEXTRANK FEATURE (not in the paper)
    textrank_scores = textrank.textrank(tokenized_sentences, True, '4-1-0.0001')

    feature_matrix = []
    feature_matrix.append(thematicity_feature_scores)
    feature_matrix.append(sentence_position_scores)
    feature_matrix.append(sentence_length_scores)
    feature_matrix.append(proper_noun_scores)
    feature_matrix.append(numerals_scores)
    feature_matrix.append(tf_isf_scores)
    feature_matrix.append(centroid_similarity_scores)
    feature_matrix.append(upper_case_scores)

    features = [
        ' thema', 'sen_pos', 'sen_len', ' propn', ' num', ' tf_isf',
        'cen_sim', ' upper'
    ]

    feature_matrix_2 = np.zeros((len(sentences), len(features)))
    for i in range(len(features)):
        for j in range(len(sentences)):
            feature_matrix_2[j][i] = feature_matrix[i][j]

    feature_sum = []
    for i in range(len(np.sum(feature_matrix_2, axis=1))):
        feature_sum.append(np.sum(feature_matrix_2, axis=1)[i])

    print('=====Scores=====')
    print(35 * ' ', end='|')
    for f in features:
        print(f, end='|')
    print()
    for i, s in enumerate(sentences):
        print(f'#{"{:2d}".format(i + 1)}: {s[:30]}', end='|')
        for f_s in feature_matrix:
            print('{: .4f}'.format(round(f_s[i], 4)), end='|')
        print('{: .4f}'.format(round(feature_sum[i], 4)))

    print('Training rbm...')
    rbm_trained = rbm.test_rbm(dataset=feature_matrix_2,
                               learning_rate=0.1,
                               training_epochs=14,
                               batch_size=5,
                               n_chains=5,
                               n_hidden=len(features))
    # another implementation of rbm, from sklearn
    # rbm2 = BernoulliRBM(n_components=len(features), n_iter=14, batch_size=5, learning_rate=0.1)
    # rbm_trained = rbm2.fit_transform(feature_matrix_2)
    # print(rbm_trained)
    rbm_trained_sums = np.sum(rbm_trained, axis=1)

    print('=====RBM Enhanced Scores=====')
    print(35 * ' ', end='|')
    for f in features:
        print(f, end='|')
    print()
    for i, s in enumerate(sentences):
        print(f'#{"{:2d}".format(i + 1)}: {s[:30]}', end='|')
        for f_s in rbm_trained[i]:
            print('{: .4f}'.format(round(f_s, 4)), end='|')
        print('{: .4f}'.format(round(rbm_trained_sums[i], 4)))

    enhanced_feature_sum = []
    feature_sum = []
    for i in range(len(np.sum(rbm_trained, axis=1))):
        enhanced_feature_sum.append([np.sum(rbm_trained, axis=1)[i], i])
        feature_sum.append([np.sum(feature_matrix_2, axis=1)[i], i])
    print(f'enhanced_feature_sum: {enhanced_feature_sum}')
    print(f'feature_sum: {feature_sum}')

    enhanced_feature_sum.sort(key=lambda x: x[0])
    feature_sum.sort(key=lambda x: -1 * x[0])
    print('=====Sorted=====')
    print(f'enhanced_feature_sum: {enhanced_feature_sum}')
    print(f'feature_sum: {feature_sum}')

    # print('=====The text=====')
    # for x in range(len(sentences)):
    #     print(sentences[x])

    extracted_sentences_rbm = []
    extracted_sentences_rbm.append([sentences[0], 0])
    extracted_sentences_simple = []
    extracted_sentences_simple.append([sentences[0], 0])

    summary_length = max(min(round(len(sentences) / 4), 12), 3)  # length between 3-12 sentences
    for x in range(summary_length):
        if enhanced_feature_sum[x][1] != 0:
            extracted_sentences_rbm.append([
                sentences[enhanced_feature_sum[x][1]], enhanced_feature_sum[x][1]
            ])
        if feature_sum[x][1] != 0:
            extracted_sentences_simple.append(
                [sentences[feature_sum[x][1]], feature_sum[x][1]])

    extracted_sentences_rbm.sort(key=lambda x: x[1])
    extracted_sentences_simple.sort(key=lambda x: x[1])

    final_text_rbm = ''
    for i in range(len(extracted_sentences_rbm)):
        final_text_rbm += extracted_sentences_rbm[i][0] + '\n'
    final_text_simple = ''
    for i in range(len(extracted_sentences_simple)):
        final_text_simple += extracted_sentences_simple[i][0] + '\n'

    print('=====Extracted Final Text RBM=====')
    print(final_text_rbm)
    print()
    print('=====Extracted Final Text simple=====')
    print(final_text_simple)

    return final_text_rbm
def executeForAFile_input(filename, cwd):
    os.chdir(cwd + "/Input_file")
    file = open(filename, 'r')
    text = file.read()
    file.close()
    # os.chdir(cwd)
    paragraphs = para_reader.show_paragraphs(filename)
    sentences = split_into_sentences(text)
    text_len = len(sentences)
    tokenized_sentences = remove_stop_words(sentences)
    # print(tokenized_sentences)
    # # tagged = pos_tag(remove_stop_words(sentences))
    # print("LENNNNN : ")
    # print(len(senPos(paragraphs)))

    # term frequency score
    tfIdfScore = tFiDF(tokenized_sentences)
    # print('Term frequency')
    # print(tfIdfScore)

    # Number of numerals
    numericTokenScore = numericToken(tokenized_sentences)
    # print('numeric token score')
    # print(numericTokenScore)

    # Number of Named entity
    namedEntityRecogScore = namedEntityRecog(sentences, cwd)
    print('named Entity score')
    print(namedEntityRecogScore)

    # # # Sentence position score
    sentencePosScore = senPos(sentences)
    # print('Sentence score')
    # print(sentencePosScore)

    # # Sentence Position score
    sentenceParaScore = paraPos(paragraphs)
    # print('Sentence Para score')
    # print(sentenceParaScore)

    featureMatrix = []
    featureMatrix.append(sentencePosScore)
    featureMatrix.append(sentenceParaScore)
    featureMatrix.append(numericTokenScore)
    featureMatrix.append(namedEntityRecogScore)
    featureMatrix.append(tfIdfScore)

    # prepare feature matrix for training
    featureMat = np.zeros((len(sentences), 5))
    for i in range(5):
        for j in range(len(sentences)):
            featureMat[j][i] = featureMatrix[i][j]
    print("\n\n\nPrinting Feature Matrix : ")
    print(featureMat)

    # feature sum generation
    feature_sum = []
    for i in range(len(np.sum(featureMat, axis=1))):
        feature_sum.append(np.sum(featureMat, axis=1)[i])
    print(feature_sum)

    # training using rbm
    temp = rbm.test_rbm(featureMat,
                        learning_rate=0.1,
                        training_epochs=10,
                        batch_size=5,
                        n_chains=5,
                        n_hidden=5)

    enhancedFeatureSum = []
    for i in range(len(sentences)):
        enhancedFeatureSum.append([np.sum(temp, axis=1)[i], i])

    index_sentence = sorted(enhancedFeatureSum, key=lambda x: x[0], reverse=True)[0][1]
    output = sentences[index_sentence]
    # get enhanced feature
    return output
from pca import PCA

if __name__ == '__main__':
    resman.start('junk', diary=True)
    datasets = loadUpsonData('../data/upson_rovio_1/train_15_50000.pkl.gz',
                             '../data/upson_rovio_1/test_15_50000.pkl.gz')
    #meanTrain = mean(datasets[0][0])
    #stdTrain = std(datasets[0][0])
    #datasets[0][0] = (datasets[0][0] - meanTrain) / stdTrain
    #datasets[2][0] = (datasets[2][0] - meanTrain) / stdTrain
    pca = PCA(datasets[0][0])
    datasets[0][0] = pca.toZca(datasets[0][0], None, epsilon=.1)
    datasets[2][0] = pca.toZca(datasets[2][0], None, epsilon=.1)
    print 'done loading.'

    test_rbm(datasets=datasets,
             training_epochs=45,
             img_dim=15,           # must match actual size of training data
             n_hidden=int(sys.argv[1]),
             learning_rate=float(sys.argv[2]),
             output_dir=resman.rundir,
             quickHack=False,
             visibleModel='real',
             initWfactor=.01,
             pcaDims=None)
    resman.stop()
def executeForAFile(filename, output_file_name, cwd):
    os.chdir(cwd + "/articles")
    file = open(filename, 'r')
    text = file.read()
    paragraphs = para_reader.show_paragraphs(filename)
    print(paragraphs)
    print("Number of paras : %d", len(paragraphs))
    sentences = split_into_sentences(text)
    text_len = len(sentences)
    sentenceLengths.append(text_len)

    tokenized_sentences = remove_stop_words(sentences)
    tagged = posTagger(remove_stop_words(sentences))
    thematicFeature(tokenized_sentences)
    print(upperCaseFeature(sentences))
    print("LENNNNN : ")
    print(len(sentencePosition(paragraphs)))

    tfIsfScore = tfIsf(tokenized_sentences)
    similarityScore = similarityScores(tokenized_sentences)
    print("\n\nProper Noun Score : \n")
    properNounScore = properNounScores(tagged)
    print(properNounScore)
    centroidSimilarityScore = centroidSimilarity(sentences, tfIsfScore)
    numericTokenScore = numericToken(tokenized_sentences)
    namedEntityRecogScore = namedEntityRecog(sentences)
    sentencePosScore = sentencePos(sentences)
    sentenceLengthScore = sentenceLength(tokenized_sentences)
    thematicFeatureScore = thematicFeature(tokenized_sentences)
    sentenceParaScore = sentencePosition(paragraphs)

    featureMatrix = []
    featureMatrix.append(thematicFeatureScore)
    featureMatrix.append(sentencePosScore)
    featureMatrix.append(sentenceLengthScore)
    #featureMatrix.append(sentenceParaScore)
    featureMatrix.append(properNounScore)
    featureMatrix.append(numericTokenScore)
    featureMatrix.append(namedEntityRecogScore)
    featureMatrix.append(tfIsfScore)
    featureMatrix.append(centroidSimilarityScore)

    featureMat = np.zeros((len(sentences), 8))
    for i in range(8):
        for j in range(len(sentences)):
            featureMat[j][i] = featureMatrix[i][j]

    print("\n\n\nPrinting Feature Matrix : ")
    print(featureMat)
    print("\n\n\nPrinting Feature Matrix Normed : ")
    #featureMat_normed = featureMat / featureMat.max(axis=0)
    featureMat_normed = featureMat

    feature_sum = []
    for i in range(len(np.sum(featureMat, axis=1))):
        feature_sum.append(np.sum(featureMat, axis=1)[i])

    print(featureMat_normed)
    for i in range(len(sentences)):
        print(featureMat_normed[i])

    temp = rbm.test_rbm(dataset=featureMat_normed,
                        learning_rate=0.1,
                        training_epochs=14,
                        batch_size=5,
                        n_chains=5,
                        n_hidden=8)
    print("\n\n")
    print(np.sum(temp, axis=1))

    enhanced_feature_sum = []
    enhanced_feature_sum2 = []
    for i in range(len(np.sum(temp, axis=1))):
        enhanced_feature_sum.append([np.sum(temp, axis=1)[i], i])
        enhanced_feature_sum2.append(np.sum(temp, axis=1)[i])
    print(enhanced_feature_sum)
    print("\n\n\n")

    enhanced_feature_sum.sort(key=lambda x: x[0])
    print(enhanced_feature_sum)
    length_to_be_extracted = len(enhanced_feature_sum) // 2  # integer division: used as a range() bound below

    print("\n\nThe text is : \n\n")
    for x in range(len(sentences)):
        print(sentences[x])

    print("\n\n\nExtracted sentences : \n\n\n")
    extracted_sentences = []
    extracted_sentences.append([sentences[0], 0])
    indeces_extracted = []
    indeces_extracted.append(0)
    for x in range(length_to_be_extracted):
        if (enhanced_feature_sum[x][1] != 0):
            extracted_sentences.append([
                sentences[enhanced_feature_sum[x][1]], enhanced_feature_sum[x][1]
            ])
            indeces_extracted.append(enhanced_feature_sum[x][1])

    extracted_sentences.sort(key=lambda x: x[1])

    finalText = ""
    print("\n\n\nExtracted Final Text : \n\n\n")
    for i in range(len(extracted_sentences)):
        print("\n" + extracted_sentences[i][0])
        finalText = finalText + extracted_sentences[i][0]

    os.chdir(cwd + "/outputs")
    file = open(output_file_name, "w")
    file.write(finalText)
    file.close()

    os.chdir(cwd)
    file = open("featureSum", "w")
    for item in feature_sum:
        print(item, end="\n", file=file)
    file = open("enhancedfeatureSum", "w")
    for item in enhanced_feature_sum2:
        print(item, end="\n", file=file)
def run(filename, out_path):
    decode_path = os.path.join(out_path, filename[:-4])
    cmd = 'apktool d "' + out_path + filename + '" -o ' + decode_path
    os.system(cmd)  # decode apk

    if os.path.exists(os.path.join(decode_path, "AndroidManifest.xml")) is not True:
        icon = None
        meta = []
        all_permission = []
        all_api = []
        result = u"该应用设有防止反编译机制,无法反编译,请换一个应用"  # "This app resists decompilation; please try another app"
        return icon, meta, all_permission, all_api, result

    icon, meta = metaInfo(os.path.join(out_path, filename), decode_path)
    #permission_vector, all_permission = permission.ml_per_feature(decode_path)  # return all permission needed
    #api_vector, all_api = api.ml_api_feature(decode_path)  # return all api called
    #ngram_vector, all_ngram = ngram.ml_ngram_feature(decode_path)

    '''load the trained models'''
    '''
    rf = pickle.load(open('dataset/model', 'rb'))
    gbdt = pickle.load(open('dataset/model2', 'rb'))
    ada = pickle.load(open('dataset/model3', 'rb'))
    lr = pickle.load(open('dataset/model4', 'rb'))
    '''
    vc = pickle.load(open('dataset/model_final', 'rb'))

    '''build the feature vector for this app'''
    permission_vector, all_permission = permission.ml_per_feature(
        decode_path)  # return all permission needed
    api_vector, all_api = api.ml_api_feature(
        decode_path)  # return all api called
    ngram_vector, all_ngram = ngram.ml_ngram_feature(decode_path)
    feature_vector = permission_vector + api_vector + ngram_vector

    '''feature-selection'''
    # make list to a vector
    train_set_x = list()
    train_set_x.append(feature_vector)
    raw_train_set_x = train_set_x
    fsmodel = pickle.load(open('dataset/fsmodel', 'rb'))
    #print len(train_set_x)
    fs_vector = fsmodel.transform(train_set_x)
    fsmodel2 = pickle.load(open('dataset/fsmodel2', 'rb'))
    fs_vector = fsmodel2.transform(fs_vector)
    feature_vector = fs_vector[0]
    train_set_x = fs_vector  # + fs_vector
    #print train_set_x  #[[]]

    ######################### DEBUG ############################
    fs_vec = []
    for i in range(len(raw_train_set_x[0])):
        fs_vec.append(i)
    print fs_vec
    fs_vec = fsmodel.transform(fs_vec)
    fs_vec = fsmodel2.transform(fs_vec)
    print fs_vec

    feature_matrix_dl = [x for x in range(len(raw_train_set_x))]
    for i in range(len(feature_matrix_dl)):
        feature_matrix_dl[i] = [
            x for x in range(len(raw_train_set_x[0]) - len(fs_vec[0]))
        ]
    temp = 0
    for i in range(len(raw_train_set_x[0])):
        if i not in fs_vec:
            #print "column %d was not selected" % i
            for j in range(len(feature_matrix_dl)):
                feature_matrix_dl[j][temp] = raw_train_set_x[j][i]
            temp = temp + 1
    #print "number of rows: %d" % len(feature_matrix_dl)
    #print "number of columns: %d" % len(feature_matrix_dl[0])
    train_set_x = feature_matrix_dl
    #print train_set_x

    train_set_x1 = [train_set_x[0]]
    b_s = 5
    for i in range(1, b_s):
        train_set_x1.append(1)
        train_set_x1[i] = train_set_x[0]
    #print train_set_x1
    train_set_x = train_set_x1
    ##################################################################

    print len(raw_train_set_x)
    print len(fs_vec[0])
    rbm = pickle.load(open('dataset/rbmmodel', 'rb'))
    hiddeny, test = test_rbm(train_set_x,
                             train_set_x_feature_num=len(train_set_x[0]),
                             batch_size=b_s,
                             rbm_object=rbm)
    hiddeny = hiddeny[0]
    feature_vector = numpy.concatenate((feature_vector, hiddeny),
                                       axis=0).reshape(1, -1).tolist()
    #print feature_vector
    # print len(feature_vector)

    '''prediction'''
    '''
    #[r1, r2, r3, r4] = [rf.predict(feature_vector), gbdt.predict(feature_vector), ada.predict(feature_vector), lr.predict(feature_vector)]
    [p1, p2, p3, p4] = [rf.predict_proba(feature_vector), gbdt.predict_proba(feature_vector), ada.predict_proba(feature_vector), lr.predict_proba(feature_vector)]
    [w1, w2, w3, w4] = [1, 1, 1, 1]
    #expect = w1 * r1 + w2 * r2 + w3 * r3 + w4 * r4
    expect = w1 * p1 + w2 * p2 + w3 * p3 + w4 * p4
    print ("risk score: %0.2f" % (expect[0][1] / (expect[0][0] + expect[0][1])))
    #if expect > 2:
    if expect[0][0] < expect[0][1]:
        result = u"预测该应用是 恶意应用"
    else:
        result = u"预测该应用是 良性应用"
    print result
    #print [r1, r2, r3, r4]
    p = [p1, p2, p3, p4]
    print p
    #print all_permission
    '''
    expect1 = vc.predict(feature_vector)
    score1 = '%.0f' % (vc.predict_proba(feature_vector)[0][1] * 100)
    print("risk score: %s" % score1)
    print filename
    if filename.find("apk"):
        expect2, score22 = g_predict.run(out_path + filename)
        print "expect2 = ", expect2
    else:
        expect2, score22 = g_predict.run(out_path + filename + ".apk")
    print out_path
    if filename.find("apk"):
        expect3, score3 = dy_predict.dyrun(out_path + filename)
    else:
        expect3, score3 = dy_predict.dyrun(out_path + filename + ".apk")

    if expect2 == -1:
        score = score1 * 0.8 + score3 * 0.2
        expect = expect1 * 0.8 + expect3 * 0.2
        if expect < 0.5:
            expect = 0
        else:
            expect = 1
    else:
        score2 = '%.0f' % (score22 * 100)
        print "score1", score1
        print "score2", score2
        score2 = int(score2)
        score1 = int(score1)
        score3 = int(score3)
        print "score2 by int", score2
        print "score3", score3
        score = score1 * 0.6 + score2 * 0.2 + score3 * 0.2
        print score
        score = int(score)
        print score
        expect = expect1 * 0.6 + expect2 * 0.2 + expect3 * 0.2
        if expect < 0.5:
            expect = 0
        else:
            expect = 1

    img1 = int(score) / 10
    img2 = int(score) % 10
    if expect == 1:
        result = u"预测该应用是 恶意应用"  # "The app is predicted to be malicious"
    else:
        result = u"预测该应用是 良性应用"  # "The app is predicted to be benign"
    print result

    # -------------------- Permission OUTPUT ---------------------------------------------
    permission_output = {}
    # print all_permission
    with open('/home/ubuntu/Code/data/permission.csv') as f:
        f_csv = csv.reader(f)
        for row in f_csv:
            if row[0].strip() in all_permission:
                permission_output[row[0].strip()] = {}
                permission_output[row[0].strip()]['Description'] = row[1].strip().decode('utf-8')
                permission_output[row[0].strip()]['ThreatLevel'] = row[2].strip()

    # ----------------- Sensitive Api OUTPUT ---------------------------------
    sensitive_api_output = {}
    sensitive_api_list = {}
    with open('/home/ubuntu/Code/data/all_sensitive_api.csv') as f:
        f_csv = csv.reader(f)
        for row in f_csv:
            if row[0] != 'API Name':
                sensitive_api_list[row[0].strip()] = {}
                sensitive_api_list[row[0].strip()]['Description'] = row[1].strip().decode('utf-8')
                sensitive_api_list[row[0].strip()]['ThreatLevel'] = row[2].strip()

    # the sensitive api output is a two-level dict:
    # packagename: {api: {'Description': xxx, 'ThreatLevel': xxx}}
    for each_api in all_api:
        if each_api in sensitive_api_list:
            packagename, api_name = each_api.split('->')
            # print packagename, '#', api_name
            if packagename not in sensitive_api_output:
                sensitive_api_output[packagename] = {}
            sensitive_api_output[packagename][api_name] = sensitive_api_list[each_api]
    # print sensitive_api_output

    # ------------------------- Component ----------------------------------------
    component_output = component.run(decode_path)

    # ---------------------- DY OUTPUT ---------------------------------------
    dy_path = os.path.join('/home/project/apks/uploads/', meta['File Md5'] + '.dy')
    if not os.path.exists(dy_path):
        behavior = dy_analyse1.dy_demo(os.path.join(out_path, filename),
                                       meta['Package Name'])
        #print "bebavior:" + behavior
        #pickle.dump(behavior, open(dy_path, 'wb'))
    else:
        print "ddddddd"
        #behavior = pickle.load(open(dy_path, 'rb'))
    #behavior1 = eval(open('/home/project/out.txt','r').read())
    outpath1 = '/home/ubuntu/DroidBox/output/out1.json'
    behavior = [json.loads(line) for line in open(outpath1)]
    #print behavior
    #behavior = behavior1[0]

    # ------------------------ FILE ACCESS ----------------------------------------
    file_access = []
    if 'fdaccess' in behavior[0]:
        if behavior[0]['fdaccess']:
            file_bebavior = behavior[0]['fdaccess']
            print file_bebavior
            for each_info in file_bebavior:
                file_access.append([
                    file_bebavior[each_info]['path'],
                    file_bebavior[each_info]['operation'],
                    file_bebavior[each_info]['data']
                ])

    # ------------------------ SMS ACCESS ----------------------------------------
    sms_access = []
    if 'sendsms' in behavior[0]:
        if behavior[0]['sendsms']:
            sms_bebavior = behavior[0]['sendsms']
            for each_info in sms_bebavior:
                sms_access.append([
                    sms_bebavior[each_info]['number'],
                    sms_bebavior[each_info]['message']
                ])

    # ------------------------ Net ACCESS ----------------------------------------
    net_send = []
    net_recv = []
    if 'sendnet' in behavior[0]:
        if behavior[0]['sendnet']:
            net_send_bebavior = behavior[0]['sendnet']
            for each_info in net_send_bebavior:
                net_send.append([net_send_bebavior[each_info]['desthost'],
                                 net_send_bebavior[each_info]['data'],
                                 net_send_bebavior[each_info]['operation'],
                                 net_send_bebavior[each_info]['destport']])
    if 'recvnet' in behavior[0]:
        if behavior[0]['recvnet']:
            net_recv_bebavior = behavior[0]['recvnet']
            for each_info in net_recv_bebavior:
                net_recv.append([net_recv_bebavior[each_info]['host'],
                                 net_recv_bebavior[each_info]['port'],
                                 net_recv_bebavior[each_info]['data']])

    # ------------------------------- Sensitive Data operation ---------------------------------
    data_operation = []
    sensitive_data_list = {}
    with open('/home/ubuntu/Code/data/dy_feature.csv') as f:
        f_csv = csv.reader(f)
        for row in f_csv:
            if row[0] != 'Feature':
                sensitive_data_list[row[0].strip()] = {}
                sensitive_data_list[row[0].strip()]['Description'] = row[1].strip().decode('utf-8')
    #print sensitive_api_list
    if 'dataleaks' in behavior[0]:
        if behavior[0]['dataleaks']:
            sen_data_bebavior = behavior[0]['dataleaks']
            print sen_data_bebavior
            for item in sensitive_data_list:
                for each_info in sen_data_bebavior:
                    if item in sen_data_bebavior[each_info]['tag']:
                        data_operation.append([
                            str(sen_data_bebavior[each_info]['tag']).decode('utf-8'),
                            sensitive_data_list[item]['Description'],
                            sen_data_bebavior[each_info]['data']
                        ])
    print "data_operation:", data_operation

    # --------------------------------- dataleak -----------------------------------------
    data_leak = []
    dataleak_list = {}
    with open('/home/ubuntu/Code/data/dy_feature.csv') as f:
        f_csv = csv.reader(f)
        for row in f_csv:
            if row[0] != 'Feature':
                dataleak_list[row[0].strip()] = {}
                dataleak_list[row[0].strip()]['Description'] = row[1].strip().decode('utf-8')
    #print dataleak_list
    flag = 0
    if 'sendnet' in behavior[0]:
        if behavior[0]['sendnet']:
            data_leak_bebavior = behavior[0]['sendnet']
            #print data_leak_bebavior
            for item in dataleak_list:
                if str(item).find('sendnet_') != -1:
                    flag = 1
                    print item, flag
                    for each_info in data_leak_bebavior:
                        temp = (str(item))[8:]
                        if temp in (str(data_leak_bebavior[each_info]['tag'])):
                            print temp
                            data_leak.append([
                                data_leak_bebavior[each_info]['desthost'],
                                str(data_leak_bebavior[each_info]['tag']).decode('utf-8'),
                                dataleak_list[item]['Description']
                            ])
                            break
    #print "data_leak:", data_leak

    # -------------------------------- enfperm ---------------------------------------
    enf_per = []
    if 'enfperm' in behavior[0]:
        if behavior[0]['enfperm']:
            enf_bebavior = behavior[0]['enfperm']
            print enf_bebavior
            for each_info in enf_bebavior:
                enf_per.append([enf_bebavior[each_info]])
    print "enf_per:", enf_per

    # -------------------------------------------------------------------------------
    return (icon, meta, permission_output, sensitive_api_output, component_output,
            file_access, sms_access, net_send, net_recv, data_operation, data_leak,
            enf_per, result, img1, img2, expect)
    return train_set


if __name__ == '__main__':
    resman.start('junk', diary=True)

    circles = False
    if len(sys.argv) > 1 and sys.argv[1] == '--circles':
        circles = True
        del sys.argv[1]
    print 'Using dataset:', 'circles' if circles else 'squares'

    img_dim = 10   # 2, 4, 10, 15, 28
    if circles:
        datasets = loadPickledData('../data/circles/train_%d_50000.pkl.gz' % img_dim,
                                   '../data/circles/test_%d_50000.pkl.gz' % img_dim)
    else:
        datasets = loadPickledData('../data/squares/train_%d_50000.pkl.gz' % img_dim,
                                   '../data/squares/test_%d_50000.pkl.gz' % img_dim)
    print 'done loading.'

    test_rbm(datasets=datasets,
             training_epochs=5,
             img_dim=img_dim,
             n_hidden=100,
             learning_rate=.1,
             output_dir=resman.rundir,
             quickHack=False)
    resman.stop()
if __name__ == '__main__':
    resman.start('junk', diary=True)

    spheres = False
    if len(sys.argv) > 1 and sys.argv[1] == '--spheres':
        spheres = True
        del sys.argv[1]
    print 'Using dataset:', 'spheres' if spheres else 'cubes'

    img_dim = 10   # 2, 4, 10, 15, 28
    if spheres:
        datasets = loadPickledData('../data/spheres/train_%d_50000.pkl.gz' % img_dim,
                                   '../data/spheres/test_%d_50000.pkl.gz' % img_dim)
    else:
        datasets = loadPickledData('../data/cubes/train_%d_50000.pkl.gz' % img_dim,
                                   '../data/cubes/test_%d_50000.pkl.gz' % img_dim)
    print 'done loading.'

    rbm, meanCosts = test_rbm(datasets=datasets,
                              training_epochs=45,
                              img_dim=img_dim,
                              n_hidden=200,
                              learning_rate=.1,
                              output_dir=resman.rundir,
                              quickHack=False,
                              imgPlotFunction=lambda xx: xx[:, 0:img_dim*img_dim],  # HACK: plot first slice
                              )
    resman.stop()
featureMat = np.zeros((len(sentence_list), 5))
for i in range(5):
    for j in range(len(sentence_list)):
        featureMat[j][i] = featureMatrix[i][j]

featureMat_normed = featureMat

feature_sum = []
for i in range(len(np.sum(featureMat, axis=1))):
    feature_sum.append(np.sum(featureMat, axis=1)[i])

temp = rbm.test_rbm(dataset=featureMat_normed,
                    learning_rate=0.1,
                    training_epochs=14,
                    batch_size=5,
                    n_chains=5,
                    n_hidden=5)

enhanced_feature_sum = []
enhanced_feature_sum2 = []
for i in range(len(np.sum(temp, axis=1))):
    enhanced_feature_sum.append([np.sum(temp, axis=1)[i], i])
    enhanced_feature_sum2.append(np.sum(temp, axis=1)[i])

enhanced_feature_sum.sort(key=lambda x: x[0])
length_to_be_extracted = len(enhanced_feature_sum) // 2  # integer division so the count stays an int

for x in range(len(sentence_list)):
    print(sentence_list[x])
#! /usr/bin/env python

import numpy, time, gzip, PIL.Image, os, pdb
#import cPickle as pickle
import pickle
from numpy import *

from ResultsManager import resman
from rbm import RBM, test_rbm
from utils import loadUpsonData

if __name__ == '__main__':
    resman.start('junk', diary=False)
    datasets = loadUpsonData('../data/upson_rovio_1/train_10_50000.pkl.gz',
                             '../data/upson_rovio_1/test_10_50000.pkl.gz')
    print 'done loading.'
    test_rbm(datasets=datasets,
             training_epochs=45,
             img_dim=10,
             n_hidden=500,
             learning_rate=.002,
             output_dir=resman.rundir,
             quickHack=False)
    resman.stop()
def test_rbm():
    rbm.test_rbm(training_epochs=1, batch_size=300, n_chains=1, n_samples=1,
                 output_folder='tmp_rbm_plots')
def get_summary(self):
    paragraphs = self.content.split('\n\n')
    sentences = RBM_summarizer.split_into_sentences(self.content)
    tokenized_sentences = RBM_summarizer.remove_stop_words(sentences)
    uppercase_score = RBM_summarizer.upperCaseFeature(sentences)
    namedEntityRecogScore = RBM_summarizer.namedEntityRecog(sentences)
    sentencePosScore = RBM_summarizer.sentencePosition(sentences)
    sentenceParaScore = RBM_summarizer.sentencePosition(paragraphs)
    thematicFeatureScore = RBM_summarizer.thematicFeature(tokenized_sentences)
    tagged = RBM_summarizer.posTagger(tokenized_sentences)
    tfIsfScore = RBM_summarizer.tfIsf(tokenized_sentences)
    similarityScore = RBM_summarizer.similarityScores(tokenized_sentences)
    numericTokenScore = RBM_summarizer.numericToken(tokenized_sentences)
    sentenceLengthScore = RBM_summarizer.sentenceLength(tokenized_sentences)
    properNounScore = RBM_summarizer.properNounScores(tagged)
    centroidSimilarityScore = RBM_summarizer.centroidSimilarity(sentences, tfIsfScore)

    featureMat = np.zeros((len(sentences), 8))
    featureMatrix = []
    featureMatrix.append(thematicFeatureScore)
    featureMatrix.append(sentencePosScore)
    featureMatrix.append(sentenceLengthScore)
    featureMatrix.append(sentenceParaScore)
    featureMatrix.append(properNounScore)
    featureMatrix.append(numericTokenScore)
    featureMatrix.append(namedEntityRecogScore)
    featureMatrix.append(tfIsfScore)
    featureMatrix.append(centroidSimilarityScore)
    for i in range(8):
        for j in range(len(sentences)):
            featureMat[j][i] = featureMatrix[i][j]

    feature_sum = []
    for i in range(len(np.sum(featureMat, axis=1))):
        feature_sum.append(np.sum(featureMat, axis=1)[i])

    temp = rbm.test_rbm(dataset=featureMat,
                        learning_rate=0.1,
                        training_epochs=14,
                        batch_size=5,
                        n_chains=5,
                        n_hidden=8)

    enhanced_feature_sum = []
    enhanced_feature_sum2 = []
    for i in range(len(np.sum(temp, axis=1))):
        enhanced_feature_sum.append([np.sum(temp, axis=1)[i], i])
        enhanced_feature_sum2.append(np.sum(temp, axis=1)[i])
    enhanced_feature_sum.sort(key=lambda x: x[0])

    length_to_be_extracted = len(enhanced_feature_sum) / 4
    extracted_sentences = []
    extracted_sentences.append([sentences[0], 0])
    indeces_extracted = []
    indeces_extracted.append(0)
    for x in range(int(length_to_be_extracted)):
        if (enhanced_feature_sum[x][1] != 0):
            extracted_sentences.append([
                sentences[enhanced_feature_sum[x][1]], enhanced_feature_sum[x][1]
            ])
            indeces_extracted.append(enhanced_feature_sum[x][1])

    extracted_sentences.sort(key=lambda x: x[1])

    finalText = ""
    for i in range(len(extracted_sentences)):
        finalText = finalText + extracted_sentences[i][0]
    return finalText
          ) % (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print('The fine tuning code for file ' + os.path.split(__file__)[1] +
          ' ran for %.2fm' % ((end_time - start_time) / 60.), file=sys.stderr)
    return test_score * 100.


if __name__ == '__main__':
    # MNIST dataset
    dataset = 'mnist.pkl.gz'
    datasets = load_data(dataset)

    rbm_results_mnist = []
    cd = [False, True]
    gibbs = [1, 5, 10, 15]
    for i in range(len(gibbs)):
        for j in range(len(cd)):
            rbm_results_mnist.append(test_rbm(rbm_persistence=cd[j],
                                              gibbs_steps=gibbs[i],
                                              dataset_nm=datasets))

    dbn_results_mnist = []
    dbn_results_mnist.append(test_DBN(hidden_layers=[700], k=10, dataset_nm=datasets))
    dbn_results_mnist.append(test_DBN(hidden_layers=[1000], k=10, dataset_nm=datasets))
    dbn_results_mnist.append(test_DBN(hidden_layers=[700, 700], k=10, dataset_nm=datasets))
    dbn_results_mnist.append(test_DBN(hidden_layers=[700, 1000], k=10, dataset_nm=datasets))

    # CIFAR dataset
    datasets = load_data_cifar()

    rbm_results_cifar = []
    cd = [False, True]
    gibbs = [1, 5, 10, 15]
    for i in range(len(gibbs)):
        for j in range(len(cd)):
def train():
    # if os.path.exists('dataset/per_feature_matrix'):
    #     per_feature_matrix = pickle.load(open('dataset/per_feature_matrix', 'rb'))
    # else:
    start = time.time()
    print "extracting feature matrix..."
    if 1:
        per_feature_matrix = {}
        for each in os.listdir('dataset/per_feature'):
            path = os.path.join('dataset/per_feature/', each)
            per_feature_matrix = dict(pickle.load(open(path, 'rb')),
                                      **per_feature_matrix)
        per_feature_matrix = per_feature_matrix.values()
        pickle.dump(per_feature_matrix, open('dataset/per_feature_matrix', 'wb'))

    # if os.path.exists('dataset/api_feature_matrix'):
    #     api_feature_matrix = pickle.load(open('dataset/api_feature_matrix', 'rb'))
    # else:
    if 1:
        api_feature_matrix = {}
        for each in os.listdir('dataset/api_feature'):
            path = os.path.join('dataset/api_feature/', each)
            api_feature_matrix = dict(pickle.load(open(path, 'rb')),
                                      **api_feature_matrix)
        api_feature_matrix = api_feature_matrix.values()
        pickle.dump(api_feature_matrix, open('dataset/api_feature_matrix', 'wb'))

    # if os.path.exists('dataset/ngram_feature_matrix'):
    #     ngram_feature_matrix = pickle.load(open('dataset/ngram_feature_matrix', 'rb'))
    # else:
    if 1:
        ngram_feature_matrix = {}
        for each in os.listdir('dataset/ngram_feature'):
            path = os.path.join('dataset/ngram_feature/', each)
            ngram_feature_matrix = dict(pickle.load(open(path, 'rb')),
                                        **ngram_feature_matrix)
        ngram_feature_matrix = ngram_feature_matrix.values()
        pickle.dump(ngram_feature_matrix, open('dataset/ngram_feature_matrix', 'wb'))

    classification = pickle.load(open('dataset/classification', 'rb'))

    if per_feature_matrix is not None and api_feature_matrix is not None and ngram_feature_matrix is not None:
        feature_matrix = _concatenate(per_feature_matrix, api_feature_matrix,
                                      ngram_feature_matrix)
    elif per_feature_matrix is not None:
        feature_matrix = per_feature_matrix
    elif api_feature_matrix is not None:
        feature_matrix = api_feature_matrix
    elif ngram_feature_matrix is not None:
        feature_matrix = ngram_feature_matrix
    else:
        return
    print "extracting feature matrix done."
    print "Total samples before processing: %d" % len(feature_matrix)
    #print len(feature_matrix)
    #print len(classification)

    features = 400
    fsmodel = SelectKBest(chi2, k=features)
    raw_feature_matrix = feature_matrix
    feature_matrix = fsmodel.fit_transform(feature_matrix, classification)
    pickle.dump(fsmodel, open('dataset/fsmodel', 'wb'))

    features = 300
    svc = SVC(kernel="linear", C=1)
    fsmodel2 = RFE(estimator=svc, n_features_to_select=features, step=1)
    ######################### DEBUG ############################
    #classification = classification[7:]
    ##################################################################
    feature_matrix = fsmodel2.fit_transform(feature_matrix, classification)
    pickle.dump(fsmodel2, open('dataset/fsmodel2', 'wb'))

    ######################### DEBUG ############################
    b_s = 5  # if this changes, the default in dl.py must change too
    length = len(feature_matrix)
    feature_matrix = feature_matrix[length % b_s:]
    raw_feature_matrix = raw_feature_matrix[length % b_s:]
    classification = classification[length % b_s:]
    print "Total samples after processing: %d" % len(feature_matrix)
    ##################################################################

    ######################### DEBUG ############################
    fs_vec = []
    for i in range(len(raw_feature_matrix[0])):
        fs_vec.append(i)  # build a special vector whose values equal the column indices
    fs_vec = fsmodel.transform(fs_vec)
    #print fs_vec
    fs_vec = fsmodel2.transform(fs_vec)
    #print fs_vec

    feature_matrix_dl = [x for x in range(len(raw_feature_matrix))]
    for i in range(len(feature_matrix_dl)):
        feature_matrix_dl[i] = [
            x for x in range(len(raw_feature_matrix[0]) - features)
        ]
    temp = 0
    for i in range(len(raw_feature_matrix[0])):
        if i not in fs_vec:
            print "column %d was not selected" % i
            for j in range(len(feature_matrix_dl)):
                feature_matrix_dl[j][temp] = raw_feature_matrix[j][i]
            temp = temp + 1
    #print "number of rows: %d" % len(feature_matrix_dl)
    #print "number of columns: %d" % len(feature_matrix_dl[0])
    #print feature_matrix_dl
    ##################################################################

    #hiddeny, da = test_dA(feature_matrix_dl, len(feature_matrix_dl[0]))
    # hiddeny2, test = test_dA(feature_matrix,len(feature_matrix[0]), batch_size=6, da_object = da)
    hiddeny, da = test_rbm(feature_matrix_dl, len(feature_matrix_dl[0]))
    #print len(feature_matrix)
    print "Number of shallow features: %d" % len(feature_matrix[0])
    #print len(hiddeny)
    print "Number of deep features: %d" % len(hiddeny[0])
    # print (hiddeny == hiddeny2).all()

    # persist the deep feature extractor
    pickle.dump(da, open('dataset/rbmmodel', 'wb'))

    # fuse the deep features with the shallow ones
    feature_matrix = numpy.concatenate((feature_matrix, hiddeny), axis=1)
    Z = []
    count = 0
    for i in feature_matrix:
        Z.append([])
        for j in i:
            Z[count].append(j)
        count += 1
    feature_matrix = Z
    # print feature_matrix
    Z = []
    for i in classification:
        Z.append(int(i))
    classification = Z

    if 1:
        per_feature_matrix2 = {}
        for each in os.listdir('test/per_feature'):
            path = os.path.join('test/per_feature/', each)
            per_feature_matrix2 = dict(pickle.load(open(path, 'rb')),
                                       **per_feature_matrix2)
        per_feature_matrix2 = per_feature_matrix2.values()
        pickle.dump(per_feature_matrix2, open('test/per_feature_matrix', 'wb'))

    # if os.path.exists('dataset/api_feature_matrix'):
    #     api_feature_matrix = pickle.load(open('dataset/api_feature_matrix', 'rb'))
    # else:
    if 1:
        api_feature_matrix2 = {}
        for each in os.listdir('test/api_feature'):
            path = os.path.join('test/api_feature/', each)
            api_feature_matrix2 = dict(pickle.load(open(path, 'rb')),
                                       **api_feature_matrix2)
        api_feature_matrix2 = api_feature_matrix2.values()
        pickle.dump(api_feature_matrix2, open('test/api_feature_matrix', 'wb'))

    # if os.path.exists('dataset/ngram_feature_matrix'):
    #     ngram_feature_matrix = pickle.load(open('dataset/ngram_feature_matrix', 'rb'))
    # else:
    if 1:
        ngram_feature_matrix2 = {}
        for each in os.listdir('test/ngram_feature'):
            path = os.path.join('test/ngram_feature/', each)
            ngram_feature_matrix2 = dict(pickle.load(open(path, 'rb')),
                                         **ngram_feature_matrix2)
        ngram_feature_matrix2 = ngram_feature_matrix2.values()
        pickle.dump(ngram_feature_matrix2, open('test/ngram_feature_matrix', 'wb'))

    classification2 = pickle.load(open('test/classification', 'rb'))

    if per_feature_matrix2 is not None and api_feature_matrix2 is not None and ngram_feature_matrix2 is not None:
        feature_matrix2 = _concatenate(per_feature_matrix2, api_feature_matrix2,
                                       ngram_feature_matrix2)
    elif per_feature_matrix2 is not None:
        feature_matrix2 = per_feature_matrix2
    elif api_feature_matrix2 is not None:
        feature_matrix2 = api_feature_matrix2
    elif ngram_feature_matrix2 is not None:
        feature_matrix2 = ngram_feature_matrix2
    else:
        return
    print "extracting feature matrix done."
    print "Total samples before processing: %d" % len(feature_matrix2)
    #print len(feature_matrix)
    #print len(classification)

    features = 400
    fsmodel2 = SelectKBest(chi2, k=features)
    raw_feature_matrix2 = feature_matrix2
    feature_matrix2 = fsmodel.fit_transform(feature_matrix2, classification2)

    features2 = 300
    svc = SVC(kernel="linear", C=1)
    fsmodel2 = RFE(estimator=svc, n_features_to_select=features2, step=1)
    feature_matrix2 = fsmodel2.fit_transform(feature_matrix2, classification2)

    ######################### DEBUG ############################
    b_s = 5  # if this changes, the default in dl.py must change too
    length = len(feature_matrix2)
    feature_matrix2 = feature_matrix2[length % b_s:]
    raw_feature_matrix2 = raw_feature_matrix2[length % b_s:]
    classification2 = classification2[length % b_s:]
    print "Total samples after processing: %d" % len(feature_matrix2)
    ##################################################################

    ######################### DEBUG ############################
    fs_vec2 = []
    for i in range(len(raw_feature_matrix2[0])):
        fs_vec2.append(i)  # build a special vector whose values equal the column indices
    fs_vec2 = fsmodel.transform(fs_vec2)
    #print fs_vec
    fs_vec2 = fsmodel2.transform(fs_vec2)
    #print fs_vec

    feature_matrix_dl2 = [x for x in range(len(raw_feature_matrix2))]
    for i in range(len(feature_matrix_dl2)):
        feature_matrix_dl2[i] = [
            x for x in range(len(raw_feature_matrix2[0]) - features2)
        ]
    temp = 0
    for i in range(len(raw_feature_matrix2[0])):
        if i not in fs_vec2:
            print "column %d was not selected" % i
            for j in range(len(feature_matrix_dl2)):
                feature_matrix_dl2[j][temp] = raw_feature_matrix2[j][i]
            temp = temp + 1

    hiddeny2, da = test_rbm(feature_matrix_dl2, len(feature_matrix_dl2[0]))
    #print len(feature_matrix)
    print "Number of shallow features: %d" % len(feature_matrix2[0])
    #print len(hiddeny)
    print "Number of deep features: %d" % len(hiddeny2[0])
    # print (hiddeny == hiddeny2).all()

    # fuse the deep features with the shallow ones
    feature_matrix2 = numpy.concatenate((feature_matrix2, hiddeny2), axis=1)
    Z = []
    count = 0
    for i in feature_matrix2:
        Z.append([])
        for j in i:
            Z[count].append(j)
        count += 1
    feature_matrix2 = Z
    # print feature_matrix
    Z = []
    for i in classification2:
        Z.append(int(i))
    classification2 = Z

    '''
    kf = KFold(len(feature_matrix2), n_folds=5, shuffle = True)

    print "\nlearning with RF..."
    rf = RandomForestClassifier(n_estimators=300, min_samples_split=10)
    rf.fit(feature_matrix2, classification2)
    print "Cross Validating..."
    scores = cross_validation.cross_val_score(rf, feature_matrix2, classification2, cv=kf)
    print scores
    print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
    print "learning with RF done.\n"
    pickle.dump(rf, open('dataset/model', 'wb'))  # persist the trained model
    #print 'time :%f'% (time.time() - start)

    print "learning with GBDT..."
    gbdt = GradientBoostingClassifier(n_estimators=300, learning_rate=1.0,
                                      max_depth=100, min_samples_split=10, random_state=0)
    gbdt.fit(feature_matrix2, classification2)
    print "Cross Validating..."
    scores = cross_validation.cross_val_score(gbdt, feature_matrix2, classification2, cv=kf)
    print scores
    print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
    print "learning with GBDT done.\n"
    pickle.dump(gbdt, open('dataset/model2', 'wb'))  # persist the trained model
    #print 'time :%f'% (time.time() - start)

    print "learning with AdaBoost..."
    ada = AdaBoostClassifier(n_estimators=300)
    ada.fit(feature_matrix2, classification2)
    print "Cross Validating..."
    scores = cross_validation.cross_val_score(ada, feature_matrix2, classification2, cv=kf)
    print scores
    print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
    print "learning with AdaBoost done.\n"
    pickle.dump(ada, open('dataset/model3', 'wb'))  # persist the trained model
    #print 'time :%f'% (time.time() - start)

    print "learning with LogisticRegression..."
    lr = LogisticRegression()
    lr.fit(feature_matrix2, classification2)
    print "Cross Validating..."
    scores = cross_validation.cross_val_score(lr, feature_matrix2, classification2, cv=kf)
    print scores
    print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
    print "learning with LogisticRegression done.\n"
    pickle.dump(lr, open('dataset/model4', 'wb'))  # persist the trained model
    #print 'time :%f'% (time.time() - start)

    kf = KFold(len(feature_matrix2), n_folds=5, shuffle = True)

    print "\nlearning with RF..."
    rf = RandomForestClassifier(n_estimators=300, min_samples_split=10)
    rf.fit(feature_matrix2, classification2)
    print "Cross Validating..."
    predicted = cross_validation.cross_val_predict(rf, feature_matrix2, classification2, cv=kf)
    print "Confusion matrix: "
    print metrics.confusion_matrix(classification2, predicted)
    print("Accuracy: %0.3f" % metrics.accuracy_score(classification2, predicted))
    print "Precision: "
    print metrics.precision_score(classification2, predicted, average=None)
    print "Recall: "
    print metrics.recall_score(classification2, predicted, average=None)
    print "F1 "
    print metrics.f1_score(classification2, predicted, average=None)
    print "learning with RF done.\n"
    pickle.dump(rf, open('dataset/model', 'wb'))  # persist the trained model
    #print 'time :%f'% (time.time() - start)

    print "learning with GBDT..."
    gbdt = GradientBoostingClassifier(n_estimators=300, learning_rate=1.0,
                                      max_depth=100, min_samples_split=10, random_state=0)
    gbdt.fit(feature_matrix2, classification2)
    print "Cross Validating..."
    predicted = cross_validation.cross_val_predict(gbdt, feature_matrix2, classification2, cv=kf)
    print "Confusion matrix: "
    print metrics.confusion_matrix(classification2, predicted)
    print("Accuracy: %0.3f" % metrics.accuracy_score(classification2, predicted))
    print "Precision: "
    print metrics.precision_score(classification2, predicted, average=None)
    print "Recall: "
    print metrics.recall_score(classification2, predicted, average=None)
    print "F1 "
    print metrics.f1_score(classification2, predicted, average=None)
    print "learning with GBDT done.\n"
    pickle.dump(gbdt, open('dataset/model2', 'wb'))  # persist the trained model
    #print 'time :%f'% (time.time() - start)

    print "learning with AdaBoost..."
    ada = AdaBoostClassifier(n_estimators=300)
    ada.fit(feature_matrix2, classification2)
    print "Cross Validating..."
    predicted = cross_validation.cross_val_predict(ada, feature_matrix2, classification2, cv=kf)
    print "Confusion matrix: "
    print metrics.confusion_matrix(classification2, predicted)
    print("Accuracy: %0.3f" % metrics.accuracy_score(classification2, predicted))
    print "Precision: "
    print metrics.precision_score(classification2, predicted, average=None)
    print "Recall: "
    print metrics.recall_score(classification2, predicted, average=None)
    print "F1 "
    print metrics.f1_score(classification2, predicted, average=None)
    print "learning with AdaBoost done.\n"
    pickle.dump(ada, open('dataset/model3', 'wb'))  # persist the trained model
    #print 'time :%f'% (time.time() - start)

    print "learning with LogisticRegression..."
    lr = LogisticRegression()
    lr.fit(feature_matrix2, classification2)
    print "Cross Validating..."
    predicted = cross_validation.cross_val_predict(lr, feature_matrix2, classification2, cv=kf)
    print "Confusion matrix: "
    print metrics.confusion_matrix(classification2, predicted)
    print("Accuracy: %0.3f" % metrics.accuracy_score(classification2, predicted))
    print "Precision: "
    print metrics.precision_score(classification2, predicted, average=None)
    print "Recall: "
    print metrics.recall_score(classification2, predicted, average=None)
    print "F1 "
    print metrics.f1_score(classification2, predicted, average=None)
    print "learning with LogisticRegression done.\n"
    pickle.dump(lr, open('dataset/model4', 'wb'))  # persist the trained model
    #print 'time :%f'% (time.time() - start)
    '''

    '''
    kf = KFold(len(feature_matrix2), n_folds=5, shuffle = True)

    print "\nlearning with SVC..."
    slffork = SVC(kernel='rbf', probability = True)
    slffork.fit(feature_matrix2, classification2)
    print "Cross Validating..."
    predicted = cross_validation.cross_val_predict(slffork, feature_matrix2, classification2, cv=kf)
    print "Confusion matrix: "
    print metrics.confusion_matrix(classification2, predicted)
    print("Accuracy: %0.3f" % metrics.accuracy_score(classification2, predicted))
    print "Precision: "
    print metrics.precision_score(classification2, predicted, average=None)
    print "Recall: "
    print metrics.recall_score(classification2, predicted, average=None)
    print "F1 "
    print metrics.f1_score(classification2, predicted, average=None)
    print "learning with SVC done.\n"
    pickle.dump(slffork, open('dataset/model2', 'wb'))  # persist the trained model
    #print 'time :%f'% (time.time() - start)
    '''

    '''
    print "learning with BaggingClassifier..."
    kf = KFold(len(feature_matrix2), n_folds=5, shuffle = True)
    baggingfork = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5)
    baggingfork.fit(feature_matrix2, classification2)
    print "Cross Validating..."
    predicted = cross_validation.cross_val_predict(baggingfork, feature_matrix2, classification2, cv=kf)
    print "Confusion matrix: "
    print metrics.confusion_matrix(classification2, predicted)
    print("Accuracy: %0.3f" % metrics.accuracy_score(classification2, predicted))
    print "Precision: "
    print metrics.precision_score(classification2, predicted, average=None)
    print "Recall: "
    print metrics.recall_score(classification2, predicted, average=None)
    print "F1 "
    print metrics.f1_score(classification2, predicted, average=None)
    print "learning with BaggingClassifier done.\n"
    pickle.dump(baggingfork, open('dataset/model2', 'wb'))  # persist the trained model
    #print 'time :%f'% (time.time() - start)
    '''

    '''kf = KFold(len(feature_matrix2), n_folds=5, shuffle = True)'''
    rf = RandomForestClassifier(n_estimators=300, min_samples_split=10)
    gbdt = GradientBoostingClassifier(n_estimators=300, learning_rate=1.0,
                                      max_depth=100, min_samples_split=10, random_state=0)
    ada = AdaBoostClassifier(n_estimators=300)
    #slf1 = SVC(kernel='rbf', probability = True)
    bagging = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5)

    print "learning with Voting Classifier..."
    vc = VotingClassifier(estimators=[('rf', rf), ('ada', ada),
                                      ('bagging', bagging), ('gbdt', gbdt)],
                          voting='soft',
                          weights=[1.5, 1.5, 1.3, 1.5])
    vc.fit(feature_matrix, classification)
    '''
    print "Cross Validating..."
    predicted = cross_validation.cross_val_predict(vc, feature_matrix2, classification2, cv=kf)
    print "Confusion matrix: "
    print metrics.confusion_matrix(classification2, predicted)
    print("Accuracy: %0.3f" % metrics.accuracy_score(classification2, predicted))
    print "Precision: "
    print metrics.precision_score(classification2, predicted, average=None)
    print "Recall: "
    print metrics.recall_score(classification2, predicted, average=None)
    print "F1 "
    print metrics.f1_score(classification2, predicted, average=None)
    '''
    print "learning with Ensemble Classifier done.\n"
    pickle.dump(vc, open('dataset/model_final', 'wb'))  # persist the trained model
    print 'time :%f' % (time.time() - start)
                                              batch_size=24,
                                              shuffle=False)

input_dim, hidden_dim1, hidden_dim2, output_dim = 36, 36 * 2, 36, 10
model = Net(input_dim, hidden_dim1, hidden_dim2, output_dim)
learning_rate = 0.01
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
training_results = train(model=model,
                         criterion=criterion,
                         train_loader=train_loader,
                         test_loader=test_loader,
                         optimizer=optimizer,
                         epochs=10)

# Part 3. Restricted Boltzmann Machine
result_model = test_rbm()
income_rbm = 0
for index in range(9900):
    print(index)
    res = result_model[index][:37].mean()
    print(res)
    if res > 0.55:
        income_rbm += ALL.OPEN_Bid.iloc[index + 10] - ALL.OPEN_Bid.iloc[index]
print("Income is:", income_rbm)
def executeForAFile(filename, output_file_name, humanExtractedYesOrNo_files, humanExtractGiven):
    file = open(filename, 'r')  # edited by amritha
    # file = open('./gdrive/My Drive/TextSummarizer/article1','r')
    text = file.read()
    paragraphs = para_reader.show_paragraphs(filename)
    # print("Number of paras : %d",len(paragraphs))
    sentences = split_into_sentences(text)
    text_len = len(sentences)

    humanYesOrNo = []
    if humanExtractGiven == False:
        # humanYesOrNo = askHuman.humanGenerator(text)
        x = 0
    else:
        with open(humanExtractedYesOrNo_files) as fileobj:
            for word in fileobj:
                for ch in word:
                    humanYesOrNo.append(ord(ch) - 48)

    tokenized_sentences = remove_stop_words(sentences)
    tagged = posTagger(remove_stop_words(sentences))
    thematicFeature(tokenized_sentences)
    # print(upperCaseFeature(sentences))
    sentencePosition(paragraphs)
    tfIsfScore = tfIsf(tokenized_sentences)
    similarityScore = similarityScores(tokenized_sentences)
    # print("\n\nProper Noun Score : \n")
    properNounScore = properNounScores(tagged)
    # print(properNounScore)
    centroidSimilarityScore = centroidSimilarity(sentences, tfIsfScore)
    numericTokenScore = numericToken(tokenized_sentences)
    namedEntityRecogScore = namedEntityRecog(sentences)
    sentencePosScore = sentencePos(sentences)
    sentenceLengthScore = sentenceLength(tokenized_sentences)

    featureMatrix = []
    featureMatrix.append(tfIsfScore)
    featureMatrix.append(similarityScore)
    featureMatrix.append(properNounScore)
    featureMatrix.append(centroidSimilarityScore)
    featureMatrix.append(numericTokenScore)
    featureMatrix.append(namedEntityRecogScore)
    featureMatrix.append(sentencePosScore)
    featureMatrix.append(sentenceLengthScore)

    featureMat = np.zeros((len(sentences), 8))
    for i in range(8):
        for j in range(len(sentences)):
            featureMat[j][i] = featureMatrix[i][j]
    featureMat_normed = featureMat

    temp = rbm.test_rbm(dataset=featureMat_normed,
                        learning_rate=0.1,
                        training_epochs=14,
                        batch_size=5,
                        n_chains=5,
                        n_hidden=8)

    enhanced_feature_sum = []
    for i in range(len(np.sum(temp, axis=1))):
        enhanced_feature_sum.append([np.sum(temp, axis=1)[i], i])
    enhanced_feature_sum.sort(key=lambda x: x[0])

    length_to_be_extracted = int(len(enhanced_feature_sum) / 2)

    extracted_sentences = []
    extracted_sentences.append([sentences[0], 0])
    indeces_extracted = []
    indeces_extracted.append(0)
    for x in range(length_to_be_extracted):
        if (enhanced_feature_sum[x][1] != 0):
            extracted_sentences.append([sentences[enhanced_feature_sum[x][1]],
                                        enhanced_feature_sum[x][1]])
            indeces_extracted.append(enhanced_feature_sum[x][1])

    autoYesOrNo = askHuman.automaticGenerator(indeces_extracted, text_len)

    # Supervised learning
    # precision, recall, Fscore = askHuman.compareHumanAndAutomatic(humanYesOrNo, autoYesOrNo)
    # precision_values.append(precision)
    # recall_values.append(recall)
    # Fscore_values.append(Fscore)

    # print(extracted_sentences)
    extracted_sentences.sort(key=lambda x: x[1])
    # print(extracted_sentences)

    finalText = ""
    # print("\nExtracted Final Text : ")
    for i in range(len(extracted_sentences)):
        # print(extracted_sentences[i][0])
        finalText = finalText + extracted_sentences[i][0]

    # print("Precision : " + repr(precision) + "\nRecall : " + repr(recall) + "\nFscore : " + repr(Fscore))
    file = open(output_file_name, "w")
    file.write(finalText)
    # print(finalText)
    file.close()