def test_rbm():
    rbm.test_rbm(training_epochs=1,
                 batch_size=300,
                 n_chains=1,
                 n_samples=1,
                 n_hidden=20,
                 output_folder='tmp_rbm_plots')
Example #2
def test_rbm():
    t0 = time.time()
    rbm.test_rbm(training_epochs=1,
                 batch_size=300,
                 n_chains=1,
                 n_samples=1,
                 output_folder='tmp_rbm_plots')
    print >> sys.stderr, "test_rbm took %.3fs expected ??s in our buildbot" % (
        time.time() - t0)
Example #3
def main():
    whiten = False
    if len(sys.argv) > 1 and sys.argv[1] == '--whiten':
        whiten = True
        del sys.argv[1]

    if len(sys.argv) <= 3:
        print 'Usage: %s pcaDims n_hidden learningRate' % sys.argv[0]
        sys.exit(1)

    # loads data like datasets = ((train_x, train_y), ([], None), (test_x, None))
    datasets = loadUpsonData('../data/upson_rovio_1/train_15_50000.pkl.gz',
                             '../data/upson_rovio_1/test_15_50000.pkl.gz')
    img_dim = 15  # must match actual size of training data

    print 'done loading.'

    pcaDims = int(sys.argv[1])
    pca = PCA(datasets[0][0])  # train
    datasets[0][0] = pca.toPC(datasets[0][0], pcaDims, whiten=whiten)  # train
    datasets[1][0] = pca.toPC(
        datasets[1][0], pcaDims,
        whiten=whiten) if len(datasets[1][0]) > 0 else array([])  # valid
    datasets[2][0] = pca.toPC(datasets[2][0], pcaDims, whiten=whiten)  # test
    print 'reduced by PCA to'
    print('(%d, %d, %d) %d dimensional examples in (train, valid, test)' %
          (datasets[0][0].shape[0], datasets[1][0].shape[0],
           datasets[2][0].shape[0], datasets[0][0].shape[1]))

    # plot mean and principal components
    image = Image.fromarray(
        tile_raster_images(X=pca.meanAndPc(pcaDims).T,
                           img_shape=(img_dim, img_dim),
                           tile_shape=(10, 10),
                           tile_spacing=(1, 1)))
    image.save(os.path.join(resman.rundir, 'meanAndPc.png'))

    # plot fractional stddev in PCA dimensions
    pyplot.semilogy(pca.fracStd, 'bo-')
    if pcaDims is not None:
        pyplot.axvline(pcaDims)
    pyplot.savefig(os.path.join(resman.rundir, 'fracStd.png'))
    pyplot.clf()

    test_rbm(datasets=datasets,
             training_epochs=45,
             img_dim=img_dim,
             n_input=pcaDims if pcaDims else img_dim * img_dim,
             n_hidden=int(sys.argv[2]),
             learning_rate=float(sys.argv[3]),
             output_dir=resman.rundir,
             quickHack=False,
             visibleModel='real',
             initWfactor=.01,
             imgPlotFunction=lambda xx: pca.fromPC(xx, unwhiten=whiten))
Example #5
def main():
    # Load both squares and spheres datasets
    img_dim = 10    # 2, 4, 10, 15, 28
    cubeDatasets = loadPickledData('../data/cubes/train_%d_50000.pkl.gz' % img_dim,
                                   '../data/cubes/test_%d_50000.pkl.gz' % img_dim)
    sphereDatasets = loadPickledData('../data/spheres/train_%d_50000.pkl.gz' % img_dim,
                                     '../data/spheres/test_%d_50000.pkl.gz' % img_dim)

    # If necessary, reduce to 20000 rows to prevent memory error
    #cubeDatasets = ((cubeDatasets[0][0][:20000,:], None),
    #                (None, None),
    #                (cubeDatasets[2][0][:20000,:], None))
    #sphereDatasets = ((sphereDatasets[0][0][:20000,:], None),
    #                  (None, None),
    #                  (sphereDatasets[2][0][:20000,:], None))

    # make different size datasets
    sizes = [10, 20, 40, 100, 200, 400, 1000, 2000, 4000, 10000, 20000, 40000]
    #sizes = [10, 20, 40, 100, 200, 400, 1000, 2000, 4000, 10000]
    #sizes = [20000]
    #sizes = [40000]
    #sizes = [10, 20]

    sizedDatasetsX = {}
    sizedDatasetsXY = {}
    for size in sizes:
        sizedDatasetsX[size]  = makeSizedDataset(size, cubeDatasets, sphereDatasets, appendClass = False)
        sizedDatasetsXY[size] = makeSizedDataset(size, cubeDatasets, sphereDatasets, appendClass = True)
    testDatasetX  = makeSizedDataset(40000, cubeDatasets, sphereDatasets, appendClass = False)
    testDatasetXY = makeSizedDataset(40000, cubeDatasets, sphereDatasets, appendClass = True)

    print 'done loading.'

    for useXY in [False, True]:
        for size in sizes:
            print 'useXY', useXY, ', Size:', size
            thisDir = os.path.join(resman.rundir, '%s_size_%05d' % ('xy' if useXY else 'x', size))
            os.mkdir(thisDir)
            if useXY:
                thisDataset = (sizedDatasetsXY[size], (array([]), None), testDatasetXY)
            else:
                thisDataset = (sizedDatasetsX[size],  (array([]), None), testDatasetX)

            # this automatically saves the RBM to the given directory
            rbm, meanCosts = test_rbm(datasets = thisDataset,
                                      training_epochs = 45,
                                      img_dim = img_dim,
                                      n_hidden = 400, 
                                      learning_rate = .002, 
                                      output_dir = thisDir,
                                      quickHack = False,
                                      initWfactor = .02, 
                                      imgPlotFunction = lambda xx: xx[:,0:img_dim*img_dim],  # HACK: plot first slice
                                      )
Example #6
    def __init__(self, vsize=None, hsizes=[], lr=None, bsize=10, seed=123):
        assert vsize and hsizes and lr

        #input = T.dmatrix('global_input')

        self.layers = []
        for hsize in hsizes:
            r = rbm.test_rbm(learning_rate=lr, output_folder='dbn_rbm_plots')

            # configure inputs for subsequent layer
            input = self.layers[-1].hid
            vsize = hsize
    with gzip.open('mnist.pkl.gz', 'rb') as f:
        try:
            train_set, valid_set, test_set = pickle.load(f, encoding='latin1')
        except:
            train_set, valid_set, test_set = pickle.load(f)

    mnist_x, data_y = train_set
    test_x, test_y = test_set

    for i in range(10):

        res = test_rbm(dataset = mnist_x[data_y == i],
                    neg_dataset = mnist_x[data_y != i],
                    learning_rate = 0.1,
                    training_epochs = 10,
                    batch_size = 20,
                    output_folder = "disscd_digit_%d" % i,
                    n_hidden = 500,
                    k = 10,
                    pcd=False)

        input = T.matrix('input')
        en = res.get_energy(input)
        ef = theano.function(inputs=[input], outputs=[en])


        persistent_vis_chain = theano.shared(
            numpy.asarray(
                test_x[test_y == i],
                dtype=theano.config.floatX
            )
        )
import rbm, DBN
from rbm import test_rbm
from DBN import test_DBN
''' A simple script to run some of the default tests to determine the speed of
training different algorithms.

My result was slower for the RBM test (151.38 compared to 122.47), but faster
than the DBN test result quoted on deeplearning.net for the LISA lab (on my
ThinkPad T430 I get ~1.65 min/epoch for the DBN compared to 2.2 min/epoch
there). Pretraining ran for 871.49 min and fine-tuning for 226.5 min, giving a
best validation and test error of 1.49% (compared to 1.27% and 1.34%
respectively for the LISA lab) at iteration 110000. This is slower than the
615 minutes of pretraining and 101 minutes of fine-tuning they report. Running
this and lrn2 at the same time may have slowed it down toward the end.

Testing the default mnist_pretrain algorithm from lrn2 (the one with RBMs):
each epoch takes about 97 seconds, and full training took 590.5 min.
'''
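
# Hypothetical helper, not part of the original script: a minimal sketch of how the
# wall-clock timings quoted above could be taken around any of the test calls,
# using only the standard library.
def _timed(label, fn, *args, **kwargs):
    import time
    t0 = time.time()
    result = fn(*args, **kwargs)
    print('%s took %.2f minutes' % (label, (time.time() - t0) / 60.0))
    return result
# e.g. _timed('rbm', test_rbm) mirrors the timing style described above.
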
if (sys.argv[1] == 'rbm'):
    test_rbm()
elif (sys.argv[1] == 'dbn'):
    test_DBN()
elif (sys.argv[3] == 'mnist_pretrain'):
    parser = argparse.ArgumentParser(
        description="Run a complete lrn2 work flow")

    parser.add_argument("run_keyword",
                        metavar="run_keyword",
                        help="Keyword for the current test")

    parser.add_argument("modelconfig", help="model config file")
    parser.add_argument("nettype", help="Type of net I am using")

    parser.add_argument(
        "--re-train",
def summarize(text):
    # SPLIT TO PARAGRAPHS
    pre_paragraphs = text.split('\n')
    paragraphs = []
    for i, p in enumerate(pre_paragraphs):
        if not re.match(r'^\s*$', p) and (i == len(pre_paragraphs) - 1
                                          or re.match(r'^\s*$',
                                                      pre_paragraphs[i + 1])):
            paragraphs.append(p)
    # print(f'Num of paragraphs: {len(paragraphs)}')
    # for i, p in enumerate(paragraphs):
    #     print(f'par#{i+1}: {p}')

    # SPLIT TO SENTENCES
    sentences = separator.separate(text)
    print(f'Num of sentences: {len(sentences)}')
    for i, s in enumerate(sentences):
        print(f'#{i+1}: {s}')

    # TOKENIZE
    stem = False
    if stem:
        tokenized_sentences = [[
            czech_stemmer.cz_stem(word, aggressive=False) for word in sentence
        ] for sentence in tokenize(sentences)]
    else:
        tokenized_sentences = tokenize(sentences)

    # REMOVE STOPWORDS
    tokenized_sentences_without_stopwords = remove_stop_words(
        tokenized_sentences, keep_case=False)
    sentences_without_stopwords_case = remove_stop_words(
        sentences, keep_case=True, is_tokenized=False, return_tokenized=False)
    print('===Sentences without stopwords===')
    for i, s in enumerate(tokenized_sentences_without_stopwords):
        print(f'''#{i+1}: {' '.join(s)}''')

    print('===Sentences without stopwords CASE===')
    for i, s in enumerate(sentences_without_stopwords_case):
        print(f'''#{i+1}: {s}''')

    # POS-TAG
    tagged_sentences = pos_tag(sentences_without_stopwords_case)
    print('=====Tagged_sentences=====')
    for i, s in enumerate(tagged_sentences):
        print(f'''#{i+1}: {s}''')

    # 1. THEMATICITY FEATURE
    thematicity_feature_scores = thematicity_feature(
        tokenized_sentences_without_stopwords)

    # 2. SENTENCE POSITION FEATURE - NOTE: shitty!
    sentence_position_scores = sentence_position_feature(len(sentences))

    # 3. SENTENCE LENGTH FEATURE
    sentence_length_scores = sentence_length_feature(tokenized_sentences)

    # 4. SENTENCE PARAGRAPH POSITION FEATURE

    # 5. PROPER_NOUN FEATURE
    proper_noun_scores = proper_noun_feature(tagged_sentences)

    # 6. NUMERALS FEATURE
    numerals_scores = numerals_feature(tokenized_sentences)

    # 7. NAMED ENTITIES FEATURE - very similar to PROPER_NOUN FEATURE

    # 8. TF_ISF FEATURE - NOTE: TextRank instead of TS_ISF ??? ts_isf_orig is meh
    tf_isf_scores = tf_isf_orig_feature(tokenized_sentences_without_stopwords)

    # 9. CENTROID SIMILARITY FEATURE
    centroid_similarity_scores = centroid_similarity_feature(
        sentences, tf_isf_scores)

    # 10. UPPER-CASE FEATURE (not in the paper)
    upper_case_scores = upper_case_feature(tokenized_sentences)

    # 11. QUOTES FEATURE (not in the paper)
    quotes_scores = quotes_feature(sentences)

    # 12. REFERENCES FEATURE (not in the paper)
    references_scores = references_feature(tokenized_sentences)

    # 13. TEXTRANK FEATURE (not in the paper)
    textrank_scores = textrank.textrank(tokenized_sentences, True,
                                        '4-1-0.0001')

    feature_matrix = []
    feature_matrix.append(thematicity_feature_scores)
    feature_matrix.append(sentence_position_scores)
    feature_matrix.append(sentence_length_scores)
    feature_matrix.append(proper_noun_scores)
    feature_matrix.append(numerals_scores)
    feature_matrix.append(tf_isf_scores)
    feature_matrix.append(centroid_similarity_scores)
    feature_matrix.append(upper_case_scores)

    features = [
        '  thema', 'sen_pos', 'sen_len', '  propn', '    num', ' tf_isf',
        'cen_sim', '  upper'
    ]

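    # transpose the per-feature score lists (indexed [feature][sentence]) into an
    # (n_sentences x n_features) matrix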
    feature_matrix_2 = np.zeros((len(sentences), len(features)))
    for i in range(len(features)):
        for j in range(len(sentences)):
            feature_matrix_2[j][i] = feature_matrix[i][j]

    feature_sum = []
    for i in range(len(np.sum(feature_matrix_2, axis=1))):
        feature_sum.append(np.sum(feature_matrix_2, axis=1)[i])

    print('=====Scores=====')
    print(35 * ' ', end='|')
    for f in features:
        print(f, end='|')
    print()
    for i, s in enumerate(sentences):
        print(f'#{"{:2d}".format(i + 1)}: {s[:30]}', end='|')
        for f_s in feature_matrix:
            print('{: .4f}'.format(round(f_s[i], 4)), end='|')
        print('{: .4f}'.format(round(feature_sum[i], 4)))

    print('Training rbm...')
    rbm_trained = rbm.test_rbm(dataset=feature_matrix_2,
                               learning_rate=0.1,
                               training_epochs=14,
                               batch_size=5,
                               n_chains=5,
                               n_hidden=len(features))
    # another implementation of rbm, from sklearn
    # rbm2 = BernoulliRBM(n_components=len(features), n_iter=14, batch_size=5, learning_rate=0.1)
    # rbm_trained = rbm2.fit_transform(feature_matrix_2)
    # print(rbm_trained)
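    # Hedged sketch of the scikit-learn alternative mentioned in the comments above
    # (an assumption, not used by the original pipeline):
    def _sklearn_rbm_enhance(feats, n_components):
        from sklearn.neural_network import BernoulliRBM
        model = BernoulliRBM(n_components=n_components, n_iter=14,
                             batch_size=5, learning_rate=0.1)
        return model.fit_transform(feats)
    # e.g. rbm_trained = _sklearn_rbm_enhance(feature_matrix_2, len(features))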
    rbm_trained_sums = np.sum(rbm_trained, axis=1)

    print('=====RBM Enhanced Scores=====')
    print(35 * ' ', end='|')
    for f in features:
        print(f, end='|')
    print()
    for i, s in enumerate(sentences):
        print(f'#{"{:2d}".format(i + 1)}: {s[:30]}', end='|')
        for f_s in rbm_trained[i]:
            print('{: .4f}'.format(round(f_s, 4)), end='|')
        print('{: .4f}'.format(round(rbm_trained_sums[i], 4)))

    enhanced_feature_sum = []
    feature_sum = []

    for i in range(len(np.sum(rbm_trained, axis=1))):
        enhanced_feature_sum.append([np.sum(rbm_trained, axis=1)[i], i])
        feature_sum.append([np.sum(feature_matrix_2, axis=1)[i], i])

    print(f'enhanced_feature_sum: {enhanced_feature_sum}')
    print(f'feature_sum: {feature_sum}')

    enhanced_feature_sum.sort(key=lambda x: x[0])
    feature_sum.sort(key=lambda x: -1 * x[0])
    print('=====Sorted=====')
    print(f'enhanced_feature_sum: {enhanced_feature_sum}')
    print(f'feature_sum: {feature_sum}')

    # print('=====The text=====')
    # for x in range(len(sentences)):
    #     print(sentences[x])

    extracted_sentences_rbm = []
    extracted_sentences_rbm.append([sentences[0], 0])
    extracted_sentences_simple = []
    extracted_sentences_simple.append([sentences[0], 0])

    summary_length = max(min(round(len(sentences) / 4), 12),
                         3)  # length between 3-12 sentences
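    # e.g. 8 sentences -> 3, 40 sentences -> 10, 100+ sentences -> 12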
    for x in range(summary_length):
        if enhanced_feature_sum[x][1] != 0:
            extracted_sentences_rbm.append([
                sentences[enhanced_feature_sum[x][1]],
                enhanced_feature_sum[x][1]
            ])
        if feature_sum[x][1] != 0:
            extracted_sentences_simple.append(
                [sentences[feature_sum[x][1]], feature_sum[x][1]])

    extracted_sentences_rbm.sort(key=lambda x: x[1])
    extracted_sentences_simple.sort(key=lambda x: x[1])

    final_text_rbm = ''
    for i in range(len(extracted_sentences_rbm)):
        final_text_rbm += extracted_sentences_rbm[i][0] + '\n'
    final_text_simple = ''
    for i in range(len(extracted_sentences_simple)):
        final_text_simple += extracted_sentences_simple[i][0] + '\n'

    print('=====Extracted Final Text RBM=====')
    print(final_text_rbm)
    print()
    print('=====Extracted Final Text simple=====')
    print(final_text_simple)

    return final_text_rbm
Example #11
def executeForAFile_input(filename, cwd):
    os.chdir(cwd + "/Input_file")

    file = open(filename, 'r')
    text = file.read()
    file.close()
    # os.chdir(cwd)
    paragraphs = para_reader.show_paragraphs(filename)
    sentences = split_into_sentences(text)
    text_len = len(sentences)

    tokenized_sentences = remove_stop_words(sentences)
    # print(tokenized_sentences)
    # # tagged = pos_tag(remove_stop_words(sentences))
    # print("LENNNNN : ")
    # print(len(senPos(paragraphs)))
    # term frequency score
    tfIdfScore = tFiDF(tokenized_sentences)
    # print('Term frequency')
    # print(tfIdfScore)

    # Number of numerals
    numericTokenScore = numericToken(tokenized_sentences)
    # print('numeric token score')
    # print(numericTokenScore)
    # Number of Named entity
    namedEntityRecogScore = namedEntityRecog(sentences, cwd)
    print('named Entity score')
    print(namedEntityRecogScore)
    #
    # # Sentence position score
    sentencePosScore = senPos(sentences)
    # print('Sentence score')
    # print(sentencePosScore)
    #
    # Sentence Position score
    sentenceParaScore = paraPos(paragraphs)
    # print('Sentence Para score')
    # print(sentenceParaScore)
    featureMatrix = []
    featureMatrix.append(sentencePosScore)
    featureMatrix.append(sentenceParaScore)
    featureMatrix.append(numericTokenScore)
    featureMatrix.append(namedEntityRecogScore)
    featureMatrix.append(tfIdfScore)

    # prepare feature matrix for training
    featureMat = np.zeros((len(sentences), 5))
    for i in range(5):
        for j in range(len(sentences)):
            featureMat[j][i] = featureMatrix[i][j]

    print("\n\n\nPrinting Feature Matrix : ")
    print(featureMat)

    # feature sum generation
    feature_sum = []

    for i in range(len(np.sum(featureMat, axis=1))):
        feature_sum.append(np.sum(featureMat, axis=1)[i])
    print(feature_sum)

    #training using rbm
    temp = rbm.test_rbm(featureMat,
                        learning_rate=0.1,
                        training_epochs=10,
                        batch_size=5,
                        n_chains=5,
                        n_hidden=5)
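    # sum each sentence's RBM hidden activations; the highest-scoring sentence is returned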
    enhancedFeatureSum = []
    for i in range(len(sentences)):
        enhancedFeatureSum.append([np.sum(temp, axis=1)[i], i])
    index_sentence = sorted(enhancedFeatureSum,
                            key=lambda x: x[0],
                            reverse=True)[0][1]
    output = sentences[index_sentence]
    #get enhancedfeature
    return output
from pca import PCA

if __name__ == '__main__':
    resman.start('junk', diary=True)
    datasets = loadUpsonData('../data/upson_rovio_1/train_15_50000.pkl.gz',
                             '../data/upson_rovio_1/test_15_50000.pkl.gz')

    #meanTrain = mean(datasets[0][0])
    #stdTrain  = std(datasets[0][0])
    #datasets[0][0] = (datasets[0][0] - meanTrain) / stdTrain
    #datasets[2][0] = (datasets[2][0] - meanTrain) / stdTrain

    pca = PCA(datasets[0][0])
    datasets[0][0] = pca.toZca(datasets[0][0], None, epsilon=.1)
    datasets[2][0] = pca.toZca(datasets[2][0], None, epsilon=.1)

    print 'done loading.'

    test_rbm(
        datasets=datasets,
        training_epochs=45,
        img_dim=15,  # must match actual size of training data
        n_hidden=int(sys.argv[1]),
        learning_rate=float(sys.argv[2]),
        output_dir=resman.rundir,
        quickHack=False,
        visibleModel='real',
        initWfactor=.01,
        pcaDims=None)
    resman.stop()
Example #13
def executeForAFile(filename, output_file_name, cwd):

    os.chdir(cwd + "/articles")
    file = open(filename, 'r')
    text = file.read()
    paragraphs = para_reader.show_paragraphs(filename)
    print(paragraphs)
    print("Number of paras : %d", len(paragraphs))
    sentences = split_into_sentences(text)
    text_len = len(sentences)
    sentenceLengths.append(text_len)

    tokenized_sentences = remove_stop_words(sentences)
    tagged = posTagger(remove_stop_words(sentences))

    thematicFeature(tokenized_sentences)
    print(upperCaseFeature(sentences))
    print("LENNNNN : ")
    print(len(sentencePosition(paragraphs)))

    tfIsfScore = tfIsf(tokenized_sentences)
    similarityScore = similarityScores(tokenized_sentences)

    print("\n\nProper Noun Score : \n")
    properNounScore = properNounScores(tagged)
    print(properNounScore)
    centroidSimilarityScore = centroidSimilarity(sentences, tfIsfScore)
    numericTokenScore = numericToken(tokenized_sentences)
    namedEntityRecogScore = namedEntityRecog(sentences)
    sentencePosScore = sentencePos(sentences)
    sentenceLengthScore = sentenceLength(tokenized_sentences)
    thematicFeatureScore = thematicFeature(tokenized_sentences)
    sentenceParaScore = sentencePosition(paragraphs)

    featureMatrix = []
    featureMatrix.append(thematicFeatureScore)
    featureMatrix.append(sentencePosScore)
    featureMatrix.append(sentenceLengthScore)
    #featureMatrix.append(sentenceParaScore)
    featureMatrix.append(properNounScore)
    featureMatrix.append(numericTokenScore)
    featureMatrix.append(namedEntityRecogScore)
    featureMatrix.append(tfIsfScore)
    featureMatrix.append(centroidSimilarityScore)

    featureMat = np.zeros((len(sentences), 8))
    for i in range(8):
        for j in range(len(sentences)):
            featureMat[j][i] = featureMatrix[i][j]

    print("\n\n\nPrinting Feature Matrix : ")
    print(featureMat)
    print("\n\n\nPrinting Feature Matrix Normed : ")
    #featureMat_normed = featureMat / featureMat.max(axis=0)
    featureMat_normed = featureMat

    feature_sum = []

    for i in range(len(np.sum(featureMat, axis=1))):
        feature_sum.append(np.sum(featureMat, axis=1)[i])

    print(featureMat_normed)
    for i in range(len(sentences)):
        print(featureMat_normed[i])

    temp = rbm.test_rbm(dataset=featureMat_normed,
                        learning_rate=0.1,
                        training_epochs=14,
                        batch_size=5,
                        n_chains=5,
                        n_hidden=8)

    print("\n\n")
    print(np.sum(temp, axis=1))

    enhanced_feature_sum = []
    enhanced_feature_sum2 = []

    for i in range(len(np.sum(temp, axis=1))):
        enhanced_feature_sum.append([np.sum(temp, axis=1)[i], i])
        enhanced_feature_sum2.append(np.sum(temp, axis=1)[i])

    print(enhanced_feature_sum)
    print("\n\n\n")

    enhanced_feature_sum.sort(key=lambda x: x[0])
    print(enhanced_feature_sum)

    length_to_be_extracted = len(enhanced_feature_sum) // 2

    print("\n\nThe text is : \n\n")
    for x in range(len(sentences)):
        print(sentences[x])

    print("\n\n\nExtracted sentences : \n\n\n")
    extracted_sentences = []
    extracted_sentences.append([sentences[0], 0])

    indeces_extracted = []
    indeces_extracted.append(0)

    for x in range(length_to_be_extracted):
        if (enhanced_feature_sum[x][1] != 0):
            extracted_sentences.append([
                sentences[enhanced_feature_sum[x][1]],
                enhanced_feature_sum[x][1]
            ])
            indeces_extracted.append(enhanced_feature_sum[x][1])

    extracted_sentences.sort(key=lambda x: x[1])

    finalText = ""
    print("\n\n\nExtracted Final Text : \n\n\n")
    for i in range(len(extracted_sentences)):
        print("\n" + extracted_sentences[i][0])
        finalText = finalText + extracted_sentences[i][0]

    os.chdir(cwd + "/outputs")
    file = open(output_file_name, "w")
    file.write(finalText)
    file.close()

    os.chdir(cwd)
    file = open("featureSum", "w")
    for item in feature_sum:
        print(item, end="\n", file=file)

    file = open("enhancedfeatureSum", "w")
    for item in enhanced_feature_sum2:
        print(item, end="\n", file=file)
Example #15
def run(filename, out_path):

    decode_path = os.path.join(out_path, filename[:-4])
    cmd = 'apktool d "' + out_path + filename + '" -o ' + decode_path
    os.system(cmd)  # decode apk
    if os.path.exists(os.path.join(decode_path,
                                   "AndroidManifest.xml")) is not True:
        icon = None
        meta = []
        all_permission = []
        all_api = []
        result = u"该应用设有防止反编译机制,无法反编译,请换一个应用"
        return icon, meta, all_permission, all_api, result

    icon, meta = metaInfo(os.path.join(out_path, filename), decode_path)

    #permission_vector, all_permission = permission.ml_per_feature(decode_path)  # return all permission needed
    #api_vector, all_api = api.ml_api_feature(decode_path)  # return all api called
    #ngram_vector, all_ngram = ngram.ml_ngram_feature(decode_path)
    ''' Load the trained models
    '''
    '''
    rf = pickle.load(open('dataset/model', 'rb'))
    gbdt = pickle.load(open('dataset/model2', 'rb'))
    ada = pickle.load(open('dataset/model3', 'rb'))
    lr = pickle.load(open('dataset/model4', 'rb'))
    '''
    vc = pickle.load(open('dataset/model_final', 'rb'))
    '''Build the feature vector for this app
    '''
    permission_vector, all_permission = permission.ml_per_feature(
        decode_path)  # return all permission needed
    api_vector, all_api = api.ml_api_feature(
        decode_path)  # return all api called
    ngram_vector, all_ngram = ngram.ml_ngram_feature(decode_path)
    feature_vector = permission_vector + api_vector + ngram_vector
    '''feature-selection
    '''
    #make list to a vector
    train_set_x = list()
    train_set_x.append(feature_vector)
    raw_train_set_x = train_set_x
    fsmodel = pickle.load(open('dataset/fsmodel', 'rb'))
    #print len(train_set_x)
    fs_vector = fsmodel.transform(train_set_x)

    fsmodel2 = pickle.load(open('dataset/fsmodel2', 'rb'))
    fs_vector = fsmodel2.transform(fs_vector)

    feature_vector = fs_vector[0]
    train_set_x = fs_vector  # + fs_vector
    #print train_set_x		#[[]]

    #########################    DEBUG    ############################
    fs_vec = []
    for i in range(len(raw_train_set_x[0])):
        fs_vec.append(i)

    print fs_vec
    fs_vec = fsmodel.transform(fs_vec)
    fs_vec = fsmodel2.transform(fs_vec)
    print fs_vec

    feature_matrix_dl = [x for x in range(len(raw_train_set_x))]
    for i in range(len(feature_matrix_dl)):
        feature_matrix_dl[i] = [
            x for x in range(len(raw_train_set_x[0]) - len(fs_vec[0]))
        ]
    temp = 0
    for i in range(len(raw_train_set_x[0])):
        if i not in fs_vec:
            #print "第%d列特征没有选用" % i
            for j in range(len(feature_matrix_dl)):
                feature_matrix_dl[j][temp] = raw_train_set_x[j][i]
            temp = temp + 1

    #print "行数%d" % len(feature_matrix_dl)
    #print "列数%d" % len(feature_matrix_dl[0])
    train_set_x = feature_matrix_dl
    #print train_set_x
    train_set_x1 = [train_set_x[0]]
    b_s = 5
    for i in range(1, b_s):
        train_set_x1.append(1)
        train_set_x1[i] = train_set_x[0]
    #print train_set_x1
    train_set_x = train_set_x1
    ##################################################################
    print len(raw_train_set_x)
    print len(fs_vec[0])

    rbm = pickle.load(open('dataset/rbmmodel', 'rb'))
    hiddeny, test = test_rbm(train_set_x,
                             train_set_x_feature_num=len(train_set_x[0]),
                             batch_size=b_s,
                             rbm_object=rbm)
    hiddeny = hiddeny[0]

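    # append the RBM's deep features to the selected shallow features before prediction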
    feature_vector = numpy.concatenate((feature_vector, hiddeny),
                                       axis=0).reshape(1, -1).tolist()
    #print feature_vector
    # print len(feature_vector)
    ''' Predict
    '''
    '''
    #[r1, r2, r3, r4] = [rf.predict(feature_vector), gbdt.predict(feature_vector), ada.predict(feature_vector), lr.predict(feature_vector)]
    [p1, p2, p3, p4] = [rf.predict_proba(feature_vector), gbdt.predict_proba(feature_vector), ada.predict_proba(feature_vector), lr.predict_proba(feature_vector)]
    [w1, w2, w3, w4] = [1, 1, 1, 1]
    #expect = w1 * r1 + w2 * r2 + w3 * r3 + w4 * r4
    expect = w1 * p1 + w2 * p2 + w3 * p3 + w4 * p4
    print ("risk score: %0.2f" % (expect[0][1] / (expect[0][0] + expect[0][1])))
    #if expect > 2:
    if expect[0][0] < expect[0][1]:
        result = u"预测该应用是  恶意应用"
    else:
        result = u"预测该应用是  良性应用"
    print result
    #print [r1, r2, r3, r4]
    p = [p1, p2, p3, p4]
    print p
    #print all_permission
    '''
    expect1 = vc.predict(feature_vector)
    score1 = '%.0f' % (vc.predict_proba(feature_vector)[0][1] * 100)

    print("risk score: %s" % score1)
    print filename

    if filename.find("apk") != -1:
        expect2, score22 = g_predict.run(out_path + filename)
        print "expect2 = ", expect2

    else:
        expect2, score22 = g_predict.run(out_path + filename + ".apk")

    print out_path
    if filename.find("apk") != -1:
        expect3, score3 = dy_predict.dyrun(out_path + filename)
    else:
        expect3, score3 = dy_predict.dyrun(out_path + filename + ".apk")

    if expect2 == -1:
        score = int(score1) * 0.8 + int(score3) * 0.2  # score1/score3 still need numeric conversion here
        expect = expect1 * 0.8 + expect3 * 0.2
        if expect < 0.5:
            expect = 0
        else:
            expect = 1
    else:
        score2 = '%.0f' % (score22 * 100)
        print "score1", score1
        print "score2", score2
        score2 = int(score2)
        score1 = int(score1)
        score3 = int(score3)
        print "score2 by int", score2
        print "score3", score3
        score = score1 * 0.6 + score2 * 0.2 + score3 * 0.2
        print score
        score = int(score)
        print score
        expect = expect1 * 0.6 + expect2 * 0.2 + expect3 * 0.2
        if expect < 0.5:
            expect = 0
        else:
            expect = 1

    img1 = int(score) / 10
    img2 = int(score) % 10
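    # split the integer score (0-100) into its tens and units digits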

    if expect == 1:
        result = u"预测该应用是 恶意应用"
    else:
        result = u"预测该应用是 良性应用"
    print result

    # -------------------- Permission OUTPUT ---------------------------------------------
    permission_output = {}
    # print all_permission
    with open('/home/ubuntu/Code/data/permission.csv') as f:
        f_csv = csv.reader(f)
        for row in f_csv:
            if row[0].strip() in all_permission:
                permission_output[row[0].strip()] = {}
                permission_output[row[0].strip(
                )]['Description'] = row[1].strip().decode('utf-8')
                permission_output[
                    row[0].strip()]['ThreatLevel'] = row[2].strip()

    # -----------------        Sensitive Api OUTPUT       ---------------------------------
    sensitive_api_output = {}
    sensitive_api_list = {}
    with open('/home/ubuntu/Code/data/all_sensitive_api.csv') as f:
        f_csv = csv.reader(f)
        for row in f_csv:
            if row[0] != 'API Name':
                sensitive_api_list[row[0].strip()] = {}
                sensitive_api_list[row[0].strip(
                )]['Description'] = row[1].strip().decode('utf-8')
                sensitive_api_list[
                    row[0].strip()]['ThreatLevel'] = row[2].strip()
    # the sensitive-API output is a nested (two-level) dict:
    # packagename: { api: {'Description': xxx, 'ThreatLevel': 'xxx'} }
    for each_api in all_api:
        if each_api in sensitive_api_list:
            packagename, api_name = each_api.split('->')
            # print packagename, '#' ,api_name
            # print packagename
            # print api_name
            if packagename not in sensitive_api_output:
                sensitive_api_output[packagename] = {}
            sensitive_api_output[packagename][api_name] = sensitive_api_list[
                each_api]
    # print sensitive_api_output
    # -------------------------   Component        ----------------------------------------
    component_output = component.run(decode_path)

    # ----------------------          DY OUTPUT     ---------------------------------------
    dy_path = os.path.join('/home/project/apks/uploads/',
                           meta['File Md5'] + '.dy')
    if not os.path.exists(dy_path):
        behavior = dy_analyse1.dy_demo(os.path.join(out_path, filename),
                                       meta['Package Name'])
        #print "bebavior:" + behavior
        #pickle.dump(behavior, open(dy_path, 'wb'))
    else:
        print "ddddddd"
        #behavior = pickle.load(open(dy_path, 'rb'))

    #behavior1 = eval(open('/home/project/out.txt','r').read())
    outpath1 = '/home/ubuntu/DroidBox/output/out1.json'
    behavior = [json.loads(line) for line in open(outpath1)]
    #print behavior
    #behavior = behavior1[0]

    # ------------------------         FILE ACCESS  ----------------------------------------
    file_access = []

    if 'fdaccess' in behavior[0]:
        if behavior[0]['fdaccess']:
            file_bebavior = behavior[0]['fdaccess']
            print file_bebavior
            for each_info in file_bebavior:
                file_access.append([
                    file_bebavior[each_info]['path'],
                    file_bebavior[each_info]['operation'],
                    file_bebavior[each_info]['data']
                ])

    # ------------------------         SMS ACCESS  ----------------------------------------
    sms_access = []

    if 'sendsms' in behavior[0]:
        if behavior[0]['sendsms']:
            sms_bebavior = behavior[0]['sendsms']
            for each_info in sms_bebavior:
                sms_access.append([
                    sms_bebavior[each_info]['number'],
                    sms_bebavior[each_info]['message']
                ])

    # ------------------------         Net ACCESS  ----------------------------------------
    net_send = []
    net_recv = []

    if 'sendnet' in behavior[0]:
        if behavior[0]['sendnet']:
            net_send_bebavior = behavior[0]['sendnet']
            for each_info in net_send_bebavior:
                net_send.append([net_send_bebavior[each_info]['desthost'],net_send_bebavior[each_info]['data'],\
                                 net_send_bebavior[each_info]['operation'],net_send_bebavior[each_info]['destport']])
    if 'recvnet' in behavior[0]:
        if behavior[0]['recvnet']:
            net_recv_bebavior = behavior[0]['recvnet']
            for each_info in net_recv_bebavior:
                net_recv.append([net_recv_bebavior[each_info]['host'],net_recv_bebavior[each_info]['port'],\
                                 net_recv_bebavior[each_info]['data']])

#------------------------------- Sensitive Data operation---------------------------------
    data_operation = []
    sensitive_data_list = {}
    with open('/home/ubuntu/Code/data/dy_feature.csv') as f:
        f_csv = csv.reader(f)
        for row in f_csv:
            if row[0] != 'Feature':
                sensitive_data_list[row[0].strip()] = {}
                sensitive_data_list[row[0].strip(
                )]['Description'] = row[1].strip().decode('utf-8')
        #print sensitive_api_list
    if 'dataleaks' in behavior[0]:
        if behavior[0]['dataleaks']:
            sen_data_bebavior = behavior[0]['dataleaks']
            print sen_data_bebavior
            for item in sensitive_data_list:
                for each_info in sen_data_bebavior:
                    if item in sen_data_bebavior[each_info]['tag']:
                        data_operation.append([
                            str(sen_data_bebavior[each_info]['tag']).decode(
                                'utf-8'),
                            sensitive_data_list[item]['Description'],
                            sen_data_bebavior[each_info]['data']
                        ])

    print "data_operation:", data_operation

    #---------------------------------dataleak-----------------------------------------
    data_leak = []
    dataleak_list = {}
    with open('/home/ubuntu/Code/data/dy_feature.csv') as f:
        f_csv = csv.reader(f)
        for row in f_csv:
            if row[0] != 'Feature':
                dataleak_list[row[0].strip()] = {}
                dataleak_list[row[0].strip()]['Description'] = row[1].strip(
                ).decode('utf-8')
        #print dataleak_list
    flag = 0
    if 'sendnet' in behavior[0]:
        if behavior[0]['sendnet']:
            data_leak_bebavior = behavior[0]['sendnet']
            #print data_leak_bebavior
            for item in dataleak_list:
                if str(item).find('sendnet_') != -1:
                    flag = 1
                    print item, flag
                    for each_info in data_leak_bebavior:
                        temp = (str(item))[8:]

                        if temp in (str(data_leak_bebavior[each_info]['tag'])):
                            print temp
                            data_leak.append([
                                data_leak_bebavior[each_info]['desthost'],
                                str(data_leak_bebavior[each_info]
                                    ['tag']).decode('utf-8'),
                                dataleak_list[item]['Description']
                            ])
                        break
    #print "data_leak:",data_leak


#--------------------------------enfperm---------------------------------------
    enf_per = []
    if 'enfperm' in behavior[0]:
        if behavior[0]['enfperm']:
            enf_bebavior = behavior[0]['enfperm']
            print enf_bebavior
            for each_info in enf_bebavior:
                enf_per.append([enf_bebavior[each_info]])
    print "enf_per:", enf_per

    #-------------------------------------------------------------------------------

    return icon, meta, permission_output, sensitive_api_output, component_output, file_access, sms_access, net_send, net_recv, data_operation, data_leak, enf_per, result, img1, img2, expect





if __name__ == '__main__':
    resman.start('junk', diary = True)

    circles = False
    if len(sys.argv) > 1 and sys.argv[1] == '--circles':
        circles = True
        del sys.argv[1]

    print 'Using dataset:', 'circles' if circles else 'squares'

    img_dim = 10    # 2, 4, 10, 15, 28
    if circles:
        datasets = loadPickledData('../data/circles/train_%d_50000.pkl.gz' % img_dim,
                                   '../data/circles/test_%d_50000.pkl.gz' % img_dim)
    else:
        datasets = loadPickledData('../data/squares/train_%d_50000.pkl.gz' % img_dim,
                                   '../data/squares/test_%d_50000.pkl.gz' % img_dim)
    print 'done loading.'
    test_rbm(datasets = datasets,
             training_epochs = 5,
             img_dim = img_dim,
             n_hidden = 100, 
             learning_rate = .1, 
             output_dir = resman.rundir,
             quickHack = False)
    resman.stop()

if __name__ == '__main__':
    resman.start('junk', diary = True)

    spheres = False
    if len(sys.argv) > 1 and sys.argv[1] == '--spheres':
        spheres = True
        del sys.argv[1]

    print 'Using dataset:', 'spheres' if spheres else 'cubes'

    img_dim = 10    # 2, 4, 10, 15, 28
    if spheres:
        datasets = loadPickledData('../data/spheres/train_%d_50000.pkl.gz' % img_dim,
                                   '../data/spheres/test_%d_50000.pkl.gz' % img_dim)
    else:
        datasets = loadPickledData('../data/cubes/train_%d_50000.pkl.gz' % img_dim,
                                   '../data/cubes/test_%d_50000.pkl.gz' % img_dim)
    print 'done loading.'
    rbm, meanCosts = test_rbm(datasets = datasets,
                              training_epochs = 45,
                              img_dim = img_dim,
                              n_hidden = 200, 
                              learning_rate = .1, 
                              output_dir = resman.rundir,
                              quickHack = False,
                              imgPlotFunction = lambda xx: xx[:,0:img_dim*img_dim],  # HACK: plot first slice
                              )
    resman.stop()
Example #19
featureMat = np.zeros((len(sentence_list), 5))
for i in range(5):
    for j in range(len(sentence_list)):
        featureMat[j][i] = featureMatrix[i][j]

featureMat_normed = featureMat

feature_sum = []

for i in range(len(np.sum(featureMat, axis=1))):
    feature_sum.append(np.sum(featureMat, axis=1)[i])

temp = rbm.test_rbm(dataset=featureMat_normed,
                    learning_rate=0.1,
                    training_epochs=14,
                    batch_size=5,
                    n_chains=5,
                    n_hidden=5)

enhanced_feature_sum = []
enhanced_feature_sum2 = []

for i in range(len(np.sum(temp, axis=1))):
    enhanced_feature_sum.append([np.sum(temp, axis=1)[i], i])
    enhanced_feature_sum2.append(np.sum(temp, axis=1)[i])

enhanced_feature_sum.sort(key=lambda x: x[0])
length_to_be_extracted = len(enhanced_feature_sum) / 2

for x in range(len(sentence_list)):
    print(sentence_list[x])
#! /usr/bin/env python

import numpy, time, gzip, PIL.Image, os, pdb
#import cPickle as pickle
import pickle
from numpy import *

from ResultsManager import resman
from rbm import RBM, test_rbm
from utils import loadUpsonData

if __name__ == '__main__':
    resman.start('junk', diary=False)
    datasets = loadUpsonData('../data/upson_rovio_1/train_10_50000.pkl.gz',
                             '../data/upson_rovio_1/test_10_50000.pkl.gz')
    print 'done loading.'
    test_rbm(datasets=datasets,
             training_epochs=45,
             img_dim=10,
             n_hidden=500,
             learning_rate=.002,
             output_dir=resman.rundir,
             quickHack=False)
    resman.stop()
def test_rbm():
    rbm.test_rbm(training_epochs=1, batch_size=300, n_chains=1, n_samples=1,
            output_folder='tmp_rbm_plots')
Example #22
    def get_summary(self):

        paragraphs = self.content.split('\n\n')
        sentences = RBM_summarizer.split_into_sentences(self.content)
        tokenized_sentences = RBM_summarizer.remove_stop_words(sentences)
        uppercase_score = RBM_summarizer.upperCaseFeature(sentences)
        namedEntityRecogScore = RBM_summarizer.namedEntityRecog(sentences)
        sentencePosScore = RBM_summarizer.sentencePosition(sentences)
        sentenceParaScore = RBM_summarizer.sentencePosition(paragraphs)
        thematicFeatureScore = RBM_summarizer.thematicFeature(
            tokenized_sentences)
        tagged = RBM_summarizer.posTagger(tokenized_sentences)
        tfIsfScore = RBM_summarizer.tfIsf(tokenized_sentences)
        similarityScore = RBM_summarizer.similarityScores(tokenized_sentences)
        numericTokenScore = RBM_summarizer.numericToken(tokenized_sentences)
        sentenceLengthScore = RBM_summarizer.sentenceLength(
            tokenized_sentences)
        properNounScore = RBM_summarizer.properNounScores(tagged)
        centroidSimilarityScore = RBM_summarizer.centroidSimilarity(
            sentences, tfIsfScore)

        featureMat = np.zeros((len(sentences), 8))

        featureMatrix = []
        featureMatrix.append(thematicFeatureScore)
        featureMatrix.append(sentencePosScore)
        featureMatrix.append(sentenceLengthScore)
        featureMatrix.append(sentenceParaScore)
        featureMatrix.append(properNounScore)
        featureMatrix.append(numericTokenScore)
        featureMatrix.append(namedEntityRecogScore)
        featureMatrix.append(tfIsfScore)
        featureMatrix.append(centroidSimilarityScore)
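        # note: featureMat has 8 columns, so only the first 8 of the 9 appended score
        # lists are copied below; centroidSimilarityScore ends up unused here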

        for i in range(8):
            for j in range(len(sentences)):
                featureMat[j][i] = featureMatrix[i][j]

        feature_sum = []

        for i in range(len(np.sum(featureMat, axis=1))):
            feature_sum.append(np.sum(featureMat, axis=1)[i])

        temp = rbm.test_rbm(dataset=featureMat,
                            learning_rate=0.1,
                            training_epochs=14,
                            batch_size=5,
                            n_chains=5,
                            n_hidden=8)

        enhanced_feature_sum = []
        enhanced_feature_sum2 = []

        for i in range(len(np.sum(temp, axis=1))):
            enhanced_feature_sum.append([np.sum(temp, axis=1)[i], i])
            enhanced_feature_sum2.append(np.sum(temp, axis=1)[i])

        enhanced_feature_sum.sort(key=lambda x: x[0])

        length_to_be_extracted = len(enhanced_feature_sum) / 4

        extracted_sentences = []
        extracted_sentences.append([sentences[0], 0])

        indeces_extracted = []
        indeces_extracted.append(0)

        for x in range(int(length_to_be_extracted)):
            if (enhanced_feature_sum[x][1] != 0):
                extracted_sentences.append([
                    sentences[enhanced_feature_sum[x][1]],
                    enhanced_feature_sum[x][1]
                ])
                indeces_extracted.append(enhanced_feature_sum[x][1])

        extracted_sentences.sort(key=lambda x: x[1])

        finalText = ""

        for i in range(len(extracted_sentences)):
            finalText = finalText + extracted_sentences[i][0]

        return finalText
Example #23
           ) % (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print('The fine tuning code for file ' + os.path.split(__file__)[1] +
          ' ran for %.2fm' % ((end_time - start_time) / 60.), file=sys.stderr)
    return test_score * 100.

if __name__ == '__main__':

#MNIST dataset
    dataset = 'mnist.pkl.gz'
    datasets = load_data(dataset)  
    rbm_results_mnist = []
    cd = [False, True]
    gibbs = [1,5,10,15]
    for i in range(len(gibbs)):
        for j in range(len(cd)):
            rbm_results_mnist.append(test_rbm(rbm_persistence = cd[j], gibbs_steps = gibbs[i], dataset_nm = datasets))
    
    dbn_results_mnist = []
    
    dbn_results_mnist.append(test_DBN(hidden_layers = [700], k=10, dataset_nm = datasets))
    dbn_results_mnist.append(test_DBN(hidden_layers = [1000], k=10, dataset_nm = datasets))
    dbn_results_mnist.append(test_DBN(hidden_layers = [700, 700], k=10, dataset_nm = datasets))
    dbn_results_mnist.append(test_DBN(hidden_layers = [700, 1000], k=10, dataset_nm = datasets))

#CIFAR dataset
    datasets = load_data_cifar()
    rbm_results_cifar = []
    cd = [False, True]
    gibbs = [1,5,10,15]
    for i in range(len(gibbs)):
        for j in range(len(cd)):
def train():
    # if os.path.exists('dataset/per_feature_matrix'):
    #     per_feature_matrix = pickle.load(open('dataset/per_feature_matrix', 'rb'))
    # else:
    start = time.time()
    print "extracting feature matrix..."
    if 1:
        per_feature_matrix = {}
        for each in os.listdir('dataset/per_feature'):
            path = os.path.join('dataset/per_feature/', each)
            per_feature_matrix = dict(pickle.load(open(path, 'rb')),
                                      **per_feature_matrix)
        per_feature_matrix = per_feature_matrix.values()
        pickle.dump(per_feature_matrix, open('dataset/per_feature_matrix',
                                             'wb'))

    # if os.path.exists('dataset/api_feature_matrix'):
    #     api_feature_matrix = pickle.load(open('dataset/api_feature_matrix', 'rb'))
    # else:
    if 1:
        api_feature_matrix = {}
        for each in os.listdir('dataset/api_feature'):
            path = os.path.join('dataset/api_feature/', each)
            api_feature_matrix = dict(pickle.load(open(path, 'rb')),
                                      **api_feature_matrix)
        api_feature_matrix = api_feature_matrix.values()
        pickle.dump(api_feature_matrix, open('dataset/api_feature_matrix',
                                             'wb'))

    # if os.path.exists('dataset/ngram_feature_matrix'):
    #     ngram_feature_matrix = pickle.load(open('dataset/ngram_feature_matrix', 'rb'))
    # else:
    if 1:
        ngram_feature_matrix = {}
        for each in os.listdir('dataset/ngram_feature'):
            path = os.path.join('dataset/ngram_feature/', each)
            ngram_feature_matrix = dict(pickle.load(open(path, 'rb')),
                                        **ngram_feature_matrix)
        ngram_feature_matrix = ngram_feature_matrix.values()
        pickle.dump(ngram_feature_matrix,
                    open('dataset/ngram_feature_matrix', 'wb'))

    classification = pickle.load(open('dataset/classification', 'rb'))
    if per_feature_matrix is not None and api_feature_matrix is not None and ngram_feature_matrix is not None:
        feature_matrix = _concatenate(per_feature_matrix, api_feature_matrix,
                                      ngram_feature_matrix)
    elif per_feature_matrix is not None:
        feature_matrix = per_feature_matrix
    elif api_feature_matrix is not None:
        feature_matrix = api_feature_matrix
    elif ngram_feature_matrix is not None:
        feature_matrix = ngram_feature_matrix
    else:
        return
    print "extracting feature matrix done."
    print "处理前样本总数:%d" % len(feature_matrix)

    #print len(feature_matrix)
    #print len(classification)

    features = 400
    fsmodel = SelectKBest(chi2, k=features)
    raw_feature_matrix = feature_matrix
    feature_matrix = fsmodel.fit_transform(feature_matrix, classification)

    pickle.dump(fsmodel, open('dataset/fsmodel', 'wb'))

    features = 300
    svc = SVC(kernel="linear", C=1)
    fsmodel2 = RFE(estimator=svc, n_features_to_select=features, step=1)

    #########################    DEBUG    ############################
    #classification = classification[7:]
    ##################################################################
    feature_matrix = fsmodel2.fit_transform(feature_matrix, classification)

    pickle.dump(fsmodel2, open('dataset/fsmodel2', 'wb'))

    #########################    DEBUG    ############################
    b_s = 5  # if you change this, also change the default value in dl.py
    length = len(feature_matrix)
    feature_matrix = feature_matrix[length % b_s:]
    raw_feature_matrix = raw_feature_matrix[length % b_s:]
    classification = classification[length % b_s:]
    print "处理后样本总数:%d" % len(feature_matrix)  # "total number of samples after processing"
    ##################################################################

    #########################    DEBUG    ############################
    fs_vec = []
    for i in range(len(raw_feature_matrix[0])):
        fs_vec.append(i)  # build a special vector whose values equal the column indices

    fs_vec = fsmodel.transform(fs_vec)
    #print fs_vec
    fs_vec = fsmodel2.transform(fs_vec)
    #print fs_vec

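    # feature_matrix_dl collects the columns NOT kept by feature selection;
    # it becomes the input to the RBM below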
    feature_matrix_dl = [x for x in range(len(raw_feature_matrix))]
    for i in range(len(feature_matrix_dl)):
        feature_matrix_dl[i] = [
            x for x in range(len(raw_feature_matrix[0]) - features)
        ]
    temp = 0
    for i in range(len(raw_feature_matrix[0])):
        if i not in fs_vec:
            print "第%d列特征没有选用" % i
            for j in range(len(feature_matrix_dl)):
                feature_matrix_dl[j][temp] = raw_feature_matrix[j][i]
            temp = temp + 1

    #print "行数%d" % len(feature_matrix_dl)
    #print "列数%d" % len(feature_matrix_dl[0])
    #print feature_matrix_dl

    ##################################################################
    #hiddeny, da = test_dA(feature_matrix_dl, len(feature_matrix_dl[0]))
    # hiddeny2, test = test_dA(feature_matrix,len(feature_matrix[0]), batch_size=6, da_object = da)
    hiddeny, da = test_rbm(feature_matrix_dl, len(feature_matrix_dl[0]))
    #print len(feature_matrix)
    print "浅度特征数:%d" % len(feature_matrix[0])
    #print len(hiddeny)
    print "深度特征数:%d" % len(hiddeny[0])
    # print (hiddeny == hiddeny2).all()

    # persist (pickle) the trained deep model
    pickle.dump(da, open('dataset/rbmmodel', 'wb'))

    # deep feature fusion
    feature_matrix = numpy.concatenate((feature_matrix, hiddeny), axis=1)

    Z = []
    count = 0
    for i in feature_matrix:
        Z.append([])
        for j in i:
            Z[count].append(j)

        count += 1

    feature_matrix = Z

    # print feature_matrix

    Z = []
    for i in classification:
        Z.append(int(i))

    classification = Z

    if 1:
        per_feature_matrix2 = {}
        for each in os.listdir('test/per_feature'):
            path = os.path.join('test/per_feature/', each)
            per_feature_matrix2 = dict(pickle.load(open(path, 'rb')),
                                       **per_feature_matrix2)
        per_feature_matrix2 = per_feature_matrix2.values()
        pickle.dump(per_feature_matrix2, open('test/per_feature_matrix', 'wb'))

    # if os.path.exists('dataset/api_feature_matrix'):
    #     api_feature_matrix = pickle.load(open('dataset/api_feature_matrix', 'rb'))
    # else:
    if 1:
        api_feature_matrix2 = {}
        for each in os.listdir('test/api_feature'):
            path = os.path.join('test/api_feature/', each)
            api_feature_matrix2 = dict(pickle.load(open(path, 'rb')),
                                       **api_feature_matrix2)
        api_feature_matrix2 = api_feature_matrix2.values()
        pickle.dump(api_feature_matrix2, open('test/api_feature_matrix', 'wb'))

    # if os.path.exists('dataset/ngram_feature_matrix'):
    #     ngram_feature_matrix = pickle.load(open('dataset/ngram_feature_matrix', 'rb'))
    # else:
    if 1:
        ngram_feature_matrix2 = {}
        for each in os.listdir('test/ngram_feature'):
            path = os.path.join('test/ngram_feature/', each)
            ngram_feature_matrix2 = dict(pickle.load(open(path, 'rb')),
                                         **ngram_feature_matrix2)
        ngram_feature_matrix2 = ngram_feature_matrix2.values()
        pickle.dump(ngram_feature_matrix2,
                    open('test/ngram_feature_matrix', 'wb'))

    classification2 = pickle.load(open('test/classification', 'rb'))
    if (per_feature_matrix2 is not None and api_feature_matrix2 is not None
            and ngram_feature_matrix2 is not None):
        feature_matrix2 = _concatenate(per_feature_matrix2,
                                       api_feature_matrix2,
                                       ngram_feature_matrix2)
    elif per_feature_matrix2 is not None:
        feature_matrix2 = per_feature_matrix2
    elif api_feature_matrix2 is not None:
        feature_matrix2 = api_feature_matrix2
    elif ngram_feature_matrix2 is not None:
        feature_matrix2 = ngram_feature_matrix2
    else:
        return
    print "extracting feature matrix done."
    print "处理前样本总数:%d" % len(feature_matrix2)

    #print len(feature_matrix)
    #print len(classification)
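    # Two-stage feature selection on the evaluation set, mirroring the
    # training pipeline: chi2 SelectKBest down to 400 columns, then SVC-based
    # RFE down to 300.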

    features = 400
    fsmodel = SelectKBest(chi2, k=features)  # chi2 selector for the evaluation pipeline (originally assigned to fsmodel2, which the RFE below immediately overwrites)
    raw_feature_matrix2 = feature_matrix2
    feature_matrix2 = fsmodel.fit_transform(feature_matrix2, classification2)

    features2 = 300
    svc = SVC(kernel="linear", C=1)
    fsmodel2 = RFE(estimator=svc, n_features_to_select=features2, step=1)
    feature_matrix2 = fsmodel2.fit_transform(feature_matrix2, classification2)

    #########################    DEBUG    ############################
    b_s = 5  # batch size; if you change this, also change the default in dl.py
    length = len(feature_matrix2)
    feature_matrix2 = feature_matrix2[length % b_s:]
    raw_feature_matrix2 = raw_feature_matrix2[length % b_s:]
    classification2 = classification2[length % b_s:]
    print "total samples after processing: %d" % len(feature_matrix2)
    ##################################################################

    #########################    DEBUG    ############################
    fs_vec2 = []
    for i in range(len(raw_feature_matrix2[0])):
        fs_vec2.append(i)  # helper vector whose values equal the column indices

    fs_vec2 = fsmodel.transform(fs_vec2)
    #print fs_vec
    fs_vec2 = fsmodel2.transform(fs_vec2)
    #print fs_vec

    feature_matrix_dl2 = [x for x in range(len(raw_feature_matrix2))]
    for i in range(len(feature_matrix_dl2)):
        feature_matrix_dl2[i] = [
            x for x in range(len(raw_feature_matrix2[0]) - features2)
        ]
    temp = 0
    for i in range(len(raw_feature_matrix2[0])):
        if i not in fs_vec2:
            print "第%d列特征没有选用" % i
            for j in range(len(feature_matrix_dl2)):
                feature_matrix_dl2[j][temp] = raw_feature_matrix2[j][i]
            temp = temp + 1
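    # feature_matrix_dl2 now holds the evaluation-set columns not kept by the
    # selectors; a freshly trained RBM compresses them into deep features below.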

    hiddeny2, da = test_rbm(feature_matrix_dl2, len(feature_matrix_dl2[0]))
    #print len(feature_matrix)
    print "浅度特征数:%d" % len(feature_matrix2[0])
    #print len(hiddeny)
    print "深度特征数:%d" % len(hiddeny2[0])
    # print (hiddeny == hiddeny2).all()

    # deep feature fusion: append the RBM hidden activations to the selected features
    feature_matrix2 = numpy.concatenate((feature_matrix2, hiddeny2), axis=1)

    Z = []
    count = 0
    for i in feature_matrix2:
        Z.append([])
        for j in i:
            Z[count].append(j)

        count += 1

    feature_matrix2 = Z

    # print feature_matrix

    Z = []
    for i in classification2:
        Z.append(int(i))

    classification2 = Z
    '''
    kf = KFold(len(feature_matrix2), n_folds=5, shuffle = True)
    print "\nlearning with RF..."
    rf = RandomForestClassifier(n_estimators=300, min_samples_split=10)
    rf.fit(feature_matrix2, classification2)
    print "Cross Validating..."
    scores = cross_validation.cross_val_score(rf, feature_matrix2, classification2, cv=kf)
    print scores
    print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
    print "learning with RF done.\n"
    pickle.dump(rf, open('dataset/model', 'wb'))  # persist the trained model
    #print 'time :%f'% (time.time() - start)
	
    print "learning with GBDT..."
    gbdt = GradientBoostingClassifier(n_estimators=300, learning_rate=1.0,
    max_depth=100, min_samples_split=10, random_state=0)
    gbdt.fit(feature_matrix2, classification2)
    print "Cross Validating..."
    scores = cross_validation.cross_val_score(gbdt, feature_matrix2, classification2, cv=kf)
    print scores
    print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
    print "learning with GBDT done.\n"
    pickle.dump(gbdt, open('dataset/model2', 'wb'))  # persist the trained model
    #print 'time :%f'% (time.time() - start)
	
    print "learning with AdaBoost..."
    ada = AdaBoostClassifier(n_estimators=300)
    ada.fit(feature_matrix2, classification2)
    print "Cross Validating..."
    scores = cross_validation.cross_val_score(ada, feature_matrix2, classification2, cv=kf)
    print scores
    print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
    print "learning with AdaBoost done.\n"
    pickle.dump(ada, open('dataset/model3', 'wb'))  # persist the trained model
    #print 'time :%f'% (time.time() - start)
	
    print "learning with LogisticRegression..."
    lr = LogisticRegression()
    lr.fit(feature_matrix2, classification2)
    print "Cross Validating..."
    scores = cross_validation.cross_val_score(lr, feature_matrix2, classification2, cv=kf)
    print scores
    print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
    print "learning with LogisticRegression done.\n"
    pickle.dump(lr, open('dataset/model4', 'wb'))  # persist the trained model
    #print 'time :%f'% (time.time() - start)
    
    kf = KFold(len(feature_matrix2), n_folds=5, shuffle = True)
    print "\nlearning with RF..."
    rf = RandomForestClassifier(n_estimators=300, min_samples_split=10)
    rf.fit(feature_matrix2, classification2)
    print "Cross Validating..."
    predicted = cross_validation.cross_val_predict(rf, feature_matrix2, classification2, cv=kf)
    print "Confusion matrix: "
    print metrics.confusion_matrix(classification2, predicted)
    print("Accuracy: %0.3f" % metrics.accuracy_score(classification2, predicted))
    print "Precision: "
    print metrics.precision_score(classification2, predicted, average=None)
    print "Recall: "
    print metrics.recall_score(classification2, predicted, average=None)
    print "F1 "
    print metrics.f1_score(classification2, predicted, average=None)
    print "learning with RF done.\n"
    pickle.dump(rf, open('dataset/model', 'wb'))  # persist the trained model
    #print 'time :%f'% (time.time() - start)
	
    print "learning with GBDT..."
    gbdt = GradientBoostingClassifier(n_estimators=300, learning_rate=1.0,
    max_depth=100, min_samples_split=10, random_state=0)
    gbdt.fit(feature_matrix2, classification2)
    print "Cross Validating..."
    predicted = cross_validation.cross_val_predict(gbdt, feature_matrix2, classification2, cv=kf)
    print "Confusion matrix: "
    print metrics.confusion_matrix(classification2, predicted)
    print("Accuracy: %0.3f" % metrics.accuracy_score(classification2, predicted))
    print "Precision: "
    print metrics.precision_score(classification2, predicted, average=None)
    print "Recall: "
    print metrics.recall_score(classification2, predicted, average=None)
    print "F1 "
    print metrics.f1_score(classification2, predicted, average=None)
    print "learning with GBDT done.\n"
    pickle.dump(gbdt, open('dataset/model2', 'wb'))  # persist the trained model
    #print 'time :%f'% (time.time() - start)
	
    print "learning with AdaBoost..."
    ada = AdaBoostClassifier(n_estimators=300)
    ada.fit(feature_matrix2, classification2)
    print "Cross Validating..."
    predicted = cross_validation.cross_val_predict(ada, feature_matrix2, classification2, cv=kf)
    print "Confusion matrix: "
    print metrics.confusion_matrix(classification2, predicted)
    print("Accuracy: %0.3f" % metrics.accuracy_score(classification2, predicted))
    print "Precision: "
    print metrics.precision_score(classification2, predicted, average=None)
    print "Recall: "
    print metrics.recall_score(classification2, predicted, average=None)
    print "F1 "
    print metrics.f1_score(classification2, predicted, average=None)
    print "learning with AdaBoost done.\n"
    pickle.dump(ada, open('dataset/model3', 'wb'))  # persist the trained model
    #print 'time :%f'% (time.time() - start)
	
    print "learning with LogisticRegression..."
    lr = LogisticRegression()
    lr.fit(feature_matrix2, classification2)
    print "Cross Validating..."
    predicted = cross_validation.cross_val_predict(lr, feature_matrix2, classification2, cv=kf)
    print "Confusion matrix: "
    print metrics.confusion_matrix(classification2, predicted)
    print("Accuracy: %0.3f" % metrics.accuracy_score(classification2, predicted))
    print "Precision: "
    print metrics.precision_score(classification2, predicted, average=None)
    print "Recall: "
    print metrics.recall_score(classification2, predicted, average=None)
    print "F1 "
    print metrics.f1_score(classification2, predicted, average=None)
    print "learning with LogisticRegression done.\n"
    pickle.dump(lr, open('dataset/model4', 'wb'))  # persist the trained model
    #print 'time :%f'% (time.time() - start)
    '''
    '''
    kf = KFold(len(feature_matrix2), n_folds=5, shuffle = True)
    print "\nlearning with SVC..."
    slffork=SVC(kernel='rbf',probability = True)
    slffork.fit(feature_matrix2, classification2)
    print "Cross Validating..."
    predicted = cross_validation.cross_val_predict(slffork, feature_matrix2, classification2, cv=kf)
    print "Confusion matrix: "
    print metrics.confusion_matrix(classification2, predicted)
    print("Accuracy: %0.3f" % metrics.accuracy_score(classification2, predicted))
    print "Precision: "
    print metrics.precision_score(classification2, predicted, average=None)
    print "Recall: "
    print metrics.recall_score(classification2, predicted, average=None)
    print "F1 "
    print metrics.f1_score(classification2, predicted, average=None)
    print "learning with SVC done.\n"
    pickle.dump(slffork, open('dataset/model2', 'wb'))  # persist the trained model
    #print 'time :%f'% (time.time() - start)
    '''
    '''
    print "learning with BaggingClassifier..."
    kf = KFold(len(feature_matrix2), n_folds=5, shuffle = True)
    baggingfork = BaggingClassifier(KNeighborsClassifier(),max_samples=0.5,max_features=0.5)
    baggingfork.fit(feature_matrix2, classification2)
    print "Cross Validating..."
    predicted = cross_validation.cross_val_predict(baggingfork, feature_matrix2, classification2, cv=kf)
    print "Confusion matrix: "
    print metrics.confusion_matrix(classification2, predicted)
    print("Accuracy: %0.3f" % metrics.accuracy_score(classification2, predicted))
    print "Precision: "
    print metrics.precision_score(classification2, predicted, average=None)
    print "Recall: "
    print metrics.recall_score(classification2, predicted, average=None)
    print "F1 "
    print metrics.f1_score(classification2, predicted, average=None)
    print "learning with BaggingClassifier done.\n"
    pickle.dump(baggingfork, open('dataset/model2', 'wb'))  # persist the trained model
    #print 'time :%f'% (time.time() - start)
    '''
    '''kf = KFold(len(feature_matrix2), n_folds=5, shuffle = True)'''
    rf = RandomForestClassifier(n_estimators=300, min_samples_split=10)
    gbdt = GradientBoostingClassifier(n_estimators=300,
                                      learning_rate=1.0,
                                      max_depth=100,
                                      min_samples_split=10,
                                      random_state=0)
    ada = AdaBoostClassifier(n_estimators=300)
    #slf1=SVC(kernel='rbf',probability = True)
    bagging = BaggingClassifier(KNeighborsClassifier(),
                                max_samples=0.5,
                                max_features=0.5)

    print "learning with Voting Classifier..."
    vc = VotingClassifier(estimators=[('rf', rf), ('ada', ada),
                                      ('bagging', bagging), ('gbdt', gbdt)],
                          voting='soft',
                          weights=[1.5, 1.5, 1.3, 1.5])
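    # voting='soft' averages the estimators' predicted class probabilities,
    # weighted per estimator by the weights list above.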
    vc.fit(feature_matrix, classification)
    '''
    print "Cross Validating..."
    predicted = cross_validation.cross_val_predict(vc, feature_matrix2, classification2, cv=kf)
    print "Confusion matrix: "
    print metrics.confusion_matrix(classification2, predicted)
    print("Accuracy: %0.3f" % metrics.accuracy_score(classification2, predicted))
    print "Precision: "
    print metrics.precision_score(classification2, predicted, average=None)
    print "Recall: "
    print metrics.recall_score(classification2, predicted, average=None)
    print "F1 "
    print metrics.f1_score(classification2, predicted, average=None)
    '''
    print "learning with Ensemble Classifier done.\n"
    pickle.dump(vc, open('dataset/model_final', 'wb'))  # persist the trained model
    print 'time :%f' % (time.time() - start)
Exemple #25
0
                                              batch_size=24,
                                              shuffle=False)

    input_dim, hidden_dim1, hidden_dim2, output_dim = 36, 36 * 2, 36, 10

    model = Net(input_dim, hidden_dim1, hidden_dim2, output_dim)
    learning_rate = 0.01
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
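    # Train the 36 -> 72 -> 36 -> 10 fully connected network with plain SGD.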
    training_results = train(model=model,
                             criterion=criterion,
                             train_loader=train_loader,
                             test_loader=test_loader,
                             optimizer=optimizer,
                             epochs=10)

    # Part 3: Restricted Boltzmann Machine
    result_model = test_rbm()

    income_rbm = 0
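    # Simple trading rule on the RBM output: if the mean of the first 37
    # activations for a sample exceeds 0.55, book the 10-step-ahead change in
    # OPEN_Bid as income.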
    for index in range(9900):
        print(index)
        res = result_model[index][:37].mean()
        print(res)

        if res > 0.55:

            income_rbm += (ALL.OPEN_Bid.iloc[index + 10]
                           - ALL.OPEN_Bid.iloc[index])

    print("Income is:", income_rbm)
Exemple #26
0
def executeForAFile(filename, output_file_name, humanExtractedYesOrNo_files, humanExtractGiven):
    
    file = open(filename, 'r') 
    #edited by amritha
    # file = open('./gdrive/My Drive/TextSummarizer/article1','r')
    text = file.read()
    paragraphs = para_reader.show_paragraphs(filename)
    # print("Number of paras : %d",len(paragraphs))
    sentences = split_into_sentences(text)
    text_len = len(sentences)
    humanYesOrNo = []
    
    if not humanExtractGiven:
        # humanYesOrNo = askHuman.humanGenerator(text)
        pass
    else:
        with open(humanExtractedYesOrNo_files) as fileobj:
            for word in fileobj:
                for ch in word:
                    humanYesOrNo.append(ord(ch) - 48)  # '0'/'1' characters -> 0/1 ints
    
    tokenized_sentences = remove_stop_words(sentences)
    tagged = posTagger(remove_stop_words(sentences))

    thematicFeature(tokenized_sentences)
    # print(upperCaseFeature(sentences))
    sentencePosition(paragraphs)

    tfIsfScore = tfIsf(tokenized_sentences)
    similarityScore = similarityScores(tokenized_sentences)

    # print("\n\nProper Noun Score : \n")
    properNounScore = properNounScores(tagged)
    # print(properNounScore)
    centroidSimilarityScore = centroidSimilarity(sentences,tfIsfScore)
    numericTokenScore = numericToken(tokenized_sentences)
    namedEntityRecogScore = namedEntityRecog(sentences)
    sentencePosScore = sentencePos(sentences)
    sentenceLengthScore = sentenceLength(tokenized_sentences)


    featureMatrix = []
    featureMatrix.append(tfIsfScore)
    featureMatrix.append(similarityScore)
    featureMatrix.append(properNounScore)
    featureMatrix.append(centroidSimilarityScore)
    featureMatrix.append(numericTokenScore)
    featureMatrix.append(namedEntityRecogScore)
    featureMatrix.append(sentencePosScore)
    featureMatrix.append(sentenceLengthScore)


    featureMat = np.zeros((len(sentences),8))
    for i in range(8) :
        for j in range(len(sentences)):
            featureMat[j][i] = featureMatrix[i][j]
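    # featureMat is (num_sentences x 8): one row per sentence, one column per
    # feature score computed above; no normalisation is actually applied
    # despite the featureMat_normed name below.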

    featureMat_normed = featureMat
    temp = rbm.test_rbm(dataset=featureMat_normed,
                        learning_rate=0.1,
                        training_epochs=14,
                        batch_size=5,
                        n_chains=5,
                        n_hidden=8)
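    # temp is the RBM-enhanced feature matrix; each row is summed below to
    # give a single relevance score per sentence.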

    enhanced_feature_sum = []
    row_sums = np.sum(temp, axis=1)
    for i in range(len(row_sums)):
        enhanced_feature_sum.append([row_sums[i], i])

    enhanced_feature_sum.sort(key=lambda x: x[0])
    length_to_be_extracted = int(len(enhanced_feature_sum)/2)
    extracted_sentences = []
    extracted_sentences.append([sentences[0], 0])
    indeces_extracted = []
    indeces_extracted.append(0)

    for x in range(length_to_be_extracted) :
        if(enhanced_feature_sum[x][1] != 0) :
            extracted_sentences.append([sentences[enhanced_feature_sum[x][1]], enhanced_feature_sum[x][1]])
            indeces_extracted.append(enhanced_feature_sum[x][1])
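    # The first sentence is always kept; the rest of the extract is the half
    # of the sentences with the smallest enhanced-feature sums (the list was
    # sorted in ascending order above).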

    autoYesOrNo = askHuman.automaticGenerator(indeces_extracted,text_len)
    # Supervised learning
    # precision, recall, Fscore = askHuman.compareHumanAndAutomatic(humanYesOrNo,autoYesOrNo)

    # precision_values.append(precision)
    # recall_values.append(recall)
    # Fscore_values.append(Fscore)

    # print(extracted_sentences)
    extracted_sentences.sort(key=lambda x: x[1])
    # print(extracted_sentences)

    finalText = ""
    # print("\nExtracted Final Text : ")
    for i in range(len(extracted_sentences)):
        # print(extracted_sentences[i][0])
        finalText = finalText + extracted_sentences[i][0]
    
    


    # print("Precision : " + repr(precision) +"\nRecall : " + repr(recall) + "\nFscore : "+ repr(Fscore))
    file = open(output_file_name, "w") 
    file.write(finalText)
    # print(finalText)
    file.close()