    def __init__(self, lsa, code, templates):

        # keep only the (code, document) templates tagged with this code
        match = filter(lambda tpl: code in tpl[0], templates)

        self.lsa = lsa
        # project each matching document into the LSA topic space
        distance_matrix = [lsa.project(d) for c, d in match]
        self.templates = MatrixHelper.gensim_to_numpy_array(
            distance_matrix, lsa.num_topics)
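
The Lsa wrapper referenced above (project, num_topics and, in train() further down, distance_matrix) is not part of this listing. A minimal sketch of what such a wrapper might look like, assuming it wraps gensim's LsiModel and that the TfIdf object exposes a gensim-style corpus as .matrix (both assumptions, not the project's confirmed API):

# Hypothetical Lsa wrapper -- a sketch only; the project's real Lsa class
# may be implemented differently.
from gensim.models import LsiModel


class Lsa(object):
    def __init__(self, tfidf, num_topics=100):
        # tfidf is assumed to expose a gensim-style corpus as .matrix
        # (one list of (term_id, weight) pairs per document)
        self.num_topics = num_topics
        self.model = LsiModel(tfidf.matrix, num_topics=num_topics)
        # one LSA vector per document, kept in gensim (topic_id, weight) form
        self.distance_matrix = [self.model[doc] for doc in tfidf.matrix]

    def project(self, doc):
        # project a single gensim-style document into the LSA topic space
        return self.model[doc]
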
def get_data(xs):
    import WordTokenizer
    import TermFrequency
    import MatrixHelper
    import Converter
    
    tokenized_docs = WordTokenizer.tokenize(xs.documents, min_word_count=5)
    tf = TermFrequency.TermFrequency(tokenized_docs)

    arr = MatrixHelper.gensim_to_numpy_array(tf.distance_matrix, None, 0, Converter.to_binary)
    return arr
Example 4
def get_binary_data(xs):
    import WordTokenizer
    import TermFrequency
    import MatrixHelper
    import Converter

    tokenized_docs = WordTokenizer.tokenize(xs.documents, min_word_count=5)
    tf = TermFrequency.TermFrequency(tokenized_docs)

    arr = MatrixHelper.gensim_to_numpy_array(tf.distance_matrix, None, 0,
                                             Converter.to_binary)
    return arr
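
Converter is imported but not shown in this listing. Its two helpers used here can be inferred from the inline versions that appear later (to_binary inside test_dA, get_svm_val inside main); a minimal sketch under that assumption:

# Converter -- sketch mirroring the inline helpers defined later in this
# listing; the real module may differ.

def to_binary(val):
    # map any positive term weight to 1.0, everything else to 0.0
    if val <= 0:
        return 0.0
    return 1.0


def get_svm_val(x):
    # map a 0/1 label to the -1/+1 convention expected by the SVM code
    if x <= 0:
        return -1
    return 1
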
def train():

    #SETTINGS
    cv_folds = 10
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    settings = Settings.Settings()
    results_dir = settings.results_directory + GwData.FOLDER
    num_lsa_topics = 100

    #TOKENIZE
    xs = GwData.GwData()
    tokenized_docs = WordTokenizer.tokenize(xs.documents, min_word_count=5)
    tfidf = TfIdf.TfIdf(tokenized_docs)
    lsa = Lsa.Lsa(tfidf, num_topics=num_lsa_topics)

    #NLTK SVM linear kernel
    xs = MatrixHelper.gensim_to_numpy_array(lsa.distance_matrix,
                                            initial_value=0)

    total_recall, total_precision, total_f1 = 0.0, 0.0, 0.0

    all_results = "LSA Dimensions: " + str(num_lsa_topics)
    print all_results

    processed_code_count = 0
    #MIN_CODE_COUNT = 5

    MIN_CODE_COUNT = 1

    codes = [
        c for c in xs.sm_codes
        # Exclude pure vague codes
        if c != "v" and
        # Exclude doc codes. Need whole doc to classify them
        not c.startswith("s")
    ]

    for code in codes:

        code_count = xs.sm_code_count[code]
        if code_count <= MIN_CODE_COUNT:
            continue

        processed_code_count += 1
        labels = map(Converter.get_svm_val, xs.labels_for(code))
        classifier = svm.LinearSVC(C=1)
        recall, precision, f1_score = cross_validation_score(xs,
                                                             labels,
                                                             classifier,
                                                             cv_folds,
                                                             class_value=1.0)
        results = "Code: {0} Count: {1}, Recall: {2}, Precision: {3}, F1: {4}\n".format(
            code.ljust(10), code_count, recall, precision, f1_score)

        all_results += results
        total_recall += recall
        total_precision += precision
        total_f1 += f1_score

        print results,

    #num_codes = len(xs.sm_codes)
    num_codes = processed_code_count
    result = "AGGREGATE\n\t Recall: {0}, Precision: {1}, F1: {2}\n".format(
        total_recall / num_codes, total_precision / num_codes,
        total_f1 / num_codes)
    all_results += result
    print result

    #DUMP TO FILE
    fName = results_dir + "Codes_ClassifyUsing_SVM_with_EssayBasedLSA_Dims_" + str(
        num_lsa_topics) + ".txt"
    handle = open(fName, mode="w+")
    handle.write(all_results)
    handle.close()
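
cross_validation_score is a project helper that is not included in this listing. Judging from how train() calls it, it runs k-fold cross validation and returns recall, precision and F1 for the positive class; the sketch below is one plausible implementation, not the project's actual code.

# Hypothetical cross_validation_score -- a sketch only, inferred from the
# call in train(); the project's real helper may differ.
import numpy as np


def cross_validation_score(xs, labels, classifier, cv_folds, class_value=1.0):
    xs = np.asarray(xs)
    ys = np.asarray(list(labels))
    indices = np.arange(len(ys))
    folds = np.array_split(indices, cv_folds)

    recalls, precisions, f1s = [], [], []
    for fold in folds:
        train_idx = np.setdiff1d(indices, fold)
        classifier.fit(xs[train_idx], ys[train_idx])
        predicted = classifier.predict(xs[fold])
        actual = ys[fold]

        tp = float(np.sum((predicted == class_value) & (actual == class_value)))
        fp = float(np.sum((predicted == class_value) & (actual != class_value)))
        fn = float(np.sum((predicted != class_value) & (actual == class_value)))

        recall = tp / (tp + fn) if tp + fn else 0.0
        precision = tp / (tp + fp) if tp + fp else 0.0
        f1 = (2.0 * precision * recall / (precision + recall)
              if precision + recall else 0.0)

        recalls.append(recall)
        precisions.append(precision)
        f1s.append(f1)

    # mean scores across folds, in the order train() unpacks them
    return np.mean(recalls), np.mean(precisions), np.mean(f1s)
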
Example 6
def main():

    #SETTINGS
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    settings = Settings.Settings()
    results_dir = settings.results_directory + GwData.FOLDER

    #TOKENIZE
    data = GwData.GwData()
    tokenized_docs = WordTokenizer.tokenize(data.documents, min_word_count=5)
    tfidf = TfIdf.TfIdf(tokenized_docs)

    #NLTK Decision Tree
    np_matrix = MatrixHelper.gensim_to_numpy_array(tfidf.matrix,
                                                   initial_value=0)

    labels = data.causal_per_document

    def get_svm_val(x):
        if x <= 0:
            return -1
        return 1

    labels = map(get_svm_val, labels)

    td_size = int(0.75 * len(np_matrix))

    td_x = np_matrix[:td_size]
    td_y = labels[:td_size]

    vd_x = np_matrix[td_size:]
    vd_y = labels[td_size:]

    rng = array(range(1, 21, 1))

    c_vals = rng / 10.0

    all_results = ""
    for c in c_vals:
        classifier = svm.LinearSVC(C=c)
        classifier.fit(td_x, td_y)

        #RESULTS
        classifications = classifier.predict(vd_x)

        results = "\nC VALUE: " + str(c) + "\n"
        results += ResultsHelper.rfp(vd_y, classifications)
        print results

        all_results += results
    #print "EXPLAIN:\n"
    #me.explain(condensed_data[0], 100)

    #DUMP TO FILE
    fName = results_dir + "Causal_Relation_SVM.txt"
    handle = open(fName, mode="w+")
    handle.write(all_results)
    handle.close()

    #binary_matrix = term_freq.binary_matrix()
    #decision_tree = tree.DecisionTreeClassifier(criterion = 'entropy')
    #decision_tree.fit(binary_matrix, labels)

    # Test with CL1 labels
    raw_input("Press Enter to quit")
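
ResultsHelper.rfp is likewise not shown here; from its use above it appears to format recall, precision and F1 of the predictions against the validation labels as a text block. A hypothetical sketch:

# Hypothetical ResultsHelper.rfp -- a sketch only; the real helper may
# compute or format its report differently.

def rfp(expected, predicted, class_value=1):
    pairs = list(zip(expected, predicted))
    tp = sum(1 for e, p in pairs if p == class_value and e == class_value)
    fp = sum(1 for e, p in pairs if p == class_value and e != class_value)
    fn = sum(1 for e, p in pairs if p != class_value and e == class_value)

    recall = tp / float(tp + fn) if tp + fn else 0.0
    precision = tp / float(tp + fp) if tp + fp else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0

    return "Recall: {0}, Precision: {1}, F1: {2}\n".format(recall, precision, f1)
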
Example 9
    def get_sparse_matrix_data(self, distance_matrix, id2word):
        return MatrixHelper.gensim_to_numpy_array(distance_matrix, initial_value=0)
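
Every example in this listing funnels its gensim-style corpus through MatrixHelper.gensim_to_numpy_array. From the call sites above, the positional arguments appear to be (corpus, num_terms, initial_value, converter), with initial_value also accepted as a keyword; a sketch under those assumptions:

# MatrixHelper.gensim_to_numpy_array -- sketch inferred from the call sites
# in this listing; the project's real implementation may differ.
import numpy as np


def gensim_to_numpy_array(corpus, num_terms=None, initial_value=0, converter=None):
    # densify a gensim-style corpus (one list of (term_id, weight) pairs per
    # document) into a 2-D numpy array, optionally mapping each weight
    docs = list(corpus)
    if num_terms is None:
        # infer the width from the largest term id seen in the corpus
        num_terms = 1 + max(term_id for doc in docs for term_id, _ in doc)

    arr = np.full((len(docs), num_terms), initial_value, dtype=float)
    for row, doc in enumerate(docs):
        for term_id, weight in doc:
            arr[row, term_id] = converter(weight) if converter is not None else weight
    return arr
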
Example 10
def test_dA(learning_rate=0.5, training_epochs=100, batch_size=270):
    """
    This demo is tested on MNIST

    :type learning_rate: float
    :param learning_rate: learning rate used for training the DeNosing
                          AutoEncoder

    :type training_epochs: int
    :param training_epochs: number of epochs used for training

    :type dataset: string
    :param dataset: path to the picked dataset

    """

    xs = GwData.GwData()
    tokenized_docs = WordTokenizer.tokenize(xs.documents, min_word_count=5)
    tf = TermFrequency.TermFrequency(tokenized_docs)

    def to_binary(val):
        if val <= 0:
            return float(0)
        return float(1)

    # densify the term-frequency corpus as 0/1 vectors, dropping the last 94 documents
    distance_matrix = MatrixHelper.gensim_to_numpy_array(
        tf.distance_matrix, None, 0, to_binary)[0:-94]

    cols = len(distance_matrix[0])
    rows = len(distance_matrix)

    train_set_x = theano.shared(numpy.asarray(distance_matrix,
                                              dtype=theano.config.floatX),
                                borrow=True)

    #distance_matrix = numpy.ndarray((rows,cols), distance_matrix)
    #distance_matrix = shared(distance_matrix)

    # compute the number of minibatches for training
    n_train_batches = int(len(distance_matrix) / batch_size)

    hidden = 300
    corruption_level = 0.0

    # allocate symbolic variables for the data
    index = T.iscalar()  # index to a [mini]batch
    x = T.matrix('x')  # each row of the data is a binary term vector

    ####################################
    # BUILDING THE MODEL NO CORRUPTION #
    ####################################

    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2**30))

    da = dA(numpy_rng=rng,
            theano_rng=theano_rng,
            input=x,
            n_visible=cols,
            n_hidden=hidden)

    cost, updates = da.get_cost_updates(corruption_level=corruption_level,
                                        learning_rate=learning_rate)

    train_da = theano.function(
        [index],
        cost,
        updates=updates,
        givens={x: train_set_x[index * batch_size:(index + 1) * batch_size]})

    start_time = time.clock()

    ############
    # TRAINING #
    ############

    # go through training epochs
    for epoch in xrange(training_epochs):
        # go through the training set
        c = []
        for batch_index in xrange(n_train_batches):
            c.append(train_da(batch_index))

        print 'Training epoch %d, cost ' % epoch, numpy.mean(c)

    end_time = time.clock()

    training_time = (end_time - start_time)

    print >> sys.stderr, ('The no corruption code for file ' +
                          os.path.split(__file__)[1] + ' ran for %.2fm' %
                          ((training_time) / 60.))
    #####################################
    # BUILDING THE MODEL CORRUPTION 30% #
    #####################################

    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2**30))

    da = dA(numpy_rng=rng,
            theano_rng=theano_rng,
            input=x,
            n_visible=cols,
            n_hidden=hidden)

    cost, updates = da.get_cost_updates(corruption_level=0.3,
                                        learning_rate=learning_rate)

    train_da = theano.function(
        [index],
        cost,
        updates=updates,
        givens={x: train_set_x[index * batch_size:(index + 1) * batch_size]})

    start_time = time.clock()

    ############
    # TRAINING #
    ############

    # go through training epochs
    for epoch in xrange(training_epochs):
        # go through the training set
        c = []
        for batch_index in xrange(n_train_batches):
            c.append(train_da(batch_index))

        print 'Training epoch %d, cost ' % epoch, numpy.mean(c)

    end_time = time.clock()

    training_time = (end_time - start_time)

    print >> sys.stderr, ('The 30% corruption code for file ' +
                          os.path.split(__file__)[1] + ' ran for %.2fm' %
                          (training_time / 60.))