def split_by_class(dir, pctTest=10):
    if not dir.endswith("\\"):
        dir = dir + "\\"

    data = GwData.GwData()
    pctTrain = (1.0 - (pctTest / 100.0))

    for code in data.sm_codes:
        s = data.sentences_for_code(code)
        not_s = data.sentences_not_for_code(code)

        train_s_cnt = int(len(s) * pctTrain)
        train_s = s[:train_s_cnt]
        test_s = s[train_s_cnt:]

        train_not_s_cnt = int(len(not_s) * pctTrain)
        train_not_s = not_s[:train_not_s_cnt]
        test_not_s = not_s[train_not_s_cnt:]

        friendly_code = code.replace(".", "_")
        write_to_file(dir, friendly_code, friendly_code + ".txt", train_s)
        write_to_file(dir, friendly_code, "test_" + friendly_code + ".txt", test_s)
        write_to_file(dir, friendly_code, "not_" + friendly_code + ".txt", train_not_s)
        write_to_file(dir, friendly_code, "test_not_" + friendly_code + ".txt", test_not_s)

    print "Done"
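# split_by_class above relies on a write_to_file(base_dir, sub_dir, file_name, lines)
# helper that is not part of this snippet. A minimal sketch under that assumption
# (the name, signature and one-sentence-per-line output format are guesses, not the
# original implementation):
import os


def write_to_file(base_dir, sub_dir, file_name, sentences):
    # Write one sentence per line to <base_dir>/<sub_dir>/<file_name>,
    # creating the sub-folder if it does not already exist.
    folder = os.path.join(base_dir, sub_dir)
    if not os.path.exists(folder):
        os.makedirs(folder)
    handle = open(os.path.join(folder, file_name), "w+")
    try:
        handle.write("\n".join(sentences))
    finally:
        handle.close()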
def main():
    # SETTINGS
    best_n_words = 10000
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    settings = Settings.Settings()
    results_dir = settings.results_directory + GwData.FOLDER

    # TOKENIZE
    data = GwData.GwData()
    tokenized_docs = WordTokenizer.tokenize(data.documents, min_word_count=5)
    term_freq = TermFrequency.TermFrequency(tokenized_docs)

    # NLTK Decision Tree
    list_of_dicts = Converter.vector_space_to_dict_list(
        term_freq.matrix, term_freq.id2Word, Converter.to_binary)
    labels = data.causal_per_document
    causal_count = sum(labels)

    relative_word_frequency = DocumentFrequency.document_frequency_ratio(
        list_of_dicts, labels, lambda l: l == 1)
    condensed_data = extract_best_n_words(relative_word_frequency,
                                          best_n_words, list_of_dicts)

    labelled_data = zip(condensed_data, labels)

    td_size = int(0.75 * len(labelled_data))
    training_data = labelled_data[:td_size]
    validation_data = labelled_data[td_size:]

    dt = nltk.DecisionTreeClassifier.train(training_data)

    # RESULTS
    classifications = [dt.classify(rcd) for rcd, lbl in validation_data]
    results = ResultsHelper.rfp(labels[td_size:], classifications)
    results += "Num Words Used : " + str(best_n_words) + "\n"
    results += "\n"

    error = dt.error(labelled_data)
    results += "ERROR: : " + str(error * 100) + "%\n"
    results += "\n"

    results += "PSEUDOCODE:\n"
    results += dt.pseudocode(depth=1000) + "\n"
    print results

    # DUMP TO FILE
    fName = results_dir + "Causal_Relation_DT.txt"
    handle = open(fName, mode="w+")
    handle.write(results)
    handle.close()

    #binary_matrix = term_freq.binary_matrix()
    #decision_tree = tree.DecisionTreeClassifier(criterion = 'entropy')
    #decision_tree.fit(binary_matrix, labels)

    # Test with CL1 labels
    raw_input("Press Enter to quit")
def test_learner_on_data():
    import GwData
    import WordTokenizer

    code = "50"

    data = GwData.GwData()
    xs = WordTokenizer.tokenize(data.documents, spelling_correct=False)
    ys = data.labels_for(code)

    def rule_score_fn(act_ys, predicted):
        return precision(act_ys, predicted) * (recall(act_ys, predicted) ** 0.5)

    learner = RegExLearner(precision, f1_score, 2.5)
    learner.fit(xs, ys)
    pred = learner.predict(xs)

    # TD Performance
    print_positives(xs, ys)
    r, p, f1 = rpf1(ys, pred)
    print "TD:\n\tRecall: {0}\n\tPrecision: {1}\n\tF1: {2}\n".format(r, p, f1)

    print str(learner)
    pass
def test_learner_on_data():
    import GwData
    import WordTokenizer
    import numpy as np

    MINIMUM_COVERAGE_PCT = 2.0
    code = "53"

    print "Learning rules for code: " + code
    # '%%' is how you print a '%' in python given that it is a special char
    print "Minimum coverage: %d%%\n" % (MINIMUM_COVERAGE_PCT)

    data = GwData.GwData()
    xs = WordTokenizer.tokenize(data.documents,
                                stem=False,
                                spelling_correct=False,
                                remove_stop_words=False,
                                min_word_count=1)
    ys = data.labels_for(code)

    def rule_score_fn(act_ys, predicted):
        r, p, f1 = rpf1(act_ys, predicted)
        return r * (p ** 0.5)

    shuffled_ixs = np.array(range(len(xs)))
    np.random.shuffle(shuffled_ixs)

    shuffled_xs = np.array(xs)[shuffled_ixs]
    shuffled_ys = np.array(ys)[shuffled_ixs]

    td_size = int(len(xs) * 0.9)
    td_xs, td_ys = shuffled_xs[0:td_size], shuffled_ys[0:td_size]
    vd_xs, vd_ys = shuffled_xs[td_size:], shuffled_ys[td_size:]

    assert len(td_xs) + len(vd_xs) == len(xs), "|TD| + |VD| == |D|"

    learner = RegExLearner(precision, f1_score, MINIMUM_COVERAGE_PCT)
    learner.fit(td_xs, td_ys)

    print_positives(xs, ys)
    print str(learner)

    # TD Performance
    td_pred = learner.predict(td_xs)
    r, p, f1 = rpf1(td_ys, td_pred)
    print "TD:\n\tRecall: {0}\n\tPrecision: {1}\n\tF1: {2}\n".format(r, p, f1)

    # VD performance
    vd_pred = learner.predict(vd_xs)
    r, p, f1 = rpf1(vd_ys, vd_pred)
    print "VD:\n\tRecall: {0}\n\tPrecision: {1}\n\tF1: {2}\n".format(r, p, f1)
    pass
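# rpf1(actual, predicted) used above is assumed to return a (recall, precision, f1)
# triple for the positive class. A minimal sketch under that assumption (not the
# original helper):
def rpf1(actual, predicted):
    # Count true positives, false positives and false negatives for label 1.
    tp = sum(1 for a, p in zip(actual, predicted) if a == 1 and p == 1)
    fp = sum(1 for a, p in zip(actual, predicted) if a != 1 and p == 1)
    fn = sum(1 for a, p in zip(actual, predicted) if a == 1 and p != 1)
    r = tp / float(tp + fn) if (tp + fn) > 0 else 0.0
    p = tp / float(tp + fp) if (tp + fp) > 0 else 0.0
    f1 = 2.0 * p * r / (p + r) if (p + r) > 0 else 0.0
    return r, p, f1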
def test_on_data():
    import GwData
    import WordTokenizer
    import TfIdf
    import Converter
    import MatrixHelper

    data = GwData.GwData()
    tokenized = WordTokenizer.tokenize(data.documents)
    tfidf = TfIdf.TfIdf(data.documents)
def run_supervised():
    import GwData

    gwData = GwData.GwData()
    xs = get_data(gwData)

    def flip(i):
        if i == 0:
            return 1
        return 0

    ys = [[lbl, flip(lbl)] for lbl in gwData.labels_for("50")]

    xs = np.array(xs)
    ys = np.array(ys)

    td_size = 2500
    td_x = xs[0:td_size]
    vd_x = xs[td_size:]

    dbnetwork = DeepNet([td_x.shape[1], 600, 400],
                        ['sigmoid', 'sigmoid', 'sigmoid'])
    dbnetwork.train(td_x, [1000, 1000], [0.1, 0.1])
    out = dbnetwork.run_through_network(xs)

    top_layer = backprop.NeuralNet(
        layer_sizes=[out.shape[1], int(out.shape[1]), 2],
        layer_types=['sigmoid', 'sigmoid', 'sigmoid'])

    o_td_x = out[0:td_size]
    o_vd_x = out[td_size:]

    td_y = ys[0:td_size]
    vd_y = ys[td_size:]

    top_layers = top_layer.train(top_layer.network, o_td_x, td_y, o_vd_x, vd_y,
                                 10, 'classification', 'crossEntropy', 0, 25)

    #TODO We need to train a top layer neural network from the top DBNN layer to the output
    #TODO Then we create a final network composed of the two concatenated together

    mlp = to_feed_forward_network(dbnetwork, top_layers)
    trained = mlp.train(mlp.network, td_x, td_y, vd_x, vd_y,
                        max_iter=30,
                        validErrFunc='classification',
                        targetCost='crossEntropy')

    print out.shape
    np.save('output.npy', out)
def get_data():
    import GwData
    import WordTokenizer
    import TermFrequency
    import MatrixHelper
    import Converter

    data = GwData.GwData()
    tokenized_docs = WordTokenizer.tokenize(data.documents, min_word_count=5)
    tf = TermFrequency.TermFrequency(tokenized_docs)
    ts = MatrixHelper.gensim_to_numpy_array(tf.distance_matrix, None, 0,
                                            Converter.to_binary)
    return ts
def main():
    # SETTINGS
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    settings = Settings.Settings()
    results_dir = settings.results_directory + GwData.FOLDER

    # TOKENIZE
    data = GwData.GwData()
    tokenized_docs = WordTokenizer.tokenize(data.documents, min_word_count=5)
    term_freq = TermFrequency.TermFrequency(tokenized_docs)
    #tfidf = TfIdf.TfIdf(tokenized_docs)

    # NLTK Maximum Entropy classifier
    list_of_dicts = Converter.vector_space_to_dict_list(
        term_freq.distance_matrix, term_freq.id2Word, Converter.to_binary)
    #list_of_dicts = Converter.vector_space_to_dict_list(tfidf.matrix, tfidf.id2Word)

    labels = data.causal_per_document
    labelled_data = zip(list_of_dicts, labels)

    td_size = int(0.75 * len(labelled_data))
    training_data = labelled_data[:td_size]
    validation_data = labelled_data[td_size:]

    me = nltk.MaxentClassifier.train(training_data, algorithm="GIS")

    # RESULTS
    classifications = [me.classify(rcd) for rcd, lbl in validation_data]
    results = ResultsHelper.rfp(labels[td_size:], classifications)
    print results

    #print "EXPLAIN:\n"
    #me.explain(condensed_data[0], 100)

    # DUMP TO FILE
    fName = results_dir + "Causal_Relation_MaxEnt.txt"
    handle = open(fName, mode="w+")
    handle.write(results)
    handle.close()

    #binary_matrix = term_freq.binary_matrix()
    #decision_tree = tree.DecisionTreeClassifier(criterion = 'entropy')
    #decision_tree.fit(binary_matrix, labels)

    # Test with CL1 labels
    raw_input("Press Enter to quit")
def __init__(self):
    import GwData
    import WordTokenizer
    from py_word2vec import Word2Vec

    data = GwData.GwData()
    # Ensure we train on all words here (as a sequence model), stem matches
    # above setting, and we do NOT remove stop words (breaks sequencing).
    # spelling correct must match below
    tokenized_docs = WordTokenizer.tokenize(data.documents,
                                            min_word_count=1,
                                            stem=stem,
                                            remove_stop_words=False,
                                            spelling_correct=spelling_correct,
                                            number_fn=NumberStrategy.collapse_num)
    self.wd2vec = Word2Vec(tokenized_docs, topics, min_count=2)
def main():
    # SETTINGS
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    settings = Settings.Settings()
    results_dir = settings.results_directory + GwData.FOLDER

    # TOKENIZE
    data = GwData.GwData()
    tokenized_docs = WordTokenizer.tokenize(data.documents, min_word_count=5)
    term_freq = TermFrequency.TermFrequency(tokenized_docs)

    # NB
    list_of_dicts = Converter.vector_space_to_dict_list(
        term_freq.distance_matrix, term_freq.id2Word, Converter.to_binary)
    labels = data.causal_per_document
    labelled_data = zip(list_of_dicts, labels)

    td_size = int(0.75 * len(labelled_data))
    training_data = labelled_data[:td_size]
    validation_data = labelled_data[td_size:]

    nb = nltk.NaiveBayesClassifier.train(training_data)

    # RESULTS
    classifications = [nb.classify(rcd) for rcd, lbl in validation_data]
    results = ResultsHelper.rfp(labels[td_size:], classifications)

    results += "(100) MOST INFORMATIVE FEATURES:\n"
    features = nb.most_informative_features(100)
    for i, (f, val) in enumerate(features):
        results += "\t" + str(i + 1) + " : " + f + " -> " + str(val) + "\n"

    print results

    # DUMP TO FILE
    fName = results_dir + "Causal_Relation_NB.txt"
    handle = open(fName, mode="w+")
    handle.write(results)
    handle.close()

    #binary_matrix = term_freq.binary_matrix()
    #decision_tree = tree.DecisionTreeClassifier(criterion = 'entropy')
    #decision_tree.fit(binary_matrix, labels)

    # Test with CL1 labels
    raw_input("Press Enter to quit")
def __init__(self):
    QMainWindow.__init__(self)
    Ui_MainWindow.__init__(self)
    self.setupUi(self)
    self.setWindowTitle('GAMEWIKI')

    # set up configuration
    self.config = GwConfig.GwConfig(self)
    self.config.Load()
    self.data = GwData.GwData(self)

    # connect buttons to functions
    self.actionOpen_Project.triggered.connect(self.data.OpenProjectDialog)

    # initialize tree
    self.data.SetProjectPath(self.config.LastProject)
    self.treeView_2.clicked.connect(self.data.OpenClickedFile)

    # set up a var for our last open path
    self.openFilePath = ''

    # set up text processing
    self.parser = GwParse.GwParse(self)
    self.text = GwText.GwText(self)
def __init__(self, lsa, k, code):
    self.lsa = lsa
    self.k = k
    self.code = code

    self.data = GwData.GwData(load_essays=False, load_source=True)
    tokenized_docs = self.data.documents
    self.labels = self.data.labels_for(code)

    if code != "bck":
        bck_codes = self.data.labels_for("bck")
        tokenized_docs = [
            d for i, d in enumerate(self.data.documents)
            if bck_codes[i] == 0
        ]
        self.labels = [
            lbl for i, lbl in enumerate(self.labels)
            if bck_codes[i] == 0
        ]

    tokenized_docs = WordTokenizer.tokenize(tokenized_docs)
    self.distance_matrix = self.lsa.project_matrix(tokenized_docs)
def get_data(self, settings): return GwData.GwData(directory=settings.data_directory + "\\" + GwData.FOLDER)
def train():
    # SETTINGS
    cv_folds = 10
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    settings = Settings.Settings()
    results_dir = settings.results_directory + GwData.FOLDER
    num_lsa_topics = 100

    # TOKENIZE
    data = GwData.GwData()
    tokenized_docs = WordTokenizer.tokenize(data.documents, min_word_count=5)
    tfidf = TfIdf.TfIdf(tokenized_docs)
    lsa = Lsa.Lsa(tfidf, num_topics=num_lsa_topics)

    # SVM with a linear kernel over the LSA projection
    xs = MatrixHelper.gensim_to_numpy_array(lsa.distance_matrix, initial_value=0)

    total_recall, total_precision, total_f1 = 0.0, 0.0, 0.0

    all_results = "LSA Dimensions: " + str(num_lsa_topics)
    print all_results

    processed_code_count = 0
    #MIN_CODE_COUNT = 5
    MIN_CODE_COUNT = 1

    codes = [
        c for c in data.sm_codes
        # Exclude pure vague codes
        if c != "v" and
        # Exclude doc codes. Need whole doc to classify them
        not c.startswith("s")
    ]

    for code in codes:
        code_count = data.sm_code_count[code]
        if code_count <= MIN_CODE_COUNT:
            continue

        processed_code_count += 1
        labels = map(Converter.get_svm_val, data.labels_for(code))
        classifier = svm.LinearSVC(C=1)

        recall, precision, f1_score = cross_validation_score(xs, labels,
                                                             classifier,
                                                             cv_folds,
                                                             class_value=1.0)
        results = "Code: {0} Count: {1}, Recall: {2}, Precision: {3}, F1: {4}\n".format(
            code.ljust(10), code_count, recall, precision, f1_score)
        all_results += results

        total_recall += recall
        total_precision += precision
        total_f1 += f1_score

        print results,

    #num_codes = len(data.sm_codes)
    num_codes = processed_code_count
    result = "AGGREGATE\n\t Recall: {0}, Precision: {1}, F1: {2}\n".format(
        total_recall / num_codes, total_precision / num_codes,
        total_f1 / num_codes)
    all_results += result
    print result

    # DUMP TO FILE
    fName = results_dir + "Codes_ClassifyUsing_SVM_with_EssayBasedLSA_Dims_" + str(
        num_lsa_topics) + ".txt"
    handle = open(fName, mode="w+")
    handle.write(all_results)
    handle.close()
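# cross_validation_score used above is not shown in this snippet. A rough sketch,
# assuming it runs k-fold cross-validation with the supplied classifier and returns
# the mean recall, precision and F1 for the given positive class_value (the
# signature and behaviour here are assumptions, not the original helper):
import numpy as np


def cross_validation_score(xs, labels, classifier, folds, class_value=1.0):
    xs, labels = np.asarray(xs), np.asarray(labels)
    fold_ixs = np.array_split(np.arange(len(xs)), folds)
    recalls, precisions, f1s = [], [], []
    for test_ix in fold_ixs:
        # Train on everything outside the held-out fold, test on the fold.
        train_ix = np.setdiff1d(np.arange(len(xs)), test_ix)
        classifier.fit(xs[train_ix], labels[train_ix])
        pred = classifier.predict(xs[test_ix])
        actual = labels[test_ix]
        tp = float(np.sum((actual == class_value) & (pred == class_value)))
        fp = float(np.sum((actual != class_value) & (pred == class_value)))
        fn = float(np.sum((actual == class_value) & (pred != class_value)))
        r = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        p = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        f1 = 2.0 * p * r / (p + r) if (p + r) > 0 else 0.0
        recalls.append(r)
        precisions.append(p)
        f1s.append(f1)
    return np.mean(recalls), np.mean(precisions), np.mean(f1s)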
def get_data(self, settings):
    return GwData.GwData(directory=settings.data_directory + self.sub_dir())
def main():
    # SETTINGS
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    settings = Settings.Settings()
    results_dir = settings.results_directory + GwData.FOLDER

    # TOKENIZE
    data = GwData.GwData()
    tokenized_docs = WordTokenizer.tokenize(data.documents, min_word_count=5)
    tfidf = TfIdf.TfIdf(tokenized_docs)

    # Linear SVM over the tf-idf matrix
    np_matrix = MatrixHelper.gensim_to_numpy_array(tfidf.matrix, initial_value=0)
    labels = data.causal_per_document

    def get_svm_val(x):
        if x <= 0:
            return -1
        return 1

    labels = map(get_svm_val, labels)

    td_size = int(0.75 * len(np_matrix))
    td_x = np_matrix[:td_size]
    td_y = labels[:td_size]
    vd_x = np_matrix[td_size:]
    vd_y = labels[td_size:]

    rng = array(range(1, 21, 1))
    c_vals = rng / 10.0

    all_results = ""
    for c in c_vals:
        classifier = svm.LinearSVC(C=c)
        classifier.fit(td_x, td_y)

        # RESULTS
        classifications = classifier.predict(vd_x)

        results = "\nC VALUE: " + str(c) + "\n"
        results += ResultsHelper.rfp(vd_y, classifications)
        print results
        all_results += results

    #print "EXPLAIN:\n"
    #me.explain(condensed_data[0], 100)

    # DUMP TO FILE
    fName = results_dir + "Causal_Relation_SVM.txt"
    handle = open(fName, mode="w+")
    handle.write(all_results)
    handle.close()

    #binary_matrix = term_freq.binary_matrix()
    #decision_tree = tree.DecisionTreeClassifier(criterion = 'entropy')
    #decision_tree.fit(binary_matrix, labels)

    # Test with CL1 labels
    raw_input("Press Enter to quit")
print "I/O error({0}): {1}".format(e.errno, e.strerror) except: print "Unexpected error:", sys.exc_info()[0] def tokenize(documents, min_word_count=5, stem=True, lemmatize=False, remove_stop_words=True, spelling_correct=True, number_fn=None): tokenizer = WordTokenizer(min_word_count=min_word_count, stem=stem, lemmatize=lemmatize, remove_stop_words=remove_stop_words, spelling_correct=spelling_correct, number_fn=number_fn) return tokenizer.tokenize(documents) if __name__ == "__main__": import GwData data = GwData.GwData() tokens = tokenize(data.documents, stem=True, remove_stop_words=True, spelling_correct=True) pass
def get_binary_data():
    import GwData
    ts = GwData.GwData().as_binary()
    return ts
def get_binary_data(self):
    return GwData.GwData(load_essays=True, load_source=False)
from SDA_Layers import *
import numpy as np

if __name__ == "__main__":
    import GwData

    # as_binary() is an instance method elsewhere in this code base, so
    # instantiate GwData before calling it.
    data = GwData.GwData().as_binary()
    fullData = GwData.GwData()
    y = np.asarray([[l] for l in fullData.labels_for("50")])

    autoencoder = StackedDA([300], alpha=0.1)
    autoencoder.pre_train(data, 50)
    autoencoder.finalLayer(y, 10, 1)
    autoencoder.fine_tune(data, y, 50)
    pass
        return self.distance_matrix[self.words[wd]].flatten().tolist()[0]

    def project(self, item):
        if type(item) == type(""):
            return self.project(item)

        l = []
        for w in item:
            if w in self.words:
                l.append(self.project(w))
        return l


if __name__ == "__main__":
    import GwData
    import TfIdf
    import WordTokenizer

    e = Embeddings()
    d = GwData.GwData()
    tokenized_docs = WordTokenizer.tokenize(d.documents,
                                            min_word_count=1,
                                            stem=False,
                                            remove_stop_words=False)
    tf = TfIdf.TfIdf(tokenized_docs)

    ewds = set(e.words)
    dwds = set([w for w in tf.id2Word.values()])
    pass
def test_dA(learning_rate=0.5, training_epochs=100, batch_size=270):
    """
    This demo is tested on MNIST

    :type learning_rate: float
    :param learning_rate: learning rate used for training the DeNoising
                          AutoEncoder

    :type training_epochs: int
    :param training_epochs: number of epochs used for training

    :type dataset: string
    :param dataset: path to the pickled dataset
    """
    xs = GwData.GwData()
    tokenized_docs = WordTokenizer.tokenize(xs.documents, min_word_count=5)
    tf = TermFrequency.TermFrequency(tokenized_docs)

    def to_binary(val):
        if val <= 0:
            return float(0)
        return float(1)

    distance_matrix = MatrixHelper.gensim_to_numpy_array(
        tf.distance_matrix, None, 0, to_binary)[0:-94]

    cols = len(distance_matrix[0])
    rows = len(distance_matrix)

    train_set_x = theano.shared(numpy.asarray(distance_matrix,
                                              dtype=theano.config.floatX),
                                borrow=True)

    #distance_matrix = numpy.ndarray((rows,cols), distance_matrix)
    #distance_matrix = shared(distance_matrix)

    # compute number of minibatches for training, validation and testing
    n_train_batches = int(len(distance_matrix) / batch_size)

    hidden = 300
    corruption_level = 0.0

    # allocate symbolic variables for the data
    index = T.iscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images

    ####################################
    # BUILDING THE MODEL NO CORRUPTION #
    ####################################

    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2 ** 30))

    da = dA(numpy_rng=rng,
            theano_rng=theano_rng,
            input=x,
            n_visible=cols,
            n_hidden=hidden)

    cost, updates = da.get_cost_updates(corruption_level=corruption_level,
                                        learning_rate=learning_rate)

    train_da = theano.function(
        [index], cost,
        updates=updates,
        givens={x: train_set_x[index * batch_size:(index + 1) * batch_size]})

    start_time = time.clock()

    ############
    # TRAINING #
    ############

    # go through training epochs
    for epoch in xrange(training_epochs):
        # go through the training set
        c = []
        for batch_index in xrange(n_train_batches):
            c.append(train_da(batch_index))

        print 'Training epoch %d, cost ' % epoch, numpy.mean(c)

    end_time = time.clock()
    training_time = (end_time - start_time)

    print >> sys.stderr, ('The no corruption code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((training_time) / 60.))

    #####################################
    # BUILDING THE MODEL CORRUPTION 30% #
    #####################################

    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2 ** 30))

    da = dA(numpy_rng=rng,
            theano_rng=theano_rng,
            input=x,
            n_visible=cols,
            n_hidden=hidden)

    cost, updates = da.get_cost_updates(corruption_level=0.3,
                                        learning_rate=learning_rate)

    train_da = theano.function(
        [index], cost,
        updates=updates,
        givens={x: train_set_x[index * batch_size:(index + 1) * batch_size]})

    start_time = time.clock()

    ############
    # TRAINING #
    ############

    # go through training epochs
    for epoch in xrange(training_epochs):
        # go through the training set
        c = []
        for batch_index in xrange(n_train_batches):
            c.append(train_da(batch_index))

        print 'Training epoch %d, cost ' % epoch, numpy.mean(c)

    end_time = time.clock()
    training_time = (end_time - start_time)

    print >> sys.stderr, ('The 30% corruption code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % (training_time / 60.))
        return len(self.rows)

    """ End Interface methods """


class PpmiLatentWordVectors(ProjectionABC.ProjectionABC):
    def __init__(self, tokenized_docs, num_topics=100):
        self.corpus = PpmiWordVectors(tokenized_docs)
        self.lsa = LsiModel(self.corpus, num_topics, self.corpus.id2word)
        distance_matrix = self.lsa[self.corpus]

        def gensim_to_vector(gensim_vect):
            return map(lambda (id, val): val, gensim_vect)

        self.rows = map(gensim_to_vector, distance_matrix)

    def project(self, item):
        ix = self.corpus.word2rowindex[item]
        return self.rows[ix]


if __name__ == "__main__":
    import GwData as d
    import WordTokenizer as t

    data = d.GwData()
    tokenized_docs = t.tokenize(data.documents, spelling_correct=False)
    model = PpmiLatentWordVectors(tokenized_docs)
    vector = model.project("essay")
    pass