def train(num_lsa_topics, k):
    
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    #TOKENIZE
    xs = SentenceFragmentData.SentenceFragmentData()
    
    tokenizer = WordTokenizer.WordTokenizer(min_word_count = 5)
    tokenized_docs = tokenizer.tokenize(xs.documents)

    #MAP TO VECTOR AND SEMANTIC SPACE
    tfidf = TfIdf.TfIdf(tokenized_docs)
    lsa = Lsa.Lsa(tfidf, num_topics = num_lsa_topics)
    full_lsa_matrix = MatrixHelper.gensim_to_python_mdarray(lsa.distance_matrix, num_lsa_topics)
    
    #Filter To just sm codes
    sm_code_lsa_matrix = ListHelper.filter_list_by_index(full_lsa_matrix, xs.sm_code_indices)
    
    #CLUSTER
    clusterer = Clusterer.Clusterer(k)
    labels = clusterer.Run(sm_code_lsa_matrix)

    #OUTPUT - Filter by SM Code only this time
    file_name_code_clusters = "LSA_SMCODES_Fragments_k-means_k_{0}_dims_{1}.csv".format(k, num_lsa_topics)
    sm_codes_per_doc   = ListHelper.filter_list_by_index(xs.codes_per_document, xs.sm_code_indices)
    ClustersToFile.clusters_to_file(file_name_code_clusters, labels, sm_codes_per_doc, "Chicago")
    
    file_name_category_clusters = "LSA_Categories_Fragments_k-means_k_{0}_dims_{1}.csv".format(k, num_lsa_topics)
    categories_per_doc = ListHelper.filter_list_by_index(xs.categories_per_document, xs.sm_code_indices)
    ClustersToFile.clusters_to_file(file_name_category_clusters, labels, categories_per_doc, "Chicago")
    
    print "Finished processing lsa clustering for dims: {0} and k: {1}".format(num_lsa_topics, k)
def train(num_lsa_topics, k, window_size):
    
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    #TOKENIZE
    xs = SentenceData.SentenceData()
    
    tokenizer = dbnetwork.WordTokenizer(min_word_count = 5)
    tokenized_docs = tokenizer.tokenize(xs.documents)
    windowed_docs, window_indices = split_documents_into_windows(tokenized_docs, window_size)
    
    #MAP TO VECTOR AND SEMANTIC SPACE
    tfidf = TfIdf.TfIdf(windowed_docs)
    lsa = Lsa.Lsa(tfidf, num_topics = num_lsa_topics)
    full_lsa_matrix = MatrixHelper.gensim_to_python_mdarray(lsa.distance_matrix, num_lsa_topics)

    #CLUSTER
    clusterer = Clusterer.Clusterer(k)
    window_labels = clusterer.Run(full_lsa_matrix)
    
    #Extract the labels for the original sentences using the indices built earlier
    labels = pivot_window_labels(window_labels, window_indices)

    #OUTPUT
    file_name_code_clusters = "Windowed_LSA_SMCODES_win_size_{0}_k-means_k_{1}_dims_{2}.csv".format(window_size, k, num_lsa_topics)
    ClustersToFile.clusters_to_file(file_name_code_clusters, labels, xs.codes_per_document, "Chicago")
    
    file_name_category_clusters = "Windowed_LSA_Categories_win_size_{0}_k-means_k_{1}_dims_{2}.csv".format(window_size, k, num_lsa_topics)
    ClustersToFile.clusters_to_file(file_name_category_clusters, labels, xs.categories_per_document, "Chicago")
    
    logging.info("Finished processing lsa clustering for dims: {0} and k: {1}".format(num_lsa_topics, k))
    def __init__(self, lsa, code, templates):

        match = filter(lambda tpl: code in tpl[0], templates)

        self.lsa = lsa
        distance_matrix = [lsa.project(d) for c, d in match]
        self.templates = MatrixHelper.gensim_to_numpy_array(
            distance_matrix, lsa.num_topics)
def train(num_lsa_topics, k):
    
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    #TOKENIZE
    xs = SentenceData.SentenceData()
    
    tokenizer = WordTokenizer.WordTokenizer(min_word_count = 5)
    tokenized_docs = tokenizer.tokenize(xs.documents)

    #MAP TO VECTOR AND SEMANTIC SPACE
    tfidf = TfIdf.TfIdf(tokenized_docs)
    lsa = Lsa.Lsa(tfidf, num_topics = num_lsa_topics)
    full_lsa_matrix = MatrixHelper.gensim_to_python_mdarray(lsa.distance_matrix, num_lsa_topics)

    #TODO Partition into Docs by LSA sim
    txt_codes = xs.text_codes
    clusters_per_text_code = int(round(k / float(len(txt_codes))))
    
    #Extract the sm code rows from LSA
    smCodeRows = ListHelper.filter_list_by_index(full_lsa_matrix, xs.sm_code_indices)
    smCodeClassifications = ListHelper.filter_list_by_index(xs.codes_per_document, xs.sm_code_indices)
    smCodeCategoryClassifications = ListHelper.filter_list_by_index(xs.categories_per_document, xs.sm_code_indices)
    
    # Dict of <code, list[list]> - LSA row vectors
    logging.info("Partitioning LSA distance_matrix by Source Document")
    
    txtMatrixByCode = PartitionByCode.partition(full_lsa_matrix, xs, xs.text_codes)
    closest_docs = [find_closest_document(txtMatrixByCode, row) for row in smCodeRows]
    matrix_by_doc = collections.defaultdict(list)
    
    for i, doc in enumerate(closest_docs):
        matrix_by_doc[doc].append(smCodeRows[i])

    #Stores all cluster labels
    logging.info("Clustering within a document")
    all_smcode_labels = []
    label_offset = 0
    for doc in xs.text_codes:
        distance_matrix = matrix_by_doc[doc]
        #CLUSTER
        clusterer = Clusterer.Clusterer(clusters_per_text_code)
        labels = clusterer.Run(distance_matrix)
        all_smcode_labels = all_smcode_labels + [int(l + label_offset) for l in labels]
        label_offset += clusters_per_text_code

    #OUTPUT
    file_name_code_clusters = "Partition_By_Doc_LSA_SMCODES_k-means_k_{0}_dims_{1}.csv".format(k, num_lsa_topics)
    ClustersToFile.clusters_to_file(file_name_code_clusters, all_smcode_labels, smCodeClassifications, "Chicago")

    file_name_category_clusters = "Partition_By_Doc_LSA_categories_k-means_k_{0}_dims_{1}.csv".format(k, num_lsa_topics)
    ClustersToFile.clusters_to_file(file_name_category_clusters, all_smcode_labels, smCodeCategoryClassifications, "Chicago")
    
    #TODO - filter the category and the docs per docs to the sm codes and output
    #file_name_category_clusters = "Partition_By_Doc_LSA_categories_k-means_k_{0}_dims_{1}.txt".format(k, num_lsa_topics)
    #ClustersToFile.clusters_to_file(file_name_category_clusters, all_smcode_labels, smCodeClassifications, "Chicago")

    print "Finished processing lsa clustering for dims: {0} and k: {1}".format(num_lsa_topics, k)
    def binary_matrix(self):
        """ Turns a regular tf distance_matrix into a binary distance_matrix """
        def get_binary_data(val):
            if val <= 0:
                return 0
            return 1

        full_matrix = MatrixHelper.gensim_to_python_mdarray(
            self.distance_matrix, self.num_unique_words)
        return [[get_binary_data(cell) for cell in row] for row in full_matrix]
def get_data(xs):
    import WordTokenizer
    import TermFrequency
    import MatrixHelper
    import Converter
    
    tokenized_docs = WordTokenizer.tokenize(xs.documents, min_word_count=5)
    tf = TermFrequency.TermFrequency(tokenized_docs)

    arr = MatrixHelper.gensim_to_numpy_array(tf.distance_matrix, None, 0, Converter.to_binary)
    return arr
def get_binary_data(xs):
    import WordTokenizer
    import TermFrequency
    import MatrixHelper
    import Converter

    tokenized_docs = WordTokenizer.tokenize(xs.documents, min_word_count=5)
    tf = TermFrequency.TermFrequency(tokenized_docs)

    arr = MatrixHelper.gensim_to_numpy_array(tf.distance_matrix, None, 0,
                                             Converter.to_binary)
    return arr
def main():

    #SETTINGS
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    settings = Settings.Settings()
    results_dir = settings.results_directory + GwData.FOLDER

    #TOKENIZE
    data = GwData.GwData()
    tokenized_docs = WordTokenizer.tokenize(data.documents, min_word_count=5)
    tfidf = TfIdf.TfIdf(tokenized_docs)

    #NLTK Decision Tree
    np_matrix = MatrixHelper.gensim_to_numpy_array(tfidf.matrix,
                                                   initial_value=0)

    labels = data.causal_per_document

    def get_svm_val(x):
        if x <= 0:
            return -1
        return 1

    labels = map(get_svm_val, labels)

    td_size = int(0.75 * len(np_matrix))

    td_x = np_matrix[:td_size]
    td_y = labels[:td_size]

    vd_x = np_matrix[td_size:]
    vd_y = labels[td_size:]

    rng = array(range(1, 21, 1))

    c_vals = rng / 10.0

    all_results = ""
    for c in c_vals:
        classifier = svm.LinearSVC(C=c)
        classifier.fit(td_x, td_y)

        #RESULTS
        classifications = classifier.predict(vd_x)

        results = "\nC VALUE: " + str(c) + "\n"
        results += ResultsHelper.rfp(vd_y, classifications)
        print results

        all_results += results
    #print "EXPLAIN:\n"
    #me.explain(condensed_data[0], 100)

    #DUMP TO FILE
    fName = results_dir + "Causal_Relation_SVM.txt"
    handle = open(fName, mode="w+")
    handle.write(all_results)
    handle.close()

    #binary_matrix = term_freq.binary_matrix()
    #decision_tree = tree.DecisionTreeClassifier(criterion = 'entropy')
    #decision_tree.fit(binary_matrix, labels)

    # Test with CL1 labels
    raw_input("Press Enter to quit")
docs = data.documents

tokenized_docs = WordTokenizer.tokenize(
    docs,
    min_word_count=5,
    stem=False,
    lemmatize=True,
    remove_stop_words=True,
    spelling_correct=True,
    number_fn=NumberStrategy.collapse_dates)
lsa_v = LatentWordVectors.LsaSpace(tokenized_docs, 100)

wds = lsa_v.word_to_index.keys()
wds = sorted(wds)

u_vecs = [MatrixHelper.unit_vector(lsa_v.project(v)) for v in wds]

km = cluster.KMeans(n_clusters=50,
                    init='k-means++',
                    n_init=10,
                    verbose=1,
                    n_jobs=1)
predictions = km.fit_predict(u_vecs)

clusters = set(predictions)
word2cluster = dict(zip(wds, predictions))

km_clusters = extract_clusters_from_kmeans(km, wds)
km_clusters = sorted(km_clusters, key=lambda i: len(i))

for cl in km_clusters:
    print cl  # loop body truncated in the source; assumed to print each cluster's words
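# extract_clusters_from_kmeans (used above) is not defined in this snippet. A
# plausible sketch, assuming it groups the vocabulary by the cluster id k-means
# assigned to each word; it would need to be defined before the code above runs.
def extract_clusters_from_kmeans(km, wds):
    from collections import defaultdict
    clusters_by_label = defaultdict(list)
    for word, label in zip(wds, km.labels_):
        clusters_by_label[label].append(word)
    return list(clusters_by_label.values())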
    def Run(self, results_file_name, cv_folds = 10, min_word_count = 5, stem = True, lemmatize = False, remove_stop_words = True, one_code = None, spelling_correct = True, one_fold = False):

        self.min_word_count = min_word_count

        #SETTINGS
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

        results_dir = self.__get_results_folder__()
        self.__ensure_dir__(results_dir)

        print "Results filename: " + results_file_name
        results_file_path = results_dir + results_file_name
        vd_hits_and_misses_fname = results_file_path.replace(".txt", "_VD_hits_misses.txt")
        td_hits_and_misses_fname = results_file_path.replace(".txt", "_TD_hits_misses.txt")

        #TOKENIZE
        data = self.get_data(ExperimentBase.__settings__)
        tokenized_docs = WordTokenizer.tokenize(data.documents, min_word_count = min_word_count, stem = stem, lemmatize=lemmatize, remove_stop_words = remove_stop_words, spelling_correct=spelling_correct, number_fn = NumberStrategy.collapse_num)
    
        empty_ixs = set([i for i, doc in enumerate(tokenized_docs) if len(doc) < ExperimentBase.MIN_DOC_LENGTH])
        tokenized_docs = [t for i, t in enumerate(tokenized_docs) if i not in empty_ixs]
       
        #TRAINING DATA
        #TODO Make this one call from docs -> td
        (distance_matrix, id2word) = self.get_vector_space(tokenized_docs)
        xs = self.get_training_data(distance_matrix, id2word)
        
        matrix_mapper = self.matrix_value_mapper()
        if matrix_mapper:
            xs = MatrixHelper.map_matrix(matrix_mapper, xs)
        
        all_results = self.get_params() + "\n"
        print all_results,
        
        MIN_CODE_COUNT = 1
        
        vd_metrics, td_metrics = [], []
        label_mapper = self.label_mapper()

        # Stop logging now
        logging.disable(logging.INFO)
        
        # So we can test on one code only
        codes_to_process = self.__get_codes_to_process__(data, one_code)

        # Store the indices into the inputs that detail
        # the true and false positives and negatives
        vd_hits_misses_by_code = dict()
        td_hits_misses_by_code = dict()

        for code in codes_to_process:

            ys = self.__get_labels_for_code__(code, data, empty_ixs, label_mapper, xs)
                  
            total_codes = len([item for item in ys if item == 1])
            if total_codes <= MIN_CODE_COUNT:
                continue

            # Yes, that is a lot I know
            vd_r, vd_p, vd_f1, vd_a, \
            td_r, td_p, td_f1, td_a, \
            vd_tp_ix, vd_fp_ix, vd_fn_ix, vd_tn_ix, \
            td_tp_ix, td_fp_ix, td_fn_ix, td_tn_ix \
                = cross_validation_score_generic(
                    xs, ys,
                    self.create_classifier(code),
                    self.classify(),
                    cv_folds,
                    class_value = self.get_class_value(),
                    one_fold = one_fold)

            vd_metric, td_metric = rpfa(vd_r, vd_p, vd_f1, vd_a, total_codes), rpfa(td_r, td_p, td_f1, td_a, total_codes)
            vd_metrics.append(vd_metric)
            td_metrics.append(td_metric)

            vd_hits_misses_by_code[code] = (vd_tp_ix, vd_fp_ix, vd_fn_ix, vd_tn_ix)
            td_hits_misses_by_code[code] = (td_tp_ix, td_fp_ix, td_fn_ix, td_tn_ix)

            results = "Code: {0} Count: {1} VD[ {2} ]\tTD[ {3} ]\n".format(code.ljust(7), str(total_codes).rjust(4), vd_metric.to_str(), td_metric.to_str())
            print results,
            all_results += results

            """ Dump results to file in case of crash """
            self.__dump_results_to_file__(all_results, results_file_path)
            dump_hits_and_misses(vd_hits_misses_by_code, xs, vd_hits_and_misses_fname)
            dump_hits_and_misses(td_hits_misses_by_code, xs, td_hits_and_misses_fname)

        """ Compute mean metrics """
        """ MEAN """
        mean_vd_metrics,    mean_td_metrics     = mean_rpfa(vd_metrics),           mean_rpfa(td_metrics)
        """ WEIGHTED MEAN """
        wt_mean_vd_metrics, wt_mean_td_metrics  = weighted_mean_rpfa(vd_metrics),  weighted_mean_rpfa(td_metrics)

        str_aggregate_results = self.__build_aggregate_results_string__(mean_td_metrics, mean_vd_metrics,
                                                                    wt_mean_td_metrics, wt_mean_vd_metrics)
        print str_aggregate_results
        all_results += str_aggregate_results
            
        #DUMP TO FILE
        print "Writing results to: " + results_file_path
        print "TD Hits and Misses: " + td_hits_and_misses_fname
        print "VD Hits and Misses: " + vd_hits_and_misses_fname

        self.__dump_results_to_file__(all_results, results_file_path)
        return (mean_vd_metrics, wt_mean_vd_metrics)
def test_dA(learning_rate=0.5, training_epochs=100, batch_size=270):
    """
    This demo is tested on MNIST

    :type learning_rate: float
    :param learning_rate: learning rate used for training the Denoising
                          AutoEncoder

    :type training_epochs: int
    :param training_epochs: number of epochs used for training

    :type dataset: string
    :param dataset: path to the pickled dataset

    """

    xs = GwData.GwData()
    tokenized_docs = WordTokenizer.tokenize(xs.documents, min_word_count=5)
    tf = TermFrequency.TermFrequency(tokenized_docs)

    def to_binary(val):
        if val <= 0:
            return float(0)
        return float(1)

    distance_matrix = MatrixHelper.gensim_to_numpy_array(
        tf.distance_matrix, None, 0, to_binary)[0:-94]

    cols = len(distance_matrix[0])
    rows = len(distance_matrix)

    train_set_x = theano.shared(numpy.asarray(distance_matrix,
                                              dtype=theano.config.floatX),
                                borrow=True)

    #distance_matrix = numpy.ndarray((rows,cols), distance_matrix)
    #distance_matrix = shared(distance_matrix)

    # compute number of minibatches for training, validation and testing
    n_train_batches = int(len(distance_matrix) / batch_size)

    hidden = 300
    corruption_level = 0.0

    # allocate symbolic variables for the xs
    index = T.iscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the xs are presented as rasterized images

    ####################################
    # BUILDING THE MODEL NO CORRUPTION #
    ####################################

    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2**30))

    da = dA(numpy_rng=rng,
            theano_rng=theano_rng,
            input=x,
            n_visible=cols,
            n_hidden=hidden)

    cost, updates = da.get_cost_updates(corruption_level=corruption_level,
                                        learning_rate=learning_rate)

    train_da = theano.function(
        [index],
        cost,
        updates=updates,
        givens={x: train_set_x[index * batch_size:(index + 1) * batch_size]})

    start_time = time.clock()

    ############
    # TRAINING #
    ############

    # go through training epochs
    for epoch in xrange(training_epochs):
        # go through training set
        c = []
        for batch_index in xrange(n_train_batches):
            c.append(train_da(batch_index))

        print 'Training epoch %d, cost ' % epoch, numpy.mean(c)

    end_time = time.clock()

    training_time = (end_time - start_time)

    print >> sys.stderr, ('The no corruption code for file ' +
                          os.path.split(__file__)[1] + ' ran for %.2fm' %
                          ((training_time) / 60.))
    #####################################
    # BUILDING THE MODEL CORRUPTION 30% #
    #####################################

    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2**30))

    da = dA(numpy_rng=rng,
            theano_rng=theano_rng,
            input=x,
            n_visible=cols,
            n_hidden=hidden)

    cost, updates = da.get_cost_updates(corruption_level=0.3,
                                        learning_rate=learning_rate)

    train_da = theano.function(
        [index],
        cost,
        updates=updates,
        givens={x: train_set_x[index * batch_size:(index + 1) * batch_size]})

    start_time = time.clock()

    ############
    # TRAINING #
    ############

    # go through training epochs
    for epoch in xrange(training_epochs):
        # go through training set
        c = []
        for batch_index in xrange(n_train_batches):
            c.append(train_da(batch_index))

        print 'Training epoch %d, cost ' % epoch, numpy.mean(c)

    end_time = time.clock()

    training_time = (end_time - start_time)

    print >> sys.stderr, ('The 30% corruption code for file ' +
                          os.path.split(__file__)[1] + ' ran for %.2fm' %
                          (training_time / 60.))
    def get_sparse_matrix_data(self, distance_matrix, id2word):
        return MatrixHelper.gensim_to_numpy_array(distance_matrix, initial_value = 0)
def train():

    #SETTINGS
    cv_folds = 10
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    settings = Settings.Settings()
    results_dir = settings.results_directory + GwData.FOLDER
    num_lsa_topics = 100

    #TOKENIZE
    xs = GwData.GwData()
    tokenized_docs = WordTokenizer.tokenize(xs.documents, min_word_count=5)
    tfidf = TfIdf.TfIdf(tokenized_docs)
    lsa = Lsa.Lsa(tfidf, num_topics=num_lsa_topics)

    #NLTK SVM linear kernel
    xs = MatrixHelper.gensim_to_numpy_array(lsa.distance_matrix,
                                            initial_value=0)

    total_recall, total_precision, total_f1 = 0.0, 0.0, 0.0

    all_results = "LSA Dimensions: " + str(num_lsa_topics)
    print all_results

    processed_code_count = 0
    #MIN_CODE_COUNT = 5

    MIN_CODE_COUNT = 1

    codes = [
        c for c in xs.sm_codes
        # Exclude pure vague codes
        if c != "v" and
        # Exclude doc codes. Need whole doc to classify them
        not c.startswith("s")
    ]

    for code in codes:

        code_count = xs.sm_code_count[code]
        if code_count <= MIN_CODE_COUNT:
            continue

        processed_code_count += 1
        labels = map(Converter.get_svm_val, xs.labels_for(code))
        classifier = svm.LinearSVC(C=1)
        recall, precision, f1_score = cross_validation_score(xs,
                                                             labels,
                                                             classifier,
                                                             cv_folds,
                                                             class_value=1.0)
        results = "Code: {0} Count: {1}, Recall: {2}, Precision: {3}, F1: {4}\n".format(
            code.ljust(10), code_count, recall, precision, f1_score)

        all_results += results
        total_recall += recall
        total_precision += precision
        total_f1 += f1_score

        print results,

    #num_codes = len(xs.sm_codes)
    num_codes = processed_code_count
    result = "AGGREGATE\n\t Recall: {0}, Precision: {1}, F1: {2}\n".format(
        total_recall / num_codes, total_precision / num_codes,
        total_f1 / num_codes)
    all_results += result
    print result

    #DUMP TO FILE
    fName = results_dir + "Codes_ClassifyUsing_SVM_with_EssayBasedLSA_Dims_" + str(
        num_lsa_topics) + ".txt"
    handle = open(fName, mode="w+")
    handle.write(all_results)
    handle.close()
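# cross_validation_score is not defined in this snippet. A rough stand-in,
# assuming it runs k-fold cross-validation and returns mean recall, precision
# and F1 for the positive class (class_value):
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import recall_score, precision_score, f1_score

def cross_validation_score(xs, ys, classifier, cv_folds, class_value=1.0):
    xs, ys = np.asarray(xs), np.asarray(ys)
    recalls, precisions, f1s = [], [], []
    for train_ix, test_ix in KFold(n_splits=cv_folds).split(xs):
        classifier.fit(xs[train_ix], ys[train_ix])
        preds = classifier.predict(xs[test_ix])
        recalls.append(recall_score(ys[test_ix], preds, pos_label=class_value))
        precisions.append(precision_score(ys[test_ix], preds, pos_label=class_value))
        f1s.append(f1_score(ys[test_ix], preds, pos_label=class_value))
    return np.mean(recalls), np.mean(precisions), np.mean(f1s)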
    def RunStacked(self, results_file, cv_folds = 10, min_word_count = 5,
                   stem = True, lemmatize = False, remove_stop_words = True, layers = 2):

        #SETTINGS
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

        print "Results filename: " + results_file
        settings = Settings.Settings()

        results_dir = settings.results_directory + self.sub_dir() + "\\"

        fName = results_dir + results_file

        #TOKENIZE
        data = self.get_data(settings)
        tokenized_docs = WordTokenizer.tokenize(data.documents, min_word_count=min_word_count, stem=stem,
                                                lemmatize=lemmatize, remove_stop_words=remove_stop_words,
                                                spelling_correct=True, number_fn=NumberStrategy.collapse_num)

        empty_ixs = set([i for i, doc in enumerate(tokenized_docs) if len(doc) < StackedExperimentRunner.__MIN_DOC_LENGTH__])
        tokenized_docs = [t for i, t in enumerate(tokenized_docs) if i not in empty_ixs]

        #TRAINING DATA
        #TODO Make this one call from docs -> td
        (distance_matrix, id2word) = self.get_vector_space(tokenized_docs)
        xs = self.get_training_data(distance_matrix, id2word)

        matrix_mapper = self.matrix_value_mapper()
        if matrix_mapper:
            xs = MatrixHelper.map_matrix(matrix_mapper, xs)

        all_results = self.get_params() + "\n"
        print all_results,

        MIN_CODE_COUNT = 3

        codes = set(self.get_codes(data.sm_codes))
        label_mapper = self.label_mapper()

        # Stop logging now
        logging.disable(logging.INFO)

        xs = ensure_np_array(xs)
        edges = cross_validation_edges(len(xs), cv_folds)

        ys_by_code = {}
        positive_count_by_code = {}
        for code in codes.copy():
            ys = self.get_ys(code, data, empty_ixs, label_mapper, xs)
            ys_by_code[code] = ys

            positive_count = len([item for item in ys if item == 1])
            positive_count_by_code[code] = positive_count

            if positive_count < MIN_CODE_COUNT:
                codes.remove(code)

        dct_td_predictions_by_fold = {}
        dct_vd_predictions_by_fold = {}
        dct_actual_by_fold = {}

        for layer in range(layers):

            print("Layer: {0}".format(layer))
            vd_metrics_for_layer, td_metrics_for_layer = [], []

            vd_metrics_by_code = defaultdict(lambda: [])
            td_metrics_by_code = defaultdict(lambda: [])

            for fold in range(cv_folds):

                l, r = edges[fold]

                #Note these are numpy obj's and cannot be treated as lists
                td_x = np.concatenate((xs[:l], xs[r:]))
                vd_x = xs[l:r]

                predictions_from_previous_layer = None
                if layer > 0:
                    # Augment the fold's inputs with predictions from the previous layer
                    lst_td_preds = self.__extract_predictions__(codes, dct_td_predictions_by_fold[fold], td_x)
                    td_x = np.concatenate((td_x, np.array(lst_td_preds)), 1)

                    lst_vd_preds = self.__extract_predictions__(codes, dct_vd_predictions_by_fold[fold], vd_x)
                    vd_x = np.concatenate((vd_x, np.array(lst_vd_preds)), 1)

                dct_td_predictions_per_code = {}
                dct_vd_predictions_per_code = {}
                dct_actual_per_code = {}

                dct_td_predictions_by_fold[fold] = dct_td_predictions_per_code
                dct_vd_predictions_by_fold[fold] = dct_vd_predictions_per_code
                dct_actual_by_fold[fold] = dct_actual_per_code

                class_value = self.get_class_value()

                for code in codes:

                    total_codes = positive_count_by_code[code]

                    ys = ys_by_code[code]
                    td_y = np.concatenate((ys[:l], ys[r:]))
                    vd_y = ys[l:r]

                    if min(td_y) == max(td_y):
                        val = td_y[0]
                        td_predictions = np.array([val for y in td_y])
                        vd_predictions = np.array([val for y in vd_y])
                    else:
                        create_classifier_func = self.create_classifier(code)
                        classify_func = self.classify()

                        classifier = create_classifier_func(td_x, td_y)
                        td_predictions = classify_func(classifier, td_x)
                        vd_predictions = classify_func(classifier, vd_x)

                    dct_td_predictions_per_code[code]  = td_predictions
                    dct_vd_predictions_per_code[code]  = vd_predictions
                    dct_actual_per_code[code]       = td_y

                    td_r, td_p, td_f1, td_a = Metrics.rpf1a(td_y, td_predictions, class_value=class_value)
                    vd_r, vd_p, vd_f1, vd_a = Metrics.rpf1a(vd_y, vd_predictions, class_value=class_value)

                    vd_metric, td_metric = self.rpfa(vd_r, vd_p, vd_f1, vd_a, total_codes), \
                                           self.rpfa(td_r, td_p, td_f1, td_a, total_codes)

                    vd_metrics_for_layer.append(vd_metric)
                    td_metrics_for_layer.append(td_metric)

                    vd_metrics_by_code[code].append(vd_metric)
                    td_metrics_by_code[code].append(td_metric)

                pass # End for code in codes

            pass #END for fold in folds

            for code in sorted(codes):
                positive_count = positive_count_by_code[code]
                vd_metric, td_metric = self.mean_rpfa(vd_metrics_by_code[code]), self.mean_rpfa(td_metrics_by_code[code])

                results = "Code: {0} Count: {1} VD[ {2} ]\tTD[ {3} ]\n".format(code.ljust(7), str(positive_count).rjust(4),
                                                                               vd_metric.to_str(), td_metric.to_str())
                print results,

            mean_vd_metrics, mean_td_metrics = self.mean_rpfa(vd_metrics_for_layer), self.mean_rpfa(td_metrics_for_layer)
            wt_mean_vd_metrics, wt_mean_td_metrics = self.weighted_mean_rpfa(vd_metrics_for_layer), self.weighted_mean_rpfa(
                td_metrics_for_layer)

            aggregate_results = "\n"
            aggregate_results += "VALIDATION DATA -\n"
            aggregate_results += "\tMEAN\n\t\t {0}\n".format(mean_vd_metrics.to_str(True))
            aggregate_results += "\tWEIGHTED MEAN\n\t\t {0}\n".format(wt_mean_vd_metrics.to_str(True))

            aggregate_results += "\n"
            aggregate_results += "TRAINING DATA -\n"
            aggregate_results += "\tMEAN\n\t\t {0}\n".format(mean_td_metrics.to_str(True))
            aggregate_results += "\tWEIGHTED MEAN\n\t\t {0}\n".format(wt_mean_td_metrics.to_str(True))

            print aggregate_results
            pass #End for layer in layers

        pass #End fold

        """ Dump results to file in case of crash """

        #DUMP TO FILE
        """
        print "Writing results to: " + fName
        handle = open(fName, mode="w+")
        handle.write(all_results)
        handle.close()
        """
        #return (mean_vd_metrics, wt_mean_vd_metrics)
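# The core of the stacking step above: each fold's feature matrix is augmented
# with one column of predictions per code from the previous layer. A toy
# illustration with numpy (the values are made up):
import numpy as np

xs_fold = np.random.rand(5, 3)                       # original features
preds_by_code = {"a": [0, 1, 0, 1, 1],               # layer-0 predictions
                 "b": [1, 1, 0, 0, 1]}
pred_cols = np.column_stack([preds_by_code[c] for c in sorted(preds_by_code)])
augmented = np.concatenate((xs_fold, pred_cols), 1)  # same call as in RunStacked
print(augmented.shape)                               # -> (5, 5)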