import os
from operator import itemgetter

from PIL import Image  # assuming Pillow; the snippet calls Image.open below
from VectorSpace import VectorSpace  # project-local module (assumed import path)


class CharacterClassifier:

	DATA_DIRECTORY = "./trainingData/"

	def __init__(self,image):
		self.vectorSpace = VectorSpace(image)
		self.image = image

	def getCharacterProbabilityList(self):

		probabilityDict = {}
		for f in os.listdir(self.DATA_DIRECTORY):
			# Skip entries that are not directories (one directory per character class)
			try:
				os.listdir(self.DATA_DIRECTORY+f)
			except OSError:
				continue
			for x in os.listdir(self.DATA_DIRECTORY+f):
				im = Image.open(self.DATA_DIRECTORY+f+"/"+x)
				cosineSimilarity = self.vectorSpace.cosine(VectorSpace(im))
				if f in probabilityDict:
					probabilityDict[f]=max(probabilityDict[f],cosineSimilarity)
				else:
					probabilityDict[f]=cosineSimilarity
		characterProbabilityList = []
		for j,k in sorted(probabilityDict.items(), key=itemgetter(1), reverse=True)[:10]:
			characterProbabilityList.append((j,k))
		return characterProbabilityList
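The classifier above keeps, for each character class, the best cosine match against the input image. A minimal sketch of that cosine step with plain numpy (the project's VectorSpace class is external; flattening images to vectors is an assumption here):

import numpy as np

def cosine(u, v):
    # Cosine similarity of two flattened image vectors
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

template = np.array([0.0, 1.0, 1.0, 0.0])
candidate = np.array([0.0, 0.9, 1.0, 0.1])
print(cosine(template, candidate))  # ~0.996: near 1.0 for similar glyphs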
Example #2
    def init_embeddings(self):
        #To construct embeddings, we need to topologically sort
        #all types
        #Construct the dictionary of dependencies
        depends = {}
        for kind in self.interpreter_state.application_tables:
            depends[kind] = {kind.arg_type, kind.ret_type}

        #Contains types in dependency order
        sorted_types = toposort_flatten(depends, sort=False)

        for kind in sorted_types:
            if isinstance(kind, VecType):
                #Don't need to derive embeddings for vector types
                #But should give them spaces
                self.spaces[kind] = VectorSpace(kind.n)
                continue
            #Otherwise, must be a function type
            arg_type = kind.arg_type
Example #3
"""
Checks VectorSpace functionality by touching almost all defined functions
"""
import VectorSpace as vs
import sympy as sym
import numpy as np

x = sym.Symbol('x')

vBase = vs.vectors([[1, 2, 0], [0, 1, 3], [1, 0, 1]], 'F3')
wBase = vs.vectors([[1, 0, 0], [0, 1, 0], [0, 0, 1]], 'F3')

pBase = vs.vectors([1, x, x**2 - 1], 'P2')
qBase = vs.vectors([1, x], 'P1')

vList = vs.vectors([[0, 0, 0], [1, 1, 0], [0, 0, 0], [2, 1, 0]], 'F3')
pList = vs.vectors([0, 1, x, 0, x**2 - 1, 2 * x], 'P2')

v = vs.vector([1, 2, 3], 'F3')
p = vs.vector(1 + 2 * x + 3 * x**2, 'P2')


def f(X):
    """A Linear Map!"""
    return np.array([X[0] + X[1] + X[2], 2 * X[1], 2 * X[2]])


def g(p):
    """ Differential Map!"""
    return sym.diff(p, x)
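A quick sanity check of the two maps above needs nothing beyond numpy and sympy:

print(f(np.array([1, 2, 3])))   # [6 4 6]
print(g(1 + 2 * x + 3 * x**2))  # 6*x + 2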
Example #4
# Print the top-5 list in a fixed format
def printResult(top5Lst):
    print('{}     {}'.format('DocID', 'Score'))
    for each in top5Lst:
        print('{}    {}'.format(each[0], each[1]))
    print('')

if __name__ == '__main__':

    # read files from the 'documents' directory
    indexes, contents = ReadFiles.readDocuments()
    

    # build a vector space model
    vectorspace = VectorSpace.VectorSpace(contents)
    

    # search a test query
    query = arg.query
    queryVector = np.array(vectorspace.makeTfidfVector(query))
    [tf_cos, tf_dist] = vectorspace.searchTf(query)
    [tfidf_cos, tfidf_dist] = vectorspace.searchTfidf(query)

    # bind indexes to ratings then sort
    top5_tf_cos = sorted(list(zip(indexes, tf_cos)), reverse=True, key=sortByRatings)[:5]
    top5_tf_dist = sorted(list(zip(indexes, tf_dist)), reverse=False, key=sortByRatings)[:5]
    top5_tfidf_cos = sorted(list(zip(indexes, tfidf_cos)), reverse=True, key=sortByRatings)[:5]
    top5_tfidf_dist = sorted(list(zip(indexes, tfidf_dist)), reverse=False, key=sortByRatings)[:5]
    
    print('Term Frequency Weighting + Cosine Similarity:')
Example #5
def main(argv):
    sampler.input_raw_directory = argv[1]
    output_directory = argv[2]
    training_file_name = argv[3]

    # sampler.handle_output_directory(output_directory, clean_directory=False)
    sampler.training_file_path = sampler.input_raw_directory + '/' + training_file_name
    training_file = sampler.read_training_df(sampler.training_file_path)

    # print('Building samples in ratio')
    # samples = build_samples_with_ratio(training_file, 3000, False)

    train_directory = output_directory + '/train'

    test_files = [
        os.path.basename(x)
        for x in glob.glob(output_directory + '/' + 'test' + '/*.txt')
    ]
    testFileBlobs = build_list_of_docs(output_directory, 'test', test_files)

    #===================================================================================================================
    #================================================    NAIVE BAYES    ================================================

    train_naive_bayes(training_file, train_directory)

    #for test_file_name, test_file_blob in testFileBlobs.items():
    for key in testFileBlobs:
        predicted_class = naivebayes.nb_classifier(testFileBlobs[key], classes)
        append_in_testCSV(output_directory, key, predicted_class, 'NB')

    #===================================================================================================================

    #===================================================================================================================
    #================================================    VECTOR SPACE    ===============================================

    all_document_list = train_vector_space_model(train_directory)
    # for of_class in ['unsponsored', 'sponsored']:
    #     file_samples = [os.path.basename(x) for x in glob.glob(train_directory+'/'+of_class+'/*.txt')]
    #     all_document_list[of_class] = (build_list_of_docs(train_directory, of_class,file_samples)).values()
    for index, key in enumerate(testFileBlobs):
        print(str(index) + ': Testing file - ' + key)
        predicted_class = VectorSpace.vectorspace_classifier(
            testFileBlobs[key].string, all_document_list)
        append_in_testCSV(output_directory, key, predicted_class, 'VSM')

    #===================================================================================================================

    #===================================================================================================================
    #================================================    RANDOMFOREST    ===============================================

    rfc = randomforest.random_forest_classifier(train_directory, classes)
    test_files = [
        os.path.basename(x)
        for x in glob.glob(output_directory + '/' + 'test' + '/*.txt')
    ]
    test_directory = output_directory + '/test'

    print('Testing Random Forest')
    for index, file_name in enumerate(test_files):
        print(str(index) + ': Testing file - ' + file_name)
        X_test = randomforest.extract_features(test_directory + '/' +
                                               file_name)
        Y_predict = rfc.predict([X_test])
        if (Y_predict[0] == 0):
            prediction = 'unsponsored'
        else:
            prediction = 'sponsored'
        append_in_testCSV(output_directory, file_name, prediction, 'RF')

    #===================================================================================================================
    #===========================================    EVALUATING CLASSIFIERS   ===========================================

    calculate_evaluation_parameters(output_directory, 'NB')

    calculate_evaluation_parameters(output_directory, 'VSM')

    calculate_evaluation_parameters(output_directory, 'RF')
Example #6
def build_point_schmear(mean):
    k = mean.shape[0]
    return Schmear(VectorSpace(k), mean, np.zeros(k))
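build_point_schmear wraps a mean vector in a zero-covariance distribution over a vector space of matching dimension. A self-contained sketch with stand-in Schmear and VectorSpace classes (assumptions; the real classes live elsewhere in this project):

import numpy as np

class VectorSpace:                 # stand-in: records only the dimension
    def __init__(self, n):
        self.n = n

class Schmear:                     # stand-in: a mean plus a covariance
    def __init__(self, space, mean, covar):
        self.space, self.mean, self.covar = space, mean, covar

def build_point_schmear(mean):
    k = mean.shape[0]
    return Schmear(VectorSpace(k), mean, np.zeros(k))

s = build_point_schmear(np.array([1.0, -2.0, 0.5]))
print(s.covar)                     # [0. 0. 0.] -- a point mass has no spread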
Example #8
    def update_embeddings(self):
        updated_means = {}
        updated_covariances = {}
        kernel_spaces = {}

        #To construct embeddings, we need to topologically sort
        #all types
        #Construct the dictionary of dependencies
        depends = {}
        for kind in self.interpreter_state.application_tables:
            depends[kind] = {kind.arg_type, kind.ret_type}

        #Contains types in dependency order
        sorted_types = toposort_flatten(depends, sort=False)

        for kind in sorted_types:
            if isinstance(kind, VecType):
                #Don't need to derive embeddings for vector types
                #but we should derive their spaces
                self.spaces[kind] = VectorSpace(kind.n)
                continue
            #Otherwise, must be a function

            #Derive the embedding for the type 
            arg_type = kind.arg_type
            ret_type = kind.ret_type
            space = self.interpreter_state.get_type_space(kind)
            table = self.interpreter_state.get_application_table(kind)


            #Get the argument kernel space if it already exists
            kernel_space = None
            if arg_type in kernel_spaces:
                kernel_space = kernel_spaces[arg_type]
            else:
                #Otherwise, create it from scratch
                X = get_means_array(arg_type)
                scaler = get_rescaling(X)
                kernel_space = GaussianKernelSpace(X, scaler)
                kernel_spaces[arg_type] = kernel_space

            #Get the return space
            ret_space = self.spaces[ret_type]

            #build the function space
            func_space = KernelSumFunctionSpace(kernel_space, ret_space)
            self.spaces[kind] = func_space

            #Now actually compute the embedding for the function
            
            func_means = []
            func_covariances = []
            for func_ind in range(len(space.terms)):
                func_ptr = TermPointer(space, func_ind)

                func_row = table.table[func_ind]
                #Obtain the tuple X_kernelized, Y, out_inv_covar_mats
                X_kernelized, Y, out_inv_covar_mats = self.get_func_row_embeddings(kernel_space, ret_type, func_row)

                func_prior_covar = func_space.get_prior_covariance()
                func_prior_mean = np.zeros(func_space.get_dimension())
                func_prior_schmear = Schmear(func_space, func_prior_mean, func_prior_covar)

                #Find all applications which yield the func ptr
                #as a result [this will be used to set the prior]
                func_origins = self.interpreter_state.get_term_applications_yielding(func_ptr)

                for term_app in func_origins:
                    higher_order_func_ptr = term_app.func_ptr
                    higher_order_arg_ptr = term_app.arg_ptr
                    higher_order_func_schmear = self.get_schmear_from_ptr(higher_order_func_ptr)
                    higher_order_arg_schmear = self.get_schmear_from_ptr(higher_order_arg_ptr)
                    prior_contrib = impute_application_schmear(higher_order_func_schmear, 
                                                               higher_order_arg_schmear, func_space)

                    #Update the prior with the estimated applications from that
                    func_prior_schmear = func_prior_schmear.combine_fuse(prior_contrib)

                #Obtain the prior mean and the prior covariance for the function
                prior_func_mean = func_prior_schmear.mean
                prior_func_covariance = func_prior_schmear.covar_mat

                _, k = X_kernelized.shape 
                _, t = Y.shape

                prior_func_mean = np.reshape(prior_func_mean, (k, t))
                prior_func_covariance = np.reshape(prior_func_covariance, (k, t, k, t))

                #Do the regression
                post_func_mean, post_func_covariance = bayesian_multivariate_lin_reg(X_kernelized, Y,  out_inv_covar_mats, prior_func_mean, prior_func_covariance)

                #Flatten the func mean/covariance vecs
                k, t = post_func_mean.shape
                flat_mean = np.reshape(post_func_mean, k * t)
                flat_covariance = np.reshape(post_func_covariance, (k * t, k * t))

                func_means.append(flat_mean)
                func_covariances.append(flat_covariance)
            func_means = np.stack(func_means)
            func_covariances = np.stack(func_covariances)
            updated_means[kind] = func_means
            updated_covariances[kind] = func_covariances
        self.means = updated_means
        self.covariances = updated_covariances
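Both embedding routines above rely on toposort_flatten from the toposort package to order types so that a function type's argument and return types are always processed before the function type itself. A standalone illustration:

from toposort import toposort_flatten

# A function type depends on its argument and return types.
depends = {
    'A->B': {'A', 'B'},
    '(A->B)->A': {'A->B', 'A'},
    'A': set(),
    'B': set(),
}
print(toposort_flatten(depends, sort=True))
# ['A', 'B', 'A->B', '(A->B)->A'] -- dependencies always come first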
Example #9
from VectorSpace import *

f = open('data/description_ga.txt','r')


descs = []
while True:
	line = f.readline()
	if line == '':
		break
	descs.append(line)

vs = VectorSpace(descs)
print(vs.related(0))
s = vs.search(["upbeat"])
print(s)

for k in range(len(s)):
	if s[k] > 0.1:
		print(descs[k])

Example #10
    indexList = []
    doc = []
    query = arg.query
    # test data
    yourPath = './EnglishNews'
    # List every file under the path (including directories)
    allFileList = os.listdir(yourPath)
    # Walk through the file list one by one
    for file in allFileList:
        f = open("EnglishNews/" + file, 'r')
        txt = f.read()
        doc.append(txt)
        indexList.append(file[:10])

    time_start = datetime.datetime.now()

    vectorSpace = VectorSpace.VectorSpace(doc)

    searchVector = np.array(vectorSpace.makeVectorforTFidf(query))

    tf_cosine = vectorSpace.searchTFCos(query)
    tf_euclidean = vectorSpace.searchTFdist(query)
    tfidf_cosine = vectorSpace.searchTFidfCos(query)
    tfidf_euclidean = vectorSpace.searchTFidfdist(query)

    top5_tf_cos = sorted(list(zip(indexList, tf_cosine)),
                         reverse=True,
                         key=itemgetter(1))[:5]
    top5_tf_dist = sorted(list(zip(indexList, tf_euclidean)),
                          reverse=False,
                          key=itemgetter(1))[:5]
    top5_tfidf_cos = sorted(list(zip(indexList, tfidf_cosine)),
                            reverse=True,
                            key=itemgetter(1))[:5]
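Examples #4 and #10 both rank documents by tf-idf weights plus cosine similarity via the project's VectorSpace class. A minimal self-contained sketch of that scoring (toy whitespace tokenization and the standard log idf; none of the project's API is assumed):

import math
from collections import Counter

def tfidf(docs):
    # docs: list of token lists -> one {term: tf-idf weight} dict per doc
    n = len(docs)
    df = Counter(t for d in docs for t in set(d))
    return [{t: tf * math.log(n / df[t]) for t, tf in Counter(d).items()}
            for d in docs]

def cosine(u, v):
    dot = sum(w * v.get(t, 0.0) for t, w in u.items())
    nu = math.sqrt(sum(w * w for w in u.values()))
    nv = math.sqrt(sum(w * w for w in v.values()))
    return dot / (nu * nv) if nu and nv else 0.0

docs = [d.split() for d in ["stocks fall on news",
                            "upbeat markets rally",
                            "rainy weather ahead"]]
weights = tfidf(docs)
n, df = len(docs), Counter(t for d in docs for t in set(d))
query = {t: tf * math.log(n / df[t])
         for t, tf in Counter("markets rally".split()).items() if t in df}
print(sorted(range(len(docs)), key=lambda i: cosine(query, weights[i]),
             reverse=True))  # [1, 0, 2] -- the 'upbeat markets rally' doc wins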