import os
from operator import itemgetter
from PIL import Image
# VectorSpace is this project's own module; its cosine() compares two images.

class CharacterClassifier:

    DATA_DIRECTORY = "./trainingData/"

    def __init__(self, image):
        self.vectorSpace = VectorSpace(image)
        self.image = image

    def getCharacterProbabilityList(self):
        probabilityDict = {}
        for f in os.listdir(self.DATA_DIRECTORY):
            # Skip anything that is not a subdirectory of training samples
            try:
                os.listdir(self.DATA_DIRECTORY + f)
            except OSError:
                continue
            for x in os.listdir(self.DATA_DIRECTORY + f):
                im = Image.open(self.DATA_DIRECTORY + f + "/" + x)
                cosineSimilarity = self.vectorSpace.cosine(VectorSpace(im))
                # Keep the best similarity seen for this character class
                if f in probabilityDict:
                    probabilityDict[f] = max(probabilityDict[f], cosineSimilarity)
                else:
                    probabilityDict[f] = cosineSimilarity
        # Return the ten best-matching characters, most similar first
        characterProbabilityList = []
        for j, k in sorted(probabilityDict.items(), key=itemgetter(1), reverse=True)[:10]:
            characterProbabilityList.append((j, k))
        return characterProbabilityList
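# A minimal usage sketch for the classifier above. The input path and the
# PIL-based loading are assumptions for illustration; the real entry point
# may differ.
from PIL import Image

im = Image.open("./testImage.png")  # hypothetical input glyph image
classifier = CharacterClassifier(im)
# Each entry pairs a candidate character (a training subdirectory name)
# with its best cosine similarity against that character's samples.
for character, similarity in classifier.getCharacterProbabilityList():
    print(character, similarity)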
def init_embeddings(self):
    #To construct embeddings, we need to topologically sort
    #all types

    #Construct the dictionary of dependencies
    depends = {}
    for kind in self.interpreter_state.application_tables:
        depends[kind] = {kind.arg_type, kind.ret_type}
    #Contains types in dependency order
    sorted_types = toposort_flatten(depends, sort=False)
    for kind in sorted_types:
        if isinstance(kind, VecType):
            #Don't need to derive embeddings for vector types
            #But should give them spaces
            self.spaces[kind] = VectorSpace(kind.n)
            continue
        #Otherwise, must be a function type
        arg_type = kind.arg_type
""" Checks VectorSpace functionality by touching almost all defined functions """ import VectorSpace as vs import sympy as sym import numpy as np x = sym.Symbol('x') vBase = vs.vectors([[1, 2, 0], [0, 1, 3], [1, 0, 1]], 'F3') wBase = vs.vectors([[1, 0, 0], [0, 1, 0], [0, 0, 1]], 'F3') pBase = vs.vectors([1, x, x**2 - 1], 'P2') qBase = vs.vectors([1, x], 'P1') vList = vs.vectors([[0, 0, 0], [1, 1, 0], [0, 0, 0], [2, 1, 0]], 'F3') pList = vs.vectors([0, 1, x, 0, x**2 - 1, 2 * x], 'P2') v = vs.vector([1, 2, 3], 'F3') p = vs.vector(1 + 2 * x + 3 * x**2, 'P2') def f(X): """A Linear Map!""" return np.array([X[0] + X[1] + X[2], 2 * X[1], 2 * X[2]]) def g(p): """ Differential Map!""" return sym.diff(p, x)
# Print a top-5 result list in a fixed "DocID Score" format
def printResult(top5Lst):
    print('{} {}'.format('DocID', 'Score'))
    for each in top5Lst:
        print('{} {}'.format(each[0], each[1]))
    print('')


if __name__ == '__main__':
    # read files from the 'documents' directory
    indexes, contents = ReadFiles.readDocuments()
    # build a vector space model
    vectorspace = VectorSpace.VectorSpace(contents)
    # search a test query
    query = arg.query
    queryVector = np.array(vectorspace.makeTfidfVector(query))
    [tf_cos, tf_dist] = vectorspace.searchTf(query)
    [tfidf_cos, tfidf_dist] = vectorspace.searchTfidf(query)
    # bind indexes to scores, then sort (descending for similarities,
    # ascending for distances)
    top5_tf_cos = sorted(list(zip(indexes, tf_cos)), reverse=True, key=sortByRatings)[:5]
    top5_tf_dist = sorted(list(zip(indexes, tf_dist)), reverse=False, key=sortByRatings)[:5]
    top5_tfidf_cos = sorted(list(zip(indexes, tfidf_cos)), reverse=True, key=sortByRatings)[:5]
    top5_tfidf_dist = sorted(list(zip(indexes, tfidf_dist)), reverse=False, key=sortByRatings)[:5]
    print('Term Frequency Weighting + Cosine Similarity:')
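    # The snippet cuts off after the first heading; presumably the script
    # goes on to print each ranking via printResult, mirroring the heading
    # already printed above (a sketch of that continuation):
    printResult(top5_tf_cos)
    print('Term Frequency Weighting + Euclidean Distance:')
    printResult(top5_tf_dist)
    print('TF-IDF Weighting + Cosine Similarity:')
    printResult(top5_tfidf_cos)
    print('TF-IDF Weighting + Euclidean Distance:')
    printResult(top5_tfidf_dist)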
def main(argv):
    sampler.input_raw_directory = argv[1]
    output_directory = argv[2]
    training_file_name = argv[3]
    # sampler.handle_output_directory(output_directory, clean_directory=False)
    sampler.training_file_path = sampler.input_raw_directory + '/' + training_file_name
    training_file = sampler.read_training_df(sampler.training_file_path)
    # print('Building samples in ratio')
    # samples = build_samples_with_ratio(training_file, 3000, False)
    train_directory = output_directory + '/train'
    test_files = [os.path.basename(x)
                  for x in glob.glob(output_directory + '/test/*.txt')]
    testFileBlobs = build_list_of_docs(output_directory, 'test', test_files)

    # ================================ NAIVE BAYES =================================
    train_naive_bayes(training_file, train_directory)
    # for test_file_name, test_file_blob in testFileBlobs.items():
    for key in testFileBlobs:
        predicted_class = naivebayes.nb_classifier(testFileBlobs[key], classes)
        append_in_testCSV(output_directory, key, predicted_class, 'NB')

    # ================================ VECTOR SPACE ================================
    all_document_list = train_vector_space_model(train_directory)
    # for of_class in ['unsponsored', 'sponsored']:
    #     file_samples = [os.path.basename(x) for x in glob.glob(train_directory + '/' + of_class + '/*.txt')]
    #     all_document_list[of_class] = (build_list_of_docs(train_directory, of_class, file_samples)).values()
    for index, key in enumerate(testFileBlobs):
        print(str(index) + ': Testing file - ' + key)
        predicted_class = VectorSpace.vectorspace_classifier(
            testFileBlobs[key].string, all_document_list)
        append_in_testCSV(output_directory, key, predicted_class, 'VSM')

    # ================================ RANDOM FOREST ===============================
    rfc = randomforest.random_forest_classifier(train_directory, classes)
    test_files = [os.path.basename(x)
                  for x in glob.glob(output_directory + '/test/*.txt')]
    test_directory = output_directory + '/test'
    print('Testing Random Forest')
    for index, file_name in enumerate(test_files):
        print(str(index) + ': Testing file - ' + file_name)
        X_test = randomforest.extract_features(test_directory + '/' + file_name)
        Y_predict = rfc.predict([X_test])
        if Y_predict[0] == 0:
            prediction = 'unsponsored'
        else:
            prediction = 'sponsored'
        append_in_testCSV(output_directory, file_name, prediction, 'RF')

    # ============================ EVALUATING CLASSIFIERS ==========================
    calculate_evaluation_parameters(output_directory, 'NB')
    calculate_evaluation_parameters(output_directory, 'VSM')
    calculate_evaluation_parameters(output_directory, 'RF')
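# A sketch of how main() is presumably invoked, given the argv indexing above
# (argv[1] = raw input directory, argv[2] = output directory, argv[3] =
# training file name). The script name and paths below are placeholders:
#
#   python classify.py ./raw ./output train.csv
#
import sys

if __name__ == '__main__':
    main(sys.argv)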
def build_point_schmear(mean):
    k = mean.shape[0]
    return Schmear(VectorSpace(k), mean, np.zeros(k))
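# A minimal usage sketch for build_point_schmear: it wraps a concrete vector
# as a "point mass" Schmear (zero spread) over a VectorSpace of matching
# dimension. The example mean below is arbitrary.
import numpy as np

mean = np.array([1.0, 2.0, 3.0])
point = build_point_schmear(mean)  # Schmear over VectorSpace(3) with zero spread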
def update_embeddings(self):
    updated_means = {}
    updated_covariances = {}
    kernel_spaces = {}
    #To construct embeddings, we need to topologically sort
    #all types

    #Construct the dictionary of dependencies
    depends = {}
    for kind in self.interpreter_state.application_tables:
        depends[kind] = {kind.arg_type, kind.ret_type}
    #Contains types in dependency order
    sorted_types = toposort_flatten(depends, sort=False)
    for kind in sorted_types:
        if isinstance(kind, VecType):
            #Don't need to derive embeddings for vector types
            #but we should derive their spaces
            self.spaces[kind] = VectorSpace(kind.n)
            continue
        #Otherwise, must be a function type.
        #Derive the embedding for the type
        arg_type = kind.arg_type
        ret_type = kind.ret_type
        space = self.interpreter_state.get_type_space(kind)
        table = self.interpreter_state.get_application_table(kind)
        #Get the argument kernel space if it already exists
        kernel_space = None
        if arg_type in kernel_spaces:
            kernel_space = kernel_spaces[arg_type]
        else:
            #Otherwise, create it from scratch
            X = get_means_array(arg_type)
            scaler = get_rescaling(X)
            kernel_space = GaussianKernelSpace(X, scaler)
            kernel_spaces[arg_type] = kernel_space
        #Get the return space
        ret_space = self.spaces[ret_type]
        #Build the function space
        func_space = KernelSumFunctionSpace(kernel_space, ret_space)
        self.spaces[kind] = func_space
        #Now actually compute the embedding for the function
        func_means = []
        func_covariances = []
        for func_ind in range(len(space.terms)):
            func_ptr = TermPointer(space, func_ind)
            func_row = table.table[func_ind]
            #Obtain the tuple X_kernelized, Y, out_inv_covar_mats
            X_kernelized, Y, out_inv_covar_mats = self.get_func_row_embeddings(
                kernel_space, ret_type, func_row)
            func_prior_covar = func_space.get_prior_covariance()
            func_prior_mean = np.zeros(func_space.get_dimension())
            func_prior_schmear = Schmear(func_space, func_prior_mean, func_prior_covar)
            #Find all applications which yield the func ptr
            #as a result [this will be used to set the prior]
            func_origins = self.interpreter_state.get_term_applications_yielding(func_ptr)
            for term_app in func_origins:
                higher_order_func_ptr = term_app.func_ptr
                higher_order_arg_ptr = term_app.arg_ptr
                higher_order_func_schmear = self.get_schmear_from_ptr(higher_order_func_ptr)
                higher_order_arg_schmear = self.get_schmear_from_ptr(higher_order_arg_ptr)
                prior_contrib = impute_application_schmear(
                    higher_order_func_schmear, higher_order_arg_schmear, func_space)
                #Update the prior with the estimated application result
                func_prior_schmear = func_prior_schmear.combine_fuse(prior_contrib)
            #Obtain the prior mean and the prior covariance for the function
            prior_func_mean = func_prior_schmear.mean
            prior_func_covariance = func_prior_schmear.covar_mat
            _, k = X_kernelized.shape
            _, t = Y.shape
            prior_func_mean = np.reshape(prior_func_mean, (k, t))
            prior_func_covariance = np.reshape(prior_func_covariance, (k, t, k, t))
            #Do the regression
            post_func_mean, post_func_covariance = bayesian_multivariate_lin_reg(
                X_kernelized, Y, out_inv_covar_mats,
                prior_func_mean, prior_func_covariance)
            #Flatten the func mean/covariance arrays
            k, t = post_func_mean.shape
            flat_mean = np.reshape(post_func_mean, k * t)
            flat_covariance = np.reshape(post_func_covariance, (k * t, k * t))
            func_means.append(flat_mean)
            func_covariances.append(flat_covariance)
        func_means = np.stack(func_means)
        func_covariances = np.stack(func_covariances)
        updated_means[kind] = func_means
        updated_covariances[kind] = func_covariances
    self.means = updated_means
    self.covariances = updated_covariances
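# A self-contained illustration of the toposort step above: each function
# type depends on its argument and return types, so toposort_flatten yields
# base vector types before the function types built from them. The string
# type names here are stand-ins for the real VecType/function-type objects.
from toposort import toposort_flatten

depends = {
    'Vec3 -> Vec2': {'Vec3', 'Vec2'},
    '(Vec3 -> Vec2) -> Vec2': {'Vec3 -> Vec2', 'Vec2'},
}
print(toposort_flatten(depends, sort=True))
# ['Vec2', 'Vec3', 'Vec3 -> Vec2', '(Vec3 -> Vec2) -> Vec2']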
from VectorSpace import *

f = open('data/description_ga.txt', 'r')
descs = []
while True:
    line = f.readline()
    if line == '':
        break
    descs.append(line)

vs = VectorSpace(descs)
print(vs.related(0))

s = vs.search(["upbeat"])
print(s)
for k in range(0, len(s)):
    if s[k] > 0.1:
        print(descs[k])
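# Equivalent filtering with zip, assuming search() returns one similarity
# score per document in descs (which is what the loop above implies):
matches = [d for d, score in zip(descs, s) if score > 0.1]
print(matches)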
indexList = []
query = arg.query

# test data
yourPath = './EnglishNews'
# list every file under the given path (including directories)
allFileList = os.listdir(yourPath)
# go through the file list one entry at a time
for file in allFileList:
    f = open("EnglishNews/" + file, 'r')
    txt = f.read()
    doc.append(txt)
    indexList.append(file[:10])

time_start = datetime.datetime.now()
vectorSpace = VectorSpace.VectorSpace(doc)
searchVector = np.array(vectorSpace.makeVectorforTFidf(query))
tf_cosine = vectorSpace.searchTFCos(query)
tf_euclidean = vectorSpace.searchTFdist(query)
tfidf_cosine = vectorSpace.searchTFidfCos(query)
tfidf_euclidean = vectorSpace.searchTFidfdist(query)

top5_tf_cos = sorted(list(zip(indexList, tf_cosine)), reverse=True, key=itemgetter(1))[:5]
top5_tf_dist = sorted(list(zip(indexList, tf_euclidean)), reverse=False, key=itemgetter(1))[:5]
top5_tfidf_cos = sorted(list(zip(indexList, tfidf_cosine)), reverse=True, key=itemgetter(1))[:5]
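# The snippet records time_start but cuts off before the elapsed-time report;
# presumably the measurement ends roughly like this (a sketch):
time_end = datetime.datetime.now()
print('elapsed:', (time_end - time_start).total_seconds(), 'seconds')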