def search_commoncrawl(self):
    record_list = set()
    for j in range(1):  # only one index shard is scanned here (cdx-00260)
        unconsumed_text = ''
        filename = 'cdx-%05d.gz' % 260
        cc_url = BASEURL + INDEXURL + filename
        print("Trying archive %s" % cc_url)
        # CsvHelper.write_index(cc_url)
        response = requests.get(cc_url, stream=True)
        decompressor = zlib.decompressobj(16 + zlib.MAX_WBITS)
        i = 0
        for chunk in response.iter_content(chunk_size=2048):
            i += 1
            if i % 20000 == 0:
                print("Iteration: %s" % i)
            if len(decompressor.unused_data) > 0:
                # a gzip member ended inside the previous chunk:
                # restart the decompressor on the leftover bytes
                to_decompress = decompressor.unused_data + chunk
                decompressor = zlib.decompressobj(16 + zlib.MAX_WBITS)
            else:
                to_decompress = decompressor.unconsumed_tail + chunk
            s = unconsumed_text + decompressor.decompress(
                to_decompress).decode('utf-8')
            unconsumed_text = ''
            for l in s.split('\n'):
                pieces = l.split(' ')
                if len(pieces) < 3 or l[-1] != '}':
                    # incomplete record: keep it for the next chunk
                    unconsumed_text = l
                else:
                    json_string = ' '.join(pieces[2:])
                    try:
                        rec = json.loads(json_string)
                        url = get_base_url(rec)
                        if url.endswith('.nl') and url not in record_list:
                            print(url)
                            record_list.add(url)
                    except ValueError:
                        print('JSON load failed: %s' % json_string)
                        raise
    print("Done searching, found %d urls" % len(record_list))
    FileHelper.write_file('urls.txt', sorted(record_list))
    print("Done writing to file")
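# A minimal, self-contained sketch of the streaming-gzip pattern used above:
# a Common Crawl cdx shard is a series of concatenated gzip members, so the
# zlib decompressor has to be restarted whenever unused_data signals that a
# member ended inside the current chunk. The data here is synthetic and the
# 10-byte chunks stand in for response.iter_content().
import gzip
import zlib

# build a fake "shard" of two concatenated gzip members
stream = gzip.compress(b'first member\n') + gzip.compress(b'second member\n')

decompressor = zlib.decompressobj(16 + zlib.MAX_WBITS)
output = b''
for i in range(0, len(stream), 10):
    chunk = stream[i:i + 10]
    if len(decompressor.unused_data) > 0:
        # previous member finished mid-chunk: restart on the leftover bytes
        chunk = decompressor.unused_data + chunk
        decompressor = zlib.decompressobj(16 + zlib.MAX_WBITS)
    output += decompressor.decompress(decompressor.unconsumed_tail + chunk)
print(output.decode('utf-8'))  # prints both members' contents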
def download_found(self):
    # Put all the found urls into a queue for the threads to read from
    self.queue = Queue()
    for url in FileHelper.read_file('urls.txt'):
        self.queue.put(url)
    # Create the threads and wait for them to finish
    self.create_threads()
    for t in self.threads:
        t.join()
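# create_threads is not shown in this snippet; the following is a minimal
# sketch of a compatible worker-pool, assuming each worker drains self.queue
# until it is empty. The Downloader class name and the fetch() helper are
# illustrative only, not part of the original project.
import threading
from queue import Queue, Empty

class Downloader:
    def __init__(self, urls):
        self.queue = Queue()
        for url in urls:
            self.queue.put(url)
        self.threads = []

    def fetch(self, url):
        print('downloading %s' % url)  # placeholder for the real download

    def _worker(self):
        while True:
            try:
                url = self.queue.get_nowait()
            except Empty:
                return  # queue drained; join() in the caller will return
            self.fetch(url)

    def create_threads(self, n_threads=8):
        for _ in range(n_threads):
            t = threading.Thread(target=self._worker)
            t.start()
            self.threads.append(t)

d = Downloader(['http://example.nl', 'http://example2.nl'])
d.create_threads()
for t in d.threads:
    t.join()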
def delete_client(name=None):
    fileHelper = FileHelper()
    data = fileHelper.openReadOnlyJSONFileASObject('data.txt')
    clients = data['clients']
    if not name:
        name = raw_input("Write the client account owner: ")
    for client in clients:
        if client['name'] == name:
            clients.remove(client)
            fileHelper.writeJSONDataToAFile('data.txt', data)
            print 'Removed client: ' + name
            break
def delete_account(name=None, account=None):
    fileHelper = FileHelper()
    data = fileHelper.openReadOnlyJSONFileASObject('data.txt')
    clients = data['clients']
    if not (name and account):
        name = raw_input("Write the client account owner: ")
        account = raw_input("Write the account name: ")
    for client in clients:
        if client['name'] == name:
            accounts = client['accounts']
            for ac in range(len(accounts)):
                # accounts are stored as 'accountname:data' strings
                if accounts[ac].split(':')[0] == account:
                    accounts.pop(ac)
                    fileHelper.writeJSONDataToAFile('data.txt', data)
                    print 'Deleted account: ' + account
                    break
            break
def create_account_group(name=None, accounts=None, console=True):
    fileHelper = FileHelper()
    data = fileHelper.openReadOnlyJSONFileASObject('data.txt')
    clients = data['clients']
    exist = False
    if not name and console:
        name = raw_input('Insert the client name: ')
        # each account is stored as a single 'service:handle' string
        accounts = [
            'twitter:' + raw_input('Insert the twitter account (leave blank if you do not have one): '),
            'youtube:' + raw_input('Insert the youtube account (leave blank if you do not have one): '),
            'googleplus:' + raw_input('Insert the googleplus account (leave blank if you do not have one): '),
            'linkedin:' + raw_input('Insert the linkedin account (leave blank if you do not have one): '),
            'facebook:' + raw_input('Insert the facebook account (leave blank if you do not have one): '),
            'pinterest:' + raw_input('Insert the pinterest account (leave blank if you do not have one): '),
        ]
    if name:
        for client in clients:
            # fixed: the console branch previously compared against the
            # unset `name` parameter instead of the entered client name
            if client['name'] == name:
                exist = True
                break
        if not exist:
            clients.append({'name': name, 'accounts': accounts})
            fileHelper.writeJSONDataToAFile('data.txt', data)
        return not exist
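# A minimal sketch of the JSON layout that data.txt appears to use, inferred
# from the CRUD helpers above; the client name and handles are made up for
# illustration. Each account is stored as one 'service:data' string.
example_data = {
    'clients': [
        {
            'name': 'Acme Corp',
            'accounts': [
                'twitter:acmecorp',
                'youtube:acmechannel',
            ],
        },
    ],
}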
def __init__(self):
    self.mapping = FileHelper.get_classification_names()
    self.queue = Queue()
    self.threads = []
    self.clf = None
class ImageClassifierKNN:

    def __init__(self, train_dir, test_dir, normalize, K_range):
        '''Initialization of the class.

        :param str train_dir: the directory of the training set
        :param str test_dir: the directory of the test set
        :param boolean normalize: If set to True, the image will be made to
            have zero mean and unit length.
        :param int K_range: the range of K values that will be used in KNN
            training.
        '''
        self.train_dir = train_dir
        self.test_dir = test_dir
        self.normalize = normalize
        self.K_range = K_range
        self.image_h = ImageHelper()
        self.file_h = FileHelper()

    def preprocess_train_dataset(self):
        '''Preprocess the training set.

        :return: a dataframe which contains the features and the
            corresponding label.
        :rtype: DataFrame
        '''
        # get the images and their labels from the training set
        img_dict = self.file_h.get_all_files(self.train_dir, 'train')
        dataset = []
        for label, img_list in img_dict.items():
            for img in img_list:
                # get the tiny image feature vector
                vector = self.image_h.tiny_image(img, self.normalize)
                row = np.append(vector, label)
                dataset.append(row)
        df_training = DataFrame(dataset)
        return df_training

    def preprocess_test_dataset(self):
        '''Preprocess the test set.

        :return: A tuple containing the features of the test set and the
            corresponding image names.
        :rtype: tuple
        '''
        feature_set = []
        # get the images and their filenames from the test set
        img_list, name_list = self.file_h.get_all_files(self.test_dir, 'test')
        for img in img_list:
            vector = self.image_h.tiny_image(img, self.normalize)
            feature_set.append(vector)
        return feature_set, name_list

    def train(self, k_range):
        '''Use GridSearchCV to train models and find the best model.'''
        print('preprocessing training dataset...')
        df = self.preprocess_train_dataset()
        # get the labels of the images
        label = df.iloc[:, -1]
        # get the features of the images
        dataset = df.iloc[:, :-1]
        # specify the range of K values for KNN
        K_values = list(range(1, k_range))
        # prepare the parameters for GridSearchCV
        params = dict(n_neighbors=K_values)
        knn = KNeighborsClassifier()
        clf = GridSearchCV(knn, params, cv=10, scoring='accuracy', refit=True)
        print('The training process begins...')
        clf.fit(dataset, label)
        print('The best model was found!')
        best_k = clf.best_params_['n_neighbors']
        best_score = clf.best_score_
        # save the best model for later use on the test set
        self.best_model = clf.best_estimator_
        print('optimal k value:', best_k)
        print('best score:', best_score)

    def test(self):
        '''Predict the labels of the test set using the best model found by
        GridSearchCV and output run1.txt.
        '''
        test_features, images = self.preprocess_test_dataset()
        labels_predicted = self.best_model.predict(test_features)
        with open('run1.txt', 'w') as f:
            for image, label in zip(images, labels_predicted):
                f.write(' '.join([image, label]) + '\n')

    def main(self):
        '''Run the routines for the run1 task.'''
        # Use the training dataset to train and find the optimal K value.
        self.train(self.K_range)
        # Predict the labels of the test set with the best model
        # and output the result.
        self.test()
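# A minimal usage sketch for ImageClassifierKNN. The directory names and the
# K range are assumptions for illustration; the project's ImageHelper and
# FileHelper must be importable for this to run.
if __name__ == '__main__':
    run1 = ImageClassifierKNN(train_dir='training/', test_dir='testing/',
                              normalize=True, K_range=30)
    run1.main()  # trains with GridSearchCV, then writes run1.txt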
def __init__(self, url, content, path):
    self.url = url
    self.content = content
    self.result = UrlResult(url)
    self.words = FileHelper.read_file(path)
class RunTwo(BagOfWords):

    def __init__(self, train_dir, test_dir, n_c=600, batch_sz=5000,
                 patch_size=[6, 6], stride=4):
        '''The initialization of the class.

        :param int n_c: the number of clusters
        :param int batch_sz: the batch size for minibatch kmeans training
        :param list patch_size: the dimensions of the patch
        :param int stride: the length of the gap between the start of one
            patch and the start of the next consecutive patch
        '''
        BagOfWords.__init__(self, train_dir, test_dir)
        self.n_clusters = n_c
        self.batch_size = batch_sz
        self.patch_size = patch_size
        self.stride = stride
        self.image_h = ImageHelper()
        self.file_h = FileHelper()
        self.kmeans_model = None
        self.clf = None
        print('Current parameters:\n n_clusters: %s, batch_size: %s, '
              'patch_size: %s, stride: %s' %
              (self.n_clusters, self.batch_size, self.patch_size, self.stride))

    @timeit
    def preprocess_training_set(self):
        '''Find all the patches of the images in the training set.
        Extract patches and normalize them.
        '''
        # get the images and their labels from the training set
        img_dict = self.file_h.get_all_files(self.train_dir, 'train')
        all_patches = []
        for label, img_list in img_dict.items():
            for img in img_list:
                patches = self.image_h.extract_patches(img, self.patch_size,
                                                       self.stride)
                all_patches.append(patches)
        # Stack arrays of patches in sequence vertically
        all_patches = np.vstack(all_patches)
        print('The shape of all patches: ', all_patches.shape)
        # mean-centre and normalize each patch
        normalized = preprocessing.scale(np.float64(all_patches), axis=1)
        return normalized

    def generate_BOVW(self, image):
        '''Convert an image to its Bag of Visual Words representation.

        :param numpy.ndarray image: the given image
        '''
        patches = self.image_h.extract_patches(image, self.patch_size,
                                               self.stride)
        normalized = preprocessing.scale(np.float64(patches), axis=1)
        words = self.kmeans_model.predict(normalized)
        bovw = np.zeros(self.n_clusters, dtype=int)
        for w in words:
            bovw[w] += 1
        return bovw

    @timeit
    def train(self):
        '''The training process for run2.

        1. Use MiniBatchKMeans to learn a vocabulary.
        2. Convert the training set to the Bag of Visual Words representation.
        3. Build a linear classifier and measure its performance.
        '''
        print('Training starts...')
        training_patches = self.preprocess_training_set()
        print('Start kmeans training...')
        # save the trained kmeans model for use on the test set
        self.kmeans_model = MiniBatchKMeans(n_clusters=self.n_clusters,
                                            random_state=0,
                                            batch_size=self.batch_size,
                                            compute_labels=False)
        self.kmeans_model.fit(training_patches)
        del training_patches  # release the memory used
        print('Kmeans training completed.')
        train_data = self.BOVW_training_set()
        print('Start building a classifier...')
        labels = train_data.iloc[:, -1]  # get the labels of the images
        features = train_data.iloc[:, :-1]  # get the features of the images
        clf = LogisticRegression(random_state=0, multi_class='ovr',
                                 solver='sag', n_jobs=-1)
        # measure the performance of the model with cross validation
        cv_score = cross_val_score(clf, features, labels, cv=10)
        print('Average score of 10-fold cross validation for LR: %.2f' %
              np.mean(cv_score))
        # feed the model with all the training set
        clf.fit(features, labels)
        # save the trained model for predicting the test set
        self.clf = clf

    @timeit
    def test(self):
        '''Predict the labels of the test set using the classifier that has
        been trained and output run2.txt.
        '''
        test_data, image_names = self.BOVW_testing_set()
        labels_predicted = self.clf.predict(test_data)
        with open('run2.txt', 'w') as f:
            for image, label in zip(image_names, labels_predicted):
                f.write(' '.join([image, label]) + '\n')

    def main(self):
        self.train()
        self.test()
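# A minimal usage sketch for RunTwo. The directory names are assumptions for
# illustration; BagOfWords (the base class) and the helpers are defined
# elsewhere in this project.
if __name__ == '__main__':
    run2 = RunTwo(train_dir='training/', test_dir='testing/',
                  n_c=600, batch_sz=5000, patch_size=[6, 6], stride=4)
    run2.main()  # learns the vocabulary, trains the classifier, writes run2.txt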
def __init__(self, filePath):
    self.filePath = filePath
    FileHelper.makedirsifnotexists(self.filePath)
class RunThree(BagOfWords):
    '''Our second method to explore the performance of run3.
    SIFT features + bag of visual words + classifier (naive bayes or
    non-linear SVM).
    '''

    def __init__(self, train_dir, test_dir, clusters=600, step_sz=5,
                 batch_sz=2000, dense=True):
        '''The initialization of the class.

        :param int clusters: the number of clusters
        :param int batch_sz: the batch size for minibatch kmeans training
        :param int step_sz: the length of the gap between two sift descriptors
        :param boolean dense: If dense is True, dense sift descriptors will
            be used.
        '''
        BagOfWords.__init__(self, train_dir, test_dir)
        self.clusters = clusters
        self.step_sz = step_sz
        self.batch_sz = batch_sz
        self.dense = dense
        self.scaler = StandardScaler()
        self.image_h = ImageHelper()
        self.file_h = FileHelper()
        self.kmeans_model = None
        self.clf = None
        print('Current parameters:\n n_clusters: %s, batch_size: %s, '
              'step_size: %s, dense: %s' %
              (self.clusters, self.batch_sz, self.step_sz, self.dense))

    @timeit
    def preprocess_training_set(self):
        '''Find all the sift descriptors of the images in the training set.'''
        # get a dictionary containing images and corresponding labels
        img_dict = self.file_h.get_all_files(self.train_dir, 'train')
        all_des = []
        for label, img_list in img_dict.items():
            for img in img_list:
                des = self.image_h.gen_sift_des(img, self.dense, self.step_sz)
                all_des.append(des)
        # Stack arrays of sift descriptors in sequence vertically
        all_des = np.vstack(all_des)
        print('The shape of all sift descriptors:', all_des.shape)
        # fit all descriptors with a scaler
        self.scaler.fit(all_des)
        return self.scaler.transform(all_des)

    def generate_BOVW(self, image):
        '''Generate the bag of visual words for one image.'''
        # compute the sift features and normalize them
        dsift = self.image_h.gen_sift_des(image, self.dense, self.step_sz)
        # scale the features with the scaler that was fitted on the training set
        dsift = self.scaler.transform(dsift)
        words = self.kmeans_model.predict(dsift)
        bovw = np.zeros(self.clusters, dtype=int)
        for w in words:
            bovw[w] += 1
        return bovw

    def measure_clf(self, clf, X, y, fold=10):
        '''Measure the performance of a classifier with 10-fold cross
        validation.
        '''
        cv_score = cross_val_score(clf, X, y, cv=fold)
        print('Accuracy: %.2f' % (np.mean(cv_score)))

    @timeit
    def train(self):
        # get sift descriptors of all images
        des = self.preprocess_training_set()
        print('Start kmeans training...')
        self.kmeans_model = MiniBatchKMeans(n_clusters=self.clusters,
                                            random_state=0,
                                            batch_size=self.batch_sz,
                                            compute_labels=False)
        self.kmeans_model.fit(des)
        print('Kmeans training completed.')
        # convert the training set to Bag of Visual Words
        df = self.BOVW_training_set()
        labels = df.iloc[:, -1]  # get the labels of the images
        features = df.iloc[:, :-1]  # get the features of the images
        # clf = MultinomialNB()
        # self.measure_clf(clf, features, labels)
        # clf = GaussianNB()
        # self.measure_clf(clf, features, labels)
        # clf = BernoulliNB()
        # self.measure_clf(clf, features, labels)
        clf = SVC(kernel='poly')
        # evaluate on a hold-out split first...
        X_train, X_test, y_train, y_test = train_test_split(
            features, labels, test_size=0.33, random_state=42)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        print('score: ', score)
        # ...then refit on the full training set and save the model, so the
        # classifier used on the test set has seen all the data (previously
        # the hold-out fit overwrote the full fit)
        self.clf = clf.fit(features, labels)

    @timeit
    def test(self):
        '''Predict the labels of the test set using the classifier that has
        been trained and output run3.txt.
        '''
        test_data, image_names = self.BOVW_testing_set()
        labels_predicted = self.clf.predict(test_data)
        with open('run3.txt', 'w') as f:
            for image, label in zip(image_names, labels_predicted):
                f.write(' '.join([image, label]) + '\n')

    def main(self):
        self.train()
        self.test()
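# A minimal usage sketch for this RunThree variant. The directory names are
# assumptions for illustration; dense=True selects dense SIFT descriptors.
if __name__ == '__main__':
    run3 = RunThree(train_dir='training/', test_dir='testing/',
                    clusters=600, step_sz=5, batch_sz=2000, dense=True)
    run3.main()  # learns the vocabulary, trains the SVM, writes run3.txt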
def add_account(name=None, account=None, dataAccount=None):
    fileHelper = FileHelper()
    data = fileHelper.openReadOnlyJSONFileASObject('data.txt')
    clients = data['clients']
    if not (name and account):
        name = raw_input("Write the client account owner: ")
        account = raw_input("Write the account name: ")
        dataAccount = raw_input("Write the account data: ")
    for client in clients:
        if client['name'] == name:
            cuentas = client['accounts']
            exist = False
            for ac in range(len(cuentas)):
                if cuentas[ac].split(':')[0] == account:
                    exist = True
                    break
            if exist:
                # overwrite the existing entry for this account
                # (fixed: the console branch previously wrote the unset
                # `account`/`dataAccount` parameters instead of the input)
                cuentas[ac] = account + ':' + dataAccount
            else:
                cuentas.append(account + ':' + dataAccount)
            fileHelper.writeJSONDataToAFile('data.txt', data)
            print 'Account added: ' + name + ' ' + account + ' ' + dataAccount
            break
def __init__(self, sitename, file):
    self.queue = Queue()
    for url in FileHelper.read_file(file):
        self.queue.put(url)
    self.result = UrlResult(sitename)
class RunThree:
    '''Our first method to explore the performance of run3.
    Use dense sift and a classifier (naive bayes or non-linear SVM).
    Its performance is not as good as that of our second method, so this
    class has no method that generates results for the test set.
    '''

    def __init__(self, train_dir, resize_resolution, step_size=6):
        '''The initialization of the class.

        :param tuple resize_resolution: the resolution of the resized image
        :param int step_size: the length of the gap between two sift
            descriptors
        '''
        self.train_dir = train_dir
        self.resize_r = resize_resolution
        self.image_h = ImageHelper()
        self.file_h = FileHelper()
        self.step_sz = step_size

    @timeit
    def preprocess_training_set(self):
        '''
        :return: the sift descriptors and labels of all the images in the
            training set
        '''
        # get a dictionary containing images and corresponding labels
        img_dict = self.file_h.get_all_files(self.train_dir, 'train')
        all_des = []
        all_labels = []
        for label, img_list in img_dict.items():
            for img in img_list:
                crop_img = self.image_h.crop_square(img)
                image = cv2.resize(crop_img, self.resize_r)
                des = self.image_h.gen_sift_des(image, dense=True,
                                                step_size=self.step_sz)
                # flatten the sift features into one vector for every image
                all_des.append(des.flatten())
                all_labels.append(label)
        return all_des, all_labels

    # @timeit
    # def train_naive_bayes(self, X, y):
    #     clf = MultinomialNB()
    #     cv = cross_val_score(clf, X, y, cv=10)
    #     print(cv)
    #     print(np.mean(cv))

    @timeit
    def train_svm(self, X, y):
        '''Training an SVM takes much longer than naive bayes, so we
        evaluated the model on a held-out part of the training set instead
        of using cross validation.
        '''
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.33, random_state=42)
        clf = SVC(kernel='poly')
        clf.fit(X_train, y_train)
        s = clf.score(X_test, y_test)
        print(s)

    def main(self):
        dataset, labels = self.preprocess_training_set()
        # use svm classifier
        self.train_svm(dataset, labels)
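# A minimal usage sketch for this earlier RunThree variant. The training
# directory and the (128, 128) resize resolution are assumptions for
# illustration; it only reports a hold-out validation score and writes no
# test output, as noted in the class docstring.
if __name__ == '__main__':
    run3 = RunThree(train_dir='training/', resize_resolution=(128, 128),
                    step_size=6)
    run3.main()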