Example #1
 def __init__(self, train_dir, test_dir, clusters, batch_sz, step_sz, dense):
     '''The initialization of the class.

     NOTE(review): the parameter order follows the order in which this
     docstring and the body mention the names -- confirm against callers.

     :param train_dir: passed through to ``BagOfWords.__init__``
     :param test_dir: passed through to ``BagOfWords.__init__``
     :param int clusters: the number of clusters
     :param int batch_sz: the batch size for minibatch kmeans training
     :param int step_sz: the length of the gap between two sift descriptors
     :param boolean dense: If dense is True, dense sift descriptors will be used.
     '''
     BagOfWords.__init__(self, train_dir, test_dir)
     self.clusters = clusters
     self.step_sz = step_sz
     self.batch_sz = batch_sz
     self.dense = dense
     self.scaler = StandardScaler()
     self.image_h = ImageHelper()
     self.file_h = FileHelper()
     # Both models start out unset.
     self.kmeans_model = None
     self.clf = None
     print(
         'Current parameters:\n n_clusters: %s, batch_size: %s, step_size %s, dense: %s'
         % (self.clusters, self.batch_sz, self.step_sz, self.dense))
 def __init__(self, train_dir, resize_resolution, step_size=6):
     '''The initialization of the class.

     :param train_dir: the directory of the training set
     :param tuple resize_resolution: the resolution of the resized image
     :param int step_size: the length of the gap between two sift descriptors
         (stored as ``self.step_sz``)
     '''
     self.train_dir = train_dir
     self.resize_r = resize_resolution
     self.image_h = ImageHelper()
     self.file_h = FileHelper()
     self.step_sz = step_size
Example #3
    def __init__(self, train_dir, test_dir, normalize, K_range):
        '''Initialization of the class.

        :param str train_dir: the directory of the training set;
        :param str test_dir: the directory of the test set;
        :param boolean normalize: If set to True, the image will be made to have zero mean and unit length.
        :param int K_range: the range of K values that will be used in KNN training.
        '''
        self.train_dir = train_dir
        self.test_dir = test_dir
        self.normalize = normalize
        self.K_range = K_range
        self.image_h = ImageHelper()
        self.file_h = FileHelper()
Example #4
    def search_commoncrawl(self):
        '''Scan one gzip-compressed index archive and save the ``.nl`` base URLs.

        The archive is streamed in 2048-byte chunks and decompressed
        incrementally. Every complete line is split on spaces, the part
        after the second space is parsed as JSON, and ``get_base_url`` is
        applied to the record. Distinct URLs ending in ``.nl`` are written,
        sorted, to ``urls.txt`` via ``FileHelper.write_file``.

        :raises ValueError: when a complete line does not carry valid JSON.
        '''
        record_list = set()

        for j in range(1):
            unconsumed_text = ''
            filename = 'cdx-%05d.gz' % 260
            cc_url = BASEURL + INDEXURL + filename

            print("Trying archive %s" % cc_url)
            # CsvHelper.write_index(cc_url)

            response = requests.get(cc_url, stream=True)
            decompressor = zlib.decompressobj(16 + zlib.MAX_WBITS)

            i = 0
            for chunk in response.iter_content(chunk_size=2048):
                i += 1
                if i % 20000 == 0:
                    print("Iteration: %s" % i)
                if len(decompressor.unused_data) > 0:
                    # restart decompressor if end of a chunk
                    to_decompress = decompressor.unused_data + chunk
                    decompressor = zlib.decompressobj(16 + zlib.MAX_WBITS)
                else:
                    to_decompress = decompressor.unconsumed_tail + chunk
                # NOTE(review): on Python 3 decompress() returns bytes, which
                # cannot be concatenated with the str buffer -- confirm the
                # target interpreter or decode here.
                s = unconsumed_text + decompressor.decompress(to_decompress)
                unconsumed_text = ''

                for l in s.split('\n'):
                    pieces = l.split(' ')
                    if len(pieces) < 3 or l[-1] != '}':
                        # Incomplete line: keep it and prepend it to the
                        # next decompressed chunk.
                        unconsumed_text = l
                        continue

                    json_string = ' '.join(pieces[2:])
                    try:
                        rec = json.loads(json_string)
                    except ValueError:
                        # Re-raise instead of `assert False`, which is
                        # stripped under -O and hides the real error.
                        print('JSON load failed: ')
                        raise
                    url = get_base_url(rec)

                    if url.endswith('.nl') and url not in record_list:
                        record_list.add(url)

            print("Done searching, found %d urls" % len(record_list))
            FileHelper.write_file('urls.txt', sorted(record_list))
            print("Done writing to file")
Example #5
    def download_found(self):
        '''Queue every URL read from ``urls.txt`` and then iterate over ``self.threads``.

        NOTE(review): the statements that create the threads and the body
        of the final loop are not visible in this snippet -- restore them
        from the original source before use.
        '''
        # Put all the found urls into a queue for the threads to read from
        self.queue = Queue()
        # The list built here is discarded; only the put() side effect matters.
        [self.queue.put(url) for url in FileHelper.read_file('urls.txt')]

        # Create the threads and wait for them to finish

        for t in self.threads:
Example #6
 def __init__(self, train_dir, test_dir, n_c=600, batch_sz=5000, patch_size=[6, 6], stride=4):
     '''The initialization of the class.

     :param train_dir: passed through to ``BagOfWords.__init__``
     :param test_dir: passed through to ``BagOfWords.__init__``
     :param int n_c: the number of clusters
     :param int batch_sz: the batch size for minibatch kmeans training
     :param list patch_size: the dimensions of the patch
     :param int stride: the length of the gap between the start of one patch and the start of the next consecutive patch
     '''
     BagOfWords.__init__(self, train_dir, test_dir)
     self.n_clusters = n_c
     self.batch_size = batch_sz
     # Copy the list: the default argument is one shared object, so storing
     # it directly would let one instance's mutation leak into the others.
     self.patch_size = list(patch_size)
     self.stride = stride
     self.image_h = ImageHelper()
     self.file_h = FileHelper()
     # Both models start out unset.
     self.kmeans_model = None
     self.clf = None
     print('Current parameters:\n n_clusters: %s, batch_size: %s, patch_size %s, stride: %s'
         % (self.n_clusters, self.batch_size, self.patch_size, self.stride))
def delete_client(name=None):
	"""Remove every client named *name* from ``data.txt``.

	When *name* is empty or omitted, the name is read from the console
	instead. The file is rewritten only when at least one client matched,
	and one confirmation line is printed per removed client.

	:param name: name of the client to remove, or None to prompt for it.
	"""
	fileHelper = FileHelper()

	data = fileHelper.openReadOnlyJSONFileASObject('data.txt')

	clients = data['clients']

	# Prompt only when the caller did not supply a name.
	target = name if name else raw_input("Write the client account owner: ")

	removed = [client for client in clients if client['name'] == target]
	if not removed:
		return

	# Rebuild the list in place rather than deleting while iterating, which
	# would skip the element following each removed one.
	clients[:] = [client for client in clients if client['name'] != target]
	fileHelper.writeJSONDataToAFile('data.txt', data)

	for _ in removed:
		print('Removed client: ' + target)
def delete_account(name=None, account=None):
	"""Remove the *account* entry from the client named *name* in ``data.txt``.

	Account entries are strings whose part before the first ``:`` is the
	account name. When either argument is missing, both values are read
	from the console and a confirmation line is printed after the removal.

	NOTE(review): "delete" is implemented as dropping the entry from the
	client's list -- confirm that this is the intended effect.

	:param name: owner of the account, or None to prompt.
	:param account: account name to remove, or None to prompt.
	"""
	fileHelper = FileHelper()

	data = fileHelper.openReadOnlyJSONFileASObject('data.txt')

	clients = data['clients']

	interactive = not (name and account)
	if interactive:
		owner = raw_input("Write the client account owner: ")
		target = raw_input("Write the account name: ")
	else:
		owner, target = name, account

	for client in clients:
		if client['name'] != owner:
			continue

		accounts = client['accounts']
		# Filter into a new list; deleting by index inside a range() loop
		# would run past the end of the shortened list.
		kept = [entry for entry in accounts if entry.split(':')[0] != target]
		if len(kept) == len(accounts):
			continue

		client['accounts'] = kept
		fileHelper.writeJSONDataToAFile('data.txt', data)
		if interactive:
			print('Deleted account: ' + target)
def _client_exists(clients, name):
	"""Return True when any entry of *clients* has exactly this name."""
	return any(client['name'] == name for client in clients)


def create_account_group(name=None, accounts=None, console=True):
	"""Add a client with its accounts to ``data.txt`` unless it already exists.

	With *name* given, the client is created from the arguments. Otherwise,
	when *console* is true, the client name and one account per network are
	read from the console. ``data.txt`` is rewritten whenever a client name
	was provided, whether or not a new client was added.

	:param name: client name; when empty the console flow is used instead.
	:param accounts: list of ``'network:value'`` strings for the new client;
		None is stored as an empty list.
	:param console: allow prompting when *name* is not given.
	:return: True when no client with that name existed beforehand, False
		when it did, None when neither *name* nor *console* is set.
	"""
	fileHelper = FileHelper()

	data = fileHelper.openReadOnlyJSONFileASObject('data.txt')

	exist = False

	if name:
		clients = data['clients']
		exist = _client_exists(clients, name)

		if not exist:
			# Store a list even when no accounts were passed, because
			# add_account calls len() on this field.
			clients.append({'name': name, 'accounts': accounts if accounts is not None else []})

		fileHelper.writeJSONDataToAFile('data.txt', data)

		return not exist

	if console:
		clientName = raw_input('Insert the clien name: ')

		networks = ['twitter', 'youtube', 'googleplus', 'linkedin', 'facebook', 'pinterest']
		prompted = [
			network + ':' + raw_input('Insert the %s account (left blank if you dont have one): ' % network)
			for network in networks
		]

		if clientName:
			clients = data['clients']
			# Compare against the prompted name; `name` is always falsy in
			# this branch, so checking it never detects a duplicate.
			exist = _client_exists(clients, clientName)

			if not exist:
				clients.append({'name': clientName, 'accounts': prompted})

			fileHelper.writeJSONDataToAFile('data.txt', data)

		return not exist

	return None
Example #10
 def __init__(self):
     '''Create the work queue, the classification-name mapping and empty worker state.'''
     # No classifier and no threads yet.
     self.clf = None
     self.threads = list()
     self.queue = Queue()
     # Names are supplied by FileHelper.
     self.mapping = FileHelper.get_classification_names()
Example #11
class ImageClassifierKNN:
    '''K-nearest-neighbours classifier working on tiny-image feature vectors.

    The methods read ``self.train_dir``, ``self.test_dir``,
    ``self.normalize``, ``self.K_range``, ``self.image_h`` and
    ``self.file_h``; these must be set on the instance before use.
    '''

    def preprocess_train_dataset(self):
        '''Preprocess the training set.

        :return: a dataframe which contains the features and the corresponding label.
        :rtype: DataFrame
        '''
        # get the images and their labels from the training set
        img_dict = self.file_h.get_all_files(self.train_dir, 'train')
        dataset = []
        for label, img_list in img_dict.items():
            for img in img_list:
                # get the tiny image feature vector
                vector = self.image_h.tiny_image(img, self.normalize)
                # the label becomes the last column of the row
                dataset.append(np.append(vector, label))
        return DataFrame(dataset)

    def preprocess_test_dataset(self):
        '''Preprocess the test set.

        :return: A tuple containing the features of the testing set
                and corresponding image names.
        :rtype: tuple
        '''
        # get the images and their filenames from the testing set
        img_list, name_list = self.file_h.get_all_files(self.test_dir, 'test')
        feature_set = [self.image_h.tiny_image(img, self.normalize) for img in img_list]
        return feature_set, name_list

    def train(self, k_range):
        '''Use GridSearchCV to train models and find the best model.

        :param int k_range: exclusive upper bound of the K values to try;
            the search covers ``1 .. k_range - 1``.
        '''
        print('preprocessing training dataset...')
        df = self.preprocess_train_dataset()
        # get the labels of the images
        label = df.iloc[:, -1]
        # get the features of the images
        dataset = df.iloc[:, :-1]
        # specify the range of K values for KNN
        K_values = list(range(1, k_range))
        # prepare the parameters for GridSearchCV
        params = dict(n_neighbors=K_values)
        knn = KNeighborsClassifier()
        clf = GridSearchCV(knn, params, cv=10, scoring='accuracy', refit=True)
        print('The training process begins...')
        clf.fit(dataset, label)
        print('The best model found!')
        best_k = clf.best_params_['n_neighbors']
        best_score = clf.best_score_
        # save the best model for later use in testing set
        self.best_model = clf.best_estimator_
        print('optimal k value:', best_k)
        print('best score:', best_score)

    def test(self):
        '''Predict the labels of the test set using the best model based on GridSearchCV
        and output the run1.txt
        '''
        test_features, images = self.preprocess_test_dataset()
        labels_predicted = self.best_model.predict(test_features)
        with open('run1.txt', 'w') as f:
            for image, label in zip(images, labels_predicted):
                # %-formatting also copes with labels that are not plain str
                f.write('%s %s\n' % (image, label))

    def main(self):
        '''Run the routines for the run1 task'''
        # Use the training dataset to train and find the optimal K value.
        self.train(self.K_range)
        # Predict the labels of the test set with the best model
        # and output the result.
        self.test()
Example #12
 def __init__(self, url, content, path):
     '''Keep the URL and its content, create its result holder and load the word list.

     :param url: stored as ``self.url`` and passed to ``UrlResult``
     :param content: stored as ``self.content``
     :param path: handed to ``FileHelper.read_file`` to obtain ``self.words``
     '''
     self.url, self.content = url, content
     result_holder = UrlResult(url)
     word_list = FileHelper.read_file(path)
     self.result = result_holder
     self.words = word_list
Example #13
class RunTwo(BagOfWords):
    '''Bag-of-visual-words classifier built on normalized pixel patches.

    Reads ``self.patch_size``, ``self.stride``, ``self.n_clusters``,
    ``self.batch_size``, ``self.image_h``, ``self.file_h`` and
    ``self.train_dir``, and calls ``BOVW_training_set`` /
    ``BOVW_testing_set``, none of which are defined in this class.
    '''

    def preprocess_training_set(self):
        '''Find all the patches of the images in the training set.

        Extract patches and normalize them.

        :return: one row per patch, each scaled along axis 1.
        '''
        # get the images and their labels from the training set
        img_dict = self.file_h.get_all_files(self.train_dir, 'train')
        all_patches = []
        for label, img_list in img_dict.items():
            for img in img_list:
                patches = self.image_h.extract_patches(img, self.patch_size, self.stride)
                all_patches.append(patches)
        # Stack arrays of patches in sequence vertically
        all_patches = np.vstack(all_patches)
        print('The shape of all patches: ', all_patches.shape)
        # mean-centring and normalizing each patch
        normalized = preprocessing.scale(np.float64(all_patches), axis=1)
        return normalized

    def generate_BOVW(self, image):
        '''Convert an image to the presentation of Bag of Visual Words.

        :param numpy.ndarray image: the given image
        :return: array of length ``self.n_clusters`` counting how many
            patches were assigned to each cluster.
        '''
        patches = self.image_h.extract_patches(image, self.patch_size, self.stride)
        normalized = preprocessing.scale(np.float64(patches), axis=1)
        words = self.kmeans_model.predict(normalized)
        # histogram of cluster indices, padded to the vocabulary size
        return np.bincount(words, minlength=self.n_clusters)

    def train(self):
        '''The training process for run2.

        1. Use MiniBatchKmeans to learn a vocabulary.
        2. Convert the training set using the presentation of Bag of Visual Words.
        3. Build a linear classifier and measure its performance.
        '''
        print('Training starts...')
        training_patches = self.preprocess_training_set()
        print('Start kmeans training...')
        # save the trained kmeans model for the use of testing set
        self.kmeans_model = MiniBatchKMeans(
            n_clusters=self.n_clusters, random_state=0,
            batch_size=self.batch_size, compute_labels=False,
        ).fit(training_patches)
        del training_patches  # release the memory used
        print('Kmeans training completed.')
        train_data = self.BOVW_training_set()
        print('Start building a classifier...')
        labels = train_data.iloc[:, -1]  # get the labels of the images
        features = train_data.iloc[:, :-1]  # get the features of the images
        clf = LogisticRegression(random_state=0, multi_class='ovr', solver='sag', n_jobs=-1)
        # measure the performance of the model with cross validation
        cv_score = cross_val_score(clf, features, labels, cv=10)
        print('Average score of 10-fold cross validation for LR: %.2f' % np.mean(cv_score))
        # feed the model with all the training set
        clf.fit(features, labels)
        # save the trained model for predicting testing set
        self.clf = clf

    def test(self):
        '''Predict the labels of the test set using the classifier that has been trained
        and output the run2.txt
        '''
        test_data, image_names = self.BOVW_testing_set()
        labels_predicted = self.clf.predict(test_data)
        with open('run2.txt', 'w') as f:
            for image, label in zip(image_names, labels_predicted):
                # %-formatting also copes with labels that are not plain str
                f.write('%s %s\n' % (image, label))

    def main(self):
        '''Train the models, then label the test set.'''
        self.train()
        self.test()
	def __init__(self, filePath):
		"""Store *filePath* on the instance as ``self.filePath``."""
		self.filePath = filePath
Example #15
class RunThree(BagOfWords):
    '''Our second method to explore the performance of run3.

    Sift features + bag of visual words + classifier(naive bayes or non-linear svm).
    '''

    def preprocess_training_set(self):
        '''Find all the sift descriptors of the images in the training set.

        :return: the stacked descriptors after scaling with ``self.scaler``,
            which is fitted on them here.
        '''
        # get a dictionary containing images and corresponding labels
        img_dict = self.file_h.get_all_files(self.train_dir, 'train')
        all_des = []
        for label, img_list in img_dict.items():
            for img in img_list:
                des = self.image_h.gen_sift_des(img, self.dense, self.step_sz)
                all_des.append(des)
        # Stack arrays of sift descriptors in sequence vertically
        all_des = np.vstack(all_des)
        print('The shape of all sift descriptors:', all_des.shape)
        # fit all descriptors with a scaler
        self.scaler.fit(all_des)
        return self.scaler.transform(all_des)

    def generate_BOVW(self, image):
        '''generate bovw for one image

        :return: array of length ``self.clusters`` counting how many
            descriptors were assigned to each cluster.
        '''
        # compute the sift features and normalize them
        dsift = self.image_h.gen_sift_des(image, self.dense, self.step_sz)
        # scale the features with the scaler that was trained with the training set
        dsift = self.scaler.transform(dsift)
        words = self.kmeans_model.predict(dsift)
        # histogram of cluster indices, padded to the vocabulary size
        return np.bincount(words, minlength=self.clusters)

    def measure_clf(self, clf, X, y, fold=10):
        '''Measure the performance of a classifier with 10-fold cross validation'''
        cv_score = cross_val_score(clf, X, y, cv=fold)
        print('Accuracy: %.2f' % (np.mean(cv_score)))

    def train(self):
        '''Learn the visual vocabulary, then fit and score an SVM on the training set.'''
        # get sift descriptors of all images
        des = self.preprocess_training_set()
        print('Start kmeans training...')
        # NOTE(review): only n_clusters and batch_size are set here; confirm
        # whether further MiniBatchKMeans options are intended.
        self.kmeans_model = MiniBatchKMeans(
            n_clusters=self.clusters, batch_size=self.batch_sz,
        ).fit(des)
        print('Kmeans training completed.')
        # convert the training set to Bag of Visual Words
        df = self.BOVW_training_set()
        labels = df.iloc[:, -1]  # get the labels of the images
        features = df.iloc[:, :-1]  # get the features of the images
        # clf = MultinomialNB()
        # self.measure_clf(clf, features, labels)
        # clf = GaussianNB()
        # self.measure_clf(clf, features, labels)
        # clf = BernoulliNB()
        # self.measure_clf(clf, features, labels)

        # Score a separate estimator on a hold-out split. Refitting the
        # estimator stored in self.clf would leave the saved model trained
        # on the split only, because fit() returns the same object.
        # NOTE(review): default split proportions are used -- confirm the
        # intended hold-out size.
        X_train, X_test, y_train, y_test = train_test_split(features, labels)
        eval_clf = SVC(kernel='poly')
        eval_clf.fit(X_train, y_train)
        score = eval_clf.score(X_test, y_test)
        print('score: ', score)

        # the model kept for the test set is trained on all the data
        self.clf = SVC(kernel='poly').fit(features, labels)

    def test(self):
        '''Predict the labels of the test set using the classifier that has been trained
        and output the run3.txt
        '''
        test_data, image_names = self.BOVW_testing_set()
        labels_predicted = self.clf.predict(test_data)
        with open('run3.txt', 'w') as f:
            for image, label in zip(image_names, labels_predicted):
                # %-formatting also copes with labels that are not plain str
                f.write('%s %s\n' % (image, label))

    def main(self):
        '''Train the models, then label the test set.'''
        self.train()
        self.test()
def _store_account(client, account, dataAccount):
	"""Overwrite the client's *account* entry, or append it when absent."""
	entry = account + ':' + dataAccount
	cuentas = client['accounts']
	for idx, existing in enumerate(cuentas):
		if existing.split(':')[0] == account:
			# Replace the entry that matched, not whichever index a
			# finished loop happens to end on.
			cuentas[idx] = entry
			return
	cuentas.append(entry)


def add_account(name=None, account=None, dataAccount=None):
	"""Add or update one account of a client in ``data.txt``.

	Entries are stored as ``'<account>:<data>'`` strings. When *name* or
	*account* is missing, all three values are read from the console.
	The file is rewritten once per client whose name matches.

	:param name: owner of the account, or None to prompt.
	:param account: account name (the part before ``:``), or None to prompt.
	:param dataAccount: value stored after the ``:``.
	"""
	fileHelper = FileHelper()

	data = fileHelper.openReadOnlyJSONFileASObject('data.txt')

	clients = data['clients']

	interactive = not (name and account)
	if interactive:
		name = raw_input("Write the client account owner: ")
		account = raw_input("Write the account name: ")
		# Use the prompted value; the arguments are None in this mode.
		dataAccount = raw_input("Write the account data: ")

	for client in clients:
		if client['name'] != name:
			continue

		was_empty = len(client['accounts']) == 0
		_store_account(client, account, dataAccount)
		fileHelper.writeJSONDataToAFile('data.txt', data)

		if was_empty:
			print('Account added **')
		elif interactive:
			print('Account added')
		else:
			print('Account added ' + name + ' ' + account + ' ' + dataAccount)
Example #17
    def __init__(self, sitename, file):
        '''Queue every URL read from *file* and create the result holder for *sitename*.

        :param sitename: passed to ``UrlResult``
        :param file: handed to ``FileHelper.read_file``; each item it yields
            is put on ``self.queue``
        '''
        self.queue = Queue()
        # A plain loop: a list comprehension here would only build a
        # throwaway list of None values.
        for url in FileHelper.read_file(file):
            self.queue.put(url)

        self.result = UrlResult(sitename)
Example #18
class RunThree:
    '''Our first method to explore the performance of run3.

    Use dense sift and a classifier(naive bayes or non-linear svm).

    The performance is not as good as that of the model which we used in the first method
    so there is not a method which generates a result of the testing set here.
    '''

    def preprocess_training_set(self):
        '''Compute one flattened sift feature vector per training image.

        :return: the sift descriptors and labels of all the images in the training set
        '''
        # get a dictionary containing images and corresponding labels
        img_dict = self.file_h.get_all_files(self.train_dir, 'train')
        all_des = []
        all_labels = []
        for label, img_list in img_dict.items():
            for img in img_list:
                crop_img = self.image_h.crop_square(img)
                image = cv2.resize(crop_img, self.resize_r)
                # NOTE(review): dense=True is assumed from the class
                # description ("dense sift") -- confirm.
                des = self.image_h.gen_sift_des(image, True, self.step_sz)
                # flatten the sift features into one vector for every image
                all_des.append(des.flatten())
                all_labels.append(label)
        return all_des, all_labels

    # @timeit
    # def train_naive_bayes(self, X, y):
    #     clf = MultinomialNB()
    #     cv = cross_val_score(clf, X, y, cv=10)
    #     print(cv)
    #     print(np.mean(cv))

    def train_svm(self, X, y):
        '''Fit a polynomial-kernel SVM on one split and score it on the rest.

        The training of svm takes much more time than naive bayes,
        so we used a part of the training set to evaluate the model
        instead of using cross validation.

        :return: the score on the held-out part.
        '''
        # NOTE(review): default split proportions are used -- confirm the
        # intended hold-out size.
        X_train, X_test, y_train, y_test = train_test_split(X, y)
        clf = SVC(kernel='poly')
        clf.fit(X_train, y_train)
        s = clf.score(X_test, y_test)
        # report the score instead of silently discarding it
        print('score: ', s)
        return s

    def main(self):
        '''Build the feature set and evaluate the svm classifier on it.'''
        dataset, labels = self.preprocess_training_set()
        # use svm classifier
        self.train_svm(dataset, labels)