/
6140projectcomputervision.py
576 lines (445 loc) · 17.6 KB
/
6140projectcomputervision.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
# -*- coding: utf-8 -*-
"""6140projectComputerVision.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1POZ8zamhX-J1kJb-Dw2qCS97Dcdp65vV
**Pattern Recognition in Daily Top Trending YouTube Videos**
---
**Load Data From Google Drive**
"""
import pandas as pd
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive; the data and
# image paths used further down in this file all live under that mount point.
drive.mount('/content/drive')
"""**Read-in CSV**"""
# CSV with the US trending-video records, stored on the mounted Drive.
united_states_video_path = "/content/drive/My Drive/cs6140 project/data/USvideos.csv"
# Load the whole CSV into a DataFrame.
videos = pd.read_csv(united_states_video_path)
# Preview of the first rows: shown as the cell output in a notebook, the
# return value is simply discarded when this runs as a plain script.
videos.head()
"""## **Hypotheses**
Sub-Experiment #2: **Video Thumbnail Image Analysis**:
- Given a video, use web scraping to get thumbnails from given links:
- perform clustering on thumbnail images
- perform analysis of clusters with number of views, comments and likes
- Come up with a Hypothesis and then test on some new data.
## **Thumbnail Image Analysis**
# **1 - Scrape site and download images**
# Access the Thumbnail links and download Images.
```
import requests
import os
import argparse
path = "images" # relative path of folder to store the file images.
total = 0 # keep track of number of images downloaded.
## Get urls from Pandas data frame.
for index, row in videos.iterrows():
url = row['thumbnail_link'] # get the thumbnail url. the thumbnails aren't unique
## Get image from url and store in data.
try:
# try to download the image
r = requests.get(url, timeout=60)
#print(r.status_code)
# save the image to disk
p = os.path.join(path, str(index) + ".jpg")
# create file pointer and specify path, then write to file in binary format ("wb")
f = open(p, "wb")
f.write(r.content)
f.close()
# update the counter
total += 1
print(str(total) + " [INFO] downloaded: {}".format(p))
# handle if any exceptions are thrown during the download process
except:
print("[INFO] error downloading {}...skipping".format(p))
# check if all images were taken
assert(total == 40949), "All links had valid image files" # assert returns true hence all links are valid.
```
# Verify that all images can be opened (No bad images)
```
import cv2
from imutils import paths
badFiles = []
# loop over the image paths we just downloaded
for imagePath in list(paths.list_images('/images')):
# initialize if the image should be deleted or not
delete = False
# try to load the image
try:
image = cv2.imread(imagePath)
# if the image is `None` then we could not properly load it
# from disk, so delete it
if image is None:
delete = True
# if OpenCV cannot load the image then the image is likely
# corrupt so we should delete it
except:
print("Except")
delete = True
# check to see if the image should be deleted
if delete:
badFiles.append(imagePath)
print(badFiles) # list returns empty hence all image files can be opened.
```
## Remove bad images from dataset (Pre-processing)
```
def remove_duplicates(dir):
for filename in os.listdir(dir):
file_path = os.path.join(dir, filename)
bad_image = os.path.join(os.getcwd(), "bad.jpg") # store bad image.
img = cv2.imread(filename, 1)
if open(file_path, "rb").read() == open(bad_image, "rb").read(): # check if two images are the same.
os.remove(file_path)
print("removed : ", filename)
remove_duplicates("images/")
```
Removing bad images reduced the data set size from 6455 to 6082.
# **2 - Load images and Perform Unsupervised Image clustering**
Start with Feature extraction and then move to Dimensionality reduction (from m to p space)
* Color-based
* Texture-based
* Shape-based
* Deep Methods
Next, use K means to cluster images accordingly.
## 1. **Color-based**
"""
# Commented out IPython magic to ensure Python compatibility.
import requests
import os
import argparse
import matplotlib.pyplot as plt
import cv2
from imutils import paths
import numpy as np
from tqdm import tqdm
from keras.preprocessing import image
from PIL import Image
from mpl_toolkits.mplot3d import Axes3D
import pylab as pl
import glob
from sklearn.cluster import KMeans
# %matplotlib inline
from sklearn import datasets
import matplotlib.image as mpimg
import random
from numpy import array
from keras.applications.vgg19 import VGG19
from keras.applications.vgg19 import preprocess_input
from keras.models import Model
from keras.preprocessing.image import load_img, img_to_array
from keras.applications import imagenet_utils
# Folder on the mounted Drive that holds the downloaded thumbnail images.
path = "/content/drive/My Drive/cs6140 project/images/"
files = glob.glob (path + "*.jpg") # every .jpg file directly inside `path`
# Cap on the number of images the loops below look at (they break once their
# counter equals n_images). Set it to e.g. 1001 to work on a subset; with
# len(files) + 1 the counters never reach it, so every file is used.
#n_images = 1001
n_images = len(files) + 1
"""## **Feature 1: RGB Scatter plot**
**Displaying a few images to have a picture of the dataset**
"""
# Centre the input for the network by subtracting fixed per-channel offsets.
def process(image):
    """Subtract fixed offsets from channels 0, 1 and 2 of ``image``.

    The offsets 103.939, 116.779 and 123.68 are subtracted from index 0, 1
    and 2 of the last axis respectively. The array is modified in place and
    the very same object is returned.

    NOTE(review): these values look like the ImageNet channel means in BGR
    order -- confirm that the caller's channel order matches.
    NOTE: a later ``def process(img, filters)`` in this file rebinds the
    name ``process``; this version is only reachable before that point.
    """
    channel_offsets = (103.939, 116.779, 123.68)
    for channel, offset in enumerate(channel_offsets):
        image[..., channel] -= offset
    return image
# Load every thumbnail at 224 x 224, preview the first nine in a 3 x 3 grid
# and stack the preprocessed arrays into one batch for the network.
fig = plt.figure(figsize=(12, 9))
count = 1  # 1-based position in `files`; doubles as the preview subplot slot
print(len(files))
images = []  # one array with a leading batch axis per successfully loaded file
preprocess = imagenet_utils.preprocess_input
for file_path in files:
    try:
        # NOTE: the loop variable `image` shadows the `image` module imported
        # from keras.preprocessing at the top of the file.
        image = load_img(file_path, target_size=(224, 224))
        if count < 10:
            plt.subplot(3, 3, count)
            plt.title(os.path.basename(file_path))  # file name only
            plt.imshow(image)
            plt.axis('off')
        # Optional cap on the number of images kept (see `n_images`).
        if count == n_images:
            break
        image = img_to_array(image)  # convert image pixels to a numpy array
        image = np.expand_dims(image, axis=0)  # add the leading batch axis
        image = process(image)  # subtract the fixed per-channel offsets
        images.append(image)
    except Exception as exc:
        # Best effort: an unreadable file must not stop the run, but say which
        # one was dropped instead of swallowing the error silently. Catching
        # Exception (not a bare except) lets Ctrl-C still interrupt the loop.
        print("[INFO] skipping {}: {}".format(file_path, exc))
    # Advance once per file, whether it was kept or skipped.
    count += 1

# Join the per-file arrays along the batch axis.
images = np.concatenate(images, axis=0)
print(images.shape)
fig.tight_layout()
"""***Find mean of all pixels in Image and store images in array.***"""
# Per-image mean of the red, green and blue channels, one entry per readable
# file. Files that raise OSError are skipped, so these lists can be shorter
# than `files`.
red, green, blue = [], [], []
count = 0
for file in files:
    if count == n_images:
        break
    count += 1
    try:
        # The context manager closes the file handle as soon as the pixels
        # are read. convert('RGB') guarantees three channels, so grayscale,
        # palette or CMYK files no longer crash or report the wrong channels.
        with Image.open(file, 'r') as img:
            pixels = np.asarray(img.convert('RGB'))
    except OSError:
        continue
    # Mean over all pixels, one value per channel.
    channel_means = pixels.reshape(-1, 3).mean(axis=0)
    red.append(float(channel_means[0]))
    green.append(float(channel_means[1]))
    blue.append(float(channel_means[2]))

# 3-D scatter plot of the mean colours, one point per image.
fig = plt.figure(None, figsize=(7, 7))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(red, green, blue, c='b', marker='o', alpha=0.2)
ax.set_xlabel('Red Label')
ax.set_ylabel('Green Label')
ax.set_zlabel('Blue Label')
plt.title("Visualize", fontsize=14)
plt.show()
"""### Perform K Means clustering with this extracted feature"""
# KMeans clustering on the per-image mean colour.
num_clusters = 2
# One [r, g, b] row per image.
X = [[r, g, b] for r, g, b in zip(red, green, blue)]
km = KMeans(n_clusters=num_clusters)
km.fit(X)
# fit() already stores the assignment of every row of X; the extra
# km.predict(X) pass whose result was thrown away has been dropped.
labels = km.labels_

# Plotting
fig = plt.figure(1, figsize=(7, 7))
# Create the 3-D axes through the figure: constructing Axes3D(fig, ...)
# directly no longer attaches the axes to the figure in current Matplotlib,
# which would leave the plot empty.
ax = fig.add_axes([0, 0, 0.95, 1], projection='3d')
ax.view_init(elev=15, azim=134)
# The builtin float replaces the np.float alias, which newer NumPy releases
# no longer provide.
ax.scatter(red, green, blue,
           c=labels.astype(float), edgecolor="k", s=50, alpha=.5)
ax.set_xlabel("Red")
ax.set_ylabel("Green")
ax.set_zlabel("Blue")
plt.title("K Means", fontsize=14)
"""#### From applying K means with 4 clusters, we can't really differentiate the data points clearly. Hence, we need more features. We examine the performance of the feature space with the ***Silhouette Coefficient*** in Sci-kit learn."""
from sklearn import metrics
# Silhouette score of the current labelling of X with Euclidean distances.
# The value is the cell output in a notebook; as a script it is discarded.
metrics.silhouette_score(X, labels, metric='euclidean')
"""## **Next, is we try to identify the best number of clusters for our dataset.**
### Internal cluster validation with Silhouette coefficient.
where a higher Silhouette Coefficient score relates to a model with better defined clusters. The Silhouette Coefficient is defined for each sample and is composed of two scores:
1. The mean distance between a sample and all other points in the same class.
2. The mean distance between a sample and all other points in the next nearest cluster.
- The score is bounded between -1 for incorrect clustering and +1 for highly dense clustering. Scores around zero indicate overlapping clusters.
- The score is higher when clusters are dense and well separated, which relates to a standard concept of a cluster.
"""
# Experiment with 2 to 10 clusters on the mean-colour features and record the
# silhouette score of each clustering.
cluster_scores = {}  # number of clusters -> silhouette score
for num_of_clusters in range(2, 11):
    km = KMeans(n_clusters=num_of_clusters)
    km.fit(X)
    # labels_ is filled in by fit(); no separate predict() pass is needed.
    labels = km.labels_
    score = metrics.silhouette_score(X, labels, metric='euclidean')
    cluster_scores[num_of_clusters] = score

# Plot graph.
scores = sorted(cluster_scores.items())
# Unpack the (k, score) pairs into two parallel tuples. A separate name is
# used so the float `score` from the loop is not rebound to a tuple.
clusters, silhouettes = zip(*scores)
plt.figure(figsize=(6, 6))  # set the size of plot.
plt.plot(clusters, silhouettes)
plt.title('Number of Clusters against score')
plt.ylabel('silhouette score')
plt.xlabel('num of clusters')
plt.show()
"""### We realize that the optimam number of clusters with the ***mean RGB values*** as features is 2.
## **Feature 2 - Texture-based**
**- Apply Gabor Filter to image as from [link text](https://github.com/Shikhargupta/computer-vision-techniques/blob/master/GaborFilter/gabor.py)**
The extracted features will then be fed into a classifier (neural network, SVM classifier etc.) which then further trains the classifier.
From the Link:
- The input image is convolved with each of the filters. So, now we have 40 different images (response matrices) corresponding to one input.
- Now, we have to extract features from each image. Several features could be used but here we have used 2:
- Local Energy - Summing up the squared values of each element of response matrix.
- Mean Amplitude - Summing up the absolute value of each element of the response matrix.
- Hence for every response matrix we have 2 values - its local energy and mean amplitude. 2 separate matrices are formed for each value and are appended in the corresponding matrix for each response matrix. So now, we have 2 matrices of 1x40 size corresponding to each input image. Append one matrix to the other. This single matrix acts as the feature vector for the input image.
"""
# Gabor filter bank covering several orientations (theta) and scales (nu).
def build_filters():
    """Return a list of 9 x 9 float32 Gabor kernels.

    One kernel is created for every (theta, nu) pair, theta being the outer
    loop: theta steps from 0 up to (not including) pi in increments of pi/8,
    nu from 0 up to (not including) 6*pi/4 in increments of pi/4. Each kernel
    is divided in place by 1.5 times the sum of its own entries.

    NOTE(review): that is nominally 8 x 6 = 48 kernels, while the prose in
    this file talks about 40 -- confirm which is intended.
    NOTE(review): `nu` is passed as the 4th positional argument of
    cv2.getGaborKernel, which per the OpenCV docs is the wavelength; the
    first value of nu is 0 -- confirm those kernels are finite.
    """
    kernel_size = 9
    orientations = np.arange(0, np.pi, np.pi / 8)
    nu_values = np.arange(0, 6*np.pi/4 , np.pi / 4)
    bank = []
    for theta in orientations:
        for nu in nu_values:
            kernel = cv2.getGaborKernel(
                (kernel_size, kernel_size), 1.0, theta, nu, 0.5, 0,
                ktype=cv2.CV_32F,
            )
            # In-place scaling keeps the kernel's float32 dtype.
            kernel /= 1.5 * kernel.sum()
            bank.append(kernel)
    return bank
# Convolve the image with each kernel and keep the strongest response.
def process(img, filters):
    """Return the element-wise maximum of ``img`` filtered by every kernel.

    ``filters`` is iterated, and each item is handed to cv2.filter2D as the
    kernel. The result starts as zeros with the shape and dtype of ``img``
    and is updated in place with the running maximum.

    NOTE: this definition rebinds the name ``process``, replacing the
    one-argument ``process(image)`` defined earlier in this file.
    """
    strongest = np.zeros_like(img)
    for kernel in filters:
        response = cv2.filter2D(img, cv2.CV_8UC3, kernel)
        # Third argument is the output array: update the maximum in place.
        np.maximum(strongest, response, strongest)
    return strongest
# Texture features. For every thumbnail: the local energy (sum of squares)
# and the mean amplitude (sum of absolute values) of its response to each of
# the first `n_filters` Gabor kernels, energies first, amplitudes second.
n_filters = 40  # kernels used per image -> 2 * 40 = 80 features
# The bank does not depend on the image, so build it once instead of per file.
filters = build_filters()
count = 0
feature_vectors = []
for file in files:
    if count == n_images:
        break
    count += 1
    try:
        # Read the image and convert it to a single gray channel to reduce
        # dimensionality (the conversion flag treats the data as BGR).
        imgg = cv2.imread(file)
        imgg = cv2.cvtColor(imgg, cv2.COLOR_BGR2GRAY)
        energies = []
        amplitudes = []
        for kern in filters[:n_filters]:
            # process() iterates over a *sequence* of kernels. Passing one
            # bare 2-D kernel made it convolve with the kernel's rows one by
            # one, so the single kernel is wrapped in a list.
            # The response has the image's dtype; widen to int64 so the sums
            # below cannot wrap around in a narrow integer type.
            res = process(imgg, [kern]).astype(np.int64)
            # Sum over the whole response matrix rather than a hard-coded
            # 90 x 120 window; the response is computed once for both features.
            energies.append(int(np.sum(res * res)))
            amplitudes.append(int(np.sum(np.abs(res))))
        # Feature vector of the image: 40 energies followed by 40 amplitudes.
        feature_vectors.append(energies + amplitudes)
    except Exception as exc:
        # Best effort: skip files that cannot be read or filtered, but report
        # them. Catching Exception (not a bare except) keeps Ctrl-C working.
        print("[INFO] skipping {}: {}".format(file, exc))
        continue
"""### Perform K means clustering with new features vector (80 x 1)"""
# KMeans on the Gabor texture feature vectors.
num_clusters = 2
km = KMeans(n_clusters=num_clusters)
km.fit(feature_vectors)
# fit() already stores the cluster assignment; the discarded
# km.predict(feature_vectors) pass has been dropped.
labels = km.labels_
# Silhouette score of this labelling: the cell output in a notebook,
# discarded when run as a script.
metrics.silhouette_score(feature_vectors, labels, metric='euclidean')
# Experiment with 2 to 10 clusters on the texture features and record the
# silhouette score of each clustering.
cluster_scores = {}  # number of clusters -> silhouette score
for num_of_clusters in range(2, 11):
    km = KMeans(n_clusters=num_of_clusters)
    km.fit(feature_vectors)
    # labels_ is filled in by fit(); no separate predict() pass is needed.
    labels = km.labels_
    score = metrics.silhouette_score(feature_vectors, labels, metric='euclidean')
    cluster_scores[num_of_clusters] = score

# Plot graph.
scores = sorted(cluster_scores.items())
# Unpack the (k, score) pairs into two parallel tuples. A separate name is
# used so the float `score` from the loop is not rebound to a tuple.
clusters, silhouettes = zip(*scores)
plt.figure(figsize=(6, 6))  # set the size of plot.
plt.plot(clusters, silhouettes)
plt.title('Number of Clusters against score')
plt.ylabel('silhouette score')
plt.xlabel('num of clusters')
plt.show()
"""### Compare different clustering validation methods against number of clusters.
## **3 - Feed features vector to an Artificial Neural Network such as SOM(Self-Organizing map) to reduce the dimensionality of the feature vector and then perform clustering**.
Used [Neupy Tutorials](https://github.com/itdxer/neupy/blob/master/notebooks/Looking%20inside%20of%20the%20VGG19%20using%20SOFM.ipynb) for reference
"""
# Commented out IPython magic to ensure Python compatibility: the `!` shell
# escape only works inside a notebook cell and is a SyntaxError in a .py file.
# Install the package from a shell (or re-enable this line in a notebook).
# !pip install neupy
"""## Initializing VGG19 Architecture"""
from neupy import architectures, storage
# Build the VGG19 network object; its parameters are loaded from disk below.
vgg19 = architectures.vgg19()
vgg19  # bare expression: shows the network as the cell output in a notebook
"""## Loading pre-trained parameters from ImageNet"""
# Load the stored parameters from the HDF5 file on the mounted Drive into `vgg19`.
storage.load(vgg19, "/content/drive/My Drive/cs6140 project/vgg/vgg19.hdf5")
"""## Propagating images through network"""
# Feature extractor: the network cut at the layer named 'dense_2'.
dense_2 = vgg19.end('dense_2')
batch_size = 16
print(images.shape)
# Push the images through the network in slices of `batch_size`, with tqdm
# showing the progress, and collect the output of every slice.
batch_starts = range(0, len(images), batch_size)
outputs = [
    dense_2.predict(images[start:start + batch_size])
    for start in tqdm(batch_starts)
]
# Join the per-batch outputs back into one array, one row per image.
dense_2_output = np.concatenate(outputs, axis=0)
dense_2_output.shape
"""## Train the SOM Neural nework on VGG19 Network."""
# Optional data normalization of the texture features (disabled).
#feature_vectors = np.apply_along_axis(lambda x: x/np.linalg.norm(x), 1, feature_vectors)
from neupy import algorithms, utils
# NOTE(review): presumably fixes the random seeds so repeated runs give the
# same map -- confirm against the neupy documentation.
utils.reproducible()
# Self-organizing feature map trained on the dense_2 outputs computed above.
sofm = algorithms.SOFM(
# Alternative input: the texture feature vectors (disabled).
#n_inputs = len(feature_vectors[0]),
# Input size = number of columns of the network output.
n_inputs = dense_2_output.shape[1],
# Feature map grid is 2 dimensions and has
# 900 output clusters (30 * 30).
features_grid=(30, 30),
# Closest neuron (winning neuron) is measured
# using cosine similarity
distance='cos',
# Sample weights from the data.
# Every weight vector will be just a sample
# from the input data. In this way we can
# ensure that the initialized map covers the data
# from the very beginning.
weight='sample_from_data',
# Defines the radius within which we consider nearby
# neurons as neighbours of the
# winning neuron
learning_radius=5,
# A large radius is efficient only for the first
# iterations, that's why we reduce it by 1
# every 5 epochs.
reduce_radius_after=5,
# The further the neighbour neuron is from the winning
# neuron, the smaller its learning rate. How much
# smaller is controlled by the `std`
# parameter. The smaller `std`, the smaller the learning
# rate for neighbouring neurons.
std = 0.1,
# NOTE(review): presumably `std` is reduced every 5 epochs, mirroring the
# radius and step schedules -- the original comment was left unfinished.
reduce_std_after = 5,
# Learning rate
step = 0.001,
# Learning rate is going to be reduced every 5 epochs
reduce_step_after=5,
# Shows training progress in terminal
verbose=True,
# Shuffle data samples before every training epoch
shuffle_data=True,
)
# Train the map on the network features for 10 epochs.
sofm.train(dense_2_output, epochs=10)
"""**Visualizing Self-Organizing Features map**"""
# Commented out notebook shell command to ensure Python compatibility: a bare
# `pip install ...` line is not valid Python and stops a .py file from being
# parsed. Install the pinned version from a shell instead.
# pip install scipy==1.1.0
# Undo the mean subtraction applied before feeding images to the network, so
# that the images can be displayed again.
def deprocess(image):
    """Add the fixed per-channel offsets back onto ``image``.

    The offsets 103.939, 116.779 and 123.68 are added to index 0, 1 and 2 of
    the last axis respectively. No channel reordering is performed. The
    array is modified in place and the very same object is returned.
    """
    channel_offsets = (103.939, 116.779, 123.68)
    for channel, offset in enumerate(channel_offsets):
        image[..., channel] += offset
    return image
"""## Code to draw the U-Matrix"""
# NOTE(review): a `from __future__` import is only accepted as the first
# statement of a module (or of a notebook cell). At this position in a plain
# .py file Python raises a SyntaxError. On Python 3 `/` is already true
# division, so the import has no effect there.
from __future__ import division
#from scipy.misc import imread
import matplotlib.gridspec as gridspec
def draw_grid(sofm, images, output_features):
    """Draw one representative image per map neuron on a grid figure.

    Every row of ``output_features`` is assigned to a neuron via
    ``sofm.predict(...).argmax(axis=1)``. For each cell of
    ``sofm.features_grid`` the first image assigned to that neuron is shown
    after ``deprocess``; cells without any image are drawn black.

    Args:
        sofm: trained map providing ``predict()`` and ``features_grid``
            (rows, columns).
        images: array of preprocessed images, indexed along axis 0 in the
            same order as ``output_features``.
        output_features: feature rows handed to ``sofm.predict``.

    Returns:
        The sample drawn in the last grid cell, or None for an empty grid.
    """
    data = images
    # Index of the winning neuron for every input row.
    clusters = sofm.predict(output_features).argmax(axis=1)
    grid_height, grid_width = sofm.features_grid

    plt.figure(figsize=(40, 40))
    grid = gridspec.GridSpec(grid_height, grid_width)
    grid.update(wspace=0, hspace=0)

    sample = None  # stays None only when the grid has no cells
    for row_id in range(grid_height):
        # Rows done so far over the number of rows (was divided by the
        # column count, which is only right for square grids).
        print("Progress: {:.2%}".format(row_id / grid_height))
        for col_id in range(grid_width):
            # Row-major flat index: the stride of a row is the number of
            # columns. Using the height here only worked for square grids.
            # NOTE(review): assumes the map numbers its neurons in the same
            # row-major order -- confirm against the neupy documentation.
            index = row_id * grid_width + col_id
            # Boolean-mask indexing returns a copy, so deprocess() below
            # does not modify the caller's `images`.
            clustered_samples = data[clusters == index]
            if len(clustered_samples) > 0:
                # We take the first sample, but it can be any
                # sample from this cluster (random or the one
                # that is closer to the center)
                sample = deprocess(clustered_samples[0])
            else:
                # If we don't have samples in the cluster then
                # it means that there is a gap in space; draw a black
                # image with the same shape as one input image.
                sample = np.zeros(data.shape[1:])
            plt.subplot(grid[index])
            plt.imshow(sample.astype('uint8'))
            plt.axis('off')
    print("Progress: 100%")
    return sample
# Render the map for the network features computed above; `sample` receives
# the value returned by draw_grid (the sample drawn in the last grid cell).
sample = draw_grid(sofm, images, dense_2_output)