/
clean_data.py
61 lines (44 loc) · 1.49 KB
/
clean_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# Import the necessary packages
import pandas as pd
import argparse
import cv2
from utils import classify, describe, train
# Train the Random Forest classifier used to separate photos from other images.
model = train()

# Set up the argument parser and read the path to the metadata dataframe.
ap = argparse.ArgumentParser()
ap.add_argument("-f", "--file", required=True, help="Path to the pandas dataframe with Instagram metadata.")
args = vars(ap.parse_args())
df_file = args["file"]

# Load the pickled dataframe of Instagram metadata.
df = pd.read_pickle(df_file)
# Parenthesized single-argument print works identically on Python 2 and 3.
print("*** Loading Instagram metadata from {} ...".format(df_file))

# Collect indices of rows classified as 'other' and drop them after the loop:
# re-filtering the dataframe while iterating it is fragile and error-prone.
drop_indices = []
for index, row in df.iterrows():
    # Images are expected under test_output/, named by the 'Filename' column.
    ipath = "test_output/" + row['Filename']
    image = cv2.imread(ipath)
    # cv2.imread returns None (it does not raise) for a missing or unreadable
    # file; skip such rows so describe() does not crash on None input.
    if image is None:
        print("*** Skipping {} ... could not read image.".format(ipath))
        continue
    # Extract features and classify the image with the trained model.
    features = describe(image)
    prediction = classify(features, model)
    print("*** Classifying {} ... prediction: {}".format(ipath, prediction))
    # Sort the image into the matching output directory; rows classified as
    # 'other' are also scheduled for removal from the dataframe.
    if prediction == 'photo':
        cv2.imwrite("test_output/photos/%s" % row['Filename'], image)
    elif prediction == 'other':
        drop_indices.append(index)
        cv2.imwrite("test_output/others/%s" % row['Filename'], image)

# Remove the 'other' rows, then renumber the remaining rows starting from 1.
df = df.drop(drop_indices)
df = df.reset_index(drop=True)
df.index += 1
print("*** Updating dataframe index ...")

# Pickle the cleaned dataframe for downstream use.
new_df_file = "test_output/cleaned.pkl"
print("*** Saving the cleaned dataframe into {}".format(new_df_file))
df.to_pickle(new_df_file)
print("*** ... Done.")