forked from ibalazevic/SMS_guru_text_mining
-
Notifications
You must be signed in to change notification settings - Fork 0
/
extract_features_train_test.py
140 lines (131 loc) · 6.45 KB
/
extract_features_train_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import numpy as np
import unicodecsv
from nltk.corpus import stopwords
import re
def extract_features_train_test(qfile_test='question_test.csv', qfile_train='question_train.csv', catfile='category.csv',
                                subcats=True, outfile='features_train_test.npz'):
    """
    extract the numerical (bag-of-words presence) features from text documents
    qfile_test - .csv file containing the test data
    qfile_train - .csv file containing the train data
    catfile - .csv file containing the categories
    subcats - Boolean parameter which decides whether to perform the analysis for categories or subcategories
    outfile - .npz file to store the output arrays, dictionary and list

    the following entries are written to outfile:
    features_train - array of shape (number of features, number of valid train questions), 1 where the word occurs
    features_test - array of shape (number of features, number of valid test questions), 1 where the word occurs
    featurenames - list of words, one per row of the feature arrays
    categoryids_train - array of shape (1, number of valid train questions) with the label of each train question
    categories - dictionary mapping (sub)category id to its description
    """
    # read from the .csv files
    question_train, question_test, category = read_files(qfile_train, qfile_test, catfile)
    # create a dictionary with the category names
    categories, par_sub_relation = create_categories_dict(category, subcats)

    # preprocess both sets; the first row of each file is skipped as the header,
    # the feature names are collected by preprocess from the train questions
    featurenames = set()
    valid_questions_train, featurenames = preprocess(question_train[1:], featurenames, 'train')
    valid_questions_test, featurenames = preprocess(question_test[1:], featurenames, 'test')
    # sorted so that the row order of the feature arrays does not depend on set iteration order
    featurenames = sorted(featurenames)

    # build the label array and the list of train questions (each question is a list of words);
    # the column is taken from enumerate instead of list.index, which was quadratic and
    # picked the wrong column whenever two rows compared equal
    categoryids_train = np.zeros((1, len(valid_questions_train)))
    questions_train = []
    for column, row in enumerate(valid_questions_train):
        if subcats:
            category_id = row[3]
        else:
            # map the subcategory id stored with the question to its parent category id
            category_id = par_sub_relation[int(row[3])]
        categoryids_train[:, column] = category_id
        questions_train.append(row[4])
    questions_test = [row[4] for row in valid_questions_test]

    # make the features matrix where each column is a data point and each row stands for one word from the
    # featurenames list -> the entry is 1 if the word appears in that question and 0 if it doesn't
    features_train = np.zeros((len(featurenames), len(valid_questions_train)))
    features_test = np.zeros((len(featurenames), len(valid_questions_test)))
    feature_row = dict((word, index) for index, word in enumerate(featurenames))
    for questions, features in ((questions_train, features_train), (questions_test, features_test)):
        for column, words in enumerate(questions):
            for word in words:
                index = feature_row.get(word)
                # words that are not a feature (e.g. words only seen in the test set) are ignored
                if index is not None:
                    features[index, column] = 1.

    # save the arrays to the output file
    np.savez(outfile, features_train=features_train, features_test=features_test, featurenames=featurenames,
             categoryids_train=categoryids_train, categories=categories)
def read_files(qfile_train, qfile_test, catfile):
    """
    load the three .csv files completely into memory
    qfile_train - .csv file containing the SMS Guru questions for the train set
    qfile_test - .csv file containing the SMS Guru questions for the test set
    catfile - .csv file containing the categories
    return:
    question_train, question_test, category - one list of rows per file, in this order
    """
    tables = []
    # the files are read one after another in the order train, test, categories
    for path in (qfile_train, qfile_test, catfile):
        with open(path, 'rb') as handle:
            rows = unicodecsv.reader(handle, delimiter=",", quoting=unicodecsv.QUOTE_ALL,
                                     escapechar="\\", encoding='utf-8')
            tables.append(list(rows))
    question_train, question_test, category = tables
    return question_train, question_test, category
def create_categories_dict(category, subcats):
    """
    build the lookup tables for the categories
    category - list of rows from the category.csv file (row 0 is skipped as the header);
               each row is used as [id, parent id, description]
    subcats - Boolean parameter which decides whether to perform the analysis for categories or subcategories
    return:
    categories - dictionary mapping (sub)category id to its description
    par_sub_relation - dictionary mapping every subcategory id to the id of its parent category
    """
    categories = {}
    par_sub_relation = {}
    rows = category[1:]
    for row in rows:
        # rows with parent id 0 are not subcategories, they only show up as somebody's parent
        if int(row[1]) == 0:
            continue
        child_id = int(row[0])
        child_description = row[2]
        # the parent is the first row whose id text equals this row's parent id text
        parent_candidates = [(int(other[0]), other[2]) for other in rows if other[0] == row[1]]
        parent_id, parent_description = parent_candidates[0]
        par_sub_relation[child_id] = parent_id
        # depending on subcats either the subcategory or its parent category is recorded
        if subcats:
            categories[child_id] = child_description
        else:
            categories[parent_id] = parent_description
    return categories, par_sub_relation
def preprocess(question, featurenames, name):
    """
    preprocess the questions: strip non-letters, remove german stop words and lowercase the words
    question - list of question rows (without the header row); row[3] is the category id,
               which is 'N' when the question has no category, and row[4] is the question text
    featurenames - set of feature names; the words of the train questions are added to it in place
    name - string containing 'train' or 'test', depending on what file we're currently preprocessing
    return:
    valid_questions - list of the rows that still contain words; row[4] of each kept row is
                      replaced in place by its list of words
    featurenames - the set that was passed in
    """
    # make a set with german stop-words; the original code decoded every entry and so assumed
    # byte strings, here only byte strings are decoded so that text strings work as well
    stop_words = set(w.decode('utf-8') if isinstance(w, bytes) else w
                     for w in stopwords.words('german'))
    non_letters = re.compile(r'[^a-zA-Z ]')
    # this used to compare the literal 'name' with 'train', which is never true,
    # so the questions without a category were never removed
    is_train = name == 'train'
    valid_questions = []
    for row in question:
        # check if the question in the train data is valid (i.e. has a category)
        if is_train and row[3] == 'N':
            continue
        # replace everything that is not an ASCII letter or a space and split the question into words
        words = non_letters.sub(' ', row[4]).split()
        # NOTE(review): the stop words are matched before lowercasing, so a capitalised stop word
        # is kept if the stop word list is all lowercase - confirm whether this is intended
        words = [w.lower() for w in words if w not in stop_words]
        # save the words as features if they're from the training set (because feature selection
        # is only being done on the training set)
        if is_train:
            featurenames.update(words)
        # if the text of the question is not empty, append the question to the list of valid questions
        if words:
            row[4] = words
            valid_questions.append(row)
    return valid_questions, featurenames
if __name__ == "__main__":
    # run the extraction on the default files, labelling the questions with their parent category
    extract_features_train_test(subcats=False)