forked from flecox/CsvToArff
/
csv_to_arff.py
197 lines (151 loc) · 6.01 KB
/
csv_to_arff.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import csv
from optparse import OptionParser
from collections import defaultdict
import arff
# ARFF attribute types accepted by the -t/--types command-line option;
# main() compares each requested type against this tuple after upper-casing it.
ARFF_TYPES = ('NUMERIC', 'STRING', 'NOMINAL', 'REAL', 'INTEGER')
#Exceptions
class NotEnoughTypesException(Exception):
    """Raised when a selected column has no matching entry in the type list."""
class AttrMissingException(Exception):
    """Raised with the attribute name when it is absent from the CSV header."""
class NotEnoughAttributesException(Exception):
    """Raised when a data row holds more selected values than declared types."""
def types_callback(option, opt, value, parser):
    """optparse callback that stores a comma-separated value as a list.

    The raw option string ``value`` is split on ``','`` and the resulting
    list is assigned to ``parser.values`` under the attribute named by
    ``option.dest``. ``opt`` (the option string as typed) is not used.
    """
    items = value.split(',')
    destination = option.dest
    setattr(parser.values, destination, items)
def check_selected_attrs(selected_attrs, header):
    """Check that every selected attribute appears in the CSV header.

    ``selected_attrs`` is an iterable of attribute names, or ``None`` / empty
    when no explicit selection was made (nothing to check in that case).
    ``header`` is the list of column names read from the CSV file.

    Raises AttrMissingException, carrying the offending name, for the first
    selected attribute that is not present in ``header``.
    """
    # A falsy selection (None or empty) means "all columns", matching how
    # get_selected_columns_map treats it, so there is nothing to validate.
    for attr in selected_attrs or ():
        if attr not in header:
            raise AttrMissingException(attr)
def get_selected_columns_map(selected_attrs, header):
    """Return one boolean per header column telling whether it is selected.

    When ``selected_attrs`` is empty or ``None`` every column is selected;
    otherwise a column is selected only if its name is in ``selected_attrs``.
    """
    if not selected_attrs:
        # no explicit selection: keep every column
        return [True for _ in header]
    return [name in selected_attrs for name in header]
def init_attributes_list(header, selected_columns, type_list):
    """Build the ARFF attribute list for the selected header columns.

    ``header`` holds the column names, ``selected_columns`` one boolean per
    column, and ``type_list`` one ARFF type name per *selected* column, in
    order.

    Returns a list of ``(name, type)`` tuples. For a NOMINAL column (matched
    case-insensitively) the second element is a new empty list instead of the
    type name.

    Raises NotEnoughTypesException when there are more selected columns than
    entries in ``type_list``.
    """
    attributes = []
    type_index = 0  # position in type_list; advances only on selected columns
    for selected, attr in zip(selected_columns, header):
        if not selected:
            continue
        # Decode only byte strings: Python 2's csv module yields them, while
        # Python 3 (or a caller passing text) already provides decoded names.
        if isinstance(attr, bytes):
            attr = attr.decode('utf-8')
        if type_index >= len(type_list):
            raise NotEnoughTypesException
        arff_type = type_list[type_index]
        if arff_type.upper() == "NOMINAL":
            attributes.append((attr, []))
        else:
            attributes.append((attr, arff_type))
        type_index += 1
    return attributes
def _to_text(value):
    """Return ``value`` as text, decoding UTF-8 byte strings."""
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return value


def data_to_dict(data, type_list, relation_name, selected_attrs=None):
    """Convert CSV-like rows into the dictionary passed to ``arff.dumps``.

    ``data`` is an iterable of rows whose first row is the header.
    ``type_list`` holds one ARFF type name per selected column, in order.
    ``relation_name`` is stored under the ``'relation'`` key.
    ``selected_attrs`` optionally restricts the output to the named columns;
    ``None`` or an empty list keeps every column.

    Returns a dict-like object with the keys ``'relation'``, ``'attributes'``
    (a list of ``(name, type)`` tuples, where a NOMINAL column carries the
    list of its distinct values in first-seen order) and ``'data'`` (a list
    of rows; empty cells become ``None``).

    Raises AttrMissingException if a selected attribute is not in the header,
    NotEnoughTypesException if there are more selected columns than types,
    and NotEnoughAttributesException if a row yields more values than types.
    """
    # The default of None must behave like "no selection".
    selected_attrs = selected_attrs or []

    # this is the dictionary sent to the arff module
    arff_content = defaultdict(list)
    arff_content['relation'] = relation_name
    is_nominal_column = [t.upper() == 'NOMINAL' for t in type_list]

    # iter() lets any iterable of rows be used, not only objects with .next()
    rows = iter(data)
    header = next(rows)

    check_selected_attrs(selected_attrs, header)
    selected_columns = get_selected_columns_map(selected_attrs, header)
    attributes = init_attributes_list(header, selected_columns, type_list)
    assert len(attributes) <= len(type_list)

    # Values already recorded for each attribute, so that a nominal value is
    # declared once however many rows contain it.
    seen_nominal = [set() for _ in attributes]

    arff_data = []
    for line in rows:
        new_line = []
        index = 0  # position among the selected columns of this row
        for i, item in enumerate(line):
            # only use the selected columns
            if not selected_columns[i]:
                continue
            assert index < len(is_nominal_column)
            text = _to_text(item)
            if text != '':
                new_line.append(text)
                # collect the distinct values of nominal columns
                if is_nominal_column[index] and text not in seen_nominal[index]:
                    seen_nominal[index].add(text)
                    attributes[index][1].append(text)
            else:
                # missing value, must be None rather than ''
                new_line.append(None)
            index += 1
        if len(type_list) < len(new_line):
            raise NotEnoughAttributesException
        arff_data.append(new_line)

    arff_content['attributes'] = attributes
    arff_content['data'] = arff_data
    return arff_content
def csv_to_arff(fileinput, type_list, relation_name, selected_attrs):
    """Read the CSV file at ``fileinput`` and return the ``arff.dumps`` output.

    The file is parsed as comma-delimited CSV; ``type_list``,
    ``relation_name`` and ``selected_attrs`` are forwarded unchanged to
    data_to_dict, whose result is handed to ``arff.dumps``.
    """
    with open(fileinput, 'r') as csv_file:
        rows = csv.reader(csv_file, delimiter=',')
        content = data_to_dict(rows, type_list, relation_name, selected_attrs)
        return arff.dumps(content)
def main():
    """Command-line entry point: parse options, convert the CSV, print ARFF.

    Options: -r relation name, -t comma-separated ARFF types, -a optional
    comma-separated attribute names to keep, -f CSV input file. Invalid or
    missing options and conversion failures are reported via parser.error,
    which prints the message and exits.
    """
    usage = "Usage: %prog <options>'"
    parser = OptionParser(usage=usage)

    parser.add_option('-r', '--relation-name',
                      type='string',
                      dest="relation",
                      help="the relation name")

    parser.add_option('-t', '--types',
                      type='string',
                      dest="typespec",
                      action='callback',
                      help="arff types list. e.g: NOMINAL,REAL,INTEGER,INTEGER",
                      callback=types_callback)

    parser.add_option('-a', '--attributes',
                      type='string',
                      dest="attrs",
                      action='callback',
                      help="atttributes to be procesed eg: casa,auto,cama\\larga",
                      callback=types_callback)

    parser.add_option("-f", "--file", dest="fileinput",
                      help="csv input file", metavar="FILE")

    # get options from the command line (positional args are not used)
    options, _ = parser.parse_args()

    type_list = options.typespec
    fileinput = options.fileinput
    selected_attrs = options.attrs or []
    relation_name = options.relation

    # check relation name
    if not relation_name:
        parser.error(
            "Please specify a relation name e.g. -r test"
        )

    # The list of types and the input file are mandatory. type_list is None
    # when -t was not given, so test for falsiness rather than length.
    if not type_list:
        parser.error(
            "Please specify the list of arff types e.g. -t NOMINAL,STRING"
        )

    if not fileinput:
        parser.error("Please specify file input: -f example.csv")

    # check that the types are legal
    for arff_type in type_list:
        if arff_type.upper() not in ARFF_TYPES:
            parser.error("%s is not a legal type use %s" % (arff_type,
                                                            str(ARFF_TYPES)))

    try:
        output = csv_to_arff(fileinput, type_list, relation_name,
                             selected_attrs)
    except IOError:
        parser.error("the file %s does not exists" % fileinput)
    except NotEnoughTypesException:
        parser.error("there are more columns than types specified")
    except NotEnoughAttributesException:
        parser.error("there are more types than attributes specified")
    except AttrMissingException as e:
        parser.error("attribute '%s' not in csv header" % e)
    except arff.WrongTypeException as e:
        parser.error("Type Error, '%s' can't be a %s type.\n\nrow: %s"
                     % (e.args[0], e.args[1], e.args[2]))
    else:
        # Printing outside the try keeps an output error from being
        # reported as a missing input file.
        print(output)
# Script entry point: run the command-line interface only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()