-
Notifications
You must be signed in to change notification settings - Fork 3
/
csv2json.py
153 lines (138 loc) · 3.66 KB
/
csv2json.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# Convert a CSV file (or one worksheet of an Excel workbook) into a JSON object
# that includes a value distribution for every variable (column)
import classifiedunicodevalue
from classifiedunicodevalue import ClassifiedUnicodeValue
from datautil import compressedValueSequence, compressedValues
import unicodecsv
from version import savutilName, savutilVersion
def blankNone (x):
    """Return x converted to a unicode string; None becomes the empty string."""
    # Only None is blanked; every other value goes through unicode ().
    return u"" if x is None else unicode (x)
def objectify (x):
    """Convert a text value to the most specific object it represents.

    Returns None for the empty string, an int when int () accepts x,
    otherwise a float when float () accepts x, otherwise x itself unchanged.
    """
    if x == "":
        return None
    try:
        return int (x)
    except (TypeError, ValueError, OverflowError):
        # Not an integer; fall through and try a float next.
        pass
    try:
        # The previous version returned an undefined name here; the resulting
        # NameError was swallowed by a bare except, so numeric text such as
        # "3.5" always came back as the original string.
        return float (x)
    except (TypeError, ValueError, OverflowError):
        # Neither numeric form applies: hand the value back untouched.
        return x
if __name__ == "__main__":
    # Command-line entry point: read a CSV file (or one worksheet of an Excel
    # workbook when -w is given), build a value distribution for every column
    # and write the result to a JSON file.
    import getopt
    import json
    import os
    import sys
    import xlrd

    # Options handled below:
    #   -d delimiter      -e input encoding     -h header line number
    #   -o output folder  -s lines to skip      -w worksheet name
    # "-a" is accepted by the option string but nothing below acts on it.
    optlist, args = getopt.getopt(sys.argv[1:], 'ad:h:s:e:o:w:')
    delimiter = ","
    headerIndex = None
    skipLines = None
    encoding = "cp1252"
    outputPath = ""
    worksheetName = None
    for (option, value) in optlist:
        if option == "-d":
            delimiter = value
        elif option == "-e":
            encoding = value
        elif option == "-h":
            headerIndex = int (value)
        elif option == "-o":
            outputPath = value
        elif option == "-s":
            skipLines = int (value)
        elif option == "-w":
            worksheetName = value

    # With no -s, the data starts straight after the header line, and the
    # header line itself defaults to line 1.
    if skipLines is None:
        if headerIndex is None:
            headerIndex = 1
        skipLines = headerIndex

    if len (args) < 1 or\
       (headerIndex is not None and headerIndex > skipLines):
        print ("--Usage: [-d,] [-ecp1252] [-h1] [-s1] <inputFile> [<outputFile>]")
        sys.exit (0)

    # The header row supplies the variable names.  Previously "-s N" without
    # "-h" (or "-h0") left the header list unbound and the script died later
    # with a NameError; report the problem up front instead.  The non-zero
    # exit status matches what the uncaught exception used to produce.
    if not headerIndex:
        print ("--No header line selected: use -h to give its line number")
        sys.exit (1)

    # Default the input extension from the input kind when none was given.
    (root, csvExt) = os.path.splitext (args [0])
    if not csvExt:
        if worksheetName:
            csvExt = ".xlsx"
        else:
            csvExt = ".csv"
    inputFilename = root + csvExt
    if len (args) > 1:
        outputFilename = args [1]
    else:
        # NOTE(review): root keeps any directory part of the input path, so
        # the -o folder is combined with that path, not just the file name.
        outputFilename = os.path.join (outputPath, root + ".json")

    print ("..Using line %d for headers" % headerIndex)
    if not (skipLines == 1 and headerIndex == 1):
        # Rows are sliced with csvRows [skipLines:], so the first data row is
        # line skipLines + 1 (the message used to report skipLines itself).
        print ("..Taking data from line %d onwards" % (skipLines + 1))

    if worksheetName:
        print ("..Looking for worksheet '%s' in workbook %s" %
            (worksheetName, inputFilename))
        wb = xlrd.open_workbook (inputFilename)
        ws = wb.sheet_by_name (worksheetName)
        print ("%s %s" % (ws.ncols, ws.nrows))
        csvRows = [
            [ws.cell_value (rowx, colx) for colx in xrange (ws.ncols)]
            for rowx in xrange (ws.nrows)
        ]
    else:
        # Binary mode is what the Python 2 csv machinery expects; the with
        # block closes the file even if decoding a row fails.
        with open (inputFilename, "rb") as csvFile:
            csvReader = unicodecsv.UnicodeReader (csvFile,
                encoding=encoding, delimiter=delimiter)
            csvRows = list (csvReader)

    if skipLines > len (csvRows):
        print ("--Only %d row(s) found in CSV file, %d required for header" %
            (len (csvRows), skipLines))
        sys.exit (0)

    # Line numbers on the command line are 1-based; the row list is 0-based.
    headers = csvRows [headerIndex-1]
    csvRows = csvRows [skipLines:]
    print ("..%d row(s) found in input" % len (csvRows))

    jsonObject = {
        "origin": "csv2json %s from %s" %
            (savutilVersion, inputFilename),
        "code_lists": {},
        "variable_sequence": headers,
        "total_count": len (csvRows),
        "variables": {},
        "data": {}
    }
    variables = jsonObject ["variables"]
    data = jsonObject ["data"]

    # One pass per column: classify the cell values, count how often each
    # value occurs, and record both the distribution and the column data.
    for index, variableName in enumerate (headers):
        values = [ClassifiedUnicodeValue (row [index]).value for row in csvRows]
        distribution = {}
        for value in values:
            distribution [value] = distribution.get (value, 0) + 1
        cd = classifiedunicodevalue.ClassifiedDistribution (distribution)
        # Map the distribution's data type onto the type name written to the
        # JSON output; anything unrecognised is recorded as "null".
        if cd.dataType == "integer":
            jsonType = "integer"
        elif cd.dataType == "decimal":
            jsonType = "decimal"
        elif cd.dataType == "text":
            jsonType = "string"
        else:
            jsonType = "null"
        variables [variableName] = {
            "sequence": index + 1,
            "name": variableName,
            "json_type": jsonType,
            "distribution": cd.toObject (includeTotal=False)
        }
        data [variableName] = compressedValues (values, jsonType)

    # The with block guarantees the output file is flushed and closed even if
    # serialisation raises part-way through.
    with open (outputFilename, 'wb') as jsonFile:
        json.dump (jsonObject, jsonFile,
            sort_keys=True,
            indent=4,
            separators=(',', ': ')
        )