/
compare.py
326 lines (313 loc) · 13.8 KB
/
compare.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
import tkinter.filedialog
import xlrd
from xlwt import Workbook
import sys
import json
import re
import cProfile
import time
import collections
#all sheets aggregation of excel files
allSheets={}
#读取文件名
def getexcelfiledict(readFile):
excelFileDict={}
for iter in readFile:
excelFileDict[iter]=''
return excelFileDict
#读取每个文件中的表名
def getsheetsdict(excelFileDict,setting):
allFilesDict={}
#pdb.set_trace()
#excelfiledict从列表改为字典
if setting:
for key in excelFileDict.keys():
wb=xlrd.open_workbook(key)
oneSheet={}
for setting_key,setting_value in setting.items():
if setting_value!='':
zone=convertstrtonumber(key,setting_value)
for sheet in wb.sheets():
if sheet.name==setting_key:
allCellsinOneSheet=readcelltodict(key,sheet,zone)
oneSheet[sheet.name]=allCellsinOneSheet
allCellsinOneSheet_keys=allCellsinOneSheet.keys()
allSheets[sheet.name]=dict.fromkeys(allCellsinOneSheet_keys,'')
break
else:
for sheet in wb.sheets():
if sheet.name==setting_key:
allCellsinOneSheet=readcelltodict(key,sheet)
allCellsinOneSheet_keys=allCellsinOneSheet.keys()
try:
allSheets_keys=allSheets[sheet.name].keys()
except KeyError:
allSheets[sheet.name]=dict.fromkeys(allCellsinOneSheet_keys,'')
else:
allCellinSheet_keys=list(set(allCellsinOneSheet_keys).union(set(allSheets_keys)))
allSheets[sheet.name]=dict.fromkeys(allCellinSheet_keys)
for allSheet_value_key in allSheets[sheet.name].keys():
if allSheet_value_key in allCellsinOneSheet_keys:
pass
else:
allCellsinOneSheet[allSheet_value_key]=''
for allFilesDict_key,allFilesDict_value in allFilesDict.items():
if sheet.name in allFilesDict_value.keys():
for allSheet_value_key in allSheets[sheet.name].keys():
if allSheet_value_key in allFilesDict_value[sheet.name].keys():
pass
else:
allFilesDict_value[sheet.name][allSheet_value_key]=''
else:
pass
oneSheet[sheet.name]=allCellsinOneSheet
break
allFilesDict[key]=oneSheet
else:
for key in excelFileDict.keys():
wb=xlrd.open_workbook(key)
oneSheet={}
for sheet in wb.sheets():
allCellsinOneSheet=readcelltodict(key,sheet)
allCellsinOneSheet_keys=allCellsinOneSheet.keys()
try:
allSheets_keys=allSheets[sheet.name].keys()
except KeyError:
allSheets[sheet.name]=dict.fromkeys(allCellsinOneSheet_keys)
else:
allCellinSheet_keys=list(set(allCellsinOneSheet_keys).union(set(allSheets_keys)))
allSheets[sheet.name]=dict.fromkeys(allCellinSheet_keys)
oneSheet[sheet.name]=allCellsinOneSheet
allFilesDict[key]=oneSheet
for allSheet_key,allSheet_value in allSheets.items():
for allFilesDict_key,allFilesDict_value in allFilesDict.items():
if allSheet_key in allFilesDict_value.keys():
for allSheet_value_key in allSheet_value.keys():
if allSheet_value_key in allFilesDict_value[allSheet_key].keys():
pass
else:
allFilesDict_value[allSheet_key][allSheet_value_key]=''
else:
pass
return allFilesDict
def readcelltodict(key,sheet,zone=None):
allCellsinOneSheet={}
if zone:
rowstart=zone['startRow']
rowend=zone['endRow']+1
colstart=zone['startColumn']
colend=zone['endColumn']+1
for row in range(rowstart,rowend):
for col in range(colstart,colend):
try:
cellvalue=sheet.cell(row,col).value
allCellsinOneSheet[xlrd.cellname(row,col)]=cellvalue
except IndexError:
allCellsinOneSheet[xlrd.cellname(row,col)]=''
else:
for row in range(sheet.nrows):
for col in range(sheet.ncols):
allCellsinOneSheet[xlrd.cellname(row,col)]=sheet.cell(row,col).value
return allCellsinOneSheet
def convertstrtonumber(key,setting_value):
xlsmaxrow=65536
xlsmaxcolumn=256
xlsxmaxrow=16384
xlsxmaxcolumn=1048576
if key[-1]=='s' or key[-1]=='S':
maxrow=xlsmaxrow
maxcolumn=xlsmaxcolumn
else:
maxrow=xlsxmaxrow
maxcolumn=xlsxmaxcolumn
cellRange=setting_value.split(':')
zone={}
for cellAdrress in cellRange:
#pdb.set_trace()
colAlphabet=re.match("^\w[a-z,A-Z]*",cellAdrress).group()
colAlphabet=colAlphabet.upper()
colNumber=convertalphabettonumber(colAlphabet)-1
rowNumber=re.search("\d[0-9]*",cellAdrress).group()
rowNumber=int(rowNumber)-1
if rowNumber>maxrow:
rowNumber=maxrow
if colNumber>maxcolumn:
colNumber=maxcolumn
if zone:
zone['endRow']=rowNumber
zone['endColumn']=colNumber
else:
zone['startRow']=rowNumber
zone['startColumn']=colNumber
return zone
alphabet={'A':1,'B':2,'C':3,'D':4,'E':5,'F':6,'G':7,'H':8,'I':9,'J':10,'K':11,'L':12,'M':13,'N':14,'O':15,'P':16,'Q':17,'R':18,'S':19,'T':20,'U':21,'V':22,'W':23,'X':24,'Y':25,'Z':26}
def convertalphabettonumber(colAlphabet):
length=len(colAlphabet)
colNumber=0
for i in range(0,length):
colNumber=colNumber+alphabet[colAlphabet[i]]*(26**(length-i-1))
return colNumber
def readfilesetting(choice='n'):
setting={}
if choice=='y':
try:
settingFile=open('setting.json','r')
#pdb.set_trace()
setting=json.load(settingFile)
except IOError:
print('open file error')
except FileNotFoundError:
print('can not find setting.json')
readFile=tkinter.filedialog.askopenfilenames()
#readFile=["1.xlsx","2.xlsx","3.xlsx"]
excelFileDict=getexcelfiledict(readFile)
allFileDict=getsheetsdict(excelFileDict,setting)
outputDict=compare(allFileDict)
output(outputDict)
#def convertlisttolist(value,SEPARATE):
# temp=[]
# for element in value:
# if isinstance(element,str)==False:
# tempdict={}
# tempdict[element[0]]=element[1]
# temp.append(tempdict)
# else:
# temp.append(element)
# return temp
def output(outputDict):
result=[]
line={}
SEPARATE=','
headLineStr='Sheet'+SEPARATE+'Cell'
for outputDict_key,outputDict_value in outputDict.items():
headLineStr=headLineStr+ SEPARATE + outputDict_key
if line:
pass
else:
line=dict.fromkeys(outputDict_value.keys(),'')
for outputDict_value_key,outputDict_value_value in outputDict_value.items():
if line[outputDict_value_key]!='':
for outputDict_value_value_key,outputDict_value_value_value in outputDict_value_value.items():
if outputDict_value_value_key in line[outputDict_value_key]:
temp=[]
temp= line[outputDict_value_key][outputDict_value_value_key]
temp.extend(outputDict_value_value_value)
line[outputDict_value_key][outputDict_value_value_key]=temp
else:
line[outputDict_value_key][outputDict_value_value_key]=outputDict_value_value_value
else:
line[outputDict_value_key]=outputDict_value_value
book=Workbook()
sheet1=book.add_sheet('Sheet1')
headLineStrlist=headLineStr.split(',')
row=0
col=0
for iter in headLineStrlist:
sheet1.write(row,col,iter)
col=col+1
row=row+1
for line_key,line_value in line.items():
col=0
sheet1.write(row,col,line_key)
row=row+1
colsheet=col+1
for line_value_key,line_value_value in line_value.items():
if line_value_key=='NoneorEmpty':
col=colsheet+1
else:
col=colsheet
sheet1.write(row,col,line_value_key)
col=col+1
for iter in line_value_value:
sheet1.write(row,col,iter)
col=col+1
row=row+1
book.save('result.xls')
print("output to result.xls")
def compare(allFilesDict):
outputDict=dict.fromkeys(allFilesDict.keys())
for ouputDict_key,ouputDict_value in outputDict.items():
outputDict[ouputDict_key]=dict.fromkeys(allSheets.keys(),{})
for allSheets_key,allSheets_value in allSheets.items(): #allSheets_key表名,allSheets_value单元格字典
temp={}
sameCellinEachFiledDict={}
for allFilesDict_key,allFilesDict_value in allFilesDict.items(): #allFilesDict_key文件名,allFilesDict_value表字典
if allSheets_key in allFilesDict_value: #如果表在此文件的表字典中
if allSheets_value.keys():
for allSheets_value_key in allSheets_value.keys():
cell=[]
#################
cell.append(allFilesDict_key)
cell.append(allFilesDict_value[allSheets_key][allSheets_value_key])
if allSheets_value_key in sameCellinEachFiledDict:
sameCellinEachFiledDict[allSheets_value_key].append(cell)
else:
allFileCell=[]
allFileCell.append(cell)
sameCellinEachFiledDict[allSheets_value_key]=allFileCell
#################
else:
temp={}
temp['NoneorEmpty']=['Empty']
outputDict[allFilesDict_key][allSheets_key]=temp
else:
temp={}
temp['NoneorEmpty']=['None']
outputDict[allFilesDict_key][allSheets_key]=temp
for sameCellinEachFiledDict_key,sameCellinEachFiledDict_value in sameCellinEachFiledDict.items():
for i in range(0,len(sameCellinEachFiledDict_value)-1):
for j in range(i+1,len(sameCellinEachFiledDict_value)):
if sameCellinEachFiledDict_value[i][1]!=sameCellinEachFiledDict_value[j][1]:
for iter in sameCellinEachFiledDict_value:
temp=collections.OrderedDict()
templist=[]
templist.append(iter[1])
temp[sameCellinEachFiledDict_key]=templist
tempoutput=outputDict[iter[0]][allSheets_key]
if tempoutput!={}:
templist=[]
templist.append(iter[1])
temp[sameCellinEachFiledDict_key]=templist
tempoutput[sameCellinEachFiledDict_key]=templist
tempoutput=collections.OrderedDict(sorted(tempoutput.items(),key=lambda t:t[0]))
outputDict[iter[0]][allSheets_key]=tempoutput
else:
outputDict[iter[0]][allSheets_key]=temp
break
else:
if j==len(sameCellinEachFiledDict_value)-1:
break
break
#outputDict=sorted(outputDict.items(),key=lambda d:d[0],reverse=False)
return outputDict
#主函数
def main():
while True:
#choice='y'
#readfilesetting(choice)
try:
choice=input('''Compare excel files with setting.json or not(y or n):
e to exit, h to help\r\n''')
if choice!='y' and choice!='Y' and choice!='n' and choice!='N' and choice!='e' and choice!='E' and choice!='h' and choice!='H':
raise ValueError
else:
if choice=='y'or choice=='Y' or choice=='n' or choice=='N':
readfilesetting(choice)
#finish=time.time()
#print(finish-start)
elif choice=='h' or choice=='H':
print('''Introduction
This program used to compare excel files.Out put the difference between mutli files.
===================================================================================
Paramaters
y you can modify setting.json file,to specify sheets and cells you want to compare.
So Before you input y, you must modify setting.json first.
n you just select excel files,the program will compare each cell of each sheet of each file.
e quit program.
h help.
''')
elif choice=='e' or choice=='E':
sys.exit()
except ValueError:
print('Please enter y or n or e')
main()