/
Log.py
390 lines (297 loc) · 14.2 KB
/
Log.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
import Commit
import File
import time
import Programmer
import Output
import re
from time import mktime
from datetime import datetime
import textdistance
import sys
from pm4py.objects.log.importer.xes import factory as xes_import_factory
#Class that represents a parsed log.
#Contains a list of commits, a list of programmers and a list of files.
#README: the parser accepts either a log generated by TortoiseSVN (.txt) or a VCS log in the form of an event log (XES).
class Log:
    """In-memory representation of a parsed version-control log.

    The constructor reads ``logFile`` and fills three lists:
    ``listOfCommits``, ``listOfProgrammers`` and ``listOfFiles``.
    Two input formats are recognised by file extension: ``.xes`` (event
    log) and ``.txt`` (textual SVN log). Any other extension terminates
    the program through ``sys.exit``.
    """

    # Dutch weekday/month names and their English counterparts, applied in
    # this order. Months spelled identically in both languages are omitted.
    _DUTCH_TO_ENGLISH = (
        ("maandag", "monday"),
        ("dinsdag", "tuesday"),
        ("woensdag", "wednesday"),
        ("donderdag", "thursday"),
        ("vrijdag", "friday"),
        ("zaterdag", "saturday"),
        ("zondag", "sunday"),
        ("januari", "january"),
        ("februari", "february"),
        ("maart", "march"),
        ("mei", "may"),
        ("juni", "june"),
        ("juli", "july"),
        ("augustus", "august"),
        ("oktober", "october"),
    )

    # A candidate name containing "story" or "stor<digit>" marks the start of
    # the real commit message rather than a programmer name.
    _STORY_TAG = re.compile(r'stor(?:y|[0-9])', re.IGNORECASE)

    def __init__(self, logFile):
        """Parse ``logFile`` and compute the importance of every file in it.

        @param logFile path to a ``.xes`` or ``.txt`` log file
        """
        self.listOfCommits = []
        self.filename = logFile
        self.listOfProgrammers = []
        # Next free programmer id; also used as the node number.
        self.programmerIterator = 1
        # Next free file id for files created while parsing an SVN log.
        self.fileIterator = 1
        self.listOfFiles = []

        lowered = self.filename.lower()
        if lowered.endswith('.xes'):
            # input is an event log
            self.parseXESLog()
        elif lowered.endswith('.txt'):
            # input is an svn log
            self.parseSVNLog()
        else:
            sys.exit("Error: File format is not supported!")
        self.calculateFileImportance()

    def getNodeIterator(self):
        """Return the next free programmer/node number."""
        return self.programmerIterator

    def getlistOfProgrammers(self):
        """Return the list of all programmers found in the log."""
        return self.listOfProgrammers

    def getListOfCommits(self):
        """Return the list of all commits found in the log."""
        return self.listOfCommits

    def parseSVNLog(self):
        """Parse the textual SVN log and build the commits, programmers and files.

        Every revision is expected to consist of a revision line, an author
        line, a date line, one line that is skipped, the message lines
        terminated by a ``----`` line, and then one line per changed file
        (``<modifier>: <path>``) up to the next blank line or end of file.

        Raises ValueError when the file ends in the middle of a message;
        IndexError when a header or file line has no ':' separator.
        """
        with open(self.filename, 'r', encoding="utf8") as f:
            # Read line by line to keep memory usage low.
            for line in f:
                # Blank or whitespace-only lines separate revisions.
                if not line.strip():
                    continue

                # revision number
                rev_number = line.strip().split(":")[1].strip()

                # author (compared case-insensitively later on)
                author = f.readline().strip().split(":")[1].lower().strip()

                # date: split only once, the time itself contains ':'
                date = f.readline().strip().split(":", 1)[1].strip()
                # The date is in Dutch; comment this out if your log is in English.
                date = self.translateDateDutch(date)
                date = datetime.fromtimestamp(mktime(self.parseDate(date)))

                # Skip the line that precedes the message text
                # (presumably the "Message:" header - verify against the log).
                f.readline()
                message = self._readMessage(f, rev_number)

                # parse the programmers from the message
                programmers = self.getProgrammers(message)

                # files, each paired with its modifier status
                changes = self._readChangedFiles(f)
                files = [changedFile for changedFile, _status in changes]

                # end of revision
                commit = Commit.Commit(rev_number, author, date, programmers, message, files)
                self.listOfCommits.append(commit)
                # add this commit to each programmer's commit list
                for p in programmers:
                    p.addToCommitList(commit)
                # for each file in this commit: add the commit to the right list
                for changedFile, status in changes:
                    changedFile.addCommit(commit, status)

    def _readMessage(self, f, rev_number):
        """Read message lines from ``f`` up to the ``----`` separator, joined by spaces."""
        messageLines = []
        while True:
            raw = f.readline()
            if not raw:
                # readline() returns "" only at end of file; without this
                # check a truncated log would loop forever.
                raise ValueError(
                    "Unexpected end of file while reading the message of revision "
                    + rev_number)
            text = raw.strip()
            if text == "----":
                break
            messageLines.append(text)
        return " ".join(messageLines)

    def _readChangedFiles(self, f):
        """Read the changed-file lines of one revision; return (file, modifier) pairs."""
        changes = []
        line = f.readline()
        # Stop on a blank line or at end of file (readline() returns "").
        while line.strip():
            fields = line.strip().split(":")
            modifier = fields[0].strip()
            # Taking only the second field also cuts off everything after
            # "(Copy from path:"; the remaining " (Copy from path" is removed.
            path = fields[1].strip().replace(" (Copy from path", "")
            # reuse the file object if this path was seen before
            file = self.searchFile(path)
            if not isinstance(file, File.File):
                file = self.makeFile(path, self.fileIterator)
            changes.append((file, modifier))
            line = f.readline()
        return changes

    def translateDateDutch(self, date1):
        """Return ``date1`` with Dutch weekday and month names replaced by English ones."""
        for dutch, english in self._DUTCH_TO_ENGLISH:
            date1 = date1.replace(dutch, english)
        return date1

    def parseDate(self, date):
        """Parse a '<weekday> <day> <month> <year> <H:M:S>' string into a struct_time."""
        return time.strptime(date, "%A %d %B %Y %H:%M:%S")

    def getProgrammers(self, message):
        """Extract the programmers named at the start of a commit message.

        The message is split on ':'; every part except the last that holds a
        single word is treated as a programmer name. Scanning stops as soon
        as a part looks like message text (contains '.', is empty, or
        contains 'story' / 'stor<digit>').

        @param message the full commit message
        @returns list of Programmer objects, without duplicates
        """
        programmers = []
        # Remove http(s):// first, its ':' would confuse the splitting below.
        message = message.replace("http://", "").replace("https://", "")
        # Split on ':'; the last part is always the actual message.
        for part in message.split(":")[:-1]:
            progName = part.strip()
            if " " in progName:
                # more than one word: not a name, look at the next part
                continue
            if '.' in progName or progName == "":
                # We already reached the message, it merely contains ':' or '::'.
                return programmers
            if self._STORY_TAG.search(progName):
                # 'STORY' / 'STOR<n>' tags belong to the message as well.
                return programmers
            prog = self.getProgrammerObject(progName)
            if prog not in programmers:
                programmers.append(prog)
        return programmers

    def getProgrammerObject(self, progName):
        """Return the programmer called ``progName``, creating it when unknown."""
        prog = self.searchProgrammer(progName)
        # no existing programmer found with this name, create one
        if not isinstance(prog, Programmer.Programmer):
            prog = Programmer.Programmer(progName, self.programmerIterator)
            # this iterator will also be the node number
            self.programmerIterator += 1
            # add it to the general overview list
            self.listOfProgrammers.append(prog)
        return prog

    def searchProgrammer(self, name):
        """Look up an existing programmer by name (case-insensitive).

        Exact alias matches over all programmers are tried first. Only when
        none exists is the typo check applied; on a typo match the lowered
        name is registered as a new alias of that programmer.

        @param name name of the programmer
        @returns that programmer object, or "" when nobody matches
        """
        wanted = name.lower()

        # Pass 1: exact match, so a fuzzy hit on an earlier programmer can
        # never shadow an exact hit on a later one.
        for elem in self.listOfProgrammers:
            for programmerAlias in elem.getNames():
                if programmerAlias.lower() == wanted:
                    return elem

        # Pass 2: take typing errors into account using string comparison.
        for elem in self.listOfProgrammers:
            for programmerAlias in elem.getNames():
                if self.checkIfTypo(programmerAlias.lower(), wanted):
                    # name is basically the same, only with a typo:
                    # add it as an alias for this programmer
                    elem.addAlias(wanted)
                    return elem
        return ""

    def checkIfTypo(self, name1, name2, threshold=0.92):
        """Tell whether two names are likely the same apart from a typo.

        @param name1 first name of the comparison
        @param name2 second name of the comparison
        @param threshold the names match when their Jaro-Winkler score exceeds this
        @returns True if both strings have a high probability of being the same
        """
        return textdistance.jaro_winkler(name1, name2) > threshold

    def searchFile(self, filePath):
        """Return the known file with path ``filePath``, or "" when there is none."""
        for elem in self.listOfFiles:
            if elem.getPath() == filePath:
                return elem
        return ""

    def calculateFileImportance(self):
        """Trigger every file to calculate its importance ratio.

        Does nothing when the log contains no files, so an empty log does
        not fail while determining the commit date range.
        """
        if not self.listOfFiles:
            return
        firstAndLastTimeStamp = self.getFirstAndLastCommitDate()
        for file in self.listOfFiles:
            file.calculateImportanceRatio(firstAndLastTimeStamp)

    def getFirstAndLastCommitDate(self):
        """Return a tuple (first, last) with the earliest and latest commit date.

        Raises IndexError when the log holds no commits.
        """
        if not self.listOfCommits:
            raise IndexError("the log contains no commits, so there is no date range")
        listOfTimestamps = sorted(commit.getDate() for commit in self.listOfCommits)
        return (listOfTimestamps[0], listOfTimestamps[-1])

    def parseXESLog(self):
        """Read the XES event log into the underlying data structure.

        The XES file follows the following format:
          case level:  concept:name is the case ID, used here as the file id
          event level: concept:name (activity), time:timestamp,
                       event = "commit", org:resource,
                       filePath (the file path), revision (the commit number)
        The code additionally reads a "modifierStatus" attribute per event.

        @prerequisite the log is clean: no wrong programmer names, no wrong file paths
        """
        log = xes_import_factory.apply(self.filename)
        # Every case describes one file; every event is a commit on that file.
        for case in log:
            if len(case) == 0:
                # a trace without events carries no file path, nothing to record
                continue
            fileID = case.attributes["concept:name"]
            # All events of a trace concern the same file, so the path of
            # the first event is the path of the file.
            path = case[0]["filePath"]
            file = self.makeFile(path, fileID)
            for event in case:
                self.handleEventLogLine(event, file)

    def handleEventLogLine(self, event, file):
        """Register one event (a commit touching ``file``) in the data structure.

        @param event mapping with the keys "org:resource", "time:timestamp",
                     "revision" and "modifierStatus"
        @param file the file object this event belongs to
        """
        # gather all information
        programmer = event["org:resource"]
        timestamp = event["time:timestamp"]
        commitNumber = event["revision"]
        modifierStatus = event["modifierStatus"]

        # find or create the programmer
        prog = self.getProgrammerObject(programmer)

        # find or create the commit
        commit = self.getCommitByNumber(commitNumber)
        if not isinstance(commit, Commit.Commit):
            # commit does not exist yet, make one
            commit = Commit.Commit(commitNumber, "", timestamp, [prog], "", [file])
            self.listOfCommits.append(commit)
        else:
            # add the programmer and the file to the existing commit
            commit.addContributor(prog)
            commit.addFile(file)

        # add this commit to the programmer's list of commits
        prog.addToCommitList(commit)
        # add the commit to the right modifier list for the file
        file.addCommit(commit, modifierStatus)

    def getCommitByNumber(self, commitNumber):
        """Return the first commit with this revision number, or "" when there is none.

        @param commitNumber the revision number of the commit
        """
        return next(
            (commit for commit in self.listOfCommits
             if commit.getRevisionNumber() == commitNumber),
            "")

    def makeFile(self, path, fileID):
        """Create a file object, register it in the overview list and return it."""
        file = File.File(path, fileID)
        self.fileIterator += 1
        # add it to the general overview list
        self.listOfFiles.append(file)
        return file