def run(filename):
    """Record the annotated sources of a test document and their duplicates.

    Parses the XML annotation stored next to *filename* (same path with the
    last four characters replaced by ``.xml``), collects the referenced
    source documents, looks up their duplicates in the duplicate table that
    matches the corpus, and merges the outcome into
    ``output/annotation.csv`` under the test document's base name.

    Args:
        filename: Path of the test document. The last four characters are
            assumed to be the extension (e.g. ``.txt``).

    Returns:
        None. The result is written to ``output/annotation.csv`` and the
        intermediate values are printed.
    """
    import os  # local import: only needed to guarantee the output folder

    stem = filename[:-4]
    xmldoc = minidom.parse(stem + '.xml')

    # The two corpora use different tag / attribute names for the same
    # information.
    if "simulatedFiles" in filename:
        tag, attribute = 'plagiarized', 'sourceid'
    else:
        tag, attribute = 'features', 'source_reference'

    srcs = []
    for node in xmldoc.getElementsByTagName(tag):
        reference = node.attributes[attribute].value
        print(reference)
        name = str(reference) + '.txt'
        if name not in srcs:
            srcs.append(name)

    if 'testdoc2' in filename:
        dupesDict = GlobalFunctions.openFiles('duplicate/FirstDuplicates.csv')
    else:
        dupesDict = GlobalFunctions.openFiles('duplicate/NumericDuplicate.csv')

    dupes = []

    # Reverse lookup: a table key whose duplicate list mentions one of the
    # sources is itself a duplicate. `any` stops at the first match, so a key
    # is recorded once even when several sources appear in its list.
    for dupe in dupesDict:
        listed = dupesDict.get(dupe)
        if dupe not in srcs and any(src in listed for src in srcs):
            dupes.append(dupe)

    # Forward lookup: everything listed under a source is a duplicate.
    # NOTE(review): assumes the table values are lists of document names.
    for src in srcs:
        listed = dupesDict.get(src)
        print(listed)
        if listed is None:
            print("not in dictionary")
            continue
        for dupe in listed:
            if dupe not in dupes:
                dupes.append(dupe)

    print(dupes)
    tupleS = [srcs, dupes]
    print(tupleS)

    key = ntpath.basename(stem)
    try:
        dicc = GlobalFunctions.openResult('output/annotation.csv')
        dicc[key] = tupleS
    except Exception:
        # Best effort: start a fresh table when the previous results cannot
        # be loaded (e.g. first run, file missing or unreadable).
        dicc = {key: tupleS}

    os.makedirs('output', exist_ok=True)
    with open('output/annotation.csv', 'w', encoding='utf-8', newline='') as csvfile:
        fieldname = ['TestDocs', 'Source', 'Duplicates']
        writer = csv.DictWriter(csvfile, fieldnames=fieldname)
        writer.writeheader()
        for doc in dicc:
            entry = dicc.get(doc)
            row = {'TestDocs': doc, 'Source': entry[0]}
            try:
                row['Duplicates'] = entry[1]
            except (IndexError, KeyError):
                # Entry carries no duplicate information; leave the column
                # empty.
                pass
            writer.writerow(row)
def _load_duplicate_table(filename, primary=True):
    """Load the primary or the secondary duplicate table for *filename*.

    'testdoc2' documents use FirstDuplicates.csv as primary table and
    NumericDuplicate.csv as secondary one; every other document uses them
    the other way round.
    """
    if ('testdoc2' in filename) == primary:
        return GlobalFunctions.openFiles('duplicate/FirstDuplicates.csv')
    return GlobalFunctions.openFiles('duplicate/NumericDuplicate.csv')


def _strip_duplicates(ranked, table, flagged):
    """Remove from *ranked* the documents *table* lists as duplicates of a kept one.

    Both *ranked* and *flagged* are modified in place. Documents that were
    themselves flagged as duplicates do not contribute their own list.
    """
    for doc in list(ranked):
        for twin in table.get(doc) or ():
            if doc in flagged:
                break
            if twin in flagged:
                continue
            flagged.append(twin)
            # A listed duplicate is not necessarily part of the ranking;
            # check first so one missing entry does not abort the rest of
            # the list.
            if twin in ranked:
                ranked.remove(twin)


def run(filename, root, show = True, filter = True, filamt = 5):
    """Retrieve source candidates for a test document, display and save them.

    Queries are extracted from *filename*, each query is resolved with
    ``getSource``, the hits are merged and ordered by descending score, and
    only the best-scoring hit per document is kept. Documents listed as
    duplicates of a better-ranked document are removed, the ranking is
    optionally cut to *filamt* entries, and the duplicates of the remaining
    candidates are collected. The outcome is merged into a CSV file under
    ``output/``.

    Args:
        filename: Path of the test document; its last four characters are
            assumed to be the extension.
        root: Tk parent used for the windows when *show* is true.
        show: Display queries, query results and final candidates in Tk
            windows.
        filter: Keep only the first *filamt* candidates.
        filamt: Number of candidates kept when *filter* is true.

    Returns:
        Tuple ``(candidates, duplicates)`` of two lists of document names.
    """
    postings = GlobalFunctions.openFiles('PLs/CompiledPLs/postingList.csv')
    df = GlobalFunctions.openFiles('PLs/dictionary/documentFreq.csv')
    testTF, queries = TestDocProcessing.queryExtract(filename)
    doc_label = ntpath.basename(filename)

    if show:
        win = tkinter.Toplevel(root)
        win.minsize(200, 200)
        win.title("Search Query for " + doc_label)

    # One grid column per query: header in row 0, query terms below.
    querySources = []
    for column, que in enumerate(queries):
        querySources.append(getSource(testTF, que, postings, df))
        if not show:
            continue
        tkinter.Label(win, text="Search Query " + str(column + 1)).grid(row=0, column=column, padx=3)
        try:
            for row, q in enumerate(que, start=1):
                tkinter.Label(win, text=q).grid(row=row, column=column, padx=3)
        except Exception:
            # Display is best effort; a GUI failure must not stop retrieval.
            print("unable to call new window")

    if show:
        qwin = tkinter.Toplevel(win)
        qwin.minsize(200, 200)
        qwin.title("Query Result for " + doc_label)
        # Two grid columns per query: document name and score percentage.
        for index, src in enumerate(querySources):
            column = 2 * index
            tkinter.Label(qwin, text="Query Result " + str(index + 1)).grid(row=0, column=column, padx=3)
            tkinter.Label(qwin, text="Percentage " + str(index + 1)).grid(row=0, column=column + 1, padx=3)
            try:
                for row, s in enumerate(src, start=1):
                    tkinter.Label(qwin, text=s[0]).grid(row=row, column=column, padx=3)
                    tkinter.Label(qwin, text=round(s[1] * 100, 2)).grid(row=row, column=column + 1, padx=3)
            except Exception:
                print("no val")

    # Merge all hits, best score first. sort() followed by reverse() is kept
    # on purpose: sort(reverse=True) would order equal scores differently.
    result = [hit for hits in querySources for hit in hits]
    result.sort(key=operator.itemgetter(1))
    result.reverse()

    # Keep only the best-scoring hit of every document.
    seen = set()
    best_hits = []
    for hit in result:
        if hit[0] not in seen:
            seen.add(hit[0])
            best_hits.append(hit)
    result = best_hits
    print(result)

    # From here on only the document names matter.
    result = [hit[0] for hit in result]

    primary = _load_duplicate_table(filename, primary=True)
    secondary = _load_duplicate_table(filename, primary=False)
    duplicate = []
    _strip_duplicates(result, primary, duplicate)
    _strip_duplicates(result, secondary, duplicate)
    print(result)
    print(duplicate)

    if filter:
        result = result[:filamt]

    # Report the duplicates (primary table) of the candidates actually kept.
    duplicate = []
    for doc in result:
        duplicate.extend(primary.get(doc) or [])

    if show:
        fres = tkinter.Toplevel(qwin)
        fres.minsize(200, 200)
        fres.title("Source Candidate for " + doc_label)
        tkinter.Label(fres, text="Query Result").grid(row=0, column=0, padx=3)
        try:
            for row, s in enumerate(result, start=1):
                tkinter.Label(fres, text=s).grid(row=row, column=0, padx=3)
            if duplicate:
                tkinter.Label(fres, text="Duplicate").grid(row=0, column=1, padx=3)
            for row, d in enumerate(duplicate, start=1):
                tkinter.Label(fres, text=d).grid(row=row, column=1, padx=3)
        except Exception:
            print("no val")

    os.makedirs('output/', exist_ok=True)

    copyResult = list(result)
    copyDupe = list(duplicate)
    tupleS = [copyResult, copyDupe]
    print(tupleS)
    print(result)
    print(duplicate)

    # Read and write the SAME file, so results of one configuration never
    # leak into the file of another one.
    if filter:
        saveResult = 'output/resultsFilter' + str(filamt) + '.csv'
    else:
        saveResult = 'output/resultsNoFil.csv'

    key = ntpath.basename(filename[:-4])
    try:
        saved = GlobalFunctions.openResult(saveResult)
        saved[key] = tupleS
    except Exception:
        # Best effort: start a fresh table when earlier results cannot be
        # loaded (e.g. first run for this configuration).
        saved = {key: tupleS}

    with open(saveResult, 'w', encoding='utf-8', newline='') as csvfile:
        fieldname = ['TestDocs', 'Source_Candidate', 'Duplicates']
        writer = csv.DictWriter(csvfile, fieldnames=fieldname)
        writer.writeheader()
        for doc in saved:
            entry = saved.get(doc)
            row = {'TestDocs': doc, 'Source_Candidate': entry[0]}
            try:
                row['Duplicates'] = entry[1]
            except (IndexError, KeyError):
                # Entry carries no duplicate information; leave the column
                # empty.
                pass
            writer.writerow(row)

    return copyResult, copyDupe