Beispiel #1
0
def run(filename):
    """Record the annotated sources of a test document in ``output/annotation.csv``.

    The annotation XML that sits next to *filename* (same path with the last
    four characters replaced by ``.xml``) is parsed and every referenced source
    id is collected as ``<id>.txt``.  The duplicate table is then used to list
    documents that are duplicates of those sources, and the pair
    ``[sources, duplicates]`` is merged into the annotation CSV under the
    document's base name.

    Args:
        filename: Path of the test document.  The code assumes a four
            character suffix such as ``.txt``.  A path containing
            ``simulatedFiles`` is read with the ``plagiarized``/``sourceid``
            schema, anything else with ``features``/``source_reference``.
            A path containing ``testdoc2`` selects
            ``duplicate/FirstDuplicates.csv`` instead of
            ``duplicate/NumericDuplicate.csv``.

    Returns:
        None.  The result is written to ``output/annotation.csv``.
    """
    # Function-scope import: the original block did not rely on ``os``.
    import os

    xmldoc = minidom.parse(filename[:-4] + '.xml')

    # The two corpora use different element/attribute names for the same data.
    if "simulatedFiles" in filename:
        tag_name, attr_name = 'plagiarized', 'sourceid'
    else:
        tag_name, attr_name = 'features', 'source_reference'

    srcs = []
    for node in xmldoc.getElementsByTagName(tag_name):
        source_id = node.attributes[attr_name].value
        print(source_id)
        source_file = str(source_id) + '.txt'
        if source_file not in srcs:
            srcs.append(source_file)

    if 'testdoc2' in filename:
        dupesDict = GlobalFunctions.openFiles('duplicate/FirstDuplicates.csv')
    else:
        dupesDict = GlobalFunctions.openFiles('duplicate/NumericDuplicate.csv')

    # Reverse lookup: a table key whose entry mentions one of our sources is a
    # duplicate too.  Each key is added once, even when several sources match
    # (the original appended it once per matching source).
    dupes = []
    for dupe in dupesDict:
        if dupe in srcs:
            continue
        members = dupesDict.get(dupe) or ()
        if any(src in members for src in srcs):
            dupes.append(dupe)

    # Forward lookup: everything listed under a source itself.
    for src in srcs:
        entry = dupesDict.get(src)
        print(entry)
        try:
            listed = list(entry)
        except TypeError:
            # Missing key (None) or a value that cannot be iterated.
            print("not in dictionary")
            continue
        for dupe in listed:
            if dupe not in dupes:
                dupes.append(dupe)
    print(dupes)

    tupleS = [srcs, dupes]
    print(tupleS)

    doc_key = ntpath.basename(filename[:-4])
    try:
        dicc = GlobalFunctions.openResult('output/annotation.csv')
        dicc[doc_key] = tupleS
    except Exception:
        # Best effort: openResult is a project helper whose failure modes are
        # not visible here (typically no previous file) - start a new table.
        dicc = {doc_key: tupleS}

    # Without this the write below fails on a fresh checkout.
    os.makedirs('output/', exist_ok=True)

    with open('output/annotation.csv', 'w', encoding='utf-8', newline='') as csvfile:
        fieldname = ['TestDocs', 'Source', 'Duplicates']
        writer = csv.DictWriter(csvfile, fieldnames=fieldname)
        writer.writeheader()
        for doc in dicc:
            entry = dicc.get(doc)
            row = {'TestDocs': doc, 'Source': entry[0]}
            try:
                row['Duplicates'] = entry[1]
            except (IndexError, KeyError):
                # Entry without a duplicates part: the column is left empty.
                pass
            writer.writerow(row)
Beispiel #2
0
def _remove_listed_duplicates(candidates, duplicates, dupes_by_doc):
    """Move documents listed as duplicates of a candidate out of *candidates*.

    Both lists are modified in place.  A candidate that has itself already
    been recorded as a duplicate contributes nothing.
    """
    for doc in list(candidates):
        try:
            listed = list(dupes_by_doc.get(doc))
        except TypeError:
            # No entry for this document (None) or a non-iterable value.
            continue
        for other in listed:
            if other in duplicates or doc in duplicates:
                continue
            duplicates.append(other)
            # The duplicate need not be among the candidates.  The original
            # let list.remove() raise here and silently skipped the rest of
            # this document's duplicates.
            if other in candidates:
                candidates.remove(other)


def _write_candidate_table(path, table):
    """Write the ``{test doc: [candidates, duplicates]}`` table to *path*."""
    with open(path, 'w', encoding='utf-8', newline='') as csvfile:
        fieldname = ['TestDocs', 'Source_Candidate', 'Duplicates']
        writer = csv.DictWriter(csvfile, fieldnames=fieldname)
        writer.writeheader()
        for doc in table:
            entry = table.get(doc)
            row = {'TestDocs': doc, 'Source_Candidate': entry[0]}
            try:
                row['Duplicates'] = entry[1]
            except (IndexError, KeyError):
                # Entry without a duplicates part: the column is left empty.
                pass
            writer.writerow(row)


def run(filename, root, show = True, filter = True, filamt = 5):
    """Find source candidates for a test document and store them as CSV.

    Queries are extracted from *filename*, each query is resolved with
    ``getSource`` against the posting list and document frequencies, and the
    hits of all queries are merged into one ranking (highest score first, one
    entry per document).  Documents known to be duplicates of a higher ranked
    candidate are moved to a separate duplicate list.  The outcome is merged
    into a results CSV under the document's base name.

    Args:
        filename: Path of the test document.  The last four characters are
            stripped to build the CSV key, so a suffix such as ``.txt`` is
            assumed.  A path containing ``testdoc2`` makes
            ``duplicate/FirstDuplicates.csv`` the primary duplicate table and
            ``duplicate/NumericDuplicate.csv`` the secondary one; otherwise
            the roles are swapped.
        root: tkinter parent widget for the result windows; only used when
            *show* is true.
        show: When true, open three windows showing the queries, the hits per
            query and the final candidates.
        filter: When true, keep only the first *filamt* candidates and rebuild
            the duplicate list from the primary table entries of those
            candidates.  (The name shadows the builtin but is part of the
            public signature.)
        filamt: Number of candidates kept when *filter* is true.

    Returns:
        A tuple ``(candidates, duplicates)`` of two lists.
    """
    dicc = GlobalFunctions.openFiles('PLs/CompiledPLs/postingList.csv')
    df = GlobalFunctions.openFiles('PLs/dictionary/documentFreq.csv')
    testTF, queries = TestDocProcessing.queryExtract(filename)
    doc_name = ntpath.basename(filename)

    if show:
        win = tkinter.Toplevel(root)
        win.minsize(200, 200)
        win.title("Search Query for " + doc_name)

    # One grid column per query: header in row 0, the query terms below it.
    querySources = []
    for column, que in enumerate(queries):
        querySources.append(getSource(testTF, que, dicc, df))
        if not show:
            continue
        tkinter.Label(win, text="Search Query " + str(column + 1)).grid(row=0, column=column, padx=3)
        try:
            for row, q in enumerate(que, start=1):
                tkinter.Label(win, text=q).grid(row=row, column=column, padx=3)
        except (TypeError, tkinter.TclError):
            print("unable to call new window")

    if show:
        qwin = tkinter.Toplevel(win)
        qwin.minsize(200, 200)
        qwin.title("Query Result for " + doc_name)
        # Two grid columns per query: document name and score in percent.
        for index, src in enumerate(querySources):
            column = 2 * index
            tkinter.Label(qwin, text="Query Result " + str(index + 1)).grid(row=0, column=column, padx=3)
            tkinter.Label(qwin, text="Percentage " + str(index + 1)).grid(row=0, column=column + 1, padx=3)
            try:
                for row, s in enumerate(src, start=1):
                    tkinter.Label(qwin, text=s[0]).grid(row=row, column=column, padx=3)
                    tkinter.Label(qwin, text=round(s[1] * 100, 2)).grid(row=row, column=column + 1, padx=3)
            except (TypeError, IndexError, tkinter.TclError):
                print("no val")

    scored = []
    for hits in querySources:
        scored.extend(hits)

    # Ascending stable sort followed by reverse() is kept on purpose:
    # sorted(..., reverse=True) would order equal scores differently.
    scored.sort(key=operator.itemgetter(1))
    scored.reverse()

    # Keep only the best scored hit of every document.
    seen = []
    ranked = []
    for hit in scored:
        if hit[0] not in seen:
            seen.append(hit[0])
            ranked.append(hit)
    print(ranked)

    if 'testdoc2' in filename:
        primary_path = 'duplicate/FirstDuplicates.csv'
        secondary_path = 'duplicate/NumericDuplicate.csv'
    else:
        primary_path = 'duplicate/NumericDuplicate.csv'
        secondary_path = 'duplicate/FirstDuplicates.csv'

    result = [hit[0] for hit in ranked]
    duplicate = []

    primary_dupes = GlobalFunctions.openFiles(primary_path)
    _remove_listed_duplicates(result, duplicate, primary_dupes)
    secondary_dupes = GlobalFunctions.openFiles(secondary_path)
    _remove_listed_duplicates(result, duplicate, secondary_dupes)

    print(result)
    print(duplicate)

    if filter:
        result = result[:filamt]
        # Only the primary table entries of the surviving candidates are
        # reported; the table loaded above is reused instead of re-read.
        duplicate = []
        for doc in result:
            try:
                duplicate.extend(primary_dupes.get(doc))
            except TypeError:
                # No entry for this candidate.
                pass

    if show:
        fres = tkinter.Toplevel(qwin)
        fres.minsize(200, 200)
        fres.title("Source Candidate for " + doc_name)
        tkinter.Label(fres, text="Query Result").grid(row=0, column=0, padx=3)
        try:
            for row, s in enumerate(result, start=1):
                tkinter.Label(fres, text=s).grid(row=row, column=0, padx=3)
            if duplicate:
                # Header created once instead of once per duplicate.
                tkinter.Label(fres, text="Duplicate").grid(row=0, column=1, padx=3)
            for row, d in enumerate(duplicate, start=1):
                tkinter.Label(fres, text=d).grid(row=row, column=1, padx=3)
        except tkinter.TclError:
            print("no val")

    os.makedirs('output/', exist_ok=True)

    copyResult = list(result)
    copyDupe = list(duplicate)
    tupleS = [copyResult, copyDupe]
    print(tupleS)

    print(result)
    print(duplicate)

    # Read and write the SAME file.  The original read resultsNoFil.csv for
    # any filamt other than 5 or 10 but wrote resultsFilter<filamt>.csv.
    if filter:
        saveResult = 'output/resultsFilter' + str(filamt) + '.csv'
    else:
        saveResult = 'output/resultsNoFil.csv'

    doc_key = ntpath.basename(filename[:-4])
    try:
        dicc = GlobalFunctions.openResult(saveResult)
        dicc[doc_key] = tupleS
    except Exception:
        # Best effort: openResult is a project helper whose failure modes are
        # not visible here (typically no previous file) - start a new table.
        dicc = {doc_key: tupleS}

    _write_candidate_table(saveResult, dicc)

    return copyResult, copyDupe