def createDotplotFromResult(result, showIds=False):
    """Creates a dotplot from the result (PlagResult) of a similarity check.

    It uses the tiles of the PlagResult. Therefore it does not need w/k
    parameters.

    @param result PlagResult whose tiles are plotted; every tile is indexed
                  as (x start, y start, length)
    @param showIds if true, the image is passed through addIds() together
                   with the identifiers and the id string lengths
    @return The image: mode "L", sized by result.getIdStringLength(), filled
            with 255 and with one pixel of value 0 per tile position.
    @raise NoValidArgumentError if result is None or not a PlagResult
    """
    #check preconditions
    #None is tested first; behind the type check this branch was unreachable
    #and None was reported with the generic type message
    if result is None:
        raise NoValidArgumentError('Input must be of type PlagResult not None.')
    if type(result) != type(PlagResult()):
        raise NoValidArgumentError('Input must be of type PlagResult.')

    #query the lengths once and reuse them for the size and for addIds
    idStringLength = result.getIdStringLength()
    maxX = idStringLength[0]
    maxY = idStringLength[1]

    #create image (B/W)
    img = Image.new("L", (maxX, maxY), 255)

    #every tile becomes a diagonal run of tile[2] dark pixels
    for tile in result.getTiles():
        for i in range(tile[2]):
            img.putpixel((tile[0] + i, tile[1] + i), 0)

    if showIds:
        img = addIds(img, result.getIdentifier(), idStringLength)

    return img
def run(s1, s2, mML=3, treshold=0.5):
    """Compares the two given strings s1 and s2 and returns a PlagResult.

    The PlagResult carries the similarity value, the similarities as list
    of tiles and a boolean value indicating suspected plagiarism.

    Input:
        s1 and s2 : normalized Strings
        mML : minimumMatchingLength
        treshold : a single value between 0 and 1

    Output:
        PlagResult (built from the two hashes only if s1 or s2 is empty)

    Raises:
        OutOfRangeError : mML < 1 or treshold outside 0..1
        NoValidArgumentError : s1 or s2 is None or not of type str
    """
    #module level state, see the reset further down
    global tiles, matchList

    #check if the preconditions are fullfilled
    if mML < 1:
        raise OutOfRangeError('minimum Matching Length mML needs to be greater than 0')
    if not (0 <= treshold <= 1):
        raise OutOfRangeError('treshold t needs to be 0<=t<=1')
    if s1 is None or s2 is None:
        raise NoValidArgumentError('input must be of type string not None')
    if not (type(s1) is str and type(s2) is str):
        raise NoValidArgumentError('input must be of type string')

    #TODO: rethink how the identifiers are built
    id1 = hash(s1)
    id2 = hash(s2)

    #nothing to compare -> PlagResult that only carries the identifiers
    if not s1 or not s2:
        return PlagResult(id1, id2)

    #compute tiles
    #both globals are emptied before the algorithm runs
    #NOTE(review): presumably RKR_GST works on these module globals - confirm
    tiles = []  #TODO: handle differently? (tiles)
    matchList = []  #TODO: handle differently? (matchList)
    tiles = RKR_GST(s1, s2, mML)

    #compute similarity; values above 1 are cut down to 1
    simResult = calcSimilarity(s1.split(), s2.split(), tiles, treshold)
    similarity = 1 if simResult[0] > 1 else simResult[0]

    #create PlagResult and set attributes
    result = PlagResult()
    result.setIdentifier(id1, id2)
    result.setTiles(tiles)
    result.setSimilarity(similarity)
    result.setSuspectedPlagiarism(simResult[1])

    #return result of similarity check as PlagResult object
    return result
def getPositiveResults(resultList):
    """Filters the result list down to the suspected PlagResult objects.

    @param resultList list containing PlagResult objects
    @return A new list holding, in their original order, only the entries
            whose isSuspectPlagiarism() is true.
    """
    positives = []
    for candidate in resultList:
        if candidate.isSuspectPlagiarism():
            positives.append(candidate)
    return positives


#===============================================================================
# Test
#===============================================================================
if __name__ == '__main__':
    print("Start Tests - PlagResultList helper methods")

    #create Test PlagResult1 (suspected, similarity 0.65)
    plagResult = PlagResult("Test1", "Test2")
    plagResult.setAlgorithmName("NGRAM")
    plagResult.setNormalizerName("NORMAL")
    plagResult.setSimilarity(0.65)
    plagResult.setSuspectedPlagiarism(True)
    plagResult.setIdStringLength(52, 45)
    plagResult.setTiles([(3, 5, 4), (12, 23, 5), (34, 2, 3)])

    #create Test PlagResult2 (not suspected, similarity 0.45)
    plagResult2 = PlagResult("Test3", "Test4")
    plagResult2.setAlgorithmName("NGRAM")
    plagResult2.setNormalizerName("NORMAL")
    plagResult2.setSimilarity(0.45)
    plagResult2.setSuspectedPlagiarism(False)
    plagResult2.setIdStringLength(152, 145)
    plagResult2.setTiles([(3, 5, 4), (12, 23, 5), (34, 2, 3)])

    #create Test PlagResult3
def run(s1, s2, mML=3, treshold=0.5):
    """Runs a comparison on the strings s1 and s2 and reports it as PlagResult.

    The returned PlagResult contains the similarity value, the similarities
    as list of tiles and a boolean value indicating suspected plagiarism.

    Input:
        s1 and s2 : normalized Strings
        mML : minimumMatchingLength
        treshold : a single value between 0 and 1

    Output:
        PlagResult; when s1 or s2 is the empty string it is created from the
        two hashes alone and nothing else is set on it

    Raises:
        OutOfRangeError : mML is smaller than 1 or treshold is not in 0..1
        NoValidArgumentError : s1 or s2 is None or is not a str
    """
    #declared up front because both names are rebound below
    global tiles, matchList

    #check if the preconditions are fullfilled
    if mML < 1:
        raise OutOfRangeError('minimum Matching Length mML needs to be greater than 0')
    if not (0 <= treshold <= 1):
        raise OutOfRangeError('treshold t needs to be 0<=t<=1')
    if s1 is None or s2 is None:
        raise NoValidArgumentError('input must be of type string not None')
    if type(s1) is not str or type(s2) is not str:
        raise NoValidArgumentError('input must be of type string')

    #TODO: rethink how the identifiers are built
    identifiers = (hash(s1), hash(s2))

    #an empty input cannot match anything
    if not (s1 and s2):
        return PlagResult(identifiers[0], identifiers[1])

    #compute tiles
    #the module globals are cleared before every run
    #NOTE(review): RKR_GST presumably reads/writes them - verify
    tiles = []  #TODO: handle differently? (tiles)
    matchList = []  #TODO: handle differently? (matchList)
    tiles = RKR_GST(s1, s2, mML)

    #compute similarity on the whitespace separated tokens
    words1 = s1.split()
    words2 = s2.split()
    simResult = calcSimilarity(words1, words2, tiles, treshold)
    similarity = simResult[0]
    if similarity > 1:
        #cap the value at 1
        similarity = 1

    #create PlagResult and set attributes
    result = PlagResult()
    result.setIdentifier(identifiers[0], identifiers[1])
    result.setTiles(tiles)
    result.setSimilarity(similarity)
    result.setSuspectedPlagiarism(simResult[1])

    #return result of similarity check as PlagResult object
    return result
def run(s1, s2, mML=3, treshold=[0.03, 0.31]):
    """Compares the strings s1 and s2 by their NGrams and returns a PlagResult.

    The PlagResult contains the similarity value, the similarities as list
    of tiles, a boolean value indicating suspected plagiarism and the number
    of whitespace separated tokens of both strings.

    Input:
        s1 and s2 : normalized Strings

    Options:
    ========
        mML : minimumMatchingLength
        treshold : could be a single number(Resemblance) or a list of two
                   numbers(Resemblance and Containment)

    Output:
        PlagResult; if an input is empty or yields no NGrams it is created
        from the two hashes alone

    Raises:
        OutOfRangeError : mML < 1 or a treshold value outside 0..1
        AssertionError : treshold is a list that does not have two entries
        NoValidArgumentError : s1 or s2 is None or not of type str
    """
    #NOTE(review): the list default is one shared object; this function only
    #reads it and hands it on to calcSimilarity

    #check if the preconditions are fullfilled
    if mML < 1:
        raise OutOfRangeError('minimum Matching Length mML needs to be greater than 0')

    if type(treshold) == type([]):
        #list form: exactly two values, each within 0..1
        if len(treshold) != 2:
            raise AssertionError('Treshold must be a single Value (Resemblance) or a list of two Values [Resemblance, Containment]')
        if not (0 <= treshold[0] <= 1 and 0 <= treshold[1] <= 1):
            raise OutOfRangeError('tresholds values need to be 0<=t<=1')
    elif not (0 <= treshold <= 1):
        #single value form
        raise OutOfRangeError('treshold t needs to be 0<=t<=1')

    if s1 is None or s2 is None:
        raise NoValidArgumentError('input must be of type string not None')
    if not (type(s1) is str and type(s2) is str):
        raise NoValidArgumentError('input must be of type string')

    #TODO: rethink how the identifiers are built
    id1 = hash(s1)
    id2 = hash(s2)

    if not s1 or not s2:
        return PlagResult(id1, id2)

    #create NGrams for strings
    ngramDict1 = createNGrams(s1, mML)
    ngramDict2 = createNGrams(s2, mML)

    #no nGrams found -> return empty initialized PlagResult
    if ngramDict1 == {} or ngramDict2 == {}:
        return PlagResult(id1, id2)

    #compute similarity on the two sets of NGram keys
    simResult = calcSimilarity(set(ngramDict1), set(ngramDict2), treshold)

    #compute tiles; the second entry of the similarity result is needed here
    tiles = calcTiles(ngramDict1, ngramDict2, simResult[1], mML)

    #create PlagResult and set attributes
    result = PlagResult()
    result.setIdentifier(id1, id2)
    result.setTiles(tiles)
    result.setSimilarity(simResult[0])
    result.setSuspectedPlagiarism(simResult[2])
    result.setIdStringLength(len(s1.split()), len(s2.split()))

    #return result of similarity check as PlagResult object
    return result
def resultFromXML(xmlString):
    """Returns a PlagResult object from the given XML representation.

    Only the first PlagResult tag of the document is read. From it the
    Identifier (Id1/Id2), Algorithm, Normalizer, Similarity,
    SuspectedPlagiarism, Tiles/Tile (posId1, posId2, length) and
    IdStringLength (lengthId1/lengthId2) entries are copied into a new
    PlagResult.

    @param xmlString XML representation (whole file with a single PlagResult)
    @return A PlagResult object of the given XML representation.
    """
    from xml.dom import minidom

    def firstTag(parent, tagName):
        #first element with the given tag name below parent
        return parent.getElementsByTagName(tagName)[0]

    def textOf(parent, tagName):
        #text content of the first element with the given tag name
        return firstTag(parent, tagName).firstChild.data

    dom_object = minidom.parseString(xmlString)

    #get the first PlagResult tag in the XML representation
    plagResultTag = firstTag(dom_object, "PlagResult")

    #create a PlagResult object
    plagResult = PlagResult()

    #read attributes from XML and fill PlagResult
    identifierTag = firstTag(plagResultTag, "Identifier")
    plagResult.setIdentifier(identifierTag.getAttribute("Id1"),
                             identifierTag.getAttribute("Id2"))
    plagResult.setAlgorithmName(textOf(plagResultTag, "Algorithm"))
    plagResult.setNormalizerName(textOf(plagResultTag, "Normalizer"))
    plagResult.setSimilarity(float(textOf(plagResultTag, "Similarity")))

    #bool() of any non-empty text is True, so a stored "False" used to come
    #back as True; the text is therefore compared against the spellings of
    #false, and everything else keeps the former (true) meaning
    suspectedText = textOf(plagResultTag, "SuspectedPlagiarism")
    isSuspected = suspectedText.strip().lower() not in ("false", "0", "no", "")
    plagResult.setSuspectedPlagiarism(isSuspected)

    #every Tile tag becomes one (posId1, posId2, length) tuple
    tileTags = firstTag(plagResultTag, "Tiles").getElementsByTagName("Tile")
    tiles = [(int(tileTag.getAttribute("posId1")),
              int(tileTag.getAttribute("posId2")),
              int(tileTag.getAttribute("length")))
             for tileTag in tileTags]
    plagResult.setTiles(tiles)

    lengthTag = firstTag(plagResultTag, "IdStringLength")
    plagResult.setIdStringLength(int(lengthTag.getAttribute("lengthId1")),
                                 int(lengthTag.getAttribute("lengthId2")))

    return plagResult
tile = (int(tileTag.getAttribute("posId1")), int(tileTag.getAttribute("posId2")), int(tileTag.getAttribute("length"))) tiles.append(tile) plagResult.setTiles(tiles) plagResult.setIdStringLength(int(plagResultTag.getElementsByTagName("IdStringLength")[0].getAttribute("lengthId1")), int(plagResultTag.getElementsByTagName("IdStringLength")[0].getAttribute("lengthId2"))) return plagResult #=============================================================================== # Test #=============================================================================== if __name__ == '__main__': print "Start Test - Transfrom PlagResult to XML and Back" #create Test PlagResult plagResult = PlagResult("Test1", "Test2") plagResult.setAlgorithmName("NGRAM") plagResult.setNormalizerName("NORMAL") plagResult.setSimilarity(0.65) plagResult.setSuspectedPlagiarism(True) plagResult.setIdStringLength(52, 45) plagResult.setTiles([(3,5,4),(12,23,5),(34,2,3)]) #test transfrom to xml and back should result in the same PlagResult object result = resultFromXML(resultToXML(plagResult)) assert plagResult.__eq__(plagResult), "plagResult changed through transformation to xml and back" print "End Test - Transfrom PlagResult to XML and Back"
def getPositiveResults(resultList):
    """Collects every positive PlagResult object of the result list.

    @param resultList list containing PlagResult objects
    @return A list with only the positive, i.e. suspected, PlagResult
            objects (those whose isSuspectPlagiarism() is true), in the
            order in which they appear in resultList.
    """
    suspected = []
    for entry in resultList:
        if not entry.isSuspectPlagiarism():
            continue
        suspected.append(entry)
    return suspected


#===============================================================================
# Test
#===============================================================================
if __name__ == '__main__':
    print("Start Tests - PlagResultList helper methods")

    #tiles shared by the fixtures below (each fixture gets its own list)
    testTiles = [(3, 5, 4), (12, 23, 5), (34, 2, 3)]

    #create Test PlagResult1
    plagResult = PlagResult("Test1", "Test2")
    plagResult.setAlgorithmName("NGRAM")
    plagResult.setNormalizerName("NORMAL")
    plagResult.setSimilarity(0.65)
    plagResult.setSuspectedPlagiarism(True)
    plagResult.setIdStringLength(52, 45)
    plagResult.setTiles(list(testTiles))

    #create Test PlagResult2
    plagResult2 = PlagResult("Test3", "Test4")
    plagResult2.setAlgorithmName("NGRAM")
    plagResult2.setNormalizerName("NORMAL")
    plagResult2.setSimilarity(0.45)
    plagResult2.setSuspectedPlagiarism(False)
    plagResult2.setIdStringLength(152, 145)
    plagResult2.setTiles(list(testTiles))

    #create Test PlagResult3
def resultsToTorc(resultList, colored=False):
    """Takes the results and returns an image showing a Torc.

    A Torc is a kind of overview which allows the user to recognize the
    similarity relations between different texts. Therefore all texts are
    arranged on a circle. For each relation of similarity between two texts
    (i.e. each result with isSuspectPlagiarism() true) a connecting line is
    drawn.

    @param resultList list containing PlagResult objects
    @param colored if true, the line color is taken from getColorForScope()
                   for the cluster of the two ids, otherwise lines are black
    @return The RGB image, or None if resultList is empty.
    @raise NoValidArgumentError if resultList is not a list or contains
           anything that is not a PlagResult
    """
    #check preconditions
    if type(resultList) != type([]):
        raise NoValidArgumentError('Input must be of type list')
    if len(resultList) == 0:
        return None
    #build the reference type once instead of once per list entry
    plagResultType = type(PlagResult())
    for result in resultList:
        if type(result) != plagResultType:
            raise NoValidArgumentError('Input list should only contain values of type PlagResult.')

    #1. get all identifiers of the results (unique and sorted)
    idSet = set()
    for result in resultList:
        idSet.update(result.getIdentifier())
    idSet = sorted(idSet)

    #2. create a circle with a size depending on the number of identifier
    font = ImageFont.load_default()
    freespace = computeMaxIdLength(idSet, font)
    margin = 10
    radius = computeRadius(len(idSet))  #computes radius depending on number of ids
    offset = freespace + margin  #upper left corner of the circle's bounding box
    xM = offset + radius  #middle x pos of circle
    yM = offset + radius  #middle y pos of circle
    img = Image.new('RGB', (2 * xM, 2 * yM), (255, 255, 255))
    draw = ImageDraw.Draw(img)
    draw.arc((offset, offset, offset + (2 * radius), offset + (2 * radius)),
             0, 360, fill=(150, 150, 150))

    #3. arrange the ids along the circle and save the coordinates for each id
    #the angle is computed per id as a float: the former integer step
    #360 / len(idSet) was 0 for more than 360 ids (range() rejects a zero
    #step) and left an uneven gap whenever the count did not divide 360
    idCount = len(idSet)
    idPosDict = {}
    for idNr, identifier in enumerate(idSet):
        # x = xM + r * cos phi and y = yM + r * sin phi
        phi = radians(360.0 * idNr / idCount)
        idPosDict[identifier] = (xM + (radius * cos(phi)),
                                 yM + (radius * sin(phi)))

    #draw the id names with the default font, in sorted id order
    for identifier in idSet:
        label = str(identifier)
        draw.text(computeFontPos(font, draw, label, idPosDict[identifier], xM, yM),
                  label, font=font, fill=(0, 0, 0))

    #4. walk through the results and plot the similarity relations as lines
    #   between the Ids
    if colored:
        #TODO: pass these parameters in from outside?
        clusters = getClusters(resultList, onlyPositives=False,
                               onlyNonZeroSimilarities=False)
    for result in resultList:
        if not result.isSuspectPlagiarism():
            continue
        ids = result.getIdentifier()
        if colored:
            color = getColorForScope(getClusterNr(ids[0], ids[1], clusters),
                                     list(range(len(clusters))))
        else:
            color = (0, 0, 0)
        draw.line([idPosDict.get(ids[0]), idPosDict.get(ids[1])], fill=color)
    del draw  #free draw instance

    #5. return the image
    return img
def run(s1, s2, mML=3, treshold=0.5):
    """Tuned Version of GST

    Computes the tiles of s1 and s2 with GSTPrechelt and returns them
    together with the similarity value and the suspicion flag.

    Input:
        s1 and s2 : normalized Strings
        mML : minimumMatchingLength
        treshold : a single value between 0 and 1

    Output:
        PlagResult (created from the two hashes only if s1 or s2 is empty)

    Raises:
        OutOfRangeError : mML < 1 or treshold outside 0..1
        NoValidArgumentError : s1 or s2 is None or not of type str
    """
    #check if the preconditions are fullfilled
    if mML < 1:
        raise OutOfRangeError('minimum Matching Length mML needs to be greater than 0')
    if not (0 <= treshold <= 1):
        raise OutOfRangeError('treshold t needs to be 0<=t<=1')
    if s1 is None or s2 is None:
        raise NoValidArgumentError('input must be of type string not None')
    if not (type(s1) is str and type(s2) is str):
        raise NoValidArgumentError('input must be of type string')

    #TODO: rethink how the identifiers are built
    id1 = hash(s1)
    id2 = hash(s2)

    #an empty string cannot share anything with the other one
    if not s1 or not s2:
        return PlagResult(id1, id2)

    #compute tiles
    tiles = GSTPrechelt(s1, s2, mML)

    #compute similarity; a value above 1 is reduced to 1
    simResult = calcSimilarity(s1.split(), s2.split(), tiles, treshold)
    similarity = 1 if simResult[0] > 1 else simResult[0]

    #create PlagResult and set attributes
    result = PlagResult()
    result.setIdentifier(id1, id2)
    result.setTiles(tiles)
    result.setSimilarity(similarity)
    result.setSuspectedPlagiarism(simResult[1])

    #return result of similarity check as PlagResult object
    return result
def run(s1, s2, mML=3, treshold=[0.03, 0.31]):
    """Runs an NGram based comparison on the two given strings s1 and s2.

    Returns a PlagResult object containing the similarity value, the
    similarities as list of tiles, a boolean value indicating suspected
    plagiarism and the token counts of s1 and s2 as id string lengths.

    Input:
        s1 and s2 : normalized Strings

    Options:
    ========
        mML : minimumMatchingLength
        treshold : could be a single number(Resemblance) or a list of two
                   numbers(Resemblance and Containment)

    Output:
        PlagResult; it is created from the two hashes alone when an input
        is the empty string or when no NGrams were found for an input

    Raises:
        OutOfRangeError : mML is smaller than 1 or a treshold value is not
                          within 0..1
        AssertionError : treshold is a list whose length is not 2
        NoValidArgumentError : s1 or s2 is None or is not a str
    """
    #NOTE(review): the default treshold list is a single shared object; it
    #is only read in here and passed on to calcSimilarity

    #check if the preconditions are fullfilled
    if mML < 1:
        raise OutOfRangeError('minimum Matching Length mML needs to be greater than 0')

    tresholdIsList = type(treshold) == type([])
    if tresholdIsList and len(treshold) != 2:
        raise AssertionError('Treshold must be a single Value (Resemblance) or a list of two Values [Resemblance, Containment]')
    if tresholdIsList:
        #both entries have to lie within 0..1
        if not (0 <= treshold[0] <= 1 and 0 <= treshold[1] <= 1):
            raise OutOfRangeError('tresholds values need to be 0<=t<=1')
    elif not (0 <= treshold <= 1):
        raise OutOfRangeError('treshold t needs to be 0<=t<=1')

    if s1 is None or s2 is None:
        raise NoValidArgumentError('input must be of type string not None')
    if type(s1) is not str or type(s2) is not str:
        raise NoValidArgumentError('input must be of type string')

    #TODO: rethink how the identifiers are built
    identifiers = (hash(s1), hash(s2))

    if not (s1 and s2):
        return PlagResult(identifiers[0], identifiers[1])

    #create NGrams for strings
    ngramDict1 = createNGrams(s1, mML)
    ngramDict2 = createNGrams(s2, mML)

    #no nGrams found -> return empty initialized PlagResult
    if ngramDict1 == {} or ngramDict2 == {}:
        return PlagResult(identifiers[0], identifiers[1])

    #compute similarity from the NGram keys of both strings
    ngrams1 = set(ngramDict1)
    ngrams2 = set(ngramDict2)
    simResult = calcSimilarity(ngrams1, ngrams2, treshold)

    #compute tiles, using the second entry of the similarity result
    tiles = calcTiles(ngramDict1, ngramDict2, simResult[1], mML)

    #create PlagResult and set attributes
    result = PlagResult()
    result.setIdentifier(identifiers[0], identifiers[1])
    result.setTiles(tiles)
    result.setSimilarity(simResult[0])
    result.setSuspectedPlagiarism(simResult[2])
    result.setIdStringLength(len(s1.split()), len(s2.split()))

    #return result of similarity check as PlagResult object
    return result