def run(s1, s2, mML=3, treshold=[0.03, 0.31]):
    """Compare two normalized strings using n-grams and return a PlagResult.

    The n-gram dictionaries of both strings are created with
    createNGrams(s, mML); their key sets are handed to calcSimilarity and
    the matching tiles are computed by calcTiles. The outcome is packed into
    a PlagResult object containing the similarity value, the similarities as
    list of tiles and a boolean value indicating suspected plagiarism.

    Input:
        s1 and s2 : normalized Strings

    Options:
    ========
        mML      : minimumMatchingLength (must be >= 1)
        treshold : could be a single number (Resemblance) or a list of
                   two numbers (Resemblance and Containment), each 0<=t<=1

    Output:
        PlagResult. If one of the strings is empty, or no n-grams were
        found for one of them, a PlagResult initialised only with the two
        identifiers is returned.

    Raises:
        OutOfRangeError      : mML < 1 or a treshold value outside 0<=t<=1
        AssertionError       : treshold is a list that does not have
                               exactly two entries
        NoValidArgumentError : s1 or s2 is None or not a string
    """
    # NOTE: the default treshold list is shared between calls; this function
    # only reads it and must never mutate it.

    # check if the preconditions are fulfilled
    if mML < 1:
        raise OutOfRangeError('minimum Matching Length mML needs to be greater than 0')
    if isinstance(treshold, list):
        if len(treshold) != 2:
            raise AssertionError('Treshold must be a single Value (Resemblance) or a list of two Values [Resemblance, Containment]')
        if not (0 <= treshold[0] <= 1) or not (0 <= treshold[1] <= 1):
            raise OutOfRangeError('tresholds values need to be 0<=t<=1')
    elif not (0 <= treshold <= 1):
        raise OutOfRangeError('treshold t needs to be 0<=t<=1')
    if s1 is None or s2 is None:
        raise NoValidArgumentError('input must be of type string not None')
    if not isinstance(s1, str) or not isinstance(s2, str):
        raise NoValidArgumentError('input must be of type string')

    # TODO: reconsider how the identifiers are built (currently hash values)
    id1, id2 = hash(s1), hash(s2)

    # nothing to compare -> return empty initialized PlagResult
    if not s1 or not s2:
        return PlagResult(id1, id2)

    # create NGrams for strings
    ngramDict1 = createNGrams(s1, mML)
    ngramDict2 = createNGrams(s2, mML)

    # no nGrams found -> return empty initialized PlagResult
    if not ngramDict1 or not ngramDict2:
        return PlagResult(id1, id2)

    # compute similarity; simResult is indexed below as
    # [0] similarity value, [1] input for calcTiles, [2] suspicion flag
    simResult = calcSimilarity(set(ngramDict1), set(ngramDict2), treshold)

    # compute tiles
    tiles = calcTiles(ngramDict1, ngramDict2, simResult[1], mML)

    # create PlagResult and set attributes
    result = PlagResult()
    result.setIdentifier(id1, id2)
    result.setTiles(tiles)
    result.setSimilarity(simResult[0])
    result.setSuspectedPlagiarism(simResult[2])
    # lengths are counted in whitespace separated tokens
    result.setIdStringLength(len(s1.split()), len(s2.split()))

    # return result of similarity check as PlagResult object
    return result
def run(s1, s2, mML=3, treshold=[0.03, 0.31]):
    """Compare two normalized strings using n-grams and return a PlagResult.

    The n-gram dictionaries of both strings are created with
    createNGrams(s, mML); their key sets are handed to calcSimilarity and
    the matching tiles are computed by calcTiles. The outcome is packed into
    a PlagResult object containing the similarity value, the similarities as
    list of tiles and a boolean value indicating suspected plagiarism.

    Input:
        s1 and s2 : normalized Strings

    Options:
    ========
        mML      : minimumMatchingLength (must be >= 1)
        treshold : could be a single number (Resemblance) or a list of
                   two numbers (Resemblance and Containment), each 0<=t<=1

    Output:
        PlagResult. If one of the strings is empty, or no n-grams were
        found for one of them, a PlagResult initialised only with the two
        identifiers is returned.

    Raises:
        OutOfRangeError      : mML < 1 or a treshold value outside 0<=t<=1
        AssertionError       : treshold is a list that does not have
                               exactly two entries
        NoValidArgumentError : s1 or s2 is None or not a string
    """
    # NOTE: the default treshold list is shared between calls; this function
    # only reads it and must never mutate it.

    # check if the preconditions are fulfilled
    if mML < 1:
        raise OutOfRangeError('minimum Matching Length mML needs to be greater than 0')
    if isinstance(treshold, list):
        if len(treshold) != 2:
            raise AssertionError('Treshold must be a single Value (Resemblance) or a list of two Values [Resemblance, Containment]')
        if not (0 <= treshold[0] <= 1) or not (0 <= treshold[1] <= 1):
            raise OutOfRangeError('tresholds values need to be 0<=t<=1')
    elif not (0 <= treshold <= 1):
        raise OutOfRangeError('treshold t needs to be 0<=t<=1')
    if s1 is None or s2 is None:
        raise NoValidArgumentError('input must be of type string not None')
    if not isinstance(s1, str) or not isinstance(s2, str):
        raise NoValidArgumentError('input must be of type string')

    # TODO: reconsider how the identifiers are built (currently hash values)
    id1, id2 = hash(s1), hash(s2)

    # nothing to compare -> return empty initialized PlagResult
    if not s1 or not s2:
        return PlagResult(id1, id2)

    # create NGrams for strings
    ngramDict1 = createNGrams(s1, mML)
    ngramDict2 = createNGrams(s2, mML)

    # no nGrams found -> return empty initialized PlagResult
    if not ngramDict1 or not ngramDict2:
        return PlagResult(id1, id2)

    # compute similarity; simResult is indexed below as
    # [0] similarity value, [1] input for calcTiles, [2] suspicion flag
    simResult = calcSimilarity(set(ngramDict1), set(ngramDict2), treshold)

    # compute tiles
    tiles = calcTiles(ngramDict1, ngramDict2, simResult[1], mML)

    # create PlagResult and set attributes
    result = PlagResult()
    result.setIdentifier(id1, id2)
    result.setTiles(tiles)
    result.setSimilarity(simResult[0])
    result.setSuspectedPlagiarism(simResult[2])
    # lengths are counted in whitespace separated tokens
    result.setIdStringLength(len(s1.split()), len(s2.split()))

    # return result of similarity check as PlagResult object
    return result
def resultFromXML(xmlString):
    """Returns a PlagResult object from the given XML representation.

    Only the first PlagResult tag found in the document is evaluated.

    @param xmlString XML representation (whole file with a single PlagResult)
    @return A PlagResult object of the given XML representation.
    """
    from xml.dom import minidom

    dom_object = minidom.parseString(xmlString)

    # get the first PlagResult tag of the XML representation
    plagResultTag = dom_object.getElementsByTagName("PlagResult")[0]

    def firstTag(tagName):
        # first element with the given tag name below the PlagResult tag
        return plagResultTag.getElementsByTagName(tagName)[0]

    def textOf(tagName):
        # text content of the first element with the given tag name
        return firstTag(tagName).firstChild.data

    # create a PlagResult object
    plagResult = PlagResult()

    # read attributes from XML and fill PlagResult
    identifierTag = firstTag("Identifier")
    plagResult.setIdentifier(identifierTag.getAttribute("Id1"),
                             identifierTag.getAttribute("Id2"))
    plagResult.setAlgorithmName(textOf("Algorithm"))
    plagResult.setNormalizerName(textOf("Normalizer"))
    plagResult.setSimilarity(float(textOf("Similarity")))

    # bool() of any non-empty string is True, so the text "False" would be
    # read back as suspected plagiarism; interpret the text instead.
    # NOTE(review): assumes the writer stores True/False (or 1/0, yes/no) --
    # confirm against resultToXML.
    suspectedText = textOf("SuspectedPlagiarism").strip().lower()
    plagResult.setSuspectedPlagiarism(suspectedText in ("true", "1", "yes"))

    # every Tile tag becomes a (posId1, posId2, length) tuple of ints
    tiles = [(int(tileTag.getAttribute("posId1")),
              int(tileTag.getAttribute("posId2")),
              int(tileTag.getAttribute("length")))
             for tileTag in firstTag("Tiles").getElementsByTagName("Tile")]
    plagResult.setTiles(tiles)

    lengthTag = firstTag("IdStringLength")
    plagResult.setIdStringLength(int(lengthTag.getAttribute("lengthId1")),
                                 int(lengthTag.getAttribute("lengthId2")))

    return plagResult
def run(s1, s2, mML=3, treshold=0.5):
    """Compare two normalized strings with RKR_GST and return a PlagResult.

    This method runs a comparison on the two given strings s1 and s2
    returning a PlagResult object containing the similarity value, the
    similarities as list of tiles and a boolean value indicating suspected
    plagiarism.

    Input:
        s1 and s2 : normalized Strings
        mML       : minimumMatchingLength (must be >= 1)
        treshold  : a single value between 0 and 1

    Output:
        PlagResult. If one of the strings is empty a PlagResult initialised
        only with the two identifiers is returned.

    Raises:
        OutOfRangeError      : mML < 1 or treshold outside 0<=t<=1
        NoValidArgumentError : s1 or s2 is None or not a string

    Side effects:
        Rebinds the module-level names `tiles` and `matchList`.
    """
    global tiles, matchList

    # check if the preconditions are fulfilled
    if mML < 1:
        raise OutOfRangeError('minimum Matching Length mML needs to be greater than 0')
    if not (0 <= treshold <= 1):
        raise OutOfRangeError('treshold t needs to be 0<=t<=1')
    if s1 is None or s2 is None:
        raise NoValidArgumentError('input must be of type string not None')
    if not isinstance(s1, str) or not isinstance(s2, str):
        raise NoValidArgumentError('input must be of type string')

    # TODO: reconsider how the identifiers are built (currently hash values)
    id1, id2 = hash(s1), hash(s2)

    # nothing to compare -> return empty initialized PlagResult
    if not s1 or not s2:
        return PlagResult(id1, id2)

    # compute tiles
    # The module-level state is reset before RKR_GST runs; presumably
    # RKR_GST works on these globals -- TODO confirm.
    tiles = []      # TODO: handle differently? (tiles)
    matchList = []  # TODO: handle differently? (matchList)
    tiles = RKR_GST(s1, s2, mML)

    # compute similarity on the whitespace separated tokens; simResult is
    # indexed below as [0] similarity value, [1] suspicion flag
    simResult = calcSimilarity(s1.split(), s2.split(), tiles, treshold)
    # cap the similarity at 1
    similarity = simResult[0]
    if similarity > 1:
        similarity = 1

    # create PlagResult and set attributes
    result = PlagResult()
    result.setIdentifier(id1, id2)
    result.setTiles(tiles)
    result.setSimilarity(similarity)
    result.setSuspectedPlagiarism(simResult[1])

    # return result of similarity check as PlagResult object
    return result
def run(s1, s2, mML=3, treshold=0.5):
    """Compare two normalized strings with RKR_GST and return a PlagResult.

    This method runs a comparison on the two given strings s1 and s2
    returning a PlagResult object containing the similarity value, the
    similarities as list of tiles and a boolean value indicating suspected
    plagiarism.

    Input:
        s1 and s2 : normalized Strings
        mML       : minimumMatchingLength (must be >= 1)
        treshold  : a single value between 0 and 1

    Output:
        PlagResult. If one of the strings is empty a PlagResult initialised
        only with the two identifiers is returned.

    Raises:
        OutOfRangeError      : mML < 1 or treshold outside 0<=t<=1
        NoValidArgumentError : s1 or s2 is None or not a string

    Side effects:
        Rebinds the module-level names `tiles` and `matchList`.
    """
    global tiles, matchList

    # check if the preconditions are fulfilled
    if mML < 1:
        raise OutOfRangeError('minimum Matching Length mML needs to be greater than 0')
    if not (0 <= treshold <= 1):
        raise OutOfRangeError('treshold t needs to be 0<=t<=1')
    if s1 is None or s2 is None:
        raise NoValidArgumentError('input must be of type string not None')
    if not isinstance(s1, str) or not isinstance(s2, str):
        raise NoValidArgumentError('input must be of type string')

    # TODO: reconsider how the identifiers are built (currently hash values)
    id1, id2 = hash(s1), hash(s2)

    # nothing to compare -> return empty initialized PlagResult
    if not s1 or not s2:
        return PlagResult(id1, id2)

    # compute tiles
    # The module-level state is reset before RKR_GST runs; presumably
    # RKR_GST works on these globals -- TODO confirm.
    tiles = []      # TODO: handle differently? (tiles)
    matchList = []  # TODO: handle differently? (matchList)
    tiles = RKR_GST(s1, s2, mML)

    # compute similarity on the whitespace separated tokens; simResult is
    # indexed below as [0] similarity value, [1] suspicion flag
    simResult = calcSimilarity(s1.split(), s2.split(), tiles, treshold)
    # cap the similarity at 1
    similarity = simResult[0]
    if similarity > 1:
        similarity = 1

    # create PlagResult and set attributes
    result = PlagResult()
    result.setIdentifier(id1, id2)
    result.setTiles(tiles)
    result.setSimilarity(similarity)
    result.setSuspectedPlagiarism(simResult[1])

    # return result of similarity check as PlagResult object
    return result
def resultFromXML(xmlString):
    """Returns a PlagResult object from the given XML representation.

    Only the first PlagResult tag found in the document is evaluated.

    @param xmlString XML representation (whole file with a single PlagResult)
    @return A PlagResult object of the given XML representation.
    """
    from xml.dom import minidom

    dom_object = minidom.parseString(xmlString)

    # get the first PlagResult tag of the XML representation
    plagResultTag = dom_object.getElementsByTagName("PlagResult")[0]

    def firstTag(tagName):
        # first element with the given tag name below the PlagResult tag
        return plagResultTag.getElementsByTagName(tagName)[0]

    def textOf(tagName):
        # text content of the first element with the given tag name
        return firstTag(tagName).firstChild.data

    # create a PlagResult object
    plagResult = PlagResult()

    # read attributes from XML and fill PlagResult
    identifierTag = firstTag("Identifier")
    plagResult.setIdentifier(identifierTag.getAttribute("Id1"),
                             identifierTag.getAttribute("Id2"))
    plagResult.setAlgorithmName(textOf("Algorithm"))
    plagResult.setNormalizerName(textOf("Normalizer"))
    plagResult.setSimilarity(float(textOf("Similarity")))

    # bool() of any non-empty string is True, so the text "False" would be
    # read back as suspected plagiarism; interpret the text instead.
    # NOTE(review): assumes the writer stores True/False (or 1/0, yes/no) --
    # confirm against resultToXML.
    suspectedText = textOf("SuspectedPlagiarism").strip().lower()
    plagResult.setSuspectedPlagiarism(suspectedText in ("true", "1", "yes"))

    # every Tile tag becomes a (posId1, posId2, length) tuple of ints
    tiles = [(int(tileTag.getAttribute("posId1")),
              int(tileTag.getAttribute("posId2")),
              int(tileTag.getAttribute("length")))
             for tileTag in firstTag("Tiles").getElementsByTagName("Tile")]
    plagResult.setTiles(tiles)

    lengthTag = firstTag("IdStringLength")
    plagResult.setIdStringLength(int(lengthTag.getAttribute("lengthId1")),
                                 int(lengthTag.getAttribute("lengthId2")))

    return plagResult
def run(s1, s2, mML=3, treshold=0.5):
    """Tuned Version of GST

    Computes the tiles of the two strings with GSTPrechelt, derives the
    similarity from them via calcSimilarity and packs everything into a
    PlagResult object.

    Input:
        s1 and s2 : normalized Strings
        mML       : minimumMatchingLength (must be >= 1)
        treshold  : a single value between 0 and 1

    Output:
        PlagResult. If one of the strings is empty a PlagResult initialised
        only with the two identifiers is returned.

    Raises:
        OutOfRangeError      : mML < 1 or treshold outside 0<=t<=1
        NoValidArgumentError : s1 or s2 is None or not a string
    """
    # check if the preconditions are fulfilled
    if mML < 1:
        raise OutOfRangeError('minimum Matching Length mML needs to be greater than 0')
    if not (0 <= treshold <= 1):
        raise OutOfRangeError('treshold t needs to be 0<=t<=1')
    if s1 is None or s2 is None:
        raise NoValidArgumentError('input must be of type string not None')
    if not isinstance(s1, str) or not isinstance(s2, str):
        raise NoValidArgumentError('input must be of type string')

    # TODO: reconsider how the identifiers are built (currently hash values)
    id1, id2 = hash(s1), hash(s2)

    # nothing to compare -> return empty initialized PlagResult
    if not s1 or not s2:
        return PlagResult(id1, id2)

    # compute tiles
    tiles = GSTPrechelt(s1, s2, mML)

    # compute similarity on the whitespace separated tokens; simResult is
    # indexed below as [0] similarity value, [1] suspicion flag
    simResult = calcSimilarity(s1.split(), s2.split(), tiles, treshold)
    # cap the similarity at 1
    similarity = simResult[0]
    if similarity > 1:
        similarity = 1

    # create PlagResult and set attributes
    result = PlagResult()
    result.setIdentifier(id1, id2)
    result.setTiles(tiles)
    result.setSimilarity(similarity)
    result.setSuspectedPlagiarism(simResult[1])

    # return result of similarity check as PlagResult object
    return result
@return A list containing only positive, i.e. suspected, PlagResult objects. """ return [r for r in resultList if r.isSuspectPlagiarism()] #=============================================================================== # Test #=============================================================================== if __name__ == '__main__': print "Start Tests - PlagResultList helper methods" #create Test PlagResult1 plagResult = PlagResult("Test1", "Test2") plagResult.setAlgorithmName("NGRAM") plagResult.setNormalizerName("NORMAL") plagResult.setSimilarity(0.65) plagResult.setSuspectedPlagiarism(True) plagResult.setIdStringLength(52, 45) plagResult.setTiles([(3, 5, 4), (12, 23, 5), (34, 2, 3)]) #create Test PlagResult2 plagResult2 = PlagResult("Test3", "Test4") plagResult2.setAlgorithmName("NGRAM") plagResult2.setNormalizerName("NORMAL") plagResult2.setSimilarity(0.45) plagResult2.setSuspectedPlagiarism(False) plagResult2.setIdStringLength(152, 145) plagResult2.setTiles([(3, 5, 4), (12, 23, 5), (34, 2, 3)]) #create Test PlagResult3 plagResult3 = PlagResult("Test5", "Test6") plagResult3.setAlgorithmName("NGRAM") plagResult3.setNormalizerName("NORMAL") plagResult3.setSimilarity(0.75)
tile = (int(tileTag.getAttribute("posId1")), int(tileTag.getAttribute("posId2")), int(tileTag.getAttribute("length"))) tiles.append(tile) plagResult.setTiles(tiles) plagResult.setIdStringLength(int(plagResultTag.getElementsByTagName("IdStringLength")[0].getAttribute("lengthId1")), int(plagResultTag.getElementsByTagName("IdStringLength")[0].getAttribute("lengthId2"))) return plagResult #=============================================================================== # Test #=============================================================================== if __name__ == '__main__': print "Start Test - Transfrom PlagResult to XML and Back" #create Test PlagResult plagResult = PlagResult("Test1", "Test2") plagResult.setAlgorithmName("NGRAM") plagResult.setNormalizerName("NORMAL") plagResult.setSimilarity(0.65) plagResult.setSuspectedPlagiarism(True) plagResult.setIdStringLength(52, 45) plagResult.setTiles([(3,5,4),(12,23,5),(34,2,3)]) #test transfrom to xml and back should result in the same PlagResult object result = resultFromXML(resultToXML(plagResult)) assert plagResult.__eq__(plagResult), "plagResult changed through transformation to xml and back" print "End Test - Transfrom PlagResult to XML and Back"
@param resultList list containing PlagResult objects @return A list containing only positive, i.e. suspected, PlagResult objects. """ return [r for r in resultList if r.isSuspectPlagiarism()] #=============================================================================== # Test #=============================================================================== if __name__ == '__main__': print "Start Tests - PlagResultList helper methods" #create Test PlagResult1 plagResult = PlagResult("Test1", "Test2") plagResult.setAlgorithmName("NGRAM") plagResult.setNormalizerName("NORMAL") plagResult.setSimilarity(0.65) plagResult.setSuspectedPlagiarism(True) plagResult.setIdStringLength(52, 45) plagResult.setTiles([(3,5,4),(12,23,5),(34,2,3)]) #create Test PlagResult2 plagResult2 = PlagResult("Test3", "Test4") plagResult2.setAlgorithmName("NGRAM") plagResult2.setNormalizerName("NORMAL") plagResult2.setSimilarity(0.45) plagResult2.setSuspectedPlagiarism(False) plagResult2.setIdStringLength(152, 145) plagResult2.setTiles([(3,5,4),(12,23,5),(34,2,3)]) #create Test PlagResult3 plagResult3 = PlagResult("Test5", "Test6") plagResult3.setAlgorithmName("NGRAM") plagResult3.setNormalizerName("NORMAL") plagResult3.setSimilarity(0.75)