コード例 #1
0
def run(s1, s2, mML=3, treshold=[0.03, 0.31]):
    """Compare two strings through their n-grams and wrap the outcome in a
        PlagResult (similarity value, tiles, suspected-plagiarism flag).

        Input:  s1 and s2 : normalized Strings

                Options:
                ========
                    mML      : minimumMatchingLength, has to be >= 1
                    treshold : either a single number (Resemblance) or a
                                list of exactly two numbers (Resemblance
                                and Containment); every value within 0..1
        Output: PlagResult
                If one of the strings is empty, or no n-grams could be
                created for one of them, the PlagResult is built from the
                two identifiers only.

        Raises: OutOfRangeError      : mML < 1 or a treshold outside 0..1
                AssertionError       : treshold list not of length two
                NoValidArgumentError : s1 or s2 is None or not a string
    """
    # --- preconditions; the order decides which error is reported first ---
    if mML < 1:
        raise OutOfRangeError('minimum Matching Length mML needs to be greater than 0')
    if type(treshold) != type([]):
        # single-number form
        if not (0 <= treshold <= 1):
            raise OutOfRangeError('treshold t needs to be 0<=t<=1')
    elif len(treshold) != 2:
        raise AssertionError('Treshold must be a  single Value (Resemblance) or a list of two Values [Resemblance, Containment]')
    elif not (0 <= treshold[0] <= 1 and 0 <= treshold[1] <= 1):
        raise OutOfRangeError('tresholds values need to be 0<=t<=1')
    if None in (s1, s2):
        raise NoValidArgumentError('input must be of type string not None')
    plainString = type('')
    if not (type(s1) == plainString and type(s2) == plainString):
        raise NoValidArgumentError('input must be of type string')

    # the identifiers are simply the hashes of the inputs
    # TODO: reconsider how the identifiers are built
    id1 = hash(s1)
    id2 = hash(s2)
    if '' in (s1, s2):
        return PlagResult(id1, id2)

    # n-gram dictionaries for both strings
    grams1 = createNGrams(s1, mML)
    grams2 = createNGrams(s2, mML)

    # nothing to compare -> PlagResult initialised with the identifiers only
    if grams1 == {} or grams2 == {}:
        return PlagResult(id1, id2)

    # outcome[0]: similarity, outcome[1]: handed on to calcTiles,
    # outcome[2]: suspected-plagiarism flag
    outcome = calcSimilarity(set(grams1.keys()), set(grams2.keys()), treshold)
    foundTiles = calcTiles(grams1, grams2, outcome[1], mML)

    # fill the result object
    report = PlagResult()
    report.setIdentifier(id1, id2)
    report.setTiles(foundTiles)
    report.setSimilarity(outcome[0])
    report.setSuspectedPlagiarism(outcome[2])
    # lengths are counted in whitespace-separated tokens
    report.setIdStringLength(len(s1.split()), len(s2.split()))
    return report
コード例 #2
0
def run(s1, s2, mML=3, treshold=[0.03, 0.31]):
    """N-gram based comparison of the strings s1 and s2.

        The returned PlagResult holds the similarity value, the
        similarities as a list of tiles and a boolean value indicating
        suspected plagiarism.

        Input:  s1 and s2 : normalized Strings

                Options:
                ========
                    mML      : minimumMatchingLength (>= 1)
                    treshold : a single number (Resemblance) or a list of
                                two numbers (Resemblance and Containment),
                                each of them between 0 and 1
        Output: PlagResult; when an input is the empty string or produces
                no n-grams it is constructed from the two hashes alone.

        Raises: OutOfRangeError, AssertionError (treshold list whose
                length is not 2), NoValidArgumentError
    """
    # check the preconditions -- same sequence of checks for every call
    if mML < 1:
        raise OutOfRangeError('minimum Matching Length mML needs to be greater than 0')

    tresholdIsList = type(treshold) == type([])
    if tresholdIsList and len(treshold) != 2:
        raise AssertionError('Treshold must be a  single Value (Resemblance) or a list of two Values [Resemblance, Containment]')
    if tresholdIsList:
        if not (0 <= treshold[0] <= 1) or not (0 <= treshold[1] <= 1):
            raise OutOfRangeError('tresholds values need to be 0<=t<=1')
    elif not (0 <= treshold <= 1):
        raise OutOfRangeError('treshold t needs to be 0<=t<=1')

    if s1 == None or s2 == None:
        raise NoValidArgumentError('input must be of type string not None')
    for candidate in (s1, s2):
        if type(candidate) != type(''):
            raise NoValidArgumentError('input must be of type string')

    if s1 == '' or s2 == '':
        # TODO: reconsider how the identifiers are built
        return PlagResult(hash(s1), hash(s2))

    # build the n-grams of both strings
    firstGrams = createNGrams(s1, mML)
    secondGrams = createNGrams(s2, mML)
    if firstGrams == {} or secondGrams == {}:
        # no n-grams found -> PlagResult without any measurements
        return PlagResult(hash(s1), hash(s2))

    # similarity is computed on the key sets of the n-gram dictionaries
    firstKeys = set(firstGrams.keys())
    secondKeys = set(secondGrams.keys())
    measured = calcSimilarity(firstKeys, secondKeys, treshold)
    score = measured[0]

    # the second entry of the similarity result feeds the tile computation
    tileList = calcTiles(firstGrams, secondGrams, measured[1], mML)

    # assemble the PlagResult
    verdict = PlagResult()
    verdict.setIdentifier(hash(s1), hash(s2))
    verdict.setTiles(tileList)
    verdict.setSimilarity(score)
    verdict.setSuspectedPlagiarism(measured[2])
    verdict.setIdStringLength(len(s1.split()), len(s2.split()))
    return verdict
コード例 #3
0
ファイル: XMLHelper.py プロジェクト: dtgit/dtedu
def resultFromXML(xmlString):
    """Returns a PlagResult object from the given XML representation.

        Only the first PlagResult element of the document is read.  The
        text of the SuspectedPlagiarism element is interpreted as a
        boolean: "true", "1" and "yes" (case-insensitive, surrounding
        whitespace ignored) mean True, every other text means False.

        @param xmlString XML representation (whole file with a single PlagResult)
        @return A PlagResult object of the given XML representation.
        @raise IndexError if one of the expected elements is missing
    """
    from xml.dom import minidom

    def firstTag(node, name):
        # first descendant element called `name`
        return node.getElementsByTagName(name)[0]

    def textOf(node, name):
        # character data of the first child of the first `name` element
        return firstTag(node, name).firstChild.data

    # get the first PlagResult tag of the XML representation
    plagResultTag = firstTag(minidom.parseString(xmlString), "PlagResult")

    # create a PlagResult object
    plagResult = PlagResult()

    # read attributes from XML and fill PlagResult
    identifierTag = firstTag(plagResultTag, "Identifier")
    plagResult.setIdentifier(identifierTag.getAttribute("Id1"),
                             identifierTag.getAttribute("Id2"))
    plagResult.setAlgorithmName(textOf(plagResultTag, "Algorithm"))
    plagResult.setNormalizerName(textOf(plagResultTag, "Normalizer"))
    plagResult.setSimilarity(float(textOf(plagResultTag, "Similarity")))

    # bool() of any non-empty string is True, so a stored "False" has to be
    # recognised by its text instead of being passed through bool()
    suspectedText = textOf(plagResultTag, "SuspectedPlagiarism").strip().lower()
    plagResult.setSuspectedPlagiarism(suspectedText in ("true", "1", "yes"))

    # every Tile element becomes a (posId1, posId2, length) tuple of ints
    tileTags = firstTag(plagResultTag, "Tiles").getElementsByTagName("Tile")
    tiles = [(int(tileTag.getAttribute("posId1")),
              int(tileTag.getAttribute("posId2")),
              int(tileTag.getAttribute("length")))
             for tileTag in tileTags]
    plagResult.setTiles(tiles)

    lengthTag = firstTag(plagResultTag, "IdStringLength")
    plagResult.setIdStringLength(int(lengthTag.getAttribute("lengthId1")),
                                 int(lengthTag.getAttribute("lengthId2")))

    return plagResult
コード例 #4
0
def run(s1, s2, mML=3, treshold=0.5):
    """Compare the strings s1 and s2 via RKR_GST and return a PlagResult
        with the similarity value (capped at 1), the similarities as list
        of tiles and a boolean value indicating suspected plagiarism.

        Input:  s1 and s2 : normalized Strings
                mML : minimumMatchingLength (>= 1)
                treshold : a single value between 0 and 1
        Output: PlagResult; built from the two hashes only when one of
                the strings is empty

        Raises: OutOfRangeError      : mML < 1 or treshold outside 0..1
                NoValidArgumentError : s1 or s2 is None or not a string

        Side effect: the module-level names `tiles` and `matchList` are
        rebound on every call.
    """
    # --- preconditions ---
    if mML < 1:
        raise OutOfRangeError('minimum Matching Length mML needs to be greater than 0')
    if not (0 <= treshold <= 1):
        raise OutOfRangeError('treshold t needs to be 0<=t<=1')
    if None in (s1, s2):
        raise NoValidArgumentError('input must be of type string not None')
    plainString = type('')
    if not (type(s1) == plainString and type(s2) == plainString):
        raise NoValidArgumentError('input must be of type string')
    if '' in (s1, s2):
        # TODO: reconsider how the identifiers are built
        return PlagResult(hash(s1), hash(s2))

    # --- tiles ---
    # Both globals are emptied before the run.
    # NOTE(review): presumably RKR_GST works on these globals -- confirm.
    # TODO: find another way to handle tiles / matchList
    global tiles, matchList
    matchList = []
    tiles = []
    tiles = RKR_GST(s1, s2, mML)

    # --- similarity, computed on the whitespace-separated tokens ---
    measured = calcSimilarity(s1.split(), s2.split(), tiles, treshold)
    score = min(measured[0], 1)  # never report more than 1

    # --- assemble the PlagResult ---
    report = PlagResult()
    report.setIdentifier(hash(s1), hash(s2))
    report.setTiles(tiles)
    report.setSimilarity(score)
    report.setSuspectedPlagiarism(measured[1])
    return report
コード例 #5
0
ファイル: algGST.py プロジェクト: collective/ECAssignmentBox
def run(s1, s2, mML=3, treshold=0.5):
    """Greedy-String-Tiling comparison of the strings s1 and s2.

        The returned PlagResult contains the similarity value (never
        larger than 1), the tiles delivered by RKR_GST and a boolean
        value indicating suspected plagiarism.

        Input:  s1 and s2 : normalized Strings
                mML : minimumMatchingLength (>= 1)
                treshold : a single value between 0 and 1
        Output: PlagResult; for an empty s1 or s2 it is constructed from
                the two hashes alone

        Raises: OutOfRangeError, NoValidArgumentError

        Note:   rebinds the module-level names `tiles` and `matchList`.
    """
    global tiles, matchList

    # check the preconditions
    if mML < 1:
        raise OutOfRangeError('minimum Matching Length mML needs to be greater than 0')
    if not (0 <= treshold <= 1):
        raise OutOfRangeError('treshold t needs to be 0<=t<=1')
    if s1 == None or s2 == None:
        raise NoValidArgumentError('input must be of type string not None')
    for candidate in (s1, s2):
        if type(candidate) != type(''):
            raise NoValidArgumentError('input must be of type string')
    if s1 == '' or s2 == '':
        # TODO: reconsider how the identifiers are built
        return PlagResult(hash(s1), hash(s2))

    # start from empty module-level state, then compute the tiles
    # TODO: find another way to handle tiles / matchList
    tiles, matchList = [], []
    tiles = RKR_GST(s1, s2, mML)

    # similarity of the token lists with respect to the tiles
    simResult = calcSimilarity(s1.split(), s2.split(), tiles, treshold)
    if simResult[0] > 1:
        cappedSimilarity = 1
    else:
        cappedSimilarity = simResult[0]

    # create the PlagResult and set its attributes
    outcome = PlagResult()
    outcome.setIdentifier(hash(s1), hash(s2))
    outcome.setTiles(tiles)
    outcome.setSimilarity(cappedSimilarity)
    outcome.setSuspectedPlagiarism(simResult[1])
    return outcome
コード例 #6
0
def resultFromXML(xmlString):
    """Returns a PlagResult object from the given XML representation.

        The first PlagResult element of the document is used.  The
        SuspectedPlagiarism text counts as True when it reads "true",
        "1" or "yes" (case-insensitive, surrounding whitespace ignored)
        and as False otherwise.

        @param xmlString XML representation (whole file with a single PlagResult)
        @return A PlagResult object of the given XML representation.
        @raise IndexError if one of the expected elements is missing
    """
    from xml.dom import minidom
    dom_object = minidom.parseString(xmlString)

    #get the first PlagResult tag of the XML representation
    plagResultTag = dom_object.getElementsByTagName("PlagResult")[0]
    findTags = plagResultTag.getElementsByTagName

    #create a PlagResult object
    plagResult = PlagResult()

    #read attributes from XML and fill PlagResult
    identifierTag = findTags("Identifier")[0]
    plagResult.setIdentifier(identifierTag.getAttribute("Id1"),
                             identifierTag.getAttribute("Id2"))
    plagResult.setAlgorithmName(findTags("Algorithm")[0].firstChild.data)
    plagResult.setNormalizerName(findTags("Normalizer")[0].firstChild.data)
    plagResult.setSimilarity(float(findTags("Similarity")[0].firstChild.data))

    #the flag is stored as text; bool() would turn every non-empty text,
    #including "False", into True, so the text itself is inspected
    flagText = findTags("SuspectedPlagiarism")[0].firstChild.data
    isSuspected = flagText.strip().lower() in ("true", "1", "yes")
    plagResult.setSuspectedPlagiarism(isSuspected)

    #each Tile element yields one (posId1, posId2, length) tuple of ints
    tiles = []
    for tileTag in findTags("Tiles")[0].getElementsByTagName("Tile"):
        tiles.append((int(tileTag.getAttribute("posId1")),
                      int(tileTag.getAttribute("posId2")),
                      int(tileTag.getAttribute("length"))))
    plagResult.setTiles(tiles)

    lengthTag = findTags("IdStringLength")[0]
    plagResult.setIdStringLength(int(lengthTag.getAttribute("lengthId1")),
                                 int(lengthTag.getAttribute("lengthId2")))

    return plagResult
コード例 #7
0
def run(s1, s2, mML=3, treshold=0.5):
    """Tuned Version of GST; the tiles are computed by GSTPrechelt.

        Input:  s1 and s2 : normalized Strings
                mML : minimumMatchingLength (>= 1)
                treshold : a single value between 0 and 1
        Output: PlagResult with identifiers, tiles, similarity (capped at
                1) and the suspected-plagiarism flag; built from the two
                hashes only when s1 or s2 is empty

        Raises: OutOfRangeError      : mML < 1 or treshold outside 0..1
                NoValidArgumentError : s1 or s2 is None or not a string
    """
    # --- preconditions ---
    if mML < 1:
        raise OutOfRangeError('minimum Matching Length mML needs to be greater than 0')
    if not (0 <= treshold <= 1):
        raise OutOfRangeError('treshold t needs to be 0<=t<=1')
    if None in (s1, s2):
        raise NoValidArgumentError('input must be of type string not None')
    plainString = type('')
    if not (type(s1) == plainString and type(s2) == plainString):
        raise NoValidArgumentError('input must be of type string')
    if '' in (s1, s2):
        # TODO: reconsider how the identifiers are built
        return PlagResult(hash(s1), hash(s2))

    # --- tiles ---
    foundTiles = GSTPrechelt(s1, s2, mML)

    # --- similarity on the whitespace-separated tokens ---
    measured = calcSimilarity(s1.split(), s2.split(), foundTiles, treshold)
    score = min(measured[0], 1)  # values above 1 are cut down to 1

    # --- assemble the PlagResult ---
    report = PlagResult()
    report.setIdentifier(hash(s1), hash(s2))
    report.setTiles(foundTiles)
    report.setSimilarity(score)
    report.setSuspectedPlagiarism(measured[1])
    return report
コード例 #8
0
        @return A list containing only positive, i.e. suspected, PlagResult objects.
    """
    return [r for r in resultList if r.isSuspectPlagiarism()]


#===============================================================================
#    Test
#===============================================================================
if __name__ == '__main__':
    # Self-test that only runs when the module is executed as a script.
    # The visible part builds PlagResult fixtures with differing
    # similarity values and suspected-plagiarism flags.
    print "Start Tests - PlagResultList helper methods"
    # fixture 1: similarity 0.65, flagged as suspected plagiarism
    plagResult = PlagResult("Test1", "Test2")
    plagResult.setAlgorithmName("NGRAM")
    plagResult.setNormalizerName("NORMAL")
    plagResult.setSimilarity(0.65)
    plagResult.setSuspectedPlagiarism(True)
    plagResult.setIdStringLength(52, 45)
    plagResult.setTiles([(3, 5, 4), (12, 23, 5), (34, 2, 3)])
    # fixture 2: similarity 0.45, not flagged
    plagResult2 = PlagResult("Test3", "Test4")
    plagResult2.setAlgorithmName("NGRAM")
    plagResult2.setNormalizerName("NORMAL")
    plagResult2.setSimilarity(0.45)
    plagResult2.setSuspectedPlagiarism(False)
    plagResult2.setIdStringLength(152, 145)
    plagResult2.setTiles([(3, 5, 4), (12, 23, 5), (34, 2, 3)])
    # fixture 3: similarity 0.75
    plagResult3 = PlagResult("Test5", "Test6")
    plagResult3.setAlgorithmName("NGRAM")
    plagResult3.setNormalizerName("NORMAL")
    plagResult3.setSimilarity(0.75)
コード例 #9
0
        tile = (int(tileTag.getAttribute("posId1")), 
                int(tileTag.getAttribute("posId2")), 
                int(tileTag.getAttribute("length")))
        tiles.append(tile)
    plagResult.setTiles(tiles)
    plagResult.setIdStringLength(int(plagResultTag.getElementsByTagName("IdStringLength")[0].getAttribute("lengthId1")), 
                             int(plagResultTag.getElementsByTagName("IdStringLength")[0].getAttribute("lengthId2")))
    

    return plagResult

#===============================================================================
#    Test
#===============================================================================
if __name__ == '__main__':
    # Round-trip self-test: a PlagResult is serialised with resultToXML,
    # parsed back with resultFromXML and compared with the original.
    print("Start Test - Transfrom PlagResult to XML and Back")
    #create Test PlagResult
    plagResult = PlagResult("Test1", "Test2")
    plagResult.setAlgorithmName("NGRAM")
    plagResult.setNormalizerName("NORMAL")
    plagResult.setSimilarity(0.65)
    plagResult.setSuspectedPlagiarism(True)
    plagResult.setIdStringLength(52, 45)
    plagResult.setTiles([(3,5,4),(12,23,5),(34,2,3)])

    #test transfrom to xml and back should result in the same PlagResult object
    result = resultFromXML(resultToXML(plagResult))

    # compare the original with the re-parsed copy; comparing the original
    # with itself would pass no matter what the round trip did
    assert plagResult == result, "plagResult changed through transformation to xml and back"
    print("End Test - Transfrom PlagResult to XML and Back")
コード例 #10
0
        @param resultList list containing PlagResult objects
        @return A list containing only positive, i.e. suspected, PlagResult objects.
    """
    return [r for r in resultList if r.isSuspectPlagiarism()]

#===============================================================================
#    Test
#===============================================================================
if __name__ == '__main__':
    # Self-test that only runs when the module is executed as a script.
    # The visible part builds PlagResult fixtures with differing
    # similarity values and suspected-plagiarism flags.
    print "Start Tests - PlagResultList helper methods"
    # fixture 1: similarity 0.65, flagged as suspected plagiarism
    plagResult = PlagResult("Test1", "Test2")
    plagResult.setAlgorithmName("NGRAM")
    plagResult.setNormalizerName("NORMAL")
    plagResult.setSimilarity(0.65)
    plagResult.setSuspectedPlagiarism(True)
    plagResult.setIdStringLength(52, 45)
    plagResult.setTiles([(3,5,4),(12,23,5),(34,2,3)])
    # fixture 2: similarity 0.45, not flagged
    plagResult2 = PlagResult("Test3", "Test4")
    plagResult2.setAlgorithmName("NGRAM")
    plagResult2.setNormalizerName("NORMAL")
    plagResult2.setSimilarity(0.45)
    plagResult2.setSuspectedPlagiarism(False)
    plagResult2.setIdStringLength(152, 145)
    plagResult2.setTiles([(3,5,4),(12,23,5),(34,2,3)])
    # fixture 3: similarity 0.75
    plagResult3 = PlagResult("Test5", "Test6")
    plagResult3.setAlgorithmName("NGRAM")
    plagResult3.setNormalizerName("NORMAL")
    plagResult3.setSimilarity(0.75)