Ejemplo n.º 1
0
def createDotplotFromResult(result, showIds=False):
    """Creates a dotplot image from the tiles of a PlagResult.

        The plot is built from the tiles stored in the PlagResult, therefore
        no w/k parameters are needed. Each tile is read as a triple: entry 0
        and entry 1 are the start positions on the x and y axis, entry 2 is
        the number of pixels drawn along the diagonal from that start.

        @param result PlagResult of a similarity check
        @param showIds if true, the image is passed through addIds() together
                       with the identifiers and the id string lengths
        @return The image (mode "L", background 255, tile pixels 0).
        @raise NoValidArgumentError if result is None or not a PlagResult
    """
    #check preconditions; None is tested first, otherwise its dedicated
    #message could never be reached (None already fails the type check)
    if result is None:
        raise NoValidArgumentError('Input must be of type PlagResult not None.')
    if not isinstance(result, PlagResult):
        raise NoValidArgumentError('Input must be of type PlagResult.')

    #the image size is given by the lengths of the two id strings
    lengths = result.getIdStringLength()
    maxX = lengths[0]
    maxY = lengths[1]

    #create image (B/W), initially filled with 255
    img = Image.new("L", (maxX, maxY), 255)

    #every tile becomes a diagonal run of pixels with value 0
    for tile in result.getTiles():
        startX, startY, length = tile[0], tile[1], tile[2]
        for offset in range(length):
            img.putpixel((startX + offset, startY + offset), 0)

    if showIds:
        img = addIds(img, result.getIdentifier(), result.getIdStringLength())

    return img
Ejemplo n.º 2
0
def run(s1, s2, mML=3, treshold=0.5):
    """Compares two normalized strings and returns the outcome as PlagResult.

        The tiles are computed with RKR_GST; the similarity is computed by
        calcSimilarity from the whitespace-split tokens of both strings.

        Input:  s1 and s2 : normalized Strings
                mML : minimumMatchingLength (must be >= 1)
                treshold : a single value between 0 and 1
        Output: PlagResult carrying the identifiers (hashes of s1 and s2),
                the tiles, the similarity (capped at 1) and the suspected
                plagiarism flag. If s1 or s2 is the empty string, a
                PlagResult built from the two hashes only is returned.
        Raises: OutOfRangeError if mML < 1 or treshold is outside 0..1
                NoValidArgumentError if s1 or s2 is None or not a string
    """
    #check if the preconditions are fulfilled
    if mML < 1:
        raise OutOfRangeError('minimum Matching Length mML needs to be greater than 0')
    if not (0 <= treshold <= 1):
        raise OutOfRangeError('treshold t needs to be 0<=t<=1')
    if s1 is None or s2 is None:
        raise NoValidArgumentError('input must be of type string not None')
    if not isinstance(s1, str) or not isinstance(s2, str):
        raise NoValidArgumentError('input must be of type string')
    if s1 == '' or s2 == '':
        #TODO: reconsider how the identifiers are built
        return PlagResult(hash(s1), hash(s2))

    #compute tiles
    #NOTE(review): tiles and matchList are reset as module globals here,
    #presumably because RKR_GST works on them - confirm before changing
    global tiles, matchList
    tiles = []    #TODO: handle differently? tiles
    matchList = []    #TODO: handle differently? matchList
    tiles = RKR_GST(s1, s2, mML)

    #compute similarity; entry 0 is the value, entry 1 the suspicion flag
    simResult = calcSimilarity(s1.split(), s2.split(), tiles, treshold)
    similarity = simResult[0]
    if similarity > 1:
        similarity = 1

    #create PlagResult and set attributes
    result = PlagResult()
    result.setIdentifier(hash(s1), hash(s2))
    result.setTiles(tiles)
    result.setSimilarity(similarity)
    result.setSuspectedPlagiarism(simResult[1])

    #return result of similarity check as PlagResult object
    return result
def getPositiveResults(resultList):
    """Filters the result list down to the suspected PlagResult objects.

        @param resultList list containing PlagResult objects
        @return A new list holding, in their original order, only the entries
                for which isSuspectPlagiarism() is true.
    """
    positives = []
    for candidate in resultList:
        if candidate.isSuspectPlagiarism():
            positives.append(candidate)
    return positives


#===============================================================================
#    Test
#===============================================================================
if __name__ == '__main__':
    #print is written in call form so the line is valid on Python 2 and 3
    print("Start Tests - PlagResultList helper methods")
    #create Test PlagResult1 (flagged as suspected plagiarism)
    plagResult = PlagResult("Test1", "Test2")
    plagResult.setAlgorithmName("NGRAM")
    plagResult.setNormalizerName("NORMAL")
    plagResult.setSimilarity(0.65)
    plagResult.setSuspectedPlagiarism(True)
    plagResult.setIdStringLength(52, 45)
    plagResult.setTiles([(3, 5, 4), (12, 23, 5), (34, 2, 3)])
    #create Test PlagResult2 (not flagged as suspected plagiarism)
    plagResult2 = PlagResult("Test3", "Test4")
    plagResult2.setAlgorithmName("NGRAM")
    plagResult2.setNormalizerName("NORMAL")
    plagResult2.setSimilarity(0.45)
    plagResult2.setSuspectedPlagiarism(False)
    plagResult2.setIdStringLength(152, 145)
    plagResult2.setTiles([(3, 5, 4), (12, 23, 5), (34, 2, 3)])
    #create Test PlagResult3
Ejemplo n.º 4
0
def run(s1, s2, mML=3, treshold=0.5):
    """Runs a comparison on the two given strings s1 and s2 and returns a
        PlagResult object containing the similarity value, the similarities
        as list of tiles and a boolean value indicating suspected plagiarism.

        Tiles come from RKR_GST, the similarity from calcSimilarity applied
        to the whitespace-split tokens of both strings.

        Input:  s1 and s2 : normalized Strings
                mML : minimumMatchingLength (must be >= 1)
                treshold : a single value between 0 and 1
        Output: PlagResult; for an empty s1 or s2 it is constructed from the
                two string hashes only, without tiles or similarity being set
        Raises: OutOfRangeError if mML < 1 or treshold is outside 0..1
                NoValidArgumentError if s1 or s2 is None or not a string
    """
    #check if the preconditions are fulfilled
    if mML < 1:
        raise OutOfRangeError('minimum Matching Length mML needs to be greater than 0')
    if not (0 <= treshold <= 1):
        raise OutOfRangeError('treshold t needs to be 0<=t<=1')
    if s1 is None or s2 is None:
        raise NoValidArgumentError('input must be of type string not None')
    if not (isinstance(s1, str) and isinstance(s2, str)):
        raise NoValidArgumentError('input must be of type string')
    if s1 == '' or s2 == '':
        #TODO: reconsider how the identifiers are built
        return PlagResult(hash(s1), hash(s2))

    #compute tiles
    #NOTE(review): both names are rebound at module level on every call,
    #presumably because RKR_GST relies on them - verify against RKR_GST
    global tiles, matchList
    tiles = []  #TODO: handle differently? tiles
    matchList = []  #TODO: handle differently? matchList
    tiles = RKR_GST(s1, s2, mML)

    #compute similarity (values above 1 are capped at 1)
    simResult = calcSimilarity(s1.split(), s2.split(), tiles, treshold)
    similarity = simResult[0]
    if similarity > 1:
        similarity = 1

    #create PlagResult and set attributes
    result = PlagResult()
    result.setIdentifier(hash(s1), hash(s2))
    result.setTiles(tiles)
    result.setSimilarity(similarity)
    result.setSuspectedPlagiarism(simResult[1])

    #return result of similarity check as PlagResult object
    return result
Ejemplo n.º 5
0
def run(s1, s2, mML=3, treshold=[0.03, 0.31]):
    """Runs an n-gram based comparison on the two given strings s1 and s2 and
        returns a PlagResult object containing the similarity value, the
        similarities as list of tiles and a boolean value indicating
        suspected plagiarism.

        Input:  s1 and s2 : normalized Strings

                Options:
                ========
                    mML      : minimumMatchingLength (must be >= 1); it is
                               also handed to createNGrams and calcTiles
                    treshold : could be a single number(Resemblance)
                                or a list of two numbers(Resemblance
                                and Containment), each within 0..1
        Output: PlagResult. If s1 or s2 is empty, or createNGrams yields no
                n-grams for one of them, a PlagResult constructed from the
                two string hashes only is returned.
        Raises: OutOfRangeError if mML < 1 or a treshold value is outside 0..1
                AssertionError if treshold is a list not holding two values
                NoValidArgumentError if s1 or s2 is None or not a string
    """
    #NOTE: the list default is shared between calls; this function only reads
    #it and passes it on to calcSimilarity, so it must not be mutated there

    #check if the preconditions are fulfilled
    if mML < 1:
        raise OutOfRangeError('minimum Matching Length mML needs to be greater than 0')
    if isinstance(treshold, list):
        if len(treshold) != 2:
            raise AssertionError('Treshold must be a  single Value (Resemblance) or a list of two Values [Resemblance, Containment]')
        elif not (0 <= treshold[0] <= 1) or not (0 <= treshold[1] <= 1):
            raise OutOfRangeError('tresholds values need to be 0<=t<=1')
    elif not (0 <= treshold <= 1):
        raise OutOfRangeError('treshold t needs to be 0<=t<=1')
    if s1 is None or s2 is None:
        raise NoValidArgumentError('input must be of type string not None')
    if not isinstance(s1, str) or not isinstance(s2, str):
        raise NoValidArgumentError('input must be of type string')
    if s1 == '' or s2 == '':
        #TODO: reconsider how the identifiers are built
        return PlagResult(hash(s1), hash(s2))

    #create NGrams for strings
    ngramDict1 = createNGrams(s1, mML)
    ngramDict2 = createNGrams(s2, mML)

    #no nGrams found -> return empty initialized PlagResult
    if not ngramDict1 or not ngramDict2:
        return PlagResult(hash(s1), hash(s2))

    #compute similarity on the sets of n-gram keys; entry 0 is the similarity,
    #entry 1 is handed to calcTiles, entry 2 is the suspicion flag
    simResult = calcSimilarity(set(ngramDict1), set(ngramDict2), treshold)
    similarity = simResult[0]

    #compute tiles
    tiles = calcTiles(ngramDict1, ngramDict2, simResult[1], mML)

    #create PlagResult and set attributes
    result = PlagResult()
    result.setIdentifier(hash(s1), hash(s2))
    result.setTiles(tiles)
    result.setSimilarity(similarity)
    result.setSuspectedPlagiarism(simResult[2])
    #the id string lengths are the numbers of whitespace-separated tokens
    result.setIdStringLength(len(s1.split()), len(s2.split()))

    #return result of similarity check as PlagResult object
    return result
Ejemplo n.º 6
0
def resultFromXML(xmlString):
    """Returns a PlagResult object from the given XML representation.

        Only the first PlagResult tag of the document is evaluated. Within
        it the tags Identifier (attributes Id1, Id2), Algorithm, Normalizer,
        Similarity, SuspectedPlagiarism, Tiles/Tile (attributes posId1,
        posId2, length) and IdStringLength (attributes lengthId1, lengthId2)
        are read.

        @param xmlString XML representation (whole file with a single PlagResult)
        @return A PlagResult object of the given XML representation.
        @raise IndexError if the PlagResult tag or one of the tags read via
                          index 0 is missing
    """
    #NOTE(review): minidom is documented as not secure against maliciously
    #constructed data - confirm that xmlString always comes from a trusted source
    from xml.dom import minidom
    dom_object = minidom.parseString(xmlString)

    #get the first PlagResult tag of the XML representation
    plagResultTag = dom_object.getElementsByTagName("PlagResult")[0]

    def firstTag(name):
        #first tag with the given name below the PlagResult tag
        return plagResultTag.getElementsByTagName(name)[0]

    def textOf(name):
        #text content of the first tag with the given name
        return firstTag(name).firstChild.data

    #create a PlagResult object
    plagResult = PlagResult()

    #read attributes from XML and fill PlagResult
    identifierTag = firstTag("Identifier")
    plagResult.setIdentifier(identifierTag.getAttribute("Id1"),
                             identifierTag.getAttribute("Id2"))
    plagResult.setAlgorithmName(textOf("Algorithm"))
    plagResult.setNormalizerName(textOf("Normalizer"))
    plagResult.setSimilarity(float(textOf("Similarity")))

    #bool() of any non-empty string is True, so "False" would be read back as
    #True; the text itself has to be interpreted instead
    #NOTE(review): assumes the writer emits a textual boolean such as
    #"True"/"False" - confirm against resultToXML
    suspectedText = textOf("SuspectedPlagiarism").strip().lower()
    plagResult.setSuspectedPlagiarism(suspectedText in ("true", "1", "yes"))

    tiles = []
    for tileTag in firstTag("Tiles").getElementsByTagName("Tile"):
        tiles.append((int(tileTag.getAttribute("posId1")),
                      int(tileTag.getAttribute("posId2")),
                      int(tileTag.getAttribute("length"))))
    plagResult.setTiles(tiles)

    lengthTag = firstTag("IdStringLength")
    plagResult.setIdStringLength(int(lengthTag.getAttribute("lengthId1")),
                                 int(lengthTag.getAttribute("lengthId2")))

    return plagResult
Ejemplo n.º 7
0
        tile = (int(tileTag.getAttribute("posId1")), 
                int(tileTag.getAttribute("posId2")), 
                int(tileTag.getAttribute("length")))
        tiles.append(tile)
    plagResult.setTiles(tiles)
    plagResult.setIdStringLength(int(plagResultTag.getElementsByTagName("IdStringLength")[0].getAttribute("lengthId1")), 
                             int(plagResultTag.getElementsByTagName("IdStringLength")[0].getAttribute("lengthId2")))
    

    return plagResult

#===============================================================================
#    Test
#===============================================================================
if __name__ == '__main__':
    #print is written in call form so the lines are valid on Python 2 and 3
    print("Start Test - Transfrom PlagResult to XML and Back")
    #create Test PlagResult
    plagResult = PlagResult("Test1", "Test2")
    plagResult.setAlgorithmName("NGRAM")
    plagResult.setNormalizerName("NORMAL")
    plagResult.setSimilarity(0.65)
    plagResult.setSuspectedPlagiarism(True)
    plagResult.setIdStringLength(52, 45)
    plagResult.setTiles([(3,5,4),(12,23,5),(34,2,3)])

    #test transform to xml and back should result in the same PlagResult object
    result = resultFromXML(resultToXML(plagResult))

    #compare the original with the round-tripped copy; comparing plagResult
    #with itself would pass no matter what the transformation does
    assert plagResult.__eq__(result), "plagResult changed through transformation to xml and back"
    print("End Test - Transfrom PlagResult to XML and Back")
def getPositiveResults(resultList):
    """Selects the positive PlagResult objects from the result list.

        @param resultList list containing PlagResult objects
        @return A list with only the positive, i.e. suspected, PlagResult
                objects (those whose isSuspectPlagiarism() is true), in the
                order in which they appear in resultList.
    """
    def isPositive(entry):
        return entry.isSuspectPlagiarism()

    #list() keeps the return type a list where filter() yields an iterator
    return list(filter(isPositive, resultList))

#===============================================================================
#    Test
#===============================================================================
if __name__ == '__main__':
    #print is written in call form so the line is valid on Python 2 and 3
    print("Start Tests - PlagResultList helper methods")
    #create Test PlagResult1 (flagged as suspected plagiarism)
    plagResult = PlagResult("Test1", "Test2")
    plagResult.setAlgorithmName("NGRAM")
    plagResult.setNormalizerName("NORMAL")
    plagResult.setSimilarity(0.65)
    plagResult.setSuspectedPlagiarism(True)
    plagResult.setIdStringLength(52, 45)
    plagResult.setTiles([(3,5,4),(12,23,5),(34,2,3)])
    #create Test PlagResult2 (not flagged as suspected plagiarism)
    plagResult2 = PlagResult("Test3", "Test4")
    plagResult2.setAlgorithmName("NGRAM")
    plagResult2.setNormalizerName("NORMAL")
    plagResult2.setSimilarity(0.45)
    plagResult2.setSuspectedPlagiarism(False)
    plagResult2.setIdStringLength(152, 145)
    plagResult2.setTiles([(3,5,4),(12,23,5),(34,2,3)])
    #create Test PlagResult3
Ejemplo n.º 9
0
def resultsToTorc(resultList, colored=False):
    """Takes the results and returns an image showing a Torc indicating the
        similarity relations of the compared texts in the results.

        A Torc is a kind of overview which allows the user to recognize the similarity
        relations between different texts. Therefore all texts are arranged on a circle.
        For each relation of similarity between two texts a connecting line is drawn.

        @param resultList list containing PlagResult objects
        @param colored if true, each line gets the color that getColorForScope
                       returns for the cluster number of the two ids;
                       otherwise all lines are black
        @return The RGB image, or None if resultList is empty.
        @raise NoValidArgumentError if resultList is not a list or contains
                                    an entry that is not a PlagResult
    """
    #check preconditions
    if not isinstance(resultList, list):
        raise NoValidArgumentError('Input must be of type list')
    if len(resultList) == 0:
        return None
    for result in resultList:
        if not isinstance(result, PlagResult):
            raise NoValidArgumentError('Input list should only contain values of type PlagResult.')

    #1. get all identifiers of the results (unique and sorted)
    idList = sorted(set(ident
                        for result in resultList
                        for ident in result.getIdentifier()))

    #2. create a circle with a size depending on the number of identifier
    font = ImageFont.load_default()
    freespace = computeMaxIdLength(idList, font)
    margin = 10
    radius = computeRadius(len(idList))  # computes radius depending on number of ids
    xM = freespace + radius + margin  #middle x pos of circle
    yM = freespace + radius + margin  #middle y pos of circle
    img = Image.new('RGB', (2 * xM, 2 * yM), (255, 255, 255))
    draw = ImageDraw.Draw(img)
    circleStart = freespace + margin
    circleEnd = circleStart + (2 * radius)
    draw.arc((circleStart, circleStart, circleEnd, circleEnd),
             0,
             360,
             fill=(150, 150, 150))

    #3. arrange the ids along the circle and save the coordinates for each id
    #   The angle is computed per id as a float fraction of the full circle.
    #   An integer step of 360 / count is 0 for more than 360 ids (range()
    #   then raises ValueError) and spaces the ids unevenly whenever the
    #   count does not divide 360.
    idCount = len(idList)
    idPosDict = {}
    for idNr, ident in enumerate(idList):
        # x = xM + r * cos phi and y = yM + r * sin phi
        phi = radians(360.0 * idNr / idCount)
        idPosDict[ident] = (xM + (radius * cos(phi)),
                            yM + (radius * sin(phi)))

    # draw the id names with the loaded font
    for ident in idList:
        label = str(ident)
        draw.text(computeFontPos(font, draw, label, idPosDict.get(ident), xM,
                                 yM),
                  label,
                  font=font,
                  fill=(0, 0, 0))

    #4. walk through the results and plot the similarity relations as lines between the Ids
    clusters = None
    if colored:
        #TODO: pass the params in from outside?
        clusters = getClusters(resultList,
                               onlyPositives=False,
                               onlyNonZeroSimilarities=False)

    for result in resultList:
        #only suspected results are connected by a line
        if not result.isSuspectPlagiarism():
            continue
        ids = result.getIdentifier()
        if colored:
            #list() keeps the scope a real list on Python 3 as well
            color = getColorForScope(
                getClusterNr(ids[0], ids[1], clusters),
                list(range(len(clusters))))
        else:
            color = (0, 0, 0)
        draw.line([idPosDict.get(ids[0]),
                   idPosDict.get(ids[1])],
                  fill=color)

    del draw  #free draw instance

    #5. return the image
    return img
def run(s1, s2, mML=3, treshold=0.5):
    """Tuned Version of GST

        The tiles are computed with GSTPrechelt; the similarity is computed
        by calcSimilarity from the whitespace-split tokens of both strings.

        Input:  s1 and s2 : normalized Strings
                mML : minimumMatchingLength (must be >= 1)
                treshold : a single value between 0 and 1
        Output: PlagResult carrying the identifiers (hashes of s1 and s2),
                the tiles, the similarity (capped at 1) and the suspected
                plagiarism flag. If s1 or s2 is the empty string, a
                PlagResult built from the two hashes only is returned.
        Raises: OutOfRangeError if mML < 1 or treshold is outside 0..1
                NoValidArgumentError if s1 or s2 is None or not a string
    """
    #check if the preconditions are fulfilled
    if mML < 1:
        raise OutOfRangeError('minimum Matching Length mML needs to be greater than 0')
    if not (0 <= treshold <= 1):
        raise OutOfRangeError('treshold t needs to be 0<=t<=1')
    if s1 is None or s2 is None:
        raise NoValidArgumentError('input must be of type string not None')
    if not isinstance(s1, str) or not isinstance(s2, str):
        raise NoValidArgumentError('input must be of type string')
    if s1 == '' or s2 == '':
        #TODO: reconsider how the identifiers are built
        return PlagResult(hash(s1), hash(s2))

    #compute tiles
    tiles = GSTPrechelt(s1, s2, mML)

    #compute similarity; entry 0 is the value, entry 1 the suspicion flag
    simResult = calcSimilarity(s1.split(), s2.split(), tiles, treshold)
    similarity = simResult[0]
    if similarity > 1:
        similarity = 1

    #create PlagResult and set attributes
    result = PlagResult()
    result.setIdentifier(hash(s1), hash(s2))
    result.setTiles(tiles)
    result.setSimilarity(similarity)
    result.setSuspectedPlagiarism(simResult[1])

    #return result of similarity check as PlagResult object
    return result
Ejemplo n.º 11
0
def run(s1, s2, mML=3, treshold=[0.03, 0.31]):
    """This method runs a comparison on the two given strings s1 and s2 returning
        a PlagResult object containing the similarity value, the similarities as
        list of tiles and a boolean value indicating suspected plagiarism.

        The comparison works on the n-grams that createNGrams builds for
        each string.

        Input:  s1 and s2 : normalized Strings

                Options:
                ========
                    mML      : minimumMatchingLength (must be >= 1); it is
                               also handed to createNGrams and calcTiles
                    treshold : could be a single number(Resemblance)
                                or a list of two numbers(Resemblance
                                and Containment), each within 0..1
        Output: PlagResult. For an empty s1 or s2, or when createNGrams
                yields no n-grams for one of them, it is constructed from
                the two string hashes only.
        Raises: OutOfRangeError if mML < 1 or a treshold value is outside 0..1
                AssertionError if treshold is a list not holding two values
                NoValidArgumentError if s1 or s2 is None or not a string
    """
    #NOTE: the list default is one object shared by all calls; it is only
    #read here and forwarded to calcSimilarity, which must not modify it

    #check if the preconditions are fulfilled
    if mML < 1:
        raise OutOfRangeError('minimum Matching Length mML needs to be greater than 0')
    if isinstance(treshold, list):
        if len(treshold) != 2:
            raise AssertionError('Treshold must be a  single Value (Resemblance) or a list of two Values [Resemblance, Containment]')
        elif not (0 <= treshold[0] <= 1) or not (0 <= treshold[1] <= 1):
            raise OutOfRangeError('tresholds values need to be 0<=t<=1')
    elif not (0 <= treshold <= 1):
        raise OutOfRangeError('treshold t needs to be 0<=t<=1')
    if s1 is None or s2 is None:
        raise NoValidArgumentError('input must be of type string not None')
    if not (isinstance(s1, str) and isinstance(s2, str)):
        raise NoValidArgumentError('input must be of type string')
    if s1 == '' or s2 == '':
        #TODO: reconsider how the identifiers are built
        return PlagResult(hash(s1), hash(s2))

    #create NGrams for strings
    ngramDict1 = createNGrams(s1, mML)
    ngramDict2 = createNGrams(s2, mML)

    #no nGrams found -> return empty initialized PlagResult
    if not ngramDict1 or not ngramDict2:
        return PlagResult(hash(s1), hash(s2))

    #compute similarity on the sets of n-gram keys; entry 0 is the similarity,
    #entry 1 is handed to calcTiles, entry 2 is the suspicion flag
    simResult = calcSimilarity(set(ngramDict1), set(ngramDict2), treshold)
    similarity = simResult[0]

    #compute tiles
    tiles = calcTiles(ngramDict1, ngramDict2, simResult[1], mML)

    #create PlagResult and set attributes
    result = PlagResult()
    result.setIdentifier(hash(s1), hash(s2))
    result.setTiles(tiles)
    result.setSimilarity(similarity)
    result.setSuspectedPlagiarism(simResult[2])
    #the id string lengths are the numbers of whitespace-separated tokens
    result.setIdStringLength(len(s1.split()), len(s2.split()))

    #return result of similarity check as PlagResult object
    return result