def tagPlagiarisms(idx):
    # builds a list of plagiarism detections from the per-part predictions of a document
    filename = "C:\DiplomaProject\PredictionResults\Results"
    fPath = filename + str(idx) + ".txt"
    partname = "C:\DiplomaProject\partIds\partIds" + str(idx) + ".txt"
    resultVals = Fileparser.get_indices_from_file(fPath)
    plagiarismList = []
    path = "C:\DiplomaProject\AlmarimiDocuments"
    documentList = Fileparser.extract_plagiarisms_from_files(path)
    text = Fileparser.extract_text_from_document(path, documentList[idx])
    start, end, sentences = Fileparser.split_into_sentences(text)
    if (len(resultVals) == 0):
        return plagiarismList
    startIds, endIds = Fileparser.get_partIds_from_file(partname)

    parts = []
    for i in range(len(resultVals)):
        if (float(resultVals[i]) > 0.7):
            part = partStats(int(startIds[i]), int(endIds[i]),
                             int(int(endIds[i]) - int(startIds[i]) + 1),
                             float(resultVals[i]))
            parts.append(part)

    longestParts = sorted(parts, key=lambda x: x.partLength, reverse=True)
    sameStarts = set()
    newParts = []
    for p in longestParts:
        if (p.startId not in sameStarts):
            sameStarts.add(p.startId)
            newParts.append(p)
    longestParts = sorted(newParts, key=lambda x: x.partLength, reverse=True)
    sameEnds = set()
    newParts = []
    for p in longestParts:
        if (p.endId not in sameEnds):
            sameEnds.add(p.endId)
            newParts.append(p)
    indices = []
    for p in newParts:
        pair = [p.startId, p.endId]
        indices.append(pair)
    if (len(indices) == 0):
        return plagiarismList
    indices.sort(key=lambda interval: interval[0])
    merged = [indices[0]]
    for current in indices[1:]:  # the first interval already seeds merged
        previous = merged[-1]
        if current[0] <= previous[1]:
            previous[1] = max(previous[1], current[1])
        else:
            merged.append(current)

    for p in merged:

        plag = plagiarism(start[p[0]], end[p[1]] - start[p[0]] + 1)
        plag.toString()
        plagiarismList.append(plag)

    return plagiarismList
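# A minimal, self-contained sketch with toy data (not from the project files)
# showing how the interval-merge loop in tagPlagiarisms collapses overlapping
# [startId, endId] pairs into disjoint spans.
def _mergeIntervalsDemo():
    intervals = [[0, 4], [2, 6], [8, 9]]
    intervals.sort(key=lambda interval: interval[0])
    merged = [intervals[0]]
    for current in intervals[1:]:
        previous = merged[-1]
        if current[0] <= previous[1]:
            previous[1] = max(previous[1], current[1])
        else:
            merged.append(current)
    return merged  # [[0, 6], [8, 9]]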
def relationalFrequenciesToFile(idx):
    # calculates the relational frequencies of sentences and writes them to a file
    corpusPath = "C:\DiplomaProject\OriginalCorpus\Corpus" + str(idx) + ".txt"
    corpus = Fileparser.get_corpus_from_file(corpusPath)
    sents = [corpus[0], corpus[1], corpus[2]]
    sentenceFives = [merge_sentences(sents)]
    sents.append(corpus[3])
    sentenceFives.append(merge_sentences(sents))
    for i in range(2, len(corpus) - 2):
        sents = [
            corpus[i - 2], corpus[i - 1], corpus[i], corpus[i + 1],
            corpus[i + 2]
        ]
        merged = merge_sentences(sents)
        sentenceFives.append(merged)
    finalIdx = len(corpus) - 1
    sents = [
        corpus[finalIdx - 3], corpus[finalIdx - 2], corpus[finalIdx - 1],
        corpus[finalIdx]
    ]
    sentenceFives.append(merge_sentences(sents))
    sents.pop(0)
    sentenceFives.append(merge_sentences(sents))
    meanFreqs, lowestFreqs, upperFreqs = calculateRelationalFrequencyOfSentence(
        corpus)
    goal_path = "C:\DiplomaProject\RelationalFrequncies\Frequncies" + str(
        idx) + ".txt"
    with open(goal_path, 'w', encoding="utf-8", errors='ignore') as f:
        for j in range(len(meanFreqs)):
            f.write(str(meanFreqs[j]) + ",")
            f.write(str(lowestFreqs[j]) + ",")
            f.write(str(upperFreqs[j]))
            f.write("|")
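# A companion sketch (not part of the original source) that reads the
# "mean,lowest,upper|..." records written above back into three parallel lists;
# the path mirrors the one used by relationalFrequenciesToFile.
def readRelationalFrequencies(idx):
    path = "C:\DiplomaProject\RelationalFrequncies\Frequncies" + str(idx) + ".txt"
    with open(path, 'r', encoding="utf-8", errors='ignore') as f:
        records = [r for r in f.read().split("|") if r]
    means, lowests, uppers = [], [], []
    for record in records:
        m, l, u = record.split(",")
        means.append(float(m))
        lowests.append(float(l))
        uppers.append(float(u))
    return means, lowests, uppers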
def runOutlierDetector():
    # evaluates our detections against the ground-truth plagiarisms and writes the results to a file
    precisions = []
    recalls = []
    accuracies = []
    fullOverlapList = []
    partialOverlapList = []
    numberOfDetectedPlags = []
    path = "C:\DiplomaProject\AlmarimiDocuments"
    documentList = Fileparser.extract_plagiarisms_from_files(path)
    for i in range(0, 40):
        detectedList = tagPlagiarisms(i)
        numberOfDetectedPlags.append(len(detectedList))
        plagiarismList = documentList[i].plagiarismList
        text = Fileparser.extract_text_from_document(path, documentList[i])
        precision, recall, accuracy, fullDetections, partialDetections = Fileparser.confusionMatrix(
            text, plagiarismList, detectedList)
        precisions.append(precision)
        recalls.append(recall)
        accuracies.append(accuracy)
        fullOverlapList.append(fullDetections)
        partialOverlapList.append(partialDetections)
    print(len(precisions))
    print(len(accuracies))
    print(len(recalls))

    filepath = "C:\DiplomaProject\OutputFile.txt"
    with open(filepath, 'w', encoding="utf-8", errors='ignore') as f:
        for j in range(0, 40):
            f.write("id: " + str(j) + '\n')
            f.write("precision: " + str(precisions[j]) + "\n")
            f.write("recall: " + str(recalls[j]) + "\n")
            f.write("accuracy: " + str(accuracies[j]) + "\n")
            f.write("number of detections: " + str(numberOfDetectedPlags[j]) +
                    "\n")
            f.write("FullDetections: " + str(fullOverlapList[j]) + "\n")
            f.write("PartialDetections: " + str(partialOverlapList[j]) + "\n")

            f.write("----------------\n")
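# A follow-up sketch (not part of the original source): condense the
# per-document scores gathered by runOutlierDetector into corpus-level means.
def summarizeScores(precisions, recalls, accuracies):
    n = len(precisions)
    print("mean precision:", sum(precisions) / n)
    print("mean recall:", sum(recalls) / n)
    print("mean accuracy:", sum(accuracies) / n)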
def getPlagIndices(idx):
    # maps each plagiarism's character offsets to sentence indices; also returns the sentence count
    path = "C:\DiplomaProject\AlmarimiDocuments"
    documentList = Fileparser.extract_plagiarisms_from_files(path)
    plagiarismList = documentList[idx].plagiarismList
    text = Fileparser.extract_text_from_document(path, documentList[idx])
    start, end, sents = Fileparser.split_into_sentences(text)
    plagIndices = []
    if (len(plagiarismList) == 0):
        return plagIndices, len(end)

    for p in plagiarismList:
        plagIndices.append(p.offset)
        plagIndices.append(p.offset + p.length)
    pId = 0
    indices = []
    for i in range(len(plagIndices)):
        while (end[pId] < plagIndices[i] - 1):
            pId += 1
        indices.append(pId)
    return indices, len(end)
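# Toy illustration (made-up numbers, not project data): with sentence end
# offsets end = [10, 25, 40] and a plagiarism covering characters 12-38,
# the scan in getPlagIndices yields sentence index 1 for the start boundary
# and 2 for the end boundary.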
def getWordFrequencies(text):
    # builds a dictionary mapping each non-stopword in the text to its frequency
    text = " ".join(text.split())
    text = Fileparser.remove_stopwords(text)
    words = text.split()
    freqDict = {}
    for w in words:
        if w in freqDict:
            freqDict[w] += 1
        else:
            freqDict[w] = 1
    return freqDict
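# For comparison, the same word -> count mapping built with the standard
# library's Counter; an equivalent formulation, not part of the original source.
from collections import Counter


def getWordFrequenciesCounter(text):
    cleaned = Fileparser.remove_stopwords(" ".join(text.split()))
    return Counter(cleaned.split())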
def writeOffsetLoss():
    # writes the loss of the classifier into a file
    path = "C:\DiplomaProject\AlmarimiDocuments"
    documentList = Fileparser.extract_plagiarisms_from_files(path)
    lossPath = "C:\DiplomaProject\offsetLoss\Loss"
    for i in range(len(documentList)):
        losses = offsetLossScores(i)
        lPath = lossPath + str(i) + ".txt"
        with open(lPath, 'w', encoding="utf-8", errors='ignore') as f:
            for j in range(len(losses)):
                f.write(str(losses[j]))
                f.write("|")
def compareAnomalousTexts(metricList, indices, idx):
    # assigns each part a combined similarity score that separates anomalous parts from non-anomalous ones
    f_path = "C:\DiplomaProject\CleanFingerprints\Fingerprints" + str(
        idx) + ".txt"
    fingerprints = Fileparser.get_clean_fingerprints_from_file(f_path)
    euclidean, cosine, jaccard = RetinaOperations.calculateOverlapScoresOfParts(
        fingerprints, indices)
    euclidean = normalize_data(euclidean)
    cosine = normalize_data(cosine)
    jaccard = normalize_data(jaccard)
    for i in range(len(metricList)):
        metricList[i].score = euclidean[i] - cosine[i] - jaccard[i]

        metricList[i].toString()
    return metricList
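# normalize_data is called above but not defined in this excerpt; the sketch
# below is one plausible min-max implementation (an assumption, not the
# confirmed original).
def normalize_data(values):
    # scales the values into [0, 1]; a constant list maps to all zeros
    lo, hi = min(values), max(values)
    if hi == lo:
        return [0.0 for _ in values]
    return [(v - lo) / (hi - lo) for v in values]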
def writeSimilarityMeasures(idx, windowsize):
    # writes the similarity metrics of the classifier to a file
    f_path = "C:\DiplomaProject\CleanFingerprints\Fingerprints" + str(
        idx) + ".txt"
    fingerprints = Fileparser.get_clean_fingerprints_from_file(f_path)
    overlapFingerprints = calculateWindowOverlapFingerprints(
        fingerprints, windowsize)
    goal_path = "C:\DiplomaProject\SimilarityMetrics\SimilarityMetrics" + str(
        idx) + ".txt"
    with open(goal_path, 'w', encoding="utf-8", errors='ignore') as f:
        for j in range(len(overlapFingerprints) // 3):
            f.write(str(overlapFingerprints[j * 3]) + ",")
            f.write(str(overlapFingerprints[j * 3 + 1]) + ",")
            f.write(str(overlapFingerprints[j * 3 + 2]))
            f.write("|")
def getPartCorpusMetrics(idx, startIds, endIds):
    # collects and creates the input dataset for the regressor
    path = "C:\DiplomaProject\AlmarimiDocuments"
    documentList = Fileparser.extract_plagiarisms_from_files(path)
    text = Fileparser.extract_text_from_document(path, documentList[idx])
    start, end, sentences = Fileparser.split_into_sentences(text)
    originalCorpusPath = "C:\DiplomaProject\OriginalCorpus\Corpus" + str(
        idx) + ".txt"
    almarimiCorpusPath = "C:\DiplomaProject\AlmarimiCorpus\Corpus" + str(
        idx) + ".txt"
    cosPath = "C:\DiplomaProject\CosineSimilarities\Cosine" + str(idx) + ".txt"

    f_path = "C:\DiplomaProject\CleanFingerprints\Fingerprints" + str(
        idx) + ".txt"
    index_path = "C:\DiplomaProject\AlmarimiIndices\Indices" + str(
        idx) + ".txt"
    SyntaxIds_path = "C:\DiplomaProject\SyntaxIds\Ids" + str(idx) + ".txt"
    SemanticIds_path = "C:\DiplomaProject\SemanticIds\Ids" + str(idx) + ".txt"
    SyntaxIds = Fileparser.get_indices_from_file(SyntaxIds_path)
    SemanticIds = Fileparser.get_indices_from_file(SemanticIds_path)
    originalCorpus = Fileparser.get_corpus_from_file(originalCorpusPath)
    cosineDoc2Vec = list(map(float, Fileparser.get_indices_from_file(cosPath)))
    fingerprints = Fileparser.get_clean_fingerprints_from_file(f_path)
    indices = list(map(float, Fileparser.get_indices_from_file(index_path)))
    euclidean, cosine, jaccard = RetinaOperations.readSimilarities(idx)
    euclidean = list(map(float, euclidean))
    cosine = list(map(float, cosine))
    jaccard = list(map(float, jaccard))
    meanFreq, lowestFreq, upperFreq = SyntaxParser.calculateRelationalFrequencyOfSentence(
        originalCorpus)
    meanFreq = list(map(float, meanFreq))
    lowestFreq = list(map(float, lowestFreq))
    upperFreq = list(map(float, upperFreq))
    authorStyleVals = SyntaxParser.authorStyleOfSentences(originalCorpus)
    cosineScores = []
    mean = []
    lowest = []
    upper = []
    authorStyle = []
    for i in range(len(SyntaxIds)):
        cosineScores.append(cosineDoc2Vec[int(SyntaxIds[i]) - 1])
        mean.append(meanFreq[int(SyntaxIds[i]) - 1])
        lowest.append(lowestFreq[int(SyntaxIds[i]) - 1])
        upper.append(upperFreq[int(SyntaxIds[i]) - 1])
        authorStyle.append(authorStyleVals[int(SyntaxIds[i]) - 1])

    euclid = []
    cos = []
    jacc = []

    for i in range(len(SemanticIds)):
        euclid.append(euclidean[int(SemanticIds[i]) - 1])
        cos.append(cosine[int(SemanticIds[i]) - 1])
        jacc.append(jaccard[int(SemanticIds[i]) - 1])

    partLengths = []

    startIndices = []
    endIndices = []
    partMetrics = []
    if (len(startIds) == 0):
        return partMetrics
    if (startIds[0] == 0):

        partLengths.append(endIds[0] + 1)
        startIndices.append(0)
        endIndices.append(end[endIds[0]])
    else:
        partLengths.append(endIds[0] - startIds[0] + 1)
        startIndices.append(end[startIds[0] - 1] + 1)
        endIndices.append(end[endIds[0]])

    for i in range(1, len(startIds)):
        partLengths.append(endIds[i] - startIds[i] + 1)
        startIndices.append(end[startIds[i] - 1] + 1)
        endIndices.append(end[endIds[i]])
    print(len(partLengths))

    cosineDoc2Vec = []
    meanFreq = []
    lowestFreq = []
    upperFreq = []
    meanStyle = []
    euclidean = []
    cosine = []
    jaccard = []
    for i in range(len(partLengths)):
        cosineDoc2Vec.append(
            getMeanOfValues(startIds[i], partLengths[i], cosineScores))
        meanFreq.append(getMeanOfValues(startIds[i], partLengths[i], mean))
        lowestFreq.append(getMeanOfValues(startIds[i], partLengths[i], lowest))
        upperFreq.append(getMeanOfValues(startIds[i], partLengths[i], upper))
        euclidean.append(getMeanOfValues(startIds[i], partLengths[i], euclid))
        cosine.append(getMeanOfValues(startIds[i], partLengths[i], cos))
        jaccard.append(getMeanOfValues(startIds[i], partLengths[i], jacc))
        meanStyle.append(
            getMeanOfValues(startIds[i], partLengths[i], authorStyle))

    partEuclids, partCosines, partJaccards = RetinaOperations.calculateOverlapScoresOfParts(
        fingerprints, endIds)

    meanPartFreqs = []
    lowestPartFreqs = []
    upperPartFreqs = []
    for i in range(len(partLengths)):
        meanPartFreq, lowestPartFreq, upperPartFreq = SyntaxParser.calculateRelationalFrequenciesOfPart(
            startIds[i], partLengths[i], sentences)
        meanPartFreqs.append(meanPartFreq)
        lowestPartFreqs.append(lowestPartFreq)
        upperPartFreqs.append(upperPartFreq)
    partMetrics = []
    for i in range(len(partLengths)):
        metric = PartMetric(startIds[i], endIds[i], startIndices[i],
                            endIndices[i], partLengths[i], cosineDoc2Vec[i],
                            euclidean[i], cosine[i], jaccard[i], meanFreq[i],
                            lowestFreq[i], upperFreq[i], meanStyle[i],
                            partEuclids[i], partCosines[i], partJaccards[i],
                            meanPartFreqs[i], lowestPartFreqs[i],
                            upperPartFreqs[i])
        partMetrics.append(metric)
    return partMetrics
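# getMeanOfValues is called above but not defined in this excerpt; a plausible
# reading (an assumption, not the confirmed original): average the per-sentence
# values of the slice covering one part.
def getMeanOfValues(startId, partLength, values):
    window = values[startId:startId + partLength]
    return sum(window) / len(window)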
def getOffsetCorpusMetrics(idx):
    # collects the per-sentence metrics used as input features for the regressor
    path = "C:\DiplomaProject\AlmarimiDocuments"
    documentList = Fileparser.extract_plagiarisms_from_files(path)
    text = Fileparser.extract_text_from_document(path, documentList[idx])
    start, end, sentences = Fileparser.split_into_sentences(text)
    originalCorpusPath = "C:\DiplomaProject\OriginalCorpus\Corpus" + str(
        idx) + ".txt"
    almarimiCorpusPath = "C:\DiplomaProject\AlmarimiCorpus\Corpus" + str(
        idx) + ".txt"
    cosPath = "C:\DiplomaProject\CosineSimilarities\Cosine" + str(idx) + ".txt"
    anomaly_path = "C:\DiplomaProject\AnomalyScores\AnomalyScores" + str(
        idx) + ".txt"
    likelyHood_path = "C:\DiplomaProject\LikelyHoodScores\LikelyHoodScores" + str(
        idx) + ".txt"
    #likelyHood_path = "C:\DiplomaProject\LikelyHoodScores\LikelyHoodScoresWindow" + str(idx) + ".txt"
    f_path = "C:\DiplomaProject\CleanFingerprints\Fingerprints" + str(
        idx) + ".txt"
    index_path = "C:\DiplomaProject\AlmarimiIndices\Indices" + str(
        idx) + ".txt"
    SyntaxIds_path = "C:\DiplomaProject\SyntaxIds\Ids" + str(idx) + ".txt"
    SemanticIds_path = "C:\DiplomaProject\SemanticIds\Ids" + str(idx) + ".txt"
    SyntaxIds = Fileparser.get_indices_from_file(SyntaxIds_path)
    SemanticIds = Fileparser.get_indices_from_file(SemanticIds_path)
    originalCorpus = Fileparser.get_corpus_from_file(originalCorpusPath)
    cosineDoc2Vec = list(map(float, Fileparser.get_indices_from_file(cosPath)))
    anomalies = list(map(float,
                         Fileparser.get_indices_from_file(anomaly_path)))
    likelyhood = list(
        map(float, Fileparser.get_indices_from_file(likelyHood_path)))
    indices = list(map(float, Fileparser.get_indices_from_file(index_path)))
    euclidean, cosine, jaccard = RetinaOperations.readSimilarities(idx)
    euclidean = list(map(float, euclidean))
    cosine = list(map(float, cosine))
    jaccard = list(map(float, jaccard))
    meanFreq, lowestFreq, upperFreq = SyntaxParser.calculateRelationalFrequencyOfSentence(
        originalCorpus)
    meanFreq = list(map(float, meanFreq))
    lowestFreq = list(map(float, lowestFreq))
    upperFreq = list(map(float, upperFreq))
    authorStyleVals = SyntaxParser.authorStyleOfSentences(originalCorpus)
    cosineScores = []
    mean = []
    lowest = []
    upper = []
    authorStyle = []
    for i in range(len(SyntaxIds)):
        cosineScores.append(cosineDoc2Vec[int(SyntaxIds[i]) - 1])
        mean.append(meanFreq[int(SyntaxIds[i]) - 1])
        lowest.append(lowestFreq[int(SyntaxIds[i]) - 1])
        upper.append(upperFreq[int(SyntaxIds[i]) - 1])
        authorStyle.append(authorStyleVals[int(SyntaxIds[i]) - 1])
    anomaly = []
    likely = []
    euclid = []
    cos = []
    jacc = []

    for i in range(len(SemanticIds)):
        anomaly.append(anomalies[int(SemanticIds[i]) - 1])
        likely.append(likelyhood[int(SemanticIds[i]) - 1])
        euclid.append(euclidean[int(SemanticIds[i]) - 1])
        cos.append(cosine[int(SemanticIds[i]) - 1])
        jacc.append(jaccard[int(SemanticIds[i]) - 1])
    metricList = []
    for i in range(len(sentences)):
        metric = OffsetMetrics(i, end[i], anomaly[i], likely[i],
                               cosineScores[i], euclid[i], cos[i], jacc[i],
                               mean[i], lowest[i], upper[i], authorStyle[i])
        metricList.append(metric)

    #cosineD2V_norm =normalize_data(cosineDoc2Vec)
    #anomalies_norm=normalize_data(anomalies)
    #likelyhood_norm = normalize_data(likelyhood)
    #euclidean_norm = normalize_data(euclidean)
    #cosine_norm = normalize_data(cosine)
    #jaccard_norm = normalize_data(jaccard)
    #metricList=[]
    #normList=[]
    #for i in range(len(corpus)):
    #metric=CorpusMetrics(i,indices[i],anomalies[i],likelyhood[i],corpus[i],cosineDoc2Vec[i],euclidean[i],cosine[i],jaccard[i],
    #euclidean_norm[i]-cosine_norm[i]-jaccard_norm[i])
    #metricList.append(metric)
    #metric.toString()
    #score=likelyhood_norm[i]-cosineD2V_norm[i]+euclidean_norm[i]-cosine_norm[i]-jaccard_norm[i]
    #normMetric=CorpusMetrics(i,indices[i],anomalies_norm[i],likelyhood_norm[i],corpus[i],cosineD2V_norm[i],euclidean_norm[i],cosine_norm[i],jaccard_norm[i],score)
    #normList.append(normMetric)

    return metricList
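# Hypothetical driver (not in the original source), mirroring the 40-document
# loop used in runOutlierDetector: build the per-sentence feature rows for
# every document in the corpus.
def collectAllOffsetMetrics(documentCount=40):
    return [getOffsetCorpusMetrics(i) for i in range(documentCount)]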
        return "FAIL|"
        pass


driver = webdriver.Chrome(executable_path='C:/chromedriver/chromedriver.exe')
base_url = "http://api.cortical.io/Text.htm#!/text/"
driver.get(base_url + "/")
verificationErrors = []

sourcePath = "C:\DiplomaProject\AlmarimiCorpus\Corpus"
goalPath = "C:\DiplomaProject\AlmarimiFingerprints\Fingerprints"
# here, we can define which texts to extract
for i in range(0, 40):
    sPath = sourcePath + str(i) + ".txt"
    gPath = goalPath + str(i) + ".txt"
    corpus = Fileparser.get_corpus_from_file(sPath)

    driver.find_element_by_link_text("Expand Operations").click()
    time.sleep(2)
    textList = []
    failedIndices = []
    with open(gPath, 'w', encoding="utf-8", errors='ignore') as goalFile:
        for j in range(len(corpus) - 1):
            txt = getFingerprint(corpus[j])
            if (txt == "FAIL|"):
                print("fail")
                failedIndices.append(j)
            textList.append(txt)

        print("fixing fails")
def readOffsetLoss(idx):
    # reads the loss scores created for the regressor - redacted
    lossPath = "C:\DiplomaProject\offsetLoss\Loss" + str(idx) + ".txt"
    strLosses = Fileparser.get_indices_from_file(lossPath)
    losses = [float(i) for i in strLosses]
    return losses
import math


def mergePrints(printList):
    # merges multiple fingerprints into a single print by aggregating their
    # indices and k-sparsifying the result
    k = round(math.sqrt(len(printList)))
    indexDict = aggregateFingerPrints(printList)
    return kSparsify(indexDict, k)
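# aggregateFingerPrints and kSparsify are not defined in this excerpt; the
# sketches below give one plausible reading (assumptions, not the confirmed
# originals): count how often each index occurs across the prints, then keep
# the indices that occur at least k times.
def aggregateFingerPrints(printList):
    counts = {}
    for fp in printList:
        for index in fp:
            counts[index] = counts.get(index, 0) + 1
    return counts


def kSparsify(indexDict, k):
    return [index for index, count in indexDict.items() if count >= k]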


def saveCleanFingerPrints(prints, path):
    # saves the cleaned fingerprints to a file, one "|"-terminated record per print
    prints = cleanFingerPrints(prints)
    with open(path, 'w', encoding="utf-8", errors='ignore') as cleanFile:
        for fp in prints:
            for index in fp:
                cleanFile.write(str(index))
                cleanFile.write(",")
            cleanFile.write("|")


fingerprintPath = "C:\DiplomaProject\AlmarimiFingerprints\Fingerprints3.txt"
indexPath = "C:\DiplomaProject\AlmarimiIndices\Indices3.txt"
cleanPath = "C:\DiplomaProject\CleanFingerprints\Fingerprints3.txt"
prints = Fileparser.get_fingerprint_from_file(cleanPath)
print(len(prints[0]))
print(len(prints[1]))
print(len(prints[2]))
print(len(prints[3]))
printList = [prints[0], prints[1]]
merged = mergePrints(printList)
print(len(merged))
#indices = Fileparser.get_indices_from_file(indexPath)