Example #1
def getspamtokens(challenge=None, duplicates=False):
    # get the spam token counts collected from old, already classified ideas
    if challenge is None:
        path = variables.complexbayesmixedpath
    else:
        path = variables.complexbayeschallengebasedpath + challenge + '/'
    filename = 'duplicateBayesSpamToken.csv' if duplicates else 'bayesSpamToken.csv'
    bayesspamwords = list(importDataHelper.readcsvdata(path + filename))
    # merge the csv rows into a single token dict
    spamdict = {}
    for row in bayesspamwords:
        spamdict.update(row)
    return spamdict
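
A brief usage sketch for the loader above, assuming the stored values are numeric counts (csv values come back as strings, hence the casts); the challenge name is illustrative:

# Hypothetical usage: token counts are read back from csv as strings.
spamtokens = getspamtokens(challenge="TCO", duplicates=True)
totalcount = sum(int(count) for count in spamtokens.values())
print(len(spamtokens), "tokens,", totalcount, "occurrences")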
Example #2
def gettokenprobs(challenge=None, duplicates=False):
    # load the per-token spam probabilities produced by the bayes training
    if challenge is None:
        path = variables.complexbayesmixedpath
    else:
        path = variables.complexbayeschallengebasedpath + challenge + '/'
    filename = 'duplicateBayesTokenProbs.csv' if duplicates else 'bayesTokenProbs.csv'
    bayesphraseprobs = list(importDataHelper.readcsvdata(path + filename))
    probdict = {}
    for row in bayesphraseprobs:
        probdict.update(row)
    return probdict
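
A minimal sketch of how the returned probabilities are consumed, assuming this function sits in the complexBayes module whose classify() appears further down in this listing; the 0.8 threshold mirrors the one used in classifyideas below:

# Hypothetical usage: score one idea description against the mixed model.
wordprobs = gettokenprobs()
spamprob = complexBayes.classify("free bitcoin, click here", wordprobs)
print("Spam" if spamprob > 0.8 else "Ham")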
Example #3
def gethamtokens(challenge=None, duplicates=False):
    # get the ham token counts collected from old, already classified ideas
    if challenge is None:
        path = variables.simplebayesmixedpath
    else:
        path = variables.simplebayeschallengebasedpath + challenge + '/'
    filename = 'duplicateBayesHamToken.csv' if duplicates else 'bayesHamToken.csv'
    bayeshamwords = list(importDataHelper.readcsvdata(path + filename))
    # convert the ham word rows to a single dict
    hamdict = {}
    for row in bayeshamwords:
        hamdict.update(row)
    return hamdict
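
The spam and ham dicts map tokens to occurrence counts. As a hedged illustration of how per-token spam probabilities could be derived from the two dicts, a minimal sketch assuming Laplace smoothing; the project's actual trainbayes routines are not shown in this listing and may differ:

def estimate_token_probs(spamdict, hamdict, smoothing=1.0):
    # Illustrative only: Laplace-smoothed per-token spam probability.
    # Counts are stored as strings in the csv, hence the float() casts.
    probs = {}
    for token in set(spamdict) | set(hamdict):
        spam = float(spamdict.get(token, 0)) + smoothing
        ham = float(hamdict.get(token, 0)) + smoothing
        probs[token] = spam / (spam + ham)
    return probs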
def train_linear_classificator(challenge, new=False):
    if new:
        # extract features from the raw idea database and label each idea
        unigram_tagger, st = spamFilter.prepare_tagger()
        idealist = list(
            importDataHelper.readcsvdata(variables.ideadbpath + challenge + '.csv'))
        featurelist = {}
        for idea in idealist:
            idea['TRIGGERED'] = []
            idea['PREDICTION'] = "Ham"
            idea, ideafeatures = spamFilter.classify_and_get_idea(idea, unigram_tagger, st)
            if "unusable" in idea["STATUS"] or 'spam' in idea.get("SPAM", ""):
                ideafeatures["Spam"] = 1
            else:
                ideafeatures["Spam"] = 0
            for key in ideafeatures.keys():
                featurelist[key] = featurelist.get(key, [])
                featurelist[key].append(ideafeatures[key])
    else:
        # reuse previously extracted features from the training data folder
        if challenge == "all":
            idealist = []
            for file in listdir(variables.linclasstrainingsdatapath):
                if isfile(join(variables.linclasstrainingsdatapath, file)):
                    idealist += list(importDataHelper.readcsvdata(join(variables.linclasstrainingsdatapath, file)))
        else:
            idealist = list(importDataHelper.readcsvdata(variables.linclasstrainingsdatapath + challenge + ".csv"))
        # parse every stored row, not just the first, so "all" uses all files
        featurelist = {}
        for row in idealist:
            for key in row.keys():
                featurelist[key] = featurelist.get(key, [])
                featurelist[key] += [int(x) for x in row[key].replace('[', '').replace(']', '').split(',')]
    testdata = pd.DataFrame(featurelist)
    X = testdata.drop('Spam', axis=1)
    y = testdata['Spam']
    importDataHelper.writecsvfiledict(variables.linclasstrainingsdatapath + challenge + ".csv", featurelist.keys(), featurelist)
    clf = RidgeClassifier().fit(X, y)
    print(clf.score(X, y))
    return clf, clf.coef_
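
The pattern used later in this listing (see test() below) trains one classifier per challenge and one on the pooled data; a minimal sketch, assuming the module-level dependencies are configured:

# Train on freshly extracted features for one challenge, then on the
# cached feature files for all challenges combined ("TCO" is illustrative).
clf, coefficients = train_linear_classificator("TCO", new=True)
clf_all, coefficients_all = train_linear_classificator("all")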
def load_data(challenge):
    print("USE loaded")
    data = {"DESCRIPTION": [], "SPAM": []}
    if challenge == "all":
        idealist = []
        for file in listdir(variables.ideadbpath):
            if isfile(join(variables.ideadbpath, file)):
                idealist += list(
                    importDataHelper.readcsvdata(
                        join(variables.ideadbpath, file)))
    else:
        idealist = list(
            importDataHelper.readcsvdata(variables.ideadbpath + challenge +
                                         ".csv"))
    for idea in idealist:
        data["DESCRIPTION"].append(idea["DESCRIPTION"])
        if "unusable" in idea.get("STATUS", ""):
            data["SPAM"].append(1)
        elif "usable" in idea.get("STATUS", ""):
            data["SPAM"].append(0)
        elif "spam" in idea.get("SPAM", ""):
            data["SPAM"].append(1)
        else:
            data["SPAM"].append(0)
    return pd.DataFrame(data)
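
A short usage sketch; the returned frame pairs each description with a 0/1 spam label, so class balance is easy to inspect (the challenge name is illustrative):

# Hypothetical usage of load_data.
df = load_data("bionicRadar")
print(df["SPAM"].value_counts())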
def train_and_test(challenge):
    idealist = []
    if challenge == "all":
        for file in listdir(variables.linclasstrainingsdatapath):
            if isfile(join(variables.linclasstrainingsdatapath, file)):
                idealist += list(importDataHelper.readcsvdata(join(variables.linclasstrainingsdatapath, file)))
    else:
        idealist = list(importDataHelper.readcsvdata(variables.linclasstrainingsdatapath + challenge + ".csv"))
    featurelist = {}
    for row in idealist:
        for key in row.keys():
            featurelist[key] = featurelist.get(key, [])
            featurelist[key] += [int(x) for x in row[key].replace('[', '').replace(']', '').split(',')]
    testdata = pd.DataFrame(featurelist)
    X = testdata.drop('Spam', axis=1)
    y = testdata['Spam']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    clf = RidgeClassifier()
    y_score = clf.fit(X_train, y_train).decision_function(X_test)
    testres = clf.predict(X_test)

    # Binary task: a single ROC curve computed from the decision scores.
    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)

    plt.figure()
    lw = 2
    plt.plot(fpr, tpr, color="darkorange", lw=lw,
             label="ROC curve (area = %0.2f)" % roc_auc)
    plt.plot([0, 1], [0, 1], color="cornflowerblue", lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(challenge)
    plt.legend(loc="lower right")
    plt.savefig(variables.plotspath + "ROC_linClass_" + challenge + ".png")
    plt.show()
    confusion_matrix = ConfusionMatrix(y_test, testres)
    confusion_matrix.plot(normalized=True)
    plt.title(challenge)
    plt.savefig(variables.plotspath + "CM_linClass_" + challenge + ".png")
    plt.show()
    print(clf.coef_)
    print(classification_report(y_test, testres))
    print(confusion_matrix.stats())
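
The training CSVs evidently store each feature column as a stringified list, which is what the replace/split parsing above undoes. A small round-trip sketch under that assumption; the feature name is made up:

# Hypothetical round-trip of the stored feature format.
stored = {"Spam": "[1, 0, 1]", "containslink": "[0, 0, 1]"}
parsed = {key: [int(x) for x in value.replace('[', '').replace(']', '').split(',')]
          for key, value in stored.items()}
print(parsed)  # {'Spam': [1, 0, 1], 'containslink': [0, 0, 1]}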
def eval_all():
    challengedict = {
        "TCO":
        list(importDataHelper.readcsvdata("Data/DBs/ideaDB/TCO.csv")),
        "bionicRadar":
        list(importDataHelper.readcsvdata("Data/DBs/ideaDB/bionicRadar.csv")),
        "fabricDisplay":
        list(importDataHelper.readcsvdata("Data/DBs/ideaDB/fabricDisplay.csv"))
    }
    dupdict = {}
    for key in challengedict.keys():
        idealist = []
        for key2 in challengedict.keys():
            if key2 != key:
                idealist += challengedict[key2].copy()
        X_train, X_test = train_test_split(challengedict[key], test_size=0.33)
        idealist += X_train.copy()

        X_ndtrain = duplicateDetection.filterduplikates(
            X_train, variables.resultpath + "eval2" + key + ".csv")
        dupdict[key] = len(X_train) - len(X_ndtrain)
        X_ndtest = X_test.copy()
        idealist_test = X_test.copy()
        idealist_nodups_test = X_test.copy()
        results = evaluate_system(X_train, X_test, key)
        importDataHelper.writecsvfiledict(
            "Data/ResultsAllNew/evaluation" + key + ".csv", results.keys(),
            results)
        print("Done first set")
        results2 = evaluate_system(X_ndtrain, X_ndtest, key, dups=True)
        importDataHelper.writecsvfiledict(
            "Data/ResultsAllNew/evaluationResultsNoDups" + key + ".csv",
            results2.keys(), results2)
        print("Challenge training done", key)

        idealist_nodups = duplicateDetection.filterduplikates(
            idealist, variables.resultpath + "eval" + key + ".csv")
        dupdict[key + " All"] = len(idealist) - len(idealist_nodups)
        results = evaluate_system(idealist, idealist_test)
        importDataHelper.writecsvfiledict(
            "Data/ResultsAllNew/evaluationAll" + key + ".csv", results.keys(),
            results)
        print("Done first set")
        results2 = evaluate_system(idealist_nodups,
                                   idealist_nodups_test,
                                   dups=True)
        importDataHelper.writecsvfiledict(
            "Data/ResultsAllNew/evaluationResultsNoDupsAll" + key + ".csv",
            results2.keys(), results2)
        print("All training done", key)
    print(dupdict)
    importDataHelper.writecsvfiledict("Data/ResultsAllNew/dupNums.csv",
                                      dupdict.keys(), dupdict)
def extend_challenge_db(idealist):
    challengelist = {}

    for file in listdir(variables.ideadbpath):
        if isfile(join(variables.ideadbpath, file)):
            filename = file.split(".")[0]
            challengelist[filename] = list(
                importDataHelper.readcsvdata(join(variables.ideadbpath, file)))

    for idea in idealist:
        idea["CHALLENGE"] = idea.get("CHALLENGE", "")
        if "cscw19-1" in idea["CHALLENGE"]:
            challengelist["TCO"] = challengelist.get("TCO", [])
            if not any(e['ID'] == idea['ID'] for e in challengelist["TCO"]):
                challengelist["TCO"].append(idea)
        elif "chi19s1" in idea["CHALLENGE"]:
            challengelist["TCO"] = challengelist.get("TCO", [])
            if not any(e['ID'] == idea['ID'] for e in challengelist["TCO"]):
                challengelist["TCO"].append(idea)
        elif "bionic" in idea["CHALLENGE"].lower():
            challengelist["bionicRadar"] = challengelist.get("bionicRadar", [])
            if not any(e['ID'] == idea['ID']
                       for e in challengelist["bionicRadar"]):
                challengelist["bionicRadar"].append(idea)
        elif "fabric" in idea["CHALLENGE"].lower():
            challengelist["fabricDisplay"] = challengelist.get(
                "fabricDisplay", [])
            if not any(e['ID'] == idea['ID']
                       for e in challengelist["fabricDisplay"]):
                challengelist["fabricDisplay"].append(idea)
    for key in challengelist.keys():
        importDataHelper.writecsvfile(join(variables.ideadbpath, key + ".csv"),
                                      challengelist[key][0].keys(),
                                      challengelist[key])
        print("saved " + key)
Example #9
def read_noun_corpus():
    nounlist = list(
        importDataHelper.readcsvdata(variables.dbpath + 'NLPdata/NounDB.csv'))
    # convert the noun rows to a single dict
    nouncorpus = {}
    for row in nounlist:
        nouncorpus.update(row)
    return nouncorpus
Example #10
def plot_Evaluation(dataset):
    N = 0
    filterlistall = list(
        importDataHelper.readcsvdata(variables.evaluationpresultpath))
    tplist = []
    fplist = []
    filterlistdataset = []

    fig, ax = plt.subplots(figsize=(10, 10))

    num = 0
    pnum = 0
    nnum = 0
    maxnum = 0
    gotdata = False
    i = 0
    for filterrow in filterlistall:
        if dataset in filterrow["Dataset"]:
            i += 1
            if filterrow["Variable"] not in ("", "None"):
                filterlistdataset.append(
                    str(i) + ": " + filterrow["Filter"] + ": " +
                    filterrow["Variable"])
            else:
                filterlistdataset.append(str(i) + ": " + filterrow["Filter"])
            if not gotdata:
                num = filterrow["population"]
                pnum = filterrow["P"]
                nnum = filterrow["N"]
                gotdata = True
            N += 1
            # Empty csv cells mean the filter produced no counts.
            tp = int(filterrow["TP"]) if filterrow["TP"] else 0
            fp = int(filterrow["FP"]) if filterrow["FP"] else 0
            tplist.append(tp)
            fplist.append(fp)
            maxnum = max(maxnum, tp + fp)
    ind = np.arange(N)
    p1 = plt.bar(ind, tplist)
    p2 = plt.bar(ind, fplist, bottom=tplist)
    plt.ylabel('Amount')
    plt.title('Filter evaluation for ' + dataset + ' with ' + str(num) +
              ' ideas (' + str(pnum) + ' positives and ' + str(nnum) +
              ' negatives)')
    # One tick per evaluated filter, labelled 1..N to match the printed list.
    plt.xticks(ind, range(1, N + 1))
    plt.yticks(np.arange(0, maxnum, 20))
    plt.legend((p1[0], p2[0]), ('TP', 'FP'))
    print(filterlistdataset)
    plt.show()
def add_all_ideas_toDB():
    for file in listdir(variables.importpathclassified):
        if isfile(join(variables.importpathclassified, file)):
            if ".csv" in file:
                extend_challenge_db(
                    list(
                        importDataHelper.readcsvdata(
                            join(variables.importpathclassified, file))))
                print("finished: " + file)
            else:
                print("just csv supported right now")
Example #12
def save_confusionmatrix(cm,
                         path,
                         applied_filters=None,
                         description="",
                         dataset=""):
    # Avoid a mutable default argument for the filter list.
    if applied_filters is None:
        applied_filters = []
    cmdict = list(importDataHelper.readcsvdata(path))
    cmdict.append(cm.stats())
    cmdict[-1]["applied Filter"] = applied_filters
    cmdict[-1]["Description"] = description
    cmdict[-1]["Dataset"] = dataset
    importDataHelper.writecsvfile(path, cmdict[0].keys(), cmdict)
    return 0
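
A hedged usage sketch; that cm exposes a stats() dict matching the csv columns is an assumption carried over from how it is used above:

# Hypothetical: append one confusion matrix plus metadata to the result csv.
cm = ConfusionMatrix([True, False, True, True], [True, True, False, True])
save_confusionmatrix(cm, variables.resultpath + "ConfusionMatrices.csv",
                     applied_filters=["bayes"], description="demo run",
                     dataset="bionicRadar")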
def classify_unreviewed():
    idealist = list(
        importDataHelper.readcsvdata(
            "Data/Results/fabricDisplayunreviewed.csv"))
    idealist2 = list(
        importDataHelper.readcsvdata(
            "Data/Results/fabricDisplayClassified.csv"))
    print("bionic Radar:")
    for idea in idealist:
        if idea["ID"] in [ideas["ID"] for ideas in idealist2]:
            idealist.remove(idea)
    print(len(idealist))
    for idea in idealist:
        print(" ")
        if "usable" not in idea.get("STATUS", ""):
            print("Content: " + idea["DESCRIPTION"])
            print("Prediction: " + idea["PREDICTION"])
            print("Bayes: " + idea["OTHERBayes"])
            print("Others: " + idea["OTHERS"])
            print("Filter: " + idea["TRIGGERED"])
            x = input("Spam? (y/n)")
            if 'y' in x:
                idea["STATUS"] = "unusable"
                idealist2.append(idea)
                idealist.remove(idea)
            elif 'n' in x:
                idea["STATUS"] = "usable"
                idealist2.append(idea)
                idealist.remove(idea)
            else:
                importDataHelper.writecsvfile(
                    "Data/Results/fabricDisplayClassified.csv",
                    idealist2[0].keys(), idealist2)
                importDataHelper.writecsvfile(
                    "Data/Results/fabricDisplayunreviewed.csv",
                    idealist[0].keys(), idealist)
    importDataHelper.writecsvfile("Data/Results/fabricDisplayClassified.csv",
                                  idealist2[0].keys(), idealist2)
    importDataHelper.writecsvfile("Data/Results/fabricDisplayunreviewed.csv",
                                  idealist[0].keys(), idealist)
Example #14
def extend_noun_corpus():
    idealist = list(
        importDataHelper.readcsvdata(variables.importpathclassified +
                                     'cscw19-unapproved-ideas_import.csv'))
    nouncorpus = read_noun_corpus()
    # prepare_tagger() returns (unigram_tagger, st); only the unigram
    # tagger is needed for noun extraction here.
    unigram_tagger, st = prepare_tagger()
    for idea in idealist:
        nouns = get_Nouns(idea['DESCRIPTION'], unigram_tagger)
        for noun in nouns:
            if noun not in nouncorpus:
                nouncorpus[noun] = "unclassified"
    importDataHelper.writecsvfiledict(variables.dbpath + 'NLPdata/NounDB.csv',
                                      nouncorpus.keys(), nouncorpus)
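
A small follow-up sketch of the corpus round-trip: freshly added nouns carry the "unclassified" label used above:

# Hypothetical: count how many nouns still await manual classification.
nouncorpus = read_noun_corpus()
pending = [noun for noun, label in nouncorpus.items() if label == "unclassified"]
print(len(pending), "of", len(nouncorpus), "nouns unclassified")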
Example #15
def traincomplexbayes(dataset=None):
    if dataset is None:
        print("Select a dataset: ")
        i = 0
        print("Classified datasets")
        filesclass = []
        for file in listdir(variables.importpathclassified):
            if isfile(join(variables.importpathclassified, file)):
                print("", i, ": ", file)
                filesclass.append((variables.importpathclassified, file))
                i += 1
        print("Unclassified datasets")
        for file in listdir(variables.importpathunclassified):
            if isfile(join(variables.importpathunclassified, file)):
                print("", i, ": ", file)
                filesclass.append((variables.importpathunclassified, file))
                i += 1
        selected = int(input("Which dataset do you want to use? "))
        path = filesclass[selected][0]
        filename, fileformat = filesclass[selected][1].replace(".",
                                                               ' ').split()
        if 'csv' in fileformat:
            idealist = list(
                importDataHelper.readcsvdata(
                    join(path, filename + '.' + fileformat)))
        else:
            idealist = list(
                importDataHelper.readxmldata(
                    join(path, filename + '.' + fileformat)))
    else:
        idealist = dataset[0]
    delete = ""
    while ('y' not in delete or 'n' in delete):
        delete = input(
            "Do you want to override old 5-word bayes results (y/n): ").lower(
            )
    start = time.process_time_ns()
    if 'y' in delete:
        spamdictcom = {}
        hamdictcom = {}
    else:
        # load existing counts so this run extends data from both datasets
        spamdictcom = complexBayes.getspamtokens()
        hamdictcom = complexBayes.gethamtokens()
    complexBayes.trainbayes(idealist, spamdictcom, hamdictcom)
    duration = time.process_time_ns() - start
    print("Duration (complex) bayestraining: ", duration / 1000000000,
          "seconds")
    return None
def match_iui_challenges():
    unmatchedlist = list(
        importDataHelper.readcsvdata(
            "Data/ImportsClassified/iui-export-ideas.csv"))
    print(len(unmatchedlist))
    challengelist = list(
        importDataHelper.readcsvdata(
            "Data/ImportsClassified/ideas-with-challenges.csv"))
    print(len(challengelist))
    count_unmatched = 0
    count_matched = 0
    for idea in unmatchedlist:
        matched = False
        for idea2 in challengelist:
            if (idea["ID"] in idea2["ID"]):
                idea["CHALLENGE"] = idea2["CHALLENGE"]
                count_matched += 1
                matched = True
                break
        if not matched:
            count_unmatched += 1
    print(count_unmatched)
    print(count_matched)
    extend_challenge_db(unmatchedlist)
def spamdetection():
    parser = argparse.ArgumentParser()
    parser.add_argument("path", help="Path to a csv or xml file with ideas")

    parser.add_argument("-t",
                        "--train",
                        help="to train the system. Requires classified ideas.",
                        action="store_true")
    parser.add_argument(
        "--challenge",
        help=
        "give a challenge to use instead of the challenges given in an idea")
    args = parser.parse_args()
    filename, fileformat = os.path.basename(args.path).rsplit('.', 1)
    if fileformat == 'csv':
        idealist = list(importDataHelper.readcsvdata(args.path))
    elif fileformat == 'xml':
        idealist = importDataHelper.readxmldata(args.path)
    else:
        print("Can not read the file, please use csv or xml files")
        return 1
    challengelists = {}
    # Divide idea in challenges or use the given challenge
    if args.challenge is None:
        for idea in idealist:
            challenge = idea.get("CHALLENGE", "Cross-Domain")
            challengelists[challenge] = challengelists.get(challenge, [])
            challengelists[challenge].append(idea)
    else:
        challengelists[args.challenge] = idealist
    if args.train:
        for elem in challengelists:
            train(challengelists[elem], elem)
    else:
        classifiedlist = []
        for elem in challengelists:
            if fileformat == "csv":
                classifiedlist += classify(challengelists[elem], elem,
                                           fileformat)
            else:
                idealist = classify(idealist, elem, fileformat)
        # Write the classified ideas once, after all challenges are processed.
        if fileformat == "csv":
            importDataHelper.writecsvfile(
                os.path.dirname(args.path) + "/" + filename +
                "_classified.csv", classifiedlist[0].keys(), classifiedlist)
        else:
            idealist.write(
                os.path.dirname(args.path) + "/" + filename +
                "_classified.xml")
def evaluate_fun():
    idealist = list(
        importDataHelper.readcsvdata("Data/DBs/ideaDB/bionicRadar.csv"))
    X_train, X_test = train_test_split(idealist, test_size=0.33)
    X_ndtrain = duplicateDetection.filterduplikates(
        X_train, variables.resultpath + "evalbionicRadar.csv")
    X_ndtest = X_test.copy()
    results = evaluate_system(X_train, X_test, "bionicRadar")
    importDataHelper.writecsvfiledict(
        "Data/ResultsNew/evaluationResultsbionicRadar.csv", results.keys(),
        results)
    print("Done first set")
    results2 = evaluate_system(X_ndtrain, X_ndtest, "bionicRadar", dups=True)
    importDataHelper.writecsvfiledict(
        "Data/ResultsNew/evaluationResultsNoDupsbionicRadar.csv",
        results2.keys(), results2)

    print("Done")
def evaluationData_table(dataset):
    filterlistall = list(importDataHelper.readcsvdata(variables.evaluationpresultpath))
    data = []
    columns = ("Filter", "TP", "FP")
    num = 0
    pnum = 0
    nnum = 0
    gotdata = False

    fig, ax = plt.subplots(figsize=(10, 10))

    # hide axes
    fig.patch.set_visible(False)
    ax.axis('off')
    ax.axis('tight')

    for filterrow in filterlistall:
        if dataset in filterrow["Dataset"]:
            if not gotdata:
                num = filterrow["population"]
                pnum = filterrow["P"]
                nnum = filterrow["N"]
                gotdata = True
            # Empty csv cells mean the filter produced no counts.
            tp = int(filterrow["TP"]) if filterrow["TP"] else 0
            fp = int(filterrow["FP"]) if filterrow["FP"] else 0
            if filterrow["Variable"] not in ("", "None"):
                data.append([filterrow["Filter"] + ": " + filterrow["Variable"], tp, fp])
            else:
                data.append([filterrow["Filter"], tp, fp])

    df = pd.DataFrame(data, columns=columns)

    ax.table(cellText=df.values, colLabels=df.columns, loc='center')

    fig.tight_layout()
    plt.title('Filter evaluation for ' + dataset + ' with ' + str(num) +
              ' ideas (' + str(pnum) + ' positives and ' + str(nnum) +
              ' negatives)')
    plt.show()
Example #20
def duplicatefilter(dataset=None):
    if dataset is None:
        print("Select a dataset: ")
        i = 0
        print("Classified datasets")
        filesclass = []
        for file in listdir(variables.importpathclassified):
            if isfile(join(variables.importpathclassified, file)):
                print("", i, ": ", file)
                filesclass.append((variables.importpathclassified, file))
                i += 1
        print("Unclassified datasets")
        for file in listdir(variables.importpathunclassified):
            if isfile(join(variables.importpathunclassified, file)):
                print("", i, ": ", file)
                filesclass.append((variables.importpathunclassified, file))
                i += 1
        selected = int(input("Which dataset do you want to use? "))
        path = filesclass[selected][0]
        filename, fileformat = filesclass[selected][1].replace(".",
                                                               ' ').split()
        if 'csv' in fileformat:
            idealist = list(
                importDataHelper.readcsvdata(
                    join(path, filename + '.' + fileformat)))
        else:
            idealist = list(
                importDataHelper.readxmldata(
                    join(path, filename + '.' + fileformat)))
    else:
        fileformat = dataset[3]
        filename = dataset[2]
        path = dataset[1]
        idealist = dataset[0]
    idealist = duplicateDetection.filterduplikates(
        idealist, variables.duplicateresultpath + filename + 'Duplicates.csv')
    return idealist, path, filename, fileformat
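
duplicatefilter both accepts and returns the (idealist, path, filename, fileformat) 4-tuple, so it chains directly into classifyideas below:

# Hypothetical pipeline: interactive dataset selection, deduplication,
# then classification on the filtered list.
dataset = duplicatefilter()
classifyideas(dataset=dataset)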
Example #21
def classifyideas(dataset=None):
    if dataset is None:
        print("Select a dataset: ")
        i = 0
        print("Classified datasets")
        filesclass = []
        for file in listdir(variables.importpathclassified):
            if isfile(join(variables.importpathclassified, file)):
                print("", i, ": ", file)
                filesclass.append((variables.importpathclassified, file))
                i += 1
        print("Unclassified datasets")
        for file in listdir(variables.importpathunclassified):
            if isfile(join(variables.importpathunclassified, file)):
                print("", i, ": ", file)
                filesclass.append((variables.importpathunclassified, file))
                i += 1
        selected = int(input("Which dataset do you want to use? "))
        path = filesclass[selected][0]
        filename, fileformat = filesclass[selected][1].replace(".",
                                                               ' ').split()
        if 'csv' in fileformat:
            idealist = list(
                importDataHelper.readcsvdata(
                    join(path, filename + '.' + fileformat)))
        else:
            idealist = list(
                importDataHelper.readxmldata(
                    join(path, filename + '.' + fileformat)))
    else:
        fileformat = dataset[3]
        filename = dataset[2]
        path = dataset[1]
        idealist = dataset[0]
    bayesbool = 'y' in input(
        "Do you want to use single word bayes to classify? (y/n) ").lower()
    complbayesbool = 'y' in input(
        "Do you want to use 5-word bayes to classify? (y/n) ").lower()
    filtersystembool = 'y' in input(
        "Do you want to use the Filtersystem to classify? (y/n) ").lower()
    if bayesbool:
        wordprobs = bayes.gettokenprobs()
    if complbayesbool:
        wordprobscom = complexBayes.gettokenprobs()
    if filtersystembool:
        unigram_tagger, st = prepare_tagger()

    spamlist = []
    applied_filters = {}
    pred = []
    actual = []
    fplist = []
    fnlist = []
    start1 = time.time()

    for row in idealist:
        row['TRIGGERED'] = []
        row['PREDICTION'] = "Ham"
        if bayesbool:
            bayesprob = bayes.classify(row['DESCRIPTION'], wordprobs)
            if bayesprob > 0.8:
                row['TRIGGERED'].append("bayes")
                applied_filters["bayes"] = int(applied_filters.get("bayes",
                                                                   0)) + 1
                row['PREDICTION'] = "Spam"
        if complbayesbool:
            combayesprob = complexBayes.classify(row['DESCRIPTION'],
                                                 wordprobscom)
            if combayesprob > 0.8:
                row['TRIGGERED'].append("complex bayes: " + str(combayesprob))
                applied_filters["complex bayes"] = int(
                    applied_filters.get("complex bayes", 0)) + 1
                row['PREDICTION'] = "Spam"
        if filtersystembool:
            row = spamFilter.classifyidea(row, unigram_tagger, st)
        actual.append("spam" in row.get('SPAM', "")
                      or "unusable" in row.get("STATUS", ""))
        pred.append(row['PREDICTION'] == "Spam")
        for filter in row['TRIGGERED']:
            if 'bayes' not in filter:
                applied_filters[filter] = int(applied_filters.get(filter,
                                                                  0)) + 1
        spamlist.append(row)
        if row['PREDICTION'] == "Spam" and ("ham" in row.get('SPAM', "") or
                                            row.get("STATUS", "") == "usable"):
            fplist.append(row)
        elif row['PREDICTION'] == "Ham" and ("spam" in row.get(
                'SPAM', "") or "unusable" in row.get("STATUS", "")):
            fnlist.append(row)
    cm = confusionMatrix.create_confusionmatrix(actual, pred)
    confusionMatrix.print_confusionmatrix(cm, True)
    description = "just filtersystem, Test enumeration fix with iui dataset"

    confusionMatrix.save_confusionmatrix(
        cm, variables.resultpath + "ConfusionMatrices.csv", applied_filters,
        description, filename)
    duration1 = time.time() - start1
    print("Duration1: ", duration1, "seconds")
    print(applied_filters)

    ###################### Save results ######################
    #    importDataHelper.writecsvfile(variables.resultpath + 'IdeaDataSpam2.csv', spamlist[0].keys(), spamlist)
    if len(fplist) > 0:
        importDataHelper.writecsvfile(
            variables.filterresults + filename + '_fp.csv', fplist[0].keys(),
            fplist)
    if len(fnlist) > 0:
        importDataHelper.writecsvfile(
            variables.filterresults + filename + '_fn.csv', fnlist[0].keys(),
            fnlist)
    return None
def test():
    #    idealist = list(importDataHelper.readxmldata(variables.importpathunclassified + 'IdeaData.xml'))
    idealist = list(
        importDataHelper.readcsvdata(variables.importpathclassified +
                                     "ideas-with-challenges.csv"))
    idealistchallenge = {"bionicRadar": [], "fabricDisplay": []}
    print(len(idealist))
    i = 0
    j = 0
    k = 0
    for idea in idealist:
        if idea["STATUS"] == "unreviewed":
            if "bionic" in idea["CHALLENGE"].lower():
                i += 1
                idealistchallenge["bionicRadar"].append(idea)
            elif "fabric" in idea["CHALLENGE"].lower():
                j += 1
                idealistchallenge["fabricDisplay"].append(idea)
            else:
                k += 1
    print("unreviewed bionic: ", i)
    print("unreviewed fabric: ", j)
    print("unreviewed others: ", k)

    idealisttrainingschallenge = {}
    idealisttrainingschallenge["fabricDisplay"] = list(
        importDataHelper.readcsvdata(variables.ideadbpath +
                                     'fabricDisplay.csv'))
    idealisttrainingschallenge["bionicRadar"] = list(
        importDataHelper.readcsvdata(variables.ideadbpath + 'bionicRadar.csv'))
    idealisttrainingschallenge["TCO"] = list(
        importDataHelper.readcsvdata(variables.ideadbpath + 'TCO.csv'))

    idealisttrainingschallengewodups = {}
    idealisttrainingschallengewodups["fabricDisplay"] = list(
        importDataHelper.readcsvdata(variables.ideadbwithoutduppath +
                                     "fabricDisplay.csv"))
    idealisttrainingschallengewodups["bionicRadar"] = list(
        importDataHelper.readcsvdata(variables.ideadbwithoutduppath +
                                     "bionicRadar.csv"))
    idealisttrainingschallengewodups["TCO"] = list(
        importDataHelper.readcsvdata(variables.ideadbwithoutduppath +
                                     "TCO.csv"))

    idealistmixedtraining = idealisttrainingschallenge[
        "fabricDisplay"] + idealisttrainingschallenge[
            "bionicRadar"] + idealisttrainingschallenge["TCO"]
    idealistmixedtrainingwithoutdups = idealisttrainingschallengewodups[
        "fabricDisplay"] + idealisttrainingschallengewodups[
            "bionicRadar"] + idealisttrainingschallengewodups["TCO"]

    for key in idealistchallenge.keys():
        idealisttraining = idealisttrainingschallenge[key]
        idealisttrainingwithoutdups = list(
            importDataHelper.readcsvdata(variables.ideadbwithoutduppath + key +
                                         ".csv"))

        #        idealistchallengewithoutdups = duplicateDetection.filterduplikates(idealistchallenge[key], variables.resultpath + "test3.csv", idealisttrainingwithoutdups)
        print("duplicate detection done")

        bayes.trainbayes(idealisttraining, challenge=key, delete=True)
        bayes.trainbayes(idealisttrainingwithoutdups,
                         challenge=key,
                         delete=True,
                         duplicates=True)
        print("bayes training TCO complete")

        bayes.trainbayes(idealistmixedtraining, delete=True)
        bayes.trainbayes(idealistmixedtrainingwithoutdups,
                         delete=True,
                         duplicates=True)
        print("bayes training mixed complete")

        wordprobs = bayes.gettokenprobs(challenge=key)
        wordprobswithoutdups = bayes.gettokenprobs(challenge=key,
                                                   duplicates=True)

        wordprobsmixed = bayes.gettokenprobs()
        wordprobsmixedwithoutdups = bayes.gettokenprobs(duplicates=True)
        print("loaded probs")
        complexBayes.trainbayes(idealisttraining, challenge=key, delete=True)
        complexBayes.trainbayes(idealisttrainingwithoutdups,
                                challenge=key,
                                delete=True,
                                duplicates=True)
        print("complex bayes training TCO complete")

        complexBayes.trainbayes(idealistmixedtraining, delete=True)
        complexBayes.trainbayes(idealistmixedtrainingwithoutdups,
                                delete=True,
                                duplicates=True)
        print("complex bayes training mixed complete")

        comwordprobs = complexBayes.gettokenprobs(challenge=key)
        comwordprobswithoutdups = complexBayes.gettokenprobs(challenge=key,
                                                             duplicates=True)

        comwordprobsmixed = complexBayes.gettokenprobs()
        comwordprobsmixedwithoutdups = complexBayes.gettokenprobs(
            duplicates=True)
        print("loaded probs complex")

        linclass, lincoeff = linearClassifier.train_linear_classificator(key)
        print(lincoeff)
        linclassmixed, lincoeffmixed = linearClassifier.train_linear_classificator(
            "all")
        print(lincoeffmixed)

        useest = USEClassifier.train_classifier(key)
        useestmixed = USEClassifier.train_classifier("all")
        print("trained USE")

        unigram_tagger, st = spamFilter.prepare_tagger()

        i = 1
        for idea in idealistchallenge[key]:
            print(i)
            idea["TRIGGERED"] = [""]
            # classify with challenge bayes with duplicates
            bayesprob = bayes.classify(idea["DESCRIPTION"], wordprobs)
            # classify with challenge bayes without duplicates
            bayesprobdup = bayes.classify(idea["DESCRIPTION"],
                                          wordprobswithoutdups)
            # classify with mixed challenge bayes with duplicates
            bayesprobmixed = bayes.classify(idea["DESCRIPTION"],
                                            wordprobsmixed)
            # classify with mixed challenge bayes without duplicates
            bayesprobmixedwithoutdup = bayes.classify(
                idea["DESCRIPTION"], wordprobsmixedwithoutdups)

            combayesprob = complexBayes.classify(idea["DESCRIPTION"],
                                                 comwordprobs)
            # classify with challenge bayes without duplicates
            combayesprobdup = complexBayes.classify(idea["DESCRIPTION"],
                                                    comwordprobswithoutdups)
            # classify with mixed challenge bayes with duplicates
            combayesprobmixed = complexBayes.classify(idea["DESCRIPTION"],
                                                      comwordprobsmixed)
            # classify with mixed challenge bayes without duplicates
            combayesprobmixedwithoutdup = complexBayes.classify(
                idea["DESCRIPTION"], comwordprobsmixedwithoutdups)

            # classify with challenge USE:
            useclass, useclassprob = USEClassifier.classify(useest, idea)
            # classify with mixed challenge USE:
            usemixedclass, usemixedclassprob = USEClassifier.classify(
                useestmixed, idea)

            idea, ideadata = spamFilter.classify_and_get_idea(
                idea, unigram_tagger, st)
            allnull = True
            for keytest in ideadata.keys():
                # Check the raw value before wrapping it in a list for the
                # classifier; a wrapped list never compares equal to 1.
                if ideadata[keytest] == 1:
                    allnull = False
                ideadata[keytest] = [ideadata[keytest]]
            if not allnull:
                linclasspred, linclassprob = linearClassifier.classify(
                    ideadata, linclass)
                linmixedclasspred, linmixedclassprob = linearClassifier.classify(
                    ideadata, linclassmixed)
            else:
                linclasspred, linclassprob = 0, 0
                linmixedclasspred, linmixedclassprob = 0, 0
            idea["PREDICTION"] = "Bayes: " + str(
                bayesprobdup) + ", complexBayes " + str(
                    combayesprobdup) + ", linClass: " + str(
                        linmixedclasspred) + " " + str(
                            linmixedclassprob) + ", USE: " + str(
                                useclass) + " " + str(useclassprob)
            idea["OTHERBayes"] = "BayesTCO: " + str(
                bayesprob) + ", BayesMixed " + str(
                    bayesprobmixed) + ", BayesMixed w/o dups " + str(
                        bayesprobmixedwithoutdup) + ", compl BayesTCO: " + str(
                            combayesprob) + ", compl BayesMixed: " + str(
                                combayesprobmixed
                            ) + ", compl BayesMixed w/o dups: " + str(
                                combayesprobmixedwithoutdup)
            idea["OTHERS"] = "Lin Class: " + str(linclasspred) + " " + str(
                linclassprob) + ", USE mixed: " + str(
                    usemixedclass) + " " + str(usemixedclassprob)

            i += 1
        importDataHelper.writecsvfile(
            variables.resultpath + key + "unreviewed.csv",
            idealistchallenge[key][0].keys(), idealistchallenge[key])
Example #23
def evaluate_filtersystem():
    resultlist = []
    unigram, st = prepare_tagger()

    def record(dataset, filtername, variable, cm):
        # Store one evaluation row, merging in the confusion matrix
        # statistics whenever a matrix was produced.
        result = {"Dataset": dataset, "Filter": filtername, "Variable": variable}
        if cm is not None:
            result.update(cm.stats())
        resultlist.append(result)

    for file in listdir(variables.importpathclassified):
        if isfile(join(variables.importpathclassified, file)):
            if ".csv" in file:
                idealist = list(
                    importDataHelper.readcsvdata(
                        join(variables.importpathclassified, file)))
            elif ".xml" in file:
                idealist = list(
                    importDataHelper.readxmldata(
                        join(variables.importpathclassified, file)))
            else:
                print(
                    "Not able to read all files (just csv and xml are supported)"
                )
                return 1
            for filter in textDataFilter.textDataFilterList:
                if "count" in str(filter):
                    # Count-based filters are swept over a range of thresholds.
                    if "more" in filter.__name__:
                        counts = countmore
                    elif "less" in filter.__name__:
                        counts = countless
                    elif "word" in filter.__name__:
                        counts = countwords
                    else:
                        counts = []
                    for count in counts:
                        record(file, filter.__name__, count,
                               evaluate_filter(filter, idealist, count))
                else:
                    record(file, filter.__name__, "None",
                           evaluate_filter(filter, idealist))
            for filter in textContentFilter.textContentFilterlist:
                if "unigram" in filter.__name__:
                    record(file, filter.__name__, "UnigramTagger",
                           evaluate_filter(filter, idealist, unigram))
                elif "containsnames" in filter.__name__:
                    record(file, filter.__name__, "StanfordNERTagger",
                           evaluate_filter(filter, idealist, st))
                else:
                    record(file, filter.__name__, "None",
                           evaluate_filter(filter, idealist))
                print(filter.__name__)
    importDataHelper.writecsvfile(
        variables.resultpath + "FilterEvaluation.csv", resultlist[0].keys(),
        resultlist)
def import_results():
    # The result files follow a fixed naming scheme per challenge, so the
    # twelve reads collapse into two small loops.
    challenges = {"fabric": "fabricDisplay", "tco": "TCO", "bionic": "bionicRadar"}
    fileprefixes = {"": "evaluation",
                    "NoDups": "evaluationResultsNoDups",
                    " All": "evaluationAll",
                    " AllNoDups": "evaluationResultsNoDupsAll"}
    results = {}
    for keyprefix, challenge in challenges.items():
        for keysuffix, fileprefix in fileprefixes.items():
            results[keyprefix + "results" + keysuffix] = evaluationHelper.convertResults(
                list(
                    importDataHelper.readcsvdata("Data/ResultsAllNew/" +
                                                 fileprefix + challenge +
                                                 ".csv"))[0])
    return results
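
A brief usage sketch; the key names follow the dictionary built above:

# Hypothetical: compare a challenge's results with and without duplicates.
results = import_results()
print(results["tcoresults"])
print(results["tcoresultsNoDups"])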
Example #25
def load_confusionmatrices(path):
    return list(importDataHelper.readcsvdata(path))