import os
from os import listdir
from os.path import isfile, join

import pandas as pd
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import train_test_split

# Project-local modules referenced throughout these snippets.
import bayes
import complexBayes
import duplicateDetection
import importDataHelper
import linearClassifier
import spamFilter
import USEClassifier
import variables


def train_linear_classificator(challenge, new=False):
    # With new=True, features are extracted from the raw idea DB; otherwise
    # the previously stored feature CSV for the challenge is reloaded.
    if new:
        unigram_tagger, st = spamFilter.prepare_tagger()
        idealist = list(
            importDataHelper.readcsvdata(variables.ideadbpath + challenge + '.csv'))
        featurelist = {}
        for idea in idealist:
            idea['TRIGGERED'] = []
            idea['PREDICTION'] = "Ham"
            idea, ideafeatures = spamFilter.classify_and_get_idea(idea, unigram_tagger, st)
            if "unusable" in idea["STATUS"] or 'spam' in idea.get("SPAM", ""):
                ideafeatures["Spam"] = 1
            else:
                ideafeatures["Spam"] = 0
            for key, value in ideafeatures.items():
                featurelist.setdefault(key, []).append(value)
    else:
        if challenge == "all":
            idealist = []
            for file in listdir(variables.linclasstrainingsdatapath):
                if isfile(join(variables.linclasstrainingsdatapath, file)):
                    idealist += list(importDataHelper.readcsvdata(join(variables.linclasstrainingsdatapath, file)))
        else:
            idealist = list(importDataHelper.readcsvdata(variables.linclasstrainingsdatapath + challenge + ".csv"))
        featurelist = {}
        # The stored CSV holds a single row whose cells are stringified
        # lists, e.g. "[0, 1, 0]"; parse them back into lists of ints.
        for key in idealist[0].keys():
            featurelist[key] = [int(x) for x in
                                idealist[0][key].replace('[', '').replace(']', '').split(',')]
    testdata = pd.DataFrame(featurelist)
    X = testdata.drop('Spam', axis=1)
    y = testdata['Spam']
    # Persist the feature table so later runs can reload it with new=False.
    importDataHelper.writecsvfiledict(variables.linclasstrainingsdatapath + challenge + ".csv", featurelist.keys(), featurelist)
    clf = RidgeClassifier().fit(X, y)
    print(clf.score(X, y))  # accuracy on the training data itself
    return clf, clf.coef_
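# A minimal usage sketch (hypothetical: it assumes the stored feature CSV for
# "bionicRadar" exists under variables.linclasstrainingsdatapath and that
# scikit-learn >= 1.0 is installed, so the fitted model exposes the names of
# its input columns):
def demo_train_linear_classificator():
    clf, coefficients = train_linear_classificator("bionicRadar")
    # RidgeClassifier stores a single weight row for a binary problem, so
    # coefficients[0] lines up with the feature columns.
    for name, weight in zip(clf.feature_names_in_, coefficients[0]):
        print(name, weight)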
def eval_all():
    # Evaluate each challenge twice: trained on a 2/3 split of its own ideas,
    # and trained on all other challenges plus that split ("All" runs); both
    # setups are repeated with duplicate-filtered training data.
    challengedict = {
        "TCO": list(importDataHelper.readcsvdata("Data/DBs/ideaDB/TCO.csv")),
        "bionicRadar": list(importDataHelper.readcsvdata("Data/DBs/ideaDB/bionicRadar.csv")),
        "fabricDisplay": list(importDataHelper.readcsvdata("Data/DBs/ideaDB/fabricDisplay.csv")),
    }
    dupdict = {}
    for key in challengedict.keys():
        # Training pool for the "All" runs: every idea from the other
        # challenges plus the training split of the current one.
        idealist = []
        for key2 in challengedict.keys():
            if key2 != key:  # compare by value, not identity
                idealist += challengedict[key2].copy()
        X_train, X_test = train_test_split(challengedict[key], test_size=0.33)
        idealist += X_train.copy()

        X_ndtrain = duplicateDetection.filterduplikates(
            X_train, variables.resultpath + "eval2" + key + ".csv")
        dupdict[key] = len(X_train) - len(X_ndtrain)
        # Only the training data is duplicate-filtered; the test sets stay
        # identical so both runs are scored on the same ideas.
        X_ndtest = X_test.copy()
        idealist_test = X_test.copy()
        idealist_nodups_test = X_test.copy()
        results = evaluate_system(X_train, X_test, key)
        importDataHelper.writecsvfiledict(
            "Data/ResultsAllNew/evaluation" + key + ".csv", results.keys(),
            results)
        print("Done first set")
        results2 = evaluate_system(X_ndtrain, X_ndtest, key, dups=True)
        importDataHelper.writecsvfiledict(
            "Data/ResultsAllNew/evaluationResultsNoDups" + key + ".csv",
            results2.keys(), results2)
        print("Challenge training done", key)

        idealist_nodups = duplicateDetection.filterduplikates(
            idealist, variables.resultpath + "eval" + key + ".csv")
        dupdict[key + " All"] = len(idealist) - len(idealist_nodups)
        results = evaluate_system(idealist, idealist_test)
        importDataHelper.writecsvfiledict(
            "Data/ResultsAllNew/evaluationAll" + key + ".csv", results.keys(),
            results)
        print("Done first set")
        results2 = evaluate_system(idealist_nodups,
                                   idealist_nodups_test,
                                   dups=True)
        importDataHelper.writecsvfiledict(
            "Data/ResultsAllNew/evaluationResultsNoDupsAll" + key + ".csv",
            results2.keys(), results2)
        print("All training done", key)
    print(dupdict)
    importDataHelper.writecsvfiledict("Data/ResultsAllNew/dupNums.csv",
                                      dupdict.keys(), dupdict)
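# Side note: train_test_split also accepts plain Python lists of dicts, which
# is exactly how eval_all() splits each challenge's ideas. A self-contained
# toy check (the descriptions are made up):
def demo_list_split():
    toy = [{"DESCRIPTION": "idea %d" % i} for i in range(10)]
    train, test = train_test_split(toy, test_size=0.33, random_state=42)
    print(len(train), len(test))  # -> 6 4 (the test share is rounded up)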
def extend_noun_corpus():
    # Collect nouns from newly imported ideas and register unseen ones in
    # the noun corpus as "unclassified" (read_noun_corpus, prepare_tagger
    # and get_Nouns are helpers defined elsewhere in the project).
    idealist = list(
        importDataHelper.readcsvdata(variables.importpathclassified +
                                     'cscw19-unapproved-ideas_import.csv'))
    nouncorpus = read_noun_corpus()
    unigram_tagger = prepare_tagger()
    for idea in idealist:
        nouns = get_Nouns(idea['DESCRIPTION'], unigram_tagger)
        for noun in nouns:
            if noun not in nouncorpus:
                nouncorpus[noun] = "unclassified"
    importDataHelper.writecsvfiledict(variables.dbpath + 'NLPdata/NounDB.csv',
                                      nouncorpus.keys(), nouncorpus)
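# For reference, a self-contained sketch of noun extraction in the spirit of
# get_Nouns (a hypothetical stand-in: the real tagger comes from
# prepare_tagger() and may be built differently). Requires the nltk "brown"
# and "punkt" data packages:
def demo_extract_nouns(text):
    import nltk
    from nltk.corpus import brown
    tagger = nltk.UnigramTagger(brown.tagged_sents(categories="news"))
    # Brown-corpus noun tags all start with "NN" (NN, NNS, NN-TL, ...).
    return [word for word, tag in tagger.tag(nltk.word_tokenize(text))
            if tag is not None and tag.startswith("NN")]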
def evaluate_fun():
    # Single-challenge variant of eval_all(): evaluate on bionicRadar with
    # and without duplicate filtering of the training data.
    idealist = list(
        importDataHelper.readcsvdata("Data/DBs/ideaDB/bionicRadar.csv"))
    X_train, X_test = train_test_split(idealist, test_size=0.33)
    X_ndtrain = duplicateDetection.filterduplikates(
        X_train, variables.resultpath + "evalbionicRadar.csv")
    X_ndtest = X_test.copy()
    results = evaluate_system(X_train, X_test, "bionicRadar")
    importDataHelper.writecsvfiledict(
        "Data/ResultsNew/evaluationResultsbionicRadar.csv", results.keys(),
        results)
    print("Done first set")
    results2 = evaluate_system(X_ndtrain, X_ndtest, "bionicRadar", dups=True)
    importDataHelper.writecsvfiledict(
        "Data/ResultsNew/evaluationResultsNoDupsbionicRadar.csv",
        results2.keys(), results2)

    print("Done")
def classify_noun_corpus():
    # Interactive labeling loop: mark each unclassified noun as concrete
    # ("C") or abstract ("A"). Any answer other than y/n/skip saves the
    # corpus and stops.
    nouncorpus = read_noun_corpus()
    print("Enter 'y' if the noun is concrete, 'n' if it is abstract, "
          "'skip' to skip the word, or 'stop' to save the results and stop classifying")
    for noun in nouncorpus.keys():
        if "unclassified" in nouncorpus[noun]:
            answer = input(noun + ": ")
            if 'y' in answer:
                nouncorpus[noun] = "C"
            elif 'n' in answer:
                nouncorpus[noun] = "A"
            elif 'skip' in answer:
                pass
            else:
                importDataHelper.writecsvfiledict(
                    variables.dbpath + 'NLPdata/NounDB.csv', nouncorpus.keys(),
                    nouncorpus)
                break
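# classify_noun_corpus() is driven by input(), so it can be exercised without
# a console by temporarily shadowing the builtin. A test-only sketch; the
# canned answers are hypothetical:
def demo_classify_noun_corpus(canned_answers=("y", "n", "skip", "stop")):
    import builtins
    answers = iter(canned_answers)
    original_input = builtins.input
    builtins.input = lambda prompt="": next(answers)
    try:
        classify_noun_corpus()
    finally:
        builtins.input = original_input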
def trainbayes(idealist, challenge=None, delete=False, duplicates=False):
    # Update (or, with delete=True, rebuild from scratch) the per-token
    # spam/ham counts. The "<IdeaCount>" sentinel entry tracks how many
    # ideas contributed to each table and is re-added before writing.
    if delete:
        bayesspamwords = {}
        bayeshamwords = {}
    else:
        bayesspamwords = getspamtokens(challenge, duplicates)
        bayeshamwords = gethamtokens(challenge, duplicates)
    nspam = int(bayesspamwords.pop("<IdeaCount>", 0))
    nham = int(bayeshamwords.pop("<IdeaCount>", 0))
    for idea in idealist:
        if idea.get("STATUS", "") == "unusable":
            bayesspamwords = updatedb(idea['DESCRIPTION'], bayesspamwords)
            nspam += 1
        elif idea.get("STATUS", "") == "usable":
            nham += 1
            bayeshamwords = updatedb(idea['DESCRIPTION'], bayeshamwords)
        elif idea.get("STATUS", "") == "unreviewed" and "spam" in idea.get(
                'SPAM', ""):
            bayesspamwords = updatedb(idea['DESCRIPTION'], bayesspamwords)
            nspam += 1
        elif idea.get("STATUS", "") == "unreviewed" and "ham" in idea.get(
                'SPAM', ""):
            nham += 1
            bayeshamwords = updatedb(idea['DESCRIPTION'], bayeshamwords)
#        if "spam" in idea.get('SPAM', "") or "unusable" in idea.get("STATUS"):
#            bayesspamwords = updatedb(idea['DESCRIPTION'], bayesspamwords)
#            nspam += 1
#        else:
#            nham += 1
#            bayeshamwords = updatedb(idea['DESCRIPTION'], bayeshamwords)
    bayesspamwords["<IdeaCount>"] = nspam
    bayeshamwords["<IdeaCount>"] = nham
    # Persist the updated token tables and recomputed probabilities. Models
    # trained on duplicate-filtered data get a "duplicateBayes" file prefix;
    # challenge-specific models live in a per-challenge folder.
    if challenge is None:
        basepath = variables.complexbayesmixedpath
    else:
        basepath = variables.complexbayeschallengebasedpath + challenge
        if not os.path.exists(basepath):
            try:
                os.mkdir(basepath)
            except OSError:
                print("Path for challenge does not exist and could not be created")
        basepath += '/'
    prefix = 'duplicateBayes' if duplicates else 'bayes'
    importDataHelper.writecsvfiledict(basepath + prefix + 'SpamToken.csv',
                                      bayesspamwords.keys(), bayesspamwords)
    importDataHelper.writecsvfiledict(basepath + prefix + 'HamToken.csv',
                                      bayeshamwords.keys(), bayeshamwords)
    probslist = calculateprobs(bayesspamwords, bayeshamwords, nspam, nham)
    importDataHelper.writecsvfiledict(basepath + prefix + 'TokenProbs.csv',
                                      probslist.keys(), probslist)
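# Usage sketch with toy ideas (the configured variables.* folders must exist;
# delete=True rebuilds the token tables from scratch instead of extending the
# stored counts):
def demo_trainbayes():
    toy_ideas = [
        {"DESCRIPTION": "buy cheap pills now", "STATUS": "unusable"},
        {"DESCRIPTION": "a radar belt guiding blind users", "STATUS": "usable"},
        {"DESCRIPTION": "win a free prize", "STATUS": "unreviewed", "SPAM": "spam"},
    ]
    trainbayes(toy_ideas, challenge="demo", delete=True)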
def evaluate_system(X_train, X_test, challenge=None, dups=False):
    # Train all four classifiers (two Bayes variants, the linear classifier
    # and the USE model) on X_train, then collect their per-idea predictions
    # on X_test next to the ground-truth labels.
    unigram_tagger, st = spamFilter.prepare_tagger()
    features = {}
    data = {"DESCRIPTION": [], "Spam": []}
    for idea in X_train:
        idea, feature = spamFilter.classify_and_get_idea(
            idea, unigram_tagger, st)
        data["DESCRIPTION"].append(idea["DESCRIPTION"])
        for key, value in feature.items():
            features.setdefault(key, []).append(value)
        if idea["STATUS"] == "unusable":
            features["Spam"] = features.get("Spam", [])
            features["Spam"].append(1)
            data["Spam"].append(1)
        elif idea["STATUS"] == "usable":
            features["Spam"] = features.get("Spam", [])
            features["Spam"].append(0)
            data["Spam"].append(0)
        elif "spam" in idea["SPAM"]:
            features["Spam"] = features.get("Spam", [])
            features["Spam"].append(1)
            data["Spam"].append(1)
        else:
            features["Spam"] = features.get("Spam", [])
            features["Spam"].append(0)
            data["Spam"].append(0)

    print("prepared")
    bayes.trainbayes(X_train,
                     challenge=challenge,
                     delete=True,
                     duplicates=dups)
    complexBayes.trainbayes(X_train,
                            challenge=challenge,
                            delete=True,
                            duplicates=dups)

    wordprobs = bayes.gettokenprobs(challenge=challenge, duplicates=dups)
    comwordprobs = complexBayes.gettokenprobs(challenge=challenge,
                                              duplicates=dups)

    linClass, coeff = linearClassifier.train_linear_classifier(features)
    useest = USEClassifier.train_classifier_idealist(pd.DataFrame(data))
    print(coeff)

    actual = []
    predbay = []
    predcombay = []
    predUSE = []
    predLin = []
    features = {}  # reused to record which filter features fire on the test set
    for idea in X_test:
        if idea["STATUS"] != "unreviewed":
            actual.append(idea["STATUS"] == "unusable")
        else:
            actual.append("spam" in idea["SPAM"])
        predbay.append(bayes.classify(idea["DESCRIPTION"], wordprobs))
        predcombay.append(
            complexBayes.classify(idea["DESCRIPTION"], comwordprobs))
        if not idea["DESCRIPTION"]:
            # Nothing to embed: default to (ham, zero confidence).
            predUSE.append((0, 0.0))
        else:
            predUSE.append(
                USEClassifier.classify(useest,
                                       {"DESCRIPTION": idea["DESCRIPTION"]}))
        idea["TRIGGERED"] = idea.get("TRIGGERED", [])
        idea, ideadata = spamFilter.classify_and_get_idea(
            idea, unigram_tagger, st)
        # Track whether any filter feature fired; if none did, the linear
        # classifier is skipped below and (ham, 0.0) is predicted instead.
        any_triggered = False
        for ideakey in ideadata.keys():
            if ideadata[ideakey] == 1:
                features.setdefault(ideakey, []).append(1)
                any_triggered = True
            else:
                features.setdefault(ideakey, []).append(0)
            # linearClassifier.classify expects column lists, so wrap each
            # scalar feature value in a one-element list.
            ideadata[ideakey] = [ideadata[ideakey]]

        if any_triggered:
            predLin.append(
                linearClassifier.classify(pd.DataFrame(ideadata), linClass))
        else:
            predLin.append((0, 0.0))
    results = {
        "actual": actual,
        "bayes": predbay,
        "complexbayes": predcombay,
        "USE": predUSE,
        "linCLassifier": predLin,
        "linClassCo": coeff,
        "Filter": features
    }
    importDataHelper.writecsvfiledict(
        variables.resultpath + "evaluationResults.csv", results.keys(),
        results)
    return results
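# A hedged post-processing sketch: results["actual"] holds booleans (True =
# spam), so any prediction column can be scored against it once reduced to
# booleans. The exact return types of bayes.classify and friends are not
# visible in this file, hence the to_bool hook (an assumption to adapt per
# classifier):
def demo_score(results, column="bayes", to_bool=bool):
    actual = results["actual"]
    predicted = [to_bool(p) for p in results[column]]
    correct = sum(a == p for a, p in zip(actual, predicted))
    print(column, "accuracy:", correct / len(actual))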