def trainClassifiers():
    argv = sys.argv[1:]
    extractFeatures(argv[0])
    trainXGBoost(FEATURES)
    print('\n\n')
    trainRandomForrest(FEATURES)
def applyRules(IDsFilename):
    """Uses rule based approach to classify the reviews from the given set."""
    print("Using the set at '{path}{file}'".format(path=CORPUS_PATH, file=IDsFilename))
    print("Creating reviews...(this may take a while)")
    dataSet = Corpus(IDsFilename, corpusPath=CORPUS_PATH)
    # print("Loading reviews...")
    # dataSet = Corpus.loadCorpus(filename="training_set.pk")
    print("Extracting features...")
    features, featureVectors = extractFeatures(dataSet.reviewIDs, dataSet.reviews)
    gold = dataSet.goldStandard
    classification = classify(features, featureVectors)
    showFeatureOccurrence(features, featureVectors, gold, classification)
    targets = []
    cls = []
    for ID, g in gold.items():
        targets.append(g)
        cls.append(classification[ID])
    showPerformance(targets, cls)
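# showPerformance() is referenced above but not defined in this listing. A minimal
# sketch of what such a helper could report for the binary gold/predicted label
# lists built above, using scikit-learn metrics (the helper name and output format
# are assumptions, not the original implementation):
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def showPerformanceSketch(targets, predictions):
    """Print accuracy, precision, recall and F1 for a binary classification run."""
    precision, recall, f1, _ = precision_recall_fscore_support(
        targets, predictions, average="binary")
    print("Accuracy:  {0:.3f}".format(accuracy_score(targets, predictions)))
    print("Precision: {0:.3f}  Recall: {1:.3f}  F1: {2:.3f}".format(precision, recall, f1))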
def showFeatures(IDsFilename=REVIEW_IDS_FILENAME):
    corpus = Corpus(IDsFilename)
    features, featureVectors = extractFeatures(corpus.reviewIDs, corpus.reviews,
                                               features=None)
    showFeatureOccurrence(features, featureVectors)
def testARFFExport():
    """Tests the functionality to export the features as a valid ARFF file."""
    ironicIDs, regularIDs, reviews = createTestReviews()
    reviewIDs = ironicIDs + regularIDs
    # for review in reviews.values():
    #     print(review)
    features, featureVectors = extractFeatures(reviewIDs, reviews,
                                               features=None, createARFF=True)
def main():
    argv = sys.argv[1:]
    # print(argv)
    try:
        opts, args = getopt.getopt(argv, 'i:o:c:', [])
        print(opts)
    except getopt.GetoptError as err:
        print(str(err))
        sys.exit(2)
    inDir = ''
    outDir = ''
    classifier = ''
    for opt, arg in opts:
        if opt == '-i':
            if not os.path.exists(arg):
                print('Aborting, can not find infile:', arg)
                sys.exit(2)
            inDir = arg
        elif opt == '-o':
            outDir = arg
        else:
            if arg not in ('xgboost', 'randomforrest'):
                print('Aborting, invalid classifier must be "xgboost" or "randomforrest"')
                sys.exit(2)
            classifier = arg
    extractFeatures(inDir)
    if classifier == 'randomforrest':
        if not os.path.exists(RANDOM_FORREST_CLASSIFIER):
            print('Could not find trained RandomForrestClassifier')
            sys.exit(2)
        testRandomForrest('features.csv', outDir)
    if classifier == 'xgboost':
        if not os.path.exists(XGBOOST_MODEL):
            print('Can not find trained XGBoost model')
            sys.exit(2)
        testXGBoost('features.csv', outDir)
def testRules():
    """Uses rule based approach to classify reviews."""
    ironicIDs, regularIDs, reviews = createTestReviews()
    features, featureVectors = extractFeatures(ironicIDs + regularIDs, reviews)
    gold = {ID: reviews[ID].ironic for ID in ironicIDs + regularIDs}
    classification = ruleClassify(features, featureVectors)
    showFeatureOccurrence(features, featureVectors, gold, classification)
    showPerformance(gold, classification)
def applySingleRules(IDsFilename):
    """
    Originally intended to apply just one rule. It is now used to apply one
    feature at a time to the given corpus, i.e. it shows how often each feature
    occurs in ironic and regular reviews.
    """
    print("Using the set at '{path}{file}'".format(path=CORPUS_PATH, file=IDsFilename))
    print("Creating reviews...(this may take a while)")
    dataSet = Corpus(IDsFilename, corpusPath=CORPUS_PATH)
    print("Loading reviews...")
    # dataSet = Corpus.loadCorpus(filename="training_set.pk")
    # dataSet = Corpus.loadCorpus(filename="training_and_validation_set.pk")
    print("Extracting features...")
    features, featureVectors = extractFeatures(dataSet.reviewIDs, dataSet.reviews)
    showFeatureOccurrence(features, featureVectors)
    gold = dataSet.goldStandard
    # decisiveFeatureNames = ["Scare quotes",
    #                         "Positive star polarity discrepancy",
    #                         "Negative star polarity discrepancy",
    #                         "Positive Ppunctuation",
    #                         "Negative Ppunctuation",
    #                         "Streak of Positive Words",
    #                         "Ellipsis and Punctuation",
    #                         "Emoticon Happy", "Emoticon Laughing",
    #                         "Emoticon Winking", "Emotion Tongue",
    #                         "LoLAcroym", "GrinAcronym", "Onomatopoeia",
    #                         "Interrobang"]
    decisiveFeatureNames = [f.name for f in features]
    for d in decisiveFeatureNames:
        classification = classify(features, featureVectors, [d])
        targets = []
        cls = []
        for ID, g in gold.items():
            targets.append(g)
            cls.append(classification[ID])
        print("\nClassifying by rule: ", d)
        showPerformance(targets, cls)
def evaluateClassifiers():
    argv = sys.argv[1:]
    # If dir is specified recompute features
    top_features = 0
    feat_selection = []
    if len(argv) == 1:
        if argv[0].isdigit():
            top_features = int(argv[0])
        else:
            extractFeatures(argv[0])
    if len(argv) == 2:
        top_features = int(argv[0])
        extractFeatures(argv[1])
    # Load features and split in test/train
    data = pd.read_csv(FEATURES)
    train, test = train_test_split(data, test_size=0.2, random_state=1)
    # Write as training and testing expect to read from file
    train.to_csv(EVAL_TRAIN)
    test.to_csv(EVAL_TEST)
    # Feature selection
    if top_features != 0:
        features = pd.read_csv('eval_forrest_feature_importance.csv',
                               names=['feature', 'importance'], skiprows=1)
        features.sort_values(by=['importance'], ascending=False, inplace=True)
        feat_selection = features['feature'][:top_features].as_matrix()
        print("USED TOP {} FEATURES".format(len(feat_selection)), feat_selection)
    # Evaluate both classifiers
    evaluateRandomForrest(test, train, feat_selection)
    evaluateXGBoost(test, train, feat_selection)
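# The feature-selection branch above reads 'eval_forrest_feature_importance.csv',
# which is produced elsewhere. A minimal sketch of how such a ranking could be
# written from a fitted scikit-learn random forest; the helper name, label column
# argument and file layout are assumptions chosen to match the read_csv call above:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

def writeFeatureImportance(train_df, label_column,
                           out_path='eval_forrest_feature_importance.csv'):
    """Fit a forest on the training frame and persist per-feature importances."""
    X = train_df.drop(columns=[label_column])
    y = train_df[label_column]
    forest = RandomForestClassifier(n_estimators=100, random_state=1).fit(X, y)
    ranking = pd.DataFrame({'feature': X.columns,
                            'importance': forest.feature_importances_})
    ranking.sort_values(by='importance', ascending=False).to_csv(out_path, index=False)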
def createARFF(class1, class2, arff_path, corpus_path=CORPUS_PATH + TRAIN_PATH):
    """
    Define features by their number (see features: feature_names) and create
    an ARFF file.
    """
    corpus = Corpus(class1, class2, corpusPath=corpus_path)
    allConfig = range(5)
    featureConfigs = []
    # mode, feat, regEx, new_stack, bigram, sentiment, configuration
    featureConfigs.append(("specific", True, False, False, False, False, allConfig,
                           "features_all_specific" + "_" + class1 + "_vs_" + class2))
    # combine each feature with each feature, e.g.
    # IF stopword = true AND negation = true THEN combi = true.
    binary_combination = False
    # createARFF -> the ARFF file is created; "" -> it is not created.
    for mode, feat, regExp, new_stack, bigram, sentiment, config, createARFF in featureConfigs:
        createARFF_file = createARFF
        features, featureVectors = extractFeatures(
            class1, class2, mode, arff_path, corpus.class1IDs + corpus.class2IDs,
            corpus.tweets, config, feat, regExp, new_stack, binary_combination,
            sentiment, bigram, createARFF_file)
def process_audio(request):
    file = request.FILES['speech']
    fs = FileSystemStorage()
    filename = "temp_%d.wav" % time.time()
    filename = fs.save(filename, file)
    feats = extractFeatures("%s/%s" % (fs.location, filename), scmc=True)
    f_label = gmm.test(feats, 256, settings.GMM_ROOT)
    # r = settings.RECOGNIZER
    # with sr.AudioFile("%s/%s" % (fs.location, filename)) as source:
    #     audio = r.record(source)
    # transcript, score = r.recognize_sphinx(audio)
    ler, transcript = LER("%s/%s" % (fs.location, filename), request.POST['transcript'])
    r_label = ler < settings.RECOGNITION_THRESH
    context = {
        "label": "passed" if r_label and f_label else "failed",
        "transcript": transcript,
        "score": ler,
    }
    resp = json.dumps(context)
    # return render(resp, content_type='application/json')
    return render(request, "basic_index.html", context)
import pandas
import features
import classify
import cluster
import itertools

INPUT_FILE = "input.log"
FEATURES_FILE = "features.csv"
RESULT_FILE = "clusterResult.csv"
EXTRACTED_SECTIONS_FILE = "detectedSections.log"
LABEL_COLUMN_NAME = "label"
LINE_NUMBER_COLUMN_NAME = "line_number"

features.extractFeatures(inputFile=INPUT_FILE, outputFile=FEATURES_FILE)
unlabelledSet = pandas.read_csv(FEATURES_FILE, skipinitialspace=True, header=0)
unlabelledSet[LABEL_COLUMN_NAME] = pandas.DataFrame(
    classify.predict(unlabelledSet), columns=[LABEL_COLUMN_NAME])
# unlabelledSet[LABEL_COLUMN_NAME] = pandas.read_csv("labels.csv", skipinitialspace=True, header=0)
labelledSet = unlabelledSet
# classify.train(labelledSet, 1000)
# classify.evaluate(labelledSet, 1)
type10 = cluster.kMeans(10, labelledSet, columnPrefix="type")
type100 = cluster.kMeans(100, labelledSet, columnPrefix="type")
type1000 = cluster.kMeans(1000, labelledSet, columnPrefix="type")
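# RESULT_FILE is declared above but not used in this excerpt. A minimal sketch of
# how the labelled/clustered frame could be persisted, assuming cluster.kMeans()
# adds its "type..." assignment columns to the frame (an assumption, not the
# original pipeline code):
labelledSet.to_csv(RESULT_FILE, index=False)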
# extract the data from csv
const.M_names, const.F_names, const.name_map = dataParser.get_data()

# extract the training/test sets
const.ngram_training_set = extract(config.data_extraction_size)
const.training_set = extract(config.training_size, labelled=True)
const.cv_set = extract(config.cv_set_size, labelled=True)
const.test_set = extract(config.test_set_size, labelled=True)

# determine most common ngrams
getCommonGrams(const.ngram_training_set)
get_suffixes(const.ngram_training_set)
const.featureCount += (config.di_num + config.tri_num + config.last_letters
                       + config.di_sufnum + config.tri_sufnum)

const.X_train, const.y_train = extractFeatures(const.training_set)
const.X_cv, const.y_cv = extractFeatures(const.cv_set)
const.X_test, const.y_test = extractFeatures(const.test_set)

# --CLASSIFIER-- #
X_train = torch.stack([torch.tensor(i) for i in const.X_train])
y_train = torch.from_numpy(const.y_train)
X_cv = torch.stack([torch.tensor(i) for i in const.X_cv])
y_cv = torch.from_numpy(const.y_cv)
X_test = torch.stack([torch.tensor(i) for i in const.X_test])
y_test = torch.from_numpy(const.y_test)

training_set = data.TensorDataset(X_train, y_train)
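# A minimal sketch of how the TensorDataset built above could be batched for
# training, assuming torch.utils.data is imported as `data` (the batch size is an
# illustrative value, not taken from the original config):
train_loader = data.DataLoader(training_set, batch_size=64, shuffle=True)
for X_batch, y_batch in train_loader:
    # forward pass, loss computation and backward pass would go here
    pass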
def applyML2(trainingSetFilename, testSetFilename=None, setPath=CORPUS_PATH):
    """
    Uses machine learning approach to classify sentences.
    Implements a truly simple 'Leave One Out' function.
    """
    # TODO: Add condition to create corpus, if no file exists.
    print("Training the classifiers using the set at '{path}{file}'".format(
        path=setPath, file=trainingSetFilename))
    # trainingSet = Corpus(trainingSetFilename, corpusPath=CORPUS_PATH)
    # trainingSet = Corpus.loadCorpus(filename=trainingSetFilename)
    # trainingSet = Corpus.loadCorpus(filename="training_and_validation_set.pk")
    trainingSet = Corpus.loadCorpus(filename="shuffled_set.pk")
    # for each in trainingSet.reviewIDs[0:10]:
    #     print(each)
    # print()
    # Get the ids - which are ordered ironic, regular - and shuffle them.
    ids = trainingSet.reviewIDs
    random.seed(44)
    random.shuffle(ids)
    # for each in ids[0:10]:
    #     print(each)
    # print()
    # If the -new flag is not set OR there is no file to load, recreate the corpus.
    print("Extracting features...")
    # trainFeatures, trainFeatureVectors = extractFeatures(trainingSet.reviewIDs,
    #                                                      trainingSet.reviews)
    featureConfig = {
        "minus Imba": {
            u"Positive Quotes": (u"\"..\"", scareQuotes),
            u"Negative Quotes": (u"\"--\"", scareQuotesNegative),
            u"Pos&Punctuation": (u"w+!?", positiveNGramPlusPunctuation),
            u"Neg&Punctuation": (u"w-!?", negativeNGramPlusPunctuation),
            u"Positive Hyperbole": (u"3w+", positiveStreak),
            u"Negative Hyperbole": (u"3w-", negativeStreak),
            u"Ellipsis and Punctuation": (u"..?!", ellipsisPlusPunctuation),
            u"Positive&Ellipsis": (u"w+..", lambda x: positiveNGramPlusPunctuation(
                x, pattern=r"(\.\.|\. \. \.)$")),
            u"Negative&Ellipsis": (u"w-..", lambda x: negativeNGramPlusPunctuation(
                x, pattern=r"(\.\.|\. \. \.)$")),
        },
        "minus Quotes": {
            u"Positive Imbalance": (u"w-\u2605 ", posStarPolarityDiscrepancy),
            u"Negative Imbalance": (u"w+\u2606 ", negStarPolarityDiscrepancy),
            u"Pos&Punctuation": (u"w+!?", positiveNGramPlusPunctuation),
            u"Neg&Punctuation": (u"w-!?", negativeNGramPlusPunctuation),
            u"Positive Hyperbole": (u"3w+", positiveStreak),
            u"Negative Hyperbole": (u"3w-", negativeStreak),
            u"Ellipsis and Punctuation": (u"..?!", ellipsisPlusPunctuation),
            u"Positive&Ellipsis": (u"w+..", lambda x: positiveNGramPlusPunctuation(
                x, pattern=r"(\.\.|\. \. \.)$")),
            u"Negative&Ellipsis": (u"w-..", lambda x: negativeNGramPlusPunctuation(
                x, pattern=r"(\.\.|\. \. \.)$")),
        },
        "minus Pos/Neg&Punctuation": {
            u"Positive Imbalance": (u"w-\u2605 ", posStarPolarityDiscrepancy),
            u"Negative Imbalance": (u"w+\u2606 ", negStarPolarityDiscrepancy),
            u"Positive Quotes": (u"\"..\"", scareQuotes),
            u"Negative Quotes": (u"\"--\"", scareQuotesNegative),
            u"Positive Hyperbole": (u"3w+", positiveStreak),
            u"Negative Hyperbole": (u"3w-", negativeStreak),
            u"Ellipsis and Punctuation": (u"..?!", ellipsisPlusPunctuation),
            u"Positive&Ellipsis": (u"w+..", lambda x: positiveNGramPlusPunctuation(
                x, pattern=r"(\.\.|\. \. \.)$")),
            u"Negative&Ellipsis": (u"w-..", lambda x: negativeNGramPlusPunctuation(
                x, pattern=r"(\.\.|\. \. \.)$")),
        },
        "minus Hyperbole": {
            u"Positive Imbalance": (u"w-\u2605 ", posStarPolarityDiscrepancy),
            u"Negative Imbalance": (u"w+\u2606 ", negStarPolarityDiscrepancy),
            u"Positive Quotes": (u"\"..\"", scareQuotes),
            u"Negative Quotes": (u"\"--\"", scareQuotesNegative),
            u"Pos&Punctuation": (u"w+!?", positiveNGramPlusPunctuation),
            u"Neg&Punctuation": (u"w-!?", negativeNGramPlusPunctuation),
            u"Ellipsis and Punctuation": (u"..?!", ellipsisPlusPunctuation),
            u"Positive&Ellipsis": (u"w+..", lambda x: positiveNGramPlusPunctuation(
                x, pattern=r"(\.\.|\. \. \.)$")),
            u"Negative&Ellipsis": (u"w-..", lambda x: negativeNGramPlusPunctuation(
                x, pattern=r"(\.\.|\. \. \.)$")),
        },
        "minus Ellipsis and Punctuation": {
            u"Positive Imbalance": (u"w-\u2605 ", posStarPolarityDiscrepancy),
            u"Negative Imbalance": (u"w+\u2606 ", negStarPolarityDiscrepancy),
            u"Positive Quotes": (u"\"..\"", scareQuotes),
            u"Negative Quotes": (u"\"--\"", scareQuotesNegative),
            u"Pos&Punctuation": (u"w+!?", positiveNGramPlusPunctuation),
            u"Neg&Punctuation": (u"w-!?", negativeNGramPlusPunctuation),
            u"Positive Hyperbole": (u"3w+", positiveStreak),
            u"Negative Hyperbole": (u"3w-", negativeStreak),
            u"Positive&Ellipsis": (u"w+..", lambda x: positiveNGramPlusPunctuation(
                x, pattern=r"(\.\.|\. \. \.)$")),
            u"Negative&Ellipsis": (u"w-..", lambda x: negativeNGramPlusPunctuation(
                x, pattern=r"(\.\.|\. \. \.)$")),
        },
        "minus Pos/Neg&Ellipsis": {
            u"Positive Imbalance": (u"w-\u2605 ", posStarPolarityDiscrepancy),
            u"Negative Imbalance": (u"w+\u2606 ", negStarPolarityDiscrepancy),
            u"Positive Quotes": (u"\"..\"", scareQuotes),
            u"Negative Quotes": (u"\"--\"", scareQuotesNegative),
            u"Pos&Punctuation": (u"w+!?", positiveNGramPlusPunctuation),
            u"Neg&Punctuation": (u"w-!?", negativeNGramPlusPunctuation),
            u"Positive Hyperbole": (u"3w+", positiveStreak),
            u"Negative Hyperbole": (u"3w-", negativeStreak),
            u"Ellipsis and Punctuation": (u"..?!", ellipsisPlusPunctuation),
        },
        "minus Pos": {
            u"Negative Imbalance": (u"w+\u2606 ", negStarPolarityDiscrepancy),
            u"Negative Quotes": (u"\"--\"", scareQuotesNegative),
            u"Neg&Punctuation": (u"w-!?", negativeNGramPlusPunctuation),
            u"Negative Hyperbole": (u"3w-", negativeStreak),
            u"Ellipsis and Punctuation": (u"..?!", ellipsisPlusPunctuation),
            u"Negative&Ellipsis": (u"w-..", lambda x: negativeNGramPlusPunctuation(
                x, pattern=r"(\.\.|\. \. \.)$")),
        },
        "minus Neg": {
            u"Positive Imbalance": (u"w-\u2605 ", posStarPolarityDiscrepancy),
            u"Positive Quotes": (u"\"..\"", scareQuotes),
            u"Pos&Punctuation": (u"w+!?", positiveNGramPlusPunctuation),
            u"Positive Hyperbole": (u"3w+", positiveStreak),
            u"Ellipsis and Punctuation": (u"..?!", ellipsisPlusPunctuation),
            u"Positive&Ellipsis": (u"w+..", lambda x: positiveNGramPlusPunctuation(
                x, pattern=r"(\.\.|\. \. \.)$")),
        },
    }
    for name, config in featureConfig.items():
        print("\n" * 5, name)
        print("-" * 60)
        for each in config:
            print(each)
        print()
        trainFeatures, trainFeatureVectors = extractFeatures(ids, trainingSet.reviews, config)
        trainTargets = []
        trainData = []
        trainGold = trainingSet.goldStandard
        for ID, g in trainGold.items():
            trainTargets.append(g)
            trainData.append(trainFeatureVectors[ID])
        # for i, vec in enumerate(data):
        #     print(targets[i], " | ", vec)
        featureCount = sum([sum(v) for v in trainData])
        print("Feature found: ", featureCount, "times.")
        classifiers = [DecisionTreeClassifier(), SVC(kernel="linear"), SVC(),
                       LinearSVC(), MultinomialNB(), GaussianNB(),
                       RandomForestClassifier(), LogisticRegression()]
        # Cross validation
        if testSetFilename == None:
            for c in classifiers:
                applyCrossValidation(c, trainData, trainTargets)
            # scores = cross_validation.cross_val_score(classifier, array(data),
            #                                           array(targets), cv=10)
            # print(scores)
        else:
            print("Testing the classifiers using the set at '{path}{file}'".format(
                path=CORPUS_PATH, file=testSetFilename))
            testSet = Corpus(testSetFilename, corpusPath=CORPUS_PATH)
            # testSet = Corpus.loadCorpus(filename="test_set.pk")
            print("Extracting features...")
            testFeatures, testFeatureVectors = extractFeatures(testSet.reviewIDs,
                                                               testSet.reviews)
            testData = []
            testTargets = []
            testGold = testSet.goldStandard
            for ID, g in testGold.items():
                testTargets.append(g)
                testData.append(testFeatureVectors[ID])
            for c in classifiers:
                applyClassifier(c, trainData, trainTargets, testData, testTargets)
def exportFeatures():
    corpus = Corpus(SET_FILENAMES[3])
    features, featureVectors = extractFeatures(corpus.reviewIDs, corpus.reviews,
                                               features=None, createARFF=True)
def applyML(trainingSetFilename, testSetFilename=None, setPath=CORPUS_PATH):
    """Uses machine learning approach to classify sentences."""
    # TODO: Add condition to create corpus, if no file exists.
    print("Training the classifiers using the set at '{path}{file}'".format(
        path=setPath, file=trainingSetFilename))
    trainingSet = Corpus(trainingSetFilename, corpusPath=CORPUS_PATH)
    # trainingSet = Corpus.loadCorpus(filename="shuffled_set.pk")
    # for each in trainingSet.reviewIDs[0:10]:
    #     print(each)
    # print()
    # Get the ids - which are ordered ironic, regular - and shuffle them.
    ids = trainingSet.reviewIDs
    random.seed(44)
    random.shuffle(ids)
    # for each in ids[0:10]:
    #     print(each)
    # print()
    reviews = trainingSet.reviews
    if not testSetFilename == None:
        testSet = Corpus(testSetFilename, corpusPath=CORPUS_PATH)
        reviews = dict(trainingSet.reviews.items() + testSet.reviews.items())
        bowDictionary = createBagOfWordsDictionary(reviews)
    else:
        bowDictionary = None
    print("Extracting features...")
    trainFeatures, trainFeatureVectors = extractFeatures(ids, trainingSet.reviews,
                                                         bowDictionary=bowDictionary)
    trainTargets = []
    trainData = []
    stars = []
    trainGold = trainingSet.goldStandard
    for ID, g in trainGold.items():
        trainTargets.append(g)
        trainData.append(trainFeatureVectors[ID])
        stars.append(trainingSet.reviews[ID].stars)
    # for i, vec in enumerate(data):
    #     print(targets[i], " | ", vec)
    featureCount = sum([sum(v) for v in trainData])
    # print("Feature found: ", featureCount, "times.")
    trainTargets = array(trainTargets)
    trainData = array(trainData)
    classifiers = [DecisionTreeClassifier(), SVC(kernel="linear"), SVC(),
                   LinearSVC(), MultinomialNB(), GaussianNB(),
                   RandomForestClassifier(), LogisticRegression(),
                   MLPClassifier(hidden_layer_sizes=(15,), random_state=1,
                                 max_iter=1, warm_start=True)]
    # Cross validation
    if testSetFilename == None:
        for c in classifiers:
            applyCrossValidation(c, trainData, trainTargets)
            # Show star distribution for each classifier
            # applyCrossValidation(c, trainData, trainTargets, stars=stars)
        # scores = cross_validation.cross_val_score(classifier, array(data),
        #                                           array(targets), cv=10)
        # print(scores)
    else:
        print("Testing the classifiers using the set at '{path}{file}'".format(
            path=CORPUS_PATH, file=testSetFilename))
        # testSet = Corpus(testSetFilename, corpusPath=CORPUS_PATH)
        # testSet = Corpus.loadCorpus(filename="test_set.pk")
        # Create bag of words dictionary that contains words of all reviews
        # bowDictionary = createBagOfWordsDictionary(
        #     trainingSet.reviews + testSet.reviews)
        print("Extracting features...")
        testFeatures, testFeatureVectors = extractFeatures(testSet.reviewIDs,
                                                           testSet.reviews,
                                                           bowDictionary=bowDictionary)
        testData = []
        testTargets = []
        testGold = testSet.goldStandard
        for ID, g in testGold.items():
            testTargets.append(g)
            testData.append(testFeatureVectors[ID])
        testData = array(testData)
        testTargets = array(testTargets)
        for c in classifiers:
            applyClassifier(c, trainData, trainTargets, testData, testTargets)
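# applyCrossValidation() is called above but not defined in this listing. A minimal
# sketch, assuming it simply wraps scikit-learn's cross_val_score with 10 folds
# (the helper name and the reporting format are illustrative assumptions, not the
# original implementation):
import numpy as np
from sklearn.model_selection import cross_val_score

def applyCrossValidationSketch(classifier, data, targets, folds=10):
    """Run k-fold cross-validation and print the classifier's mean accuracy."""
    scores = cross_val_score(classifier, np.asarray(data), np.asarray(targets), cv=folds)
    print("{0}: mean accuracy {1:.3f} (+/- {2:.3f})".format(
        type(classifier).__name__, scores.mean(), scores.std()))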
def applyMachineLearning(class1, class2, randomSeed, arff_path,
                         trainingSetFilename, testSetFilename=None,
                         setPath=CORPUS_PATH):
    """Uses machine learning approach to classify sentences."""
    no_configs = [("irony", "figurative"), ("irony", "irony"),
                  ("sarcasm", "sarcasm"), ("sarcasm", "figurative"),
                  ("regular", "regular"), ("figurative", "irony"),
                  ("figurative", "sarcasm"), ("figurative", "figurative")]
    if (class1, class2) in no_configs:
        print "ERROR! Please use allowed combination of classes!"
        exit()

    ## ---------- feature configurations --------------------------------
    featureConfigs = []
    bowConfig = []
    bowBigramConfig = []
    allBinaryConfig = range(18, 40)
    allConfig = range(60)
    allConfig.extend(range(156, 157))
    allWithoutNumbers = range(18, 60)
    allWithoutNorm = range(40)
    allWithoutStacks = range(60)
    normConfig = range(40, 60)
    normConfig.extend(range(156, 157))
    numbersConfig = range(18)
    stacksConfig = range(60, 156)
    allWithoutBinary = range(18)
    allWithoutBinary.extend(range(40, 60))
    allWithoutBoW = range(60)
    allWithoutBowBigram = range(60)
    allWithoutBigrams = range(60)

    # top10 - evaluated with weka chi^2-test
    if class1 == "irony" and class2 == "sarcasm":
        top10Config = [43, 57, 52, 156, 44, 56, 51, 42, 50, 49]
    elif class1 == "irony" and class2 == "regular":
        top10Config = [35, 54, 45, 56, 50, 42, 52, 44]
    elif class1 == "sarcasm" and class2 == "regular":
        top10Config = [35, 54, 45, 50, 52, 42, 57, 56]
    else:
        top10Config = []

    # ablation study 1
    # number = [0,5], norm = [42,43,44], binary = [18,20,21,22,23,24,25,26,28,31,34,36],
    # stack = [70,71,72,73,74,75,76,78,79,80,81,82,83,84,155]
    sentimentConfig = [0, 5, 42, 43, 44, 18, 20, 21, 22, 23, 24, 25, 26, 28, 31, 34, 36,
                       70, 71, 72, 73, 74, 75, 76, 78, 79, 80, 81, 82, 83, 84, 155]
    # number = [4], norm = [59], binary = [], stack = [150,151,152,153,154]
    subjConfig = [4, 59, 150, 151, 152, 153, 154]
    # number = [1,13,16], norm = [52,56,156], binary = [29],
    # stack = [115,116,117,118,119,135,136,137,138,139]
    syntaxConfig = [1, 13, 16, 52, 56, 156, 29, 115, 116, 117, 118, 119,
                    135, 136, 137, 138, 139]
    # number = [6,7,11,17], norm = [45,50,51,57], binary = [],
    # stack = [85,86,87,88,89,105,106,107,108,109,110,111,112,113,114,140,141,142,143,144]
    posConfig = [6, 7, 11, 17, 45, 50, 51, 57, 85, 86, 87, 88, 89, 105, 106, 107, 108,
                 109, 110, 111, 112, 113, 114, 140, 141, 142, 143, 144]
    emoticonConfig = [2, 10, 12, 14, 15, 41, 46, 48, 53, 55, 58, 30, 32, 37, 65, 66, 67,
                      68, 69, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 120, 121, 122, 123,
                      124, 130, 131, 132, 133, 134, 145, 146, 147, 148, 149]
    # number = [], norm = [], binary = [35,39], stack = []
    urlAndUserConfig = [39, 35]
    # number = [3,8,9], norm = [40,41,42,43,44,45,46,47,48,49,54], binary = [19,27,33,38],
    # stack = [60,61,62,63,64,100,101,102,103,104,125,126,127,128,129]
    signalConfig = [3, 8, 9, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 54, 19, 27, 33, 38,
                    60, 61, 62, 63, 64, 100, 101, 102, 103, 104, 125, 126, 127, 128, 129]
    signalGroupConfig = signalConfig
    signalGroupConfig.extend(urlAndUserConfig)
    signalGroupConfig.extend(emoticonConfig)
    syntaxGroupConfig = syntaxConfig
    syntaxGroupConfig.extend(subjConfig)
    syntaxGroupConfig.extend(posConfig)
    syntaxAndSentiment = syntaxGroupConfig
    syntaxAndSentiment.extend(sentimentConfig)
    syntaxAndSignal = syntaxGroupConfig
    syntaxAndSignal.extend(signalGroupConfig)
    sentimentAndSignal = signalGroupConfig
    sentimentAndSignal.extend(sentimentConfig)

    # ablation study 2: ablation from ALL
    allWithoutSentiment = list(set(range(157)) - set(sentimentConfig))
    allWithoutPOS = list(set(range(157)) - set(posConfig))
    # signal + emoticons + url&User
    allWithoutSignal = list(set(range(157)) - set(signalConfig) -
                            set(emoticonConfig) - set(urlAndUserConfig))
    # Syntax + Subjectivity
    allWithoutSyntax = list(set(range(157)) - set(syntaxConfig) -
                            set(subjConfig) - set(posConfig))
    allWithoutTop10 = list(set(allConfig) - set(top10Config))

    # ablation study 2: ablation from BINARY
    binaryWithoutSentiment = list(set(allBinaryConfig) - set(sentimentConfig))
    binaryWithoutPOS = list(set(allBinaryConfig) - set(posConfig))
    # New Signal: Old Signal + Emoticon and RegExp and URL and User
    binaryWithoutSignal = list(set(allBinaryConfig) - set(signalConfig) -
                               set(emoticonConfig) - set(urlAndUserConfig))
    # New Syntax: Old Syntax + Subj:
    binaryWithoutSyntax = list(set(allBinaryConfig) - set(syntaxConfig) -
                               set(subjConfig) - set(posConfig))
    binaryWithoutTop10 = list(set(allBinaryConfig) - set(top10Config))

    # full configuration of feature list for feature extraction
    # mode,feat,regEx,stack_binning,bigram,sentiment,configuration
    # featureConfigs.append(("bow",True,True,True,False,False,bowConfig,"features_bowConfig"))
    # featureConfigs.append(("bow",True,True,True,True,False,bowBigramConfig,"features_bowBigramConfig"))
    # featureConfigs.append(("all",True,True,False,True,True,allBinaryConfig,"features_allBinaryConfig"))
    # featureConfigs.append(("all",True,True,True,True,True,allConfig,"features_allConfig"))
    # featureConfigs.append(("all",True,True,True,True,True,allWithoutNumbers,"features_allWithoutNumbers"))
    # featureConfigs.append(("all",True,True,True,True,True,allWithoutNorm,"features_allWithoutNorm"))
    # featureConfigs.append(("all",True,True,False,True,True,allWithoutStacks,"features_allWithoutStacks"))
    # featureConfigs.append(("all",True,False,True,True,True,allWithoutBinary,"features_allWithoutBinary"))
    # featureConfigs.append(("specific",True,True,True,True,True,allWithoutBoW,"features_allWithoutBoW"))
    featureConfigs.append(("specific", True, True, True, False, True,
                           allWithoutBowBigram, "features_allWithoutBowBigram"))
    # featureConfigs.append(("all",True,True,True,False,True,allWithoutBigrams,"features_allWithoutBigrams"))
    # featureConfigs.append(("all",True,False,False,True,False,numbersConfig, "features_numbersConfig"))
    # featureConfigs.append(("all",True,False,False,True,False,normConfig, "features_normConfig"))
    # featureConfigs.append(("all",True,False,False,True,False,stacksConfig, "features_stacksConfig"))
    # featureConfigs.append(("all",True,True,False,True,True,allWithoutEmoticons,"features_allWithoutEmoticons"))
    # featureConfigs.append(("all",True,True,False,True,True,allWithoutSubj,"features_allWithoutSubj"))
    # featureConfigs.append(("all",True,False,False,True,True,allWithoutRegExpAndURLandUser,"features_allWithoutRegExpAndURLandUser"))
    # featureConfigs.append(("specific",True,True,False,True,True,allBinaryConfig,"features_binaryWithoutBoW"))
    # featureConfigs.append(("specific",True,True,False,False,True,allBinaryConfig,"features_binaryWithoutBoWBi"))
    # featureConfigs.append(("specific",True,True,False,False,True,allBinaryConfig,"features_binaryWithoutBi"))
    #
    # featureConfigs.append(("all",True,True,False,True,False,binaryWithoutSentiment,"features_binaryWithoutSentiment"))
    # featureConfigs.append(("all",True,True,False,True,True,binaryWithoutPOS,"features_binaryWithoutPOS"))
    # featureConfigs.append(("all",True,False,False,True,True,binaryWithoutSignal,"features_binaryWithoutSignal"))
    # featureConfigs.append(("all",True,True,False,True,True,binaryWithoutSyntax,"features_binaryWithoutSyntax"))
    # featureConfigs.append(("all",True,True,False,True,False,binaryWithoutWeka,"features_binaryWithoutWeka"))
    # featureConfigs.append(("all",True,True,False,True,True,binaryWithoutTop10,"features_binaryWithoutTop10"))
    # featureConfigs.append(("all",True,True,False,True,False,allWithoutSentiment,"features_allWithoutSentiment"))
    # featureConfigs.append(("all",True,True,False,True,True,allWithoutPOS,"features_allWithoutPOS"))
    # featureConfigs.append(("all",True,False,False,True,True,allWithoutSignal,"features_allWithoutSignal"))
    # featureConfigs.append(("all",True,True,False,True,True,allWithoutSyntax,"features_allWithoutSyntax"))
    # featureConfigs.append(("all",True,True,False,True,False,allWithoutWeka,"features_allWithoutWeka"))
    # featureConfigs.append(("all",True,True,False,True,True,allWithoutTop10,"features_allWithoutTop10"))
    # feature categories:
    # featureConfigs.append(("all",True,False,False,True,True,sentimentConfig, "features_sentimentConfig"))
    # featureConfigs.append(("all",True,False,False,True,False,posConfig, "features_posConfig"))
    # featureConfigs.append(("all",True,True,False,True,False,signalGroupConfig, "features_signalGroupConfig"))
    # featureConfigs.append(("all",True,False,False,True,False,syntaxGroupConfig, "features_syntaxGroupConfig"))
    #
    # featureConfigs.append(("specific",True,False,False,False,True,sentimentConfig, "features_sentimentConfig_specific"))
    # featureConfigs.append(("specific",True,False,False,False,False,posConfig, "features_posConfig_specific"))
    # featureConfigs.append(("specific",True,True,False,False,False,signalGroupConfig, "features_signalGroupConfig_specific"))
    # featureConfigs.append(("specific",True,False,False,False,False,syntaxGroupConfig, "features_syntaxGroupConfig_specific"))
    # Combinations:
    # featureConfigs.append(("specific",True,True,False,False,True,sentimentAndSignal,"features_sentimentAndSignal"))
    # featureConfigs.append(("specific",True,False,False,False,True,syntaxAndSentiment,"features_syntaxAndSentiment"))
    # featureConfigs.append(("specific",True,True,False,False,False,syntaxAndSignal,"features_syntaxAndSignal"))
    # featureConfigs.append(("all",True,False,False,True,False,top10Config, "features_top10Config"))

    print str(len(featureConfigs)) + " different configurations of features"

    # create file which contains status reports.
    with open("info.txt", "a") as info:
        info.write("Start" + "\n")
        print "Start"
        # TODO: Add condition to create corpus, if no file exists.
        info.write("Training the classifiers using the set at '{path}{file}'".format(
            path=setPath, file=trainingSetFilename) + "\n")
        print("Training the classifiers using the set at '{path}{file}'".format(
            path=setPath, file=trainingSetFilename))
        lt = localtime()
        info.write("Begin loading Corpus " + class1 + " vs " + class2 + " - " +
                   str(lt[3]) + "h:" + str(lt[4]) + "m:" + str(lt[5]) + "s on " +
                   str(lt[2]) + "." + str(lt[1]) + "." + str(lt[0]) + "\n")
        print("Begin loading Corpus " + class1 + " vs " + class2 + " - " +
              str(lt[3]) + "h:" + str(lt[4]) + "m:" + str(lt[5]) + "s on " +
              str(lt[2]) + "." + str(lt[1]) + "." + str(lt[0]))
        # load training corpus.
        trainingSet = Corpus(class1, class2, trainingSetFilename,
                             corpusPath=CORPUS_PATH + TRAIN_PATH)
        # Get the ids - which are ordered class1, class2 - and shuffle them.
        trainingIDs = trainingSet.tweetIDs
        random.seed(randomSeed)
        random.shuffle(trainingIDs)
        # load test corpus if filename is given; not needed for cross validation.
        if not testSetFilename == None:
            testSet = Corpus(class1, class2, testSetFilename,
                             corpusPath=CORPUS_PATH + TEST_PATH)
            tweets = dict(trainingSet.tweets.items() + testSet.tweets.items())
            mode_list = []
            bigram_list = []
            # only create dict for bag-of-words and bigrams if really necessary!
            for mode, feat, regExp, stack_binning, bigram, sentiment, config, createARFF in featureConfigs:
                mode_list.append(mode)
                bigram_list.append(bigram)
            if "all" in mode_list:
                bowDictionary = createBagOfWordsDictionary(tweets)
                print "loaded bow"
            elif "bow" in mode_list:
                bowDictionary = createBagOfWordsDictionary(tweets)
                print "loaded bow"
            else:
                bowDictionary = {}
                print "bow not necessary"
            if True in bigram_list:
                bigramDictionary = createBagOfBigramsDictionary(tweets)
                print "loaded bigrams"
            else:
                bigramDictionary = {}
                print "bigrams not necessary"
        else:
            bowDictionary = None
            bigramDictionary = None
        lt = localtime()
        info.write("Corpus loaded -" + str(lt[3]) + "h:" + str(lt[4]) + "m:" +
                   str(lt[5]) + "s on " + str(lt[2]) + "." + str(lt[1]) + "." +
                   str(lt[0]) + "\n")
        print("Corpus loaded -" + str(lt[3]) + "h:" + str(lt[4]) + "m:" +
              str(lt[5]) + "s on " + str(lt[2]) + "." + str(lt[1]) + "." + str(lt[0]))
        info.write("Extracting features with different configurations \n")
        print("Extracting features with different configurations")

    t = 0
    # feature extraction using above feature configurations.
    for mode, feat, regExp, stack_binning, bigram, sentiment, config, createARFF in featureConfigs:
        trainFeatures = []
        trainFeatureVectors = {}
        testFeatures = []
        testFeatureVectors = {}
        t += 1
        config_name = createARFF + "_" + class1 + "_vs_" + class2
        # if empty string no arff file will be generated,
        # else set createARFF_file = createARFF
        createARFF_file = createARFF
        with open("info_" + config_name + ".txt", "a") as info:
            print "\n" + str(t) + "th configuration\n-----------------------------------------\n"
            info.write("\n" + str(t) + "th configuration\n-----------------------------------------\n")
            # optional: if true, then all binary combinations of all
            # features are added to feature list.
            binary_combination = False
            # feature extraction.
            trainFeatures, trainFeatureVectors = extractFeatures(
                class1, class2, mode, arff_path, trainingIDs, trainingSet.tweets,
                config, feat, regExp, stack_binning, binary_combination, sentiment,
                bigram, createARFF_file, bowDictionary, bigramDictionary)
            # array of train data - is not necessary; just used for safeguard.
            tTargets = []
            tData = []
            # sparse matrix of train data:
            ID_map_train = {}
            rdim = len(trainFeatureVectors.keys())
            cdim = len(trainFeatureVectors[trainingIDs[0]])
            # create sparse matrix with rdim x cdim
            trainData = lil_matrix((rdim, cdim))
            trainGold = trainingSet.goldStandard
            trainTargets = range(len(trainGold))
            j = 0
            for ID, g in trainGold.items():
                ID_map_train[j] = ID
                # array part.
                tTargets.append(g)
                tData.append(trainFeatureVectors[ID])
                # matrix will be filled.
                for i in range(len(trainFeatureVectors[ID])):
                    if trainFeatureVectors[ID][i] != 0:
                        trainData[j, i] = trainFeatureVectors[ID][i]
                trainTargets[j] = g
                j += 1
            trainFeatureVectors = {}
            trainGold = {}
            classifiers = [(DecisionTreeClassifier(), "Decision_Tree"),
                           (SVC(kernel="linear"), "Linear_SVC"),
                           (SVC(), "SVC"),
                           (LinearSVC(), "LinearSVC"),
                           (LogisticRegression(), "logRegression")]
            # classifiers which need matrix
            matrixClassifier = ["Linear_SVC", "SVC", "LinearSVC", "logRegression"]
            # Cross validation
            if testSetFilename == None:
                for c, name in classifiers:
                    if name in matrixClassifier:
                        if isspmatrix(trainData):
                            duration = timeit(lambda: applyCrossValidation(
                                class1, class2, createARFF, ID_map_train, c, name,
                                trainData, trainTargets, 10), number=1)
                            showDuration(createARFF, name, duration)
                    else:
                        duration = timeit(lambda: applyCrossValidation(
                            class1, class2, createARFF, ID_map_train, c, name,
                            tData, tTargets, 10), number=1)
                        showDuration(createARFF, name, duration)
            # use test data for evaluation.
            else:
                with open("info.txt", "a") as info:
                    info.write("Testing the classifiers using the set at '{path}{file}'".format(
                        path=CORPUS_PATH, file=testSetFilename) + "\n")
                    print("Testing the classifiers using the set at '{path}{file}'".format(
                        path=CORPUS_PATH, file=testSetFilename))
                    info.write("Extracting features... \n")
                testIDs = testSet.tweetIDs
                random.seed(randomSeed)
                random.shuffle(testIDs)
                # feature extraction for test data.
                testFeatures, testFeatureVectors = extractFeatures(
                    class1, class2, mode, arff_path, testIDs, testSet.tweets, config,
                    feat, regExp, stack_binning, binary_combination, sentiment,
                    bigram, createARFF_file, bowDictionary, bigramDictionary)
                # array of test data:
                tsTargets = []
                tsData = []
                # sparse matrix of test data
                rdim = len(testFeatureVectors.keys())
                cdim = len(testFeatureVectors[testIDs[0]])
                testData = lil_matrix((rdim, cdim))
                testGold = testSet.goldStandard
                testTargets = range(len(testGold))
                ID_map_test = {}
                j = 0
                for ID, g in testGold.items():
                    ID_map_test[j] = ID
                    # array
                    tsTargets.append(g)
                    tsData.append(testFeatureVectors[ID])
                    # matrix
                    for i in range(len(testFeatureVectors[ID])):
                        if testFeatureVectors[ID][i] != 0:
                            testData[j, i] = testFeatureVectors[ID][i]
                    testTargets[j] = g
                    j += 1
                testFeatureVectors = {}
                testGold = {}
                for c, name in classifiers:
                    if name in matrixClassifier:
                        duration = timeit(lambda: applyClassifier(
                            class1, class2, createARFF, ID_map_test, c, name,
                            trainData, trainTargets, testData, testTargets), number=1)
                        showDuration(createARFF, name, duration)
                    else:
                        duration = timeit(lambda: applyClassifier(
                            class1, class2, createARFF, ID_map_test, c, name,
                            tData, tTargets, tsData, tsTargets), number=1)
                        showDuration(createARFF, name, duration)
def testFeatures():
    """Tests if the features work on the corpus."""
    ironicIDs, regularIDs, reviews = createTestReviews()
    features, featureVectors = extractFeatures(ironicIDs + regularIDs, reviews)
    showFeatureOccurrence(features, featureVectors)
def main(fastafile, classfile, kernel, evalmodel, paramfile):
    """ """
    # number of cross-validation folds for evaluating model on training data
    if "trich" in fastafile:
        nfolds = 5
    else:
        nfolds = 10
    # output directory for results on training data
    resdir = "./results/"
    dname = fastafile.split("/")[-1].split(".fasta")[0]
    # load protein IDs and labels - in same order as .class file
    labels = OrderedDict()
    with open(classfile) as fin:
        for l in fin:
            p, l = l.rstrip().split()
            labels[p] = int(l)
    # load FASTA file
    with open(fastafile) as fin:
        seqstr = fin.read()
    # calculate features for each sequence
    IDs, X = extractFeatures(seqstr)
    # make sure features are in the same order as the labels
    newX = []
    for ID in labels.keys():
        newX.append(X[IDs.index(ID)])
    X = newX
    # feature and label matrices to train model
    X = np.array(X)
    Y = np.array(labels.values())
    if evalmodel:
        # cross-val folds (leave-one-out cross-val if 10 examples or fewer)
        ntrain = X.shape[0]
        if ntrain <= 10:
            cv = LeaveOneOut(n=ntrain)
        else:
            cv = StratifiedKFold(Y, nfolds, shuffle=True)
        # cross-validation within dataset
        results = crossValidate(X, Y, cv, kernel, paramfile)
        results.update({'dname': dname, 'protIDs': labels})
        if not os.path.isdir(resdir):
            os.makedirs(resdir)
        resfile = resdir + dname + "_results.txt"
        printResults(results, resfile)

    # train on all data and save final model for inference
    # define parameters (grid to search)
    params = getParams(kernel, paramfile)
    bestC, bestKparam = selectParams(X, Y, kernel, params, nfolds)
    # standardize the data
    # X, _, scaler = standardizeData(X, None)
    scaler = None
    # construct training kernel
    K = calculateKernel(X, X, bestKparam, kernel)
    # train model
    clf = trainModel(K, Y, bestC)
    # save model
    support_vectors = X[clf.support_, :]
    savedict = {'dualvars': clf.dual_coef_, 'kernel': kernel, 'Kparam': bestKparam,
                'support_vectors': support_vectors, 'scaler': scaler,
                'intercept': clf.intercept_}
    with open("./models/" + dname + "_model.pkl", "wb") as fout:
        pickle.dump(savedict, fout)
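# A minimal sketch of how the pickled model dictionary saved above could be used
# for inference on new feature vectors. calculateKernel() is the function used in
# this script; the loading logic and decision-value computation below are
# assumptions based on the saved fields, not the original inference code:
import pickle
import numpy as np

def predictSketch(modelfile, X_new):
    """Score new feature vectors with the stored support vectors and dual weights."""
    with open(modelfile, "rb") as fin:
        model = pickle.load(fin)
    # kernel between the new examples and the stored support vectors
    K_test = calculateKernel(np.asarray(X_new), model['support_vectors'],
                             model['Kparam'], model['kernel'])
    decision = K_test.dot(model['dualvars'].T) + model['intercept']
    return (decision > 0).astype(int)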
def applyML(trainingSetFilename, testSetFilename=None, setPath=CORPUS_PATH):
    """Uses machine learning approach to classify sentences."""
    # TODO: Add condition to create corpus, if no file exists.
    print("Training the classifiers using the set at '{path}{file}'".format(
        path=setPath, file=trainingSetFilename))
    trainingSet = Corpus(trainingSetFilename, corpusPath=CORPUS_PATH)
    # trainingSet = Corpus.loadCorpus(filename="shuffled_set.pk")
    # for each in trainingSet.reviewIDs[0:10]:
    #     print(each)
    # print()
    # Get the ids - which are ordered ironic, regular - and shuffle them.
    ids = trainingSet.reviewIDs
    random.seed(44)
    random.shuffle(ids)
    # for each in ids[0:10]:
    #     print(each)
    # print()
    reviews = trainingSet.reviews
    if not testSetFilename == None:
        testSet = Corpus(testSetFilename, corpusPath=CORPUS_PATH)
        reviews = dict(trainingSet.reviews.items() + testSet.reviews.items())
        bowDictionary = createBagOfWordsDictionary(reviews)
    else:
        bowDictionary = None
    print("Extracting features...")
    trainFeatures, trainFeatureVectors = extractFeatures(ids, trainingSet.reviews,
                                                         bowDictionary=bowDictionary)
    trainTargets = []
    trainData = []
    stars = []
    trainGold = trainingSet.goldStandard
    for ID, g in trainGold.items():
        trainTargets.append(g)
        trainData.append(trainFeatureVectors[ID])
        stars.append(trainingSet.reviews[ID].stars)
    # for i, vec in enumerate(data):
    #     print(targets[i], " | ", vec)
    featureCount = sum([sum(v) for v in trainData])
    # print("Feature found: ", featureCount, "times.")
    classifiers = [DecisionTreeClassifier(), SVC(kernel="linear"), SVC(),
                   LinearSVC(), MultinomialNB(), GaussianNB(),
                   RandomForestClassifier(), LogisticRegression()]
    # Cross validation
    if testSetFilename == None:
        for c in classifiers:
            applyCrossValidation(c, trainData, trainTargets)
            # Show star distribution for each classifier
            # applyCrossValidation(c, trainData, trainTargets, stars=stars)
        # scores = cross_validation.cross_val_score(classifier, array(data),
        #                                           array(targets), cv=10)
        # print(scores)
    else:
        print("Testing the classifiers using the set at '{path}{file}'".format(
            path=CORPUS_PATH, file=testSetFilename))
        # testSet = Corpus(testSetFilename, corpusPath=CORPUS_PATH)
        # testSet = Corpus.loadCorpus(filename="test_set.pk")
        # Create bag of words dictionary that contains words of all reviews
        # bowDictionary = createBagOfWordsDictionary(
        #     trainingSet.reviews + testSet.reviews)
        print("Extracting features...")
        testFeatures, testFeatureVectors = extractFeatures(testSet.reviewIDs,
                                                           testSet.reviews,
                                                           bowDictionary=bowDictionary)
        testData = []
        testTargets = []
        testGold = testSet.goldStandard
        for ID, g in testGold.items():
            testTargets.append(g)
            testData.append(testFeatureVectors[ID])
        for c in classifiers:
            applyClassifier(c, trainData, trainTargets, testData, testTargets)
object = sorted(os.listdir(globals.data_train_images))
i, total = 0, len(object)
with Timer() as timer:
    for subject in object:
        i += 1
        print('Processing the subdirectory named:', subject,
              '\t[', i, '/', total, ']', file=globals.file)
        # Read in cropped data
        crop_names = os.listdir(os.path.join(globals.data_train_images, subject))
        crop_names = list(map(lambda x: os.path.join(globals.data_train_images, subject, x),
                              crop_names))
        crops = [cv.imread(x, cv.IMREAD_GRAYSCALE) for x in crop_names]
        # Get Features
        desc = features.extractFeatures(crops, features.features)
        all[subject] = desc

print('Extracted', arguments.descriptor, '\n', file=globals.file)
print('Time:', timer, '\n', file=globals.file)

# Print
print('Done!\n')

# Print
print('Create Bag of Visual Features\n')

# Features
matrix = features.groupAllFeatures(all)
kmeans = None
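# `kmeans` is initialised to None above. A minimal sketch of how the visual-word
# vocabulary could be built by clustering the stacked descriptor matrix with
# scikit-learn; the number of clusters is an illustrative assumption, not a value
# from the original configuration:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=100, random_state=0).fit(matrix)
visual_words = kmeans.cluster_centers_  # one centroid per visual word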
# define which features have to be extracted from test data (same as in loaded model).
featureConfigs = []
top10_iro_sarc_Config = [43, 57, 52, 156, 44, 56, 51, 42, 50, 49]
featureConfigs.append(("all", True, False, False, True, top10_iro_sarc_Config,
                       "features_top10Config"))
# feature extraction.
for mode, feat, regExp, new_stack, bigram, config, createARFF in featureConfigs:
    createARFF = createARFF + "_" + class1 + "_vs_" + class2
    binary_combination = False
    sentiment = False
    testFeatures, testFeatureVectors = extractFeatures(
        class1, class2, mode, arff_path, testIDs, testSet.tweets, config, feat,
        regExp, new_stack, binary_combination, sentiment, bigram, createARFF,
        bowDictionary, bigramDictionary)
    # array of test data:
    tsTargets = []
    tsData = []
    # sparse matrix of test data
    rdim = len(testFeatureVectors.keys())
    cdim = len(testFeatureVectors[testIDs[0]])
    testData = lil_matrix((rdim, cdim))
    testGold = testSet.goldStandard
    testTargets = range(len(testGold))