def __init__(self, config_path, verbose=False):
    '''
    Set up the bot: configuration, Twitter access and tweet processing

    Keyword arguments:
        config_path (str): path to the json configuration file
        verbose (bool): stdout verbosity
    '''
    self.verbose = verbose

    # read the configuration first; everything below uses attributes it sets
    self._load_config(config_path)

    # Twitter API handle, authenticated with the configured keys
    self.oauth = auth_handler = tweepy.OAuthHandler(
        self.CONSUMER_KEY, self.CONSUMER_SECRET)
    auth_handler.set_access_token(self.ACCESS_KEY, self.ACCESS_SECRET)
    self.twitter_api = tweepy.API(auth_handler)

    # tweet processing: preprocessing, feature extraction, classification
    self.pipeline = Pipeline(
        self.training_corpus_positive_path,
        self.pipeline_tagger_mapping_path)
    self.feature_extractor = FeatureExtractor(
        self.features, self.feature_order)
    self.classifier = MultiLayerPerceptronClassifier(
        self.feature_order, verbose=self.verbose)
# command line interface: three positional paths, registered in call order
arg_parser = argparse.ArgumentParser(
    description='Feature Extraction for the Autosarkasmus Baseline')
for _arg_name, _arg_help in (
        ('corpus_file_pos', 'path to the positive corpus file'),
        ('corpus_file_neg', 'path to the negative corpus file'),
        ('output_file', 'path to the output file'),
):
    arg_parser.add_argument(_arg_name, help=_arg_help)
args = arg_parser.parse_args()

print('\n - Autosarkasmus Baseline Feature Extraction (Unigrams) -\n')

# feature definitions and the extractor built from them
print('setting up features...')
features, feature_order = setup_features()
print('setting up feature extractor...')
feature_extractor = FeatureExtractor(features, feature_order)

# target ARFF document, declared with the same features
arff_doc = ARFFDocument('Sarkasmuserkennung', features, feature_order)

# extract the features of both corpora
tweets_ext = feature_extractor.extract_features(
    args.corpus_file_pos,
    args.corpus_file_neg,
    verbose=True)

# add every extracted tweet as a data row, then write the document
print('generating ARFF document...')
for tweet_ext in tweets_ext:
    arff_doc.add_data(tweet_ext)
arff_doc.generate_document(args.output_file)
str(is_sarcastic) + '...') # preprocess tweets if is_sarcastic: pipeline = Pipeline(args.corpus_file_pos, '../rsrc/de-tiger.map', verbose=True) else: pipeline = Pipeline(args.corpus_file_neg, '../rsrc/de-tiger.map', verbose=True) tweets_tkn, tweets_proc = pipeline.process() for tweet_proc in tweets_proc: data.append({'tweet': tweet_proc, 'class': is_sarcastic}) if args.model in ['svm', 'mlp']: feature_extractor = FeatureExtractor(features, feature_order) data = feature_extractor.extract_features( args.corpus_file_pos, args.corpus_file_neg, verbose=True) # extract features from training corpora # classifier setup classifiers = [] if args.model == 'svm': classifiers.append({ 'name': 'svm_classifier', 'classifier': SVMClassifier(feature_order, verbose=True) }) elif args.model == 'mlp': classifiers.append({
help='path to the tagger mapping') arg_parser.add_argument('-f', '--input_file', help='path to input file') args = arg_parser.parse_args() print('\n - Autosarkasmus Demo -\n') # preprocessing pipeline setup pipeline = Pipeline(args.training_pos_path, args.tagger_mapping_path) # feature setup print('setting up features...') features, feature_order = setup_features() # feature extraction print('setting up feature extractor...') feature_extractor = FeatureExtractor(features, feature_order) tweets_ext = feature_extractor.extract_features(args.training_pos_path, args.training_neg_path, verbose=True) print('extracted features from ' + str(len(tweets_ext)) + ' tweets.') # svm training print('training classifier...') classifier = MultiLayerPerceptronClassifier(feature_order, verbose=True) classifier.train(tweets_ext) print('\nready to classify.') # classification if args.input_file: print('preprocessing tweets...') pipeline = Pipeline(args.input_file, args.tagger_mapping_path)
class AutosarkasmusBot:
    '''
    A bot for the @autosarkasmus twitter account

    Processes sarcasm classification requests and corresponding feedback.

    The history (self.history) maps the id of a classified tweet, as a
    string, to the tweet's JSON plus the keys 'sarcasm_predicted' and,
    once feedback arrived, 'sarcasm_actual'. String keys are used because
    that is what the history looks like after being reloaded from JSON.
    '''

    MAX_TWEET_LENGTH = 140  # responses longer than this are trimmed

    def __init__(self, config_path, verbose=False):
        '''
        Constructor of AutosarkasmusBot

        Keyword arguments:
            config_path (str): path to the json configuration file
            verbose (bool): stdout verbosity
        '''
        self.verbose = verbose

        self._load_config(config_path)  # load config from file

        # Twitter API parameters
        self.oauth = tweepy.OAuthHandler(self.CONSUMER_KEY,
                                         self.CONSUMER_SECRET)
        self.oauth.set_access_token(self.ACCESS_KEY, self.ACCESS_SECRET)
        self.twitter_api = tweepy.API(self.oauth)

        # tweet processing
        self.pipeline = Pipeline(self.training_corpus_positive_path,
                                 self.pipeline_tagger_mapping_path)
        self.feature_extractor = FeatureExtractor(self.features,
                                                  self.feature_order)
        self.classifier = MultiLayerPerceptronClassifier(
            self.feature_order, verbose=self.verbose)

    @staticmethod
    def _history_key(tweet_id):
        '''
        Returns the history key for a tweet id

        JSON object keys are always strings, so ids are normalized to str
        to make lookups work both before and after a save/load round trip.
        '''
        return str(tweet_id)

    @staticmethod
    def _pick_response(candidates, default):
        '''
        Picks a random entry of candidates, or default if there are none
        '''
        if not candidates:
            return default
        return candidates[randint(0, len(candidates) - 1)]

    def _trim_response(self, response):
        '''
        Cuts a response down to MAX_TWEET_LENGTH characters, ending in '...'
        '''
        if len(response) > self.MAX_TWEET_LENGTH:
            response = response[:self.MAX_TWEET_LENGTH - 3] + '...'
        return response

    def _load_config(self, config_path):
        '''
        Loads configuration from JSON file

        A missing or unreadable config file is reported on stdout and all
        settings fall back to their defaults. The same applies to the
        history file named by HISTORY_PATH.

        Keyword arguments:
            config_path (str): path to the json configuration file
        '''
        config_json = {}
        try:
            with open(config_path, 'r', encoding='utf8') as fop:
                config_json = json.load(fop)
        except Exception as ex:
            print('Error: Could not read config file at "' +
                  str(config_path) + '".')
            print(ex)

        # screen name used by the bot
        self.screen_name = config_json.get('SCREEN_NAME', None)
        # pattern that matches an enquiry
        self.enquiry_pattern = config_json.get('ENQUIRY_PATTERN', None)
        # responses to an enquiry
        self.enquiry_responses = config_json.get('ENQUIRY_RESPONSES', {
            'positive': [],
            'negative': []
        })
        # responses to feedback
        self.feedback_responses = config_json.get('FEEDBACK_RESPONSES', {
            'positive': [],
            'negative': []
        })
        # Twitter API consumer key
        self.CONSUMER_KEY = config_json.get('CONSUMER_KEY', None)
        # Twitter API consumer token
        self.CONSUMER_SECRET = config_json.get('CONSUMER_SECRET', None)
        # Twitter API application access key
        self.ACCESS_KEY = config_json.get('ACCESS_KEY', None)
        # Twitter API application secret key
        self.ACCESS_SECRET = config_json.get('ACCESS_SECRET', None)
        # path to tagger mapping file
        self.pipeline_tagger_mapping_path = config_json.get(
            'PIPELINE_TAGGER_MAPPING_PATH', None)
        # path to corpus with positive training data
        self.training_corpus_positive_path = config_json.get(
            'TRAINING_CORPUS_POSITIVE_PATH', None)
        # path to corpus with negative training data
        self.training_corpus_negative_path = config_json.get(
            'TRAINING_CORPUS_NEGATIVE_PATH', None)
        # path to the bot's history
        self.history_path = config_json.get('HISTORY_PATH', None)

        # load history to memory
        self.history = {}
        try:
            with open(self.history_path, 'r', encoding='utf8') as fop:
                self.history = json.load(fop)
        except Exception as ex:
            # report the history path here, not the config path
            print('Error: Could not read history file at "' +
                  str(self.history_path) + '".')
            print(ex)

        # load features to memory
        self.features = {}
        self.feature_order = []
        features_json = config_json.get('FEATURES', [])
        for feature_json in features_json:
            self.feature_order.append(feature_json['key'])
            self.features[feature_json['key']] = feature_json['values']

    def train(self):
        '''
        Train the bot on given data
        '''
        # extract features from training corpora
        tweets_ext = self.feature_extractor.extract_features(
            self.training_corpus_positive_path,
            self.training_corpus_negative_path,
            verbose=self.verbose)
        if self.verbose:
            print('training classifier...')
        self.classifier.train(tweets_ext)

    def classify_tweet(self, tweet_raw):
        '''
        Classifies a single tweet

        Keyword arguments:
            tweet_raw (str): text of the tweet to classify

        Returns:
            dict: the extracted features of the tweet in addition to class
        '''
        if self.verbose:
            print('classifying tweet: "' + tweet_raw + '"')
        # preprocess the raw tweet
        tweet_tkn, tweet_proc = self.pipeline.process_tweet(tweet_raw)
        if self.verbose:
            print(str(tweet_tkn) + '\n' + str(tweet_proc))
        # extract features from tweet (sarcasm is True per default)
        tweet_ext = self.feature_extractor.extract_features_from_tweet(
            tweet_tkn, tweet_proc, True)
        # delete class since it is only a default value
        del tweet_ext['class']
        if self.verbose:
            # print all features != 0
            print([(feature, tweet_ext[feature])
                   for feature in self.feature_order
                   if tweet_ext.get(feature, 0) != 0])
        tweet_class = self.classifier.classify([tweet_ext])  # classify the tweet
        if self.verbose:
            print('classified with sarcasm:', tweet_class[0]['class'])
        return tweet_class[0]

    def is_sarcastic_tweet(self, tweet_raw):
        '''
        Identifies sarcasm in a single tweet

        Keyword arguments:
            tweet_raw (str): text of the tweet to classify

        Returns:
            bool: whether the tweet was classified as being sarcastic
        '''
        return self.classify_tweet(tweet_raw)['class']

    def is_valid_enquiry(self, tweet_json):
        '''
        Checks whether the given tweet is a valid enquiry

        Keyword arguments:
            tweet_json (dict): JSON representation of the tweet object

        Returns:
            bool: whether the tweet is a valid enquiry
        '''
        # tweet has to match the enquiry pattern, no case matching
        if not re.match(self.enquiry_pattern, tweet_json['text'],
                        re.IGNORECASE):
            return False
        # tweet must not be authored by the bot itself
        if tweet_json['user']['screen_name'] == self.screen_name:
            return False
        # retweeted enquiries are ignored
        if tweet_json['retweeted']:
            return False
        # an enquiry has to be a reply to the tweet that is to be classified
        return bool(tweet_json['in_reply_to_status_id'])

    def is_valid_feedback(self, tweet_json):
        '''
        Checks whether the given tweet is valid feedback

        Fetches the tweet that was replied to through the Twitter API.

        Keyword arguments:
            tweet_json (dict): JSON representation of the tweet object

        Returns:
            bool: whether the tweet is valid feedback
        '''
        # tweet must not be authored by the bot itself
        if tweet_json['user']['screen_name'] == self.screen_name:
            return False
        # retweets are ignored
        if tweet_json['retweeted']:
            return False
        # feedback has to be a reply
        if not tweet_json['in_reply_to_status_id']:
            return False
        # ... to a tweet sent by the bot
        reply_tweet_status = self.twitter_api.get_status(
            tweet_json['in_reply_to_status_id'])
        if reply_tweet_status.user.screen_name != self.screen_name:
            return False
        # ... in which the bot tweeted a classification
        known_responses = (self.enquiry_responses.get('positive', []) +
                           self.enquiry_responses.get('negative', []))
        return any(enquiry_response in reply_tweet_status.text
                   for enquiry_response in known_responses)

    def gen_enquiry_response(self, recipient, tweet_is_sarcastic):
        '''
        Generate response to an enquiry

        Falls back to a default response if no responses are configured
        for the classification result.

        Keyword arguments:
            recipient (str): user handle of the addressee
            tweet_is_sarcastic (bool): whether the tweet was classified as sarcastic

        Returns:
            str: twitter-ready response
        '''
        # pick fitting response at random
        polarity = 'positive' if tweet_is_sarcastic else 'negative'
        response = self._pick_response(
            self.enquiry_responses.get(polarity, []), '😓')
        # prepend the recipient, append the request for feedback
        response = '@' + recipient + ' ' + response + ' Korrekt? (j/n)'
        # trim tweet if necessary
        return self._trim_response(response)

    def gen_feedback_response(self, recipient, correctly_classified):
        '''
        Generate response to feedback

        Falls back to a default response if no responses are configured
        for the kind of feedback.

        Keyword arguments:
            recipient (str): user handle of the addressee
            correctly_classified (bool): whether the tweet was correctly classified

        Returns:
            str: twitter-ready response
        '''
        # pick fitting response at random
        polarity = 'positive' if correctly_classified else 'negative'
        response = self._pick_response(
            self.feedback_responses.get(polarity, []), 'Danke! ^^')
        # prepend the recipient
        response = '@' + recipient + ' ' + response
        # trim tweet if necessary
        return self._trim_response(response)

    def respond(self, tweet_json):
        '''
        Respond to a tweet

        Responses are generated for classification enquiries and feedback to said enquiries

        Keyword arguments:
            tweet_json (dict): JSON representation of the tweet to respond to
        '''
        if self.verbose:
            print('mentioned by @' + tweet_json['user']['screen_name'] +
                  '\n"' + tweet_json['text'] + '"')
        bot_response = None

        if self.is_valid_enquiry(tweet_json):
            # if tweet is a classification enquiry
            enquired_key = self._history_key(
                tweet_json['in_reply_to_status_id'])
            # check if tweet has already been classified
            if enquired_key not in self.history:
                # get tweet to be classified (as Tweepy.Status object)
                eval_tweet_status = self.twitter_api.get_status(
                    tweet_json['in_reply_to_status_id'])
                # classify the tweet
                eval_tweet_sarcastic = self.is_sarcastic_tweet(
                    eval_tweet_status.text)
                # generate response accordingly
                bot_response = self.gen_enquiry_response(
                    tweet_json['user']['screen_name'], eval_tweet_sarcastic)
                # save tweet and its classification in history
                history_entry = eval_tweet_status._json
                history_entry['sarcasm_predicted'] = eval_tweet_sarcastic
                self.history[self._history_key(
                    eval_tweet_status.id)] = history_entry
                self.save_history()

        elif self.is_valid_feedback(tweet_json):
            # if tweet is feedback
            correctly_classified = None
            # analyze feedback
            if re.match(r'.*?\b(j(a|o|ep)?|y(es|o)?)\b.*?',
                        tweet_json['text'], re.IGNORECASE):
                correctly_classified = True
            elif re.match(r'.*?\b(n(e(in)?|o(pe)?)?)\b.*?',
                          tweet_json['text'], re.IGNORECASE):
                correctly_classified = False

            if correctly_classified is not None:
                # if feedback could be parsed
                # follow the tweet trail back to the source
                # (feedback -> classification -> enquiry -> classified_tweet)
                class_tweet_status = self.twitter_api.get_status(
                    tweet_json['in_reply_to_status_id'])
                enq_tweet_status = self.twitter_api.get_status(
                    class_tweet_status.in_reply_to_status_id)
                eval_tweet_status = self.twitter_api.get_status(
                    enq_tweet_status.in_reply_to_status_id)
                history_entry = self.history.get(
                    self._history_key(eval_tweet_status.id))
                if (history_entry is None
                        or 'sarcasm_predicted' not in history_entry):
                    # nothing to evaluate the feedback against
                    if self.verbose:
                        print('no classification on record for tweet ' +
                              str(eval_tweet_status.id))
                elif 'sarcasm_actual' not in history_entry:
                    # save the evaluation (only the first feedback counts)
                    bot_response = self.gen_feedback_response(
                        tweet_json['user']['screen_name'],
                        correctly_classified)
                    # a correct prediction confirms the predicted class, an
                    # incorrect one means the opposite class is the actual one
                    sarcasm_predicted = bool(
                        history_entry['sarcasm_predicted'])
                    history_entry['sarcasm_actual'] = (
                        sarcasm_predicted == correctly_classified)
                    self.save_history()

        if bot_response:
            # post response to twitter
            self.twitter_api.update_status(bot_response, tweet_json['id'])
        if self.verbose and bot_response:
            print('responded with: "' + str(bot_response) + '"')

    def save_history(self):
        '''
        Saves the bot's history to file

        Failures are reported on stdout and otherwise ignored.
        '''
        try:
            with open(self.history_path, 'w', encoding='utf8') as fop:
                json.dump(self.history, fop)
        except Exception as ex:
            print('Error: Could not save history to "' +
                  str(self.history_path) + '"')
            print(ex)
def evaluate(self):
    '''
    Start the evaluation and write the results to the output file

    For every fold delivered by Xvalidator the training and validation
    sets are written to temporary CSV files, features are extracted from
    them, an SVM classifier is trained and the validation tweets are
    classified. Real and predicted labels, the scores and the most
    significant features of every fold are written to self.outputfile,
    followed by the scores summed over all folds divided by self.k.

    I/O errors are reported on stdout.

    Raises:
        ValueError: if self.features is neither "full_featured" nor
            "unigram_featured"
    '''
    # compare by value; identity checks against string literals only work
    # by accident. Fail early, before any file is touched.
    if self.features not in ("full_featured", "unigram_featured"):
        raise ValueError("Unknown feature set: " + repr(self.features))

    def write_rows(path, rows):
        '''Write rows as CSV to path and close the file again'''
        # NOTE(review): uses the platform default encoding, as before;
        # confirm it matches what FeatureExtractor reads
        with open(path, "w", newline='') as csv_file:
            writer = csv.writer(csv_file,
                                quoting=csv.QUOTE_NONNUMERIC,
                                lineterminator='\n')
            writer.writerows(rows)

    def show_most_informative_features(features, clf, n=20):
        '''List the n lowest and n highest coefficients with their features'''
        feature_names = features
        output = "Most significant features: \n"
        coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
        top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
        for (coef_1, fn_1), (coef_2, fn_2) in top:
            output += "\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2) + "\n"
        return output

    try:
        # the with block closes the output file on errors, too
        with open(self.outputfile, "w") as f:
            # Sums of the scores over all folds
            accuracy_res = 0
            precision_res = 0
            recall_res = 0
            f1_res = 0
            # Count the folds
            n = 1
            # Start the evaluation using Xvalidator
            for training_pos, validation_pos, training_neg, validation_neg in Xvalidator(self.pos_set, self.k, self.neg_set).k_fold_cross_validation():
                # Temporary files to store the split datasets for classification
                training_pos_path = "temp_tweets-training-pos-" + str(n)
                training_neg_path = "temp_tweets-training-neg-" + str(n)
                validation_pos_path = "temp_tweets-validation-pos-" + str(n)
                validation_neg_path = "temp_tweets-validation-neg-" + str(n)
                temp_paths = (training_pos_path, training_neg_path,
                              validation_pos_path, validation_neg_path)
                try:
                    write_rows(training_pos_path, training_pos)
                    write_rows(training_neg_path, training_neg)
                    write_rows(validation_pos_path, validation_pos)
                    write_rows(validation_neg_path, validation_neg)

                    # Extract feature list
                    if self.features == "full_featured":
                        feature_list, feature_order = full_featured(training_pos_path)
                    else:
                        feature_list, feature_order = unigram_featured(training_pos_path)
                    feature_extractor = FeatureExtractor(feature_list, feature_order)
                    tweets_train_ext = feature_extractor.extract_features(training_pos_path, training_neg_path, verbose=True)
                    tweets_test_ext = feature_extractor.extract_features(validation_pos_path, validation_neg_path, verbose=True)

                    # Train on the training folds, classify the validation fold
                    classifier = SVMClassifier(feature_order, 'linear', -1, False)
                    classifier.train(tweets_train_ext)
                    classification_results = classifier.classify(tweets_test_ext)
                finally:
                    # Remove the temporary files, also when a step failed
                    for temp_path in temp_paths:
                        if os.path.exists(temp_path):
                            os.remove(temp_path)

                # Merge the datasets for reporting
                validation = validation_pos + validation_neg
                training = training_pos + training_neg
                print("Validation "+str(len(validation)))
                print("Training "+str(len(training)))

                # Labels of the true and predicted classes
                y_true = []
                y_pred = []
                # Pieces of the report of this fold
                parts = []
                parts.append("+++++++++++++++++++++BEGIN OF " + str(n) + " nth fold++++++++++++++++++++++++" + "\n")
                parts.append("---------------------REAL LABELS--------------------------------" + "\n")
                for tweet, y in zip(validation, tweets_test_ext):
                    y_true.append(y['class'])
                    parts.append("Tweet: " + tweet[2] + "\n")
                    parts.append("Label: " + str(y['class']) + "\n")
                parts.append("----------------------PREDICTED LABELS--------------------------------" + "\n")
                for tweet, val in zip(validation, classification_results):
                    y_pred.append(val['class'])
                    parts.append("Tweet: " + tweet[2] + "\n")
                    parts.append("Label: " + str(val['class']) + "\n")

                # Calculate the current scores
                accuracy = accuracy_score(y_true, y_pred)
                print("Accuracy: "+str(accuracy))
                precision = precision_score(y_true, y_pred, average='binary', pos_label='sarcastic')
                print("Precision: "+str(precision))
                recall = recall_score(y_true, y_pred, average='binary', pos_label='sarcastic')
                print("Recall: "+str(recall))
                f1 = f1_score(y_true, y_pred, average='binary', pos_label='sarcastic')
                print("F1-Score: "+str(f1))

                # Add up the scores for the final result
                accuracy_res += accuracy
                precision_res += precision
                recall_res += recall
                f1_res += f1

                parts.append("Accuracy: " + str(accuracy) + "\n")
                parts.append("Precision: " + str(precision) + "\n")
                parts.append("Recall: " + str(recall) + "\n")
                parts.append("F1-Score: " + str(f1) + "\n")
                parts.append(show_most_informative_features(feature_order, classifier.svm))
                parts.append("++++++++++++++++++++++END OF " + str(n) + " nth fold++++++++++++++++++++++++++++++" + "\n")
                f.write("".join(parts))
                print(n)
                n += 1

            # Final scores, one per line (they used to run together
            # without any separator)
            summary = "Accuracy: " + str(accuracy_res/self.k) + "\n"
            summary += "Precision: " + str(precision_res/self.k) + "\n"
            summary += "Recall: " + str(recall_res/self.k) + "\n"
            summary += "F1-Score: " + str(f1_res/self.k) + "\n"
            f.write(summary)
    except IOError as e:
        print("I/O error({0}): {1}".format(e.errno, e.strerror))