Example #2

import argparse

# The original listing omits its imports; FeatureExtractor, ARFFDocument and
# setup_features are project-local (assumed module paths):
# from autosarkasmus.extractor import FeatureExtractor
# from autosarkasmus.baseline import ARFFDocument, setup_features


def main():
    arg_parser = argparse.ArgumentParser(
        description='Feature Extraction for the Autosarkasmus Baseline')
    arg_parser.add_argument('corpus_file_pos',
                            help='path to the positive corpus file')
    arg_parser.add_argument('corpus_file_neg',
                            help='path to the negative corpus file')
    arg_parser.add_argument('output_file', help='path to the output file')
    args = arg_parser.parse_args()

    print('\n - Autosarkasmus Baseline Feature Extraction (Unigrams) -\n')

    # feature setup
    print('setting up features...')
    features, feature_order = setup_features()
    print('setting up feature extractor...')
    feature_extractor = FeatureExtractor(features, feature_order)

    # ARFF document setup
    arff_doc = ARFFDocument('Sarkasmuserkennung', features, feature_order)

    # extract features from both corpora
    tweets_ext = feature_extractor.extract_features(args.corpus_file_pos,
                                                    args.corpus_file_neg,
                                                    verbose=True)

    # generate final ARFF document
    print('generating ARFF document...')
    for tweet_ext in tweets_ext:
        arff_doc.add_data(tweet_ext)
    arff_doc.generate_document(args.output_file)


if __name__ == '__main__':
    main()
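
# For reference, the generated file follows the standard Weka ARFF layout,
# approximately (attribute names and types depend on setup_features()):
#
#     @RELATION Sarkasmuserkennung
#     @ATTRIBUTE <feature-key> <type>    % one attribute per feature_order entry
#     @DATA
#     <one comma-separated feature vector per tweet>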

Example #3

    # NOTE: the lead-in of this snippet is truncated in the original; the
    # enclosing branch and loop are reconstructed from indentation and the
    # variables used below (the exact branch condition is an assumption)
    if args.model == 'rnn':
        data = []
        for is_sarcastic in [True, False]:
            print('preprocessing tweets with sarcasm=' +
                  str(is_sarcastic) + '...')
            # preprocess tweets
            if is_sarcastic:
                pipeline = Pipeline(args.corpus_file_pos,
                                    '../rsrc/de-tiger.map',
                                    verbose=True)
            else:
                pipeline = Pipeline(args.corpus_file_neg,
                                    '../rsrc/de-tiger.map',
                                    verbose=True)
            tweets_tkn, tweets_proc = pipeline.process()
            for tweet_proc in tweets_proc:
                data.append({'tweet': tweet_proc, 'class': is_sarcastic})

    if args.model in ['svm', 'mlp']:
        feature_extractor = FeatureExtractor(features, feature_order)
        data = feature_extractor.extract_features(
            args.corpus_file_pos, args.corpus_file_neg,
            verbose=True)  # extract features from training corpora

    # classifier setup
    classifiers = []
    if args.model == 'svm':
        classifiers.append({
            'name': 'svm_classifier',
            'classifier': SVMClassifier(feature_order, verbose=True)
        })
    elif args.model == 'mlp':
        classifiers.append({
            'name': 'mlp_classifier',
            # the original snippet breaks off here; this entry is assumed to
            # mirror the svm branch above
            'classifier': MultiLayerPerceptronClassifier(feature_order,
                                                         verbose=True)
        })
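
    # (the remainder of this snippet is truncated; presumably each configured
    # classifier is then trained and evaluated on the extracted data, e.g.:
    #     for clf in classifiers:
    #         clf['classifier'].train(data))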

Example #4

import argparse

# The original listing omits its imports; Pipeline, FeatureExtractor,
# MultiLayerPerceptronClassifier and setup_features are project-local.


def main():
    # argument setup, reconstructed from the args.* usages below (the start
    # of this snippet is truncated in the original)
    arg_parser = argparse.ArgumentParser(description='Autosarkasmus Demo')
    arg_parser.add_argument('training_pos_path',
                            help='path to the positive training corpus')
    arg_parser.add_argument('training_neg_path',
                            help='path to the negative training corpus')
    arg_parser.add_argument('tagger_mapping_path',
                            help='path to the tagger mapping')
    arg_parser.add_argument('-f', '--input_file', help='path to input file')
    args = arg_parser.parse_args()

    print('\n - Autosarkasmus Demo -\n')

    # preprocessing pipeline setup
    pipeline = Pipeline(args.training_pos_path, args.tagger_mapping_path)

    # feature setup
    print('setting up features...')
    features, feature_order = setup_features()

    # feature extraction
    print('setting up feature extractor...')
    feature_extractor = FeatureExtractor(features, feature_order)
    tweets_ext = feature_extractor.extract_features(args.training_pos_path,
                                                    args.training_neg_path,
                                                    verbose=True)
    print('extracted features from ' + str(len(tweets_ext)) + ' tweets.')

    # mlp training
    print('training classifier...')
    classifier = MultiLayerPerceptronClassifier(feature_order, verbose=True)
    classifier.train(tweets_ext)
    print('\nready to classify.')

    # classification
    if args.input_file:
        print('preprocessing tweets...')
        pipeline = Pipeline(args.input_file, args.tagger_mapping_path)

Example #5

import json
import re
from random import randint

import tweepy

# The original listing omits the project-local imports for Pipeline,
# FeatureExtractor and MultiLayerPerceptronClassifier (module paths assumed):
# from autosarkasmus.pipeline import Pipeline
# from autosarkasmus.extractor import FeatureExtractor
# from autosarkasmus.classifier import MultiLayerPerceptronClassifier


class AutosarkasmusBot:
    '''
    A bot for the @autosarkasmus twitter account

    Processes sarcasm classification requests and corresponding feedback.
    '''
    def __init__(self, config_path, verbose=False):
        '''
        Constructor of AutosarkasmusBot

        Keyword arguments:
            config_path (str): path to the json configuration file
            verbose (bool): stdout verbosity
        '''
        self.verbose = verbose
        self._load_config(config_path)  # load config from file
        # Twitter API parameters
        self.oauth = tweepy.OAuthHandler(self.CONSUMER_KEY,
                                         self.CONSUMER_SECRET)
        self.oauth.set_access_token(self.ACCESS_KEY, self.ACCESS_SECRET)
        self.twitter_api = tweepy.API(self.oauth)
        # tweet processing
        self.pipeline = Pipeline(self.training_corpus_positive_path,
                                 self.pipeline_tagger_mapping_path)
        self.feature_extractor = FeatureExtractor(self.features,
                                                  self.feature_order)
        self.classifier = MultiLayerPerceptronClassifier(self.feature_order,
                                                         verbose=self.verbose)

    def _load_config(self, config_path):
        '''
        Loads configuration from JSON file

        Keyword arguments:
            config_path (str): path to the json configuration file
        '''
        config_json = {}
        try:
            with open(config_path, 'r', encoding='utf8') as fop:
                config_json = json.load(fop)
        except Exception as ex:
            print('Error: Could not read config file at "' + config_path +
                  '".')
            print(ex)
        # screen name used by the bot
        self.screen_name = config_json.get('SCREEN_NAME', None)
        # pattern that matches an enquiry
        self.enquiry_pattern = config_json.get('ENQUIRY_PATTERN', None)
        # responses to an enquiry
        self.enquiry_responses = config_json.get(
            'ENQUIRY_RESPONSES', {'positive': [], 'negative': []})
        # responses to feedback
        self.feedback_responses = config_json.get(
            'FEEDBACK_RESPONSES', {'positive': [], 'negative': []})
        # Twitter API credentials (consumer key/secret and access key/secret)
        self.CONSUMER_KEY = config_json.get('CONSUMER_KEY', None)
        self.CONSUMER_SECRET = config_json.get('CONSUMER_SECRET', None)
        self.ACCESS_KEY = config_json.get('ACCESS_KEY', None)
        self.ACCESS_SECRET = config_json.get('ACCESS_SECRET', None)
        # path to the tagger mapping file
        self.pipeline_tagger_mapping_path = config_json.get(
            'PIPELINE_TAGGER_MAPPING_PATH', None)
        # paths to the corpora with positive and negative training data
        self.training_corpus_positive_path = config_json.get(
            'TRAINING_CORPUS_POSITIVE_PATH', None)
        self.training_corpus_negative_path = config_json.get(
            'TRAINING_CORPUS_NEGATIVE_PATH', None)
        # path to the bot's history
        self.history_path = config_json.get('HISTORY_PATH', None)
        # load history to memory
        self.history = {}
        try:
            with open(self.history_path, 'r', encoding='utf8') as fop:
                self.history = json.load(fop)
        except Exception as ex:
            print('Error: Could not read history file at "' +
                  str(self.history_path) + '".')
            print(ex)
        # load features to memory
        self.features = {}
        self.feature_order = []
        features_json = config_json.get('FEATURES', [])
        for feature_json in features_json:
            self.feature_order.append(feature_json['key'])
            self.features[feature_json['key']] = feature_json['values']
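
    # An illustrative configuration file covering the keys read above (all
    # values are placeholders, not taken from the original project):
    #
    #     {
    #         "SCREEN_NAME": "autosarkasmus",
    #         "ENQUIRY_PATTERN": "@autosarkasmus",
    #         "ENQUIRY_RESPONSES": {"positive": [], "negative": []},
    #         "FEEDBACK_RESPONSES": {"positive": [], "negative": []},
    #         "CONSUMER_KEY": "", "CONSUMER_SECRET": "",
    #         "ACCESS_KEY": "", "ACCESS_SECRET": "",
    #         "PIPELINE_TAGGER_MAPPING_PATH": "rsrc/de-tiger.map",
    #         "TRAINING_CORPUS_POSITIVE_PATH": "rsrc/tweets-pos.csv",
    #         "TRAINING_CORPUS_NEGATIVE_PATH": "rsrc/tweets-neg.csv",
    #         "HISTORY_PATH": "rsrc/history.json",
    #         "FEATURES": [{"key": "feature-name", "values": []}]
    #     }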

    def train(self):
        '''
        Train the bot on given data
        '''
        tweets_ext = self.feature_extractor.extract_features(
            self.training_corpus_positive_path,
            self.training_corpus_negative_path,
            verbose=self.verbose)  # extract features from training corpora
        if self.verbose:
            print('training classifier...')
        self.classifier.train(tweets_ext)

    def classify_tweet(self, tweet_raw):
        '''
        Classifies a single tweet

        Keyword arguments:
            tweet_raw (str): text of the tweet to classify

        Returns:
            dict: the extracted features of the tweet in addition to class
        '''
        if self.verbose:
            print('classifying tweet: "' + tweet_raw + '"')
        # preprocess the raw tweet
        tweet_tkn, tweet_proc = self.pipeline.process_tweet(tweet_raw)
        if self.verbose:
            print(str(tweet_tkn) + '\n' + str(tweet_proc))
        # extract features from the tweet (sarcasm is True per default)
        tweet_ext = self.feature_extractor.extract_features_from_tweet(
            tweet_tkn, tweet_proc, True)
        del tweet_ext['class']  # delete class since it is only a default value
        if self.verbose:
            # print all features != 0
            print([(feature, tweet_ext[feature])
                   for feature in self.feature_order
                   if tweet_ext.get(feature, 0) != 0])
        # classify the tweet
        tweet_class = self.classifier.classify([tweet_ext])
        if self.verbose:
            print('classified with sarcasm:', tweet_class[0]['class'])
        return tweet_class[0]

    def is_sarcastic_tweet(self, tweet_raw):
        '''
        Identifies sarcasm in a single tweet

        Keyword arguments:
            tweet_raw (str): text of the tweet to classify

        Returns:
            bool: whether the tweet was classified as being sarcastic
        '''
        return self.classify_tweet(tweet_raw)['class']

    def is_valid_enquiry(self, tweet_json):
        '''
        Checks whether the given tweet is a valid enquiry

        Keyword arguments:
            tweet_json (dict): JSON representation of the tweet object

        Returns:
            bool: whether the tweet is a valid enquiry
        '''
        res = False
        # check if tweet matches the enquiry pattern (case-insensitive)
        if re.match(self.enquiry_pattern, tweet_json['text'], re.IGNORECASE):
            # check if tweet wasn't authored by the bot itself
            if tweet_json['user']['screen_name'] != self.screen_name:
                # check for retweeted enquiry
                if not tweet_json['retweeted']:
                    # check if tweet is a reply to anything
                    if tweet_json['in_reply_to_status_id']:
                        res = True
        return res
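
    # Illustrative example: a non-retweet reply such as
    # "@autosarkasmus ist das Sarkasmus?" posted by another user underneath
    # the tweet to be classified would pass all four checks above (the
    # example text is assumed, not taken from the source).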

    def is_valid_feedback(self, tweet_json):
        '''
        Checks whether the given tweet is valid feedback

        Keyword arguments:
            tweet_json (dict): JSON representation of the tweet object

        Returns:
            bool: whether the tweet is valid feedback
        '''
        res = False
        # check if tweet wasn't authored by the bot itself
        if tweet_json['user']['screen_name'] != self.screen_name:
            # check for retweet
            if not tweet_json['retweeted']:
                # check if tweet is reply to a tweet sent by the bot
                if tweet_json['in_reply_to_status_id']:
                    reply_tweet_status = self.twitter_api.get_status(
                        tweet_json['in_reply_to_status_id'])
                    if reply_tweet_status.user.screen_name == self.screen_name:
                        # check if bot tweeted a classification
                        for enquiry_response in (
                                self.enquiry_responses['positive'] +
                                self.enquiry_responses['negative']):
                            if enquiry_response in reply_tweet_status.text:
                                res = True
                                break
        return res

    def gen_enquiry_response(self, recipient, tweet_is_sarcastic):
        '''
        Generate response to an enquiry

        Keyword arguments:
            recipient (str): user handle of the addressee
            tweet_is_sarcastic (bool): whether the tweet was classified as sarcastic

        Returns:
            str: twitter-ready response
        '''
        response = '😓'  # default response
        # pick fitting response at random
        if tweet_is_sarcastic:
            response = self.enquiry_responses['positive'][
                randint(0, len(self.enquiry_responses['positive']) - 1)]
        else:
            response = self.enquiry_responses['negative'][
                randint(0, len(self.enquiry_responses['negative']) - 1)]
        # prepend the recipient
        response = '@' + recipient + ' ' + response + ' Korrekt? (j/n)'
        # trim tweet if necessary
        if len(response) > 140:
            response = response[:137] + '...'
        return response
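
    # Illustrative output (the response text itself is drawn at random from
    # ENQUIRY_RESPONSES in the config):
    #     gen_enquiry_response('alice', True)
    #     -> '@alice <random positive response> Korrekt? (j/n)'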

    def gen_feedback_response(self, recipient, correctly_classified):
        '''
        Generate response to feedback

        Keyword arguments:
            recipient (str): user handle of the addressee
            correctly_classified (bool): whether the tweet was correctly classified

        Returns:
            str: twitter-ready response
        '''
        response = 'Danke! ^^'  # default response
        # pick fitting response at random
        if correctly_classified:
            response = self.feedback_responses['positive'][
                randint(0, len(self.feedback_responses['positive']) - 1)]
        else:
            response = self.feedback_responses['negative'][
                randint(0, len(self.feedback_responses['negative']) - 1)]
        # prepend the recipient
        response = '@' + recipient + ' ' + response
        # trim tweet if necessary
        if len(response) > 140:
            response = response[:137] + '...'
        return response

    def respond(self, tweet_json):
        '''
        Respond to a tweet

        Responses are generated for classification enquiries and feedback to said enquiries

        Keyword arguments:
            tweet_json (dict): JSON representation of the tweet to respond to
        '''
        if self.verbose:
            print('mentioned by @' + tweet_json['user']['screen_name'] +
                  '\n"' + tweet_json['text'] + '"')
        bot_response = None

        # if tweet is a classification enquiry
        if self.is_valid_enquiry(tweet_json):
            # check if the tweet has already been classified
            if tweet_json['in_reply_to_status_id'] not in self.history:
                # get the tweet to be classified (as a tweepy.Status object)
                eval_tweet_status = self.twitter_api.get_status(
                    tweet_json['in_reply_to_status_id'])
                # classify the tweet
                eval_tweet_sarcastic = self.is_sarcastic_tweet(
                    eval_tweet_status.text)
                # generate a response accordingly
                bot_response = self.gen_enquiry_response(
                    tweet_json['user']['screen_name'], eval_tweet_sarcastic)
                # save the tweet and its classification in the history
                self.history[eval_tweet_status.id] = eval_tweet_status._json
                self.history[eval_tweet_status.id][
                    'sarcasm_predicted'] = eval_tweet_sarcastic
                self.save_history()

        elif self.is_valid_feedback(tweet_json):  # if tweet is feedback
            correctly_classified = None
            # analyze feedback
            if re.match(r'.*?\b(j(a|o|ep)?|y(es|o)?)\b.*?', tweet_json['text'],
                        re.IGNORECASE):
                correctly_classified = True
            elif re.match(r'.*?\b(n(e(in)?|o(pe)?)?)\b.*?', tweet_json['text'],
                          re.IGNORECASE):
                correctly_classified = False
            if correctly_classified is not None:  # if feedback could be parsed
                # follow the tweet trail back to the source (feedback -> classification -> enquiry -> classified_tweet)
                class_tweet_status = self.twitter_api.get_status(
                    tweet_json['in_reply_to_status_id'])
                enq_tweet_status = self.twitter_api.get_status(
                    class_tweet_status.in_reply_to_status_id)
                eval_tweet_status = self.twitter_api.get_status(
                    enq_tweet_status.in_reply_to_status_id)
                # save the evaluation
                if 'sarcasm_actual' not in self.history[eval_tweet_status.id]:
                    bot_response = self.gen_feedback_response(
                        tweet_json['user']['screen_name'],
                        correctly_classified)
                    # the actual label is the prediction if the feedback
                    # confirms it, otherwise its negation
                    predicted = self.history[
                        eval_tweet_status.id]['sarcasm_predicted']
                    self.history[eval_tweet_status.id]['sarcasm_actual'] = (
                        predicted == correctly_classified)
                    self.save_history()

        if bot_response:
            # post the response to twitter
            self.twitter_api.update_status(bot_response, tweet_json['id'])
            if self.verbose:
                print('responded with: "' + str(bot_response) + '"')

    def save_history(self):
        '''
        Saves the bot's history to file
        '''
        try:
            with open(self.history_path, 'w', encoding='utf8') as fop:
                json.dump(self.history, fop)
        except Exception as ex:
            print('Error: Could not save history to "' + self.history_path +
                  '"')
            print(ex)
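
# A minimal usage sketch for AutosarkasmusBot (the config path and tweet text
# are placeholders; a config file with the keys read by _load_config and valid
# Twitter credentials are assumed):
#
#     bot = AutosarkasmusBot('config.json', verbose=True)
#     bot.train()
#     print(bot.is_sarcastic_tweet('Na super, schon wieder Montag.'))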

Example #6

	def evaluate(self):
		'''Run the k-fold cross-validation and write the results to the output file'''
		try:
			# Fold counter
			n = 1
			# Create the output file
			f = open(self.outputfile, "w")

			# Initialize the final score accumulators
			accuracy_res = 0
			precision_res = 0
			recall_res = 0
			f1_res = 0

			# Start the evaluation using Xvalidator
			for training_pos, validation_pos, training_neg, validation_neg in Xvalidator(self.pos_set, self.k, self.neg_set).k_fold_cross_validation():

				# Write the split datasets to temporary files for
				# classification (each file is closed before use)
				split_files = {
					"temp_tweets-training-pos-" + str(n): training_pos,
					"temp_tweets-training-neg-" + str(n): training_neg,
					"temp_tweets-validation-pos-" + str(n): validation_pos,
					"temp_tweets-validation-neg-" + str(n): validation_neg
				}
				for path, rows in split_files.items():
					with open(path, "w") as split_file:
						writer = csv.writer(split_file, quoting=csv.QUOTE_NONNUMERIC, lineterminator='\n')
						writer.writerows(rows)

				# Extract the feature list for the configured feature set
				if self.features == "full_featured":
					feature_list, feature_order = full_featured("temp_tweets-training-pos-" + str(n))
				elif self.features == "unigram_featured":
					feature_list, feature_order = unigram_featured("temp_tweets-training-pos-" + str(n))

				feature_extractor = FeatureExtractor(feature_list, feature_order)

				tweets_train_ext = feature_extractor.extract_features("temp_tweets-training-pos-"+str(n), "temp_tweets-training-neg-"+str(n), verbose=True)
				tweets_test_ext = feature_extractor.extract_features("temp_tweets-validation-pos-"+str(n), "temp_tweets-validation-neg-"+str(n), verbose=True)
				# Create lists to store the predicted and true class labels
				y_true = []
				y_pred = []

				# Merge the validation dataset for classification
				validation = validation_pos + validation_neg

				# Buffer for the results that are written to the output file
				line = ""

				line += "+++++++++++++++++++++BEGIN OF FOLD " + str(n) + "++++++++++++++++++++++++" + "\n"
				line += "---------------------REAL LABELS--------------------------------" + "\n"
				for tweet, y in zip(validation, tweets_test_ext):
					y_true.append(y['class'])
					line += "Tweet: " + tweet[2]
					line += "\n"
					line += "Label: " + str(y['class'])
					line += "\n"

				classifier = SVMClassifier(feature_order, 'linear', -1, False)
				classifier.train(tweets_train_ext)
				classification_results = classifier.classify(tweets_test_ext)

				training = training_pos + training_neg
				print("Validation "+str(len(validation)))
				print("Training "+str(len(training)))


				line += "----------------------PREDICTED LABELS--------------------------------" + "\n"
				for tweet, val in zip(validation, classification_results):
					y_pred.append(val['class'])

					line += "Tweet: " + tweet[2]
					line += "\n"
					line += "Label: " + str(val['class'])
					line += "\n"


				# Remove the temporary files
				for path in split_files:
					os.remove(path)

				# Show most significant features
				def show_most_informative_features(features, clf, n=20):
					feature_names = features
					output = "Most significant features: \n"
					coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
					top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
					for (coef_1, fn_1), (coef_2, fn_2) in top:
						output += "\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2) + "\n"
					return output

				# Calculate the current scores
				accuracy = accuracy_score(y_true, y_pred)
				print("Accuracy: "+str(accuracy))
				precision = precision_score(y_true, y_pred, average='binary', pos_label='sarcastic')
				print("Precision: "+str(precision))
				recall = recall_score(y_true, y_pred, average='binary', pos_label='sarcastic')
				print("Recall: "+str(recall))
				f1 = f1_score(y_true, y_pred, average='binary', pos_label='sarcastic')
				print("F1-Score: "+str(f1))


				# Accumulate the scores for the final averages
				accuracy_res += accuracy
				precision_res += precision
				recall_res += recall
				f1_res += f1

				line += "Accuracy: " + str(accuracy) + "\n"
				line += "Precision: " + str(precision) + "\n"
				line += "Recall: " + str(recall) + "\n"
				line += "F1-Score: " + str(f1) + "\n"

				line += show_most_informative_features(feature_order, classifier.svm)
				line += "++++++++++++++++++++++END OF FOLD " + str(n) + "++++++++++++++++++++++++++++++" + "\n"
				f.write(line)

				print(n)
				n += 1

			line = ""
			line += "Accuracy: " + str(accuracy_res/self.k) + "Precision: " + str(precision_res/self.k) + "Recall: " + str(recall_res/self.k) + "F1-Score: " + str(f1_res/self.k)

			f.write(line)
			f.close()

		except IOError as e:
			print("I/O error({0}): {1}".format(e.errno, e.strerror))