# NOTE(review): this chunk begins mid-function — the `def` that owns the three
# smoothing lines below (and that defines ALPHA, vocabulary_size, count and
# total) is not visible in this file; indentation reconstructed from context.
    # Additive (Laplace) smoothing: add ALPHA to the word count and ALPHA per
    # vocabulary entry to the denominator, then return the smoothed probability.
    count += ALPHA
    total += vocabulary_size*ALPHA
    #End Smoothing
    return float(count)/total


def total(bag):
    # type: (dict) -> int
    """Return the total number of word tokens in a bag (sum of its counts)."""
    total = 0
    #return the total count of words in a bag
    for value in bag.values():
        total += value
    return total;


####################################
#Step 1: Get the distinct tokens in the training data and the total number of tokens a.k.a bag-of-words
wordTokenizer = WordTokenizer()
# Map: file name -> list of tokens for every document under TRAINING_DATA_DIRECTORY.
words_in_training = wordTokenizer.tokenizeDirectoryAsDictionary(TRAINING_DATA_DIRECTORY)
# Map: file name -> bag-of-words (word -> occurrence count) for that document.
training_document_bag_dictionary = {}
for key, token_list in words_in_training.iteritems():  # Python 2 dict API
    bag_of_words = defaultdict(int)
    for word in token_list:
        bag_of_words[word]+=1
    training_document_bag_dictionary[key] = bag_of_words
# Sort by file name so iteration order is deterministic.
training_document_bag_dictionary = OrderedDict(sorted(training_document_bag_dictionary.items(), key=lambda t: t[0]))
#print training_document_bag_dictionary
# Per-class document bags — presumably filled in later from a key file
# (the code that populates them is not visible in this chunk).
positive_training_document_bag_dictionary = {}
negative_training_document_bag_dictionary = {}
objective_training_document_bag_dictionary = {}
#######Constants########### POSITIVE_POLARITY = "positive" NEGATIVE_POLARITY = "negative" OBJECTIVE_POLARITY = "neutral" POSITIVE_POLARITY_FOR_SCORER = "POS" NEGATIVE_POLARITY_FOR_SCORER = "NEG" OBJECTIVE_POLARITY_FOR_SCORER = "OBJ" GENERATED_FILES_DIRECTORY = "../../generated_files/" tuning_parameter = 6 #####End of Constants##### classificationSentimentDictionary = SentimentParser().wordsClassification() #print classificationSentimentDictionary tokenizer = WordTokenizer() fileDictionary = tokenizer.tokenizeDirectoryAsDictionary("dev") fileDictionary = OrderedDict(sorted(fileDictionary.items(), key=lambda t: t[0])) fileClassDictionary = {} for key, value_as_list in fileDictionary.iteritems(): positive = 0 negative = 0 for word in value_as_list: if word in classificationSentimentDictionary: word_sense_list = classificationSentimentDictionary[word] word_sense = word_sense_list[random.randint(0, len(word_sense_list)-1)] if word_sense == POSITIVE_POLARITY: positive+=1 elif word_sense == NEGATIVE_POLARITY: negative+=1 abstotal = abs(positive - negative)
def minmaxSmoothing(power):
    """Naive Bayes sentiment classification with additive smoothing.

    Trains class-conditional bag-of-words counts from TRAINING_DATA_DIRECTORY
    using the gold labels in TRAIN_KEY_FILE, then classifies every document in
    DEVELOPMENT_DATA_DIRECTORY as POS / NEG / OBJ with smoothing constant
    alpha = 10**power.  For each development document one tab-separated line
    "<label>\\t<log-score>" is written to
    GENERATED_FILES_DIRECTORY + "smoothing_response_MIN_MAX" + str(power).
    """
    alpha = math.pow(10, power)
    #Step 1: Get the distinct tokens in the training data and the total number of tokens a.k.a bag-of-words
    wordTokenizer = WordTokenizer()
    # Map: file name -> token list for every training document.
    words_in_training = wordTokenizer.tokenizeDirectoryAsDictionary(TRAINING_DATA_DIRECTORY)
    # Map: file name -> bag-of-words (word -> occurrence count).
    training_document_bag_dictionary = {}
    for key, token_list in words_in_training.iteritems():  # Python 2 dict API
        bag_of_words = defaultdict(int)
        for word in token_list:
            bag_of_words[word]+=1
        training_document_bag_dictionary[key] = bag_of_words
    # Sort by file name for deterministic iteration order.
    training_document_bag_dictionary = OrderedDict(sorted(training_document_bag_dictionary.items(), key=lambda t: t[0]))
    #print training_document_bag_dictionary
    # Split the per-document bags by gold label, tracking the total token
    # mass of each class as we go.
    positive_training_document_bag_dictionary = {}
    negative_training_document_bag_dictionary = {}
    objective_training_document_bag_dictionary = {}
    total_positive = 0
    total_negative = 0
    total_objective = 0
    # Key-file format (per line): "<fileName> <label>" where label is POS/NEG/OBJ.
    for line in fileinput.input([TRAIN_KEY_FILE]):
        temp_line = line.split(" ")
        fileName = temp_line[0]
        sense = temp_line[1].strip()
        fileName = "../../"+fileName #Make it into a suitable format
        if sense == POSITIVE_POLARITY_FOR_SCORER: #document is a positive one
            #Add the fileName -> bag-of-words(fileName) to a dict
            positive_training_document_bag_dictionary[fileName] = training_document_bag_dictionary[fileName]
            total_positive += total(training_document_bag_dictionary[fileName])
        elif sense == NEGATIVE_POLARITY_FOR_SCORER:
            negative_training_document_bag_dictionary[fileName] = training_document_bag_dictionary[fileName]
            total_negative += total(training_document_bag_dictionary[fileName])
        elif sense == OBJECTIVE_POLARITY_FOR_SCORER:
            objective_training_document_bag_dictionary[fileName] = training_document_bag_dictionary[fileName]
            total_objective += total(training_document_bag_dictionary[fileName])
    # Class priors P(C) and their logs.
    training_total_documents = len(training_document_bag_dictionary)
    training_total_positive_documents = len(positive_training_document_bag_dictionary)
    training_total_negative_documents = len(negative_training_document_bag_dictionary)
    training_total_objective_documents = len(objective_training_document_bag_dictionary)
    probability_positive = float(training_total_positive_documents)/ training_total_documents
    probability_negative = float(training_total_negative_documents)/ training_total_documents
    probability_objective = float(training_total_objective_documents)/ training_total_documents
    log_positive = math.log(probability_positive)
    log_negative = math.log(probability_negative)
    log_objective = math.log(probability_objective)
    # Map: file name -> token list for the development set, name-sorted.
    words_in_development = wordTokenizer.tokenizeDirectoryAsDictionary(DEVELOPMENT_DATA_DIRECTORY)
    words_in_development = OrderedDict(sorted(words_in_development.items(), key=lambda t: t[0]))
    # NOTE(review): the same combined denominator (tokens of ALL three classes)
    # is passed to every per-class probability call below — confirm this is
    # intentional rather than the per-class total.
    total_count = total_positive + total_negative + total_objective
    responseFile = GENERATED_FILES_DIRECTORY+"smoothing_response_MIN_MAX"+str(power)
    f = open(responseFile, "w")
    for key, token_list in words_in_development.iteritems():  # Python 2 dict API
        sentiment = ""
        # Per-class log-score accumulators for this document.
        p = 0
        n = 0
        o = 0
        #Step 2: P(POS|Document) P(NEG|Document} P(OBJ|Document)
        for word in token_list:
            #Step 2.1
            #P(k| POS) P(k| NEG) P(k| OBJ) for each word in this document
            # NEG_INFINITY guards math.log against a zero probability.
            probability_word_positive_documents = probabilityOfWordInSeveralDocumentsPutTogether(word, positive_training_document_bag_dictionary, total_count, alpha)
            log_probability_positive = math.log(probability_word_positive_documents) if probability_word_positive_documents > 0 else NEG_INFINITY
            p+=log_probability_positive
            probability_word_negative_documents = probabilityOfWordInSeveralDocumentsPutTogether(word, negative_training_document_bag_dictionary, total_count, alpha)
            log_probability_negative = math.log(probability_word_negative_documents) if probability_word_negative_documents > 0 else NEG_INFINITY
            n+=log_probability_negative
            probability_word_objective_documents = probabilityOfWordInSeveralDocumentsPutTogether(word, objective_training_document_bag_dictionary, total_count, alpha)
            log_probability_objective = math.log(probability_word_objective_documents) if probability_word_objective_documents > 0 else NEG_INFINITY
            o+=log_probability_objective
        #Step 2.2: P(C|Document) = P(C) + all the P(k|C) --> Here in log form
        p+=log_positive
        n+=log_negative
        o+=log_objective
        # Pick the class with the highest log-score; theta is that score.
        if p > n:
            if p > o:
                sentiment = POSITIVE_POLARITY_FOR_SCORER
                theta = p
            else:
                sentiment = OBJECTIVE_POLARITY_FOR_SCORER
                theta = o
        else:
            if n > o:
                sentiment = NEGATIVE_POLARITY_FOR_SCORER
                theta = n
            else:
                sentiment = OBJECTIVE_POLARITY_FOR_SCORER
                theta = o
        print "Document Name: "+key+" is classified as: "+sentiment
        f.write(sentiment+"\t"+str(theta)+"\n")
    f.close()
# NOTE(review): this chunk begins mid-function — the `def` that owns the three
# smoothing lines below (and that defines ALPHA, vocabulary_size, count and
# total) is not visible in this file; indentation reconstructed from context.
    # Additive (Laplace) smoothing: add ALPHA to the word count and ALPHA per
    # vocabulary entry to the denominator, then return the smoothed probability.
    count += ALPHA
    total += vocabulary_size*ALPHA
    #End Smoothing
    return float(count)/total


def total(bag):
    # type: (dict) -> int
    """Return the total number of word tokens in a bag (sum of its counts)."""
    total = 0
    #return the total count of words in a bag
    for value in bag.values():
        total += value
    return total;


####################################
#Step 1: Get the distinct tokens in the training data and the total number of tokens a.k.a bag-of-words
wordTokenizer = WordTokenizer()
# Map: file name -> token list for every training document, name-sorted for
# deterministic iteration order.
words_in_training = wordTokenizer.tokenizeDirectoryAsDictionary(TRAINING_DATA_DIRECTORY)
words_in_training = OrderedDict(sorted(words_in_training.items(), key=lambda t: t[0]))
# Map: file name -> bag-of-words (word -> occurrence count) for that document.
training_document_bag_dictionary = {}
for key, token_list in words_in_training.iteritems():  # Python 2 dict API
    bag_of_words = defaultdict(int)
    for word in token_list:
        bag_of_words[word]+=1
    training_document_bag_dictionary[key] = bag_of_words
training_document_bag_dictionary = OrderedDict(sorted(training_document_bag_dictionary.items(), key=lambda t: t[0]))
#print training_document_bag_dictionary
# Per-class document bags — presumably filled in later from a key file; this
# chunk is cut off after the negative dictionary (no objective dict visible).
positive_training_document_bag_dictionary = {}
negative_training_document_bag_dictionary = {}
#######Constants########### POSITIVE_POLARITY = "positive" NEGATIVE_POLARITY = "negative" OBJECTIVE_POLARITY = "neutral" POSITIVE_POLARITY_FOR_SCORER = "POS" NEGATIVE_POLARITY_FOR_SCORER = "NEG" OBJECTIVE_POLARITY_FOR_SCORER = "OBJ" GENERATED_FILES_DIRECTORY = "../../generated_files/" tuning_parameter = 10 #####End of Constants##### classificationSentimentDictionary = SentimentParser().wordsClassification() #print classificationSentimentDictionary tokenizer = WordTokenizer() fileDictionary = tokenizer.tokenizeDirectoryAsDictionary("train") fileDictionary = OrderedDict(sorted(fileDictionary.items(), key=lambda t: t[0])) fileClassDictionary = {} for key, value_as_list in fileDictionary.iteritems(): positive = 0 negative = 0 for word in value_as_list: if word in classificationSentimentDictionary: word_sense_list = classificationSentimentDictionary[word] word_sense = word_sense_list[random.randint(0, len(word_sense_list)-1)] if word_sense == POSITIVE_POLARITY: positive+=1 elif word_sense == NEGATIVE_POLARITY: negative+=1 abstotal = abs(positive - negative)