def get_vocab_counts_from_csv(self):
    """Accumulate class counts and vocabulary data from ``self.input_file``.

    The file is read as tab-delimited CSV and its first row is discarded.
    For every remaining non-blank row, the first column is split on its
    first comma into a review type (left part) and review content (right
    part).  For each such row this method:

    * increments ``self.total_sentence_count``;
    * calls ``self.update_review_number_for_review_type(review_type)``;
    * increments the per-type counter in ``self.class_counts_hash``;
    * runs ``preprocessor.pre_process_sentence`` on the content and hands
      the result to ``self.construct_vocabulary_hash`` together with the
      review type.

    Returns:
        None.  All results are stored on ``self``.

    Raises:
        IndexError: if a non-blank row's first column contains no comma,
            so no review content can be separated from the review type.
    """
    # NOTE: "rb" is the mode the Python 2 csv module expects; on Python 3
    # csv.reader needs a text-mode file instead.
    with open(self.input_file, "rb") as csvfile:
        review_reader = csv.reader(csvfile, delimiter="\t", quotechar='"')

        # Discard the first row.  The default argument keeps a completely
        # empty file from raising StopIteration.
        next(review_reader, None)

        for row in review_reader:
            # csv.reader yields [] for blank lines; there is nothing to count.
            if not row:
                continue

            rows = row[0].split(",", 1)
            review_type = str(rows[0])
            # Read the content before touching any counters so that a
            # malformed row fails without leaving partially updated state.
            review_content = rows[1]

            self.total_sentence_count += 1

            # Organize the reviews based on the review type.
            self.update_review_number_for_review_type(review_type)

            # Update the class counts.
            self.class_counts_hash[review_type] = (
                self.class_counts_hash.get(review_type, 0) + 1
            )

            # POS tagging of the raw tokens is deprecated and no longer done.
            final_tokens = preprocessor.pre_process_sentence(review_content)
            self.construct_vocabulary_hash(review_type, final_tokens)
import naive_bayes_classifier as nb


def _load_pickle(path):
    """Unpickle and return the object stored at *path*, closing the file."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Usage: <script> <model_file> <test_file>
(script_name, model_file, test_file) = argv

# NOTE(security): unpickling can execute arbitrary code.  model_file comes
# from the command line, so only run this against model files you trust.
prior_word_conditional_prob_hash = _load_pickle(model_file)
feature_rank_hash = _load_pickle("feature_ranks.p")
prior_probabilities = _load_pickle("prior_class_probabilities.p")

# The word lists are the same for every row, so they are loaded once here
# rather than once per row.
# NOTE(review): assumes nb.naive_bayes_classifier does not mutate them -- confirm.
negative_words = _load_pickle("negative_words.p")
positive_words = _load_pickle("positive_words.p")

# NOTE: "rb" is the mode the Python 2 csv module expects.
with open(test_file, 'rb') as csvfile:
    review_reader = csv.reader(csvfile, delimiter='\t', quotechar='"')
    for count, row in enumerate(review_reader):
        if count == 0:
            # The first input row is skipped; emit the output header in its place.
            print("Id,Category")
            continue

        # csv.reader yields [] for blank lines; there is nothing to classify.
        if not row:
            continue

        review_content = row[0]
        words = preprocessor.pre_process_sentence(review_content)
        class_data = nb.naive_bayes_classifier(
            prior_probabilities,
            prior_word_conditional_prob_hash,
            words,
            positive_words,
            negative_words,
        )
        # Output the first key of the classifier result, then the review text.
        print(list(class_data.keys())[0] + "," + review_content)