def classify_reviews(testfolder):
    '''Classify each review line of every file in *testfolder*.

    Writes one tab-separated line per review to g4_output.txt:
        <filename>\t<1-based line number>\t<label>
    Title lines (those starting with '[t]') are labelled 0 (neutral)
    without consulting the model.
    '''
    # Load the trained model once, closing the pickle handle promptly
    # (the original left the file object dangling).
    with open('classifier.p', 'rb') as model_file:
        model = pk.load(model_file)
    # 'with' guarantees the output file is closed even if classification fails.
    with open('g4_output.txt', 'w+') as outputf:
        for testfile in os.listdir(testfolder):
            # skip hidden files like '.DS_store' in Mac OS
            if testfile.startswith('.'):
                continue
            testpath = os.path.join(testfolder, testfile)
            linenos, test_reviews = load_test(testpath)
            test_set_pi = extract_features(test_reviews, mode='test')
            test_set_prabha = classifier_prabha.extract_features(test_reviews, mode='test')
            test_set_david = extract_features_david.extract_features_david(test_reviews, mode='test')
            # NOTE(review): feature_words is read from enclosing/module scope here —
            # it is only assigned locally inside train_classifier; verify it is also
            # defined at module level before this runs.
            test_set_jt = classifier_jt.extract_unigram_feature(test_reviews, feature_words, mode='test')
            test_set = combine_sets(test_set_pi, test_set_prabha, test_set_david, test_set_jt, mode='test')
            # enumerate replaces the original hand-maintained counter i.
            for i, each_res in enumerate(test_set):
                # TODO: [t] as netural
                if test_reviews[i].startswith('[t]'):
                    outputf.write(str(testfile)+'\t'+str(i+1)+'\t0\n')
                else:
                    result = model.classify(each_res)
                    outputf.write(str(testfile)+'\t'+str(i+1)+'\t'+str(result)+'\n')
def evaluate_clf(heldout):
    '''Testing the model on the heldout file'''
    products,scores,reviews=load_text_from_file(heldout)

    heldout_set_pi=extract_features(reviews,scores)
    heldout_set_prabha=classifier_prabha.extract_features(reviews,scores)
    heldout_set_david=extract_features_david.extract_features_david(reviews,scores)
    heldout_set_jt=classifier_jt.extract_unigram_feature(reviews,feature_words,scores)

    heldout_set = combine_sets(heldout_set_pi, heldout_set_prabha, heldout_set_david, heldout_set_jt)
    model=pk.load(open('classifier.p','rb'))
    print 'Accuracy for the heldout set: ',nltk.classify.accuracy(model,heldout_set)
    print model.show_most_informative_features(5)
def train_classifier(trainfile):
    '''Train a Naive Bayes classifier on *trainfile* and pickle it.

    Extracts four feature sets from the training reviews, combines them,
    trains nltk.NaiveBayesClassifier, and saves the model to classifier.p.
    '''
    products, scores, reviews = load_text_from_file(trainfile)

    train_set_pi = extract_features(reviews, scores)
    train_set_prabha = classifier_prabha.extract_features(reviews, scores)
    train_set_david = extract_features_david.extract_features_david(reviews, scores)
    # get feature words from review texts
    # NOTE(review): feature_words is local here, but classify_reviews and
    # evaluate_clf read a name of the same spelling from module scope —
    # this assignment does NOT populate that global; verify intent.
    feature_words = classifier_jt.get_feature_words(reviews, scores)
    train_set_jt = classifier_jt.extract_unigram_feature(reviews, feature_words, scores)

    train_set = combine_sets(train_set_pi, train_set_prabha, train_set_david, train_set_jt)

    # Dead SklearnClassifier(LinearSVC()) construction removed: it was never
    # trained or used (its .train call was commented out).
    model = nltk.NaiveBayesClassifier.train(train_set)
    # 'with' ensures the pickle file is flushed and closed (original leaked it).
    with open('classifier.p', 'wb') as model_file:
        pk.dump(model, model_file)