import os
import json
import itertools
import numpy
import pandas as pd
from nltk.corpus import movie_reviews
import proximity_tagger

mr = movie_reviews   # short alias used by findTrainingMedian below

def median_approach(llimit,ulimit,isphrase,pathname):

    posmedlist=[]
    negmedlist=[]
    medians=[]

    lpcount=0
    totalcount=ulimit-llimit
    cnt_var=0
    print '\nNo of +ve reviews trained : '
    for fid in movie_reviews.fileids(categories=['pos'])[llimit:ulimit]:
        testmed=proximity_tagger.medianlist(movie_reviews.abspath(fid),isphrase,cnt_var,0,pathname)
        posmedlist.append(testmed)
        lpcount+=1
        cnt_var+=1
        print 'Training +ve review ',lpcount,'.'*10,(float(lpcount)*100/float(totalcount)),'%'

    lpcount=0
    cnt_var=0
    print '\nNo of -ve reviews trained : '
    for fid in movie_reviews.fileids(categories=['neg'])[llimit:ulimit]:
        testmed=proximity_tagger.medianlist(movie_reviews.abspath(fid),isphrase,cnt_var,1,pathname)
        negmedlist.append(testmed)
        lpcount+=1
        cnt_var+=1
        print 'Training -ve review ',lpcount,'.'*10,(float(lpcount)*100/float(totalcount)),'%'

    # column-wise medians across all positive / all negative training reviews
    medians.append([numpy.median(x) for x in itertools.izip(*posmedlist)])
    medians.append([numpy.median(x) for x in itertools.izip(*negmedlist)])

    f = open(os.path.join('train_result','proximity_median_train_result_'+str(isphrase)),'w')
    json.dump(medians,f)
    f.close()
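
# A minimal usage sketch (not part of the original module): classify a new
# review's feature vector by its distance to the dumped class medians. It
# assumes proximity_tagger.medianlist yields a fixed-length numeric vector per
# review; classify_by_median is a hypothetical helper name.
def classify_by_median(feature_vector,isphrase):
    f = open(os.path.join('train_result','proximity_median_train_result_'+str(isphrase)))
    medians = json.load(f)
    f.close()
    # Euclidean distance to the positive (index 0) and negative (index 1) median vectors
    posdist = numpy.linalg.norm(numpy.array(feature_vector)-numpy.array(medians[0]))
    negdist = numpy.linalg.norm(numpy.array(feature_vector)-numpy.array(medians[1]))
    return 'pos' if posdist <= negdist else 'neg'
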
def bins_svm_approach(llimit,ulimit,isphrase,pathname):

    posbinlist=[]
    negbinlist=[]
    trainingdata=[]
    trainingclass=[]
    bin_train_set=[]
    totalcount=ulimit-llimit

    lpcount=0
    cnt_var=0
    print '\nNo of +ve reviews scanned for training : '
    for fid in movie_reviews.fileids(categories=['pos'])[llimit:ulimit]:
        testbin=proximity_tagger.bin_list(movie_reviews.abspath(fid),isphrase,cnt_var,0,pathname)
        posbinlist.append(testbin)
        lpcount+=1
        cnt_var+=1
        print 'Scanning +ve review ',lpcount,'.'*10,(float(lpcount)*100/float(totalcount)),'%'

    lpcount=0
    cnt_var=0
    print '\nNo of -ve reviews scanned for training : '
    for fid in movie_reviews.fileids(categories=['neg'])[llimit:ulimit]:
        testbin=proximity_tagger.bin_list(movie_reviews.abspath(fid),isphrase,cnt_var,1,pathname)
        negbinlist.append(testbin)
        lpcount+=1
        cnt_var+=1
        print 'Scanning -ve review ',lpcount,'.'*10,(float(lpcount)*100/float(totalcount)),'%'


    lpcount=0
    totalcount=len(posbinlist)
    print '\nNo of +ve reviews trained : '
    trainingdata.extend(posbinlist)
    for i in range(totalcount):
        trainingclass.append(1)   # label 1 = positive review
        lpcount+=1
        print 'Training +ve review ',lpcount,'.'*10,(float(lpcount)*100/float(totalcount)),'%'

    lpcount=0
    totalcount=len(negbinlist)
    print '\nNo of -ve reviews trained : '
    trainingdata.extend(negbinlist)
    for i in range(totalcount):
        trainingclass.append(0)   # label 0 = negative review
        lpcount+=1
        print 'Training -ve review ',lpcount,'.'*10,(float(lpcount)*100/float(totalcount)),'%'

    bin_train_set.append(trainingdata)    # feature vectors
    bin_train_set.append(trainingclass)   # matching class labels

    f = open(os.path.join('train_result','proximity_bin_train_result_'+str(isphrase)),'w')
    json.dump(bin_train_set,f)
    f.close()
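
# Hedged sketch: bins_svm_approach only serialises the bin features and labels;
# the SVM itself would presumably be fit in a later step. One plausible version
# using scikit-learn (train_svm_from_bins is a hypothetical helper, and assumes
# every bin list has the same length):
def train_svm_from_bins(isphrase):
    from sklearn.svm import SVC
    f = open(os.path.join('train_result','proximity_bin_train_result_'+str(isphrase)))
    trainingdata,trainingclass = json.load(f)
    f.close()
    clf = SVC(kernel='linear')
    clf.fit(trainingdata,trainingclass)
    return clf
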
def phrase_analysis_call(llimit,ulimit):

    lpcount=0
    totalcount=ulimit-llimit
    testmed=[]
    phrase_medlist=[]
    file_exist=0

    
    if os.path.isfile('phrase_analysis_part_file'):
        fid = open('phrase_analysis_part_file')
        phrase_medlist=json.load(fid)
        fid.close()
        file_exist=1

    print '\nNo of +ve reviews trained : '
    for fid in movie_reviews.fileids(categories=['pos'])[llimit:ulimit]:
        if file_exist:
            phrase_medlist[0].append(proximity_tagger.phrase_analysis(movie_reviews.abspath(fid)))
        else:
            testmed.append(proximity_tagger.phrase_analysis(movie_reviews.abspath(fid)))
        lpcount=lpcount+1
        print 'Training +ve review ',lpcount,'.'*10,(float(lpcount)*100/float(totalcount)),'%'

    if not file_exist:
        phrase_medlist.append(testmed)

    lpcount=0
    testmed=[]
    print '\nNo of -ve reviews trained : '
    for fid in movie_reviews.fileids(categories=['neg'])[llimit:ulimit]:
        if file_exist:
            phrase_medlist[1].append(proximity_tagger.phrase_analysis(movie_reviews.abspath(fid)))
        else:
            testmed.append(proximity_tagger.phrase_analysis(movie_reviews.abspath(fid)))
        lpcount=lpcount+1
        print 'Training -ve review ',lpcount,'.'*10,(float(lpcount)*100/float(totalcount)),'%'

    if not file_exist:
        phrase_medlist.append(testmed)

    fid = open('phrase_analysis_part_file','w')
    json.dump(phrase_medlist,fid)
    fid.close()
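
# Usage sketch: because phrase_analysis_call appends to an existing
# phrase_analysis_part_file, the corpus can be processed incrementally across
# several runs, e.g.:
#
#   phrase_analysis_call(0, 500)      # first chunk of each category
#   phrase_analysis_call(500, 1000)   # next chunk, appended to the part file
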
def imdb_reviews_df():
    pos_texts = parse_text_files(movie_reviews.abspath("pos"))
    neg_texts = parse_text_files(movie_reviews.abspath("neg"))

    pos_df = pd.DataFrame(pos_texts)
    pos_df["label"] = 1   # positive class

    neg_df = pd.DataFrame(neg_texts)
    neg_df["label"] = 0   # negative class

    combined_df = pd.concat([pos_df, neg_df], ignore_index=True)   # reset index so pos/neg rows don't collide
    return combined_df
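
# parse_text_files is not defined in this module; a minimal sketch of what it
# is assumed to do here (read each .txt review under a directory into a dict
# that pd.DataFrame can consume) might look like:
def parse_text_files(dirpath):
    dirpath = str(dirpath)   # nltk's abspath returns a path pointer, not a str
    texts = []
    for name in sorted(os.listdir(dirpath)):
        if name.endswith('.txt'):
            f = open(os.path.join(dirpath, name))
            texts.append({'text': f.read()})
            f.close()
    return texts
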
def findTrainingMedian():
	posMedianList = []
	negMedianList = []
	posFileIdList = mr.fileids(categories = 'pos')[0:50]
	negFileIdList = mr.fileids(categories = 'neg')[0:50]
	i = 1
	print '\nTraining with Positive Reviews....'
	for fid in posFileIdList:
		print 'Review ' + str(i)
		temporaryMedian = findMedian(mr.abspath(fid))
		posMedianList.append(temporaryMedian)
		i += 1
	i = 1
	print '\nTraining with Negative Reviews....'
	for fid in negFileIdList:
		print 'Review ' + str(i)
		temporaryMedian = findMedian(mr.abspath(fid))
		negMedianList.append(temporaryMedian)
		i += 1
	trainingMedianList = [posMedianList, negMedianList]
	return trainingMedianList
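
# Hedged sketch (hypothetical helper, mirroring the style above): classify a
# new review by comparing its median against the overall positive and negative
# training medians, assuming findMedian returns one numeric value per review.
def classifyReview(filePath, trainingMedianList):
	reviewMedian = findMedian(filePath)
	posMedian = numpy.median(trainingMedianList[0])
	negMedian = numpy.median(trainingMedianList[1])
	# the closer class median wins; ties default to positive
	if abs(reviewMedian - posMedian) <= abs(reviewMedian - negMedian):
		return 'pos'
	return 'neg'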