Ejemplo n.º 1
0
def main():
	"""Write a frequency-sorted vocabulary histogram for each corpus file.

	Iterates over ``f_names``, which is not defined in this function and is
	expected to be provided at module level. For every file name it loads
	the corpus, builds the vocabulary from the leading part of the corpus,
	writes the sorted vocabulary to ``'hist_' + fname`` and prints the
	vocabulary and corpus sizes.
	"""
	# Only the first `limit` items of each corpus are analysed.
	limit = 100000
	for fname in f_names:
		# load corpus with spaces replaced by underscores, lowercased and all punctuation removed
		corpus = create_corpus.load_corpus(fname)
		# unique words of the truncated corpus; vocab is a dict
		vocab = create_corpus.form_vocab(corpus[0:limit])
		corpus = create_corpus.space_strip(corpus[0:limit])
		# sorted vocab as a list of elements [(key, freq), ..]
		sort_vocab = sort(vocab)
		# NOTE: the frequency-trimmed histogram is currently disabled:
		#freq_vocab = trim(sort_vocab,1)
		#write_vocab(freq_vocab,'freq_hist_1_'+fname)
		write_vocab(sort_vocab, 'hist_' + fname)
		# A single parenthesised argument prints the same text on Python 2
		# (print statement) and Python 3 (print function).
		summary = fname + " vocab size : " + str(len(vocab)) + " corpus size : " + str(len(corpus))
		print(summary)
Ejemplo n.º 2
0
def main():
	"""Compare each output vocabulary with its input corpus vocabulary.

	For every hard-coded output file name: load the matching input corpus
	(the file name without its first 7 characters), build the input
	vocabulary, read the output vocabulary via ``output_vocab``, sort it,
	pass both to ``error`` and write the result to ``'hist_' + fname``.
	Finally print the average word length of the output vocabulary.
	"""
	f_names = ['output_brown_religion.txt','output_coffee.txt']
	# Number of leading characters dropped to get the input corpus name;
	# 7 is the length of the 'output_' prefix carried by both names above.
	prefix_len = 7
	for fname in f_names:
		# load corpus with spaces replaced by underscores, lowercased and all punctuation removed
		corpus = create_corpus.load_corpus(fname[prefix_len:])
		# unique words of the input corpus; vocab is a dict
		i_vocab = create_corpus.form_vocab(corpus)
		vocab = output_vocab(fname)
		sort_vocab = plot_histogram.sort(vocab)
		err_vocab = error(i_vocab, sort_vocab)
		plot_histogram.write_vocab(err_vocab, 'hist_' + fname)
		# Build one string so the output is the same on Python 2 and 3; the
		# '%s %s' separator reproduces the space the two-item print statement
		# inserted between its items.
		print('%s %s' % ('Average word length is ', avg_word_length(vocab)))
Ejemplo n.º 3
0
def main():
	"""Write a frequency-sorted vocabulary histogram for each corpus file.

	Iterates over ``f_names``, which is not defined in this function and is
	expected to be provided at module level. For every file name it loads
	the corpus, builds the vocabulary from the leading part of the corpus,
	writes the sorted vocabulary to ``'hist_' + fname`` and prints the
	vocabulary and corpus sizes.
	"""
	# Only the first `limit` items of each corpus are analysed.
	limit = 100000
	for fname in f_names:
		# load corpus with spaces replaced by underscores, lowercased and all punctuation removed
		corpus = create_corpus.load_corpus(fname)
		# unique words of the truncated corpus; vocab is a dict
		vocab = create_corpus.form_vocab(corpus[0:limit])
		corpus = create_corpus.space_strip(corpus[0:limit])
		# sorted vocab as a list of elements [(key, freq), ..]
		sort_vocab = sort(vocab)
		# NOTE: the frequency-trimmed histogram and its contribution report
		# are currently disabled:
		#freq_vocab = trim(sort_vocab,1)
		#write_vocab(freq_vocab,'freq_hist_1_'+fname)
		write_vocab(sort_vocab, 'hist_' + fname)
		# A single parenthesised argument prints the same text on Python 2
		# (print statement) and Python 3 (print function).
		summary = fname + " vocab size : " + str(len(vocab)) + " corpus size : " + str(len(corpus))
		print(summary)
		#print "contribution of freq vocab in corpus size : "+ str(contribution(freq_vocab))

if __name__ ==  "__main__":
	main()