Exemple #1
0
def main():
	"""Build and write a sorted vocabulary histogram for each file in ``f_names``.

	For every file name: load the corpus, form the vocabulary from the first
	``max_items`` elements of the corpus, sort the vocabulary, hand it to
	``write_vocab`` under the name ``'hist_' + fname`` and print a one-line
	summary of the vocabulary size and the (truncated, stripped) corpus size.
	"""
	# Only the leading part of each corpus is analysed; the same cap is used
	# for the vocabulary and for the stripped corpus so the two sizes match.
	max_items = 100000
	for fname in f_names:
		# load corpus with space replaced by underscores, lowercase and all punc removed
		corpus = create_corpus.load_corpus(fname)
		# number of unique words.. vocab is a dict
		vocab = create_corpus.form_vocab(corpus[0:max_items])
		corpus = create_corpus.space_strip(corpus[0:max_items])
		# sorted vocab in the form of a list of elements [(key, freq), ..]
		sort_vocab = sort(vocab)
		# Frequency trimming is currently disabled:
		#freq_vocab = trim(sort_vocab,1)
		#write_vocab(freq_vocab,'freq_hist_1_'+fname)
		write_vocab(sort_vocab,'hist_'+fname)
		# Single parenthesised argument: same output on Python 2, valid on Python 3.
		print(fname+" vocab size : "+ str(len(vocab))+ " corpus size : "+str(len(corpus)))
Exemple #2
0
def main():
	"""Build and write a sorted vocabulary histogram for each file in ``f_names``.

	For every file name: load the corpus, form the vocabulary from the first
	``max_items`` elements of the corpus, sort the vocabulary, hand it to
	``write_vocab`` under the name ``'hist_' + fname`` and print a one-line
	summary of the vocabulary size and the (truncated, stripped) corpus size.
	"""
	# Only the leading part of each corpus is analysed; the same cap is used
	# for the vocabulary and for the stripped corpus so the two sizes match.
	max_items = 100000
	for fname in f_names:
		# load corpus with space replaced by underscores, lowercase and all punc removed
		corpus = create_corpus.load_corpus(fname)
		# number of unique words.. vocab is a dict
		vocab = create_corpus.form_vocab(corpus[0:max_items])
		corpus = create_corpus.space_strip(corpus[0:max_items])
		# sorted vocab in the form of a list of elements [(key, freq), ..]
		sort_vocab = sort(vocab)
		# Frequency trimming and its contribution report are currently disabled:
		#freq_vocab = trim(sort_vocab,1)
		#write_vocab(freq_vocab,'freq_hist_1_'+fname)
		write_vocab(sort_vocab,'hist_'+fname)
		# Single parenthesised argument: same output on Python 2, valid on Python 3.
		print(fname+" vocab size : "+ str(len(vocab))+ " corpus size : "+str(len(corpus)))
		#print "contribution of freq vocab in corpus size : "+ str(contribution(freq_vocab))


if __name__ == "__main__":
	main()