def vector_of_language(source_file):
	"""Build the n-gram probability vector for the text stored in a file.

	The whole file is read as UTF-8 and handed to ``ngrams.count_ngrams``
	for n = 1, 2 and 3. The resulting counts are passed to
	``ngrams.probability``, ``ngrams.probability_of_bigram`` and
	``ngrams.probability_of_trigram`` respectively.

	Args:
		source_file: Path of the text file to read.

	Returns:
		A three-element list ``[unigram, bigram, trigram]`` holding whatever
		the three ``ngrams`` probability helpers return.

	Raises:
		OSError: If the file cannot be opened.
		UnicodeDecodeError: If the file is not valid UTF-8.
	"""
	# The context manager closes the handle even when reading fails; the
	# previous version left the file open for the garbage collector.
	with open(source_file, encoding="utf-8") as opened_file:
		text = opened_file.read()

	unigram_probability = ngrams.probability(ngrams.count_ngrams(text, 1))
	bigram_probability = ngrams.probability_of_bigram(ngrams.count_ngrams(text, 2))
	trigram_probability = ngrams.probability_of_trigram(ngrams.count_ngrams(text, 3))
	return [unigram_probability, bigram_probability, trigram_probability]
# Example #2
# 0
def vector_of_language(source_file):
	"""Build the n-gram probability vector for a file, one line at a time.

	Each line of the UTF-8 file is passed to ``ngrams.count_ngrams`` for
	n = 1, 2 and 3 and the per-line counts are added into running totals.
	N-grams are therefore counted within a line only, never across a line
	break. The totals are then passed to ``ngrams.probability``,
	``ngrams.probability_of_bigram`` and ``ngrams.probability_of_trigram``.

	Args:
		source_file: Path of the text file to read.

	Returns:
		A three-element list ``[unigram, bigram, trigram]`` holding whatever
		the three ``ngrams`` probability helpers return.

	Raises:
		OSError: If the file cannot be opened.
		UnicodeDecodeError: If the file is not valid UTF-8.
	"""
	unigrams = collections.Counter()
	bigrams = collections.Counter()
	trigrams = collections.Counter()

	# The context manager guarantees the handle is closed; the previous
	# version never closed it.
	with open(source_file, encoding="utf-8") as opened_file:
		for line in opened_file:
			# update() adds counts in place, instead of rebuilding the whole
			# accumulated table for every line as the old sum() of Counters
			# did. dict() keeps the original conversion of the helper's
			# result into a mapping.
			# NOTE(review): assumes count_ngrams yields positive counts --
			# Counter addition silently dropped non-positive totals.
			unigrams.update(dict(ngrams.count_ngrams(line, 1)))
			bigrams.update(dict(ngrams.count_ngrams(line, 2)))
			trigrams.update(dict(ngrams.count_ngrams(line, 3)))

	unigram_probability = ngrams.probability(unigrams)
	bigram_probability = ngrams.probability_of_bigram(bigrams)
	trigram_probability = ngrams.probability_of_trigram(trigrams)
	return [unigram_probability, bigram_probability, trigram_probability]