def vector_of_language(source_file):
    """Build the n-gram probability vector for the text in *source_file*.

    The whole file is read as UTF-8 and handed to ``ngrams.count_ngrams``
    for n = 1, 2 and 3. Each count result is then passed to the matching
    ``ngrams`` probability function.

    Args:
        source_file: Path of the text file to analyse.

    Returns:
        A three-element list
        ``[unigram_probability, bigram_probability, trigram_probability]``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Context manager guarantees the handle is closed, even when the read
    # fails; the previous version leaked the open file object.
    with open(source_file, encoding="utf-8") as opened_file:
        text = opened_file.read()

    unigram_probability = ngrams.probability(ngrams.count_ngrams(text, 1))
    bigram_probability = ngrams.probability_of_bigram(
        ngrams.count_ngrams(text, 2))
    trigram_probability = ngrams.probability_of_trigram(
        ngrams.count_ngrams(text, 3))
    return [unigram_probability, bigram_probability, trigram_probability]
Exemple #2
0
def get_ngrams_rel(filter):
    """Group the text of all data files by reliability level and count n-grams.

    Every regular file in the directory returned by ``get_data_file()`` is
    loaded as JSON, converted to text with ``get_string`` and optionally
    cleaned with ``remove_noise``. The text is appended to the bucket whose
    index is the value ``dl.getReliability(files)`` reports for that file.

    Args:
        filter: When truthy, each file's text is passed through
            ``remove_noise`` before being accumulated. (The name shadows the
            builtin; it is kept so existing callers keep working.)

    Returns:
        A list of ten dicts, one per reliability index 0-9, each with keys
        ``'string'`` (the concatenated text of the bucket), ``'ngrams'``
        (the result of ``ng.count_ngrams`` on that text with
        ``min_length=2, max_length=5``) and ``'rel'`` (the bucket index).
    """
    path = get_data_file() + "/"
    files = [f for f in listdir(path) if isfile(join(path, f))]
    listRel = [dict(string='', ngrams=dict(), rel=level)
               for level in range(10)]
    # Collect fragments per bucket and join once at the end: doing ``+=`` on
    # a string stored in a dict re-copies the accumulated text on every file.
    fragments = [[] for _ in listRel]
    dictRel = dl.getReliability(files)
    for file in files:
        # NOTE(review): the file is opened with the platform default encoding;
        # confirm whether UTF-8 should be forced for these JSON files.
        with open(os.path.join(path, file)) as f:
            data = json.load(f)
        data = get_string(data)
        if filter:
            data = remove_noise(data)
        fragments[dictRel[file]].append(data)
    for d, parts in zip(listRel, fragments):
        d['string'] = ''.join(parts)
        d['ngrams'] = ng.count_ngrams(io.StringIO(d['string']),
                                      min_length=2,
                                      max_length=5)
    return listRel
Exemple #3
0
def get_ngrams_bias(filter):
    """Group the text of all data files by bias level and count n-grams.

    Every regular file in the directory returned by ``get_data_file()`` whose
    bias (as reported by ``dl.getBias(files)``) is non-zero is loaded as
    JSON, converted to text with ``get_string`` and optionally cleaned with
    ``remove_noise``. The text is appended to the bucket at that bias index.

    Args:
        filter: When truthy, each file's text is passed through
            ``remove_noise`` before being accumulated. (The name shadows the
            builtin; it is kept so existing callers keep working.)

    Returns:
        A list of five dicts, one per bias index 0-4, each with keys
        ``'string'`` (the concatenated text of the bucket), ``'ngrams'``
        (the result of ``ng.count_ngrams`` on that text with
        ``min_length=2, max_length=5``) and ``'bias'`` (the bucket index).
        Files with bias 0 are skipped, so bucket 0 always holds an empty
        string.
    """
    path = get_data_file() + "/"
    files = [f for f in listdir(path) if isfile(join(path, f))]
    listBias = [dict(string='', ngrams=dict(), bias=level)
                for level in range(5)]
    # Collect fragments per bucket and join once at the end: doing ``+=`` on
    # a string stored in a dict re-copies the accumulated text on every file.
    fragments = [[] for _ in listBias]
    dictBias = dl.getBias(files)
    for file in files:
        bias = dictBias[file]
        if bias == 0:
            # Decide before touching the disk: bias-0 files are discarded,
            # so reading and parsing them would be wasted work.
            continue
        # NOTE(review): the file is opened with the platform default encoding;
        # confirm whether UTF-8 should be forced for these JSON files.
        with open(os.path.join(path, file)) as f:
            data = json.load(f)
        data = get_string(data)
        if filter:
            data = remove_noise(data)
        fragments[bias].append(data)
    for d, parts in zip(listBias, fragments):
        d['string'] = ''.join(parts)
        d['ngrams'] = ng.count_ngrams(io.StringIO(d['string']),
                                      min_length=2,
                                      max_length=5)
    return listBias
Exemple #4
0
def vector_of_language(source_file):
    """Build the n-gram probability vector for *source_file*, line by line.

    The file is read as UTF-8 one line at a time. For each line the results
    of ``ngrams.count_ngrams`` for n = 1, 2 and 3 are added to running
    totals, and the totals are finally passed to the matching ``ngrams``
    probability functions. N-grams are counted per line, so none span a line
    break.

    Args:
        source_file: Path of the text file to analyse.

    Returns:
        A three-element list
        ``[unigram_probability, bigram_probability, trigram_probability]``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Running totals. They start as Counters, so an empty file hands the
    # probability functions the same mapping type as a non-empty one.
    unigrams = collections.Counter()
    bigrams = collections.Counter()
    trigrams = collections.Counter()

    # Context manager guarantees the handle is closed; the previous version
    # leaked the open file object.
    with open(source_file, encoding="utf-8") as opened_file:
        for line in opened_file:
            # In-place update adds this line's counts to the totals without
            # rebuilding the whole accumulated Counter on every line.
            # dict(...) mirrors the original conversion of the
            # count_ngrams result into a mapping.
            unigrams.update(dict(ngrams.count_ngrams(line, 1)))
            bigrams.update(dict(ngrams.count_ngrams(line, 2)))
            trigrams.update(dict(ngrams.count_ngrams(line, 3)))

    unigram_probability = ngrams.probability(unigrams)
    bigram_probability = ngrams.probability_of_bigram(bigrams)
    trigram_probability = ngrams.probability_of_trigram(trigrams)
    return [unigram_probability, bigram_probability, trigram_probability]