Esempio n. 1
0
def create_bigrams(training_set, classes_labels):
	"""Build per-class bigram count vectors over a shared vocabulary.

	Each training sample ``ts`` carries annotations in ``ts[0]`` and raw
	text in ``ts[2]`` (see the other helpers in this file).

	Returns ``[histograms, vocabulary]`` where ``histograms`` is the
	structure from ``create_histograms``: index 0 counts samples per
	label, index 1 counts total bigrams per label, index 2 maps each
	label to a count vector aligned with ``vocabulary``.
	"""
	fail_counter = 0
	vocabulary = create_vocabulary(training_set)
	histograms = create_histograms(classes_labels)
	# Replace the per-label histogram dicts with fixed-size count vectors
	# indexed by vocabulary position.
	for lab in histograms[2]:
		histograms[2][lab] = np.zeros(len(vocabulary), dtype=int)
	for ts in training_set:
		for lab in utility.extract_annotations(ts[0]):
			try:
				histograms[0][lab] += 1
				f_bis = extract_bigrams_from_corpus(utility.extract_words(ts[2]))
				histograms[1][lab] += len(f_bis)
				for bi in f_bis:
					# NOTE(review): vocabulary.index raises ValueError (not
					# KeyError) for an unseen entry; this assumes the in-scope
					# create_vocabulary yields bigram keys -- TODO confirm.
					histograms[2][lab][vocabulary.index(bi)] += f_bis[bi]
			except KeyError:
				# Unknown label key in the histograms: skip this pair.
				fail_counter += 1
	print(f"Failed bigram additions: {fail_counter}")
	return [histograms, vocabulary]
Esempio n. 2
0
def extract_words_with_count(text):
    """Return a dict mapping each word extracted from *text* to its count."""
    counts = {}
    for word in utility.extract_words(text):
        counts[word] = counts.get(word, 0) + 1
    return counts
Esempio n. 3
0
def file_to_dictionary(text):
	"""Build a word -> occurrence-count dict from *text*."""
	# add stop-words?  (open question kept from the original)
	counts = {}
	for word in utility.extract_words(text):
		counts[word] = counts.get(word, 0) + 1
	return counts
Esempio n. 4
0
def create_bigrams(training_set, classes_labels):
	"""Build per-class bigram histograms keyed directly by bigram.

	Unlike the vocabulary-vector variant, histograms[2][label] stays a
	dict mapping bigram -> count.  Returns ``[histograms, None]`` (the
	second slot mirrors the other variant's vocabulary, unused here).
	"""
	fail_counter = 0
	histograms = create_histograms(classes_labels)
	for ts in training_set:
		for lab in utility.extract_annotations(ts[0]):
			try:
				histograms[0][lab] += 1
				f_bis = extract_bigrams_from_corpus(utility.extract_words(ts[2]))
				histograms[1][lab] += len(f_bis)
				for bi in f_bis:
					# BUG FIX: on the first sighting of a bigram the original
					# stored 1 instead of its full count f_bis[bi], silently
					# losing counts for bigrams seen more than once per text.
					histograms[2][lab][bi] = histograms[2][lab].get(bi, 0) + f_bis[bi]
			except KeyError:
				# Unknown label key in the histograms: skip this pair.
				fail_counter += 1
	print(f"Failed bigram additions: {fail_counter}")
	return [histograms, None]
Esempio n. 5
0
def fill_classes_histograms(training_set, classes_file_content):
    """Fill per-class word histograms from *training_set*.

    histograms[0]: samples per annotation; histograms[1]: total word
    count per annotation; histograms[2]: per-annotation word->count dict.
    Unknown annotations are reported and counted, not raised.
    """
    histograms = create_histograms(classes_file_content)
    counter = 0
    for ts in training_set:
        for an in utility.extract_annotations(ts[0]):
            # BUG FIX: the membership test used an.strip() while every
            # update used the raw key, so a whitespace-padded annotation
            # passed the check and then raised an uncaught KeyError.
            # Normalize once so test and updates agree.
            an = an.strip()
            if an in histograms[0]:
                histograms[0][an] += 1
                for w in utility.extract_words(ts[2]):
                    try:
                        histograms[2][an][w] += 1
                    except KeyError:
                        histograms[2][an][w] = 1
                    histograms[1][an] += 1
            else:
                print(f"Unknown annotation: {an}")
                counter += 1
    print(f"Unrecognized annotations: {counter}")
    return histograms
def fill_classes_histograms(classes_file_content, training_set, vocabulary):
    """Fill per-class word-count vectors aligned with *vocabulary*.

    histograms[0]: samples per annotation; histograms[1]: total word
    count per annotation; histograms[2]: per-annotation numpy count
    vector indexed by vocabulary position.
    """
    histograms = create_histograms(classes_file_content)
    counter = 0
    # Replace the per-class dicts with fixed-size count vectors indexed
    # by vocabulary position.
    for k in histograms[2]:
        histograms[2][k] = np.zeros(len(vocabulary), dtype=int)
    for ts in training_set:
        for an in utility.extract_annotations(ts[0]):
            # SOLVED: some annotations were 4 chars long because of a BOM
            # (0xff at the first index) -- the files must be read with the
            # utf-8-sig encoding.
            # BUG FIX: membership was tested with an.strip() but the raw
            # key was used for updates, raising an uncaught KeyError for
            # padded annotations. Normalize once so both agree.
            an = an.strip()
            if an in histograms[0]:
                histograms[0][an] += 1
                for w in utility.extract_words(ts[2]):
                    histograms[2][an][vocabulary.index(w)] += 1
                    histograms[1][an] += 1
            else:
                print(f"Unknown annotation: {an}")
                counter += 1
    print(f"Unrecognized annotations: {counter}")
    return histograms
def create_vocabulary(training_set):
    """Collect every word in the corpus, sorted so indexes stay consistent."""
    words = set()
    for sample in training_set:
        words |= set(utility.extract_words(sample[2]))
    # sorted() yields a stable ordering, so word -> index positions are
    # reproducible across runs.
    return sorted(words)
Esempio n. 8
0
def create_vocabulary(training_set):
	"""Collect every bigram in the corpus, sorted so indexes stay consistent."""
	bigrams = set()
	for sample in training_set:
		words = utility.extract_words(sample[2])
		bigrams |= set(extract_bigrams_from_corpus(words))
	return sorted(bigrams)