def create_bigrams(training_set, classes_labels):
    """Build per-class bigram statistics as dense count vectors.

    Args:
        training_set: Iterable of samples. ``ts[0]`` is handed to
            ``utility.extract_annotations`` and ``ts[2]`` to
            ``utility.extract_words``.
        classes_labels: Forwarded unchanged to ``create_histograms``.

    Returns:
        ``[histograms, vocabulary]``. For every label of a sample,
        ``histograms[0][label]`` is incremented by one,
        ``histograms[1][label]`` grows by the number of entries in the
        sample's bigram mapping, and ``histograms[2][label]`` is an integer
        NumPy vector (one column per vocabulary entry) that accumulates the
        per-bigram values.

    Raises:
        ValueError: If a bigram of a sample is not part of the vocabulary
            (same exception type ``list.index`` raised before).
    """
    fail_counter = 0
    vocabulary = create_vocabulary(training_set)
    histograms = create_histograms(classes_labels)

    # Resolve vocabulary positions once instead of scanning the list with
    # ``vocabulary.index`` for every bigram. ``setdefault`` keeps the first
    # position of an entry, exactly like ``list.index`` does.
    column_of = {}
    for column, entry in enumerate(vocabulary):
        column_of.setdefault(entry, column)

    for lab in histograms[2]:
        histograms[2][lab] = np.zeros(len(vocabulary), dtype=int)

    for ts in training_set:
        for lab in utility.extract_annotations(ts[0]):
            try:
                histograms[0][lab] += 1
                f_bis = extract_bigrams_from_corpus(utility.extract_words(ts[2]))
                histograms[1][lab] += len(f_bis)
                for bi in f_bis:
                    column = column_of.get(bi)
                    if column is None:
                        # Not a KeyError on purpose: an out-of-vocabulary
                        # bigram stays a hard error and is not counted as
                        # a "failed addition" below.
                        raise ValueError(f"{bi!r} is not in the vocabulary")
                    histograms[2][lab][column] += f_bis[bi]
            except KeyError:
                # Any KeyError raised above (e.g. a label that is missing
                # from the histograms) is tallied rather than propagated.
                fail_counter += 1

    print(f"Failed bigram additions: {fail_counter}")
    return [histograms, vocabulary]
def extract_words_with_count(text):
    """Count how often each extracted word occurs in *text*.

    Args:
        text: Input handed to ``utility.extract_words``.

    Returns:
        A plain ``dict`` mapping every word to its number of occurrences,
        with keys in order of first appearance.
    """
    counts = {}
    for word in utility.extract_words(text):
        counts[word] = counts.get(word, 0) + 1
    return counts
def file_to_dictionary(text):
    """Turn *text* into a word -> occurrence-count dictionary.

    Args:
        text: Input handed to ``utility.extract_words``.

    Returns:
        A plain ``dict`` whose keys are the extracted words (in order of
        first appearance) and whose values are their frequencies.
    """
    # TODO: add stop-words?
    tokens = utility.extract_words(text)
    frequencies = {}
    for token in tokens:
        if token in frequencies:
            frequencies[token] += 1
        else:
            frequencies[token] = 1
    return frequencies
def create_bigrams(training_set, classes_labels):
    """Build per-class bigram statistics as sparse (dict based) counts.

    Args:
        training_set: Iterable of samples. ``ts[0]`` is handed to
            ``utility.extract_annotations`` and ``ts[2]`` to
            ``utility.extract_words``.
        classes_labels: Forwarded unchanged to ``create_histograms``.

    Returns:
        ``[histograms, None]``. For every label of a sample,
        ``histograms[0][label]`` is incremented by one,
        ``histograms[1][label]`` grows by the number of entries in the
        sample's bigram mapping, and ``histograms[2][label][bigram]``
        accumulates the per-bigram values. The second list element is
        always ``None`` (this variant builds no vocabulary).
    """
    fail_counter = 0
    histograms = create_histograms(classes_labels)

    for ts in training_set:
        for lab in utility.extract_annotations(ts[0]):
            try:
                histograms[0][lab] += 1
                f_bis = extract_bigrams_from_corpus(utility.extract_words(ts[2]))
                histograms[1][lab] += len(f_bis)
                for bi in f_bis:
                    try:
                        histograms[2][lab][bi] += f_bis[bi]
                    except KeyError:
                        # First sighting of this bigram for the label: seed
                        # it with the sample's full value. Seeding with a
                        # constant 1 would drop every additional occurrence
                        # found in that first sample.
                        histograms[2][lab][bi] = f_bis[bi]
            except KeyError:
                # Any KeyError that escapes the inner handler (e.g. a label
                # that is missing from the histograms) is tallied rather
                # than propagated.
                fail_counter += 1

    print(f"Failed bigram additions: {fail_counter}")
    return [histograms, None]
def fill_classes_histograms(training_set, classes_file_content):
    """Fill per-class word statistics as sparse (dict based) counts.

    Args:
        training_set: Iterable of samples. ``ts[0]`` is handed to
            ``utility.extract_annotations`` and ``ts[2]`` to
            ``utility.extract_words``.
        classes_file_content: Forwarded unchanged to ``create_histograms``.

    Returns:
        The ``histograms`` object from ``create_histograms`` after filling:
        ``histograms[0][label]`` counts (sample, label) pairs,
        ``histograms[1][label]`` counts every word seen for the label, and
        ``histograms[2][label][word]`` counts occurrences of each word.
    """
    histograms = create_histograms(classes_file_content)
    counter = 0

    for ts in training_set:
        for an in utility.extract_annotations(ts[0]):
            # Use the stripped label for the membership test AND for every
            # lookup. Testing ``an.strip()`` but indexing with the raw
            # ``an`` raises KeyError for a label with surrounding
            # whitespace that passed the test.
            label = an.strip()
            if label not in histograms[0]:
                print(f"Unknown annotation: {an}")
                counter += 1
                continue

            histograms[0][label] += 1
            for w in utility.extract_words(ts[2]):
                try:
                    histograms[2][label][w] += 1
                except KeyError:
                    # First occurrence of this word for the label.
                    histograms[2][label][w] = 1
                histograms[1][label] += 1

    print(f"Unrecognized annotations: {counter}")
    return histograms
def fill_classes_histograms(classes_file_content, training_set, vocabulary):
    """Fill per-class word statistics as dense count vectors.

    Args:
        classes_file_content: Forwarded unchanged to ``create_histograms``.
        training_set: Iterable of samples. ``ts[0]`` is handed to
            ``utility.extract_annotations`` and ``ts[2]`` to
            ``utility.extract_words``.
        vocabulary: Sequence of words; the position of a word is its column
            in the per-class count vectors.

    Returns:
        The ``histograms`` object from ``create_histograms`` after filling:
        ``histograms[0][label]`` counts (sample, label) pairs,
        ``histograms[1][label]`` counts every word seen for the label, and
        ``histograms[2][label]`` is an integer NumPy vector with one column
        per vocabulary entry.

    Raises:
        ValueError: If a word of a sample is not part of ``vocabulary``
            (same exception type ``list.index`` raised before).
    """
    histograms = create_histograms(classes_file_content)
    counter = 0

    # Resolve vocabulary positions once instead of scanning the sequence
    # with ``vocabulary.index`` for every word. ``setdefault`` keeps the
    # first position of a repeated entry, exactly like ``index`` does.
    column_of = {}
    for column, entry in enumerate(vocabulary):
        column_of.setdefault(entry, column)

    for k in histograms[2]:
        histograms[2][k] = np.zeros(len(vocabulary), dtype=int)

    for ts in training_set:
        for an in utility.extract_annotations(ts[0]):
            # NOTE (from the original author): some annotations used to
            # arrive one character too long (0xff at the first index),
            # which neither strip nor replace removed. Solved by reading
            # the input with the utf-8-sig encoding.
            #
            # Use the stripped label for the membership test AND for every
            # lookup. Testing ``an.strip()`` but indexing with the raw
            # ``an`` raises KeyError for a label with surrounding
            # whitespace that passed the test.
            label = an.strip()
            if label not in histograms[0]:
                print(f"Unknown annotation: {an}")
                counter += 1
                continue

            histograms[0][label] += 1
            for w in utility.extract_words(ts[2]):
                column = column_of.get(w)
                if column is None:
                    raise ValueError(f"{w!r} is not in the vocabulary")
                histograms[2][label][column] += 1
                histograms[1][label] += 1

    print(f"Unrecognized annotations: {counter}")
    return histograms
def create_vocabulary(training_set):
    """Collect the distinct words of all samples into a sorted list.

    Args:
        training_set: Iterable of samples; ``sample[2]`` is handed to
            ``utility.extract_words``.

    Returns:
        A sorted ``list`` of the unique words. Sorting keeps the position
        of each word stable so it can be used as an index.
    """
    unique_words = {
        word
        for sample in training_set
        for word in utility.extract_words(sample[2])
    }
    return sorted(unique_words)
def create_vocabulary(training_set):
    """Collect the distinct bigrams of all samples into a sorted list.

    Args:
        training_set: Iterable of samples; ``sample[2]`` is handed to
            ``utility.extract_words`` and the result to
            ``extract_bigrams_from_corpus``.

    Returns:
        A sorted ``list`` of the unique items obtained by iterating each
        sample's ``extract_bigrams_from_corpus`` result.
    """
    seen = set()
    for sample in training_set:
        words = utility.extract_words(sample[2])
        seen |= set(extract_bigrams_from_corpus(words))
    ordered = list(seen)
    ordered.sort()
    return ordered