Esempio n. 1
0
def create_bigrams(training_set, classes_labels):
	"""Build per-class bigram count vectors over a shared vocabulary.

	Each training sample ``ts`` carries annotations in ``ts[0]`` and raw
	text in ``ts[2]`` (see the other helpers in this file).

	Returns ``[histograms, vocabulary]`` where ``histograms`` is the
	structure from ``create_histograms``: index 0 counts samples per
	label, index 1 counts total bigrams per label, index 2 maps each
	label to a count vector aligned with ``vocabulary``.
	"""
	fail_counter = 0
	vocabulary = create_vocabulary(training_set)
	histograms = create_histograms(classes_labels)
	# Replace the per-label histogram dicts with fixed-size count vectors
	# indexed by vocabulary position.
	for lab in histograms[2]:
		histograms[2][lab] = np.zeros(len(vocabulary), dtype=int)
	for ts in training_set:
		for lab in utility.extract_annotations(ts[0]):
			try:
				histograms[0][lab] += 1
				f_bis = extract_bigrams_from_corpus(utility.extract_words(ts[2]))
				histograms[1][lab] += len(f_bis)
				for bi in f_bis:
					# NOTE(review): vocabulary.index raises ValueError (not
					# KeyError) for an unseen entry; this assumes the in-scope
					# create_vocabulary yields bigram keys -- TODO confirm.
					histograms[2][lab][vocabulary.index(bi)] += f_bis[bi]
			except KeyError:
				# Unknown label key in the histograms: skip this pair.
				fail_counter += 1
	print(f"Failed bigram additions: {fail_counter}")
	return [histograms, vocabulary]
Esempio n. 2
0
def extract_words_with_count(text):
    """Return a dict mapping each word extracted from *text* to its count."""
    counts = {}
    for word in utility.extract_words(text):
        counts[word] = counts.get(word, 0) + 1
    return counts
Esempio n. 3
0
def file_to_dictionary(text):
	"""Build a word -> occurrence-count dict from *text*."""
	# add stop-words?  (open question kept from the original)
	counts = {}
	for word in utility.extract_words(text):
		counts[word] = counts.get(word, 0) + 1
	return counts
Esempio n. 4
0
def create_bigrams(training_set, classes_labels):
	"""Build per-class bigram histograms keyed directly by bigram.

	Unlike the vocabulary-vector variant, histograms[2][label] stays a
	dict mapping bigram -> count.  Returns ``[histograms, None]`` (the
	second slot mirrors the other variant's vocabulary, unused here).
	"""
	fail_counter = 0
	histograms = create_histograms(classes_labels)
	for ts in training_set:
		for lab in utility.extract_annotations(ts[0]):
			try:
				histograms[0][lab] += 1
				f_bis = extract_bigrams_from_corpus(utility.extract_words(ts[2]))
				histograms[1][lab] += len(f_bis)
				for bi in f_bis:
					# BUG FIX: on the first sighting of a bigram the original
					# stored 1 instead of its full count f_bis[bi], silently
					# losing counts for bigrams seen more than once per text.
					histograms[2][lab][bi] = histograms[2][lab].get(bi, 0) + f_bis[bi]
			except KeyError:
				# Unknown label key in the histograms: skip this pair.
				fail_counter += 1
	print(f"Failed bigram additions: {fail_counter}")
	return [histograms, None]
Esempio n. 5
0
def fill_classes_histograms(training_set, classes_file_content):
    """Fill per-class word histograms from *training_set*.

    histograms[0]: samples per annotation; histograms[1]: total word
    count per annotation; histograms[2]: per-annotation word->count dict.
    Unknown annotations are reported and counted, not raised.
    """
    histograms = create_histograms(classes_file_content)
    counter = 0
    for ts in training_set:
        for an in utility.extract_annotations(ts[0]):
            # BUG FIX: the membership test used an.strip() while every
            # update used the raw key, so a whitespace-padded annotation
            # passed the check and then raised an uncaught KeyError.
            # Normalize once so test and updates agree.
            an = an.strip()
            if an in histograms[0]:
                histograms[0][an] += 1
                for w in utility.extract_words(ts[2]):
                    try:
                        histograms[2][an][w] += 1
                    except KeyError:
                        histograms[2][an][w] = 1
                    histograms[1][an] += 1
            else:
                print(f"Unknown annotation: {an}")
                counter += 1
    print(f"Unrecognized annotations: {counter}")
    return histograms
def fill_classes_histograms(classes_file_content, training_set, vocabulary):
    """Fill per-class word-count vectors aligned with *vocabulary*.

    histograms[0]: samples per annotation; histograms[1]: total word
    count per annotation; histograms[2]: per-annotation numpy count
    vector indexed by vocabulary position.
    """
    histograms = create_histograms(classes_file_content)
    counter = 0
    # Replace the per-class dicts with fixed-size count vectors indexed
    # by vocabulary position.
    for k in histograms[2]:
        histograms[2][k] = np.zeros(len(vocabulary), dtype=int)
    for ts in training_set:
        for an in utility.extract_annotations(ts[0]):
            # SOLVED: some annotations were 4 chars long because of a BOM
            # (0xff at the first index) -- the files must be read with the
            # utf-8-sig encoding.
            # BUG FIX: membership was tested with an.strip() but the raw
            # key was used for updates, raising an uncaught KeyError for
            # padded annotations. Normalize once so both agree.
            an = an.strip()
            if an in histograms[0]:
                histograms[0][an] += 1
                for w in utility.extract_words(ts[2]):
                    histograms[2][an][vocabulary.index(w)] += 1
                    histograms[1][an] += 1
            else:
                print(f"Unknown annotation: {an}")
                counter += 1
    print(f"Unrecognized annotations: {counter}")
    return histograms
def create_vocabulary(training_set):
    """Collect every word in the corpus, sorted so indexes stay consistent."""
    words = set()
    for sample in training_set:
        words |= set(utility.extract_words(sample[2]))
    # sorted() yields a stable ordering, so word -> index positions are
    # reproducible across runs.
    return sorted(words)
Esempio n. 8
0
def create_vocabulary(training_set):
	"""Collect every bigram in the corpus, sorted so indexes stay consistent."""
	bigrams = set()
	for sample in training_set:
		words = utility.extract_words(sample[2])
		bigrams |= set(extract_bigrams_from_corpus(words))
	return sorted(bigrams)