def compute_TF(all_tokens_dict):
	"""
	Compute term frequency per sentence

	@returns {str:sentence : {str:word: float:P(word|sentence)}}
	"""
	return {sentence: counts_to_probs(Counter(strip_junk_tokens(words)))
	        for (sentence, words) in all_tokens_dict.items()}
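# compute_TF calls a counts_to_probs helper that is not shown in these
# examples. A minimal sketch of what it could look like, assuming it simply
# normalizes a Counter of term counts into P(word | sentence) (the name and
# behaviour are inferred from the call site above):
from collections import Counter

def counts_to_probs(counts):
	"""Normalize raw term counts into conditional probabilities P(word | sentence)."""
	total = sum(counts.values())
	return {word: count / total for (word, count) in counts.items()}

# counts_to_probs(Counter(["the", "cat", "the"])) -> {"the": 2/3, "cat": 1/3}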
def build(all_tokens_dict):

    # Strip all the junk:
    corpus = strip_junk_tokens(chain.from_iterable(all_tokens_dict.values()))

    # Get all unique words:
    unique_words = list(set(corpus))

    # Sort in ascending order from A..Z
    unique_words.sort()

    # Assign an index to each word
    corenlp_words_index = {k: i for (i, k) in enumerate(unique_words)}

    return (all_tokens_dict, corenlp_words_index)
def featureize(F, observation_ids):

	(all_tokens_dict,  word_indices, TFIDF, UNK) = F

	n = len(word_indices)
	m = len(observation_ids)

	# Observations
	X = np.zeros((m, n), dtype=float)

	for (i,ob_id) in enumerate(observation_ids, start=0):
	    for token in strip_junk_tokens(all_tokens_dict[ob_id]):
	        j = word_indices[token]
	        X[i][j] = TFIDF[ob_id][token]

	return X
def featureize(F, observation_ids):

    (all_tokens_dict, corenlp_words_index) = F

    n = len(corenlp_words_index)
    m = len(observation_ids)

    # Observations
    X = np.zeros((m, n), dtype=float)

    for (i, ob_id) in enumerate(observation_ids, start=0):

        for token in strip_junk_tokens(all_tokens_dict[ob_id]):

            # Binary indicator:
            X[i][corenlp_words_index[token]] = 1

    return X
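# Rough end-to-end sketch for the binary bag-of-words variant above, i.e. the
# build() that returns (all_tokens_dict, corenlp_words_index) plus the binary
# featureize(). The real strip_junk_tokens is not shown in these examples, so a
# pass-through stand-in is used here purely to make the sketch run, and the
# sample tokens are made up:
from itertools import chain
import numpy as np

def strip_junk_tokens(tokens):
	# Stand-in only: the real helper presumably filters punctuation/junk tokens.
	return list(tokens)

all_tokens_dict = {
	"obs1": ["the", "cat", "sat"],
	"obs2": ["the", "dog", "ran"],
}

F = build(all_tokens_dict)           # -> (all_tokens_dict, corenlp_words_index)
X = featureize(F, ["obs1", "obs2"])  # -> 2 x 5 binary indicator matrix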
def compute_DF(all_tokens_dict):
	"""
	Compute document frequency per word

	@returns {str:word : int:document-count}
	"""
	df_counts = Counter() # Number of observations each word occurs in

	# Tabulate the number of documents each word appears in
	for words in all_tokens_dict.values():

		# Count each word at most once per document
		for word in set(strip_junk_tokens(words)):
			df_counts[word] += 1

	return df_counts
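# compute_DF feeds the inverse document frequency step. compute_TFIDF itself is
# not shown in these examples; a plausible sketch, assuming the standard
# idf(word) = log(N / df(word)) weighting applied to the per-sentence term
# frequencies from compute_TF (the _sketch suffix marks it as an assumption):
import math

def compute_TFIDF_sketch(all_tokens_dict):
	N = len(all_tokens_dict)
	DF = compute_DF(all_tokens_dict)
	TF = compute_TF(all_tokens_dict)
	tfidf = {}
	for (sentence, word_probs) in TF.items():
		tfidf[sentence] = {w: p * math.log(N / DF[w]) for (w, p) in word_probs.items()}
	return tfidf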
def build(train_ids, all_tokens_dict):

	# Strip all the junk:
	corpus = strip_junk_tokens(chain.from_iterable(all_tokens_dict.values()))

	# Get all unique words:
	unique_words = list(set(corpus))

	# TF-IDF weights per sentence
	TFIDF = compute_TFIDF(all_tokens_dict)

	# IDF value assigned to unknown tokens: log(N), the IDF of a word seen in a single document
	UNK = math.log(len(all_tokens_dict))

	# Sort in ascending order from A..Z
	unique_words.sort()

	# Assign an index to each word
	word_indices = {k: i for (i, k) in enumerate(unique_words)}

	return (all_tokens_dict, word_indices, TFIDF, UNK)
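# Rough usage sketch tying the TF-IDF pieces together: this build() produces
# the feature description F, and the TF-IDF featureize above (the one that
# unpacks (all_tokens_dict, word_indices, TFIDF, UNK)) turns observation ids
# into a weighted matrix. The sample ids/tokens below are made up, and
# compute_TFIDF and strip_junk_tokens are assumed to exist at module level:
train_ids = ["obs1", "obs2"]
all_tokens_dict = {
	"obs1": ["the", "cat", "sat"],
	"obs2": ["the", "dog", "ran"],
}

F = build(train_ids, all_tokens_dict)  # -> (all_tokens_dict, word_indices, TFIDF, UNK)
X = featureize(F, train_ids)           # -> 2 x 5 matrix of TF-IDF weights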