def split_wordtags(corpus, delimiter='/', start_word='*', stop_word='STOP', ngram_used=3):
    """
    Split a word/tag corpus into parallel word sentences and tag sentences.

    :param corpus: iterable of sentences; each token is formatted as word<delimiter>tag
    :param delimiter: separator between the word and its tag within a token
    :param start_word: start marker handed to n_gramer.insert_start_end_tokens
    :param stop_word: stop marker handed to n_gramer.insert_start_end_tokens
    :param ngram_used: Default=3 . # of ngrams to use. Will insert start and stop accordingly.
    :return: (word_sentences, tag_sentences) — two lists of space-joined strings
    """
    word_sentences = []
    tag_sentences = []
    for sentence in corpus:
        tokens = n_gramer.explode(sentence)
        # rsplit on the LAST delimiter so a word that itself contains the
        # delimiter keeps it (only the final segment is the tag)
        pairs = [token.rsplit(delimiter, 1) for token in tokens]
        words = [pair[0] for pair in pairs]
        tags = [pair[1] for pair in pairs]
        # Pad both vectors in place with start/end markers
        n_gramer.insert_start_end_tokens(words, start_word, stop_word, ngram_used)
        n_gramer.insert_start_end_tokens(tags, start_word, stop_word, ngram_used)
        word_sentences.append(' '.join(words))
        tag_sentences.append(' '.join(tags))
    return word_sentences, tag_sentences
def frequency_dict(corpus):
    """
    Count how often each token occurs across the whole corpus.

    :param corpus: Array of sentences (strings)
    :return: dict mapping token -> occurrence count
    """
    counts = {}
    for sentence in corpus:
        for token in n_gramer.explode(sentence):
            if token in counts:
                counts[token] += 1
            else:
                counts[token] = 1
    return counts
def replace_rare_words(corpus, known_words, rare_symbol):
    """
    Replaces rare words in corpus from known_words.

    :param corpus: iterable of sentences (strings)
    :param known_words: List of whitelisted words
    :param rare_symbol: Symbol used in replacement
    :return: list of word lists, one per sentence, with unknown words
             replaced by rare_symbol
    """
    # Build the whitelist as a set once: membership tests become O(1)
    # instead of O(n) per word when known_words is a list.
    known = set(known_words)
    results = []
    # Iterate sentence
    for sentence in corpus:
        dirty_words = n_gramer.explode(sentence)
        # Use the conditional expression for its VALUE (the original used
        # it as a statement purely for append side effects, which is
        # unidiomatic and hard to read).
        clean_words = [word if word in known else rare_symbol
                       for word in dirty_words]
        # Place back in corpus as word list
        results.append(clean_words)
    return results
def replace_rare_words(corpus, known_words, rare_symbol):
    """
    Replaces rare words in corpus from known_words.

    NOTE(review): this file defines replace_rare_words twice with
    identical behavior; this later definition shadows the earlier one.
    Consider deleting one of them.

    :param corpus: iterable of sentences (strings)
    :param known_words: List of whitelisted words
    :param rare_symbol: Symbol used in replacement
    :return: list of word lists, one per sentence, with unknown words
             replaced by rare_symbol
    """
    # Build the whitelist as a set once: membership tests become O(1)
    # instead of O(n) per word when known_words is a list.
    known = set(known_words)
    results = []
    # Iterate sentence
    for sentence in corpus:
        dirty_words = n_gramer.explode(sentence)
        # Use the conditional expression for its VALUE (the original used
        # it as a statement purely for append side effects, which is
        # unidiomatic and hard to read).
        clean_words = [word if word in known else rare_symbol
                       for word in dirty_words]
        # Place back in corpus as word list
        results.append(clean_words)
    return results
def __build_word_list(sentence, start_token, end_token, n):
    """
    Explode *sentence* into tokens and pad it in place with start/end markers.

    NOTE(review): this helper calls `gramer` while the functions above use
    `n_gramer` — confirm both names resolve to valid imports in this module.

    :param sentence: raw sentence string
    :param start_token: marker inserted at the front
    :param end_token: marker inserted at the back
    :param n: n-gram order controlling how many markers are inserted
    :return: the padded token list
    """
    tokens = gramer.explode(sentence)
    gramer.insert_start_end_tokens(tokens, start_token, end_token, n)
    return tokens