def split_wordtags(corpus, delimiter='/', start_word='*', stop_word='STOP', ngram_used=3):
    """
    Separate a tagged corpus into parallel word sentences and tag sentences.

    Every sentence is tokenised with ``n_gramer.explode``; each token is cut at
    its LAST ``delimiter`` into a word part and a tag part. Both sequences are
    passed to ``n_gramer.insert_start_end_tokens`` and then re-joined with
    single spaces.

    :param corpus: Iterable of sentences whose tokens have the form ``word<delimiter>tag``.
    :param delimiter: Separator between word and tag; only its last occurrence in a token is used.
    :param start_word: Start symbol forwarded to ``insert_start_end_tokens``.
    :param stop_word: Stop symbol forwarded to ``insert_start_end_tokens``.
    :param ngram_used: Default=3. N-gram order forwarded to ``insert_start_end_tokens``.
    :return: Tuple ``(word_sentences, tag_sentences)``: two lists of strings, one entry per input sentence.
    :raises ValueError: If a token does not contain ``delimiter``.
    """
    word_sentences, tag_sentences = [], []

    for sentence in corpus:
        words, tags = [], []

        # Unpacking in the loop target raises ValueError for a token without the delimiter
        for word, tag in (token.rsplit(delimiter, 1) for token in n_gramer.explode(sentence)):
            words.append(word)
            tags.append(tag)

        # The helper's return value is not used; presumably it pads the list
        # in place -- confirm against n_gramer.
        for sequence in (words, tags):
            n_gramer.insert_start_end_tokens(sequence, start_word, stop_word, ngram_used)

        word_sentences.append(' '.join(words))
        tag_sentences.append(' '.join(tags))

    return word_sentences, tag_sentences
def frequency_dict(corpus):
    """
    Returns frequency counts for a corpus.

    :param corpus: Array of sentences (strings)
    :return: Plain dict mapping each token yielded by ``n_gramer.explode`` to
        the number of times it occurs across all sentences.
    """
    from collections import Counter

    # The generator expression feeds tokens one by one, so they are always
    # counted element-wise regardless of the container type explode() returns.
    counts = Counter(
        word
        for sentence in corpus
        for word in n_gramer.explode(sentence)
    )

    # Return a plain dict: callers indexing an unseen word still get KeyError
    # instead of Counter's implicit 0.
    return dict(counts)
def frequency_dict(corpus):
    """
    Returns frequency counts for a corpus.

    :param corpus: Array of sentences (strings)
    :return: Plain dict whose keys are the tokens produced by
        ``n_gramer.explode`` and whose values are their occurrence counts
        over the whole corpus.
    """
    from collections import Counter
    from itertools import chain

    # Flatten the per-sentence token sequences into one stream and let
    # Counter do the tallying instead of a hand-written get()/+1 loop.
    tokens = chain.from_iterable(n_gramer.explode(sentence) for sentence in corpus)

    # Convert back to dict so lookups of unknown words keep raising KeyError.
    return dict(Counter(tokens))
def replace_rare_words(corpus, known_words, rare_symbol):
    """
    Replaces every word that is not in known_words with rare_symbol.

    Results are collected in a new list; nothing is assigned back into corpus.

    :param corpus: Iterable of sentences; each one is tokenised with ``n_gramer.explode``
    :param known_words: Collection of whitelisted words (list, tuple, set, dict, ...)
    :param rare_symbol: Symbol used in replacement
    :return: List with one entry per sentence; each entry is a list of tokens
        (NOT a re-joined string) in which non-whitelisted tokens have been
        substituted by ``rare_symbol``.
    """
    # A list/tuple whitelist costs a linear scan per token; build a hashed
    # lookup once. Other container types are used as given.
    whitelist = known_words
    if isinstance(known_words, (list, tuple)):
        try:
            whitelist = frozenset(known_words)
        except TypeError:
            # Unhashable entries: keep the original linear membership test
            whitelist = known_words

    return [
        [word if word in whitelist else rare_symbol for word in n_gramer.explode(sentence)]
        for sentence in corpus
    ]
def replace_rare_words(corpus, known_words, rare_symbol):
    """
    Substitutes rare_symbol for each word of the corpus absent from known_words.

    :param corpus: Iterable of sentences; every sentence is split into words
        by ``n_gramer.explode``
    :param known_words: Whitelisted words; any container supporting ``in``
    :param rare_symbol: Symbol used in replacement
    :return: A new list holding, for each sentence, the list of its words with
        unknown ones replaced. Sentences are returned as word lists, not as
        strings.
    """
    # Hash the whitelist once when it arrives as a sequence, so membership
    # checks inside the loop do not rescan it for every word.
    known = known_words
    if isinstance(known_words, (list, tuple)):
        try:
            known = set(known_words)
        except TypeError:
            pass  # contains unhashable items: fall back to scanning the sequence

    results = []
    for sentence in corpus:
        results.append([
            word if word in known else rare_symbol
            for word in n_gramer.explode(sentence)
        ])

    return results
# Example no. 6
# 0
def __build_word_list(sentence, start_token, end_token, n):
    """
    Tokenise a sentence and add start/end tokens to the result.

    :param sentence: Sentence handed to ``gramer.explode``
    :param start_token: Start token forwarded to ``gramer.insert_start_end_tokens``
    :param end_token: End token forwarded to ``gramer.insert_start_end_tokens``
    :param n: N-gram order forwarded to ``gramer.insert_start_end_tokens``
    :return: The object returned by ``gramer.explode`` after it has been
        passed through ``gramer.insert_start_end_tokens``
    """
    tokens = gramer.explode(sentence)

    # Return value is discarded; presumably the helper edits `tokens` in place -- confirm in gramer
    gramer.insert_start_end_tokens(tokens, start_token, end_token, n)
    return tokens
def __build_word_list(sentence, start_token, end_token, n):
    """
    Explode a sentence into words and surround them with boundary tokens.

    :param sentence: Input given to ``gramer.explode``
    :param start_token: Token passed on as the start marker
    :param end_token: Token passed on as the end marker
    :param n: Value passed on as the last argument of
        ``gramer.insert_start_end_tokens`` (presumably the n-gram order)
    :return: The word list obtained from ``gramer.explode``, returned after
        the ``insert_start_end_tokens`` call
    """
    padded = gramer.explode(sentence)
    # Called for its effect on `padded`; its result is not used here
    gramer.insert_start_end_tokens(padded, start_token, end_token, n)
    return padded