Example #1
def remove_end_characters(text):
    """Removes end characters from word token list."""

    # If text is empty, return None.
    if not text: return None
    # If text is tokenized, merge tokens.
    if is_tokenized(text):
        was_tokenized = True
        normalized_text = merge_tokens(text)
    else:
        was_tokenized = False
        normalized_text = text
    normalized_text += ' '
    # Replace end characters with spaces.
    normalized_text = sub(get_end_characters_pattern(), r' ', normalized_text)
    # Then remove multiple adjacent spaces.
    normalized_text = sub(' +', ' ', normalized_text)
    # Then strip text.
    normalized_text = normalized_text.strip()
    # If the last character is a dot, remove it.
    if normalized_text.endswith('.'):
        normalized_text = normalized_text[:-1]
    # If text was tokenized, then re-tokenize.
    if was_tokenized:
        normalized_text = word_tokenize(normalized_text)
    # Return normalized text.
    return normalized_text

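These examples rely on shared imports and helpers that are not shown on this page (is_tokenized, merge_tokens, word_tokenize with a 'whitespace' option, and the various get_*_pattern / get_*_dict functions). A minimal sketch of the assumed plumbing, for orientation only; the actual module may differ:

# Assumed module-level imports (the example functions call these names directly).
from re import sub, compile, IGNORECASE, DOTALL

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WhitespaceTokenizer, word_tokenize as nltk_word_tokenize


def is_tokenized(text):
    """Assumed helper: True if text is already a list of word tokens."""
    return isinstance(text, list)


def merge_tokens(tokens):
    """Assumed helper: joins a token list back into a single string."""
    return ' '.join(tokens)


def word_tokenize(text, tokenizer='treebank'):
    """Assumed helper: tokenizes text, optionally on whitespace only."""
    if tokenizer == 'whitespace':
        return WhitespaceTokenizer().tokenize(text)
    return nltk_word_tokenize(text)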
Example #2
def convert_case(text, to_lower=True):
    """Converts text to defined case."""

    # If text is empty, return None.
    if not text: return None
    # If text is tokenized, merge tokens.
    if is_tokenized(text):
        was_tokenized = True
        normalized_text = merge_tokens(text)
    else:
        was_tokenized = False
        normalized_text = text

    # If to lower, convert to lower case. Else, convert to upper case.
    if to_lower:
        normalized_text = normalized_text.lower()
    else:
        normalized_text = normalized_text.upper()

    # If text was tokenized, re-tokenize text.
    if was_tokenized:
        normalized_text = word_tokenize(normalized_text)

    # Return normalized text.
    return normalized_text
Example #3
def remove_special_characters(text):
    """Removes special characters from text."""

    # If text is empty, return None.
    if not text: return None
    # If text is tokenized, merge tokens.
    if is_tokenized(text):
        was_tokenized = True
        normalized_text = merge_tokens(text)
    else:
        was_tokenized = False
        normalized_text = text
    # Retrieve special characters pattern.
    special_characters_pattern = get_special_characters_pattern()
    # Remove leading and trailing whitespace.
    normalized_text = normalized_text.strip()
    # Replace all special characters with spaces.
    normalized_text = sub(special_characters_pattern, r' ', normalized_text)
    # Then remove multiple adjacent spaces.
    normalized_text = sub(' +', ' ', normalized_text)
    # Strip text.
    normalized_text = normalized_text.strip()
    # If text was tokenized, re-tokenize text.
    if was_tokenized:
        normalized_text = word_tokenize(normalized_text)
    # Return normalized text.
    return normalized_text
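get_special_characters_pattern is not defined on this page; a plausible stand-in (an assumption, not the original) keeps only letters, digits and whitespace:

def get_special_characters_pattern():
    """Hypothetical pattern: matches any character that is not a letter, digit or space."""
    return r'[^a-zA-Z0-9\s]'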
Example #4
def expand_abbreviations(text):
    """Expands emoticons in text."""

    # If text is empty, return None.
    if not text: return None
    # If text is tokenized, merge tokens.
    if is_tokenized(text):
        was_tokenized = True
        normalized_text = merge_tokens(text)
    else:
        was_tokenized = False
        normalized_text = text
    # If last character is not a space, add one.
    if not normalized_text.endswith(' '):
        normalized_text += ' '
    # Creates abbreviations pattern.
    abbreviations_pattern = compile('({})'.format(r'\.?\s|'.join(
        get_abbreviation_dict().keys())),
                                    flags=IGNORECASE | DOTALL)

    def expand_match(abbreviation):
        """Expands matched contraction."""

        # Retrieves matched abbreviation from string.
        match = abbreviation.group(0)
        # If last character is space, remove space.
        if match[-1] == " ":
            match = match[:-1]
            remove_space = True
        else:
            remove_space = False
        # If last character is dot, remove dot.
        if match[-1] == r'.':
            match = match[:-1]
        # Find expanded abbreviation in dictionary, based on abbreviation key.
        expanded_contraction = get_abbreviation_dict().get(match.lower())
        if not expanded_contraction:
            return abbreviation.group(0)
        if remove_space:
            expanded_contraction += " "
        # Return expanded abbreviation.
        return expanded_contraction

    # Replaces abbreviations with their expansions in text.
    normalized_text = abbreviations_pattern.sub(expand_match, normalized_text)
    # Strip text.
    normalized_text = normalized_text.strip()
    # If text was tokenized, re-tokenize text.
    if was_tokenized:
        normalized_text = word_tokenize(normalized_text)

    # Return expanded text.
    return normalized_text
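A usage sketch with a hypothetical get_abbreviation_dict (the real dictionary is not shown here); assuming lowercase keys, the call below would expand both abbreviations:

def get_abbreviation_dict():
    """Hypothetical abbreviation dictionary with lowercase keys."""
    return {'asap': 'as soon as possible', 'fyi': 'for your information'}

# expand_abbreviations('Reply ASAP. FYI the meeting moved')
# -> 'Reply as soon as possible for your information the meeting moved'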
Example #5
def lemmatize_text(text):
    """Lemmatizes text or a word token list using WordNet."""

    # If text is tokenized, use tokens directly. Else, tokenize on whitespace.
    if is_tokenized(text):
        was_tokenized = True
        normalized_text = text
    else:
        was_tokenized = False
        normalized_text = word_tokenize(text, 'whitespace')

    # POS-tag tokens so the lemmatizer can use the right part of speech.
    pos_tagged_text = pos_tag_text(normalized_text)
    # Lemmatize tagged tokens; fall back to the raw word when no tag is available.
    lemmatizer = WordNetLemmatizer()
    lemmatized_text = [
        lemmatizer.lemmatize(word, pos_tag) if pos_tag else word
        for word, pos_tag in pos_tagged_text
    ]
    # If text was not tokenized, merge tokens back into a string.
    if not was_tokenized:
        lemmatized_text = merge_tokens(lemmatized_text)
    # Return lemmatized text.
    return lemmatized_text
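pos_tag_text is not shown; since WordNetLemmatizer.lemmatize expects WordNet POS tags, the helper presumably maps Penn Treebank tags to WordNet ones. A hypothetical sketch:

from nltk import pos_tag
from nltk.corpus import wordnet


def pos_tag_text(tokens):
    """Hypothetical helper: POS-tags tokens and maps Penn Treebank tags to WordNet tags."""
    tag_map = {'J': wordnet.ADJ, 'V': wordnet.VERB, 'N': wordnet.NOUN, 'R': wordnet.ADV}
    # Unknown tags map to None, so the caller falls back to the raw word.
    return [(word, tag_map.get(tag[0])) for word, tag in pos_tag(tokens)]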
Example #6
def expand_contractions(text):
    """Expands contractions in text."""

    # If text is empty, return None.
    if not text: return None
    # If text is tokenized, merge tokens.
    if is_tokenized(text):
        was_tokenized = True
        normalized_text = merge_tokens(text)
    else:
        was_tokenized = False
        normalized_text = text

    # Creates contractions pattern.
    contractions_pattern = compile('({})'.format('|'.join(
        get_contraction_dict().keys())),
                                   flags=IGNORECASE | DOTALL)

    def expand_match(contraction):
        """Expands matched contraction."""

        # Retrieves matched contraction from string.
        match = contraction.group(0)
        # Stores first character for case sensitivity.
        first_char = match[0]
        # Find expanded contraction in dictionary, based on contraction key.
        expanded_contraction = get_contraction_dict().get(match)
        # If the contraction could not be found, try again with lower case.
        if not expanded_contraction:
            expanded_contraction = get_contraction_dict().get(match.lower())
        # If still not found, return the match unchanged.
        if not expanded_contraction:
            return match
        # Restore the original first character for case sensitivity.
        expanded_contraction = first_char + expanded_contraction[1:]
        return expanded_contraction

    # Replaces contractions with expanded contractions in text.
    normalized_text = contractions_pattern.sub(expand_match, normalized_text)
    # Strip text.
    normalized_text = normalized_text.strip()
    # If text was tokenized, re-tokenize text.
    if was_tokenized:
        normalized_text = word_tokenize(normalized_text)

    # Return expanded text.
    return normalized_text
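A usage sketch with a hypothetical get_contraction_dict (the real dictionary is not shown here):

def get_contraction_dict():
    """Hypothetical contraction dictionary."""
    return {"can't": "cannot", "it's": "it is", "won't": "will not"}

# expand_contractions("It's fine, I can't complain")
# -> 'It is fine, I cannot complain'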
Example #7
def replace_apostrophes(text):
    """Replaces apostrophe pattern with '."""

    # If text is empty, return None.
    if not text: return None
    # If text is tokenized, merge tokens.
    if is_tokenized(text):
        was_tokenized = True
        normalized_text = merge_tokens(text)
    else:
        was_tokenized = False
        normalized_text = text
    # Replaces apostrophe pattern with '.
    normalized_text = sub(get_apostrophe_pattern(), "'", normalized_text)
    # Strip text.
    normalized_text = normalized_text.strip()
    # If was tokenized, re-tokenize text.
    if was_tokenized:
        normalized_text = word_tokenize(normalized_text)
    # Return normalized text.
    return normalized_text
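get_apostrophe_pattern is not shown; presumably it matches typographic apostrophe variants so they can be normalized to a plain '. A hypothetical stand-in:

def get_apostrophe_pattern():
    """Hypothetical pattern: matches curly-quote and accent characters used as apostrophes."""
    return '[\u2018\u2019\u02bc\u0060\u00b4]'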
Example #8
def replace_multiple_stopwords(text):
    """Replaces multiple stopwords with single stopwords."""

    # If text is empty, return None.
    if not text: return None
    # If text is tokenized, merge tokens.
    if is_tokenized(text):
        was_tokenized = True
        normalized_text = merge_tokens(text)
    else:
        was_tokenized = False
        normalized_text = text
    # Collapse runs of end punctuation ([.!?]+) to a single mark.
    normalized_text = sub('[.!?]+', _get_single_match, normalized_text)
    # Strip text.
    normalized_text = normalized_text.strip()
    # If was tokenized, re-tokenize text.
    if was_tokenized:
        normalized_text = word_tokenize(normalized_text)
    # Return normalized text.
    return normalized_text
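_get_single_match is not defined on this page; given the '[.!?]+' pattern above, it presumably collapses a run of end punctuation to its first character. A hypothetical sketch:

def _get_single_match(match):
    """Hypothetical sub() callback: keeps only the first character of a punctuation run."""
    return match.group(0)[0]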
Example #9
def expand_emoticons(text):
    """Expands emoticons in text."""

    # If text is empty, return None.
    if not text: return None
    # If text is tokenized, merge tokens.
    if is_tokenized(text):
        was_tokenized = True
        normalized_text = merge_tokens(text)
    else:
        was_tokenized = False
        normalized_text = text

    # Creates emoticons pattern.
    emoticons_pattern = compile('({})'.format(r'|'.join(
        get_emoticon_dict().keys())),
                                flags=IGNORECASE | DOTALL)

    def expand_match(emoticon):
        """Expands matched emoticon."""

        # Retrieves matched emoticon from string.
        match = emoticon.group(0)
        # Find expanded emoticon in dictionary, based on emoticon key.
        expanded_emoticon = get_emoticon_dict().get(match)
        # If the emoticon is not in the dictionary, return the match unchanged.
        if expanded_emoticon is None:
            return match
        return expanded_emoticon

    # Replaces emoticons with expanded emoticons in text.
    normalized_text = emoticons_pattern.sub(expand_match, normalized_text)
    # Strip text.
    normalized_text = normalized_text.strip()
    # If text was tokenized, re-tokenize text.
    if was_tokenized:
        normalized_text = word_tokenize(normalized_text)

    # Return expanded text.
    return normalized_text
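get_emoticon_dict is not shown. Note that emoticon keys such as ':)' contain regex metacharacters, so the keys either need to be stored pre-escaped or the pattern should be built with re.escape. A hypothetical sketch:

from re import escape


def get_emoticon_dict():
    """Hypothetical emoticon dictionary."""
    return {':)': 'smile', ':(': 'frown', ':D': 'laugh'}

# A regex-safe way to build the pattern used above:
# emoticons_pattern = compile('({})'.format('|'.join(
#     escape(key) for key in get_emoticon_dict().keys())))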
Example #10
def remove_hyperlinks(text):
    """Remove hyperlinks from text."""

    # If text is empty, return None.
    if not text: return None
    # If text is tokenized, merge tokens.
    if is_tokenized(text):
        was_tokenized = True
        normalized_text = merge_tokens(text)
    else:
        was_tokenized = False
        normalized_text = text
    # Replace hyperlinks with space.
    normalized_text = sub(get_hyperlink_pattern(), r' ', normalized_text)
    # Then remove multiple adjacent spaces.
    normalized_text = sub(' +', ' ', normalized_text)
    # Strip text.
    normalized_text = normalized_text.strip()
    # If text was tokenized, re-tokenize text.
    if was_tokenized:
        normalized_text = word_tokenize(normalized_text)
    # Return normalized text.
    return normalized_text
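get_hyperlink_pattern is not shown; a common URL-matching stand-in (an assumption, not the original) is:

def get_hyperlink_pattern():
    """Hypothetical pattern: matches http(s) URLs and www-prefixed links."""
    return r'(?:https?://|www\.)\S+'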
Example #11
def remove_stopwords(text):
    """Remove stopwords from word token list"""

    # If text is empty, return None.
    if not text: return None
    # If text is tokenized, use tokens directly. Else, tokenize on whitespace.
    if is_tokenized(text):
        was_tokenized = True
        normalized_text = text
    else:
        was_tokenized = False
        normalized_text = word_tokenize(text, 'whitespace')
    # Create stopword set.
    stop_set = set(stopwords.words('english'))
    # Filter stopwords from text.
    normalized_text = [
        token for token in normalized_text if token not in stop_set
    ]
    # If text was not tokenized, merge tokens.
    if not was_tokenized:
        normalized_text = merge_tokens(normalized_text)
    # Return normalized text.
    return normalized_text
Example #12
def clean_text_sentiment(text, tokenize=False):
    """Runs the full cleaning pipeline for sentiment analysis and returns each step."""

    clean_dict = {}
    # Replace whitespaces.
    clean_dict['replace_whitespaces'] = replace_whitespaces(text)
    # Expand emoticons.
    clean_dict['expand_emoticons'] = expand_emoticons(
        clean_dict['replace_whitespaces'])
    # Replace multiple stopwords.
    clean_dict['replace_multiple_stopwords'] = replace_multiple_stopwords(
        clean_dict['expand_emoticons'])
    # Replace apostrophes.
    clean_dict['replace_apostrophes'] = replace_apostrophes(
        clean_dict['replace_multiple_stopwords'])
    # Expand contractions.
    clean_dict['expand_contractions'] = expand_contractions(
        clean_dict['replace_apostrophes'])
    # Remove hyperlinks.
    clean_dict['remove_hyperlinks'] = remove_hyperlinks(
        clean_dict['expand_contractions'])
    # Expand abbreviations.
    clean_dict['expand_abbreviations'] = expand_abbreviations(
        clean_dict['remove_hyperlinks'])
    # Remove special characters.
    clean_dict['remove_special_characters'] = remove_special_characters(
        clean_dict['expand_abbreviations'])
    # Remove numbers.
    clean_dict['remove_numbers'] = remove_numbers(
        clean_dict['remove_special_characters'])
    # Convert to lower case.
    clean_dict['convert_case'] = convert_case(clean_dict['remove_numbers'])
    # Tokenize sentences.
    clean_dict['sentence_tokenize'] = sentence_tokenize(
        clean_dict['convert_case'])
    # If sentence tokenize is empty, return None.
    if not clean_dict['sentence_tokenize']:
        return clean_dict, None
    else:
        # Remove end characters.
        clean_dict['remove_end_characters'] = [
            remove_end_characters(item)
            for item in clean_dict['sentence_tokenize'] if len(item) > 1
        ]
        # Lemmatize words.
        clean_dict['lemmatize'] = [
            lemmatize_text(item)
            for item in clean_dict['remove_end_characters'] if len(item) > 1
        ]
        # Optional: stem words.
        # clean_dict['stemming'] = [convert_word_stem(item) for item in clean_dict['lemmatize'] if len(item) > 1]
        # Remove stopwords.
        clean_dict['remove_stopwords'] = [
            remove_stopwords_sentiment(item)
            for item in clean_dict['lemmatize'] if len(item) > 1
        ]
        # Remove repeated characters.
        clean_dict['remove_repeated_characters'] = [
            remove_repeated_characters(item)
            for item in clean_dict['remove_stopwords'] if len(item) > 1
        ]
        # If tokenize, tokenize words.
        if tokenize:
            clean_dict['word_tokenize'] = [
                word_tokenize(item, 'whitespace')
                for item in clean_dict['remove_repeated_characters']
                if len(item) > 1
            ]
            clean_dict['word_merge'] = merge(clean_dict['word_tokenize'])
            #clean_dict['removing_repeated_characters'] = [remove_repeated_characters(item) for item in clean_dict['word_merge'] if len(item) > 1]

        # Return dictionary and cleaned text.
        if tokenize:
            return clean_dict, clean_dict['word_merge']
        else:
            return clean_dict, clean_dict['remove_repeated_characters']
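A usage sketch for the full pipeline; the sample text is illustrative, and remove_numbers, sentence_tokenize, remove_stopwords_sentiment, remove_repeated_characters, replace_whitespaces and merge are further helpers not shown on this page:

sample = "Check https://example.com, it's great!!! :)"
clean_dict, cleaned = clean_text_sentiment(sample, tokenize=False)
# 'cleaned' is the list of cleaned sentences, and clean_dict records every
# intermediate step ('expand_contractions', 'remove_stopwords', ...) for inspection.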