def preprocess_text(text_string, function_list):
    '''
    Applies every function in function_list to text_string, in list order, feeding the result of
    each call into the next, and returns the final result.

    Keyword arguments:

    - text_string: string instance; None or "" short-circuits to "" without calling any function
    - function_list: list of single-argument callables (e.g. the functions available in
      preprocessing.text)

    Exceptions raised:

    - FunctionError: occurs should calling an element of function_list raise NameError or
      TypeError (for instance because the element is not callable)
    - InputError: occurs should text_string be non-string, or function_list be non-list
    '''
    if text_string is None or text_string == "":
        return ""
    if not isinstance(text_string, str):
        raise InputError("string not passed as argument for text_string")
    if not isinstance(function_list, list):
        raise InputError(
            "list of functions not passed as argument for function_list")
    for func in function_list:
        try:
            text_string = func(text_string)
        except (NameError, TypeError) as err:
            # Chain the original exception so the real cause stays visible in the traceback;
            # any other exception type propagates unchanged.
            raise FunctionError(
                "invalid function passed as element of function_list") from err
    return text_string
Ejemplo n.º 2
0
def find_one_letter_edits(word_string):
    '''
    Finds all possible one letter edits of word_string. The word is first split into every
    (left, right) pair at each character location (boundaries included); those pairs are then
    used to build:
    - Deletes: one letter removed at each character location
    - Transposes: two neighbouring characters switched
    - Replaces: one character replaced with every character of EN_ALPHABET
    - Inserts: every character of EN_ALPHABET inserted at each location, including boundaries

    Returns all one letter edits as a set instance. An empty set is returned when word_string
    is None.

    Exceptions raised:

    - InputError: occurs should a string or NoneType not be passed as an argument
    '''
    if word_string is None:
        # An empty set (not an empty dict) keeps the return type consistent with the str path
        return set()
    if not isinstance(word_string, str):
        raise InputError(
            "string or none type variable not passed as argument to find_one_letter_edits"
        )
    splits = [(word_string[:i], word_string[i:])
              for i in range(len(word_string) + 1)]
    deletes = [left + right[1:] for left, right in splits if right]
    transposes = [left + right[1] + right[0] + right[2:]
                  for left, right in splits if len(right) > 1]
    replaces = [left + char + right[1:]
                for left, right in splits if right for char in EN_ALPHABET]
    inserts = [left + char + right
               for left, right in splits for char in EN_ALPHABET]
    return set(deletes + transposes + replaces + inserts)
Ejemplo n.º 3
0
def correct_word(word_string):
    '''
    Chooses the best correction for word_string: the candidates produced by find_candidates are
    ranked with find_word_prob and the highest ranked one is returned.

    Returns "" when word_string is None.

    Exceptions raised:

    - InputError: occurs should a string or NoneType not be passed as an argument
    '''
    if word_string is None:
        return ""
    if not isinstance(word_string, str):
        raise InputError(
            "string or none type variable not passed as argument to correct_word"
        )
    candidates = find_candidates(word_string)
    return max(candidates, key=find_word_prob)
Ejemplo n.º 4
0
def find_word_prob(word_string, word_total=sum(WORD_DISTRIBUTION.values())):
    '''
    Finds the relative probability of word_string: its entry in WORD_DISTRIBUTION divided by
    word_total. Words missing from WORD_DISTRIBUTION get a probability of 0.

    Keyword arguments:

    - word_string: string instance; None yields 0
    - word_total: divisor for the relative probability; defaults to the sum of all values in
      WORD_DISTRIBUTION, computed once when the function is defined

    Exceptions raised:

    - InputError: occurs should a string or NoneType not be passed as word_string
    '''
    if word_string is None:
        return 0
    if not isinstance(word_string, str):
        raise InputError(
            "string or none type variable not passed as argument to find_word_prob"
        )
    # .get avoids a KeyError for words absent from the distribution (find_candidates may hand
    # back the original, unknown word); assumes WORD_DISTRIBUTION is a dict-like mapping.
    return WORD_DISTRIBUTION.get(word_string, 0) / word_total
Ejemplo n.º 5
0
def validate_words(word_list):
    '''
    Keeps each word in word_list that is present in WORD_DISTRIBUTION.

    Returns all validated words as a set instance. An empty set is returned when word_list is
    None or empty.

    Exceptions raised:

    - InputError: occurs should a list or NoneType not be passed as an argument
    '''
    if word_list is None:
        # Empty set rather than an empty dict, matching the documented return type
        return set()
    if not isinstance(word_list, list):
        raise InputError(
            "list variable not passed as argument to validate_words")
    # An empty list naturally produces an empty set here
    return {word for word in word_list if word in WORD_DISTRIBUTION}
def remove_whitespace(text_string):
    '''
    Collapses every run of whitespace within text_string into a single space, drops leading and
    trailing whitespace, and returns the new string as type str.

    Keyword argument:

    - text_string: string instance; None or "" yields ""

    Exceptions raised:

    - InputError: occurs should a string or NoneType not be passed as an argument
    '''
    if text_string is None or text_string == "":
        return ""
    if not isinstance(text_string, str):
        raise InputError("none type or string not passed as an argument")
    tokens = text_string.split()
    return " ".join(tokens)
def remove_urls(text_string):
    '''
    Strips every token starting at "http" (up to the next whitespace) from text_string, then
    normalises whitespace to single spaces, and returns the new string as type str.

    Keyword argument:

    - text_string: string instance; None or "" yields ""

    Exceptions raised:

    - InputError: occurs should a non-string argument be passed
    '''
    if text_string is None or text_string == "":
        return ""
    if not isinstance(text_string, str):
        raise InputError("string not passed as argument")
    without_urls = re.sub(r'http\S+', "", text_string)
    return " ".join(without_urls.split())
def remove_numbers(text_string):
    '''
    Deletes digit values from text_string, normalises whitespace to single spaces, and returns
    the new string as type str.

    A match is any run of digits, ".", "/" or "," that begins at a word boundary, so digits
    embedded after letters (e.g. "abc123") are kept, while one of those punctuation characters
    directly following a word is removed as well.

    Keyword argument:

    - text_string: string instance; None or "" yields ""

    Exceptions raised:

    - InputError: occurs should a non-string argument be passed
    '''
    if text_string is None or text_string == "":
        return ""
    if not isinstance(text_string, str):
        raise InputError("string not passed as argument")
    without_numbers = re.sub(r'\b[\d.\/,]+', "", text_string)
    return " ".join(without_numbers.split())
def lowercase(text_string):
    '''
    Returns a lowercase copy of text_string as type str.

    Keyword argument:

    - text_string: string instance; None or "" yields ""

    Exceptions raised:

    - InputError: occurs should a non-string argument be passed
    '''
    if text_string is None or text_string == "":
        return ""
    if not isinstance(text_string, str):
        raise InputError("string not passed as argument for text_string")
    lowered = text_string.lower()
    return lowered
def lemmatize(text_string):
    '''
    Returns the base form of text_string, as produced by LEMMATIZER.lemmatize, as type str.

    Keyword argument:

    - text_string: string instance; None or "" yields ""

    Exceptions raised:

    - InputError: occurs should a non-string argument be passed
    '''
    if text_string is None or text_string == "":
        return ""
    if not isinstance(text_string, str):
        raise InputError("string not passed as primary argument")
    # The whole string is handed to the lemmatizer as one unit
    base_form = LEMMATIZER.lemmatize(text_string)
    return base_form
def convert_html_entities(text_string):
    '''
    Converts HTML5 character references within text_string to their corresponding unicode
    characters and returns the converted string as type str.

    Any literal "&quot;" still present after unescaping (i.e. a double-escaped quote such as
    "&amp;quot;" in the input) is replaced with a single quote.

    Keyword argument:

    - text_string: string instance; None or "" yields ""

    Exceptions raised:

    - InputError: occurs should a non-string argument be passed
    '''
    if text_string is None or text_string == "":
        return ""
    if not isinstance(text_string, str):
        raise InputError("string not passed as argument for text_string")
    unescaped = html.unescape(text_string)
    # The search string must be the entity text itself; a bare triple quote here would open an
    # unterminated string literal and break the whole module.
    return unescaped.replace("&quot;", "'")
def create_sentence_list(text_string):
    '''
    Breaks text_string into sentences using SENTENCE_TOKENIZER.tokenize and returns the result.

    Keyword argument:

    - text_string: string instance; None or "" yields an empty list

    Exceptions raised:

    - InputError: occurs should a non-string argument be passed
    '''
    if text_string is None or text_string == "":
        return []
    if not isinstance(text_string, str):
        raise InputError(
            "non-string passed as argument for create_sentence_list")
    sentences = SENTENCE_TOKENIZER.tokenize(text_string)
    return sentences
Ejemplo n.º 13
0
def find_candidates(word_string):
    '''
    Finds all potential words word_string could have been intended to mean. The first non-empty
    result of the following searches is returned:

    1. word_string itself, if validate_words accepts it
    2. the valid one letter edits of word_string
    3. the valid two letter edits of word_string
    4. word_string itself, unvalidated, as a last resort

    Returns the candidates as a set instance. An empty set is returned when word_string is None.

    Exceptions raised:

    - InputError: occurs should a string or NoneType not be passed as an argument
    '''
    if word_string is None:
        # Empty set rather than an empty dict, matching the documented return type
        return set()
    if not isinstance(word_string, str):
        raise InputError(
            "string or none type variable not passed as argument to find_candidates"
        )
    # `or` short-circuits, so the more expensive edit searches only run when the cheaper ones
    # produced nothing; validate_words requires a list, hence the list() conversions.
    return (validate_words([word_string])
            or validate_words(list(find_one_letter_edits(word_string)))
            or validate_words(list(find_two_letter_edits(word_string)))
            or {word_string})
def remove_time_words(text_string):
    '''
    Deletes, for each entry of TIME_WORDS, every whitespace-delimited token of text_string in
    which that entry occurs starting at a word boundary; whitespace is then normalised to
    single spaces and the new string is returned as type str.

    Keyword argument:

    - text_string: string instance; None or "" yields ""

    Exceptions raised:

    - InputError: occurs should a non-string argument be passed
    '''
    if text_string is None or text_string == "":
        return ""
    if not isinstance(text_string, str):
        raise InputError("string not passed as argument")
    remaining = text_string
    for time_word in TIME_WORDS:
        # Entries are spliced into the pattern as-is (they are not regex-escaped)
        pattern = r'[\S]*\b' + time_word + r'[\S]*'
        remaining = re.sub(pattern, "", remaining)
    return " ".join(remaining.split())
def remove_number_words(text_string):
    '''
    Deletes, for each entry of NUMBER_WORDS, every whitespace-delimited token of text_string in
    which that entry occurs starting at a word boundary; whitespace is then normalised to
    single spaces and the new string is returned as type str.

    Keyword argument:

    - text_string: string instance; None or "" yields ""

    Exceptions raised:

    - InputError: occurs should a non-string argument be passed
    '''
    if text_string is None or text_string == "":
        return ""
    if not isinstance(text_string, str):
        raise InputError("string not passed as argument")
    remaining = text_string
    for number_word in NUMBER_WORDS:
        # Entries are spliced into the pattern as-is (they are not regex-escaped)
        pattern = r'[\S]*\b' + number_word + r'[\S]*'
        remaining = re.sub(pattern, "", remaining)
    return " ".join(remaining.split())
def convert_ligatures(text_string):
    '''
    Replaces each ligature listed in LIGATURES with its corresponding term within text_string
    and returns the converted string as type str.

    LIGATURES is read through the string keys "0", "1", ... up to its length, each entry
    providing a "ligature" and a "term" value; replacements are applied in that order.

    Keyword argument:

    - text_string: string instance; None or "" yields ""

    Exceptions raised:

    - InputError: occurs should a string or NoneType not be passed as an argument
    '''
    if text_string is None or text_string == "":
        return ""
    if not isinstance(text_string, str):
        raise InputError("none type or string not passed as an argument")
    converted = text_string
    for index in range(len(LIGATURES)):
        entry = LIGATURES[str(index)]
        converted = converted.replace(entry["ligature"], entry["term"])
    return converted
def remove_unbound_punct(text_string):
    '''
    Deletes stray punctuation from text_string, normalises whitespace to single spaces, and
    returns the new string as type str.

    Using the characters in PUNCT as the punctuation set, two kinds of match are removed:
    runs of two or more punctuation characters (e.g. ".;';"), and runs of one or more
    punctuation characters that begin at a non-word-boundary position.

    Keyword argument:

    - text_string: string instance; None or "" yields ""

    Exceptions raised:

    - InputError: occurs should a non-string argument be passed
    '''
    if text_string is None or text_string == "":
        return ""
    if not isinstance(text_string, str):
        raise InputError("string not passed as argument")
    pattern = r'[' + PUNCT + r'][' + PUNCT + r']+|\B[' + PUNCT + r']+'
    stripped = re.sub(pattern, "", text_string)
    return " ".join(stripped.split())
def keyword_tokenize(text_string):
    '''
    Tokenizes text_string with KEYWORD_TOKENIZER, discards tokens found in STOPWORDS as well as
    tokens shorter than 3 characters, and returns the remaining tokens joined by single spaces
    as type str.

    Keyword argument:

    - text_string: string instance; None or "" yields ""

    Exceptions raised:

    - InputError: occurs should a non-string argument be passed
    '''
    if text_string is None or text_string == "":
        return ""
    if not isinstance(text_string, str):
        raise InputError("string not passed as argument for text_string")
    keywords = []
    for token in KEYWORD_TOKENIZER.tokenize(text_string):
        if token in STOPWORDS or len(token) < 3:
            continue
        keywords.append(token)
    return " ".join(keywords)
Ejemplo n.º 19
0
def find_two_letter_edits(word_string):
    '''
    Finds all possible two letter edits of word_string by applying find_one_letter_edits to
    every word produced by a first application of find_one_letter_edits on word_string.

    Returns the two letter edits lazily as a generator (duplicates may occur); wrap the result
    in set() or list() to materialise it. An empty set is returned when word_string is None.

    Exceptions raised:

    - InputError: occurs should a string or NoneType not be passed as an argument
    '''
    if word_string is None:
        # Empty set rather than an empty dict: still an empty, falsy iterable of words
        return set()
    if not isinstance(word_string, str):
        raise InputError(
            "string or none type variable not passed as argument to find_two_letter_edits"
        )
    # Kept as a generator so the (large) edit space is not held in memory all at once
    return (second_edit
            for first_edit in find_one_letter_edits(word_string)
            for second_edit in find_one_letter_edits(first_edit))
def correct_spelling(text_string):
    '''
    Splits text_string on whitespace, passes every word through spellcheck.correct_word, and
    returns the corrected words joined by single spaces as type str.

    Keyword argument:

    - text_string: string instance; None or "" yields ""

    Exceptions raised:

    - InputError: occurs should a string or NoneType not be passed as an argument
    '''
    if text_string is None or text_string == "":
        return ""
    if not isinstance(text_string, str):
        raise InputError("none type or string not passed as an argument")
    corrected_words = [
        spellcheck.correct_word(word) for word in text_string.split()
    ]
    return " ".join(corrected_words)