def sort_terms(term_doc_matrix_df, ascending=False):
    """
    Sort a term document matrix Pandas DataFrame.

    Args:
        term_doc_matrix_df (Pandas DataFrame): Term document matrix as a Pandas DataFrame.
            For example, the output of :func:`term_doc_matrix_to_pandas`.
        ascending (bool): Sort in ascending order. Defaults to False.

    Returns:
        Pandas DataFrame
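
    Examples:
        A minimal sketch, assuming ``pandas`` is imported as ``pd``:

        >>> df = pd.DataFrame({'cat': [1, 0], 'dog': [2, 3]})
        >>> freq_df = sort_terms(df)
        >>> list(freq_df['term'])
        ['dog', 'cat']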

    """
    log.debug("Sorting n-gram DataFrame")
    if not isinstance(term_doc_matrix_df, pd.DataFrame):
        log.error("Parameter term_doc_matrix_df {} is not a DataFrame".format(
            term_doc_matrix_df))
        raise TypeError("term_doc_matrix_df should be a DataFrame")

    freq = term_doc_matrix_df.sum(axis=0)
    # sort according to freq/tf-idf weight and transfer to a data frame
    freq_df = freq.sort_values(ascending=ascending).to_frame().reset_index()
    freq_df.columns = ['term', 'sum(tfidf or count)']

    return freq_df
def consolidate_words(text, variants_dict, tokenizer="regexp", **kwargs):
    """
    Consolidate words into their canonical forms according to variants_dict.

    Note that text will be tokenized first and then each token will be checked
    and consolidated. After consolidation, tokens will be joined together again.

    Args:
        text (List[unicode]): A list of strings to be consolidated.
        variants_dict (dict): mapping of variants to canonical form
        tokenizer (str): Tokenizer name. Defaults to 'regexp'
        **kwargs: Optional keyword arguments for tokenizer

    Returns:
        List[unicode] with words consolidated
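
    Examples:
        A small sketch, assuming the default regexp tokenizer:

        >>> consolidate_words([u'Colour and color'], {u'colour': u'color'})
        [u'color and color']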

    """

    log.debug("Consolidating word")
    tokenized_text = tokenize(text, tokenizer, **kwargs)
    for i, text_i in enumerate(tokenized_text):
        tokenized_text[i] = [
            variants_dict.get(token, token) for token in text_i
        ]
    return [' '.join(sublist) for sublist in tokenized_text]
def wordnet_sanitize(tagged_text):
    """
    Ensure that each word is a (string, pos) pair that WordNet can understand.

    Args:
        tagged_text: Sentence or list of sentences, where each sentence is a list of (word, pos)
            tuples.

    Returns:
        Sentence or list of sentences as same form as tagged_text with cleaned pos tags for
        Wordnet.
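
    Examples:
        For instance, with Penn Treebank style tags:

        >>> wordnet_sanitize([(u'dogs', 'NNS'), (u'run', 'VBP')])
        [(u'dogs', 'n'), (u'run', 'v')]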
    """
    log.debug("Sanitizing tagged_ext for WordNet")
    if not isinstance(tagged_text, list):
        log.error("Parameter tagged_text {} is not a list".format(tagged_text))
        raise TypeError(
            'tagged_text needs to be a list in which each item is a (word, pos) tuple '
            'or a sentence, where the sentence is a list of (word, pos) tuples.'
        )
    if isinstance(tagged_text[0][0], basestring):
        # tagged_text is a sentence
        return [_wordnet_sanitize_word(word) for word in tagged_text]
    else:
        # tagged_text is a list of sentences
        return [[_wordnet_sanitize_word(word) for word in sentence]
                for sentence in tagged_text]
def tokenize(text, tokenizer='regexp', **kwargs):
    """
    Tokenize a list of strings (usually each string represents a document) and return the
    tokenized strings.

    Args:
        text (List(unicode)): input text/documents
        tokenizer (str): tokenizer name. Defaults to 'regexp'.
        **kwargs (Optional[dict]): Optional keyword arguments for tokenizer.

    Returns:
        List of tokenized documents, where each document is a list of tokens

    Examples:
        >>> text = ['this is a test. this is a test', 'this is another test']
        >>> tokenize(text, pattern='\S+')
        [[u'this', u'is', u'a', u'test.', u'this', u'is', u'a', u'test'],
         [u'this', u'is', u'another', u'test']]

    """
    log.debug("Getting unique tokens")
    text = _u(text)
    if tokenizer == "regexp" and "pattern" not in kwargs:
        kwargs["pattern"] = r'\w+'
    tokenized_text = [
        get_tokenize(tokenizer)(doc.lower(), **kwargs) for doc in text
    ]

    return tokenized_text
def get_stemmer(stemmer_name='snowball'):
    """
    Get a stemmer for use with text cleaning, from a standard list.

    Args:
        stemmer_name (Optional[str]): Name of stemmer to use. Defaults to 'snowball'.
            Options: 'porter', 'lancaster', or 'snowball'.

    Returns:
        Instance of the requested stemmer.
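
    Examples:
        A quick sketch (the exact return type may vary by NLTK version, so the
        value is compared rather than its repr):

        >>> get_stemmer('snowball').stem(u'running') == u'run'
        True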

    """
    stemmer_name = stemmer_name.lower()
    log.debug("Getting {} stemmer".format(stemmer_name))
    if 'porter'.startswith(stemmer_name):
        stemmer = nltk.stem.porter.PorterStemmer()
    elif 'lancaster'.startswith(stemmer_name):
        stemmer = nltk.stem.lancaster.LancasterStemmer()
    elif 'snowball'.startswith(stemmer_name):
        stemmer = nltk.stem.SnowballStemmer('english')
    else:
        raise ValueError(
            "Stemmer {} not found or not supported".format(stemmer_name))

    return stemmer
def get_words_from_file(filepath, sep=" "):
    """
    Load a set of unique words from a file.

    Args:
        filepath (str): Path to text file.
        sep (str): Delimiter between words on a line. Defaults to ' '.

    Returns:
        Set[unicode]
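
    Examples:
        A sketch, assuming a hypothetical file ``words.txt`` whose single line
        is ``Foo bar foo``:

        >>> get_words_from_file('words.txt') == {u'foo', u'bar'}
        True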

    """
    log.info("Loading list of words from {}".format(filepath))
    if not isinstance(filepath, str):
        raise TypeError("Path to file {} is not a string".format(filepath))
    if not os.path.isfile(filepath):
        raise IOError("File {} does not exist".format(filepath))

    with open(filepath) as infile:
        words = set(
            _u(word for line in infile
               for word in line.lower().strip().split(sep)))
    log.debug("Returning a set of {} words".format(len(words)))
    return words
def remove_short_tokens(text,
                        minimum_token_length=3,
                        tokenizer='regexp',
                        **kwargs):
    """
    Remove small words from input text.

    Each string will be tokenized first using a tokenizer (see function get_tokenize) and
    then short tokens will be filtered out.

    Args:
        text (List[unicode])
        minimum_token_length (int): minimum length of tokens to be retained. Defaults to 3.
        tokenizer (str): Tokenizer name. Defaults to 'regexp'.
        **kwargs: Optional keyword arguments for tokenizer.

    Returns:
        List[unicode] with short words removed
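
    Examples:
        A small sketch, assuming the default regexp tokenizer:

        >>> remove_short_tokens([u'a cat sat on the mat'])
        [u'cat sat the mat']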

    """

    log.debug("Removing short tokens")
    tokenized_text = tokenize(text, tokenizer, **kwargs)
    for i, text_i in enumerate(tokenized_text):
        tokenized_text[i] = [
            token for token in text_i if len(token) >= minimum_token_length
        ]
    return [' '.join(sublist) for sublist in tokenized_text]
def remove_punctuation(text, ignore=None, punctuation=string.punctuation):
    """
    Remove punctuation from input text.

    Args:
        text (List[unicode]): Input text.
        ignore (Optional[str]): Punctuation characters to keep. Defaults to None.
            For example, ignore="@+" does not remove the @ or + characters.
        punctuation (Optional[str]): String of punctuation characters.
            Only these characters will be removed. Defaults to string.punctuation.

    Returns:
        List[unicode]
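
    Examples:
        For instance:

        >>> remove_punctuation([u'hello, world!'])
        [u'hello world']
        >>> remove_punctuation([u'email@example.com'], ignore='@')
        [u'email@examplecom']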

    """
    log.debug("Removing punctuation {}, ignoring {}".format(
        punctuation, ignore))
    if isinstance(punctuation, list):
        punctuation = ''.join(punctuation)
    if ignore:
        ignore = ''.join(ignore)
        punc_remove = ''.join([p for p in punctuation if p not in ignore])
    else:
        punc_remove = punctuation
    remove_punctuation_map = {ord(char): None for char in punc_remove}
    text = _u(text)
    return [x.translate(remove_punctuation_map) for x in text]
def clean_specific_phrases(text, context):
    """
    Unify specific spellings depending on the context, e.g. agent chat logs.

    Args:
        text (List[str]): Text to be cleaned.
        context (str): A filename or (multiline) str defining the context.
            Each row starts with the position,
            followed by the characters to be replaced, and what they should be replaced by.
            Position can be:
                * START - Only replace this at the beginning of text or each item of text
                * END - Only replace this at the end of text or each item of text
                * EQUAL - Replace if text or an item of text is exactly this phrase
                * ALL - Replace this character sequence everywhere it occurs
            The file can also contain empty lines or comment lines starting with '#'

    Returns:
        text with phrases replaced
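
    Examples:
        A minimal sketch, passing the context as a one-line string:

        >>> clean_specific_phrases([u'my colour'], u'ALL colour color')
        [u'my color']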
    """
    log.info("Cleaning phrases from a text")
    if os.path.exists(context):
        log.debug("Trying to read context from file {}".format(infile))
        with open(context) as infile:
            context = infile.readlines()
    else:
        log.debug("Parameter context is a string, not a file name")
        context = context.split("\n")

    if not isinstance(text, list):
        text = [text]

    for line in context:
        if not len(line.strip()) or line.startswith('#'):
            continue
        pp = line.strip().split()
        if len(pp) != 3:
            msg = "Line '{}' does not have the right format".format(line)
            log.error(msg)
            raise ValueError(msg)
        if pp[0] == "START":
            text = [
                pp[2] + word[len(pp[1]):] if word.startswith(pp[1]) else word
                for word in text
            ]
        elif pp[0] == "END":
            text = [
                word[:-len(pp[1])] + pp[2] if word.endswith(pp[1]) else word
                for word in text
            ]
        elif pp[0] == "EQUAL":
            text = [pp[2] if word == pp[1] else word for word in text]
        elif pp[0] == "ALL":
            text = [word.replace(pp[1], pp[2]) for word in text]
        else:
            raise ValueError("Invalid keyword {} encountered".format(pp[0]))
    return _u(text)
def lower_all(text):
    """
    Ensures all text is lowercase.

    Args:
        text (List[unicode])

    Returns:
        List[unicode]

    """
    log.debug("Lowering all text")
    text = _u(text)
    return [word.lower() for word in text]
def remove_numeric(text):
    """
    Remove numbers from input text.

    Args:
        text (List[unicode]):

    Returns:
        List[unicode]
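
    Examples:
        Note that removing digits leaves the surrounding whitespace untouched:

        >>> remove_numeric([u'room 101'])
        [u'room ']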

    """
    log.debug("Removing numbers")
    numbers = re.compile(r'[0-9]')
    text = _u(text)
    return _u([numbers.sub('', word) for word in text])
def get_pos_tag(sentences):
    """
    Return pos tags for words in sentences.

    Args:
        sentences (List[unicode]): A list of sentences, for which the part of speech will be
            tagged for each word.  Can use sent_tokenize() to get sentences from text.

    Returns:
        List of (word, pos) for each sentence.
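
    Examples:
        A rough sketch (requires the NLTK tokenizer and tagger models; the exact
        tags can vary by NLTK version, so only the shape is checked):

        >>> tagged = get_pos_tag([u'Dogs run fast'])
        >>> len(tagged), len(tagged[0])
        (1, 3)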
    """
    log.debug("Getting positional tags")
    if not isinstance(sentences, list):
        log.error("Parameter sentences {} is not a list".format(sentences))
        raise TypeError('sentences must be a list of strings or unicode.')
    sentences = _u(sentences)
    sentences_toks = [nltk.word_tokenize(sentence) for sentence in sentences]

    return [nltk.pos_tag(sentence) for sentence in sentences_toks]
def get_tokenize(tokenizer_name="regexp"):
    """
    Returns a tokenizer function from a standard list.

    So far, only the regexp tokenizer is supported.

    Args:
        tokenizer_name (str):  Defaults to 'regexp'.

    Returns:
        Tokenizer function
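
    Examples:
        For instance, with the default regexp tokenizer:

        >>> tok = get_tokenize('regexp')
        >>> tok(u'hello world', pattern=r'\w+')
        [u'hello', u'world']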

    """
    tokenizer_name = tokenizer_name.lower()
    log.debug("Getting {} tokenizer".format(tokenizer_name))
    if 'regexp'.startswith(tokenizer_name):
        tokenize_func = nltk.tokenize.regexp_tokenize
    else:
        raise ValueError(
            "Tokenizer {} not found or not supported".format(tokenizer_name))
    return tokenize_func
def get_stopwords(base_src=None, extra_src=None, exclude_src=None):
    """
    Get custom stopwords list from files or lists of words.

    Args:
        base_src (List[str] or str (if filename)): Path to file or list with base stopwords.
            Defaults to None. If None, then nltk's english stopwords list will be used.
        extra_src (List[str] or str (if filename)): Path to file or list with extra stopwords.
        exclude_src (List[str] or str (if filename)): Path to file or list with words that
            should be retained.

    Returns:
        Set[unicode]
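
    Examples:
        A sketch, assuming the NLTK ``stopwords`` corpus has been downloaded:

        >>> sw = get_stopwords(extra_src=['foo'], exclude_src=['no'])
        >>> u'foo' in sw and u'the' in sw and u'no' not in sw
        True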

    """
    log.info("Getting stopwords")
    if extra_src is None:
        extra_src = []
    if exclude_src is None:
        exclude_src = []

    if base_src is None:
        base_set = set(_u(corpus.stopwords.words('english')))
    else:
        base_list = get_words_from_file(base_src) if isinstance(
            base_src, str) else base_src
        base_set = set(_u(word for word in base_list))

    extra_list = get_words_from_file(extra_src) if isinstance(
        extra_src, str) else extra_src
    extra_set = set(_u(word for word in extra_list))

    exclude_list = get_words_from_file(exclude_src) if isinstance(
        exclude_src, str) else exclude_src
    exclude_set = set(_u(word for word in exclude_list))

    stopwords = base_set.union(extra_set).difference(exclude_set)
    log.debug("Returning {} stopwords".format(len(stopwords)))
    return stopwords
def lemmatize_tagged_word(tagged_word, lemmatizer):
    """
    Lemmatize a tagged word in the form of (word, tag).

    Args:
        tagged_word (Tuple(unicode, str)): Tuple of (word, tag).
            Tags can only be 'a', 'n', 'r', or 'v'.
            'a' -> adjective, 'n' -> noun, 'r' -> adverb, 'v' -> verb
        lemmatizer (lemmatizer function)

    Returns:
        Lemmatized word in unicode
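
    Examples:
        A sketch, assuming the NLTK WordNet corpus is available:

        >>> lemmatize_tagged_word((u'dogs', 'n'), get_lemmatizer()) == u'dog'
        True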

    """
    log.debug("Lemmatizing tagged word")
    if _tagged_word_checker(tagged_word):
        word, tag = tagged_word
        if tag is not None:
            return lemmatizer.lemmatize(word, tag)
        else:
            log.debug("tag is None, using lemmatizer without tag")
            return lemmatizer.lemmatize(word)
def _wordnet_sanitize_word(tagged_word):
    """
    Helper function for wordnet_sanitize to ensure that tagged_word is a (string, pos) pair that
    WordNet can understand.
    """
    if not isinstance(tagged_word, tuple):
        log.error(
            "Parameter tagged_word {} is not a tuple".format(tagged_word))
        raise TypeError('tagged_word must be a tuple of (string, pos)')
    if len(tagged_word) != 2:
        log.error("Parameter tagged_word has invalid length {}".format(
            len(tagged_word)))
        raise TypeError(
            'tagged_word must be a tuple of length 2 of the form (string, pos)'
        )

    stri, tag = tagged_word
    if not isinstance(stri, basestring):
        log.error("Value of tagged_word {} is not a string".format(
            tagged_word[0]))
        raise TypeError(
            'tagged_word must be a tuple of (string, pos) where both string and pos '
            'are type str or unicode.')
    tag = tag.lower()

    if tag.startswith('v'):
        tag = 'v'
    elif tag.startswith('n'):
        tag = 'n'
    elif tag.startswith('j'):
        log.debug("Changing tag from 'j' to 'a' for {}".format(tagged_word))
        tag = 'a'
    elif tag.startswith('rb'):
        log.debug("Changing tag from 'rb' to 'b' for {}".format(tagged_word))
        tag = 'r'

    if tag in ('a', 'n', 'r', 'v'):
        return (stri, tag)
    else:
        log.debug(
            "Setting tag to None, since it's not in ('a', 'n', 'r', 'v')")
        return (stri, None)
def get_lemmatizer():
    """
    Returns an instance of WordNet's lemmatizer.
    """
    log.debug("Returning WordNetLemmatizer")
    return WordNetLemmatizer()