def sort_terms(term_doc_matrix_df, ascending=False):
    """
    Sort a term document matrix Pandas DataFrame.

    Args:
        term_doc_matrix_df (Pandas DataFrame): Term document matrix as a
            Pandas DataFrame. For example, the output of
            :func:`term_doc_matrix_to_pandas`.
        ascending (bool): Sort in ascending order. Defaults to False.

    Returns:
        Pandas DataFrame
    """
    log.debug("Sorting n-gram DataFrame")
    if not isinstance(term_doc_matrix_df, pd.DataFrame):
        log.error("Parameter term_doc_matrix_df {} is not a DataFrame".format(
            term_doc_matrix_df))
        raise TypeError("term_doc_matrix_df should be a DataFrame")
    freq = term_doc_matrix_df.sum(axis=0)
    # sort according to freq/tf-idf weight and convert to a DataFrame
    freq_df = freq.sort_values(ascending=ascending).to_frame().reset_index()
    freq_df.columns = ['term', 'sum(tfidf or count)']
    return freq_df
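
# Usage sketch (illustrative only; assumes pandas is available as pd, as used
# above): summing the toy matrix below gives 'dog' a total weight of 5 and
# 'cat' a total of 1, so sort_terms ranks 'dog' first.
#
#   tdm = pd.DataFrame({'cat': [1, 0], 'dog': [2, 3]})
#   sort_terms(tdm)
#   # -> DataFrame with columns ['term', 'sum(tfidf or count)'],
#   #    rows ('dog', 5) then ('cat', 1)
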
def consolidate_words(text, variants_dict, tokenizer="regexp", **kwargs):
    """
    Consolidate words into their canonical forms according to variants_dict.

    Note that text will be tokenized first and then each token will be
    checked and consolidated. After consolidation, tokens will be joined
    together again.

    Args:
        text (List[unicode]): a list of strings that need to be consolidated.
        variants_dict (dict): mapping of variants to canonical form
        tokenizer (str): Tokenizer name. Defaults to 'regexp'.
        **kwargs: Optional keyword arguments for tokenizer.

    Returns:
        List[unicode] with words consolidated
    """
    log.debug("Consolidating words")
    tokenized_text = tokenize(text, tokenizer, **kwargs)
    for i, text_i in enumerate(tokenized_text):
        tokenized_text[i] = [
            variants_dict.get(token, token) for token in text_i
        ]
    return [' '.join(sublist) for sublist in tokenized_text]
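
# Usage sketch (illustrative only; the variants mapping below is made up):
# each document is tokenized, every token present in variants_dict is replaced
# by its canonical form, and the tokens are re-joined with spaces.
#
#   variants = {'colour': 'color', 'colours': 'color'}
#   consolidate_words(['my favourite colour'], variants)
#   # -> [u'my favourite color']   ('favourite' is untouched: not in the dict)
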
def wordnet_sanitize(tagged_text):
    """
    Ensure that each word is a (string, pos) pair that WordNet can understand.

    Args:
        tagged_text: Sentence or list of sentences, where each sentence is a
            list of (word, pos) tuples.

    Returns:
        Sentence or list of sentences in the same form as tagged_text with
        cleaned pos tags for WordNet.
    """
    log.debug("Sanitizing tagged_text for WordNet")
    if not isinstance(tagged_text, list):
        log.error("Parameter tagged_text {} is not a list".format(tagged_text))
        raise TypeError(
            'tagged_text needs to be a list in which each item is a (word, pos) '
            'tuple or a sentence, where the sentence is a list of (word, pos) tuples.'
        )
    if isinstance(tagged_text[0][0], basestring):
        # tagged_text is a sentence
        return [_wordnet_sanitize_word(word) for word in tagged_text]
    else:
        # tagged_text is a list of sentences
        return [[_wordnet_sanitize_word(word) for word in sentence]
                for sentence in tagged_text]
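
# Usage sketch (illustrative only): Penn Treebank tags such as 'VBD' or 'RB'
# are reduced to the single-letter tags WordNet accepts ('a', 'n', 'r', 'v'),
# and anything else is mapped to None.
#
#   wordnet_sanitize([('ran', 'VBD'), ('quickly', 'RB'), ('the', 'DT')])
#   # -> [('ran', 'v'), ('quickly', 'r'), ('the', None)]
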
def tokenize(text, tokenizer='regexp', **kwargs):
    """
    Tokenize a list of strings (usually each string represents a document)
    and return the tokenized strings.

    Args:
        text (List(unicode)): input text/documents
        tokenizer (str): tokenizer name. Defaults to 'regexp'.
        **kwargs (Optional[dict]): Optional keyword arguments for tokenizer.

    Returns:
        A list of tokenized documents (each document is a list of tokens).

    Examples:
        >>> text = ['this is a test. this is a test', 'this is another test']
        >>> tokenize(text, pattern=r'\S+')
        [[u'this', u'is', u'a', u'test.', u'this', u'is', u'a', u'test'], [u'this', u'is', u'another', u'test']]
    """
    log.debug("Getting unique tokens")
    text = _u(text)
    if tokenizer == "regexp" and "pattern" not in kwargs:
        kwargs["pattern"] = r'\w+'
    tokenized_text = [
        get_tokenize(tokenizer)(doc.lower(), **kwargs) for doc in text
    ]
    return tokenized_text
def get_stemmer(stemmer_name='snowball'):
    """
    Get a stemmer for use with text cleaning, from a standard list.

    Args:
        stemmer_name (Optional[str]): Name of stemmer to use. Defaults to
            'snowball'. Options: 'porter', 'lancaster', or 'snowball'.

    Returns:
        Instance of the requested stemmer.
    """
    stemmer_name = stemmer_name.lower()
    log.debug("Getting {} stemmer".format(stemmer_name))
    if 'porter'.startswith(stemmer_name):
        stemmer = nltk.stem.porter.PorterStemmer()
    elif 'lancaster'.startswith(stemmer_name):
        stemmer = nltk.stem.lancaster.LancasterStemmer()
    elif 'snowball'.startswith(stemmer_name):
        stemmer = nltk.stem.SnowballStemmer('english')
    else:
        raise ValueError(
            "Stemmer {} not found or not supported".format(stemmer_name))
    return stemmer
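
# Usage sketch (illustrative only; requires nltk): note that unambiguous
# prefixes of the stemmer name are accepted, e.g. 'lan' resolves to the
# Lancaster stemmer.
#
#   stemmer = get_stemmer('porter')
#   stemmer.stem('running')   # -> 'run'
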
def get_words_from_file(filepath, sep=" "):
    """
    Load a set of unique words from a file.

    Args:
        filepath (str): Path to text file.
        sep (str): Delimiter between words on a line. Defaults to ' '.

    Returns:
        Set[unicode]
    """
    log.info("Loading list of words from {}".format(filepath))
    if not isinstance(filepath, str):
        raise TypeError("Path to file {} is not a string".format(filepath))
    if not os.path.isfile(filepath):
        raise IOError("File {} does not exist".format(filepath))
    with open(filepath) as infile:
        words = set(
            _u(word for line in infile
               for word in line.lower().strip().split(sep)))
    log.debug("Returning a set of {} words".format(len(words)))
    return words
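
# Usage sketch (illustrative only; 'stopwords.txt' is a hypothetical file
# containing whitespace-separated words, one or more per line):
#
#   words = get_words_from_file('stopwords.txt')
#   'the' in words   # -> True, provided the file contains 'the'
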
def remove_short_tokens(text,
                        minimum_token_length=3,
                        tokenizer='regexp',
                        **kwargs):
    """
    Remove short words from input text.

    Each string will be tokenized first using a tokenizer (see function
    get_tokenize) and then short words (tokens) will be filtered out.

    Args:
        text (List[unicode]): Input text.
        minimum_token_length (int): minimum length of tokens to be retained.
            Defaults to 3.
        tokenizer (str): Tokenizer name. Defaults to 'regexp'.
        **kwargs: Optional keyword arguments for tokenizer.

    Returns:
        List[unicode] with short words removed
    """
    log.debug("Removing short tokens")
    tokenized_text = tokenize(text, tokenizer, **kwargs)
    for i, text_i in enumerate(tokenized_text):
        tokenized_text[i] = [
            token for token in text_i if len(token) >= minimum_token_length
        ]
    return [' '.join(sublist) for sublist in tokenized_text]
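
# Usage sketch (illustrative only): with the default minimum length of 3,
# one- and two-character tokens are dropped and the remainder re-joined.
#
#   remove_short_tokens(['this is a test of it'])
#   # -> [u'this test']
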
def remove_punctuation(text, ignore=None, punctuation=string.punctuation):
    """
    Remove punctuation from input text.

    Args:
        text (List[unicode]): Input text to remove punctuation from.
        ignore (Optional[str]): Punctuation characters to keep. Defaults to
            None. For example, ignore="@+" does not remove the @ or +
            characters.
        punctuation (Optional[str]): String of punctuation characters. Only
            these characters will be removed. Defaults to string.punctuation.

    Returns:
        List[unicode]
    """
    log.debug("Removing punctuation {}, ignoring {}".format(
        punctuation, ignore))
    if isinstance(punctuation, list):
        punctuation = ''.join(punctuation)
    if ignore:
        ignore = ''.join(ignore)
        punc_remove = ''.join([p for p in punctuation if p not in ignore])
    else:
        punc_remove = punctuation
    remove_punctuation_map = {ord(char): None for char in punc_remove}
    text = _u(text)
    return [x.translate(remove_punctuation_map) for x in text]
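
# Usage sketch (illustrative only): every character in string.punctuation is
# stripped unless it is listed in `ignore`.
#
#   remove_punctuation(['hello, world!'])                # -> [u'hello world']
#   remove_punctuation(['e-mail: a@b.com'], ignore='@')  # -> [u'email a@bcom']
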
def clean_specific_phrases(text, context):
    """
    Depending on the context, e.g. agent chat logs, specific spellings
    should be unified.

    Args:
        text (List[str]): Text to be cleaned.
        context (str): A filename or (multiline) str defining the context.
            Each row starts with the position, followed by the characters to
            be replaced, and what they should be replaced by. Position can be:

            * START - Only replace this at the beginning of text or each item
              of text
            * END - Only replace this at the end of text or each item of text
            * EQUAL - Replace if text or an item of text is exactly this
              phrase
            * ALL - Replace this character sequence everywhere it occurs

            The file can also contain empty lines or comment lines starting
            with '#'.

    Returns:
        text with phrases replaced
    """
    log.info("Cleaning phrases from a text")
    if os.path.exists(context):
        log.debug("Trying to read context from file {}".format(context))
        with open(context) as infile:
            context = infile.readlines()
    else:
        log.debug("Parameter context is a string, not a file name")
        context = context.split("\n")
    if not isinstance(text, list):
        text = [text]
    for line in context:
        if not len(line.strip()) or line.startswith('#'):
            continue
        pp = line.strip().split()
        if not len(pp) == 3:
            msg = "Line '{}' does not have the right format".format(line)
            log.error(msg)
            raise ValueError(msg)
        if pp[0] == "START":
            text = [
                pp[2] + word[len(pp[1]):] if word.startswith(pp[1]) else word
                for word in text
            ]
        elif pp[0] == "END":
            text = [
                word[:-len(pp[1])] + pp[2] if word.endswith(pp[1]) else word
                for word in text
            ]
        elif pp[0] == "EQUAL":
            text = [pp[2] if word == pp[1] else word for word in text]
        elif pp[0] == "ALL":
            text = [word.replace(pp[1], pp[2]) for word in text]
        else:
            raise ValueError("Invalid keyword {} encountered".format(pp[0]))
    return _u(text)
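
# Usage sketch (illustrative only; the context rules below are made up): each
# non-comment line of the context reads "<POSITION> <old> <new>".
#
#   rules = "ALL colour color\nEQUAL thx thanks"
#   clean_specific_phrases(['colourful', 'thx'], rules)
#   # -> [u'colorful', u'thanks']
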
def lower_all(text):
    """
    Ensures all text is lowercase.

    Args:
        text (List[unicode])

    Returns:
        List[unicode]
    """
    log.debug("Lowering all text")
    text = _u(text)
    return [word.lower() for word in text]
def remove_numeric(text):
    """
    Remove numbers from input text.

    Args:
        text (List[unicode])

    Returns:
        List[unicode]
    """
    log.debug("Removing numbers")
    numbers = re.compile(r'[0-9]')
    text = _u(text)
    return _u([numbers.sub('', word) for word in text])
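
# Usage sketch (illustrative only): digits are deleted in place, everything
# else (including surrounding whitespace) is kept.
#
#   remove_numeric(['room 101', 'b2b'])   # -> [u'room ', u'bb']
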
def get_pos_tag(sentences):
    """
    Return pos tags for words in sentences.

    Args:
        sentences (List[unicode]): A list of sentences, for which the part of
            speech will be tagged for each word. Can use sent_tokenize() to
            get sentences from text.

    Returns:
        List of (word, pos) for each sentence.
    """
    log.debug("Getting part-of-speech tags")
    if not isinstance(sentences, list):
        log.error("Parameter sentences {} is not a list".format(sentences))
        raise TypeError('sentences must be a list of strings or unicode.')
    sentences = _u(sentences)
    sentences_toks = [nltk.word_tokenize(sentence) for sentence in sentences]
    return [nltk.pos_tag(sentence) for sentence in sentences_toks]
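
# Usage sketch (illustrative only; requires the nltk tokenizer and tagger
# models to be downloaded): each sentence becomes a list of
# (word, Penn Treebank tag) tuples, e.g. roughly:
#
#   get_pos_tag(['The dog barks.'])
#   # -> [[('The', 'DT'), ('dog', 'NN'), ('barks', 'VBZ'), ('.', '.')]]
#   # (exact tags depend on the tagger model in use)
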
def get_tokenize(tokenizer_name="regexp"):
    """
    Returns a tokenize function from a standard list. So far, only the regexp
    tokenizer is supported.

    Args:
        tokenizer_name (str): Defaults to 'regexp'.

    Returns:
        Tokenizer function
    """
    tokenizer_name = tokenizer_name.lower()
    log.debug("Getting {} tokenizer".format(tokenizer_name))
    if 'regexp'.startswith(tokenizer_name):
        tokenize_func = nltk.tokenize.regexp_tokenize
    else:
        raise ValueError(
            "Tokenizer {} not found or not supported".format(tokenizer_name))
    return tokenize_func
def get_stopwords(base_src=None, extra_src=None, exclude_src=None):
    """
    Get custom stopwords list from files or lists of words.

    Args:
        base_src (List[str] or str (if filename)): Path to file or list with
            base stopwords. Defaults to None. If None, then nltk's english
            stopwords list will be used.
        extra_src (List[str] or str (if filename)): Path to file or list with
            extra stopwords.
        exclude_src (List[str] or str (if filename)): Path to file or list
            with words that should be retained.

    Returns:
        Set[unicode]
    """
    log.info("Getting stopwords")
    if extra_src is None:
        extra_src = []
    if exclude_src is None:
        exclude_src = []
    if base_src is None:
        base_set = set(_u(corpus.stopwords.words('english')))
    else:
        base_list = get_words_from_file(base_src) if isinstance(
            base_src, str) else base_src
        base_set = set(_u(word for word in base_list))
    extra_list = get_words_from_file(extra_src) if isinstance(
        extra_src, str) else extra_src
    extra_set = set(_u(word for word in extra_list))
    exclude_list = get_words_from_file(exclude_src) if isinstance(
        exclude_src, str) else exclude_src
    exclude_set = set(_u(word for word in exclude_list))
    stopwords = base_set.union(extra_set).difference(exclude_set)
    log.debug("Returning {} stopwords".format(len(stopwords)))
    return stopwords
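
# Usage sketch (illustrative only; requires the nltk 'stopwords' corpus):
# start from nltk's English list, add extras, then carve out exceptions.
#
#   sw = get_stopwords(extra_src=['please'], exclude_src=['not'])
#   'please' in sw, 'not' in sw   # -> (True, False)
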
def lemmatize_tagged_word(tagged_word, lemmatizer):
    """
    Lemmatize a tagged word in the form of (word, tag).

    Args:
        tagged_word (Tuple(unicode, str)): Tuple of (word, tag). Tags can only
            be 'a', 'n', 'r', or 'v'.
            'a' -> adjective, 'n' -> noun, 'r' -> adverb, 'v' -> verb
        lemmatizer (lemmatizer function)

    Returns:
        Lemmatized word in unicode
    """
    log.debug("Lemmatizing tagged word")
    if _tagged_word_checker(tagged_word):
        word, tag = tagged_word
        if tag is not None:
            return lemmatizer.lemmatize(word, tag)
        else:
            log.debug("tag is None, using lemmatizer without tag")
            return lemmatizer.lemmatize(word)
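
# Usage sketch (illustrative only; requires the nltk WordNet corpus):
#
#   lemmatizer = get_lemmatizer()
#   lemmatize_tagged_word((u'running', 'v'), lemmatizer)   # -> u'run'
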
def _wordnet_sanitize_word(tagged_word):
    """
    Helper function for wordnet_sanitize to ensure that tagged_word is a
    (string, pos) pair that WordNet can understand.
    """
    if not isinstance(tagged_word, tuple):
        log.error(
            "Parameter tagged_word {} is not a tuple".format(tagged_word))
        raise TypeError('tagged_word must be a tuple of (string, pos)')
    if len(tagged_word) != 2:
        log.error("Parameter tagged_word has invalid length {}".format(
            len(tagged_word)))
        raise TypeError(
            'tagged_word must be a tuple of length 2 of the form (string, pos)'
        )
    stri, tag = tagged_word
    if not isinstance(stri, basestring):
        log.error("Value of tagged_word {} is not a string".format(
            tagged_word[0]))
        raise TypeError(
            'tagged_word must be a tuple of (string, pos) where both string '
            'and pos are type str or unicode.')
    tag = tag.lower()
    if tag.startswith('v'):
        tag = 'v'
    elif tag.startswith('n'):
        tag = 'n'
    elif tag.startswith('j'):
        log.debug("Changing tag from 'j' to 'a' for {}".format(tagged_word))
        tag = 'a'
    elif tag.startswith('rb'):
        log.debug("Changing tag from 'rb' to 'r' for {}".format(tagged_word))
        tag = 'r'
    if tag in ('a', 'n', 'r', 'v'):
        return (stri, tag)
    else:
        log.debug(
            "Setting tag to None, since it's not in ('a', 'n', 'r', 'v')")
        return (stri, None)
def get_lemmatizer():
    """
    Returns an instance of WordNet's lemmatizer.
    """
    log.debug("Returning WordNetLemmatizer")
    return WordNetLemmatizer()