Example #1
0
def tokenize_document(doc, is_ted, is_only_nouns):
    """
    Extract a list of lemmatized tokens from a text string.

    The text is pre-processed and filtered, then the NLTK tokenizer is
    applied; if the corresponding flag is enabled, only noun-tagged tokens
    are kept; finally each token is lemmatized (noun lemmatization first,
    falling back to verb lemmatization when the noun form is unchanged).
    Tokens of length 1 and purely-numeric tokens are discarded.

    PARAMETERS:
       1. doc: The string text from which to extract the tokens
       2. is_ted: A flag saying whether to add the custom stopwords prepared
          for the TED talks corpus to the standard English stopword list
       3. is_only_nouns: A flag saying whether to keep only the tokens
          tagged as nouns
    RETURNS:
       A list of strings where each string is a token from the given text
       (empty list if an error occurs part-way through; the error is logged)
    """
    res = []

    try:
        # First pre-process and filter the given text
        doc2 = remove_punctuation_stopwords(doc, is_ted)
        # Apply the NLTK tokenizer to the pre-processed, filtered text.
        # NOTE(review): PunktWordTokenizer was removed in NLTK 3.0; newer
        # NLTK versions would need e.g. nltk.word_tokenize here instead.
        tokens = PunktWordTokenizer().tokenize(' '.join(doc2))
        # If the flag is enabled, keep only the tokens POS-tagged as nouns
        if is_only_nouns:
            noun_tags = ('NN', 'NNP', 'NNS')
            tokens = [token for token, tag in nltk.pos_tag(tokens)
                      if tag in noun_tags]
        # Build the lemmatizer once: constructing it loads WordNet
        # resources, so creating two instances per token (as the previous
        # revision did) is needlessly expensive.
        lemmatizer = WordNetLemmatizer()
        for token in tokens:
            lema = lemmatizer.lemmatize(token)
            # If noun lemmatization left the token unchanged, try the
            # verb lemmatization instead
            if lema == token:
                lema = lemmatizer.lemmatize(token, 'v')
            # Discard single characters and purely-numeric tokens
            if (len(lema) > 1) and (not lema.isdigit()):
                res.append(lema)
    except Exception:
        # Best-effort: log the failure and return what was collected.
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are no longer swallowed.
        print("tokenize_document")
        print("")
        traceback.print_exc()

    return res