Example #1
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

def readAndCleanDoc(doc):
    # 1. Open document, read text into *single* string
    with open(doc, 'r') as file:
        myfile = file.read()
    # 2. Tokenize string using nltk.tokenize.word_tokenize
    nltk.download('punkt')  # fetch tokenizer models if missing (runs a check on every call)
    doc_tokens = nltk.tokenize.word_tokenize(myfile)

    # 3. Filter out punctuation from list of words (use remove_punc)
    filtrd_doc = remove_punc(doc_tokens)

    # 4. Make the words lower case (done in the same pass as step 5)

    # 5. Filter out stopwords
    nltk.download('stopwords')
    stop = stopwords.words('english')
    doc_tokens_clean = [word.lower() for word in filtrd_doc if word.lower() not in stop]

    # 6. Stem words
    stemmer = PorterStemmer()

    words = [stemmer.stem(word) for word in doc_tokens_clean]

    return words
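Every example on this page calls a remove_punc helper that is defined elsewhere and not shown. A minimal sketch of what it could look like, assuming it simply drops tokens made up entirely of punctuation (the name and behavior are inferred, not taken from the original source):

import string

def remove_punc(tokens):
    # Keep tokens that contain at least one non-punctuation character.
    return [t for t in tokens if not all(ch in string.punctuation for ch in t)]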
Example #2
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def readAndCleanDoc(doc):

    #1. Open document, read text into *single* string
    with open(doc, "r") as file:
        text = file.read()

    #2. Tokenize string using nltk.tokenize.word_tokenize
    text = word_tokenize(text)

    #3. Filter out punctuation from list of words (use remove_punc)
    text = remove_punc(text)
    #4. Make the words lower case

    text = [word.lower() for word in text]

    #5. Filter out stopwords
    stop_set = set(stopwords.words('english'))
    text_nostop = [word for word in text if word not in stop_set]

    #6. Lemmatize words
    lmtzr = WordNetLemmatizer()
    words = [lmtzr.lemmatize(x) for x in text_nostop]

    return words
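Example #1 stems in step 6, while this one lemmatizes; the two are not interchangeable. Porter stemming clips suffixes heuristically, whereas WordNetLemmatizer looks words up in the wordnet corpus (which must be downloaded once with nltk.download('wordnet')). A quick comparison of what each call actually returns:

from nltk.stem import PorterStemmer, WordNetLemmatizer

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

print(stemmer.stem('studies'))               # studi  (heuristic suffix clipping)
print(lemmatizer.lemmatize('studies'))       # study  (dictionary form, noun by default)
print(lemmatizer.lemmatize('running', 'v'))  # run    (verbs need an explicit POS hint)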
Example #3
import nltk

def readAndCleanDoc(doc):
    #1. Open document, read text into *single* string
    with open(doc, 'r') as f:
        words = f.read()

    #2. Tokenize string using nltk.tokenize.word_tokenize
    words = nltk.tokenize.word_tokenize(words)

    #3. Filter out punctuation from list of words (use remove_punc)
    words = remove_punc(words)

    #4. Make the words lower case
    words = [content.lower() for content in words]

    #5. Filter out stopwords
    stop_words = nltk.corpus.stopwords.words('english')
    words = [content for content in words if content not in stop_words]

    #6. Stem words
    stemmer = nltk.stem.PorterStemmer()
    words = [stemmer.stem(content) for content in words]    

    return words
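A hypothetical end-to-end call, assuming a plain-text file sample.txt exists (the file name is only for illustration); counting the cleaned stems is a typical next step:

from collections import Counter

words = readAndCleanDoc('sample.txt')  # hypothetical input file
print(Counter(words).most_common(5))   # five most frequent cleaned stems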
Example #4
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def readAndCleanDoc(doc):
    #1. Open document, read text into *single* string
    with open(doc, 'r') as file:
        words = file.read().replace('\n', ' ')
    #2. Tokenize string using nltk.tokenize.word_tokenize
    words = nltk.tokenize.word_tokenize(words)
    #3. Filter out punctuation from list of words (use remove_punc)
    words = remove_punc(words)
    #4. Make the words lower case
    words = [i.lower() for i in words]
    #5. Filter out stopwords
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if w not in stop_words]
    #6. Lemmatize words
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(i) for i in words]
    return words
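The replace('\n', ' ') in step 1 is harmless but redundant: word_tokenize already treats newlines as token boundaries. A quick check:

from nltk.tokenize import word_tokenize

print(word_tokenize('first line\nsecond line'))
# ['first', 'line', 'second', 'line']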
Example #5
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

def readAndCleanDoc(doc):
    #1. Open document, read text into *single* string
    with open(doc, 'r') as f:
        data = f.read()
    data = data.lower()
    #2. Tokenize string using nltk.tokenize.word_tokenize
    word_tokens = word_tokenize(data, language='english')
    #3. Filter out punctuation from list of words (use remove_punc)
    word_tokens = remove_punc(word_tokens)
    #4. Make the words lower case (already lowered in step 1)
    #5. Filter out stopwords
    stop_words = set(stopwords.words('english'))
    words = [w for w in word_tokens if w not in stop_words]
    #6. Stem words
    ps = PorterStemmer()
    words = [ps.stem(w) for w in words]

    return words
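All of these snippets assume the NLTK data they rely on is already installed; only Example #1 downloads it inline, and Example #6 leaves the downloads commented out. A one-time setup, run once per environment, might look like:

import nltk

nltk.download('punkt')      # tokenizer models for word_tokenize
nltk.download('stopwords')  # English stopword list
nltk.download('wordnet')    # needed only by the lemmatizing variants (#2, #4, #7)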
Example #6
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

def readAndCleanDoc(doc):
    #1. Open document, read text into *single* string
    with open(doc, "r") as f:
        text = f.read()
    #2. Tokenize string using nltk.tokenize.word_tokenize
    #nltk.download('punkt')
    word_token = word_tokenize(text)
    #3. Filter out punctuation from list of words (use remove_punc)
    word_filtered = remove_punc(word_token)
    #4. Make the words lower case
    word_lower = [word.lower() for word in word_filtered]
    #5. Filter out stopwords
    #nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))
    doc_clean = [word for word in word_lower if word not in stop_words]
    #6. Stem words
    ps = PorterStemmer()
    words = [ps.stem(w) for w in doc_clean]

    return words
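Ordering matters in steps 4 and 5: the NLTK stopword list is all lowercase, so filtering before lowercasing would leave capitalized stopwords such as 'The' in the output. A quick illustration:

from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
print('the' in stop_words)  # True
print('The' in stop_words)  # False, so lowercase before filtering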
Example #7
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def readAndCleanDoc(doc):
    #1. Open document, read text into *single* string
    with open(doc, 'r') as myfile:
        tmp = myfile.read()

    #2. Tokenize string using nltk.tokenize.word_tokenize
    tokens = word_tokenize(tmp)

    #3. Filter out punctuation from list of words (use remove_punc)
    tokens = remove_punc(tokens)

    #4. Make the words lower case
    tokens = [word.lower() for word in tokens]

    #5. Filter out stopwords
    stop = stopwords.words('english')
    tokens_clean = [x for x in tokens if x not in stop]

    #6. Lemmatize words
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in tokens_clean]

    return words
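Like Examples #1 and #3, this one tests membership against the raw stopword list, which is a linear scan of roughly 180 entries per token; wrapping the list in a set, as Examples #5 and #6 do, makes each lookup constant-time. The difference in a self-contained snippet:

from nltk.corpus import stopwords

stop = set(stopwords.words('english'))  # set membership is O(1)
tokens = ['this', 'example', 'keeps', 'only', 'content', 'words']
print([t for t in tokens if t not in stop])
# ['example', 'keeps', 'content', 'words']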