Exemple #1
0
import re
import numpy as np
from numpy import log

from Vector import Vector

# ==== Cleaning Text ===

# Load stopwords
from stopwords import stopwords

# For every stopword that contains an apostrophe, also register its
# apostrophe-free spelling (e.g. "don't" -> "dont"), then merge those
# variants back into the stopword set.
new_stop_words = set()
for word in stopwords:
    if "'" not in word:
        continue
    new_stop_words.add(word.replace("'", ""))
stopwords = stopwords | new_stop_words


# Sanitizer
def sanitize(text):
    """
    clean up:
        1. split a string into a list of words
        2. remove all @ handles
        3. remove all hash tags
        4. remove all links
        5. remove all stopwords
        6. throw away punctuation, except smiley faces
        7. make the final vector of words a set

    @args:
import sys
import re
import math

from Vector import Vector


# ==== Cleaning Text ===

# Load stopwords
from stopwords import stopwords
# For every stopword that contains an apostrophe, also register its
# apostrophe-free spelling (e.g. "don't" -> "dont"), then merge those
# variants back into the stopword set.
new_stop_words = set()
for word in stopwords:
    if "'" not in word:
        continue
    new_stop_words.add(word.replace("'", ""))
stopwords = stopwords | new_stop_words


# Sanitizer
def sanitize(text):
    """
    clean up:
        1. split a string into a list of words
        2. remove all @ handles
        3. remove all hash tags
        4. remove all links
        5. remove all stopwords
        6. throw away punctuation, except smiley faces
        7. make the final vector of words a set

    @args: