Example #1
import string
from string import digits

from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer


def tokenize_lemm(text):
    # Replace every digit and punctuation character with a space,
    # then tokenize and lemmatize.
    lemmatizer = WordNetLemmatizer()
    digits_tbl = {ord(d): ' ' for d in digits}
    punc_tbl = {ord(p): ' ' for p in string.punctuation}

    text = text.translate(digits_tbl)
    text = text.translate(punc_tbl)
    tokens = word_tokenize(text)
    lemmas = lemm_tokens(tokens, lemmatizer)  # external helper; sketched below
    return lemmas
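The `lemm_tokens` helper is not defined in the snippet. A minimal sketch, assuming it simply maps the lemmatizer over the token list:

def lemm_tokens(tokens, lemmatizer):
    # Hypothetical helper: lemmatize each token with the given lemmatizer.
    return [lemmatizer.lemmatize(token) for token in tokens]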
Example #2
import string
from string import digits

from nltk import word_tokenize
from nltk.stem import PorterStemmer


def tokenize_stem(text):
    # Replace every digit and punctuation character with a space,
    # then tokenize and stem.
    stemmer = PorterStemmer()
    digits_tbl = {ord(d): ' ' for d in digits}
    punc_tbl = {ord(p): ' ' for p in string.punctuation}
    text = text.translate(digits_tbl)
    text = text.translate(punc_tbl)

    # A commented-out variant also filtered a long list of domain-specific
    # stop words (company names, locations, and equal-opportunity boilerplate
    # such as 'gender', 'veteran', 'citizenship') before tokenizing.
    tokens = word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)  # external helper; sketched below
    return stems
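Like `lemm_tokens` above, `stem_tokens` is not defined in the snippet. A minimal sketch of what it presumably does:

def stem_tokens(tokens, stemmer):
    # Hypothetical helper: stem each token with the given stemmer.
    return [stemmer.stem(token) for token in tokens]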
Example #3
import re


def preprocess(text):
    # Clean a review supplied as a bytes object.
    text = re.sub(b"<.*?>", b" ", text)  # strip HTML tags
    text = re.sub(b"\n", b" ", text)     # strip newlines
    text = re.sub(b"\r", b" ", text)     # strip carriage returns
    # Lowercase and delete punctuation (bytes.translate accepts a delete argument).
    text = text.translate(None, b'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~').lower()
    # Drop the fixed-length footer shared by every review in this corpus.
    text = text[:-579]
    return text
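A quick sanity check; the 579-byte footer length is specific to the original corpus, so filler bytes stand in for it here:

sample = b"<p>Great product!\r\n</p>" + b"x" * 579  # filler stands in for the footer
print(preprocess(sample))  # -> b' great product   '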
Example #4
import re
import string


def sent_list(docs, splitStr='__label__'):
    # Parse fastText-style lines ('__label__<n> <text>') into [text, sentiment] pairs.
    sent_analysis = []
    for i in range(1, len(docs)):  # the entry at index 0 is skipped
        text = str(docs[i])
        splitText = text.split(splitStr)
        secHalf = splitText[1]
        sentiment = secHalf[0]                      # label character right after splitStr
        text = secHalf[2:len(secHalf) - 1].lower()  # drop label, space, and trailing newline

        # Strip punctuation.
        table = str.maketrans('', '', string.punctuation)
        text = text.translate(table)

        # Replace URL-like tokens with a placeholder.
        if 'www.' in text or 'http:' in text or 'https:' in text or '.com' in text:
            text = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", text)

        # Remove digits.
        text = re.sub(r'\d+', '', text)

        sent_analysis.append([text, sentiment])
    return sent_analysis
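A usage sketch with a made-up fastText-style label line; a dummy first entry is included because the loop starts at index 1:

docs = ['header line', '__label__2 great phone, works fine!\n']
print(sent_list(docs))  # -> [['great phone works fine', '2']]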
Example #5
import re
import string

import nltk


def tokenize_text(text):
    # Lowercase, strip URLs and punctuation, tokenize, then filter tokens.
    text = text.lower()
    text = re.sub(r"http\S+", "", text)  # drop URLs
    # str.translate needs a translation table, not the raw punctuation string.
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = nltk.word_tokenize(text)
    words = [re.sub("[^A-Za-z0-9]", "", word) for word in words]

    stop_words = set(nltk.corpus.stopwords.words("english"))  # build once, O(1) lookups
    final_words = []
    for word in words:
        if not word:
            continue
        if word in stop_words:
            continue
        # '@' and '#' are already stripped above; this guard only matters
        # if the punctuation filtering is ever relaxed.
        if word.startswith("@") or word.startswith("#"):
            continue
        if word.isnumeric():
            continue
        final_words.append(word)
    return final_words
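Assuming the NLTK punkt and stopwords data have been downloaded, a quick usage sketch:

print(tokenize_text("Check https://example.com! @user says 42 things matter"))
# -> ['check', 'user', 'says', 'things', 'matter']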
Example #6
import re
import string

from nltk.stem import PorterStemmer

# STOPWORDS is assumed to be a stop-word collection defined elsewhere.


def clean_text(text, remove_stop_words=False):
    # Lowercase, turn punctuation into spaces, and collapse whitespace;
    # optionally remove stop words and stem what remains.
    text = text.lower()
    replace_punctuation = str.maketrans(string.punctuation,
                                        ' ' * len(string.punctuation))
    text = text.translate(replace_punctuation)
    text = re.sub(r'\s+', ' ', text)  # also absorbs \n and \r

    if remove_stop_words:
        new_text = []
        stemmer = PorterStemmer()

        for word in text.split():
            if word not in STOPWORDS:
                new_text.append(stemmer.stem(word))

        text = ' '.join(new_text)

    return text
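A usage sketch with a stand-in stop-word set (the real STOPWORDS is defined elsewhere):

STOPWORDS = {'the', 'is', 'a'}  # minimal stand-in for the real collection
print(clean_text("The cats are RUNNING!", remove_stop_words=True))  # -> 'cat are run'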
Example #7
import string
from functools import reduce

# `contractions` (a contraction -> expansion dict), `strip_accents`, the `tk`
# tokenizer, and the per-country stop-word lists are defined elsewhere.


def clean_text(text, country):
    # Expand contractions, normalize, tokenize, and drop country-specific stop words.
    text = reduce(lambda a, kv: a.replace(*kv), contractions.items(),
                  text.lower())
    text = text.replace('\t', ' ').replace('\n', ' ').replace('\r', ' ')
    text = strip_accents(text)
    text = text.translate(
        str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
    tokens = tk.tokenize(text)
    if country == 'USA':
        stopwords = usa_stopwords
    elif country == 'Canada':
        stopwords = canada_stopwords
    elif country == 'UK':
        stopwords = britain_stopwords
    else:
        raise ValueError("Country is invalid.")
    # Keep tokens longer than two characters that are neither stop words nor numbers.
    tokens = [
        w for w in tokens
        if w not in stopwords and len(w) > 2 and not w.isdigit()
    ]
    return ' '.join(tokens)
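`strip_accents` is not shown in the snippet. A common implementation based on unicodedata, offered here as an assumed sketch rather than the original code:

import unicodedata

def strip_accents(text):
    # Decompose accented characters (NFKD), then drop the combining marks.
    return ''.join(ch for ch in unicodedata.normalize('NFKD', text)
                   if not unicodedata.combining(ch))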
Example #8
import string


def cleanup(text):
    text = textClean(text)  # external helper defined elsewhere
    # string.maketrans no longer exists in Python 3; assuming the intent
    # was to delete punctuation.
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text
Example #9
import string


def remove_punc(text):
    # Delete every punctuation character from the text.
    table = str.maketrans('', '', string.punctuation)
    return text.translate(table)
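A quick usage example:

print(remove_punc("Hello, world!"))  # -> 'Hello world'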