Example #1
def expand_match(contraction):
    # re.sub callback: expand a matched contraction via CONTRACTION_MAP,
    # trying the exact match first and the lowercased form as a fallback,
    # then restore the original first character to preserve casing.
    match = contraction.group(0)
    first_char = match[0]
    expanded_contraction = (CONTRACTION_MAP.get(match)
                            or CONTRACTION_MAP.get(match.lower()))
    expanded_contraction = first_char + expanded_contraction[1:]
    return expanded_contraction
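On its own, expand_match is just a callback for re.sub; a minimal usage sketch of how it is typically wired up (the CONTRACTION_MAP entries here are illustrative stand-ins for the real mapping):

import re

CONTRACTION_MAP = {"don't": "do not", "can't": "cannot"}  # stand-in entries

pattern = re.compile('({})'.format('|'.join(CONTRACTION_MAP.keys())),
                     flags=re.IGNORECASE | re.DOTALL)

print(pattern.sub(expand_match, "Don't worry, it can't fail."))
# -> Do not worry, it cannot fail.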
Example #2
def get_cleaned_text_data():
    df = pd.read_sql('SELECT * FROM security_tweet_data', con=con)

    # ------------------------------ preprocessing ------------------------------
    # Strip URLs.
    df['sql_tweet_text'] = df['sql_tweet_text'].replace(
        r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''',
        '',
        regex=True)
    # Strip @mentions.
    df['sql_tweet_text'] = df['sql_tweet_text'].replace(r'\B@\w+',
                                                        '',
                                                        regex=True)
    # Expand contractions token by token.
    df["sql_tweet_text"] = df['sql_tweet_text'].apply(lambda x: [
        CONTRACTION_MAP[item.lower()] if item.lower() in CONTRACTION_MAP
        else item for item in str(x).split()
    ])
    # Drop possessive 's / ’s endings.
    df["sql_tweet_text"] = df['sql_tweet_text'].apply(lambda x: [
        item[:-2] if item[-2:] in ("'s", "’s") else item
        for item in x
    ])
    df["sql_tweet_text"] = df["sql_tweet_text"].str.join(" ")
    # Keep only alphanumerics and spaces.
    df["sql_tweet_text"] = df["sql_tweet_text"].replace('[^a-zA-Z0-9 ]',
                                                        '',
                                                        regex=True)
    # Lowercase, drop pure numbers and stop words.
    df["sql_tweet_text"] = df['sql_tweet_text'].apply(lambda x: [
        item.lower() for item in str(x).split()
        if not item.isdigit() and item not in stop
    ])
    df["sql_tweet_text"] = df["sql_tweet_text"].str.join(" ")
    # df["sql_tweet_text"] = df['sql_tweet_text'].apply(lambda x: ''.join(item[0] for item in itertools.groupby(x)))  # collapse repeated characters (disabled)
    # Lemmatize tokens between 3 and 10 characters that are not stop words.
    df["sql_tweet_text"] = df['sql_tweet_text'].apply(lambda x: " ".join([
        lemmatizer.lemmatize(item.lower()) for item in str(x).split()
        if 2 < len(item) <= 10 and item not in stop
    ]))
    # ----------------------------------------------------------------------------
    return df
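get_cleaned_text_data leans on several module-level names it never defines; a hedged sketch of the context it assumes (database path, mapping contents, and lemmatizer choice are all assumptions):

import sqlite3
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

con = sqlite3.connect('tweets.db')     # hypothetical connection
CONTRACTION_MAP = {"don't": "do not"}  # stand-in for the full mapping
stop = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()       # any object with .lemmatize() fits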
Example #3
def processing_text(tweetobj):
    # Build a list of the main words in the tweet: strip punctuation, links,
    # @mentions, and stop words, but keep hashtag words.

    # Strip URLs.
    no_links = re.sub(
        r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''',
        '', tweetobj)
    # Strip @mentions.
    no_hashtags_ats = re.sub(r'\B@\w+', '', no_links)
    # Expand contractions token by token.
    expand_contractions = [
        CONTRACTION_MAP[item.lower()]
        if item.lower() in CONTRACTION_MAP else item
        for item in str(no_hashtags_ats).split()
    ]
    # Drop possessive 's / ’s endings.
    remove_extra_contractions = " ".join([
        item[:-2] if item[-2:] in ("'s", "’s") else item
        for item in expand_contractions
    ])
    # Keep only alphanumerics and spaces.
    alphanumeric_only = re.sub(r'[^a-zA-Z0-9 ]', '', remove_extra_contractions)
    # Lowercase, drop pure numbers and stop words.
    minus_stopw_and_hashes = " ".join([
        item.lower() for item in str(alphanumeric_only).split()
        if not item.isdigit() and item not in stopwords
    ])
    # Collapse runs of repeated characters (groupby works character-wise here).
    minus_repeat_characters = ''.join(
        item[0] for item in itertools.groupby(minus_stopw_and_hashes))
    # Stem tokens between 4 and 10 characters long.
    filtered_words = [
        snowballstemmer.stem(item.lower())
        for item in str(minus_repeat_characters).split()
        if 3 < len(item) <= 10
    ]
    return filtered_words
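A quick call-through, with stand-in values for the assumed module-level names; note how the character-level itertools.groupby collapses the doubled 'n' in "cannot":

import re
import itertools
from nltk.stem import SnowballStemmer

snowballstemmer = SnowballStemmer('english')  # assumed module-level name
stopwords = {'a', 'the', 'is', 'we'}          # stand-in stop-word set
CONTRACTION_MAP = {"can't": "cannot"}         # stand-in mapping

print(processing_text("@user check https://example.com we can't wait!"))
# -> ['check', 'canot', 'wait']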
Example #4
def expand_contractions(text):
    # Build one alternation pattern over all contraction keys.
    pattern = re.compile("({})".format("|".join(CONTRACTION_MAP.keys())),
                         flags=re.DOTALL | re.IGNORECASE)

    def replace_text(t):
        txt = t.group(0)
        if txt.lower() in CONTRACTION_MAP:
            return CONTRACTION_MAP[txt.lower()]
        return txt  # leave unknown matches untouched rather than returning None

    expand_text = pattern.sub(replace_text, text)
    return expand_text
Example #5
def expand_contractions(text):

    # Build one alternation pattern over all contraction keys.
    contractions_pattern = re.compile('({})'.format('|'.join(
        CONTRACTION_MAP.keys())),
                                      flags=re.IGNORECASE | re.DOTALL)

    def expand_match(contraction):
        # Expand a matched contraction, preserving the first character's case.
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = (CONTRACTION_MAP.get(match)
                                or CONTRACTION_MAP.get(match.lower()))
        expanded_contraction = first_char + expanded_contraction[1:]
        return expanded_contraction

    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)  # drop leftover apostrophes
    return expanded_text
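A small demonstration of the behaviour (the two CONTRACTION_MAP entries are stand-ins for the real mapping):

import re

CONTRACTION_MAP = {"you're": "you are", "isn't": "is not"}  # stand-in entries

print(expand_contractions("You're sure this isn't broken?"))
# -> You are sure this is not broken?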
Example #6
def clean_text(text):
    '''Remove unwanted characters and stop words, and normalize the text to reduce null word embeddings.'''

    # Convert words to lower case
    text = text.lower()

    # Format words and remove unwanted characters
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)

    # Expand contractions. Apostrophes must survive up to this point or the
    # pattern below cannot match; leftovers are stripped after expansion.
    contractions_pattern = re.compile('({})'.format('|'.join(
        CONTRACTION_MAP.keys())),
                                      flags=re.IGNORECASE | re.DOTALL)

    def expand_match(contraction):
        # Expand a matched contraction, preserving the first character's case.
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = (CONTRACTION_MAP.get(match)
                                or CONTRACTION_MAP.get(match.lower()))
        expanded_contraction = first_char + expanded_contraction[1:]
        return expanded_contraction

    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)  # drop leftover apostrophes

    # Remove stop words
    text = expanded_text.split()
    stops = set(stopwords.words("english"))
    text = [w for w in text if w not in stops]
    text = " ".join(text)
    logging.info("sentence cleaned")
    # Lemmatization

    wordnet_lemmatizer = WordNetLemmatizer()

    def get_tag(tag):
        # Map a Penn Treebank tag prefix to the matching WordNet POS constant;
        # the empty-string fallback deliberately triggers the except branch below.
        if tag.startswith('J'):
            return wordnet.ADJ
        elif tag.startswith('V'):
            return wordnet.VERB
        elif tag.startswith('N'):
            return wordnet.NOUN
        elif tag.startswith('R'):
            return wordnet.ADV
        else:
            return ''

    text_result = []
    tokens = word_tokenize(text)  # Generate list of tokens
    tagged = pos_tag(tokens)
    for t in tagged:
        try:
            text_result.append(
                wordnet_lemmatizer.lemmatize(t[0], get_tag(t[1][:2])))
        except KeyError:
            # get_tag returned '' for an unmapped POS; default to noun.
            text_result.append(wordnet_lemmatizer.lemmatize(t[0]))
    paragraph = " ".join(str(x) for x in text_result)
    return paragraph
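clean_text presumes NLTK's corpora and tagger models are already installed; a sketch of the one-time setup and imports it relies on (CONTRACTION_MAP is again a stand-in):

import logging
import re
import nltk
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')                       # tokenizer models
nltk.download('averaged_perceptron_tagger')  # POS tagger model

CONTRACTION_MAP = {"don't": "do not"}        # stand-in for the full mapping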