Code example #1
import string
import contractions as cont

# Assumed setup: the original snippet relies on a `translator` defined
# elsewhere; a common choice is a table that strips ASCII punctuation.
translator = str.maketrans('', '', string.punctuation)

def Text_Cleanup(f):
    text = f.lower()
    text_contraction = cont.fix(text)                    # expand contractions
    text_punc = text_contraction.translate(translator)   # strip punctuation
    # drop single-character tokens, then keep alphanumeric non-digit tokens
    text_clean = ' '.join(word for word in text_punc.split() if len(word) > 1)
    text_sent = ' '.join(i for i in text_clean.split() if i.isalnum() and not i.isdigit())
    return text_sent
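A quick usage sketch for the helper above, assuming the punctuation-stripping translator from the rewrite:

print(Text_Cleanup("I can't believe it's 2023!"))
# -> 'cannot believe it is'  (exact output depends on the contractions lexicon)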
Code example #2
File: preprocess.py Project: pranab/avenir
	def replaceContractions(self, text):
		"""Replace contractions in string of text"""
		return contractions.fix(text)
Code example #3
    x23 = x22.replace(":-/", "sad")
    x24 = x23.replace(":/", "sad")
    x25 = x24.replace(":|", "sad")
    return x25


df['emoticons_replacment'] = df['textOriginal'].apply(smiley)
#-----------------------------------------------------------------------------------------------------------------------

df["less_spaces"] = df['emoticons_replacment'].apply(
    lambda x: re.sub(' +', ' ', x))

#https://towardsdatascience.com/preprocessing-text-data-using-python-576206753c28

df['text_expan_contractions'] = df['less_spaces'].apply(
    lambda x: [contractions.fix(word) for word in x.split()])
df['text_expan_contractions'] = [
    ' '.join(map(str, l)) for l in df['text_expan_contractions']
]
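Note: fixing word-by-word and then re-joining can be collapsed; contractions.fix accepts whole strings, so a simpler equivalent (a sketch, same column names as above) is:

df['text_expan_contractions'] = df['less_spaces'].apply(contractions.fix)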

#removes non-alphanumeric / non-whitespace characters from strings
df['text_misc_char_removed'] = df['text_expan_contractions'].str.replace(
    '’', '')  # just a lil something to replace the curly apostrophe character
df['text_misc_char_removed'] = df['text_misc_char_removed'].map(
    lambda x: re.sub(r"[^0-9a-zA-Z\s]+", '', x)
)  # this includes punctuation, which shows little value in analysis

#removes emojis


def deEmojify(text):
    # the snippet is truncated here in the source; a common implementation
    # (assumed) drops all non-ASCII characters, emojis included
    return text.encode('ascii', 'ignore').decode('ascii')
Code example #4
def con(text):
  expand = contractions.fix(text)
  return expand
Code example #5
def expand_contractions(text):
    """ expand shortened words, e.g. don't to do not """

    return contractions.fix(text)
Code example #6
def replace_contractions(df):
    # return contractions.fix(t)
    # df['text_prep'] = df.text_prep.apply(lambda x: nltk.word_tokenize(contractions.fix(TreebankWordDetokenizer().detokenize(x))))
    df['text_prep'] = df.text_prep.apply(lambda x: contractions.fix(x))
    print('contractions expansion done')
    return df
Code example #7
def convert_emoticons(text):
    for emot, desc in EMOTICONS.items():
        # re.escape is needed: emoticons like ":-)" contain regex metacharacters
        text = re.sub(u'(' + re.escape(emot) + ')', desc, text)
    return text


udf_convert_emoticons = udf(convert_emoticons)


def convert_contractions(text):
    return contractions.fix(text)


# udf_convert_contractions = udf(convert_contractions)
udf_convert_contractions = udf(lambda text: contractions.fix(text))


def convert_numbers_to_text(text):
    return ' '.join([num2words(w) if w.isdigit() else w for w in text.split()])


# udf_convert_numbers_to_text = udf(convert_numbers_to_text, ArrayType(StringType()))
# note: the lambda returns a single joined string, so the return type must be
# StringType(), not ArrayType(StringType())
udf_convert_numbers_to_text = udf(
    lambda text: ' '.join(
        [num2words(w) if w.isdigit() else w for w in text.split()]),
    StringType())
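For reference, a quick illustration of the num2words call used above (outputs per the num2words English defaults):

from num2words import num2words
num2words(42)    # 'forty-two'
num2words(2023)  # 'two thousand and twenty-three'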


def convert_date_to_text(text):
    result = []
Code example #8
def test_add():
    contractions.add('mychange', 'my change')
    assert contractions.fix('mychange') == 'my change'
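The test above exercises contractions.add, which registers a custom expansion for subsequent fix calls; a minimal usage sketch (the 'imho' entry is hypothetical, not built in):

import contractions

contractions.add('imho', 'in my humble opinion')   # hypothetical custom entry
contractions.fix("imho you're right")              # 'in my humble opinion you are right'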
Code example #9
def data_cleaning(df):
    info = [df['Sentiment'].values.tolist(), df['Text'].values.tolist()]
    df_data = list(zip(*info))
    clean_data = []
    sentiment = []
    pos = {}
    neg = {}
    counts = {}
    for i in range(0, len(df_data)):

        if int(df_data[i][0]) == 1:
            sentiment.append(1)
        if int(df_data[i][0]) == -1:
            sentiment.append(-1)
        #1) remove emails
        clean_sentence = re.sub(r'\s*\S*(@)\S*', '', str(df_data[i][1]))
        #remove mentions
        clean_sentence = re.sub(r"([@][\w_-]+)", "", clean_sentence)
        #2) remove 10 digit phone numbers
        #clean_sentence = re.sub(r'\d{10}', '', clean_sentence)
        #3) remove $n
        clean_sentence = re.sub(r'\$[^ ]+', '', clean_sentence)
        #4) remove Times & dates 2/24 2:10pm, 6/30, 7:00 AM
        clean_sentence = re.sub(
            r'[0-9]*[:/][0-9]*\S*\s[A][M]|[0-9]*[:/][0-9]*\S*\s[P][M]|[0-9]*[:/][0-9]*\S*',
            '', clean_sentence)
        #5) convert emojis
        clean_sentence = emoji.demojize(clean_sentence, delimiters=(' ', ' '))
        #6) fix contractions
        clean_sentence = contractions.fix(clean_sentence)
        #7) remove links
        clean_sentence = re.sub(r'\s*\S*(http)\S*', '', clean_sentence)
        #8) keep the hashtag text but remove the # sign
        clean_sentence = clean_sentence.replace("#", "")
        #9) all to lower case for easy tokenization and fewer features
        #clean_sentence = clean_sentence.lower()
        #10) remove < >
        clean_sentence = clean_sentence.replace("<", "")
        clean_sentence = clean_sentence.replace(">", "")
        #11) remove Punctuations
        clean_sentence = re.sub(r'[^A-Za-z0-9]+', ' ', clean_sentence)
        #12) lemmatize verbs
        tokenized = word_tokenize(clean_sentence)
        lemmatizer = WordNetLemmatizer()
        clean_tokens = []
        for word in tokenized:
            cur = lemmatizer.lemmatize(word, pos='v')
            clean_tokens.append(cur)
        #13) remove stop words
        stop_words = set(stopwords.words('english'))
        filtered_sentence = [w for w in clean_tokens if w not in stop_words]
        clean_data.append(filtered_sentence)
        for word in filtered_sentence:
            if sentiment[i] == 1:
                if word in pos and not word.isupper():
                    pos[word] += 1
                else:
                    pos[word] = 1
            else:
                if word in neg and not word.isupper():
                    neg[word] += 1
                else:
                    neg[word] = 1
            if word not in counts:
                counts[word] = 1
            else:
                counts[word] += 1
    # prune words seen only once, after the loop (the original did this inside
    # the loop, which repeatedly reset counts for recurring rare words)
    neg = {key: val for key, val in neg.items() if val != 1}
    pos = {key: val for key, val in pos.items() if val != 1}
    alldata = {key: val for key, val in counts.items() if val != 1}
    #print(clean_data[5])
    #print(len(sentiment) == len(clean_data))

    f = open("pos.txt", "w")
    f.write(str(pos))
    f.close()

    f = open("neg.txt", "w")
    f.write(str(neg))
    f.close()

    f = open("counts.txt", "w")
    f.write(str(alldata))
    f.close()
    a_dictionary = dict(Counter(counts).most_common(50))
    keys = a_dictionary.keys()
    values = a_dictionary.values()
    plt.xticks(fontsize=6)
    plt.bar(keys, values, color='pink')
    plt.show()

    #check for duplicates
    new_clean_data = []
    for i in range(0, len(clean_data)):
        cur_data = (' '.join(clean_data[i]), sentiment[i])
        if cur_data not in new_clean_data:
            #new_clean_data.append(clean_data[i])
            #new_senti.append(sentiment[i])
            new_clean_data.append(cur_data)

    #print(new_clean_data[:3])

    return new_clean_data
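A minimal sketch of driving data_cleaning, assuming a DataFrame with the two columns the function reads; the sample rows are hypothetical, and the call also writes pos.txt / neg.txt / counts.txt and shows a matplotlib bar chart:

import pandas as pd

df = pd.DataFrame({
    'Sentiment': [1, -1],
    'Text': ["I can't wait :)", "this isn't good @user http://t.co/x"],
})
pairs = data_cleaning(df)   # list of (cleaned_text, sentiment) tuples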
Code example #10
def contraction_expansion(text):
    text = contractions.fix(text)
    return text
Code example #11
def remove_contractions(x):
    rem_cont = [contractions.fix(word) for word in x.split()]
    return " ".join(map(str, rem_cont))
Code example #12
def standardised_query(pl, text):
    text = remove_punctuation(text)
    text = contractions.fix(text)
    text = lemmatise(text)
    return escape_and_call_prolexa(pl, text)
Code example #13
def get_token_words(survey_data, col_name, stopwords_list, title):
    """ Return a list of all the token words that we can use to
    generate the bigrams.

    survey_data: Pandas data frame, contains data to analyze
    col_name: Name of specific column to analyze responses
    stopwords_list: List of words to ignore that might be in the question
    title: Title of sentiment
    """
    # Drop null values, reset the index
    print(survey_data.columns)
    data = survey_data.dropna(subset=[col_name])
    data = data.reset_index(drop=True)

    # Get only the column that you need
    responses_data = data[col_name]
    # Make it a list without the col name
    responses = []
    for i in range(len(responses_data) - 1):
        responses.append(str(responses_data[i + 1]))

    sentiment_list = sentiment(responses)

    sent = []

    for i in range(len(sentiment_list)):
        if sentiment_list[i] != "n/a":
            if sentiment_list[i] > 0:
                sent.append("pos")
            elif sentiment_list[i] == 0:
                sent.append("neutral")
            else:
                sent.append("neg")
        else:
            sent.append("n/a")

    sentiment_table = pd.DataFrame({'sentiment': sentiment_list})
    sentiment_table['sent_word'] = sent
    sentiment_table['responses'] = responses

    sentiment_table.to_csv("files/Neut_" + title, index=False)

    processed = []
    index = 0
    while index < len(responses):
        # Look at words in one response
        # lowercase
        responsewords = responses[index].lower()

        # remove punctuation
        responsewords = re.sub('[!#?,.:";\']', "", responsewords)

        # split into a list
        resultwords = responsewords.split()

        # expand contractions
        for i in range(len(resultwords)):
            word = resultwords[i]
            resultwords[i] = contractions.fix(word)

        # remove stopwords (drop every occurrence; the original list.remove
        # call only dropped the first one)
        resultwords = [w for w in resultwords if w not in stopwords_list]

        # Join back into text
        processed.append(" ".join(resultwords))
        index += 1

    # Join the responses into one big text
    text = " ".join(processed)

    # Stemming: removes suffixes: ing, ly, s
    # Lemmatization (root word)
    st = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    text_stem = []
    for word in text.split(" "):
        word = st.stem(word)
        text_stem.append(lemmatizer.lemmatize(word))
    text = " ".join(text_stem)

    # Passing the string text into word tokenize for breaking the sentences
    tokens = word_tokenize(text)

    # Word cloud
    #wordcloud = WordCloud(background_color="white").generate(text)
    #plt.imshow(wordcloud, interpolation='bilinear')
    #plt.axis("off")
    #plt.show()

    # Removing english stopwords
    eng_stopwords = set(stopwords.words("english"))

    tokens = [w for w in tokens if w not in eng_stopwords]

    return tokens
Code example #14
def replace_contractions(text):
    """Replaces contractions (it's -> it is)"""
    return contractions.fix(text)
Code example #15
def fix_contractions(s):
    s = contractions.fix(s)
    return s
Code example #16
wordset = set()     # initialise empty set of words
bodies = list()      # initialise empty list to hold the bodies of text gathered

with open('text.txt', 'w') as f:
    for tag_idx in range(1, 174):
        print(tags[tag_idx])
        href = tags[tag_idx].attrs['href']
        page = urllib.request.urlopen(site + href)
        page_soup = bs(page, 'html.parser')
        #print(page_soup.prettify())
        text = page_soup.find('font', face='verdana')
        #print(str(text))
        print('\n\n\n')
        text = text.text                        # get just the text (remove tags etc)
        f.write(text)
        text = re.sub(r'\[[^]]*\]', '', text)           # remove square-bracketed spans
        text = contractions.fix(text)                   # replace contractions with their full forms
        words = nltk.word_tokenize(text)                # make list of word tokens
        words = [word.lower() for word in words]        # lowercase
        words = [re.sub(r'[^\w\s]', '', word) for word in words]    # replace punctuation with empty string
        words = [word for word in words if word != '']  # remove empty strings
        bodies.append(words)
        print(bodies)
        wordset = wordset.union(words)      # add any new words in this body to the set of words
        break   # note: stops after the first page

print('Number of individual words:', len(wordset))

#print(text)
print(words)
Code example #17
def test_fix():
    assert contractions.fix("you're happy now") == "you are happy now"
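The fix call also handles several contractions in one string; an approximate sketch (exact output depends on the library's lexicon):

import contractions

contractions.fix("y'all couldn't have known")
# -> roughly 'you all could not have known'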
Code example #18
def replace_contractions(text, verbose=False):
    """Replace contractions in string of text"""
    new_text = contractions.fix(text)
    if verbose:
        print(new_text)
    return new_text
Code example #19
File: nlp.py Project: vladstanescu94/nlp-lab-school
def remove_contractions(text):
    return contractions.fix(text)
Code example #20
    def replace_contractions(self, text):
        return contractions.fix(text)
Code example #21
def convert_contractions(text):
    return contractions.fix(text)
Code example #22
def preprocess(temp):
    # strip URLs, then expand contractions
    temp = re.sub(r"(http|https)\S+", "", temp)
    temp = contractions.fix(temp)
    # tokenize
    tokens = nltk.word_tokenize(temp)
    #tokens = tokenizer.tokenize(temp)

    # for i,token in enumerate(tokens):
    #    if token[0].isupper():
    #        print("{}:{}:{}".format(i,file,token))

    # extend the punctuation set with tokenizer artifacts (note: this mutates
    # the string module's attribute for the whole process)
    string.punctuation = string.punctuation + "''``--"

    #print(tokens)

    new_tokens = []
    pattern = r'\d+(\.\d+)?'

    skip_next = False
    for i, token in enumerate(tokens):
        if skip_next:
            # this token was already consumed as part of a handle
            skip_next = False
            continue
        if token == '@' and i + 1 < len(tokens):
            #replace twitter handle with screen name
            temp = tokens[i] + tokens[i + 1]
            #print(temp)
            if get_username(temp) is not None:
                #print(get_username(temp))
                temp_list = nltk.word_tokenize(get_username(temp))
                for t in temp_list:
                    new_tokens.append(t.lower())
            # reassigning `i` inside a for loop does not skip the next token,
            # so flag it instead
            skip_next = True
            continue
        if len(token) < 3:
            continue
        if token in english_stopwords:
            continue
        if (token not in string.punctuation):
            if not re.match(pattern, token):
                for s in token:
                    if s in string.punctuation:
                        token = token.replace(s, '')
            else:
                if token.isdigit():
                    token = num2words(float(token))
            new_tokens.append(token.lower().encode("ascii",
                                                   errors="ignore").decode())
    '''
        # lemmitization
        lemmatizer = nltk.WordNetLemmatizer()
        final_tokens=[]
        for token in new_tokens:
            final_tokens.append(lemmatizer.lemmatize(token))
  
    # stemmer
    stemmer = nltk.PorterStemmer()
    final_tokens = []
    for token in new_tokens:
        final_tokens.append(stemmer.stem(token))
    '''

    return new_tokens
Code example #23
File: pre-processing.py Project: ray476/Data-Mining
start_time = time.time()
vocab_full = {}
n_doc = 0
# Only keep the data dictionaries and ignore possible system files like .DS_Store
folders = [
    os.path.join(root_path, name) for name in os.listdir(root_path)
    if os.path.isdir(os.path.join(root_path, name))
]
for folder in folders:
    for filename in os.listdir(folder):
        file = os.path.join(folder, filename)
        n_doc += 1
        with open(file, 'r', encoding='utf8', errors='ignore') as f:
            for line in f:
                # split contractions into two words
                line = contractions.fix(line)
                tokens = word_tokenize(line)
                # force everything to lower case and remove non-alphabetic characters
                tokens = [token.lower() for token in tokens if token.isalpha()]
                for token in tokens:
                    # remove stop words, other words (above) and single characters
                    if (token not in stop_words) and (
                            token not in other_words) and (len(token) > 1):
                        vocab_full[token] = vocab_full.get(token, 0) + 1
print(
    f'{n_doc} documents in total with a total vocab size of {len(vocab_full)}')
vocab_sorted = sorted(vocab_full.items(),
                      key=operator.itemgetter(1),
                      reverse=True)
vocab_truncated = vocab_sorted[:MAX_VOCAB_SIZE]
# Save the vocabulary to file for visual inspection and possible analysis
Code example #24
def expand_contractions(text):
    text_uncont = contractions.fix(text)
    return text_uncont
Code example #25
def remove_html_tags(text):
    
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text()
    text = contractions.fix(text)
    return text
Code example #26
def expand_contraction(input_text: str) -> str:
    """ Expand contractions in input text """
    return contractions.fix(input_text)
Code example #27
def replace_contractions(txt):
    return contractions.fix(txt)
Code example #28
def replace_contractions(text):
    return contractions.fix(text)
Code example #29
File: preprocess.py Project: data-mining/avenir
    def replaceContractions(self, text):
        """Replace contractions in string of text"""
        return contractions.fix(text)
Code example #30
def remove_contractions(k):
    return k.apply(lambda x: contractions.fix(x))  # don't - do not
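Usage sketch, applying it to a pandas Series (hypothetical data):

import pandas as pd

s = pd.Series(["don't stop", "it's fine"])
remove_contractions(s)   # -> Series(['do not stop', 'it is fine'])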
Code example #31
File: utils.py Project: darkestfloyd/nlp_pipline
def expand_contractions(text):
    return contractions.fix(text)
Code example #32
def _replace_contractions(text):
    """Replace contractions in string of text"""
    return contractions.fix(text)