import re

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer


def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.

    # Regex to remove everything except letters, digits and spaces
    special_character_removal = re.compile(r'[^a-z\d ]', re.IGNORECASE)
    # Regex to replace all runs of digits
    replace_numbers = re.compile(r'\d+')

    text = text.split()
    # Optionally, remove stop words (note: the text is never lowercased,
    # so capitalized stopwords survive this check)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if w not in stops]

    text = " ".join(text)
    # Remove special characters
    text = special_character_removal.sub('', text)
    # Replace each run of digits with the placeholder 'n'
    text = replace_numbers.sub('n', text)
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    # Return the cleaned text as a single string
    return text
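A minimal usage sketch of the function above (it assumes nltk.download('stopwords') has been run; the output is illustrative):

print(text_to_wordlist("The 3 cats sat!", remove_stopwords=True))
# -> 'The n cats sat'  (lowercase 'the' would be dropped, but capitalized
#    'The' survives; '!' is stripped and the digit run becomes 'n')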
Example No. 2
def stopwords_remover(text):
    text = re.sub(r"'\w+", '', text)  # Strip apostrophes plus the trailing characters (contraction suffixes)
    text = ' '.join(word for word in text.split()
                    if word not in STOPWORDS)  # Remove stopwords
    text = ' '.join(word for word in text.split()
                    if len(word) > 3)  # Drop short words (3 characters or fewer)
    return text
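A quick illustrative call, assuming STOPWORDS is a standard English stopword set:

print(stopwords_remover("it's a beautiful day"))
# -> 'beautiful'  ("'s" is stripped, 'it' and 'a' are stopwords, and 'day'
#    is dropped by the len(word) > 3 filter)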
def process_sentence(text, objects='False'):
    '''
    Simple and dirty text preprocessing to remove some misspelled words
    and lemmatize. (The `objects` argument is accepted but never used.)
    '''
    text = text.lower()

    # Normalize digits, typos and synonyms via chained replacements
    text = text.replace('1', 'one').replace('2', 'two').replace(
        '3', 'three').replace('4', 'four').replace('5', 'five').replace(
        '6', 'six').replace('.', '').replace('contains', 'contain').replace(
        'which', '').replace('are there', 'there are').replace(
        'there is', '').replace('ablue', 'a blue').replace(
        'corner', 'edge').replace('wall', 'edge').replace('yelow', 'yellow').replace(
        'below', 'beneath').replace('brick', 'block').replace(
        'leats', 'least').replace('is touching', 'touching')
    # Keep the captured character after the word so punctuation is not lost
    # (the original discarded the capture group)
    text = re.sub(r'colour(\W)', r'color\1', text)
    text = re.sub(r'colored(\W)', r'color\1', text)
    text = re.sub(r'coloured(\W)', r'color\1', text)
    text = text.split(' ')
    # `correction` is assumed to be a spell-correction helper defined elsewhere
    text = map(correction, [t for t in text if t])
    # `lemmatizer` is assumed to be a WordNet-style lemmatizer defined elsewhere
    text = [lemmatizer.lemmatize(x) if x not in (u'as', u'wall') else x for x in text]
    text = ' '.join(text)
    if 'that' in text:
        text = text.replace('that', '')
    if 'contain' in text or 'ha ' in text:
        text = text.replace('contain', 'with').replace('ha ', 'with ')
    # Preserve the captured delimiters when replacing the article 'a'
    text = re.sub(r'(^|\W)a(\W)', r'\1one\2', text)
    text = re.sub(r'^ll ', ' ', text)
    text = re.sub(r'^t ', 'at ', text)
    text = ' '.join(t for t in text.split(' ') if t)
    text = text.replace('based', 'base')
    return text
Example No. 4
def text_to_wordlist(text, remove_stopwords=False, stem_words=False, comma=True):
    # Clean the text, with the option to remove stopwords and to stem words.
    import re

    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if w not in stops]

    text = " ".join(text)

    # Optionally, replace punctuation with spaces. The hyphen is escaped so
    # the class is not read as a '+'-to-'<' range that would also swallow
    # digits (the original pattern was [^A-Za-z0-9^,!.\/'+-=?])
    if not comma:
        text = re.sub(r"[,:/\^.$%#+\-></?=*\\]", " ", text)

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)

    # Return the cleaned text as a single string
    return text
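An illustrative call of the comma=False branch (with the range bug fixed the digits survive; note the substitution leaves double spaces, since nothing collapses whitespace afterwards):

print(text_to_wordlist("Hi, there: 1+1=2", comma=False))
# -> 'hi  there  1 1 2'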
Example No. 5
def contract(text):
    # Conduct all text contraction: strip punctuation, then lemmatize, stem
    # and remove stopwords. (`wn`, `ps` and `stopword` are assumed to be a
    # lemmatizer, a stemmer and a stopword list defined elsewhere. The
    # original iterated over the raw string, lemmatizing single characters;
    # the punctuation strip and word split now come first.)
    text = "".join(char for char in text if char not in string.punctuation)
    words = [wn.lemmatize(word) for word in text.split()]
    words = [ps.stem(word) for word in words]
    return ' '.join(word for word in words if word not in stopword)


def neg_emo(text):
    # Count words that appear in the negative-emotion lexicon `neg_e`
    words = text.split(" ")
    sc = 0
    for word in words:
        if word.lower() in neg_e:
            sc += 1
    return sc
Example No. 7
def clean_text(text):
    # REPLACE_BY_SPACE, BAD_SYMBOLS and STOPWORDS are assumed to be defined
    # at module level (compiled regexes and a stopword set)
    text = text.lower()
    text = REPLACE_BY_SPACE.sub(' ', text)
    text = BAD_SYMBOLS.sub(' ', text)
    text = re.sub(r"\'s", " ", text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    return text
Example No. 8
def cleanText(text):
    # Replace non-ASCII (cp1252) control bytes with their printable
    # equivalents, using the intended punctuation characters when possible
    if text is None:
        return ''

    text = re.sub(r'\x85', '…', text)  # replace ellipsis
    text = re.sub(r'\x91', "‘", text)  # replace left single quote
    text = re.sub(r'\x92', "’", text)  # replace right single quote
    text = re.sub(r'\x93', '“', text)  # replace left double quote
    text = re.sub(r'\x94', '”', text)  # replace right double quote
    text = re.sub(r'\x95', '•', text)  # replace bullet
    text = re.sub(r'\x96', '-', text)  # replace en dash
    text = re.sub(r'\x99', '™', text)  # replace trademark sign
    text = re.sub(r'\xae', '®', text)  # replace registered sign
    text = re.sub(r'\xb0', '°', text)  # replace degree symbol
    text = re.sub(r'\xba', '°', text)  # replace ordinal indicator with degree symbol

    # Collapse embedded newlines, carriage returns and tabs into spaces;
    # keep them if you need line breaks for readability
    text = re.sub(r'[\n\r\t]+', ' ', text)

    # Remove numbers (only those preceded by a space)
    text = re.sub(r" \d+", " ", text)

    # Hard-core line that strips every remaining control or non-ASCII byte
    text = re.sub(r'[\x00-\x1f\x80-\xff]', ' ', text)

    stop_words = set(stopwords.words('english'))
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text
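An illustrative call (assumes the NLTK stopword corpus; the input simulates stray cp1252 bytes):

print(cleanText("caf\x92s menu \x95 2 items"))
# -> 'caf’s menu • items'  (control bytes become punctuation, ' 2' is
#    removed, and the stopword join re-collapses the whitespace)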
Example No. 9
import math


def get_high_frequency_word(textlist):
    # Count word frequencies across all texts (split produces empty strings
    # that need to be cleaned out, so they are skipped here)
    dic = {}
    for text in textlist:
        for word in text.split(" "):
            if word == "":
                continue
            dic[word] = dic.get(word, 0) + 1
    # Sort by frequency in descending order
    sorted_dict = sorted(dic.items(), key=lambda x: x[1], reverse=True)
    # Keep words occurring more than 20 times, then return the top 70%
    high_frequency_word = [word for word, freq in sorted_dict if freq > 20]
    count = math.ceil(len(high_frequency_word) * 0.7)
    return high_frequency_word[:count]
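A small sanity check of the frequency cutoffs (illustrative):

docs = ["spam ham"] * 30 + ["eggs"] * 10
print(get_high_frequency_word(docs))
# -> ['spam', 'ham']  ('eggs' occurs only 10 times, below the >20 cutoff;
#    ceil(2 * 0.7) == 2 keeps both remaining words)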
Example No. 10
import string
import sys

import nltk

if sys.version_info < (3,):
    maketrans = string.maketrans  # Python 2
else:
    maketrans = str.maketrans  # Python 3


def text_to_word_sequence(text,
                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                          lower=True, split=" "):
    """Converts a text to a sequence of POS-tagged words (or tokens).
    # Arguments
        text: Input text (string).
        filters: Sequence of characters to filter out.
        lower: Whether to convert the input to lowercase.
        split: Sentence split marker (string).
    # Returns
        A list of (word, POS tag) tuples from nltk.pos_tag.
    """
    if lower:
        text = text.lower()

    # Map every filter character to the split marker (Python 2 unicode
    # needs an ord-keyed table)
    if sys.version_info < (3,) and isinstance(text, unicode):
        translate_map = dict((ord(c), unicode(split)) for c in filters)
    else:
        translate_map = maketrans(filters, split * len(filters))

    text = text.translate(translate_map)
    seq = text.split()
    return nltk.pos_tag(seq)
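A quick call of the helper above, assuming nltk's averaged_perceptron_tagger data has been downloaded (the tags shown are illustrative):

print(text_to_word_sequence("Hello, world!"))
# -> [('hello', 'NN'), ('world', 'NN')]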
def pos_emo(text):
    # Count words that appear in the positive-emotion lexicon `pos_e`
    # (the original assigned sc = 1, which capped the count at one)
    words = text.split(" ")
    sc = 0
    for word in words:
        if word.lower() in pos_e:
            sc += 1
    return sc
Example No. 12
def statistics_unique_words(text):
    # The number of distinct whitespace-separated tokens
    return len(set(text.split()))
def clean_contractions(text, mapping):
    # Normalize fancy apostrophes, then expand contractions via `mapping`
    specials = ["’", "‘", "´", "`"]
    for s in specials:
        text = text.replace(s, "'")
    text = ' '.join(mapping.get(t, t) for t in text.split(" "))
    return text
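An illustrative call with a tiny stand-in mapping (the real contraction dictionary is defined elsewhere); note the lookup is case-sensitive:

contraction_map = {"don't": "do not", "it's": "it is"}
print(clean_contractions("It’s fine, don’t worry", contraction_map))
# -> "It's fine, do not worry"  ("It's" misses the lowercase-keyed mapping)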
def swear_number(text):
    # Count words that appear in the swear-word list `sl`
    # (the original wrote sc = +1, which reassigned instead of incrementing)
    words = text.split(" ")
    sc = 0
    for word in words:
        if word.lower() in sl:
            sc += 1
    return sc
Example No. 15
    def apply_cleaning_function(self, fn, texts, description=""):
        result = [fn(text) for text in texts]
        sentences = [text.split() for text in result]
        tf_dict = self.build_tf_dict(sentences)
        oov = self.check_coverage(tf_dict)
        #         print(oov[:10])

        return result
Example No. 16
    def text_to_data(self, text, author):
        # Remove newlines and numbers, then build (sentence, author) pairs;
        # sent_tokenize is nltk.tokenize.sent_tokenize
        text = text.replace('\n', " ")
        text = re.sub(r'[0-9]+', '', text)
        sent_tokenize_list = sent_tokenize(text)
        total_arr = [(x, author) for x in sent_tokenize_list]
        vocab_count = len(set(text.split(' ')))
        return total_arr, vocab_count
Example No. 17
def percent(text):
    # Fraction of words that are stopwords, after stripping sentence
    # punctuation (`stop_words` is assumed to be defined at module level)
    count = 0
    text = "".join(c for c in text if c not in ('!', '.', ':', '?', ';'))
    words = text.split()
    if not words:  # guard against division by zero on empty input
        return 0.0
    for word in words:
        if word.lower() in stop_words:
            count += 1
    return count / len(words)
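An illustrative call (stop_words assumed to be nltk's English stopword set):

print(percent("The cat sat on the mat!"))
# -> 0.5  (3 stopwords among 6 words once '!' is stripped)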
Example No. 18
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    wiki_reg = r'https?://en.wikipedia.org/[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]'
    url_reg = r'https?://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]'
    ip_reg = r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}'
    WIKI_LINK = ' WIKI_LINK '
    URL_LINK = ' URL_LINK '
    IP_LINK = ' IP_LINK '
    # Replace each kind of link with its placeholder token. (The original
    # replaced plain URLs with WIKI_LINK, re-ran the wiki pattern for
    # URL_LINK, and never used the IP matches; each pattern now maps to its
    # own placeholder.)
    for u in re.findall(wiki_reg, text):
        text = text.replace(u, WIKI_LINK)
    for u in re.findall(url_reg, text):
        text = text.replace(u, URL_LINK)
    for u in re.findall(ip_reg, text):
        text = text.replace(u, IP_LINK)

    # Regex to remove everything except letters, digits, !?*' and spaces
    special_character_removal = re.compile(r'[^A-Za-z\d!?*\' ]', re.IGNORECASE)
    # Regex to replace all runs of digits
    replace_numbers = re.compile(r'\d+')

    text = text.split()
    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if w not in stops]

    text = " ".join(text)
    # Remove special characters
    text = special_character_removal.sub('', text)
    # Replace numbers with a placeholder token
    text = replace_numbers.sub('NUMBERREPLACER', text)
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    # Return the cleaned text as a single string
    return text
Example No. 19
def text_to_wordlist(text, remove_stopwords=True, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    # NOTE: remove_stopwords is accepted but never acted on in this variant.

    # Convert the text to lower case
    text = text.lower()

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9']", " ", text)
    text = re.sub(r"what's", "", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r" m ", " am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e-mail", "email", text)

    # Collapse repeated whitespace before the word-level fixes below
    text = re.sub(r"\s{2,}", " ", text)

    # Fix common misspellings and expand domain shorthand
    text = re.sub(r"quikly", "quickly", text)
    text = re.sub(r" usa ", " america ", text)
    text = re.sub(r" u s ", " america ", text)
    text = re.sub(r" uk ", " england ", text)
    text = re.sub(r"imrovement", "improvement", text)
    text = re.sub(r"intially", "initially", text)
    text = re.sub(r" dms ", "direct messages ", text)
    text = re.sub(r"demonitization", "demonetization", text)
    text = re.sub(r"actived", "active", text)
    text = re.sub(r"kms", " kilometers ", text)
    text = re.sub(r" cs ", " computer science ", text)
    text = re.sub(r" upvotes ", " up votes ", text)
    text = re.sub(r" iphone ", " phone ", text)
    text = re.sub(r"\0rs ", " rs ", text)
    text = re.sub(r"calender", "calendar", text)
    text = re.sub(r"ios", "operating system", text)
    text = re.sub(r"programing", "programming", text)
    text = re.sub(r"bestfriend", "best friend", text)
    text = re.sub(r"iii", "3", text)
    text = re.sub(r"the us", "america", text)
    text = re.sub(r" j k ", " jk ", text)

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)

    # Return the cleaned text as a single string
    return text
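An illustrative call tracing a few of the substitutions above (contraction expansion, the k-to-000 rule, and the " cs " shorthand):

print(text_to_wordlist("I'm studying CS, it'll take 2k hours"))
# -> 'i am studying computer science it will take 2000 hours'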
Example No. 20
def _remove_pattern_2(input_text_list):
    stoplist = read_stopwords()
    stemmer = SnowballStemmer('english')

    cleaned_text_list = []
    for text in input_text_list:
        # Remove punctuation (str.translate needs a mapping table; passing
        # string.punctuation directly, as the original did, is a no-op)
        text = text.translate(str.maketrans('', '', string.punctuation))
        text = text.lower()  # Convert words to lower case

        # Clean the text: drop everything outside A-Za-z0-9^,!./'+-=
        # (the hyphen is escaped so '+-=' is not read as a character range)
        text = re.sub(r"[^A-Za-z0-9^,!.\/'+\-=]", " ", text)
        text = re.sub(r"what's", "what is ", text)
        text = re.sub(r"\'s", " ", text)
        text = re.sub(r"\'ve", " have ", text)
        text = re.sub(r"n't", " not ", text)
        text = re.sub(r"i'm", "i am ", text)
        text = re.sub(r"\'re", " are ", text)
        text = re.sub(r"\'d", " would ", text)
        text = re.sub(r"\'ll", " will ", text)
        text = re.sub(r",", " ", text)
        text = re.sub(r"\.", " ", text)
        text = re.sub(r"!", " ! ", text)
        text = re.sub(r"\/", " ", text)
        text = re.sub(r"\^", " ^ ", text)
        text = re.sub(r"\+", " + ", text)
        text = re.sub(r"\-", " - ", text)
        text = re.sub(r"\=", " = ", text)
        text = re.sub(r"'", " ", text)
        text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
        text = re.sub(r":", " : ", text)
        text = re.sub(r" e g ", " eg ", text)
        text = re.sub(r" b g ", " bg ", text)
        text = re.sub(r" u s ", " american ", text)
        text = re.sub(r"\0s", "0", text)
        text = re.sub(r" 9 11 ", "911", text)
        text = re.sub(r"e - mail", "email", text)
        text = re.sub(r"j k", "jk", text)
        text = re.sub(r"\s{2,}", " ", text)
        text = re.sub(r"https://t.co/[A-Za-z]{10}", " ", text)

        text = text.split()

        # Remove stopwords once before stemming
        text = [word for word in text if word not in stoplist]

        stemmed_words = [stemmer.stem(word) for word in text]

        # ... and again after stemming, since stems may collide with stopwords
        cleanwordlist = [
            word for word in stemmed_words if word not in stoplist
        ]

        cleaned_text_list.append(" ".join(cleanwordlist))
    return cleaned_text_list
Example No. 21
def negations(text):
    # AntonymReplacer is assumed to come from an external helper module;
    # it rewrites negated phrases ("not X") using antonyms where available
    replacer = AntonymReplacer()

    sent = text.split()
    noneg = replacer.replace_negations(sent)
    return ' '.join(noneg)
Example No. 22
def bigram_text_to_word_sequence(text, bigram, filters=base_filter(), lower=False, split=" "):
    '''prune: sequence of characters to filter out
    '''
    if lower:
        text = text.lower()
    text = text.translate(string.maketrans(filters, split*len(filters)))
    seq = text.split(split)
    sentences = [_f for _f in seq if _f]
    return bigram(sentences)
Example No. 23
def inputpreprocess(text):
    # Drop stopword and punctuation tokens, then strip embedded newlines,
    # double quotes and leading whitespace (`russian_stop` and `punctuations`
    # are assumed to be defined at module level)
    t = ' '.join([
        t for t in text.split() if t not in russian_stop
        and t not in punctuations and '\n' != t and " " != t
    ])
    t = [s for s in t if s != "\n" and s != '"']
    text = "".join(t)
    text = re.sub(r"^\s+", "", text)
    return text
Example No. 24
def remove_stopwords(text):
    stop = set(stopwords.words('english'))
    punctuation = list(string.punctuation)
    stop.update(punctuation)

    final_text = []
    for i in text.split():
        if i.strip().lower() not in stop:
            final_text.append(i.strip())
    return " ".join(final_text)


def text_to_word_sequence(text,
                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                          lower=True, split=" "):
    if lower:
        text = text.lower()
    # `unicode` exists only on Python 2, so guard the isinstance check;
    # `maketrans` is string.maketrans on Python 2 and str.maketrans on Python 3
    if sys.version_info < (3,) and isinstance(text, unicode):
        translate_table = {ord(c): ord(t)
                           for c, t in zip(filters, split * len(filters))}
    else:
        translate_table = maketrans(filters, split * len(filters))
    text = text.translate(translate_table)
    seq = text.split(split)
    return [i for i in seq if i]
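A quick call of the sequence tokenizer above ('_', ',' and '!' are all in the default filter set):

print(text_to_word_sequence("Hello, world! Foo_bar"))
# -> ['hello', 'world', 'foo', 'bar']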
Example No. 26
def clean_text(text):
    """
        text: a string

        return: modified initial string
    """
    text = text.lower()  # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text)  # replace REPLACE_BY_SPACE_RE symbols with spaces
    # text = BAD_SYMBOLS_RE.sub('', text)  # delete symbols matched by BAD_SYMBOLS_RE
    text = ' '.join(word for word in text.split() if word not in stopwords)  # delete stopwords
    return text


def removeStopWords(sen):
    # Keep only letters and ampersands (the original class [^a-zA-z&] used
    # A-z, a range that also spans the characters [ \ ] ^ _ and backtick)
    text = re.sub('[^a-zA-Z&]', ' ', sen)
    text = text.lower()
    text = text.split()
    ps = PorterStemmer()
    stops = set(stopwords.words('english'))
    text = [ps.stem(word) for word in text if word not in stops]
    text = ' '.join(text)
    return text
Example No. 28
def clean_contractions(text, mapping):
    '''
    credits to: https://www.kaggle.com/christofhenkel/how-to-preprocessing-when-using-embeddings
    credits to: https://www.kaggle.com/anebzt/quora-preprocessing-model
    input: current text, contraction mappings
    output: the text with contractions expanded to their base form
    '''
    specials = ["’", "‘", "´", "`"]
    for s in specials:
        text = text.replace(s, "'")
    text = ' '.join(mapping.get(t, t) for t in text.split(" "))
    return text
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.

    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if w not in stops]

    text = " ".join(text)

    # Clean the text (the hyphen in '+-=' is escaped so it is not read as a
    # character range)
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+\-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e ?-? ?mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)

    # Return the cleaned text as a single string
    return text
Example No. 30
def clean_special_chars(text, punct, mapping):
    '''
    credits to: https://www.kaggle.com/christofhenkel/how-to-preprocessing-when-using-embeddings
    credits to: https://www.kaggle.com/anebzt/quora-preprocessing-model
    input: current text, punctuations, punctuation mapping
    output: cleaned text
    '''
    for p in punct:
        text = text.replace(p, " ")
    text = ' '.join(text.split())  # collapse the spaces introduced above
    for p in mapping:
        text = text.replace(p, mapping[p])
    # The original returned inside the mapping loop, so only the first
    # mapping entry was ever applied; the return now follows the loop
    return text
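An illustrative call with stand-in arguments (the real punct list and mapping come from the referenced kernels):

print(clean_special_chars("rock & roll?", punct="?", mapping={"&": "and"}))
# -> 'rock and roll'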