from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def stopwords_round2(text):
    # ENGLISH_STOP_WORDS is imported directly so the string parameter `text`
    # does not shadow the sklearn.feature_extraction.text module.
    stop_words = ENGLISH_STOP_WORDS.union(add_stop_words)
    tweet_token_list = [word for word in text.split(' ') if word not in stop_words]  # remove stopwords
    tweet = ' '.join(tweet_token_list)
    return tweet
def combine_words(text, dictionary):
    '''
    Takes in text that has already been lowercased but not stemmed or lemmatized.
    Also takes in a custom dictionary for the texts.
    Combines words that should be analyzed together, e.g. 'national monuments'.
    Rejoins the combined words so the list can be vectorized.
    Returns a list of tokens.
    '''
    temp_list = []
    text_list = text.split()
    text_list = [word.replace('diversity', 'diverse') for word in text_list]
    for e, word in enumerate(text_list[0:-2]):
        next_word = text_list[e + 1]
        try:
            for value in dictionary[word]:
                if value in next_word:
                    text_list.append(word + '_' + next_word)
                    temp_list.append(word)
                    temp_list.append(next_word)
        except KeyError:
            pass
    for w in temp_list:
        text_list.remove(w)
    return text_list
def importance_scores(text, classifier_func, oov="OUT_OF_VOCABULARY"):
    # First, establish baseline prediction on original text.
    probs = classifier_func(text)
    print(probs)
    # Do note that if there are multiple max probabilities, argmax returns the first one
    max_class = np.argmax(probs)
    # Then, iterate over each word in the document, replacing it with oov and comparing the probability
    words = text.split()
    rvals = list()
    for word_id, word in enumerate(words):
        # Replace word with oov, then flatten list back into a string separated by spaces
        new_text = ' '.join(words[:word_id] + [oov] + words[(word_id + 1):])
        new_probs = classifier_func(new_text)
        print(new_probs)
        # Compare probs together
        importance = 0
        new_max = np.argmax(new_probs)
        if max_class == new_max:
            importance = (probs[max_class] - new_probs[max_class])
            print("Same max class, difference in importance:", importance)
        else:
            importance = (probs[max_class] - new_probs[max_class]) + (new_probs[new_max] - probs[new_max])
            print("Different max class, sum of the importance differences:", importance)
        rvals.append(importance)
    return rvals
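# Usage sketch for importance_scores (illustrative only): `toy_classifier` is a
# made-up stand-in for a real model. It scores a document by the share of the
# word "good" and returns a two-class probability vector, which is all the
# function above requires.
import numpy as np

def toy_classifier(doc):
    words = doc.split()
    p_pos = words.count("good") / max(len(words), 1)
    return np.array([1.0 - p_pos, p_pos])

scores = importance_scores("the movie was good", toy_classifier)
# Each score estimates how much the prediction shifts when that word is masked
# with the OUT_OF_VOCABULARY token.
print(scores)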
def transform_review_text(text):
    text = text.lower()
    text = re.sub('[^a-z ]', '', text)
    text_array = [
        stemmer.stem(word) for word in text.split() if word not in stopwords
    ]
    return ' '.join(text_array)
def cloak_transposition(text, delta=1.0):
    # This startup process could probably be generalized, using a function handle instead of the length check.
    # Split the message into words.
    words = text.split()
    # We can only transpose letters in words with at least four letters, to avoid changing the start and end.
    # Find all those words.
    longword_indexes = list()
    for i in range(len(words)):
        if len(words[i]) >= 4:
            longword_indexes.append(i)
    # Determine how many of these eligible words we're supposed to modify.
    num_replace = math.ceil(delta * len(longword_indexes))
    # Pick that many words from our eligible words.
    replace = random.sample(longword_indexes, num_replace)
    # Iterate over our chosen words, and fiddle with them.
    for index in replace:
        word = words[index]
        # Adjust the word.
        letters = list(word)
        # For now, pick a random letter that isn't at the ends, and switch it with an adjacent letter
        ind = random.randint(1, len(letters) - 3)
        temp = letters[ind]
        letters[ind] = letters[ind + 1]
        letters[ind + 1] = temp
        new_word = ''.join(letters)
        # Put the word back.
        words[index] = new_word
    # Simplistically we can just merge back with any whitespace, but ideally we would keep the whitespace
    return ' '.join(words)
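# Usage sketch for cloak_transposition (illustrative only): with delta=0.5 about
# half of the eligible words (length >= 4) get one internal adjacent-letter swap.
import math
import random

random.seed(0)  # fixed seed only to make the example repeatable
print(cloak_transposition("this sentence will be lightly scrambled", delta=0.5))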
def build_vocab(self, texts):
    print('building vocab...')
    wordcnt = {}
    for text in tqdm(texts):
        unigrams = text.split()
        unigram_num = len(unigrams)
        for word in unigrams:
            if word not in stopwords:
                if word in wordcnt.keys():
                    wordcnt[word] += 1
                else:
                    wordcnt[word] = 1
        for n in [2, 3]:
            for i in range(unigram_num):
                if unigram_num <= i + n - 1:
                    break
                ngram = unigrams[i:i + n]
                if not filter_ngram(ngram):
                    ngram = " ".join(ngram)
                    if ngram in wordcnt.keys():
                        wordcnt[ngram] += 1
                    else:
                        wordcnt[ngram] = 1
    vocab = {'[UNK]': 0}
    i = 1
    for word, cnt in wordcnt.items():
        if cnt >= filter_freq:
            vocab[word] = i
            i += 1
    self.vocab = vocab
    self.vocab_size = i
    print('vocab size:', self.vocab_size)
def remove_extra_whitespace(text):
    '''
    Input: "aslsj  alksdla     asdmda   askldalk"
    Output: "aslsj alksdla asdmda askldalk"
    '''
    # return re.sub(' +', ' ', text)
    return ' '.join([ele for ele in text.split(' ') if len(ele) > 0])
def clean_text(self):
    text = re.sub('[^a-zA-Z]', ' ', str(self.doc))
    text = re.sub(r'\[.*?\]', ' ', text)
    text = re.sub(r'\d', ' ', text)
    text = " ".join(text.split())
    text = text.lower()
    return text
def wordnet_lemmetize_tokenize(text):
    '''
    Custom tokenizer object that applies WordNetLemmatizer
    Intended to be passed into CountVectorizer as a tokenizer object
    '''
    lemmatizer = WordNetLemmatizer()
    words = text.split()
    # additional lemmatization terms
    additional_lemmatize_dict = {
        "cancelled": "cancel",
        "cancellation": "cancel",
        "cancellations": "cancel",
        "delays": "delay",
        "delayed": "delay",
        "baggage": "bag",
        "bags": "bag",
        "luggage": "bag",
        "dms": "dm",
        "thanks": "thank"
    }
    tokens = []
    for word in words:
        if word not in sw:
            if word in additional_lemmatize_dict:
                clean_word = additional_lemmatize_dict[word]
            else:
                clean_word = lemmatizer.lemmatize(word)
            tokens.append(clean_word)
    return tokens
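# Usage sketch (illustrative only): the tokenizer is meant to be handed to
# CountVectorizer, assuming NLTK's WordNet data and the `sw` stopword set used
# above are available in this module.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(tokenizer=wordnet_lemmetize_tokenize)
X = vectorizer.fit_transform(["my flight was delayed", "thanks for the quick dm"])
print(vectorizer.vocabulary_)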
def simple_stemmer(text):
    '''
    Input: "My system keeps crashing his crashed yesterday, ours crashes daily"
    Output: "My system keep crash hi crash yesterday, our crash daili"
    '''
    ps = nltk.porter.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text
def sent_list(docs, splitStr='__label__'):
    sent_analysis = []
    for i in range(1, len(docs)):
        text = str(docs[i])
        splitText = text.split(splitStr)
        # print(i)
        secHalf = splitText[1]
        text = secHalf[2:len(secHalf) - 1]
        sentiment = secHalf[0]
        sent_analysis.append([text, sentiment])
    return sent_analysis
def checker(text):
    spell_checked_text_words = [
        self.spelchek.correct(word=word)
        for word in text.split()
    ]
    return ' '.join(spell_checked_text_words)
def tokenize(text):
    """
    Stem first, then tokenize.
    params:
        text: a sentence
    return:
        a list of tokens
    """
    text = ' '.join([stemmer.stem(word) for word in text.split(' ')])
    tokens = tokenizer.tokenize(text)
    return tokens
def getDocumentSentimentList(docs, splitStr='__label__'):
    docSentimentList = []
    for i in range(len(docs)):
        # print('Processing doc ', i, ' of ', len(docs))
        text = str(docs[i])
        # print(text)
        splitText = text.split(splitStr)
        secHalf = splitText[1]
        text = secHalf[2:len(secHalf) - 1]
        sentiment = secHalf[0]
        # print('First half:', secHalf[0], '\nsecond half:', secHalf[2:len(secHalf)-1])
        docSentimentList.append([text, sentiment])
    print('Done!!')
    return docSentimentList
def sentence_score(score_dict, text):
    sent_dict = defaultdict(int)
    text_list = text.split('.')
    for e, s in enumerate(text_list):
        score = 0
        temp_list = s.split(' ')
        for w in temp_list:
            w = w.lower()
            try:
                score += score_dict[w][0]
            except KeyError:
                continue
        sent_dict['sent{}'.format(e)] = score
    return sent_dict
def extract_statements(nlp, company, text):
    """
    Extracting ESG statements from raw text by removing junk, URLs, etc.
    We group consecutive lines into paragraphs and use spacy to parse sentences.
    """
    lines = []
    sentences = []

    # remove non ASCII characters
    text = remove_non_ascii(text)

    prev = ""
    for line in text.split('\n'):
        # aggregate consecutive lines where text may be broken down
        # only if next line starts with a space or previous does not end with dot.
        if (line.startswith(' ') or not prev.endswith('.')):
            prev = prev + ' ' + line
        else:
            # new paragraph
            lines.append(prev)
            prev = line

    # don't forget left-over paragraph
    lines.append(prev)

    # clean paragraphs from extra space, unwanted characters, urls, etc.
    # best effort clean up, consider a more versatile cleaner
    for line in lines:
        # removing header number
        line = re.sub(r'^\s?\d+(.*)$', r'\1', line)
        # removing trailing spaces
        line = line.strip()
        # words may be split between lines, ensure we link them back together
        line = re.sub(r'\s?-\s?', '-', line)
        # remove space prior to punctuation
        line = re.sub(r'\s?([,:;\.])', r'\1', line)
        # ESG contains a lot of figures that are not relevant to grammatical structure
        line = re.sub(r'\d{5,}', r' ', line)
        # remove mentions of URLs
        line = re.sub(
            r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*',
            r' ', line)
        # remove multiple spaces
        line = re.sub(r'\s+', ' ', line)
        # split paragraphs into well defined sentences using spacy
        for part in list(nlp(line).sents):
            sentences.append([company, str(part).strip()])

    return sentences
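# Usage sketch for extract_statements (illustrative only): assumes a spaCy English
# model is installed and that remove_non_ascii is defined elsewhere in this module;
# the company name and report text below are made up.
import spacy

nlp = spacy.load("en_core_web_sm")
raw_report = "1  Our company reduced emissions by 12%.\nWe invest in renewable energy."
for company, sentence in extract_statements(nlp, "ACME Corp", raw_report):
    print(company, "->", sentence)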
def get_hashtags_and_user_mentions(special_characters, text, wanted_characters=['#', '@']):
    # Identify hashtags, user mentions and remove urls
    results = {}
    for character in special_characters:
        text = re.sub('(' + character + ')+', ' ' + character, text)
        count_character = text.count(character)
        if count_character > 0:
            while count_character > 0:
                start = text.find(character)
                print(text.find(" ", start))
                print(text.find("\n", start))
                if text.find(" ", start) <= text.find("\n", start):
                    end = text.find(" ", start)
                else:
                    end = text.find("\n", start)
                if end == -1:
                    end = len(text)
                text_to_remove = text[start:end]
                print(text_to_remove)
                if len(text_to_remove) > 2:
                    if character in wanted_characters:
                        if character in results.keys():
                            results[character].append(text_to_remove)
                        else:
                            results[character] = [text_to_remove]
                text = text.replace(text_to_remove, "")
                text = ' '.join(text.split())
                count_character = text.count(character)
    for wanted_character in wanted_characters:
        if wanted_character not in results.keys():
            results[wanted_character] = []
    text = text.strip(' ')
    text = ' '.join(text.split())
    results['clean_text'] = text
    return results
def segment_text(text):
    segmented_words = [
        wordsegment.segment(word)
        for word in text.split()
    ]
    separated_words = [
        word
        for word_segments in segmented_words
        for word in word_segments
    ]
    segmented_text = ' '.join(separated_words)
    return segmented_text
def corpus_specific_text_cleaning(text):
    """
    For performing corpus specific cleaning.
    Added to this file, since it needs to be adapted to the corpus and is therefore a kind of configuration.
    """
    text = text.replace('"full_text" : ', "").strip().replace('"', '').replace(
        '\\n*', ' ').replace('\\', ' ').replace('&', ' ').replace("'ve", ' have')
    text = text.replace("don't", 'do not').replace("doesn't", 'does not').replace(
        "Don't", 'Do not').replace("Doesn't", 'Does not')
    text = text.replace("_NEWLINE_", " ").replace(
        "_CITATION_PREVIOUS_POST_PARAGRAPH", " ").replace("_CITATION_PREVIOUS_POST_", " ").replace("_POSTER_", " ")
    no_links = []
    for word in text.split(" "):
        if "//" not in word and "http" not in word and "@" not in word:
            no_links.append(word)
    cleaned_text = " ".join(no_links)
    return cleaned_text
def clean_text(text, remove_stop_words=False):
    text = text.lower()
    replace_punctuation = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    text = text.translate(replace_punctuation)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub('[\n\r]', '', text)
    if remove_stop_words:
        text = text.split()
        new_text = []
        stemmer = PorterStemmer()
        for word in text:
            if word not in STOPWORDS:
                new_text.append(stemmer.stem(word))
        text = ' '.join(new_text)
    return text
def sent_list(docs, splitStr='__label__'):
    sent_analysis = []
    for i in range(1, len(docs)):
        text = str(docs[i])
        splitText = text.split(splitStr)
        secHalf = splitText[1]
        sentiment = secHalf[0]
        text = secHalf[2:len(secHalf) - 1].lower()
        table = str.maketrans(' ', ' ', string.punctuation)
        text = text.translate(table)
        if 'www.' in text or 'http:' in text or 'https:' in text or '.com' in text:
            text = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", text)
        text = re.sub(r'\d+', '', text)
        sent_analysis.append([text, sentiment])
    return sent_analysis
def cloak_replacement(text, select_func, replace_func, delta=1.0):
    # First, identify which words we're going to replace.
    words = text.split()
    replace = list()
    for i in range(len(words)):
        if select_func(words[i]):
            replace.append(i)
    # Then, replace them
    for index in replace:
        if delta < 1.0:
            # if we're only doing some elements, check if we skip this one
            if random.random() >= delta:
                continue
        # We are replacing this word, use the function provided to do it
        word = words[index]
        new_word = replace_func(word)
        # Put the new word back.
        words[index] = new_word
    # Simplistically join back the words. Ideally, we would rejoin using the original whitespace.
    # TODO: reuse original whitespace somehow
    return ' '.join(words)
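# Usage sketch for cloak_replacement (illustrative only): the select/replace
# lambdas are stand-ins; every word longer than five characters is replaced by
# asterisks of the same length.
import random

masked = cloak_replacement(
    "replace the longer words in this sentence",
    select_func=lambda w: len(w) > 5,
    replace_func=lambda w: '*' * len(w),
)
print(masked)  # "******* the ****** words in this ********"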
def encode(self, text):
    feature_vec = [0] * self.vocab_size
    valid_ids = []
    unigrams = text.split()
    unigram_num = len(unigrams)
    for word in unigrams:
        if word not in stopwords:
            pos = self.vocab.get(word, 0)
            feature_vec[pos] = 1
            if pos not in valid_ids:
                valid_ids.append(pos)
    for n in [2, 3]:
        for i in range(unigram_num):
            if unigram_num <= i + n - 1:
                break
            ngram = unigrams[i:i + n]
            if not filter_ngram(ngram):
                ngram = " ".join(ngram)
                pos = self.vocab.get(ngram, 0)
                feature_vec[pos] = 1
                if pos not in valid_ids:
                    valid_ids.append(pos)
    return feature_vec, valid_ids
def clean_text(self, text):
    """
    # Arguments
        text: text body to be preprocessed and cleaned
    # Return
        cleaned text
    """
    # handle non-ascii/special characters
    text = text.encode("utf-8")
    text = re.sub(r"\\[ux][a-z0-9]+", " ", str(text))
    # drop the leading b'...' wrapper that str() puts around a bytes object
    if text.startswith("b'") or text.startswith('b"'):
        text = text[1:]
    text = text.strip("'\"").lower()
    text = re.sub(r'[\:\-\(\)\%\d\.\\\/\_\[\]\+\,\#\"]+', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    word_list = text.split(' ')  # tokenization w.r.t space characters
    rel_words = [
        word for word in word_list
        if word not in self.stop and len(word) >= self.min_word_len
    ]  # relevant words
    rel_words_lemm = [
        self.lemmatizer.lemmatize(word, pos='v') for word in rel_words
    ]
    return " ".join(rel_words_lemm)
def tokenizer(text):
    return text.split()
def tokenizer_porter(text):
    # instantiate the stemmer once instead of once per word
    stemmer = port.PorterStemmer()
    return [stemmer.stem(word) for word in text.split()]
plt.hist(training_set.target, bins=bins, alpha=0.7)
plt.xlabel('Target output class label')
plt.ylabel('Count of documents')
plt.title('Histogram of documents in each category')
plt.show()

# Cleaning the texts
from sklearn.feature_extraction import text
stop_words = text.ENGLISH_STOP_WORDS

import re
from nltk.stem.porter import PorterStemmer

corpus = []
for i in range(len(dataset)):
    text = re.sub('[^a-zA-Z]', ' ', dataset['text'][i])
    text = text.lower()
    text = text.split()
    ps = PorterStemmer()
    text = [ps.stem(word) for word in text if not word in stop_words]
    text = ' '.join(text)
    corpus.append(text)

train_corpus = corpus[:len(train_data)]
test_corpus = corpus[len(train_data):]

# Creating the Bag of Words model (min_df=2)
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(min_df=2)
count_matrix_train = cv.fit_transform(train_corpus).toarray()
count_matrix_test = cv.transform(test_corpus).toarray()

from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()
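# Continuation sketch (assumed, not part of the original snippet): the
# TfidfTransformer instantiated above is typically fit on the training counts
# and then applied to the test counts; the variable names below are illustrative.
tfidf_matrix_train = tfidf.fit_transform(count_matrix_train).toarray()
tfidf_matrix_test = tfidf.transform(count_matrix_test).toarray()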
def __split_keyword(text):
    """
    Split the keyword string and filter out stopwords.
    """
    keywords = text.split(" ")
    return [keyword for keyword in keywords if __check_stop_word(keyword)]
def remove_stopword(text):
    return [word for word in text.split() if word not in stop_words]
def strips(dataset):
    for id_, (query, text) in dataset.items():
        query = query.split(" ")
        text = text.split(" ")
        yield id_, query, text