def find_first_n_keywords(df, header, n, char_list):
    # Collect the first n non-stopword, non-punctuation keywords of each row in df[header]
    # and store them as a new detokenized column.
    stop_words = set(stopwords.words('english'))
    column = df[header]
    first_n_list = []
    for text_row in column:
        word_tokens = get_unique_word(text_row)
        # word_tokens = word_tokenize(text_row)
        filtered = [w for w in word_tokens if w not in stop_words]
        filtered_char = [w for w in filtered if w not in char_list]
        filtered_space = [w for w in filtered_char if w != '']
        first_n_row = filtered_space[:n]
        first_n_row = TreebankWordDetokenizer().detokenize(first_n_row)
        first_n_list.append(first_n_row)
    df['first_' + str(n) + '_keywords_in_' + header] = first_n_list
    return df
def preprocess_data(self):
    """
    Clean and normalize sentences, then transform them into sequences of integers.
    """
    # eliminate stop words and punctuation from data
    words, classes, documents = [], [], []
    # loop through each sentence in our intents patterns
    for intent in self.data['intents']:
        if 'patterns' in intent:
            for pattern in intent['patterns']:
                # tokenize each word in the sentence
                pattern = pattern.lower()
                tokens = WordPunctTokenizer().tokenize(pattern)
                filtered_words = [w for w in tokens if w not in self.stop_words]
                words.extend(filtered_words)
                sentence = TreebankWordDetokenizer().detokenize(filtered_words)
                # add to documents in our corpus
                documents.append((sentence, intent['tag']))
                # add to our classes list
                if intent['tag'] not in classes:
                    classes.append(intent['tag'])

    # save dictionary of words (keep only alphabetic tokens)
    repository = [w for w in words if w.isalpha()]

    # separate sentences (input) and labels (output)
    text, output = [], []
    for doc in documents:
        text.append(doc[0])
        output.append(classes.index(doc[1]))

    self.input = text  # save input
    self.output = np.asarray(output)  # save output
    self.classes = classes  # save target classes
    self.words = Counter(repository)  # save words dictionary
    self.transform_text_to_numeric()  # transform sentences to sequences of integers
    self.transform_numeric_with_embeddings()  # create embedding matrix with words in dictionary
def change_genderwords(text):
    # Swap each token that matches either side of a pair in the module-level
    # `gender_word` list (also handling simple possessive forms ending in 's).
    newtext = ""
    for sentence in tokenize.sent_tokenize(text):
        token = tokenize.word_tokenize(sentence)
        for count, tokens in enumerate(token):
            for words in gender_word:
                if words[0] == tokens:
                    token[count] = words[1]
                elif words[1] == tokens:
                    token[count] = words[0]
                elif words[0] + '\'s' == tokens:
                    token[count] = words[1] + '\'s'
                elif words[1] + '\'s' == tokens:
                    token[count] = words[0] + '\'s'
        detoken = TreebankWordDetokenizer().detokenize(token)
        newtext = newtext + detoken + ' '
    return newtext
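# Hedged usage sketch for change_genderwords: it assumes the module-level `gender_word`
# variable holds pairs of words to swap. The pair list and sentence below are illustrative
# only, not taken from the original project.
#
#     gender_word = [('he', 'she'), ('him', 'her'), ('himself', 'herself')]
#     change_genderwords("Then he looked at him and smiled.")
#     # roughly: "Then she looked at her and smiled. "  (each sentence adds a trailing space)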
def average_sentlength(tokens):
    """
    Calculates the average sentence length in words.

    len(nltk.word_tokenize(sent)) -> length of each sentence, tokenized individually
    sum(len(nltk.word_tokenize(sent)) for sent in sent_tokens) -> sum of all sentence lengths

    :param tokens: the tokenized list of words
    :return: average sentence length
    """
    # detokenize the tokenized list --> rebuilding the sentences
    original_text = TreebankWordDetokenizer().detokenize(tokens)
    # use the sentence tokenizer
    sent_tokens = nltk.sent_tokenize(original_text)
    if len(sent_tokens) != 0:
        return sum(len(nltk.word_tokenize(sent)) for sent in sent_tokens) / len(sent_tokens)
    else:
        return 0
def create_subtitle_features_df(self, subtitle_dir):
    """
    Extract features (see engineer_features()) from subtitles for a specific movie.

    :param subtitle_dir: subtitle corpus directory (the movie is selected via self.imdb_id)
    :returns: pandas DataFrame with shape (n_words, n_features)
    """
    features_df = pd.DataFrame()
    files = os.listdir(subtitle_dir)
    list_of_text_files = []
    for file in files:
        if re.search(self.imdb_id, file):
            list_of_text_files.append(file)

    sent_n = 0
    sent_per_episode = int(50 / len(list_of_text_files))
    for episode, file in enumerate(list_of_text_files):
        filename = subtitle_dir / file
        with open(filename, 'r') as subtitles:
            texts = subtitles.read()
        sents_all = SENT_TOKENIZER.tokenize(texts)
        window_size = 3
        if type(sents_all[:-2]) == list and len(sents_all[:-2]) >= sent_per_episode:
            sents = np.random.choice(sents_all[:-2], sent_per_episode)  # last 2 are noise
            for itext in range(0, len(sents), window_size):
                text_window = sents[itext:(itext + window_size)]
                text_window_raw = TreebankWordDetokenizer().detokenize(text_window)
                arguments = (text_window_raw, str('_'),
                             'episode' + str(episode) + '_tw' + str(itext), 'movie')
                rt = SingleTextProcessor(*arguments)
                if len(rt.sentences) > 2:
                    rt.process_self()
                    feature_dict = rt.to_dict()
                    features = engineer_features(feature_dict)
                    features_df = features_df.append(features, ignore_index=True)
                sent_n += window_size
                print('----------\n\n------ PROCESSING SENTENCE', sent_n,
                      'in episode', episode + 1, 'out of',
                      len(list_of_text_files), '------\n\n----------')
    self.subtitle_features = features_df
def SpellCheck(data):
    Spell_Words = []
    spell = SpellChecker()
    # treat domain words and these punctuation marks as known words
    spell.word_frequency.load_words(['molded', '.', '(', ')'])
    for i in data.split(' '):
        w = Word(i)
        corrected = spell.correction(w)
        if corrected != w:
            corrected = colored(corrected, 'blue')
        Spell_Words.append(corrected)
    # print(Spell_Words)
    Corrected_Words = TreebankWordDetokenizer().detokenize(Spell_Words)
    return Corrected_Words
def SpellCheck2(data):
    spell = SpellChecker()
    Spell_Words = []
    # Note that this does not necessarily deal with punctuation unless you provide
    # a custom tokenizer
    words_split = nltk.word_tokenize(data)
    # misspelled = spell.unknown(words_split)
    for word in words_split:
        spell.word_frequency.load_words(['molded', '.', '(', ')'])
        correction = spell.correction(word)
        print(correction)
        if correction != word:
            correction = colored(correction, 'blue')
        Spell_Words.append(correction)
    Corrected_Words = TreebankWordDetokenizer().detokenize(Spell_Words)
    return Corrected_Words
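# Hedged usage sketch for SpellCheck2. It assumes `SpellChecker` comes from the
# pyspellchecker package, `colored` from termcolor, and that NLTK's punkt data is
# available; the sample sentence is illustrative only.
#
#     from spellchecker import SpellChecker
#     from termcolor import colored
#     import nltk
#
#     print(SpellCheck2("The part was moldd in two steps."))
#     # misspelled tokens are replaced by pyspellchecker's best guess and highlighted in blue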
def preprocess(df, full_processing=False):
    # Tokenize review text
    df['reviewText'] = df['reviewText'].apply(word_tokenize)
    # Remove noise
    df['reviewText'] = df['reviewText'].apply(remove_noise)
    # Perform full processing, if needed
    if full_processing:
        df['reviewText'] = df['reviewText'].apply(filter_stopwords)
        df['reviewText'] = df['reviewText'].apply(lemmatize)
    # Detokenize text
    df['reviewText'] = df['reviewText'].apply(TreebankWordDetokenizer().detokenize)
    return df.to_numpy()
def search_news():
    """
    This endpoint is a POST HTTP method. It takes a JSON body with a single key,
    "keywords", holding a non-empty array of non-empty strings. A sample is shown below:

        {
            "keywords": ["one", "two", "three"]
        }
    """
    query_input = request.get_json()
    if not query_input or 'keywords' not in query_input:
        return response.bad_request(message='not a valid input')
    array_input = query_input['keywords']
    if not isinstance(array_input, list):
        return response.bad_request(message='keywords must be an array')
    if not array_input or array_input[0].strip() == '':
        return response.bad_request(message='array must contain a non-empty string')

    detokenized_words = TreebankWordDetokenizer().detokenize(array_input)
    identifier = '_'.join(array_input).lower()
    news_holder = []
    query = {"identifier": identifier}
    not_included = {"_id": 0, "identifier": 0}
    try:
        news_from_db = mongo.db.news.find(query, not_included)
        if news_from_db:
            for obj in news_from_db:
                news_holder.append(obj)
            if len(news_holder) > 0:
                news_holder.sort(key=constant.get_my_key, reverse=True)
                print('!!!!!!! got data from db !!!!!!!')
                return response.success(data=news_holder)
    except Exception as err:
        print(err)
        return response.internal_server_error(
            message=f'error occurred while querying from db - {err}')
    print('!!!!!!!! got data from internet !!!!!!!!!!!')
    return get_news_from_internet(detokenized_words, identifier, array_input)
def punctuation(para):
    fin = []
    # words = nltk.tokenize.word_tokenize(para)
    words = re.split('[ ]', para)
    cap_sugg = {}
    for ind in range(len(words)):
        word = words[ind]
        ch = ''
        if word == "":
            continue
        if word[-1] == '.' or word[-1] == '?' or word[-1] == '!':
            ch = word[-1]
            word = word[:-1]
        sug = word
        if ut.tag([word])[0][1] is None:
            # word unknown to the tagger: suggest a capitalized form
            word1 = word[0].upper() + word[1:]
            cap_sugg[ind] = word1
            sug = word1
        sug = sug + ch
        fin.append(sug)
    para = TreebankWordDetokenizer().detokenize(fin)
    print(para)
    arr = re.split('[?.!]', para)
    # print(arr)
    newpara = []
    for line in arr:
        line = line.strip()
        if line == '':
            continue
        if is_question(line):
            newpara.append(line[0].upper() + line[1:] + '?')
        else:
            newpara.append(line[0].upper() + line[1:] + '.')
    if newpara and newpara[len(newpara) - 1] == '.':
        newpara = newpara[:len(newpara) - 1]
    result = ' '.join(newpara)
    # print(result)
    return result
def abstractive_summary():
    os.system("python make_datafiles.py")
    os.system("python run_summarization.py --mode=decode --data_path=finished_files/test.bin --vocab_path=vocab --log_root=logs --exp_name=myexperiment")
    with open('logs/myexperiment/decode/attn_vis_data.json') as json_file:
        data = json.load(json_file)
    text = TreebankWordDetokenizer().detokenize(data["decoded_lst"])
    text = (text.replace(" . ", ". ")
                .replace(" , ", ", ")
                .replace(" ; ", "; ")
                .replace(" \' s", "\'s")
                .replace(" “ ", " “")
                .replace(" ” ", "” "))
    sentences = sent_tokenize(text)
    sentences = [s.capitalize().strip() for s in sentences]
    sentences = list(set(sentences))
    text = " ".join(sentences).strip()
    return text
def get_function(self):
    dtk = TreebankWordDetokenizer()

    def lsa(array):
        # Clean and detokenize each non-null entry, then transform it with the
        # fitted LSA trainer and attach the first two components as new columns.
        array = pd.Series(array, index=pd.Series(array.index), name='array')
        copy = array.dropna()
        copy = copy.apply(lambda x: dtk.detokenize(clean_tokens(x)))
        li = self.trainer.transform(copy)
        lsa1 = pd.Series(li[:, 0], index=copy.index)
        lsa2 = pd.Series(li[:, 1], index=copy.index)
        array = pd.DataFrame(array)
        array['l1'] = lsa1
        array['l2'] = lsa2
        arr = ((np.array(array[['l1', 'l2']])).T).tolist()
        return pd.Series(arr)

    return lsa
def extra_text_import():
    news_text = brown.sents(categories="news")
    words = []
    for sent in news_text:
        words.append(TreebankWordDetokenizer().detokenize(sent))
    remove = string.punctuation + string.digits
    remove = remove.replace(",", "")
    remove = remove.replace(".", "")
    # print("patterns to remove", remove)
    table = str.maketrans("", "", remove)
    words = [w.lower() for w in words]
    words = [w.translate(table) for w in words]
    return words  # [0:1]
def extractgender(redacted_data):
    # Redact gendered words in each document and count how many were replaced.
    redacted_gender_files = []
    countgender = 0
    gender = ['mr.', 'sir', 'his', 'mister', 'mr', 'prince', 'king', 'mrs.', 'ms.',
              'miss', 'her', 'lady', 'mademoiselle', 'baroness', 'mistress', 'mrs',
              'ms', 'queen', 'princess', 'madam', 'madame']
    for doc in redacted_data:
        tokens = word_tokenize(doc)
        for n, token in enumerate(tokens):
            for j in range(len(gender)):
                if token.lower() == gender[j]:
                    tokens[n] = '██'
                    countgender += 1
        file = TreebankWordDetokenizer().detokenize(tokens)
        redacted_gender_files.append(file)
    return redacted_gender_files, countgender
def reinstate_abbreviation_expansion(sentence, abbrevs):
    # Replace each detected abbreviation token with the form produced by
    # construct_abbrev_with_expansion(), then rebuild the sentence.
    # abbrevs = {key + "_1": value for key, value in abbrevs.items()}
    tokens_text = word_tokenize(sentence)
    abbrev_data = find_all_abbreviations(sentence, abbrevs)
    if len(abbrev_data) == 0:
        return sentence
    abbrev_data.sort(key=lambda x: x.index)
    result_tokens = []
    start = 0
    for abbrev in abbrev_data:
        start_tokens = tokens_text[start:abbrev.index]
        start_tokens.append(abbrev.construct_abbrev_with_expansion())
        result_tokens.extend(start_tokens)
        start = abbrev.index + 1
    result_tokens.extend(tokens_text[start:])
    return TreebankWordDetokenizer().detokenize(result_tokens)
def prepn(inp):
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    from nltk.tokenize.treebank import TreebankWordDetokenizer

    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(inp)
    filtered_sentence = [w for w in word_tokens if w not in stop_words]
    return TreebankWordDetokenizer().detokenize(filtered_sentence)
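# Hedged usage sketch for prepn (assumes NLTK's punkt and stopwords data are available):
#
#     >>> prepn("this is a simple example of a sentence")
#     'simple example sentence'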
def index(request):
    if 'inp' not in request.POST:
        sentence = ""
    else:
        sentence = request.POST['inp']
    if 'spells' not in request.POST:
        spellid = '-1'
    else:
        spellid = request.POST['spells']
    ignore = request.session.get('ignore', [])
    if 'ignoreid' in request.POST:
        ignore.append(request.POST['ignoreid'])
        spellid = request.POST['ignoreid']
    request.session['ignore'] = ignore
    tokens = nltk.word_tokenize(sentence)
    if 'change' in request.POST:
        logging.debug(request.POST['change'])
        spellid = request.POST['changeid']
        tokens[int(spellid)] = request.POST['change']
        sentence = TreebankWordDetokenizer().detokenize(tokens)
    spells = spell_suggestions(tokens)
    for i in ignore:
        spells[int(i)] = []
    if int(spellid) not in spells:
        spells = []
    else:
        spells = spells[int(spellid)]
    return render(
        request, 'rewrite_app/index.html', {
            'sentence': sentence,
            'spells': spells,
            'tokens': tokens,
            'spellid': spellid
        })
def pre_processing(filename='data.csv'):
    print('loading and tokenizing data ....')
    train_data = pd.read_csv(filename, index_col=0)
    print(train_data.head())
    print(train_data.describe())
    print(train_data.info())

    nltk.download('punkt')
    nltk.download('stopwords')
    en_stopwords = stopwords.words('english')
    detokenizer = TreebankWordDetokenizer()

    def clean_description(desc):
        desc = word_tokenize(desc.lower())
        desc = [token for token in desc if token not in en_stopwords and token.isalpha()]
        return detokenizer.detokenize(desc)

    train_data['review'] = train_data['features'].apply(clean_description)
    train_data.dropna(inplace=True)

    # target_1_values = set(df['continuous_target_1'])
    # >>> target_1_values
    # {80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100}
    def points_to_class(points):
        if points in range(80, 83):
            return 0
        elif points in range(83, 87):
            return 1
        elif points in range(87, 90):
            return 2
        elif points in range(90, 94):
            return 3
        elif points in range(94, 98):
            return 4
        else:
            return 5

    train_data['label'] = train_data['continuous_target_1'].apply(points_to_class)
    return train_data[["review", "label"]]
def sampleCharacter(text, char, n, l):
    # Collect up to n detokenized windows of l items centered (roughly) on occurrences
    # of `char` in `text`, keyed by each window's start position.
    indices = []
    for i in range(len(text)):
        if text[i] == char:
            indices.append(i)
    master = {}
    while n > 0 and len(indices) > 0:
        x = indices[random.randint(0, len(indices) - 1)]
        start = int(x - l / 2)
        if start < 0:
            start = 0
        new = []
        for i in range(l):
            if i + start < len(text):
                new.append(text[i + start])
        master.update({start: TreebankWordDetokenizer().detokenize(new)})
        n = n - 1
        indices.remove(x)
    return master
def supercase(text):
    shift = "⇧"
    caps = "⇪"
    tokens = word_tokenize(text)
    mod_tokens = []
    for t in tokens:
        # Check if entire word is at least two characters and is entirely uppercased
        if len(t) >= 2 and t == t.upper() and t[0].isalpha():
            mod_tokens.append(t.lower() + caps)
        # Check if upper
        elif t[0].isupper():
            mod_tokens.append(t.lower() + shift)
        # If none of this is true, return original
        else:
            mod_tokens.append(t.lower())
    supercasedtext = TreebankWordDetokenizer().detokenize(mod_tokens)
    return supercasedtext
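# Hedged usage sketch for supercase (assumes NLTK's punkt data is available):
#
#     >>> supercase("NASA launched Artemis")
#     'nasa⇪ launched artemis⇧'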
def get_sample_text_passages(self, expression, no_passages):
    """
    Returns a specified number of example passages that include a certain expression.

    The number of passages that you request is a maximum, and this function may return
    fewer if there are limited occurrences of the expression in the corpus.

    :param expression: expression to search for
    :param no_passages: number of passages to return
    :return: list of (filename, passage) tuples

    >>> from gender_analysis.corpus import Corpus
    >>> from gender_analysis.common import TEST_DATA_PATH
    >>> filepath = TEST_DATA_PATH / 'sample_novels' / 'texts'
    >>> corpus = Corpus(filepath)
    >>> results = corpus.get_sample_text_passages('he cried', 2)
    >>> 'he cried' in results[0][1]
    True
    >>> 'he cried' in results[1][1]
    True
    """
    count = 0
    output = []
    phrase = nltk_tokenize.word_tokenize(expression)
    random.seed(expression)
    random_documents = self.documents.copy()
    random.shuffle(random_documents)

    for document in random_documents:
        if count >= no_passages:
            break
        current_document = document.get_tokenized_text()
        for index in range(len(current_document)):
            if current_document[index] == phrase[0]:
                if current_document[index:index + len(phrase)] == phrase:
                    passage = TreebankWordDetokenizer().detokenize(
                        current_document[index - 20:index + len(phrase) + 20])
                    output.append((document.filename, passage))
                    count += 1

    if len(output) <= no_passages:
        return output
    return output[:no_passages]
def allowOnlyCommonWords(df, most_occur):
    # gets the formatted dataframe and the array with the most common words
    dfInput = {'Tweet': [], 'Hatespeech': []}
    # Removes all words from the string that are not in the most common words
    for index, row in df.iterrows():
        tweetText = row['Tweet']
        hatespeechIndicator = row['Hatespeech']
        word_tokens = word_tokenize(tweetText)
        filtered_sentence = []
        for w in word_tokens:
            for value in most_occur:
                if value[0] == w:
                    filtered_sentence.append(w)
        dfInput["Tweet"].append(TreebankWordDetokenizer().detokenize(filtered_sentence))
        dfInput["Hatespeech"].append(hatespeechIndicator)
    clearedDf = pd.DataFrame(dfInput, columns=['Tweet', 'Hatespeech'])
    return clearedDf
def coref_true_to_file(self, data):
    # write the coref results to file
    corefCount = 0
    f = open(self.output_name + "_coref_true.tsv", "w+")
    for line in tqdm(data):
        coref_line = {"document": line.strip()}
        try:
            prediction = self.predictor.predict_json(coref_line)
        except KeyboardInterrupt:
            print("KeyboardInterrupt")
            break
        except Exception:
            print("problem sentence: ", line)
            continue
        if len(prediction['clusters']) > 0:
            corefCount += 1
            f.write(TreebankWordDetokenizer().detokenize(prediction['document']) + "\n")
    f.close()
    print("Coref count: ", corefCount)
    print("write to file complete")
def get_candidates(model, text, max_candidates):
    # helper function that retrieves perturbed candidates; called in get_delta_opt
    words = word_tokenize(text)
    candidates = [None] * max_candidates
    counter = 0
    for word in words:
        if wn.synsets(word) == []:
            continue
        tmp = wn.synsets(word)[0].pos()
        # if not adjective or noun, continue
        if tmp != "a" and tmp != "n":
            continue
        for a in antonyms(word):
            candidates[counter] = TreebankWordDetokenizer().detokenize(
                [a.rstrip() if x == word else x for x in words])
            counter += 1
            if counter >= max_candidates:
                return list(filter(None.__ne__, candidates))
    return list(filter(None.__ne__, candidates))
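# Hedged sketch of the antonyms() helper that get_candidates relies on but which is not
# shown above. This WordNet-based version is an assumption about its behavior, not the
# original code:
#
#     def antonyms(word):
#         return {ant.name() for syn in wn.synsets(word)
#                 for lemma in syn.lemmas()
#                 for ant in lemma.antonyms()}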
def conllReader(corpus):
    '''Data reader for CoNLL format data'''
    root = "data/"
    sentences = []
    ccorpus = ConllCorpusReader(root, ".conll", ('words', 'pos', 'tree'))
    raw = ccorpus.sents(corpus)
    for sent in raw:
        sentences.append([TreebankWordDetokenizer().detokenize(sent)])
    tagged = ccorpus.tagged_sents(corpus)
    print(tagged)
    return tagged, sentences
class word_tokenizer(tokenizer):
    def __init__(self):
        self.__pattern = r'''(?x)
            (?:[A-Z]\.)+
          | \w+(?:-\w+)*
          | \$?\d+(?:\.\d+)?%?
          | \.\.\.
          | [][.,;"'?():_`-]
        '''
        self.__tokenizer = RegexpTokenizer(self.__pattern)
        self.__detokenizer = TreebankWordDetokenizer()

    def tokenize(self, text):
        return self.__tokenizer.tokenize(text)

    def detokenize(self, iterable):
        return self.__detokenizer.detokenize(iterable)

    def encode(self, text):
        pass

    def decode(self, iterable):
        pass

    def vocab(self):
        return 0

    def __fit_file(self, file):
        self.source_file = file
        with Fast_File(file) as ff:
            self.__fit_iterable(ff)

    def __fit_iterable(self, it):
        sentencepiece.SentencePieceTrainer.Train(sentence_iterator=it,
                                                 model_writer=self.model)

    def fit(self, x):
        if isinstance(x, str):
            self.__fit_file(x)
        else:
            self.__fit_iterable(x)
class Cleaner():
    def __init__(self):
        # nltk.download('punkt')
        self.tk = TreebankWordTokenizer()
        self.dtk = TreebankWordDetokenizer()
        self.BAD_CAT_REMOVE = re.compile('^Cat_')
        self.A_TILDE_REMOVE = re.compile('[á]')
        self.E_TILDE_REMOVE = re.compile('[é]')
        self.I_TILDE_REMOVE = re.compile('[í]')
        self.O_TILDE_REMOVE = re.compile('[ó]')
        self.U_TILDE_REMOVE = re.compile('[ú]')
        self.POINT_FOLLOWING_LETTER = re.compile(r'(?<=\S)\.(?=\w)')
        # self.BAD_SYMBOLS_REMOVE = re.compile('[^A-Za-z0-9_ áéíóú]')

    def applyRegex(self, value, regex, replacement):
        value = regex.sub(replacement, value)
        return value

    def text_cleaning(self, text):
        return pipe(
            text.lower(),
            # partial(self.BAD_SYMBOLS_REMOVE.sub, ''),
            partial(self.A_TILDE_REMOVE.sub, 'a'),
            partial(self.E_TILDE_REMOVE.sub, 'e'),
            partial(self.I_TILDE_REMOVE.sub, 'i'),
            partial(self.O_TILDE_REMOVE.sub, 'o'),
            partial(self.U_TILDE_REMOVE.sub, 'u'),
            # partial(self.POINT_FOLLOWING_LETTER.sub, '. ')
        )

    def sentence_cleaning(self, sentence, detokenize=False):
        word_tokens = pipe(sentence,
                           partial(self.POINT_FOLLOWING_LETTER.sub, '. '),
                           self.tk.tokenize)
        word_tokens = [self.text_cleaning(text) for text in word_tokens]
        # word_tokens.remove('')
        if detokenize:
            return self.dtk.detokenize(word_tokens)
        else:
            return word_tokens
def spell_correction(self, data_obj, stem_and_stop):
    if not stem_and_stop:
        self.tokenized_corpus = [[word for word in tweet.split()]
                                 for tweet in data_obj.raw_tweets]
        self.tokenized_corpus = [[str(TextBlob(word).correct()) for word in tweet]
                                 for tweet in self.tokenized_corpus]
        self.detokenized_corpus = [TreebankWordDetokenizer().detokenize(tweet)
                                   for tweet in self.tokenized_corpus]
        self.detokenized_corpus = [re.sub('[^A-Za-z0-9 ]', '', tweet)
                                   for tweet in self.detokenized_corpus]
        self.tokenized_corpus = [regexp_tokenize(tweet, r'\S*')
                                 for tweet in self.detokenized_corpus]
def create_bert_embeddings(stories):
    sentences = []
    embedding_dimensions = 768
    single_index = SimpleNeighbors(embedding_dimensions)
    for story in stories.values():
        sentence = TreebankWordDetokenizer().detokenize(story['story'][0])
        # print(sentence)
        sentences.append(sentence)
    sbert_model = SentenceTransformer('stsb-roberta-base')
    sentence_embeddings = sbert_model.encode(sentences, show_progress_bar=True)
    for embedding, key in zip(sentence_embeddings, stories.values()):
        single_index.add_one(key['movie_id'], embedding)
        # print("Key: ", key['movie_id'])
        # print("Embedding: ", embedding)
        # print("Vector Len: ", len(embedding))
        # input("Press any key...")
    return single_index
def insert_brackets(tok_sent, coref_range,
                    GENDER_PRONOUNS=['he', 'she', 'him', 'her', 'his', 'hers',
                                     'himself', 'herself']):
    start_bracket = ["<", "[", "{", "<<", "[[", "{{"]
    end_bracket = [">", "]", "}", ">>", "]]", "}}"]
    # index is used to iterate through the bracket arrays so each cluster gets a different bracket
    index = 0
    for cluster in coref_range:
        # check if cluster contains a gender pronoun
        if any([(c[0] == c[1]) and (tok_sent[c[0]]).lower() in GENDER_PRONOUNS
                for c in cluster]):
            for (start_index, end_index) in cluster:
                tok_sent[start_index] = start_bracket[index] + tok_sent[start_index]
                tok_sent[end_index] = tok_sent[end_index] + end_bracket[index]
            index += 1
    return TreebankWordDetokenizer().detokenize(tok_sent)
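# Hedged usage sketch for insert_brackets. It assumes `coref_range` is a list of
# coreference clusters, each a list of (start, end) token spans as produced by a coref
# model such as AllenNLP's; the sentence and spans below are illustrative only.
#
#     insert_brackets(['John', 'said', 'he', 'was', 'tired'], [[(0, 0), (2, 2)]])
#     # roughly: '<John> said <he> was tired'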
# from six.moves import xrange
import os, sys

from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer  # nltk 3.3
from truecaser.Truecaser import *
import _pickle as cPickle


def convert(tokens, wordCasingLookup, uniDist, backwardBiDist, forwardBiDist,
            trigramDist):
    # tokens = nltk.word_tokenize(sentence)
    return getTrueCase(tokens, 'as-is', wordCasingLookup, uniDist, backwardBiDist,
                       forwardBiDist, trigramDist)


# load a truecaser model
print("load truecaser", file=sys.stderr)
# curr_dir = os.path.dirname(__file__)
f = open("truecaser/distributions.obj", 'rb')
uniDist = cPickle.load(f)
backwardBiDist = cPickle.load(f)
forwardBiDist = cPickle.load(f)
trigramDist = cPickle.load(f)
wordCasingLookup = cPickle.load(f)
f.close()

if __name__ == "__main__":
    sent = "I do n't have cats named Tom and Jerry ."
    tokens = [x.lower() for x in sent.split()]
    print(tokens)
    truecase_tokens = convert(tokens, wordCasingLookup, uniDist, backwardBiDist,
                              forwardBiDist, trigramDist)
    detokenizer = TreebankWordDetokenizer()
    sent = detokenizer.detokenize(truecase_tokens)
    print(sent)