def spacy_entity_extraction(content):
    # Capitalize each token, run spaCy NER over the detokenized text, and wrap
    # every non-CARDINAL entity in a highlight span.
    try:
        from nltk import word_tokenize
        import spacy
        nlp = spacy.load('en_core_web_md')
        capitalized_text = []
        tokenized_words = word_tokenize(content)
        for text in tokenized_words:
            capitalize_first_char = text.capitalize()
            capitalized_text.append(capitalize_first_char)
        detokenizer = Detok()
        detokenized_text = detokenizer.detokenize(capitalized_text)
        # remove_cardinal = re.sub(r'[0-9]+', '', detokenized_text)
        nlp_document = nlp(detokenized_text)
        str_replace_dict = {}
        if len(nlp_document.ents) == 0:
            str2 = detokenized_text
        else:
            for entities in nlp_document.ents:
                extracted_entities = {entities.label_}
                if 'CARDINAL' not in extracted_entities:
                    extracted_text = {entities.text}
                    # print(extracted_text)
                    for key in extracted_text:
                        str_replace_dict[key] = "<span class='imp'>" + key + '</span>'
            str2 = multiwordReplace(detokenized_text, str_replace_dict)
        return str2
    except Exception as e:
        error_updation.exception_log(e, "Error in entities_extraction :", str(''))
def remove_spaCy_stop3(all_data, train_data, test_data):
    spacy_nlp = spacy.load('en')
    sw = spacy.lang.en.stop_words.STOP_WORDS
    deto = Detok()
    all_cleaned = list()
    train_cleaned = list()
    test_cleaned = list()
    for article in all_data:
        word_tokens = word_tokenize(article)
        all_cleaned.append(
            deto.detokenize([w for w in word_tokens if w not in sw]))
    for article in train_data:
        word_tokens = word_tokenize(article)
        train_cleaned.append(
            deto.detokenize([w for w in word_tokens if w not in sw]))
    for article in test_data:
        word_tokens = word_tokenize(article)
        test_cleaned.append(
            deto.detokenize([w for w in word_tokens if w not in sw]))
    return all_cleaned, train_cleaned, test_cleaned
def detokenize(lyric):
    # Detokenize each tokenized line of the lyric back into a string.
    detokenizer = Detok()
    detoken_list = []
    for list_item in lyric:
        text = detokenizer.detokenize(list_item)
        detoken_list.append(text)
    return detoken_list
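# Usage sketch for detokenize() above, assuming `lyric` is a list of token
# lists (one per lyric line) and NLTK is installed; the sample data below is
# illustrative, not from the source.
from nltk.tokenize.treebank import TreebankWordDetokenizer as Detok

lyric_lines = [['Hello', ',', 'world'], ['How', 'are', 'you', '?']]
print(detokenize(lyric_lines))
# expected: something like ['Hello, world', 'How are you?']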
def construct_globals():
    global MATCH_ALPHA_WORD, LOWER, speller, word_set, detokenizer
    MATCH_ALPHA_WORD = "[A-Za-zĂÂÎȘȚăâîșț]+"
    LOWER = [chr(i) for i in range(ord('a'), ord('z') + 1)]
    LOWER += list("ăâșîț")
    speller = aspell.Speller('lang', 'ro')
    word_set = set()
    detokenizer = Detok()
def __init__(self, quotes_resources):
    """Initializes resources for quote recommendation

    Args:
        quotes_resources (tuple): (model, neural_network, captions, vectors)
    """
    self.quotes_resources = quotes_resources
    self.detokenizer = Detok()
def detok(input):
    # Re-join whitespace-split tokens and normalise spacing around punctuation.
    tokens = input.split()
    detokenizer = Detok()
    text = detokenizer.detokenize(tokens)
    text = re.sub(r'\s*,\s*', ', ', text)
    text = re.sub(r'\s*\.\s*', '. ', text)
    text = re.sub(r'\s*\?\s*', '? ', text)
    text = text.strip()
    return text
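# Usage sketch for detok() above: a minimal run, assuming NLTK is installed;
# the sample string and expected output are illustrative, not from the source.
import re
from nltk.tokenize.treebank import TreebankWordDetokenizer as Detok

print(detok("Hello , world . How are you ?"))
# expected: something like "Hello, world. How are you?"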
def remove_NLTK_stop1(all_data):
    sw = stopwords.words('english')
    deto = Detok()
    all_cleaned = list()
    for article in all_data:
        word_tokens = word_tokenize(article)
        all_cleaned.append(
            deto.detokenize([w for w in word_tokens if w not in sw]))
    return all_cleaned
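# Usage sketch for remove_NLTK_stop1() above, assuming the NLTK stopword and
# tokenizer data have already been downloaded; the sample article below is
# illustrative, not from the source.
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer as Detok

print(remove_NLTK_stop1(["this is a short example of an article ."]))
# expected: something like ['short example article.']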
def detokenize(tokens):
    '''Replaces Penn Treebank bracket and quote tokens, then detokenizes.'''
    new_tokens = tokens
    new_tokens = Detokenizer.__box_forward(new_tokens, '-LRB-', '(')
    new_tokens = Detokenizer.__box_forward(new_tokens, '``', '"')
    new_tokens = Detokenizer.__box_backwards(new_tokens, '-RRB-', ')')
    new_tokens = Detokenizer.__box_backwards(new_tokens, '\'\'', '"')
    new_tokens = Detokenizer.__switch(new_tokens, '--', '-')
    detokenizer = Detok()
    return detokenizer.detokenize(new_tokens)
def segment_sent(self, s):
    '''Segments a sentence based on prediction'''
    tokens = word_tokenize(s)
    y = self.predict(tokens)
    detokenizer = Detok()
    sents = []
    if 'P' in y:
        n = y.index('P') + 1
        sents.append(detokenizer.detokenize(tokens[:n]))
        sents.append(detokenizer.detokenize(tokens[n:]))
    else:
        sents.append(s)
    '''with MosesDetokenizer('en') as detokenize:
        if 'P' in y:
            n = y.index('P') + 1
            sents.append(detokenize(tokens[:n]))
            sents.append(detokenize(tokens[n:]))
        else:
            sents.append(s)'''
    return sents
def Lem_stopwords(my_tick):
    nlp_fr = spacy.load('fr_core_news_md')
    nlp_en = English()
    my_corpus = []
    if isinstance(my_tick[0], list):
        my_corpus.append(my_tick[0])
    else:
        my_corpus = my_tick
    for i in range(len(my_corpus)):
        li = []
        if detect(my_corpus[i]) == 'fr':
            lists = nlp_fr(my_corpus[i])
            [li.append(str(token.lemma_)) for token in lists
             if not str(token.lemma_) in list(fr_stop)]
        else:
            lists = nlp_en(my_corpus[i])
            [li.append(str(token.lemma_)) for token in lists
             if not str(token.lemma_) in list(en_stop)]
        detokenizer = Detok()
        my_corpus[i] = detokenizer.detokenize(li)
        my_corpus[i] = re.sub(' +', ' ', my_corpus[i])
        # print(corpus[i])
    return my_corpus
def reconstruct_sentence(sent: List[str], eliminate=None) -> str:
    global detokenizer
    if detokenizer is None:
        detokenizer = Detok()
    if eliminate is not None:
        for el in eliminate:
            while True:
                try:
                    sent.remove(el)
                except ValueError:
                    break
    text = detokenizer.detokenize(sent)
    text = re.sub(r'(")\s+(.*?)\s+(")', quote_repl, text)
    text = re.sub(r'(«)\s+(.*?)\s+(»)', quote_repl, text)
    text = re.sub(r'(“)\s+(.*?)\s+(”)', quote_repl, text)
    text = re.sub(r'(„)\s+(.*?)\s+(”)', quote_repl, text)
    text = re.sub(r'\s*,\s*', ', ', text)
    text = re.sub(r'(\D)\s*\.\s*$', point_repl, text)
    text = re.sub(r'\s*\?\s*$', '? ', text)
    text = re.sub(r'\s*\-\s*', '-', text)
    text = re.sub(r'\s*\!\s*$', '! ', text)
    return text
# limitations under the License.
"""This code is required for "official_eval" mode in main.py

It provides functions to read a SQuAD json file, use the model to get
predicted answers, and write those answers to another JSON file."""

from __future__ import absolute_import
from __future__ import division

import os
from tqdm import tqdm
import numpy as np
from six.moves import xrange

# import nltk.tokenize
# from nltk.tokenize.moses import MosesDetokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer as Detok
detokenizer = Detok()

from preprocessing.squad_preprocess import data_from_json, tokenize
from vocab import UNK_ID, PAD_ID
from data_batcher import padded, Batch


def readnext(x):
    """x is a list"""
    if len(x) == 0:
        return False
    else:
        return x.pop(0)


def refill_batches(batches, word2id, qn_uuid_data, context_token_data,
def detokenize(string):
    detokenizer = Detok()
    temp = detokenizer.detokenize(string)
    return temp
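# Usage sketch for detokenize() above: TreebankWordDetokenizer.detokenize
# expects an iterable of tokens, so `string` is presumably a token list
# (an assumption); the sample tokens below are illustrative.
from nltk.tokenize.treebank import TreebankWordDetokenizer as Detok

print(detokenize(['A', 'quick', 'test', ',', 'please', '.']))
# expected: something like "A quick test, please."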