def spacy_entity_extraction(content):
    try:
        from nltk import word_tokenize
        from nltk.tokenize.treebank import TreebankWordDetokenizer as Detok
        import spacy
        nlp = spacy.load('en_core_web_md')
        # Capitalize every token so spaCy's NER has a better chance of
        # spotting proper nouns in lower-cased input.
        tokenized_words = word_tokenize(content)
        capitalized_text = [token.capitalize() for token in tokenized_words]
        detokenizer = Detok()
        detokenized_text = detokenizer.detokenize(capitalized_text)
        nlp_document = nlp(detokenized_text)
        str_replace_dict = {}
        if len(nlp_document.ents) == 0:
            highlighted_text = detokenized_text
        else:
            # Wrap every non-cardinal entity in a highlighting span.
            for entity in nlp_document.ents:
                if entity.label_ != 'CARDINAL':
                    str_replace_dict[entity.text] = (
                        "<span class='imp'>" + entity.text + "</span>")
            highlighted_text = multiwordReplace(detokenized_text,
                                                str_replace_dict)
        return highlighted_text
    except Exception as e:
        error_updation.exception_log(e, "Error in spacy_entity_extraction :",
                                     str(''))
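The snippet above calls a multiwordReplace helper that is not shown; a minimal sketch of what such a helper might look like, assuming it is a plain longest-key-first substitution (this is a guess, not the original implementation):

import re

def multiwordReplace(text, replace_dict):
    # Hypothetical helper: replace each dictionary key found in `text` with its
    # mapped value, matching longer keys first so multi-word entities are not
    # clobbered by their substrings.
    if not replace_dict:
        return text
    pattern = re.compile("|".join(
        re.escape(key) for key in sorted(replace_dict, key=len, reverse=True)))
    return pattern.sub(lambda match: replace_dict[match.group(0)], text)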
def remove_spaCy_stop3(all_data, train_data, test_data):
    # Only spaCy's English stop-word list is needed here, not a full pipeline.
    from spacy.lang.en.stop_words import STOP_WORDS as sw
    deto = Detok()

    all_cleaned = list()
    train_cleaned = list()
    test_cleaned = list()

    # Tokenize each article, drop stop words, and stitch the tokens back together.
    for article in all_data:
        word_tokens = word_tokenize(article)
        all_cleaned.append(
            deto.detokenize([w for w in word_tokens if w not in sw]))

    for article in train_data:
        word_tokens = word_tokenize(article)
        train_cleaned.append(
            deto.detokenize([w for w in word_tokens if w not in sw]))

    for article in test_data:
        word_tokens = word_tokenize(article)
        test_cleaned.append(
            deto.detokenize([w for w in word_tokens if w not in sw]))

    return all_cleaned, train_cleaned, test_cleaned
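A short usage sketch for remove_spaCy_stop3, assuming each argument is a plain list of article strings (the sample sentences below are made up):

all_docs = ["This is a short article about detokenization."]
train_docs = ["The model was trained on a small corpus of articles."]
test_docs = ["It was then evaluated on held-out articles."]

all_clean, train_clean, test_clean = remove_spaCy_stop3(
    all_docs, train_docs, test_docs)
print(all_clean[0])  # lower-cased stop words such as "is" and "a" are removed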
Example 3
def detokenize(lyric):
    # lyric is expected to be a list of token lists, one per lyric line.
    detokenizer = Detok()
    detoken_list = []
    for list_item in lyric:
        text = detokenizer.detokenize(list_item)
        detoken_list.append(text)
    return detoken_list
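A quick usage sketch, assuming lyric is a list of token lists (one sub-list per line of the lyric):

lines = [["Hello", ",", "world", "!"], ["How", "are", "you", "?"]]
print(detokenize(lines))
# roughly ['Hello, world!', 'How are you?']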
Example 4
def construct_globals():
    global MATCH_ALPHA_WORD, LOWER, speller, word_set, detokenizer
    # Word pattern and lower-case alphabet covering the Romanian diacritics.
    MATCH_ALPHA_WORD = "[A-Za-zĂÂÎȘȚăâîșț]+"
    LOWER = [chr(i) for i in range(ord('a'), ord('z') + 1)]
    LOWER += list("ăâșîț")
    # Shared Romanian aspell dictionary, word set and detokenizer.
    speller = aspell.Speller('lang', 'ro')
    word_set = set()
    detokenizer = Detok()
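A small sketch of how the globals prepared above might be used afterwards; the imports and the spell-check loop are assumptions for illustration, not part of the original snippet:

import re
import aspell
from nltk.tokenize.treebank import TreebankWordDetokenizer as Detok

construct_globals()
tokens = ["acesta", "este", "un", "exmplu"]  # last token deliberately misspelled
for tok in tokens:
    # Only spell-check tokens made of (Romanian) letters.
    if re.fullmatch(MATCH_ALPHA_WORD, tok) and not speller.check(tok):
        print(tok, "->", speller.suggest(tok)[:3])
print(detokenizer.detokenize(tokens))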
Example 5
    def __init__(self, quotes_resources):
        """Initializes resources for quote reccomendation

        Args:
            quotes_resources (tuple): (model, neural_network,
                captions, vectors)
        """
        self.quotes_resources = quotes_resources
        self.detokenizer = Detok()
Example 6
def detok(input):
    # Re-join whitespace-split tokens and normalize spacing around punctuation.
    tokens = input.split()
    detokenizer = Detok()
    text = detokenizer.detokenize(tokens)
    text = re.sub(r'\s*,\s*', ', ', text)
    text = re.sub(r'\s*\.\s*', '. ', text)
    text = re.sub(r'\s*\?\s*', '? ', text)
    text = text.strip()

    return text
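A usage sketch for detok on a whitespace-tokenized string (the example input is illustrative):

print(detok("Do you want coffee , tea , or juice ?"))
# -> "Do you want coffee, tea, or juice?"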
def remove_NLTK_stop1(all_data):
    sw = set(stopwords.words('english'))
    deto = Detok()

    all_cleaned = list()

    # Drop NLTK English stop words from each article, then detokenize.
    for article in all_data:
        word_tokens = word_tokenize(article)
        all_cleaned.append(
            deto.detokenize([w for w in word_tokens if w not in sw]))

    return all_cleaned
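remove_NLTK_stop1 assumes stopwords, word_tokenize and Detok are imported elsewhere; a minimal self-contained call might look like this (the sample text is made up):

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer as Detok

nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

cleaned = remove_NLTK_stop1(["This is a very simple example sentence."])
print(cleaned[0])  # lower-cased stop words such as "is" and "a" are removed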
Example 8
    def detokenize(tokens):
        '''
        Maps PTB-style escape tokens (brackets, quotes, dashes) back to their
        literal characters, then detokenizes the tokens into a plain string.
        '''
        new_tokens = tokens

        new_tokens = Detokenizer.__box_forward(new_tokens, '-LRB-', '(')
        new_tokens = Detokenizer.__box_forward(new_tokens, '``', '"')

        new_tokens = Detokenizer.__box_backwards(new_tokens, '-RRB-', ')')
        new_tokens = Detokenizer.__box_backwards(new_tokens, '\'\'', '"')

        new_tokens = Detokenizer.__switch(new_tokens, '--', '-')

        detokenizer = Detok()
        return detokenizer.detokenize(new_tokens)
Example 9
    def segment_sent(self, s):
        '''Segments a sentence based on the model's prediction.'''
        tokens = word_tokenize(s)
        y = self.predict(tokens)
        detokenizer = Detok()

        sents = []
        # A 'P' label marks the token after which the sentence is split in two.
        if 'P' in y:
            n = y.index('P') + 1
            sents.append(detokenizer.detokenize(tokens[:n]))
            sents.append(detokenizer.detokenize(tokens[n:]))
        else:
            sents.append(s)
        '''with MosesDetokenizer('en') as detokenize:
            if 'P' in y:
                n = y.index('P') + 1
                sents.append(detokenize(tokens[:n]))
                sents.append(detokenize(tokens[n:]))
            else:
                sents.append(s)'''

        return sents
def Lem_stopwords(my_tick):
    nlp_fr = spacy.load('fr_core_news_md')
    nlp_en = English()
    my_corpus = []
    if isinstance(my_tick[0], list):
        my_corpus.append(my_tick[0])
    else:
        my_corpus = my_tick

    detokenizer = Detok()
    for i in range(len(my_corpus)):
        # Pick the French or English pipeline based on language detection,
        # then keep only the lemmas that are not stop words.
        if detect(my_corpus[i]) == 'fr':
            doc = nlp_fr(my_corpus[i])
            lemmas = [str(token.lemma_) for token in doc
                      if str(token.lemma_) not in fr_stop]
        else:
            doc = nlp_en(my_corpus[i])
            lemmas = [str(token.lemma_) for token in doc
                      if str(token.lemma_) not in en_stop]

        my_corpus[i] = detokenizer.detokenize(lemmas)
        my_corpus[i] = re.sub(' +', ' ', my_corpus[i])
    return my_corpus
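A usage sketch for Lem_stopwords; it assumes detect comes from langdetect and that fr_stop / en_stop are the spaCy French and English stop-word sets imported at module level (the sample sentences are made up):

import re
import spacy
from spacy.lang.en import English
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop
from spacy.lang.en.stop_words import STOP_WORDS as en_stop
from langdetect import detect
from nltk.tokenize.treebank import TreebankWordDetokenizer as Detok

docs = ["Les chats dorment sur le canapé toute la journée.",
        "The cats are sleeping on the couch all day."]
print(Lem_stopwords(docs))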
Example 11
def reconstruct_sentence(sent: List[str], eliminate=None) -> str:
    global detokenizer
    if detokenizer is None:
        detokenizer = Detok()
    if eliminate is not None:
        # Remove every occurrence of each token listed in `eliminate`.
        for el in eliminate:
            while True:
                try:
                    sent.remove(el)
                except ValueError:
                    break

    # Detokenize, then tighten spacing inside quotes and around punctuation.
    text = detokenizer.detokenize(sent)
    text = re.sub(r'(")\s+(.*?)\s+(")', quote_repl, text)
    text = re.sub(r'(«)\s+(.*?)\s+(»)', quote_repl, text)
    text = re.sub(r'(“)\s+(.*?)\s+(”)', quote_repl, text)
    text = re.sub(r'(„)\s+(.*?)\s+(”)', quote_repl, text)
    text = re.sub(r'\s*,\s*', ', ', text)
    text = re.sub(r'(\D)\s*\.\s*$', point_repl, text)
    text = re.sub(r'\s*\?\s*$', '? ', text)
    text = re.sub(r'\s*\-\s*', '-', text)
    text = re.sub(r'\s*\!\s*$', '! ', text)

    return text
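reconstruct_sentence also relies on two regex callbacks, quote_repl and point_repl, that are not part of the snippet; minimal sketches, under the assumption that they simply tighten the captured groups, might look like this:

def quote_repl(match):
    # Hypothetical: rejoin the opening quote, its contents and the closing
    # quote without the surrounding whitespace.
    return match.group(1) + match.group(2) + match.group(3)

def point_repl(match):
    # Hypothetical: attach the sentence-final period directly to the
    # preceding non-digit character.
    return match.group(1) + '.'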
Example 12
"""This code is required for "official_eval" mode in main.py
It provides functions to read a SQuAD json file, use the model to get predicted answers,
and write those answers to another JSON file."""

from __future__ import absolute_import
from __future__ import division

import os
from tqdm import tqdm
import numpy as np
from six.moves import xrange
#import nltk.tokenize
# from nltk.tokenize.moses import MosesDetokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer as Detok
detokenizer = Detok()

from preprocessing.squad_preprocess import data_from_json, tokenize
from vocab import UNK_ID, PAD_ID
from data_batcher import padded, Batch


def readnext(x):
    """x is a list"""
    if len(x) == 0:
        return False
    else:
        return x.pop(0)


def refill_batches(batches, word2id, qn_uuid_data, context_token_data,
Example 13
def detokenize(string):
    # Despite its name, `string` is expected to be a list of tokens.
    detokenizer = Detok()
    temp = detokenizer.detokenize(string)
    return temp
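A usage sketch for this last wrapper, assuming the argument is a list of tokens despite its name:

tokens = ["I", "ca", "n't", "believe", "it", "."]
print(detokenize(tokens))
# roughly "I can't believe it."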