Example #1
    def __init__(self, url, variant="seria"):
        # inputs
        self.url = url
        if variant not in ("seria", "gamberra"):
            raise ValueError(
                'The "variant" parameter must be either "seria" or "gamberra".')
        self.variant = variant
        # load the Spanish model
        self._nlp = es_core_news_md.load()
        # load data
        self._load_data()

        # initialize article
        self.article = Article(self.url)
        # download and parse article
        self.article.download()
        self.article.parse()
        # store basic info
        self.text = self.article.text
        self.source_url = self.article.source_url
        self.recognized_media = self.source_url in self.df_url_media.media_url.tolist()

        if self.recognized_media:
            self.media_name = self.df_url_media.loc[
                self.df_url_media.media_url == self.source_url,
                'media_name'].iloc[0]
        else:
            self.media_name = self.source_url
Example #2
    def _test_biluov_task():
        import es_core_news_md
        from scripts.utils import Sentence

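        # Round trip: encode each sentence's gold entities as BILUOV tags,
        # then decode the tags back into character spans.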
        def forward(tokensxsentence, entitiesxsentence):
            labelsxsentence, _ = to_biluov(tokensxsentence, entitiesxsentence)
            return [
                from_biluov(biluov, sentence, spans=True)
                for biluov, sentence in zip(labelsxsentence, tokensxsentence)
            ]

        training = Collection().load(Path("data/training/scenario.txt"))
        nlp = es_core_news_md.load()

        def per_label(label):
            tokensxsentence = [nlp(s.text) for s in training.sentences]
            entitiesxsentence = [[
                k.spans for k in s.keyphrases if k.label == label
            ] for s in training.sentences]
            decoded = forward(tokensxsentence, entitiesxsentence)
            return decoded

        collection = Collection([Sentence(s.text) for s in training.sentences])
        for label in ENTITIES:
            decoded = per_label(label)
            for entities, sentence in zip(decoded, collection.sentences):
                for spans in entities:
                    keyphrase = Keyphrase(sentence, label, -1, spans)
                    sentence.keyphrases.append(keyphrase)

        collection.fix_ids()
        output = Path(
            "data/submissions/forward-biluov/train/run1/scenario2-taskA/")
        output.mkdir(parents=True, exist_ok=True)
        collection.dump(output / "scenario.txt", skip_empty_sentences=False)
Example #3
    def __init__(self, counter=None, popcounter=None, lemmatizer=None):
        # Avoid mutable default arguments: a Counter created in the signature
        # would be shared across every instance of the class.
        self.__counter = counter if counter is not None else Counter()
        self.__popCounter = popcounter if popcounter is not None else Counter()
        self.lemmatizer = lemmatizer if lemmatizer is not None else WordNetLemmatizer()
        self.punctuation = "¡!\"$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
        self.stopWords = set(
            stopwords.words("spanish") + stopwords.words("english"))
        print("loading spaCy model: es_core_news_md")
        self.nlp = es_core_news_md.load()
Example #4
    def get_entities(self, document: str, language: str = 'english'):
        ''' Takes a document and returns a list of extracted entities '''
        if language == 'spanish':
            try:
                import es_core_news_md
                logger.info("Success importing en_core_web_md")
            except ImportError:
                logger.error("Error importing es_core_news_md")
                sys.exit(-1)
        else:
            try:
                import en_core_web_md
                logger.info("Success importing en_core_web_md")
            except ImportError:
                logger.error("Error importing en_core_web_md")
                sys.exit(-1)

        if isinstance(document, list):
            document = " ".join(document)
        # if language given is not the name of a spacy parser, try to convert it to one
        parser_name = (language if language in LANG_TO_PARSER.values()
                       else LANG_TO_PARSER.get(language.lower()))
        if not parser_name:
            raise Exception('language not supported')

        # if requested parser is not already in memory, try to load from spacy
        if parser_name not in PARSERS:
            try:
                logger.info("Trying to load parser.")
                if language == 'spanish':
                    PARSERS[parser_name] = es_core_news_md.load()
                else:
                    PARSERS[parser_name] = en_core_web_md.load()
                #PARSERS[parser_name] = spacy.load(parser_name)
            except Exception:
                logger.exception("Error loading parser")
                sys.exit(-1)
        else:
            logger.info("Found parser %s in memory" % parser_name)

        def get_ents(doc):
            ''' prep, parse, then extract entities from doc text '''
            doc = prep_text(doc)  # preprocess string
            doc = PARSERS[parser_name](doc)  # parse prepped doc
            ents = set(ent.text for ent in doc.ents
                       if not self.filter_entity(ent))  # extract entities
            return list(ents)

        return get_ents(document)
Example #5
def get_spacy_tokenizer(default_lingo, supported_languages, bigmodel_required):
    '''returns the spacy nlp function corresponding to the language of a document'''
    if default_lingo in supported_languages:
        if not bigmodel_required:
            if default_lingo == "German":
                import de_core_news_sm
                nlp = de_core_news_sm.load()
            elif default_lingo == "English":
                import en_core_web_sm
                nlp = en_core_web_sm.load()
            elif default_lingo == "Spanish":
                import es_core_news_sm
                nlp = es_core_news_sm.load()
            elif default_lingo == "French":
                import fr_core_news_sm
                nlp = fr_core_news_sm.load()
            elif default_lingo == "Portuguese":
                import pt_core_news_sm
                nlp = pt_core_news_sm.load()
            else:
                import it_core_news_sm
                nlp = it_core_news_sm.load()
        else:
            if default_lingo == "German":
                import de_core_news_md
                nlp = de_core_news_md.load()
            elif default_lingo == "English":
                import en_core_web_md
                nlp = en_core_web_md.load()
            elif default_lingo == "Spanish":
                import es_core_news_md
                nlp = es_core_news_md.load()
            elif default_lingo == "French":
                import fr_core_news_md
                nlp = fr_core_news_md.load()
            elif default_lingo == "Portuguese":
                # there is no pt_md model
                import pt_core_news_sm
                nlp = pt_core_news_sm.load()
            else:
                # there is no it_md model
                import it_core_news_sm
                nlp = it_core_news_sm.load()
    else:
        raise ValueError(f"'{default_lingo}' is not a supported language")
    return nlp
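A minimal usage sketch for the loader above, assuming the small Spanish model (es_core_news_sm) is installed; the language list is only illustrative:

supported = ["German", "English", "Spanish", "French", "Portuguese", "Italian"]
nlp = get_spacy_tokenizer("Spanish", supported, bigmodel_required=False)
doc = nlp("El gato duerme en la cocina.")
print([token.text for token in doc])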
Example #6
    def __init__(self):
        """ Initialize the extractor
        """
        # Create the taxonomy from files
        self._create_taxonomy()

        # List of words
        self.__sources, self.__reporters, self.__entities = [], [], []

        # Store the tree with the POS tags, lemmata and grammatical relations
        self._tree = None
        
        # load the language model for spacy
        self.nlp = es_core_news_md.load()
        
        # Maximum character distance allowed between the reporting verb and the
        # recognized entity for the entity to be considered a source
        self._max_dist = 100
Example #7
def classifier_spacy(text, opciones):
    palabras = text.split()

    nlp = es_core_news_md.load()
    stop = nlp.Defaults.stop_words
    rta = ''
    sim_max = 0
    for i, op in enumerate(opciones):
        opcion = nlp(op.lower())
        sim = nlp(text).similarity(opcion)
        print(sim)
        # boost the score when a non-stopword from the text also appears in the option
        for palabra in palabras:
            if palabra not in stop and palabra in op.split():
                sim += 0.3
        print(sim)
        if sim > sim_max:
            print(op)
            sim_max = sim
            rta = str(i + 1)
    if sim_max < 0.30:
        rta = str(0)
    return rta
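A quick usage sketch for the classifier above; the option strings are invented for illustration:

opciones = ["consultar saldo", "pagar factura", "hablar con un agente"]
respuesta = classifier_spacy("quiero pagar mi factura de luz", opciones)
print(respuesta)  # likely "2": the words "pagar" and "factura" boost the second option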
Example #8
import logging
from telegram.ext import CommandHandler
from telegram.ext import MessageHandler, Filters
import telegram
import telebot
import snips_nlu
from snips_nlu import SnipsNLUEngine
from SPARQLWrapper import SPARQLWrapper, JSON
import es_core_news_md
import textacy.datasets as ds
import textacy
import io
import json

snips_nlu.load_resources("es")
nlp = es_core_news_md.load()
mi_bot = telebot.TeleBot("605016230:AAEPuIIJGVVoeHaGbP6iroaYdi3uhkvJXrQ")


@mi_bot.message_handler(commands=['start', 'help'])
def send_welcome(message):
    mi_bot.reply_to(message, "Howdy, how are you doing?")


@mi_bot.message_handler(func=lambda m: True)
def imprimirmensaje(message):
    campo = ""
    predicado = ""
    bandera1 = False
    chatid = message.chat.id
    with io.open('dataset.json') as file:
Example #9
def get_news_features(headline, text):

    nlp = es_core_news_md.load()

    ## headline ##
    headline = re.sub(r"http\S+", "", headline)
    headline = re.sub(r"http", "", headline)
    headline = re.sub(r"@\S+", "", headline)
    headline = re.sub("\n", " ", headline)
    headline = re.sub(r"(?<!\n)\n(?!\n)", " ", headline)
    headline = headline.replace(r"*NUMBER*", "número")
    headline = headline.replace(r"*PHONE*", "número")
    headline = headline.replace(r"*EMAIL*", "email")
    headline = headline.replace(r"*URL*", "url")
    headline_lower = headline.lower()
    doc_h = nlp(headline_lower)

    list_tokens_h = []
    list_tags_h = []

    for sentence_h in doc_h.sents:
        for token in sentence_h:
            list_tokens_h.append(token.text)

    fdist_h = FreqDist(list_tokens_h)
    syllables_h = get_nsyllables(headline)
    words_h = len(list_tokens_h)

    # headline complexity features
    avg_word_size_h = round(
        sum(len(word) for word in list_tokens_h) / words_h, 2)
    avg_syllables_word_h = round(syllables_h / words_h, 2)
    unique_words_h = round((len(fdist_h.hapaxes()) / words_h) * 100, 2)
    mltd_h = round(ld.mtld(list_tokens_h), 2)
    ttr_h = round(ld.ttr(list_tokens_h) * 100, 2)

    ## text content##
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"http", "", text)
    text = re.sub("\n", " ", text)
    text = text.replace(r"*NUMBER*", "número")
    text = text.replace(r"*PHONE*", "número")
    text = text.replace(r"*EMAIL*", "email")
    text = text.replace(r"*URL*", "url")

    # to later calculate upper case letters ratio
    alph = list(filter(str.isalpha, text))
    text_lower = text.lower()
    doc = nlp(text_lower)

    list_tokens = []
    list_pos = []
    list_tag = []
    list_entities = []
    sents = 0

    for entity in doc.ents:
        list_entities.append(entity.label_)

    for sentence in doc.sents:
        sents += 1
        for token in sentence:
            list_tokens.append(token.text)
            list_pos.append(token.pos_)
            list_tag.append(token.tag_)

    # Calculate entities, pos, tag, freq, syllables, words and quotes
    entities = len(list_entities)
    n_pos = nltk.Counter(list_pos)
    n_tag = nltk.Counter(list_tag)
    fdist = FreqDist(list_tokens)
    syllables = get_nsyllables(text)
    words = len(list_tokens)
    quotes = n_tag['PUNCT__PunctType=Quot']

    # complexity features
    avg_word_sentence = round(words / sents, 2)
    avg_word_size = round(sum(len(word) for word in list_tokens) / words, 2)
    avg_syllables_word = round(syllables / words, 2)
    unique_words = round((len(fdist.hapaxes()) / words) * 100, 2)
    ttr = round(ld.ttr(list_tokens) * 100, 2)

    # readability spanish test
    huerta_score = round(
        206.84 - (60 * avg_syllables_word) - (1.02 * avg_word_sentence), 2)
    szigriszt_score = round(
        206.835 - ((62.3 * syllables) / words) - (words / sents), 2)

    # stylometric features
    mltd = round(ld.mtld(list_tokens), 2)
    upper_case_ratio = round(sum(map(str.isupper, alph)) / len(alph) * 100, 2)
    entity_ratio = round((entities / words) * 100, 2)
    quotes_ratio = round((quotes / words) * 100, 2)
    propn_ratio = round((n_pos['PROPN'] / words) * 100, 2)
    noun_ratio = round((n_pos['NOUN'] / words) * 100, 2)
    pron_ratio = round((n_pos['PRON'] / words) * 100, 2)
    adp_ratio = round((n_pos['ADP'] / words) * 100, 2)
    det_ratio = round((n_pos['DET'] / words) * 100, 2)
    punct_ratio = round((n_pos['PUNCT'] / words) * 100, 2)
    verb_ratio = round((n_pos['VERB'] / words) * 100, 2)
    adv_ratio = round((n_pos['ADV'] / words) * 100, 2)
    sym_ratio = round((n_tag['SYM'] / words) * 100, 2)

    # create df_features
    df_features = pd.DataFrame({
        'text': text_lower,
        'headline': headline_lower,
        'words_h': words_h,
        'word_size_h': [avg_word_size_h],
        'avg_syllables_word_h': [avg_syllables_word_h],
        'unique_words_h': [unique_words_h],
        'ttr_h': ttr_h,
        'mltd_h': [mltd_h],
        'sents': sents,
        'words': words,
        'avg_words_sent': [avg_word_sentence],
        'avg_word_size': [avg_word_size],
        'avg_syllables_word': avg_syllables_word,
        'unique_words': [unique_words],
        'ttr': [ttr],
        'huerta_score': [huerta_score],
        'szigriszt_score': [szigriszt_score],
        'mltd': [mltd],
        'upper_case_ratio': [upper_case_ratio],
        'entity_ratio': [entity_ratio],
        'quotes': quotes,
        'quotes_ratio': [quotes_ratio],
        'propn_ratio': [propn_ratio],
        'noun_ratio': [noun_ratio],
        'pron_ratio': [pron_ratio],
        'adp_ratio': [adp_ratio],
        'det_ratio': [det_ratio],
        'punct_ratio': [punct_ratio],
        'verb_ratio': [verb_ratio],
        'adv_ratio': [adv_ratio],
        'sym_ratio': [sym_ratio]
    })

    return df_features
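A usage sketch for the feature extractor above. The inputs are invented, and the module-level dependencies of the original project (re, nltk, pandas as pd, FreqDist, the lexical-diversity helper ld and get_nsyllables) are assumed to be imported:

headline = "El gobierno anuncia nuevas medidas económicas"
cuerpo = ("El gobierno presentó hoy un paquete de medidas. "
          "Los analistas dudan de su efecto a corto plazo.")
df = get_news_features(headline, cuerpo)
print(df[["words_h", "words", "huerta_score", "ttr"]])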
Example #10
def tokenizador_spacy_lemma(texto, nlp=es_core_news_md.load()):
    return [token.lemma_ for token in nlp(texto)]
Example #11
def tokenizador_spacy_simple(texto, nlp=es_core_news_md.load()):
    return [token.text for token in nlp(texto)]
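Note that a default argument such as nlp=es_core_news_md.load() is evaluated once, when the function is defined, so the model is loaded as soon as the module is imported even if the tokenizer is never called. A sketch of a lazier variant (the cache variable and function name are made up):

_NLP = None

def tokenizador_spacy_lemma_lazy(texto):
    # load the Spanish model on first use and reuse it on later calls
    global _NLP
    if _NLP is None:
        import es_core_news_md
        _NLP = es_core_news_md.load()
    return [token.lemma_ for token in _NLP(texto)]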
Example #12
    def _load_nlp(self):
        if SpacyMixin.nlp is None:
            import es_core_news_md
            print("NLP is not yet loaded")
            SpacyMixin.nlp = es_core_news_md.load()
            print("NLP loaded")

    def __init__(self):
        self.nlp_model = es_core_news_md.load()
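The _load_nlp method above caches the model on the class so it is loaded only once. A minimal sketch of how such a mixin could be laid out (only the caching logic comes from the example; the class body and helper method are assumptions):

class SpacyMixin:
    # shared, lazily loaded spaCy pipeline for every class using the mixin
    nlp = None

    def _load_nlp(self):
        if SpacyMixin.nlp is None:
            import es_core_news_md
            SpacyMixin.nlp = es_core_news_md.load()

    def lemmatize(self, texto):
        # hypothetical helper: lemmatize a Spanish string with the shared pipeline
        self._load_nlp()
        return [token.lemma_ for token in SpacyMixin.nlp(texto)]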