def test_issue3803():
    """Test that spanish num-like tokens have True for like_num attribute."""
    nlp = Spanish()
    text = "2 dos 1000 mil 12 doce"
    doc = nlp(text)

    assert [t.like_num for t in doc] == [True, True, True, True, True, True]
Example #2
    def init_resources(self):
        self.punctuation_pattern = re.compile("|".join(PUNCTUATION))
        self.stemmer = None
        stopwords_path = os.path.join(
            os.path.dirname(assistant_dialog_skill_analysis.__file__),
            "resources",
            self.language_code,
            "stopwords",
        )
        if self.language_code == "en":
            from spacy.lang.en import English

            self.tokenizer = Tokenizer(English().vocab)
            self.stemmer = SnowballStemmer(language="english")
            self.stop_words = self.load_stop_words(stopwords_path)

        elif self.language_code == "fr":
            from spacy.lang.fr import French

            self.tokenizer = Tokenizer(French().vocab)
            self.stemmer = SnowballStemmer(language="french")
            self.stop_words = self.load_stop_words(stopwords_path)

        elif self.language_code == "de":
            from spacy.lang.de import German

            self.tokenizer = Tokenizer(German().vocab)
            self.stemmer = SnowballStemmer(language="german")
            self.stop_words = self.load_stop_words(stopwords_path)

        elif self.language_code == "it":
            from spacy.lang.it import Italian

            self.tokenizer = Tokenizer(Italian().vocab)
            self.stemmer = SnowballStemmer(language="italian")
            self.stop_words = self.load_stop_words(stopwords_path)

        elif self.language_code == "cs":
            from spacy.lang.cs import Czech

            self.tokenizer = Tokenizer(Czech().vocab)
            self.stop_words = self.load_stop_words(stopwords_path)

        elif self.language_code == "pt":
            from spacy.lang.pt import Portuguese

            self.tokenizer = Tokenizer(Portuguese().vocab)
            self.stemmer = SnowballStemmer(language="portuguese")
            self.stop_words = self.load_stop_words(stopwords_path)

        elif self.language_code == "es":
            from spacy.lang.es import Spanish

            self.tokenizer = Tokenizer(Spanish().vocab)
            self.stemmer = SnowballStemmer(language="spanish")
            self.stop_words = self.load_stop_words(stopwords_path)
        else:
            raise Exception("language code %s is not supported",
                            self.language_code)
Example #3
def tokenize(text):
    nlp = Spanish()
    doc = nlp(text)
    token_list = []
    for token in doc:
        token_list.append(token.text)
    return token_list
Example #4
            def spacy_tokenizer(sentence):
                parser = Spanish()
                tokens = parser(sentence)
                filtered_tokens = []
                for word in tokens:
                    lemma = word.lemma_.lower().strip()
                    if lemma not in STOP_WORDS and re.search(
                            '^[a-zA-Z]+$', lemma):
                        filtered_tokens.append(lemma)

                return filtered_tokens
Example #5
def preprocess_test(df):
    # Spacy Tokenizers
    nlp_es = Spanish()
    nlp_pt = Portuguese()
    # Spanish and Portuguese masks to use corresponding language tokenizer
    mask_spanish    = df["language"] == 'spanish'
    mask_portuguese = df["language"] == 'portuguese'
    df.loc[mask_spanish, "tokens"] = df["title"].apply(normalize_text,args=(nlp_es,))
    df.loc[mask_portuguese, "tokens"] = df["title"].apply(normalize_text,args=(nlp_pt,))
    # Test file only needs id and tokens
    return df
Example #6
def main():
    nlp = English()
    doc = nlp("This is a sentence.")
    print(doc.text)

    nlp = German()
    doc = nlp('Liebe Grüße!')
    print(doc.text)

    nlp = Spanish()
    doc = nlp('¿Cómo estás?')
    print(doc.text)
Example #7
def preprocess(df):
    # Spacy Tokenizers
    nlp_es = Spanish()
    nlp_pt = Portuguese()
    # Spanish and Portuguese masks to use corresponding language tokenizer
    mask_spanish    = df["language"] == 'spanish'
    mask_portuguese = df["language"] == 'portuguese'
    df.loc[mask_spanish, "tokens"] = df["title"].apply(normalize_text,args=(nlp_es,))
    df.loc[mask_portuguese, "tokens"] = df["title"].apply(normalize_text,args=(nlp_pt,))
    # Training and validation df need to have __label__ string before category 
    df["label"] = df["category"].apply(lambda x: '__label__'+ x)
    return df
    def stopwords(self, text):
        try:
            nlp = Spanish() if self.lang == 'es' else English()
            doc = nlp(text)
            token_list = [token.text for token in doc]
            sentence = []
            for word in token_list:
                lexeme = nlp.vocab[word]
                if not lexeme.is_stop:
                    sentence.append(word)
            return ' '.join(sentence)
        except Exception as e:
            Util.standard_error(sys.exc_info())
            print('Error stopwords: {0}'.format(e))
            return None
Example #9
def preprocess(text):
    # Tokenize, remove stopwords, numbers, empty spaces and punctuation, and lemmatize
    tokenized = []
    nlp = Spanish()
    doc = nlp(text)
    token_list = []
    # Tokenize
    for token in doc:
        # Remove stopwords, numbers, empty spaces and punctuation, then lemmatize
        if (token.text not in nlp.Defaults.stop_words
                and token.text not in string.punctuation
                and token.text.isalpha()):
            token_list.append(token.lemma_)
    tokenized.append(token_list)
    return tokenized
def lang_change(language):
    if language == 'en':
        from spacy.lang.en import English
        from spacy.lang.en.stop_words import STOP_WORDS
        parser = English()
        file = r"\config_files\config_spacy_en.yaml"
        configfile_path = os.getcwd() + file
    elif language == 'de':
        from spacy.lang.de import German
        from spacy.lang.de.stop_words import STOP_WORDS
        parser = German()
        file = r"\config_files\config_spacy_de.yaml"
        configfile_path = os.getcwd() + file
    elif language == 'es':
        from spacy.lang.es import Spanish
        from spacy.lang.es.stop_words import STOP_WORDS
        parser = Spanish()
        file = r"\config_files\config_spacy_es.yaml"
        configfile_path = os.getcwd() + file
    elif language == 'pt':
        from spacy.lang.pt import Portuguese
        from spacy.lang.pt.stop_words import STOP_WORDS
        parser = Portuguese()
        file = r"\config_files\config_spacy_pt.yaml"
        configfile_path = os.getcwd() + file
    elif language == 'fr':
        from spacy.lang.fr import French
        from spacy.lang.fr.stop_words import STOP_WORDS
        parser = French()
        file = r"\config_files\config_spacy_fr.yaml"
        configfile_path = os.getcwd() + file
    elif language == 'it':
        from spacy.lang.it import Italian
        from spacy.lang.it.stop_words import STOP_WORDS
        parser = Italian()
        file = r"\config_files\config_spacy_it.yaml"
        configfile_path = os.getcwd() + file
    elif language == 'nl':
        from spacy.lang.nl import Dutch
        from spacy.lang.nl.stop_words import STOP_WORDS
        parser = Dutch()
        file = r"\config_files\config_spacy_nl.yaml"
        configfile_path = os.getcwd() + file
    else:
        raise ValueError("language code %s is not supported" % language)

    return parser, STOP_WORDS, configfile_path
def tokenize(text):
    '''
    Tokenize a string in Spanish
    Parameters
    ----------
    text : str
        Spanish text string to tokenize.
    Returns
    -------
    tokenized : list
        List of tokens (includes punctuation tokens).
    '''
    nlp = Spanish()
    doc = nlp(text)
    token_list = []
    for token in doc:
        token_list.append(token.text)
    return token_list
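A brief usage note for the tokenize function above (assuming `from spacy.lang.es import Spanish` is in scope, as in the other snippets on this page); the token boundaries in the comment are indicative of what the blank Spanish tokenizer produces, not a guaranteed output:

tokens = tokenize("Me gusta el café, ¿y a ti?")
print(tokens)
# Indicatively: ['Me', 'gusta', 'el', 'café', ',', '¿', 'y', 'a', 'ti', '?']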
Example #12
def spacy_tokenizer(sentence):
    parser = Spanish()
    spacy_stopwords = spacy.lang.es.stop_words.STOP_WORDS
    STOPWORDS = list(spacy_stopwords)
    STOPWORDS.extend(('y', 'a', 'u', 'o', 'e', 'quiero'))
    tokens = parser(sentence)
    filtered_tokens = []
    for word in tokens:
        lemma = word.lemma_.lower().strip()
        # Strip accents and ñ so the ASCII-only regex below still matches
        lemma = re.sub("á", "a", lemma)
        lemma = re.sub("é", "e", lemma)
        lemma = re.sub("í", "i", lemma)
        lemma = re.sub("ó", "o", lemma)
        lemma = re.sub("ú", "u", lemma)
        lemma = re.sub("ñ", "n", lemma)
        if lemma not in STOPWORDS and re.search('^[a-zA-Z]+$', lemma):
            filtered_tokens.append(lemma)
    return filtered_tokens
Example #13
def tokenize(document, language, punctuation):
    if language == 'fr':
        nlp = French()
    if language == 'de':
        nlp = German()
    if language == 'en':
        nlp = English()
    if language == 'es':
        nlp = Spanish()
    sentencizer = nlp.create_pipe("sentencizer")
    nlp.add_pipe(sentencizer)
    doc = nlp(document)
    if punctuation:
        sentences = [[str(word) for word in sent if str(word) != '\n']
                     for sent in doc.sents]
    else:
        sentences = [[
            str(word) for word in sent
            if ((str(word) != '\n') and (str(word).isalpha()))
        ] for sent in doc.sents]
    return sentences
Example #14
    def get_nlp(self, language):

        """"
        this method returns the corresponding spacy language model when 
        provided with a language. To do so it also does the required 
        import. This is certainly not the standard approach. 
        But as this endpoint will be deployed to Heroku (space limitation)
        and only be invoked rarely it is the fastest approach.
        """

        if language == "en":

            from spacy.lang.en import English
            return English()

        elif language == "fr":

            from spacy.lang.fr import French
            return French()

        elif language == "de":

            from spacy.lang.de import German
            return German()

        elif language == "es":

            from spacy.lang.es import Spanish
            return Spanish()

        elif language == "pt":

            from spacy.lang.pt import Portuguese
            return Portuguese()

        else:

            return {"error": "invalid or not supported language entered"}
import json
from spacy.matcher import Matcher
from spacy.lang.es import Spanish

with open("exercises/es/adidas.json", encoding="utf8") as f:
    TEXTS = json.loads(f.read())

nlp = Spanish()
matcher = Matcher(nlp.vocab)

# Two tokens whose lowercased text matches "adidas" and "zx"
pattern1 = [{"LOWER": "adidas"}, {"LOWER": "zx"}]

# A token whose lowercased text matches "adidas", followed by a digit token
pattern2 = [{"LOWER": "adidas"}, {"IS_DIGIT": True}]

# Add the patterns to the matcher and check the result
matcher.add("ROPA", None, pattern1, pattern2)
for doc in nlp.pipe(TEXTS):
    print([doc[start:end] for match_id, start, end in matcher(doc)])
def complete_text_analysis(text, raw_entities):
    start_time = time()
    date = datetime.today().strftime("%Y_%m_%d-%H_%M_%S")
    path_to_file = date + " - SingleCompleteTextAnalysis_Performance.txt"
    p_file = codecs.open(path_to_file, encoding='utf-8', mode='a')
    p_file.write(date +
                 " Single Complete Text Analysis Test - Local Execution" +
                 "\n")
    p_file.flush()
    # II. Prepare data
    p_file.write("Preparing initial data ... " + "\n")
    path_to_configuration = food_detection_root.ROOT_DIR + os.path.sep + 'configuration' + os.path.sep \
                            + 'configuration.ini'
    config = ConfigParser(interpolation=ExtendedInterpolation())
    config.read_file(codecs.open(path_to_configuration, "r", "utf8"))
    # 01. Read emojis
    path = food_detection_root.ROOT_DIR + os.path.sep + 'data' + os.path.sep
    unicode_emoji_list_file = codecs.open(path +
                                          "list - unicode_emojis_metadata.txt",
                                          encoding='utf-8')
    emoji_list = unicode_emoji_list_file.read().splitlines()
    unicode_emoji_list_file.close()
    aux_emojis_dict = {}
    emojis_dict = {}
    for aux in emoji_list:
        aux_emoji = aux.split('\t')
        aux_emojis_dict[aux_emoji[1]] = [aux_emoji[2], aux_emoji[3]]
        emojis_dict[aux_emoji[2]] = {
            'emoji_id': aux_emoji[0],
            'unicode': aux_emoji[1],
            'name': aux_emoji[3],
            'polarity': float(aux_emoji[4]),
            'happiness': float(aux_emoji[5]),
            'anger': float(aux_emoji[6]),
            'fear': float(aux_emoji[7]),
            'replusion': float(aux_emoji[8]),
            'surprise': float(aux_emoji[9]),
            'sadness': float(aux_emoji[10]),
            'interest': aux_emoji[11]
        }
    sorted_aux_emojis_list = sorted(aux_emojis_dict.keys(),
                                    key=len,
                                    reverse=True)
    emojis_list = list()
    for aux_emoji in sorted_aux_emojis_list:
        emojis_list.append(aux_emojis_dict[aux_emoji][0])
    # print(emojis_list)
    # 02. Read complementary characters
    complementary_characters_list_file = codecs.open(
        path + "list - complementary_characters.txt", encoding='utf-8')
    complementary_characters_list = complementary_characters_list_file.read(
    ).splitlines()
    complementary_characters_list_file.close()
    complementary_characters_dict = {}
    for aux in complementary_characters_list:
        aux_char = aux.split('\t')
        complementary_characters_dict[aux_char[2]] = [aux_char[1], aux_char[3]]
    # print(complementary_characters_dict)
    # 03. Read emoticons patterns
    text_type = 'Twitter'
    emotions = ast.literal_eval(config.get(text_type, 'emotions'))
    emoticons_metadata = ast.literal_eval(
        config.get(text_type, 'emoticons_metadata'))
    emotions_polarity = ast.literal_eval(
        config.get(text_type, 'emotions_polarity'))
    # 04. Configure Google_Universal_POS_Tags
    tags = config.options("Google_Universal_POS_Tags")
    google_universal_tags = {}
    for tag in tags:
        google_universal_tags[tag.upper()] = config.get(
            'Google_Universal_POS_Tags', tag)
    # 05. Read special characters (#, @, https, etc.)
    special_characters = ast.literal_eval(
        config.get('TextAnalysis', 'special_characters'))
    additional_symbols = ast.literal_eval(
        config.get('TextAnalysis', 'additional_symbols'))
    variation_selectors = ast.literal_eval(
        config.get('TextAnalysis', 'variation_selectors'))
    # 06. Configure Spanish POS tagger
    nlp = Spanish()
    tag_map = spacy.lang.es.TAG_MAP
    emoticons = []
    emojis = []
    complementary_characters = []
    texts = []
    emojis_count = 0
    emoticon_count = 0
    complementary_characters_count = 0
    original_text = text.replace('\n', ' ')
    results = identify_special_characters(
        original_text, raw_entities, nlp, tag_map, emotions,
        emoticons_metadata, emotions_polarity, emojis_dict, emojis_list,
        variation_selectors, complementary_characters_dict, emoticon_count,
        emojis_count, complementary_characters_count)

    spaced_text = results[0]
    final_clean_text = results[1]
    emoticons += copy.deepcopy(results[2])
    emojis += copy.deepcopy(results[3])
    complementary_characters += copy.deepcopy(results[4])
    emoticon_count = results[5]
    emojis_count = results[6]
    complementary_characters_count = results[7]
    special_entities = results[8]
    execution_time = time() - start_time
    p_file.write("Execution time: " + str(timedelta(seconds=execution_time)) +
                 "\n")
    texts.append(spaced_text + '\t' + final_clean_text + '\t' +
                 str(special_entities))
    p_file.write("Texts with: " + "\n")
    for text in texts:
        p_file.write(text + "\n")
    p_file.flush()
    p_file.close()
              if len(x) > 3]  # Keep only words longer than 3 letters.
    tokens = [stemmer.stem(token) for token in tokens]  # Stem words.
    tokens = [x for x in tokens if x not in to_avoid]

    return ' '.join(tokens)


if '__main__' == __name__:

    stemmer = SnowballStemmer('spanish')
    sys.setrecursionlimit(10000)

    cwd = os.getcwd()

    stop_words = get_stop_words('es')
    parser = Spanish()

    to_avoid = read_as_list('to_avoid.txt', 'latin-1')

    my_sheet = 'Sheet1'
    file_name = 'Proposals - PAM - Spanish.xlsx'  # name of your excel file
    df = read_excel(file_name, sheet_name=my_sheet)
    df = df[df['category/name/se'] == 'Sanidad y salud']

    txt = list(df['body'])

    text = [filter_vocabulary(txt, 0.01)][0]
    text = [
        prepare_text_for_ML(x, stop_words, parser, stemmer, to_avoid)
        for x in text
    ]
Example #18
def getSentences(text):
    nlp = Spanish()
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
    document = nlp(text)
    return [sent.string.strip() for sent in document.sents]
Example #19
from rrec.model.reddit_recommender import RedditRecommender

# Spacy
from spacy.lang.en import English
from spacy.lang.es import Spanish
from spacy.lang.fr import French
from spacy.lang.zh import Chinese
from spacy.lang.ru import Russian
from spacy.lang.ar import Arabic
from spacy.lang.de import German
from spacy.lang.uk import Ukrainian
from spacy.lang.ro import Romanian

lang_id_to_spacy = {
    'en': English(),
    'es': Spanish(),
    'fr': French(),
    'zh-cn': Chinese(),
    'ru': Russian(),
    'ar': Arabic(),
    'de': German(),
    'uk': Ukrainian(),
    'ro': Romanian()
}

#####################
### Globals
#####################

reddit = Reddit(client_id='OFsSWAsbFrzLpg',
                client_secret='tRReu7VAAyxgEXbGqaE19_OUrR4',
    def __init__(self):
        self.nlp_english = English()
        self.nlp_spanish = Spanish()
Example #21
# Constants - Hyperparameters
interactions_scores_dict = {
    'VIEW': 1,
    'BOOKMARK': 2,
    'FOLLOW': 3,
    'LIKE': 4,
    'COMMENT CREATED': 5
}

# Global objects
interactions_df = pd.read_csv('interactions.csv')
articles_df = pd.read_csv('articles.csv')
person_le = preprocessing.LabelEncoder()
tokens_le = preprocessing.LabelEncoder()
hidden_dimensions = 250
language_objects = {"en": English(), "pt": Portuguese(), "es": Spanish()}
tokenizers = {}
summaries = {}
filter_regex = "[^A-Za-z0-9]+"
batch_size = 10000
max_iterations = 100000
l2_lambda = 0.001

# We summarize each article with Spacy's TextRank implementation. This eliminates most of the noisy information
# in the texts. Then we apply tf-idf analysis to the article summaries. For every unique token in the obtained corpus
# of summaries, we calculate the expected tf-idf score over all articles. Then we sort the tokens in descending order
# of their expected tf-idf scores. The first 5000 tokens will constitute the representing tokens of our article corpus.


def create_article_tokens():
    def identity_tokenizer(text):
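The comment block above outlines the token-selection step: score every unique token by its expected tf-idf over all article summaries and keep the highest-scoring 5000. A minimal sketch of that ranking with scikit-learn, assuming the summaries are already tokenized lists of strings (the function and variable names here are illustrative, not from the original code):

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

def identity_tokenizer(text):
    # The summaries are assumed to be pre-tokenized lists of strings.
    return text

def select_top_tokens(tokenized_summaries, vocab_size=5000):
    vectorizer = TfidfVectorizer(tokenizer=identity_tokenizer,
                                 preprocessor=identity_tokenizer,
                                 token_pattern=None, lowercase=False)
    tfidf = vectorizer.fit_transform(tokenized_summaries)
    # Expected tf-idf score of each token across all article summaries.
    mean_scores = np.asarray(tfidf.mean(axis=0)).ravel()
    top = np.argsort(mean_scores)[::-1][:vocab_size]
    return [vectorizer.get_feature_names_out()[i] for i in top]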
Example #22
import json
from spacy.lang.es import Spanish
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

with open("exercises/es/countries.json", encoding="utf8") as f:
    COUNTRIES = json.loads(f.read())

with open("exercises/es/capitals.json", encoding="utf8") as f:
    CAPITALS = json.loads(f.read())

nlp = Spanish()
matcher = PhraseMatcher(nlp.vocab)
matcher.add("COUNTRY", None, *list(nlp.pipe(COUNTRIES)))


def countries_component(doc):
    # Create an entity Span with the label "LOC" for all the matches
    matches = matcher(doc)
    doc.ents = [
        Span(doc, start, end, label="LOC") for match_id, start, end in matches
    ]
    return doc


# Add the component to the pipeline
nlp.add_pipe(countries_component)
print(nlp.pipe_names)

# The getter that looks up the span text in a dictionary of
# country capital cities
import spacy
from spacy.matcher import PhraseMatcher
from spacy_lookup import Entity
from spacy.lang.es import Spanish

nlp = Spanish()
entity = Entity(nlp,
                keywords_list=['pera en Dulce', 'manzana', 'tentacion'],
                label='FOOD')
nlp.add_pipe(entity, name='Food')
entity2 = Entity(nlp, keywords_list=['#mora'], label='FOOD_HASHTAGS')
nlp.add_pipe(entity2, name='FoodHashtags')
text = "Me gustan mucho la manzana y tambien la pera en dulce en salsa de #mora. También me gusta la paleta tentación."
doc = nlp(text)
for e in doc:
    print(e.text, e._.is_entity, e.ent_type_)
Example #24
from spacy.lang.en import English
from spacy.lang.es import Spanish
from spacy.lang.fr import French
from spacy.lang.it import Italian
from spacy.lang.de import German
from spacy.lang.ru import Russian
from spacy.lang.zh import Chinese
from spacy.lang.ja import Japanese
from spacy.lang.ca import Catalan
from spacy.lang.eu import Basque

from DataHandler import load_df_twitter_sent, load_df_lorelei
from util import clean_str as test_clean_str
from nltk.corpus import stopwords
from util import identity_fn, lang2id

language_dict = {
    'english': English(),
    'spanish': Spanish(),
    'french': French(),
    'italian': Italian(),
    'german': German(),
    'russian': Russian(),
    'chinese': Chinese(),
    'japanese': Japanese(),
    'catalan': Catalan(),
    'basque': Basque(),
}


class Tokenizer:
    def __init__(self,
                 language,
                 tokenizer_method='spacy',
from spacy.lang.es import Spanish

nlp = Spanish()

# Import the Doc and Span classes
from spacy.____ import ____, ____

words = ["Me", "gusta", "David", "Bowie"]
spaces = [True, True, True, False]

# Create a doc from the words and spaces
doc = ____(____, ____, ____)
print(doc.text)

# Create a span for "David Bowie" from the doc and assign it the label "PERSON"
span = ____(____, ____, ____, label=____)
print(span.text, span.label_)

# Add the span to the doc's entities
____.____ = [____]

# Print the text and labels of the doc's entities
print([(ent.text, ent.label_) for ent in doc.ents])
import json
from spacy.lang.es import Spanish
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

with open("exercises/es/countries.json", encoding="utf8") as f:
    COUNTRIES = json.loads(f.read())

with open("exercises/es/capitals.json", encoding="utf8") as f:
    CAPITALS = json.loads(f.read())

nlp = Spanish()
matcher = PhraseMatcher(nlp.vocab)
matcher.add("COUNTRY", None, *list(nlp.pipe(COUNTRIES)))


def countries_component(doc):
    # Create an entity Span with the label "LOC" for all the matches
    matches = matcher(doc)
    doc.ents = [
        ____(____, ____, ____, label=____) for match_id, start, end in matches
    ]
    return doc


# Add the component to the pipeline
____.____(____)
print(nlp.pipe_names)

# The getter that looks up the span text in a dictionary of
# country capital cities
Example #27
# Import the language classes
from spacy.lang.en import English
from spacy.lang.de import German
from spacy.lang.es import Spanish

# Create the nlp object
nlp = Spanish() # or English() or German()

# Process a text (this is Spanish for: "How are you?")
doc = nlp("¿Cómo estás?")

# Print the document text
print(doc.text)
#  '¿Cómo estás?'
Example #28
def simple_identification():
    client_from = MongoClient()
    db_from = client_from["SSD"]
    coll_from = db_from["raw_data"]
    start_time = time()
    date = datetime.today().strftime("%Y_%m_%d-%H_%M_%S")
    path_to_file = date + " - DetectRegexSpecialEntitiesRawData_Performance.txt"
    p_file = codecs.open(path_to_file, encoding='utf-8', mode='w')
    p_file.write(
        date +
        " Detecting Special Entities with Regex Expression Test - Local Execution"
        + "\n")
    p_file.flush()
    # II. Prepare data
    p_file.write("Preparing initial data ... " + "\n")
    path_to_configuration = food_detection_root.ROOT_DIR + os.path.sep + 'configuration' + os.path.sep \
                            + 'configuration.ini'
    config = ConfigParser(interpolation=ExtendedInterpolation())
    config.read_file(codecs.open(path_to_configuration, "r", "utf8"))
    # print(emoticons_dict)
    # 3. Configure Spanish POS tagger
    spanish_pipeline = Spanish()
    all_from_tweets = coll_from.find()
    count = 0
    stop = 100
    p_file.write("Total data to process: " + str(stop) + "\n")
    emoticons = []
    text_type = 'Twitter'
    emotions = ast.literal_eval(config.get(text_type, 'emotions'))
    emoticons_metadata = ast.literal_eval(
        config.get(text_type, 'emoticons_metadata'))
    emotions_polarity = ast.literal_eval(
        config.get(text_type, 'emotions_polarity'))
    texts = []
    no_texts = []
    emoticon_count = 0
    for raw_data in all_from_tweets:
        if 'text' in raw_data.keys() and 'lang' in raw_data.keys():
            if "place" in raw_data.keys():
                place = raw_data["place"]
                if place is not None:
                    if "country_code" in place.keys():
                        raw_data_country_code = raw_data["place"][
                            "country_code"]
                        if raw_data_country_code in ["CO"]:
                            lang = raw_data["lang"]
                            original_text = raw_data['text']
                            raw_entities = raw_data['entities']
                            original_text = original_text.replace('\n', ' ')
                            if lang == 'es':
                                results = identify_special_entities(
                                    original_text, raw_entities,
                                    spanish_pipeline, emoticon_count, emotions,
                                    emoticons_metadata, emotions_polarity)
                                text = results[0]
                                clean_text = results[1]
                                emoticon_count = results[2]
                                special_entities = results[3]
                                emoticons += copy.deepcopy(results[4])
                                if len(results[4]) != 0:
                                    texts.append(original_text + '\t' + text +
                                                 '\t' + clean_text + '\t' +
                                                 str(special_entities))
                                else:
                                    no_texts.append(original_text + '\t' +
                                                    text + '\t' + clean_text)
                                count += 1
                            else:
                                if len(original_text) >= 3:
                                    blob = TextBlob(original_text)
                                    detection = True
                                    detected_language = ''
                                    while detection:
                                        try:
                                            detected_language = blob.detect_language(
                                            )
                                            detection = False
                                        except Exception:
                                            print(
                                                'error while getting detected language'
                                            )
                                    # print(detected_language)
                                    if detected_language == 'es':
                                        results = identify_special_entities(
                                            original_text, raw_entities,
                                            spanish_pipeline, emoticon_count,
                                            emotions, emoticons_metadata,
                                            emotions_polarity)
                                        text = results[0]
                                        clean_text = results[1]
                                        emoticon_count = results[2]
                                        special_entities = results[3]
                                        emoticons += copy.deepcopy(results[4])
                                        if len(results[4]) != 0:
                                            texts.append(original_text + '\t' +
                                                         text + '\t' +
                                                         clean_text + '\t' +
                                                         str(special_entities))
                                        else:
                                            no_texts.append(original_text +
                                                            '\t' + text +
                                                            '\t' + clean_text)
                                        count += 1
                            print(count)
                            print(emoticon_count)
                            if emoticon_count >= stop:
                                break
    all_from_tweets.close()
    client_from.close()
    p_file.write("Emoticons " + str(len(emoticons)) + "\n")
    emoticons_counter = Counter(emoticons).most_common()
    emoticons_counter_sorted = sorted(emoticons_counter,
                                      key=lambda tup: tup[1])
    for emoticon in emoticons_counter_sorted:
        p_file.write(str(emoticon[0]) + "\t" + str(emoticon[1]) + "\n")
    p_file.write("Total Emoticons: " + str(emoticon_count) + ". Total data: " +
                 str(count) + ". Proportion: " + str(emoticon_count / count) +
                 "\n")
    p_file.write("TEXTS WITH EMOTICONS: \n")
    for text in texts:
        p_file.write(text + "\n")
    p_file.write("TEXTS WITHOUT EMOTICONS: \n")
    for text in no_texts:
        p_file.write(text + "\n")
    p_file.write("Total elements in new list: " + str(count) + "\n")
    execution_time = time() - start_time
    p_file.write("Execution time: " + str(timedelta(seconds=execution_time)) +
                 "\n")
    p_file.flush()
    p_file.close()
from spacy.lang.es import Spanish

nlp = Spanish()

people = ["David Bowie", "Angela Merkel", "Lady Gaga"]

# Create a list of patterns for the PhraseMatcher
patterns = list(nlp.pipe(people))
Example #30
import json
from spacy.lang.es import Spanish

with open("exercises/es/countries.json", encoding="utf8") as f:
    COUNTRIES = json.loads(f.read())

nlp = Spanish()
doc = nlp("La Unión Europea fue fundada por seis países de Europa occidental "
          "(Francia, Alemania, Italia, Bélgica, Países Bajos, y Luxemburgo) y "
          "se amplió en seis ocasiones.")

# Import the PhraseMatcher and initialize it
from spacy.____ import ____

matcher = ____(____)

# Create pattern Doc objects and add them to the matcher
# This is a faster version of: [nlp(country) for country in COUNTRIES]
patterns = list(nlp.pipe(COUNTRIES))
matcher.add("COUNTRY", None, *patterns)

# Call the matcher on the test document and print the result
matches = ____(____)
print([doc[start:end] for match_id, start, end in matches])