Code example #1
from spacy.lang.es import Spanish


def test_issue3803():
    """Test that spanish num-like tokens have True for like_num attribute."""
    nlp = Spanish()
    text = "2 dos 1000 mil 12 doce"
    doc = nlp(text)

    assert [t.like_num for t in doc] == [True, True, True, True, True, True]
Code example #2
    def init_resources(self):
        self.punctuation_pattern = re.compile("|".join(PUNCTUATION))
        self.stemmer = None
        stopwords_path = os.path.join(
            os.path.dirname(assistant_dialog_skill_analysis.__file__),
            "resources",
            self.language_code,
            "stopwords",
        )
        if self.language_code == "en":
            from spacy.lang.en import English

            self.tokenizer = Tokenizer(English().vocab)
            self.stemmer = SnowballStemmer(language="english")
            self.stop_words = self.load_stop_words(stopwords_path)

        elif self.language_code == "fr":
            from spacy.lang.fr import French

            self.tokenizer = Tokenizer(French().vocab)
            self.stemmer = SnowballStemmer(language="french")
            self.stop_words = self.load_stop_words(stopwords_path)

        elif self.language_code == "de":
            from spacy.lang.de import German

            self.tokenizer = Tokenizer(German().vocab)
            self.stemmer = SnowballStemmer(language="german")
            self.stop_words = self.load_stop_words(stopwords_path)

        elif self.language_code == "it":
            from spacy.lang.it import Italian

            self.tokenizer = Tokenizer(Italian().vocab)
            self.stemmer = SnowballStemmer(language="italian")
            self.stop_words = self.load_stop_words(stopwords_path)

        elif self.language_code == "cs":
            from spacy.lang.cs import Czech

            self.tokenizer = Tokenizer(Czech().vocab)
            self.stop_words = self.load_stop_words(stopwords_path)

        elif self.language_code == "pt":
            from spacy.lang.pt import Portuguese

            self.tokenizer = Tokenizer(Portuguese().vocab)
            self.stemmer = SnowballStemmer(language="portuguese")
            self.stop_words = self.load_stop_words(stopwords_path)

        elif self.language_code == "es":
            from spacy.lang.es import Spanish

            self.tokenizer = Tokenizer(Spanish().vocab)
            self.stemmer = SnowballStemmer(language="spanish")
            self.stop_words = self.load_stop_words(stopwords_path)
        else:
            raise Exception("language code %s is not supported" %
                            self.language_code)
Code example #3
File: tokenize.py  Project: tonifuc3m/utils-BSC
from spacy.lang.es import Spanish


def tokenize(text):
    nlp = Spanish()
    doc = nlp(text)
    token_list = []
    for token in doc:
        token_list.append(token.text)
    return token_list
Code example #4
            def spacy_tokenizer(sentence):
                parser = Spanish()
                tokens = parser(sentence)
                filtered_tokens = []
                for word in tokens:
                    lemma = word.lemma_.lower().strip()
                    if lemma not in STOP_WORDS and re.search(
                            '^[a-zA-Z]+$', lemma):
                        filtered_tokens.append(lemma)

                return filtered_tokens
Code example #5
File: utils.py  Project: camporeale/ml_challenge
def preprocess_test(df):
    # Spacy Tokenizers
    nlp_es = Spanish()
    nlp_pt = Portuguese()
    # Spanish and Portuguese masks to use corresponding language tokenizer
    mask_spanish    = df["language"] == 'spanish'
    mask_portuguese = df["language"] == 'portuguese'
    df.loc[mask_spanish, "tokens"] = df["title"].apply(normalize_text,args=(nlp_es,))
    df.loc[mask_portuguese, "tokens"] = df["title"].apply(normalize_text,args=(nlp_pt,))
    # Test file only needs id and tokens
    return df
Code example #6
from spacy.lang.en import English
from spacy.lang.de import German
from spacy.lang.es import Spanish


def main():
    nlp = English()
    doc = nlp("This is a sentence.")
    print(doc.text)

    nlp = German()
    doc = nlp('Liebe Grüße!')
    print(doc.text)

    nlp = Spanish()
    doc = nlp('¿Cómo estás?')
    print(doc.text)
Code example #7
File: utils.py  Project: camporeale/ml_challenge
def preprocess(df):
    # Spacy Tokenizers
    nlp_es = Spanish()
    nlp_pt = Portuguese()
    # Spanish and Portuguese masks to use corresponding language tokenizer
    mask_spanish    = df["language"] == 'spanish'
    mask_portuguese = df["language"] == 'portuguese'
    df.loc[mask_spanish, "tokens"] = df["title"].apply(normalize_text,args=(nlp_es,))
    df.loc[mask_portuguese, "tokens"] = df["title"].apply(normalize_text,args=(nlp_pt,))
    # Training and validation df need to have __label__ string before category 
    df["label"] = df["category"].apply(lambda x: '__label__'+ x)
    return df
Code example #8
    def stopwords(self, text):
        try:
            nlp = Spanish() if self.lang == 'es' else English()
            doc = nlp(text)
            token_list = [token.text for token in doc]
            sentence = []
            for word in token_list:
                lexeme = nlp.vocab[word]
                if not lexeme.is_stop:
                    sentence.append(word)
            return ' '.join(sentence)
        except Exception as e:
            Util.standard_error(sys.exc_info())
            print('Error stopwords: {0}'.format(e))
            return None
Code example #9
File: lda_test.py  Project: tonifuc3m/lda-intro
def preprocess(text):
    # Tokenize, remove stopwords, numbers, empty spaces and punctuation, and lemmatize
    tokenized = []
    nlp = Spanish()
    doc = nlp(text)
    token_list = []
    # Tokenize
    for token in doc:
        # Remove stopwords, numbers, empty spaces and punctuation, then lemmatize
        if (token.text not in nlp.Defaults.stop_words
                and token.text not in string.punctuation
                and token.text.isalpha()):
            token_list.append(token.lemma_)
    tokenized.append(token_list)
    return tokenized
Code example #10
def lang_change(language):
    if language == 'en':
        from spacy.lang.en import English
        from spacy.lang.en.stop_words import STOP_WORDS
        parser = English()
        file = "\config_files\config_spacy_en.yaml"
        configfile_path = os.getcwd() + file
    elif language == 'de':
        from spacy.lang.de import German
        from spacy.lang.de.stop_words import STOP_WORDS
        parser = German()
        file = "\config_files\config_spacy_de.yaml"
        configfile_path = os.getcwd() + file
    elif language == 'es':
        from spacy.lang.es import Spanish
        from spacy.lang.es.stop_words import STOP_WORDS
        parser = Spanish()
        file = "\config_files\config_spacy_es.yaml"
        configfile_path = os.getcwd() + file
    elif language == 'pt':
        from spacy.lang.pt import Portuguese
        from spacy.lang.pt.stop_words import STOP_WORDS
        parser = Portuguese()
        file = "\config_files\config_spacy_pt.yaml"
        configfile_path = os.getcwd() + file
    elif language == 'fr':
        from spacy.lang.fr import French
        from spacy.lang.fr.stop_words import STOP_WORDS
        parser = French()
        file = "\config_files\config_spacy_fr.yaml"
        configfile_path = os.getcwd() + file
    elif language == 'it':
        from spacy.lang.it import Italian
        from spacy.lang.it.stop_words import STOP_WORDS
        parser = Italian()
        file = "\config_files\config_spacy_it.yaml"
        configfile_path = os.getcwd() + file
    elif language == 'nl':
        from spacy.lang.nl import Dutch
        from spacy.lang.nl.stop_words import STOP_WORDS
        parser = Dutch()
        file = "\config_files\config_spacy_nl.yaml"
        configfile_path = os.getcwd() + file
    else:
        raise ValueError("unsupported language: %s" % language)

    return parser, STOP_WORDS, configfile_path
Code example #11
from spacy.lang.es import Spanish


def tokenize(text):
    '''
    Tokenize a string in Spanish
    Parameters
    ----------
    text : str
        Spanish text string to tokenize.
    Returns
    -------
    tokenized : list
        List of tokens (includes punctuation tokens).
    '''
    nlp = Spanish()
    doc = nlp(text)
    token_list = []
    for token in doc:
        token_list.append(token.text)
    return token_list
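A quick usage note (not from the original project): calling the function on a short Spanish string with the blank Spanish() tokenizer typically yields the punctuation and word tokens shown below.

print(tokenize("¿Cómo estás?"))
# ['¿', 'Cómo', 'estás', '?']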
Code example #12
File: main.py  Project: Pela12345/final-project
def spacy_tokenizer(sentence):
    parser = Spanish()
    spacy_stopwords = spacy.lang.es.stop_words.STOP_WORDS
    STOPWORDS = list(spacy_stopwords)
    STOPWORDS.extend(('y', 'a', 'u', 'o', 'e', 'quiero'))
    tokens = parser(sentence)
    filtered_tokens = []
    for word in tokens:
        lemma = word.lemma_.lower().strip()
        lemma = re.sub("á", "a", lemma)
        lemma = re.sub("é", "e", lemma)
        lemma = re.sub("í", "i", lemma)
        lemma = re.sub("ó", "o", lemma)
        lemma = re.sub("ú", "u", lemma)
        lemma = re.sub("ñ", "n", lemma)
        if lemma not in STOPWORDS and re.search('^[a-zA-Z]+$', lemma):
            filtered_tokens.append(lemma)
    return filtered_tokens
Code example #13
def tokenize(document, language, punctuation):
    if language == 'fr':
        nlp = French()
    if language == 'de':
        nlp = German()
    if language == 'en':
        nlp = English()
    if language == 'es':
        nlp = Spanish()
    sentencizer = nlp.create_pipe("sentencizer")
    nlp.add_pipe(sentencizer)
    doc = nlp(document)
    if punctuation:
        sentences = [[str(word) for word in sent if str(word) != '\n']
                     for sent in doc.sents]
    else:
        sentences = [[
            str(word) for word in sent
            if ((str(word) != '\n') and (str(word).isalpha()))
        ] for sent in doc.sents]
    return sentences
Code example #14
File: tokenizer.py  Project: RobinSrimal/Tokenizer
    def get_nlp(self, language):

        """"
        this method returns the corresponding spacy language model when 
        provided with a language. To do so it also does the required 
        import. This is certainly not the standard approach. 
        But as this endpoint will be deployed to Heroku (space limitation)
        and only be invoked rarely it is the fastest approach.
        """

        if language == "en":

            from spacy.lang.en import English
            return English()

        elif language == "fr":

            from spacy.lang.fr import French
            return French()

        elif language == "de":

            from spacy.lang.de import German
            return German()

        elif language == "es":

            from spacy.lang.es import Spanish
            return Spanish()

        elif language == "pt":

            from spacy.lang.pt import Portuguese
            return Portuguese()

        else:

            return {"error": "invalid or not supported language entered"}
Code example #15
import json
from spacy.matcher import Matcher
from spacy.lang.es import Spanish

with open("exercises/es/adidas.json", encoding="utf8") as f:
    TEXTS = json.loads(f.read())

nlp = Spanish()
matcher = Matcher(nlp.vocab)

# Two tokens whose lowercase forms match "adidas" and "zx"
pattern1 = [{"LOWER": "adidas"}, {"LOWER": "zx"}]

# A token whose lowercase form matches "adidas", followed by a token that is a digit
pattern2 = [{"LOWER": "adidas"}, {"IS_DIGIT": True}]

# Add the patterns to the matcher and check the result
matcher.add("ROPA", None, pattern1, pattern2)
for doc in nlp.pipe(TEXTS):
    print([doc[start:end] for match_id, start, end in matcher(doc)])
Code example #16
def complete_text_analysis(text, raw_entities):
    start_time = time()
    date = datetime.today().strftime("%Y_%m_%d-%H_%M_%S")
    path_to_file = date + " - SingleCompleteTextAnalysis_Performance.txt"
    p_file = codecs.open(path_to_file, encoding='utf-8', mode='a')
    p_file.write(date +
                 " Single Complete Text Analysis Test - Local Execution" +
                 "\n")
    p_file.flush()
    # II. Prepare data
    p_file.write("Preparing initial data ... " + "\n")
    path_to_configuration = food_detection_root.ROOT_DIR + os.path.sep + 'configuration' + os.path.sep \
                            + 'configuration.ini'
    config = ConfigParser(interpolation=ExtendedInterpolation())
    config.read_file(codecs.open(path_to_configuration, "r", "utf8"))
    # 01. Read emojis
    path = food_detection_root.ROOT_DIR + os.path.sep + 'data' + os.path.sep
    unicode_emoji_list_file = codecs.open(path +
                                          "list - unicode_emojis_metadata.txt",
                                          encoding='utf-8')
    emoji_list = unicode_emoji_list_file.read().splitlines()
    unicode_emoji_list_file.close()
    aux_emojis_dict = {}
    emojis_dict = {}
    for aux in emoji_list:
        aux_emoji = aux.split('\t')
        aux_emojis_dict[aux_emoji[1]] = [aux_emoji[2], aux_emoji[3]]
        emojis_dict[aux_emoji[2]] = {
            'emoji_id': aux_emoji[0],
            'unicode': aux_emoji[1],
            'name': aux_emoji[3],
            'polarity': float(aux_emoji[4]),
            'happiness': float(aux_emoji[5]),
            'anger': float(aux_emoji[6]),
            'fear': float(aux_emoji[7]),
            'replusion': float(aux_emoji[8]),
            'surprise': float(aux_emoji[9]),
            'sadness': float(aux_emoji[10]),
            'interest': aux_emoji[11]
        }
    sorted_aux_emojis_list = sorted(aux_emojis_dict.keys(),
                                    key=len,
                                    reverse=True)
    emojis_list = list()
    for aux_emoji in sorted_aux_emojis_list:
        emojis_list.append(aux_emojis_dict[aux_emoji][0])
    # print(emojis_list)
    # 02. Read complementary characters
    complementary_characters_list_file = codecs.open(
        path + "list - complementary_characters.txt", encoding='utf-8')
    complementary_characters_list = complementary_characters_list_file.read(
    ).splitlines()
    complementary_characters_list_file.close()
    complementary_characters_dict = {}
    for aux in complementary_characters_list:
        aux_char = aux.split('\t')
        complementary_characters_dict[aux_char[2]] = [aux_char[1], aux_char[3]]
    # print(complementary_characters_dict)
    # 03. Read emoticons patterns
    text_type = 'Twitter'
    emotions = ast.literal_eval(config.get(text_type, 'emotions'))
    emoticons_metadata = ast.literal_eval(
        config.get(text_type, 'emoticons_metadata'))
    emotions_polarity = ast.literal_eval(
        config.get(text_type, 'emotions_polarity'))
    # 04. Configure Google_Universal_POS_Tags
    tags = config.options("Google_Universal_POS_Tags")
    google_universal_tags = {}
    for tag in tags:
        google_universal_tags[tag.upper()] = config.get(
            'Google_Universal_POS_Tags', tag)
    # 05. Read special characters (#, @, https, etc.)
    special_characters = ast.literal_eval(
        config.get('TextAnalysis', 'special_characters'))
    additional_symbols = ast.literal_eval(
        config.get('TextAnalysis', 'additional_symbols'))
    variation_selectors = ast.literal_eval(
        config.get('TextAnalysis', 'variation_selectors'))
    # 06. Configure Spanish POS tagger
    nlp = Spanish()
    tag_map = spacy.lang.es.TAG_MAP
    emoticons = []
    emojis = []
    complementary_characters = []
    texts = []
    emojis_count = 0
    emoticon_count = 0
    complementary_characters_count = 0
    original_text = text.replace('\n', ' ')
    results = identify_special_characters(
        original_text, raw_entities, nlp, tag_map, emotions,
        emoticons_metadata, emotions_polarity, emojis_dict, emojis_list,
        variation_selectors, complementary_characters_dict, emoticon_count,
        emojis_count, complementary_characters_count)

    spaced_text = results[0]
    final_clean_text = results[1]
    emoticons += copy.deepcopy(results[2])
    emojis += copy.deepcopy(results[3])
    complementary_characters += copy.deepcopy(results[4])
    emoticon_count = results[5]
    emojis_count = results[6]
    complementary_characters_count = results[7]
    special_entities = results[8]
    execution_time = time() - start_time
    p_file.write("Execution time: " + str(timedelta(seconds=execution_time)) +
                 "\n")
    texts.append(spaced_text + '\t' + final_clean_text + '\t' +
                 str(special_entities))
    p_file.write("Texts with: " + "\n")
    for text in texts:
        p_file.write(text + "\n")
    p_file.flush()
    p_file.close()
Code example #17
              if len(x) > 3]  # Keep only words longer than 3 letters.
    tokens = [stemmer.stem(token) for token in tokens]  # Stem words.
    tokens = [x for x in tokens if x not in to_avoid]

    return ' '.join(tokens)


if '__main__' == __name__:

    stemmer = SnowballStemmer('spanish')
    sys.setrecursionlimit(10000)

    cwd = os.getcwd()

    stop_words = get_stop_words('es')
    parser = Spanish()

    to_avoid = read_as_list('to_avoid.txt', 'latin-1')

    my_sheet = 'Sheet1'
    file_name = 'Proposals - PAM - Spanish.xlsx'  # name of your excel file
    df = read_excel(file_name, sheet_name=my_sheet)
    df = df[df['category/name/se'] == 'Sanidad y salud']

    txt = list(df['body'])

    text = [filter_vocabulary(txt, 0.01)][0]
    text = [
        prepare_text_for_ML(x, stop_words, parser, stemmer, to_avoid)
        for x in text
    ]
Code example #18
from spacy.lang.es import Spanish


def getSentences(text):
    nlp = Spanish()
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
    document = nlp(text)
    return [sent.string.strip() for sent in document.sents]
Code example #19
from rrec.model.reddit_recommender import RedditRecommender

# Spacy
from spacy.lang.en import English
from spacy.lang.es import Spanish
from spacy.lang.fr import French
from spacy.lang.zh import Chinese
from spacy.lang.ru import Russian
from spacy.lang.ar import Arabic
from spacy.lang.de import German
from spacy.lang.uk import Ukrainian
from spacy.lang.ro import Romanian

lang_id_to_spacy = {
    'en': English(),
    'es': Spanish(),
    'fr': French(),
    'zh-cn': Chinese(),
    'ru': Russian(),
    'ar': Arabic(),
    'de': German(),
    'uk': Ukrainian(),
    'ro': Romanian()
}

#####################
### Globals
#####################

reddit = Reddit(client_id='OFsSWAsbFrzLpg',
                client_secret='tRReu7VAAyxgEXbGqaE19_OUrR4',
Code example #20
    def __init__(self):
        self.nlp_english = English()
        self.nlp_spanish = Spanish()
Code example #21
File: main.py  Project: ufukcbicici/website_relevance
# Constants - Hyperparameters
interactions_scores_dict = {
    'VIEW': 1,
    'BOOKMARK': 2,
    'FOLLOW': 3,
    'LIKE': 4,
    'COMMENT CREATED': 5
}

# Global objects
interactions_df = pd.read_csv('interactions.csv')
articles_df = pd.read_csv('articles.csv')
person_le = preprocessing.LabelEncoder()
tokens_le = preprocessing.LabelEncoder()
hidden_dimensions = 250
language_objects = {"en": English(), "pt": Portuguese(), "es": Spanish()}
tokenizers = {}
summaries = {}
filter_regex = "[^A-Za-z0-9]+"
batch_size = 10000
max_iterations = 100000
l2_lambda = 0.001

# We summarize each article with Spacy's TextRank implementation. This eliminates most of the noisy information
# in the texts. Then we apply tf-idf analysis to the article summaries. For every unique token in the obtained corpus
# of summaries, we calculate the expected tf-idf score over all articles. Then we sort the tokens in descending order
# of their expected tf-idf scores. The first 5000 tokens will constitute the representing tokens of our article corpus.


def create_article_tokens():
    def identity_tokenizer(text):
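The comment block in example #21 above describes summarizing articles with TextRank and then ranking tokens by their expected tf-idf score over the summaries. Below is a minimal sketch of that token-selection step using scikit-learn's TfidfVectorizer; the summaries argument and the 5,000-token cutoff come from the comment, while the function name and the use of scikit-learn are assumptions, since the project's actual implementation is not shown here.

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer


def select_top_tokens(summaries, top_k=5000):
    # Fit tf-idf over the article summaries
    vectorizer = TfidfVectorizer()
    tfidf = vectorizer.fit_transform(list(summaries.values()))
    # Expected (mean) tf-idf score of each token across all articles
    mean_scores = np.asarray(tfidf.mean(axis=0)).ravel()
    tokens = np.array(vectorizer.get_feature_names_out())
    # Sort tokens by descending expected score and keep the first top_k
    order = np.argsort(mean_scores)[::-1][:top_k]
    return tokens[order].tolist()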
Code example #22
import json
from spacy.lang.es import Spanish
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

with open("exercises/es/countries.json", encoding="utf8") as f:
    COUNTRIES = json.loads(f.read())

with open("exercises/es/capitals.json", encoding="utf8") as f:
    CAPITALS = json.loads(f.read())

nlp = Spanish()
matcher = PhraseMatcher(nlp.vocab)
matcher.add("COUNTRY", None, *list(nlp.pipe(COUNTRIES)))


def countries_component(doc):
    # Create an entity Span with the label "LOC" for all the matches
    matches = matcher(doc)
    doc.ents = [
        Span(doc, start, end, label="LOC") for match_id, start, end in matches
    ]
    return doc


# Add the component to the pipeline
nlp.add_pipe(countries_component)
print(nlp.pipe_names)

# The getter that looks up the span text in a dictionary of
# countries' capital cities
Code example #23
import spacy
from spacy.matcher import PhraseMatcher
from spacy_lookup import Entity
from spacy.lang.es import Spanish

nlp = Spanish()
entity = Entity(nlp,
                keywords_list=['pera en Dulce', 'manzana', 'tentacion'],
                label='FOOD')
nlp.add_pipe(entity, name='Food')
entity2 = Entity(nlp, keywords_list=['#mora'], label='FOOD_HASHTAGS')
nlp.add_pipe(entity2, name='FoodHashtags')
text = "Me gustan mucho la manzana y tambien la pera en dulce en salsa de #mora. También me gusta la paleta tentación."
doc = nlp(text)
for e in doc:
    print(e.text, e._.is_entity, e.ent_type_)
Code example #24
File: Tokenizer.py  Project: gkaramanolakis/CLTS
from spacy.lang.en import English
from spacy.lang.es import Spanish
from spacy.lang.fr import French
from spacy.lang.it import Italian
from spacy.lang.de import German
from spacy.lang.ru import Russian
from spacy.lang.zh import Chinese
from spacy.lang.ja import Japanese
from spacy.lang.ca import Catalan
from spacy.lang.eu import Basque

from DataHandler import load_df_twitter_sent, load_df_lorelei
from util import clean_str as test_clean_str
from nltk.corpus import stopwords
from util import identity_fn, lang2id

language_dict = {
    'english': English(),
    'spanish': Spanish(),
    'french': French(),
    'italian': Italian(),
    'german': German(),
    'russian': Russian(),
    'chinese': Chinese(),
    'japanese': Japanese(),
    'catalan': Catalan(),
    'basque': Basque(),
}


class Tokenizer:
    def __init__(self,
                 language,
                 tokenizer_method='spacy',
Code example #25
from spacy.lang.es import Spanish

nlp = Spanish()

# Import the Doc and Span classes
from spacy.____ import ____, ____

words = ["Me", "gusta", "David", "Bowie"]
spaces = [True, True, True, False]

# Create a doc from the words and the spaces
doc = ____(____, ____, ____)
print(doc.text)

# Create a span for "David Bowie" from the doc and assign it the label "PERSON"
span = ____(____, ____, ____, label=____)
print(span.text, span.label_)

# Add the span to the doc's entities
____.____ = [____]

# Print the text and the labels of the entities
print([(ent.text, ent.label_) for ent in doc.ents])
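The blanks (____) above are part of the original exercise and are left as they are; for reference, one possible completion using spaCy's Doc and Span classes could look like the sketch below (not presented as the official solution).

from spacy.lang.es import Spanish
from spacy.tokens import Doc, Span

nlp = Spanish()

words = ["Me", "gusta", "David", "Bowie"]
spaces = [True, True, True, False]

# Build the doc directly from the words and the whitespace flags
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)

# "David Bowie" covers tokens 2 and 3 (the end index is exclusive)
span = Span(doc, 2, 4, label="PERSON")
print(span.text, span.label_)

# Register the span as a named entity on the doc
doc.ents = [span]
print([(ent.text, ent.label_) for ent in doc.ents])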
Code example #26
import json
from spacy.lang.es import Spanish
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

with open("exercises/es/countries.json", encoding="utf8") as f:
    COUNTRIES = json.loads(f.read())

with open("exercises/es/capitals.json", encoding="utf8") as f:
    CAPITALS = json.loads(f.read())

nlp = Spanish()
matcher = PhraseMatcher(nlp.vocab)
matcher.add("COUNTRY", None, *list(nlp.pipe(COUNTRIES)))


def countries_component(doc):
    # Create an entity Span with the label "LOC" for all the matches
    matches = matcher(doc)
    doc.ents = [
        ____(____, ____, ____, label=____) for match_id, start, end in matches
    ]
    return doc


# Add the component to the pipeline
____.____(____)
print(nlp.pipe_names)

# The getter that looks up the span text in a dictionary of
# countries' capital cities
Code example #27
# Import the English, German and Spanish language classes
from spacy.lang.en import English
from spacy.lang.de import German
from spacy.lang.es import Spanish

# Create the nlp object
nlp = Spanish() # or English() or German()

# Process a text (this is Spanish for: "How are you?")
doc = nlp("¿Cómo estás?")

# Print the document text
print(doc.text)
#  '¿Cómo estás?'
Code example #28
def simple_identification():
    client_from = MongoClient()
    db_from = client_from["SSD"]
    coll_from = db_from["raw_data"]
    start_time = time()
    date = datetime.today().strftime("%Y_%m_%d-%H_%M_%S")
    path_to_file = date + " - DetectRegexSpecialEntitiesRawData_Performance.txt"
    p_file = codecs.open(path_to_file, encoding='utf-8', mode='w')
    p_file.write(
        date +
        " Detecting Special Entities with Regex Expression Test - Local Execution"
        + "\n")
    p_file.flush()
    # II. Prepare data
    p_file.write("Preparing initial data ... " + "\n")
    path_to_configuration = food_detection_root.ROOT_DIR + os.path.sep + 'configuration' + os.path.sep \
                            + 'configuration.ini'
    config = ConfigParser(interpolation=ExtendedInterpolation())
    config.read_file(codecs.open(path_to_configuration, "r", "utf8"))
    # print(emoticons_dict)
    # 3. Configure Spanish POS tagger
    spanish_pipeline = Spanish()
    all_from_tweets = coll_from.find()
    count = 0
    stop = 100
    p_file.write("Total data to process: " + str(stop) + "\n")
    emoticons = []
    text_type = 'Twitter'
    emotions = ast.literal_eval(config.get(text_type, 'emotions'))
    emoticons_metadata = ast.literal_eval(
        config.get(text_type, 'emoticons_metadata'))
    emotions_polarity = ast.literal_eval(
        config.get(text_type, 'emotions_polarity'))
    texts = []
    no_texts = []
    emoticon_count = 0
    for raw_data in all_from_tweets:
        if 'text' in raw_data.keys() and 'lang' in raw_data.keys():
            if "place" in raw_data.keys():
                place = raw_data["place"]
                if place is not None:
                    if "country_code" in place.keys():
                        raw_data_country_code = raw_data["place"][
                            "country_code"]
                        if raw_data_country_code in ["CO"]:
                            lang = raw_data["lang"]
                            original_text = raw_data['text']
                            raw_entities = raw_data['entities']
                            original_text = original_text.replace('\n', ' ')
                            if lang == 'es':
                                results = identify_special_entities(
                                    original_text, raw_entities,
                                    spanish_pipeline, emoticon_count, emotions,
                                    emoticons_metadata, emotions_polarity)
                                text = results[0]
                                clean_text = results[1]
                                emoticon_count = results[2]
                                special_entities = results[3]
                                emoticons += copy.deepcopy(results[4])
                                if len(results[4]) != 0:
                                    texts.append(original_text + '\t' + text +
                                                 '\t' + clean_text + '\t' +
                                                 str(special_entities))
                                else:
                                    no_texts.append(original_text + '\t' +
                                                    text + '\t' + clean_text)
                                count += 1
                            else:
                                if len(original_text) >= 3:
                                    blob = TextBlob(original_text)
                                    detection = True
                                    detected_language = ''
                                    while detection:
                                        try:
                                            detected_language = blob.detect_language(
                                            )
                                            detection = False
                                        except:
                                            print(
                                                'error while getting detected language'
                                            )
                                    # print(detected_language)
                                    if detected_language == 'es':
                                        results = identify_special_entities(
                                            original_text, raw_entities,
                                            spanish_pipeline, emoticon_count,
                                            emotions, emoticons_metadata,
                                            emotions_polarity)
                                        text = results[0]
                                        clean_text = results[1]
                                        emoticon_count = results[2]
                                        special_entities = results[3]
                                        emoticons += copy.deepcopy(results[4])
                                        if len(results[4]) != 0:
                                            texts.append(original_text + '\t' +
                                                         text + '\t' +
                                                         clean_text + '\t' +
                                                         str(special_entities))
                                        else:
                                            no_texts.append(original_text +
                                                            '\t' + text +
                                                            '\t' + clean_text)
                                        count += 1
                            print(count)
                            print(emoticon_count)
                            if emoticon_count >= stop:
                                break
    all_from_tweets.close()
    client_from.close()
    p_file.write("Emoticons " + str(len(emoticons)) + "\n")
    emoticons_counter = Counter(emoticons).most_common()
    emoticons_counter_sorted = sorted(emoticons_counter,
                                      key=lambda tup: tup[1])
    for emoticon in emoticons_counter_sorted:
        p_file.write(str(emoticon[0]) + "\t" + str(emoticon[1]) + "\n")
    p_file.write("Total Emoticons: " + str(emoticon_count) + ". Total data: " +
                 str(count) + ". Proportion: " + str(emoticon_count / count) +
                 "\n")
    p_file.write("TEXTS WITH EMOTICONS: \n")
    for text in texts:
        p_file.write(text + "\n")
    p_file.write("TEXTS WITHOUT EMOTICONS: \n")
    for text in no_texts:
        p_file.write(text + "\n")
    p_file.write("Total elements in new list: " + str(count) + "\n")
    execution_time = time() - start_time
    p_file.write("Execution time: " + str(timedelta(seconds=execution_time)) +
                 "\n")
    p_file.flush()
    p_file.close()
Code example #29
from spacy.lang.es import Spanish

nlp = Spanish()

people = ["David Bowie", "Angela Merkel", "Lady Gaga"]

# Create a list of patterns for the PhraseMatcher
patterns = list(nlp.pipe(people))
Code example #30
import json
from spacy.lang.es import Spanish

with open("exercises/es/countries.json", encoding="utf8") as f:
    COUNTRIES = json.loads(f.read())

nlp = Spanish()
doc = nlp("La Unión Europea fue fundada por seis países de Europa occidental "
          "(Francia, Alemania, Italia, Bélgica, Países Bajos, y Luxemburgo) y "
          "se amplió en seis ocasiones.")

# Import the PhraseMatcher and initialize it
from spacy.____ import ____

matcher = ____(____)

# Create pattern Doc objects and add them to the matcher
# This is a faster version of: [nlp(country) for country in COUNTRIES]
patterns = list(nlp.pipe(COUNTRIES))
matcher.add("COUNTRY", None, *patterns)

# Call the matcher on the test document and print the result
matches = ____(____)
print([doc[start:end] for match_id, start, end in matches])
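As in the previous exercise, the blanks belong to the original; one possible completion (a sketch that reuses nlp, doc, and COUNTRIES from the snippet above and keeps the v2-style matcher.add signature used elsewhere on this page) would be:

# Import the PhraseMatcher and initialize it with the shared vocab
from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab)

# Create pattern Doc objects and add them to the matcher
patterns = list(nlp.pipe(COUNTRIES))
matcher.add("COUNTRY", None, *patterns)

# Call the matcher on the test document and print the matched spans
matches = matcher(doc)
print([doc[start:end] for match_id, start, end in matches])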