def test_usage_single_emoji(nlp, icon):
    emoji = Emoji(nlp)
    nlp.add_pipe(emoji, last=True)
    doc = nlp(u"Hello %s world" % icon)
    assert doc._.has_emoji
    assert doc[1]._.is_emoji
    assert doc[1]._.emoji_desc == emoji.get_emoji_desc(doc[1])
    assert doc[1:3]._.has_emoji
    assert len(doc._.emoji) == 1
    emoji_text, emoji_idx, emoji_desc = doc._.emoji[0]
    assert emoji_text == icon
    assert emoji_idx == 1
def test_usage_no_emoji(nlp):
    emoji = Emoji(nlp)
    nlp.add_pipe(emoji, last=True)
    doc = nlp(u"In total there are 2,666 emojis in the Unicode Standard.")
    assert not doc._.has_emoji
    for token in doc:
        assert not token._.is_emoji
def get_spacy_nlp(core, emojis=True):
    nlp = spacy.load(core)
    if emojis:
        emoji = Emoji(nlp)
        nlp.add_pipe(emoji, first=True)
    return nlp
def create_nlp_instance():
    import spacy
    from spacymoji import Emoji

    nlp = spacy.load('en')
    emoji_pipe = Emoji(nlp)
    nlp.add_pipe(emoji_pipe, first=True)

    # Merge hashtag tokens which were split by spaCy
    def hashtag_pipe(doc):
        merged_hashtag = False
        while True:
            for token_index, token in enumerate(doc):
                if token.text == '#':
                    if token.head is not None:
                        start_index = token.idx
                        end_index = start_index + len(token.head.text) + 1
                        if doc.merge(start_index, end_index) is not None:
                            merged_hashtag = True
                            break
            if not merged_hashtag:
                break
            merged_hashtag = False
        return doc

    nlp.add_pipe(hashtag_pipe)
    return nlp
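# Hedged sketch, not taken from the snippet above: Doc.merge() was removed in
# spaCy 3.x, so a hashtag-merging component written today would typically use
# Doc.retokenize() instead. The component name "hashtag_merger" is illustrative.
from spacy.language import Language

@Language.component("hashtag_merger")
def hashtag_merger(doc):
    # Collect '#' + following word pairs, then merge each pair into one token.
    spans = [doc[token.i:token.i + 2]
             for token in doc[:-1]
             if token.text == "#" and doc[token.i + 1].is_alpha]
    with doc.retokenize() as retokenizer:
        for span in spans:
            retokenizer.merge(span)
    return doc

# Usage (assumes an installed English pipeline):
#   nlp = spacy.load("en_core_web_sm")
#   nlp.add_pipe("hashtag_merger")
#   [t.text for t in nlp("I love #spacy")]  ->  ['I', 'love', '#spacy']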
def test_lookup(nlp):
    # The lookup key is the ZWJ sequence man + U+200D + microphone ("man singer")
    emoji = Emoji(nlp, lookup={'👨‍🎤': 'David Bowie'})
    nlp.add_pipe(emoji, last=True)
    doc = nlp(u"We can be 👨‍🎤 heroes")
    assert doc._.has_emoji
    assert doc[3]._.is_emoji
    assert doc[3]._.emoji_desc == 'David Bowie'
def build_pipeline(self):
    """Build spaCy pipeline."""
    # Add spacymoji
    emoji = Emoji(self.nlp, merge_spans=False)
    self.nlp.add_pipe(emoji, first=True)
    # Add entity skipping
    self.nlp.add_pipe(self.skip_ents, after='ner')
def test_usage_multiple_emoji(nlp):
    emoji = Emoji(nlp)
    nlp.add_pipe(emoji, last=True)
    doc = nlp(u"Hello 😻🍕 world, this ✨ 💥 is an example.")
    assert doc._.has_emoji
    assert len(doc._.emoji) == 4
    assert doc[:5]._.has_emoji
    assert len(doc[:5]._.emoji) == 2
def __init__(self):
    self.nlp = it_core_news_sm.load()
    emoji = Emoji(self.nlp)
    sentencizer = self.nlp.create_pipe("sentencizer")
    # Add components to the pipeline
    self.nlp.add_pipe(emoji, first=True)
    self.nlp.add_pipe(hashtag_pipe, first=True)
    self.nlp.add_pipe(sentencizer)
def test_custom_attrs():
    attrs = ('contains_emoji', 'equals_emoji', 'emoji_details', 'all_emoji')
    nlp = English()
    emoji = Emoji(nlp, attrs=attrs)
    nlp.add_pipe(emoji, last=True)
    doc = nlp(u"Hello 🎉")
    assert doc._.all_emoji
    assert len(doc._.all_emoji) == 1
    assert doc[1]._.has('equals_emoji')
    assert doc[1]._.emoji_details
def get_nlp_v2():
    nlp = spacy.load('en_core_web_sm', disable=['ner'])
    emoji = Emoji(nlp)
    nlp.add_pipe(emoji, first=True)
    # bug with spacy https://github.com/explosion/spaCy/issues/1574
    for word in nlp.Defaults.stop_words.difference(ext_spacy.stop_words_modified):
        nlp.vocab[word].is_stop = False
    return nlp
def setup_spacy():
    """
    Set up spaCy parameters

    Returns
    -------
    spacy.lang.en.English
    """
    nlp = en_core_web_sm.load()
    emoji = Emoji(nlp)
    nlp.add_pipe(emoji, first=True)
    return nlp
def test_usage_merge_overlapping(nlp):
    # Two flag emoji (US, AR): four regional indicator characters in total
    text = '🇺🇸🇦🇷'
    assert len(text) == 4
    emoji = Emoji(nlp)
    nlp.add_pipe(emoji, last=True)
    doc = nlp(text)
    assert len(doc) == 2
    assert doc[0].orth_ == text[0:2]
    assert doc[1].orth_ == text[2:4]
def test_usage_merge_spans(nlp, emoji):
    # The `emoji` fixture is a multi-codepoint emoji string; it is rebound to
    # the pipeline component below.
    text = u"This is %s a test" % emoji
    emoji = Emoji(nlp)
    doc = nlp(text)
    assert len(doc) > 5
    nlp.add_pipe(emoji, last=True)
    doc = nlp(text)
    assert len(doc) == 5
    assert doc._.has_emoji
    assert doc[2]._.is_emoji
    assert len(doc[2].text) > 1
def load_sapcy(self, lang):
    result = None
    try:
        stemmer_text = Steaming(lang)  # initialise component
        result = spacy.load('es_core_news_md') if lang == 'es' else spacy.load('en_core_web_md')
        emoji = Emoji(result)
        result.add_pipe(emoji, first=True)
        result.add_pipe(stemmer_text, after='parser', name='stemmer')
        print('Language: {0}\nText Analysis: {1}'.format(lang, result.pipe_names))
    except Exception as e:
        Util.standard_error(sys.exc_info())
        print('Error load_sapcy: {0}'.format(e))
    return result
def test():
    import spacy
    from spacymoji import Emoji

    nlp = spacy.load('es')
    emoji = Emoji(nlp)
    nlp.add_pipe(emoji, first=True)
    doc = nlp(u"This is a test 😻 👍🏿")
    assert doc._.has_emoji
    assert doc[2:5]._.has_emoji
    assert not doc[0]._.is_emoji
    assert doc[4]._.is_emoji
    assert doc[5]._.emoji_desc == u'thumbs up dark skin tone'
    assert len(doc._.emoji) == 2
    assert doc._.emoji[1] == (u'👍🏿', 5, u'thumbs up dark skin tone')
def _create_spacy_tokenizer(self, language: AnyStr) -> Language:
    """Private method to create a custom spaCy tokenizer for a given language

    Args:
        language: Language code in ISO 639-1 format, cf. https://spacy.io/usage/models#languages

    Returns:
        spaCy Language instance with the tokenizer

    Raises:
        TokenizationError: If something went wrong with the tokenizer creation

    """
    start = perf_counter()
    logging.info(f"Loading tokenizer for language '{language}'...")
    try:
        if language == "th":
            # PyThaiNLP requires a "data directory" even if nothing needs to be downloaded
            os.environ["PYTHAINLP_DATA_DIR"] = mkdtemp()  # dummy temp directory
        if language in SPACY_LANGUAGE_MODELS and self.use_models:
            nlp = spacy.load(SPACY_LANGUAGE_MODELS[language])
        else:
            nlp = spacy.blank(language)  # spaCy language without models (https://spacy.io/usage/models)
    except (ValueError, OSError) as e:
        raise TokenizationError(
            f"SpaCy tokenization not available for language '{language}' because of error: '{e}'"
        )
    if self.hashtags_as_token:
        re_token_match = spacy.tokenizer._get_regex_pattern(nlp.Defaults.token_match)
        re_token_match = rf"""({re_token_match}|#\w+)"""  # also keep '#word' as a single token
        nlp.tokenizer.token_match = re.compile(re_token_match).match
        _prefixes = list(nlp.Defaults.prefixes)
        if "#" in _prefixes:
            _prefixes.remove("#")
        nlp.tokenizer.prefix_search = spacy.util.compile_prefix_regex(_prefixes).search
    if self.stopwords_folder_path and language in SUPPORTED_LANGUAGES_SPACY:
        self._customize_stopwords(nlp, language)
    logging.info(
        f"Loading tokenizer for language '{language}': done in {perf_counter() - start:.2f} seconds"
    )
    if language not in UNSUPPORTED_SPACY_EMOJI_LANG:
        nlp.add_pipe(Emoji(nlp), first=True)
    return nlp
def on_status(self, status):
    blacklist = [
        'netflix', 'rt', 'https', 't', 'co', 'q', 'a', 'o', 'e', 'n', 'pq', 'vc'
    ]
    nlp = spacy.load("pt_core_news_sm")
    emoji = Emoji(nlp)
    nlp.add_pipe(emoji)
    tokens = nlp(status.text.lower())
    words = [
        token.text for token in tokens
        if not token.is_stop and not token.is_punct and not token._.is_emoji
        and token.text not in blacklist
    ]
    word_list.extend(words)
    fdist = nltk.FreqDist(word_list)
    print('10 MAIS FREQUENTES:')
    print(fdist.most_common(10))
    print('\n')
def start():
    nlp = spacy.load('en_core_web_sm')
    nlp.add_pipe(Emoji(nlp), first=True)
    return nlp
def __init__(self):
    self.nlp = spacy.load('en_core_web_sm')
    emoji = Emoji(self.nlp)
    self.nlp.add_pipe(emoji, first=True)
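# Hedged sketch: the snippets in this collection use the spaCy 2.x pattern of
# passing a component instance to add_pipe. With spaCy 3.x and spacymoji 3.x the
# component is registered as the factory "emoji", so an equivalent setup would
# look roughly like this (assumes spacymoji>=3.0 and en_core_web_sm are installed):
import spacy
import spacymoji  # noqa: F401  -- ensures the "emoji" factory is registered

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("emoji", first=True)
doc = nlp("Hello 🎉 world")
assert doc._.has_emoji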
def test_pattern_id(nlp, pattern_id):
    emoji = Emoji(nlp, pattern_id=pattern_id)
    assert pattern_id in emoji.matcher
    assert "EMOJI" not in emoji.matcher
size = len(data)
anger = [0] * size
anticipation = [0] * size
disgust = [0] * size
fear = [0] * size
joy = [0] * size
sadness = [0] * size
surprise = [0] * size
trust = [0] * size
index = 0

nlp = spacy.load("it_core_news_sm")
emoji = Emoji(nlp, merge_spans=False)
nlp.add_pipe(emoji, first=True)

length = len(data)
for _, row in data.iterrows():
    if index % 100000 == 0:
        print(" {}% of tweets were analysed in {:.2f} seconds".format(
            (index / length), time.time() - start_time))
    for token in nlp(row['text']):
        if token._.is_emoji:
def main(model='C:/Users/Pasante/Desktop/Gastón/drug_model_v4/spacy_model',
         new_model_name='spacy_model_es_drug',
         output_dir='C:/Users/Pasante/Desktop/Gastón/spacy_model_es_drug',
         n_iter=200):
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Modelo cargado '%s'" % model)
    else:
        nlp = spacy.blank('es')  # create blank Language class
        print("Modelo en blanco 'es' creado")
    # Add emojis to pipe
    emoji = Emoji(nlp)
    nlp.add_pipe(emoji, first=True)
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')
    ner.add_label(LABEL1)  # add new entity label to entity recognizer
    ner.add_label(LABEL2)  # add new entity label to entity recognizer
    losses_max = 99999999999
    t0 = datetime.datetime.now()
    print("Start: ", t0)
    # load test text
    validation_data = list(
        filter(None, open('test_text.txt', 'r').read().splitlines()))
    iteration = 0
    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            iteration = iteration + 1
            losses = {}
            for raw_text, entity_offsets in TRAIN_DATA:
                nlp.update([raw_text], [entity_offsets], drop=0.25,
                           sgd=optimizer, losses=losses)
            print(losses)
            print("Iteration:" + str(iteration))
            for text in validation_data:
                doc = nlp(text)
                ents = [(ent.text, ent.label_) for ent in doc.ents]
                for ent, label in ents:
                    print(f'Found entity: "{ent}": "{label}",')
            # save model to output directory
            # take the most successful
            for a in losses.keys():
                if losses[a] < losses_max:
                    losses_max = losses[a]
                    if output_dir is not None:
                        output_dir = Path(output_dir)
                        if not output_dir.exists():
                            output_dir.mkdir()
                        nlp.meta['name'] = new_model_name  # rename model
                        nlp.to_disk(output_dir)
                        print("saved model: ", output_dir)
    # test the trained model
    print("Finished: ", datetime.datetime.now() - t0)
    print("\n\n\n")
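# Hedged sketch, not part of the script above: under spaCy 3.x the
# nlp.update(texts, annotations) signature used in main() no longer exists;
# updates take Example objects. A roughly equivalent training step, assuming
# TRAIN_DATA items are (text, {"entities": [(start, end, label), ...]}) pairs:
import random
from spacy.training import Example

def train_ner_v3(nlp, train_data, n_iter=10):
    # resume_training() keeps existing weights; use nlp.initialize() for a blank pipeline
    optimizer = nlp.resume_training()
    for _ in range(n_iter):
        random.shuffle(train_data)
        losses = {}
        for text, annotations in train_data:
            example = Example.from_dict(nlp.make_doc(text), annotations)
            nlp.update([example], drop=0.25, sgd=optimizer, losses=losses)
        print(losses)
    return nlp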
def simple_identification():
    client_from = MongoClient()
    db_from = client_from["SSD"]
    coll_from = db_from["raw_data"]
    start_time = time()
    date = datetime.today().strftime("%Y_%m_%d-%H_%M_%S")
    path_to_file = date + " - DetectEmoticonsAndEmojis_Performance.txt"
    p_file = codecs.open(path_to_file, encoding='utf-8', mode='a')
    p_file.write(date + " Detecting Emoticons & Emojis Test - Local Execution" + "\n")
    p_file.flush()
    # II. Prepare data
    p_file.write("Preparing initial data ... " + "\n")
    path_to_configuration = food_detection_root.ROOT_DIR + os.path.sep + 'configuration' + os.path.sep \
        + 'configuration.ini'
    config = ConfigParser(interpolation=ExtendedInterpolation())
    config.read_file(codecs.open(path_to_configuration, "r", "utf8"))
    # Read emojis
    path = food_detection_root.ROOT_DIR + os.path.sep + 'data' + os.path.sep
    unicode_emoji_list_file = codecs.open(path + "list - unicode_emojis.txt", encoding='utf-8')
    emoji_list = unicode_emoji_list_file.read().splitlines()
    unicode_emoji_list_file.close()
    aux_emojis_dict = {}
    emojis_dict = {}
    for aux in emoji_list:
        aux_emoji = aux.split('\t')
        aux_emojis_dict[aux_emoji[1]] = [aux_emoji[2], aux_emoji[3]]
        emojis_dict[aux_emoji[2]] = [aux_emoji[1], aux_emoji[3]]
    sorted_aux_emojis_list = sorted(aux_emojis_dict.keys(), key=len, reverse=True)
    emojis_list = list()
    for aux_emoji in sorted_aux_emojis_list:
        emojis_list.append(aux_emojis_dict[aux_emoji][0])
    # print(emojis_list)
    # Read complementary characters
    complementary_characters_list_file = codecs.open(
        path + "list - complementary_characters.txt", encoding='utf-8')
    complementary_characters_list = complementary_characters_list_file.read().splitlines()
    complementary_characters_list_file.close()
    complementary_characters_dict = {}
    for aux in complementary_characters_list:
        aux_char = aux.split('\t')
        complementary_characters_dict[aux_char[2]] = [aux_char[1], aux_char[3]]
    # print(complementary_characters_dict)
    # Read emoticons
    emoticon_list_file = codecs.open(path + "list - emoticons.txt", encoding='utf-8')
    emoticon_list = emoticon_list_file.read().splitlines()
    emoticon_list_file.close()
    emoticons_dict = {}
    for aux in emoticon_list:
        aux_emoticon = aux.split('\t')
        # print(aux_emoticon)
        emoticons_dict[aux_emoticon[0]] = aux_emoticon[1]
    # print(emoticons_dict)
    # 1. Configure Google_Universal_POS_Tags
    tags = config.options("Google_Universal_POS_Tags")
    google_universal_tags = {}
    for tag in tags:
        google_universal_tags[tag.upper()] = config.get('Google_Universal_POS_Tags', tag)
    # 2. Read special characters (#, @, https, etc.)
    special_characters = ast.literal_eval(config.get('TextAnalysis', 'special_characters'))
    additional_symbols = ast.literal_eval(config.get('TextAnalysis', 'additional_symbols'))
    variation_selectors = ast.literal_eval(config.get('TextAnalysis', 'variation_selectors'))
    # 3. Configure Spanish POS tagger
    spanish_pipeline = spacy.load('es')
    emoji = Emoji(spanish_pipeline)
    spanish_pipeline.add_pipe(emoji, first=True)
    tag_map = spacy.es.TAG_MAP
    all_from_tweets = coll_from.find()
    count = 0
    stop = 100000
    p_file.write("Total data to process: " + str(stop) + "\n")
    emoticons = []
    emojis = []
    complementary_characters = []
    texts = []
    emojis_count = 0
    emoticon_count = 0
    complementary_characters_count = 0
    for raw_data in all_from_tweets:
        if 'text' in raw_data.keys() and 'lang' in raw_data.keys():
            if "place" in raw_data.keys():
                place = raw_data["place"]
                if place is not None:
                    if "country_code" in place.keys():
                        raw_data_country_code = raw_data["place"]["country_code"]
                        if raw_data_country_code in ["CO"]:
                            lang = raw_data["lang"]
                            text = raw_data['text']
                            if lang == 'es':
                                results = identify_special_characters(
                                    text, spanish_pipeline, tag_map, emoticons_dict,
                                    emojis_dict, emojis_list, variation_selectors,
                                    complementary_characters_dict, emoticon_count,
                                    emojis_count, complementary_characters_count)
                                spaced_text = results[0]
                                final_clean_text = results[1]
                                emoticons += copy.deepcopy(results[2])
                                emojis += copy.deepcopy(results[3])
                                complementary_characters += copy.deepcopy(results[4])
                                emoticon_count = results[5]
                                emojis_count = results[6]
                                complementary_characters_count = results[7]
                                if len(results[2]) != 0 or len(results[3]) != 0 or len(results[4]) != 0:
                                    texts.append(spaced_text + '\t' + final_clean_text)
                                count += 1
                            else:
                                if len(text) >= 3:
                                    blob = TextBlob(text)
                                    detection = True
                                    detected_language = ''
                                    while detection:
                                        try:
                                            detected_language = blob.detect_language()
                                            detection = False
                                        except:
                                            print('error while getting detected language')
                                    if detected_language == 'es':
                                        results = identify_special_characters(
                                            text, spanish_pipeline, tag_map, emoticons_dict,
                                            emojis_dict, emojis_list, variation_selectors,
                                            complementary_characters_dict, emoticon_count,
                                            emojis_count, complementary_characters_count)
                                        spaced_text = results[0]
                                        final_clean_text = results[1]
                                        emoticons += copy.deepcopy(results[2])
                                        emojis += copy.deepcopy(results[3])
                                        complementary_characters += copy.deepcopy(results[4])
                                        emoticon_count = results[5]
                                        emojis_count = results[6]
                                        complementary_characters_count = results[7]
                                        if len(results[2]) != 0 or len(results[3]) != 0 or len(results[4]) != 0:
                                            texts.append(spaced_text + '\t' + final_clean_text)
                                        count += 1
                            print(count)
                            if count == stop:
                                break
    all_from_tweets.close()
    client_from.close()
    p_file.write("Emoticons " + str(len(emoticons)) + "\n")
    emoticons_counter = Counter(emoticons).most_common()
    emoticons_counter_sorted = sorted(emoticons_counter, key=lambda tup: tup[1])
    for emoticon in emoticons_counter_sorted:
        p_file.write(str(emoticon[0]) + "\t" + str(emoticon[1]) + "\n")
    p_file.write("Total Emoticons: " + str(emoticon_count) + ". Proportion: " + str(emoticon_count / stop) + "\n")
    p_file.write("Emojis " + str(len(emojis)) + "\n")
    emojis_counter = Counter(emojis).most_common()
    emojis_counter_sorted = sorted(emojis_counter, key=lambda tup: tup[1])
    for emoji in emojis_counter_sorted:
        p_file.write(str(emoji[0]) + "\t" + str(emoji[1]) + "\n")
    p_file.write("Total Emojis: " + str(emojis_count) + ". Proportion: " + str(emojis_count / stop) + "\n")
    p_file.write("Complementary Characters " + str(len(complementary_characters)) + "\n")
    cc_counter = Counter(complementary_characters).most_common()
    cc_counter_sorted = sorted(cc_counter, key=lambda tup: tup[1])
    for cc in cc_counter_sorted:
        p_file.write(str(cc[0]) + "\t" + str(cc[1]) + "\n")
    p_file.write("Total Complementary Characters: " + str(complementary_characters_count) +
                 ". Proportion: " + str(complementary_characters_count / stop) + "\n")
    p_file.write("Texts without: " + "\n")
    for text in texts:
        p_file.write(text + "\n")
    p_file.write("Total elements in new list: " + str(count) + "\n")
    execution_time = time() - start_time
    p_file.write("Execution time: " + str(timedelta(seconds=execution_time)) + "\n")
    p_file.flush()
    p_file.close()
file.write(".") #crea una copia de la lista de entidades encontrada sin repeticiones def remove_duplicates(lst): newlist = [] for element in lst: if element not in newlist: newlist.append(element) return newlist #carga del modelo de spacy pre-entrenado con la entidad "DRUG" nlp = spacy.load('spacy_model_es_drug') #se agrega al pipeline el reconocimiento de emojis para poder analizar posible relación de estos con drogas emoji = Emoji(nlp) nlp.add_pipe(emoji, first=True) path = 'C:/Users/Gaston Migone/Desktop/Gastón/Drug_Train/Files/*.csv' files = glob.glob(path) for name in files: try: print(f"\nAnalizando archivo: {name}... ") doc = open(name, 'r', encoding="utf8").read() data = [(x.strip(), len(x) + 1) for x in doc.splitlines() if x] drugs_list = [] per_list = [] loc_list = [] org_list = [] misc_list = [] me_list = []
def test_integration(nlp):
    emoji = Emoji(nlp)
    nlp.add_pipe(emoji, last=True)
    assert nlp.pipe_names[-1] == 'emoji'
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob
import lemminflect
from collections import Counter, defaultdict
from spacy_langdetect import LanguageDetector
from spacymoji import Emoji
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
# from afinn import Afinn
import re
import json

nlp = spacy.load('en_core_web_md')
nlp.add_pipe(Emoji(nlp, merge_spans=False), first=True)
nlp.add_pipe(SpacyTextBlob())
nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)

# Comment out for quick summary
from summarizer import Summarizer
model = Summarizer()

# afinn = Afinn()


def get_summary(raw_text, category_list=[]):
    # file = 'raw_text.txt'
    # with open(file) as f:
def simple_identification():
    client_from = MongoClient()
    db_from = client_from["SSD"]
    coll_from = db_from["raw_data"]
    start_time = time()
    date = datetime.today().strftime("%Y_%m_%d-%H_%M_%S")
    path_to_file = date + " - DetectRegexEmoticonsRawData_Performance.txt"
    p_file = codecs.open(path_to_file, encoding='utf-8', mode='a')
    p_file.write(date + " Detecting Emoticons with Regex Expression Test - Local Execution" + "\n")
    p_file.flush()
    # II. Prepare data
    p_file.write("Preparing initial data ... " + "\n")
    path_to_configuration = food_detection_root.ROOT_DIR + os.path.sep + 'configuration' + os.path.sep \
        + 'configuration.ini'
    config = ConfigParser(interpolation=ExtendedInterpolation())
    config.read_file(codecs.open(path_to_configuration, "r", "utf8"))
    # print(emoticons_dict)
    # 3. Configure Spanish POS tagger
    spanish_pipeline = spacy.load('es')
    emoji = Emoji(spanish_pipeline)
    spanish_pipeline.add_pipe(emoji, first=True)
    all_from_tweets = coll_from.find()
    count = 0
    stop = 1000
    p_file.write("Total data to process: " + str(stop) + "\n")
    emoticons = []
    emoticon_pattern = r"([:;=]-?([\)D\(\C\co]+|[xX][dD]+))|([:<][3])|(<\\3)|()"
    emoticons_characters = ['X', 'x', 'd', 'D', ')', '(', ':', ';', '\'', '*', '=',
                            '/', '$', '#', '-', 'C', 'c', '<', '3', '0', 'O', 'o']
    texts = []
    no_texts = []
    emoticon_count = 0
    for raw_data in all_from_tweets:
        if 'text' in raw_data.keys() and 'lang' in raw_data.keys():
            if "place" in raw_data.keys():
                place = raw_data["place"]
                if place is not None:
                    if "country_code" in place.keys():
                        raw_data_country_code = raw_data["place"]["country_code"]
                        if raw_data_country_code in ["CO"]:
                            lang = raw_data["lang"]
                            text = raw_data['text']
                            raw_entities = raw_data['entities']
                            if lang == 'es':
                                results = identify_emoticons(text, raw_entities, spanish_pipeline,
                                                             emoticon_count, emoticon_pattern,
                                                             emoticons_characters)
                                text = results[0]
                                clean_text = results[1]
                                emoticon_count = results[2]
                                special_entities = results[3]
                                emoticons += copy.deepcopy(results[4])
                                if len(results[4]) != 0:
                                    texts.append(text + '\t' + clean_text + '\t' + str(special_entities))
                                else:
                                    no_texts.append(text + '\t' + clean_text)
                                count += 1
                            else:
                                if len(text) >= 3:
                                    blob = TextBlob(text)
                                    detection = True
                                    detected_language = ''
                                    while detection:
                                        try:
                                            detected_language = blob.detect_language()
                                            detection = False
                                        except:
                                            print('error while getting detected language')
                                    if detected_language == 'es':
                                        results = identify_emoticons(text, raw_entities, spanish_pipeline,
                                                                     emoticon_count, emoticon_pattern,
                                                                     emoticons_characters)
                                        text = results[0]
                                        clean_text = results[1]
                                        emoticon_count = results[2]
                                        special_entities = results[3]
                                        emoticons += copy.deepcopy(results[4])
                                        if len(results[4]) != 0:
                                            texts.append(text + '\t' + clean_text + '\t' + str(special_entities))
                                        else:
                                            no_texts.append(text + '\t' + clean_text)
                                        count += 1
                            print(count)
                            if count == stop:
                                break
    all_from_tweets.close()
    client_from.close()
    p_file.write("Emoticons " + str(len(emoticons)) + "\n")
    emoticons_counter = Counter(emoticons).most_common()
    emoticons_counter_sorted = sorted(emoticons_counter, key=lambda tup: tup[1])
    for emoticon in emoticons_counter_sorted:
        p_file.write(str(emoticon[0]) + "\t" + str(emoticon[1]) + "\n")
    p_file.write("Total Emoticons: " + str(emoticon_count) + ". Proportion: " + str(emoticon_count / stop) + "\n")
    p_file.write("TEXTS WITH EMOTICONS: \n")
    for text in texts:
        p_file.write(text + "\n")
    p_file.write("TEXTS WITHOUT EMOTICONS: \n")
    for text in no_texts:
        p_file.write(text + "\n")
    p_file.write("Total elements in new list: " + str(count) + "\n")
    execution_time = time() - start_time
    p_file.write("Execution time: " + str(timedelta(seconds=execution_time)) + "\n")
    p_file.flush()
    p_file.close()
def simple_identification():
    client_from = MongoClient()
    db_from = client_from["SSD"]
    coll_from = db_from["raw_data"]
    start_time = time()
    date = datetime.today().strftime("%Y_%m_%d-%H_%M_%S")
    path_to_file = date + " - DetectEmojisWithSpacymoji_Performance.txt"
    p_file = codecs.open(path_to_file, encoding='utf-8', mode='a')
    p_file.write(date + " Detecting Emojis with Spacymoji Test - Local Execution" + "\n")
    p_file.flush()
    # II. Prepare data
    p_file.write("Preparing initial data ... " + "\n")
    path_to_configuration = food_detection_root.ROOT_DIR + os.path.sep + 'configuration' + os.path.sep \
        + 'configuration.ini'
    config = ConfigParser(interpolation=ExtendedInterpolation())
    config.read_file(codecs.open(path_to_configuration, "r", "utf8"))
    path = food_detection_root.ROOT_DIR + os.path.sep + 'data' + os.path.sep
    # Read complementary characters
    complementary_characters_list_file = codecs.open(
        path + "list - complementary_characters.txt", encoding='utf-8')
    complementary_characters_list = complementary_characters_list_file.read().splitlines()
    complementary_characters_list_file.close()
    complementary_characters_dict = {}
    for aux in complementary_characters_list:
        aux_char = aux.split('\t')
        complementary_characters_dict[aux_char[2]] = [aux_char[1], aux_char[3]]
    # print(complementary_characters_dict)
    # 3. Configure Spanish POS tagger
    spanish_pipeline = spacy.load('es')
    emoji = Emoji(spanish_pipeline)
    spanish_pipeline.add_pipe(emoji, first=True)
    tag_map = spacy.es.TAG_MAP
    # start
    all_from_tweets = coll_from.find()
    count = 0
    stop = 1000
    p_file.write("Total data to process: " + str(stop) + "\n")
    for raw_data in all_from_tweets:
        if 'text' in raw_data.keys() and 'lang' in raw_data.keys():
            if "place" in raw_data.keys():
                place = raw_data["place"]
                if place is not None:
                    if "country_code" in place.keys():
                        raw_data_country_code = raw_data["place"]["country_code"]
                        if raw_data_country_code in ["CO"]:
                            lang = raw_data["lang"]
                            text = raw_data['text']
                            if lang == 'es':
                                identify_special_characters(text, spanish_pipeline, tag_map, p_file)
                                count += 1
                            else:
                                if len(text) >= 3:
                                    blob = TextBlob(text)
                                    detection = True
                                    detected_language = ''
                                    while detection:
                                        try:
                                            detected_language = blob.detect_language()
                                            detection = False
                                        except:
                                            print('error while getting detected language')
                                    if detected_language == 'es':
                                        identify_special_characters(text, spanish_pipeline, tag_map, p_file)
                                        count += 1
                            print(count)
                            if count == stop:
                                break
    all_from_tweets.close()
    client_from.close()
    p_file.write("Total elements in new list: " + str(count) + "\n")
    execution_time = time() - start_time
    p_file.write("Execution time: " + str(timedelta(seconds=execution_time)) + "\n")
    p_file.flush()
    p_file.close()
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd

analyzer = SentimentIntensityAnalyzer()

import spacy
from spacymoji import Emoji

nlp = spacy.load('en')
emoji = Emoji(nlp)
nlp.add_pipe(emoji, first=True)
nlp1 = spacy.load('en')


def add_compound(set):
    emotion_set = []
    for index, row in set.iterrows():
        vs = analyzer.polarity_scores(row['twitters'])
        emotion_set.append(vs['compound'])
    set['emotion_score'] = emotion_set
    return set


def add_emojis(frame):
    emojisite = []
    for index, row in frame.iterrows():
        num_of_emoji = 0
        post = row['twitters']
        try:
            tokens = nlp(post)