def init_resources(self):
    self.punctuation_pattern = re.compile("|".join(PUNCTUATION))
    self.stemmer = None
    stopwords_path = os.path.join(
        os.path.dirname(assistant_dialog_skill_analysis.__file__),
        "resources",
        self.language_code,
        "stopwords",
    )
    if self.language_code == "en":
        from spacy.lang.en import English

        self.tokenizer = Tokenizer(English().vocab)
        self.stemmer = SnowballStemmer(language="english")
        self.stop_words = self.load_stop_words(stopwords_path)
    elif self.language_code == "fr":
        from spacy.lang.fr import French

        self.tokenizer = Tokenizer(French().vocab)
        self.stemmer = SnowballStemmer(language="french")
        self.stop_words = self.load_stop_words(stopwords_path)
    elif self.language_code == "de":
        from spacy.lang.de import German

        self.tokenizer = Tokenizer(German().vocab)
        self.stemmer = SnowballStemmer(language="german")
        self.stop_words = self.load_stop_words(stopwords_path)
    elif self.language_code == "it":
        from spacy.lang.it import Italian

        self.tokenizer = Tokenizer(Italian().vocab)
        self.stemmer = SnowballStemmer(language="italian")
        self.stop_words = self.load_stop_words(stopwords_path)
    elif self.language_code == "cs":
        from spacy.lang.cs import Czech

        # No Snowball stemmer exists for Czech, so self.stemmer stays None
        self.tokenizer = Tokenizer(Czech().vocab)
        self.stop_words = self.load_stop_words(stopwords_path)
    elif self.language_code == "pt":
        from spacy.lang.pt import Portuguese

        self.tokenizer = Tokenizer(Portuguese().vocab)
        self.stemmer = SnowballStemmer(language="portuguese")
        self.stop_words = self.load_stop_words(stopwords_path)
    elif self.language_code == "es":
        from spacy.lang.es import Spanish

        self.tokenizer = Tokenizer(Spanish().vocab)
        self.stemmer = SnowballStemmer(language="spanish")
        self.stop_words = self.load_stop_words(stopwords_path)
    else:
        raise Exception("language code %s is not supported" % self.language_code)
from spacy.lang.pt import Portuguese


def chunkstring_spacy(text):
    chunk_sentences = []
    nlp = Portuguese()
    nlp.add_pipe(nlp.create_pipe('sentencizer'))  # spaCy v2 API; in v3 use nlp.add_pipe('sentencizer')
    doc = nlp(text)
    for sent in doc.sents:
        chunk_sentences.append('>>en<<' + ' ' + sent.text)
    return chunk_sentences
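# Minimal usage sketch (not from the original project); the sample text is hypothetical.
chunks = chunkstring_spacy("Oi, tudo bem? O relatório foi enviado ontem.")
# -> roughly ['>>en<< Oi, tudo bem?', '>>en<< O relatório foi enviado ontem.'],
#    depending on how the sentencizer splits the text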
def preprocess_test(df):
    # spaCy tokenizers
    nlp_es = Spanish()
    nlp_pt = Portuguese()
    # Spanish and Portuguese masks to use the corresponding language tokenizer
    mask_spanish = df["language"] == 'spanish'
    mask_portuguese = df["language"] == 'portuguese'
    df.loc[mask_spanish, "tokens"] = df["title"].apply(normalize_text, args=(nlp_es,))
    df.loc[mask_portuguese, "tokens"] = df["title"].apply(normalize_text, args=(nlp_pt,))
    # The test file only needs id and tokens
    return df
def preprocess(df):
    # spaCy tokenizers
    nlp_es = Spanish()
    nlp_pt = Portuguese()
    # Spanish and Portuguese masks to use the corresponding language tokenizer
    mask_spanish = df["language"] == 'spanish'
    mask_portuguese = df["language"] == 'portuguese'
    df.loc[mask_spanish, "tokens"] = df["title"].apply(normalize_text, args=(nlp_es,))
    df.loc[mask_portuguese, "tokens"] = df["title"].apply(normalize_text, args=(nlp_pt,))
    # Training and validation dataframes need the __label__ prefix before the category
    df["label"] = df["category"].apply(lambda x: '__label__' + x)
    return df
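# Hedged sketch (not from the original source): the '__label__' prefix matches fastText's
# supervised-training format, so a training file could be written roughly like this,
# assuming df["tokens"] holds whitespace-joined token strings. write_fasttext_file is a
# hypothetical helper name.
def write_fasttext_file(df, path):
    with open(path, "w", encoding="utf-8") as f:
        for label, tokens in zip(df["label"], df["tokens"]):
            f.write(f"{label} {tokens}\n")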
import re
from string import punctuation

import pandas as pd
from spacy.lang.pt import Portuguese
from spacy.lang.pt.stop_words import STOP_WORDS


def most_words(df_tweets):
    def get_all_text(tweet_text):
        return ''.join(tweet_text)

    all_text = get_all_text(df_tweets.tweet).lower()

    # TEXT CLEANING
    # Special replacements
    all_text = all_text.replace('inteligencia', 'inteligência')
    all_text = all_text.replace('inteligência artificial', 'ia')
    all_text = all_text.replace('inteligencia artificial', 'ia')
    all_text = all_text.replace('artificial intelligence', 'ia')
    # Strip each punctuation character (str.replace would only match the whole string)
    all_text = re.sub('[%s]' % re.escape(punctuation), ' ', all_text)

    sub_text = re.sub(r'http\S+', '', all_text)
    sub_text = re.sub('[-|0-9]', ' ', sub_text)
    sub_text = re.findall(r'\w+', sub_text)
    sub_text = ' '.join(sub_text)

    # STOPWORDS REMOVAL
    spacy_stopwords = set(STOP_WORDS)
    nlp = Portuguese()
    stopwords_1 = [
        'pra', 'pro', 'tb', 'tbm', 'vc', 'aí', 'tá', 'ah', 'oq', 'ta', 'eh',
        'oh', 'msm', 'q', 'r', 'lá', 'ue', 'ué', 'pq', 'ti', 'tu', 'rn',
        'mt', 'n', 'mais', 'menos', 'pode', 'vai', 'da', 'de', 'do', 'uau', 'estao'
    ]
    stopwords_2 = ['a', 'as', 'e', 'es', 'i', 'o', 'os', 'u']
    stopwords_externo = pd.read_csv('portuguese_stopwords.txt', header=None)
    stopwords_3 = stopwords_externo.values.tolist()
    stopwords_4 = [row[0] for row in stopwords_3]
    stopword_list = set(stopwords_1 + stopwords_2 + stopwords_4)
    spacy_stopwords.update(stopword_list)

    doc = nlp.tokenizer(sub_text)
    # Drop spaCy's built-in stop words, then the extended custom list
    words = [token.text for token in doc if not token.is_stop]
    final_words = [w for w in words if w not in spacy_stopwords]
    return final_words
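# Hedged usage sketch (not from the original source): df_tweets is assumed to be a
# DataFrame with a 'tweet' text column; Counter then gives the most frequent remaining words.
from collections import Counter

top_words = Counter(most_words(df_tweets)).most_common(20)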
def lang_change(language):
    if language == 'en':
        from spacy.lang.en import English
        from spacy.lang.en.stop_words import STOP_WORDS
        parser = English()
        file = os.path.join("config_files", "config_spacy_en.yaml")
    elif language == 'de':
        from spacy.lang.de import German
        from spacy.lang.de.stop_words import STOP_WORDS
        parser = German()
        file = os.path.join("config_files", "config_spacy_de.yaml")
    elif language == 'es':
        from spacy.lang.es import Spanish
        from spacy.lang.es.stop_words import STOP_WORDS
        parser = Spanish()
        file = os.path.join("config_files", "config_spacy_es.yaml")
    elif language == 'pt':
        from spacy.lang.pt import Portuguese
        from spacy.lang.pt.stop_words import STOP_WORDS
        parser = Portuguese()
        file = os.path.join("config_files", "config_spacy_pt.yaml")
    elif language == 'fr':
        from spacy.lang.fr import French
        from spacy.lang.fr.stop_words import STOP_WORDS
        parser = French()
        file = os.path.join("config_files", "config_spacy_fr.yaml")
    elif language == 'it':
        from spacy.lang.it import Italian
        from spacy.lang.it.stop_words import STOP_WORDS
        parser = Italian()
        file = os.path.join("config_files", "config_spacy_it.yaml")
    elif language == 'nl':
        from spacy.lang.nl import Dutch
        from spacy.lang.nl.stop_words import STOP_WORDS
        parser = Dutch()
        file = os.path.join("config_files", "config_spacy_nl.yaml")
    else:
        raise ValueError("unsupported language code: %s" % language)
    # Build the config path relative to the working directory (portable across OSes)
    configfile_path = os.path.join(os.getcwd(), file)
    return parser, STOP_WORDS, configfile_path
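# Hedged usage sketch (not from the original source): pick the Portuguese pipeline,
# its stop-word set, and the (assumed) YAML config path in one call.
parser, stop_words, config_path = lang_change('pt')
doc = parser("Exemplo de frase em português.")
tokens = [t.text for t in doc]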
import string

from spacy.lang.pt import Portuguese
from spacy.lang.pt.stop_words import STOP_WORDS


def spacy_tokenizer(sentence):
    parser = Portuguese()
    punctuations = string.punctuation
    stop_words = STOP_WORDS

    # Create the token object, which is used to build documents with linguistic annotations
    mytokens = parser(sentence)

    # Lemmatize each token and convert it to lowercase
    mytokens = [
        word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_
        for word in mytokens
    ]

    # Remove stop words and punctuation
    mytokens = [
        word for word in mytokens
        if word not in stop_words and word not in punctuations
    ]

    # Return the preprocessed list of tokens
    return mytokens
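# Minimal usage sketch (not from the original source). Note that a blank Portuguese()
# pipeline has no trained lemmatizer, so lemma_ may be empty or unchanged depending on
# the spaCy version and installed lookup data.
print(spacy_tokenizer("Os gatos estavam dormindo no sofá."))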
def get_nlp(self, language):
    """
    Return the corresponding spaCy language object for the given language code,
    doing the required import on the fly. This is certainly not the standard
    approach, but since this endpoint is deployed to Heroku (space limitations)
    and is only invoked rarely, it is the fastest approach.
    """
    if language == "en":
        from spacy.lang.en import English
        return English()
    elif language == "fr":
        from spacy.lang.fr import French
        return French()
    elif language == "de":
        from spacy.lang.de import German
        return German()
    elif language == "es":
        from spacy.lang.es import Spanish
        return Spanish()
    elif language == "pt":
        from spacy.lang.pt import Portuguese
        return Portuguese()
    else:
        return {"error": "invalid or not supported language entered"}
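# Hedged usage sketch (not from the original source): the enclosing class is not shown,
# so the method is called here on a hypothetical `service` instance.
nlp = service.get_nlp("pt")
if isinstance(nlp, dict):  # error payload for unsupported languages
    print(nlp["error"])
else:
    tokens = [t.text for t in nlp("Bom dia, mundo!")]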
    # (tail of a text-cleaning helper; the function signature is not shown in this excerpt)
    phrase = phrase.replace(punct, ' ')
    for o, r in RM:
        phrase = re.sub(o, r, phrase, flags=re.MULTILINE)
    # Extra cleanup: drop numbers and very short tokens
    phrase = word_tokenize(phrase)
    clean_frase = []
    clfa = clean_frase.append
    for palavra in phrase:
        if not is_number(palavra) and len(palavra) > 2:
            clfa(palavra)
    return ' '.join(clean_frase) if join else clean_frase


# GLOBALS
NLP = Portuguese()
# STEMMER = nltk.stem.RSLPStemmer()
STEMMER = nltk.stem.SnowballStemmer('portuguese')
STOPWORDS, PUNCT = _get_stopwords()
RM = [
    (r'(http[s]*?:\/\/)+.*[\r\n]*', r''),
    (r'@', r''),
    (r'\n+', r' . '),
    (r'"', r' '),
    (r'\'', r' '),
    (r'#', r''),
    (r'(RT)', r''),
    (r'[…]', ' . '),
    (r'[0-9]*', r''),
    (r'“', r''),
    (r'”', ''),
from spacy.lang.pt import Portuguese
from spacy.lang.en import English

JobDescript = "Job Purpose: Serve as an advanced technical expert for analyzing and identifying security vulnerabilities across penetration testing services, and automated SAST/DAST scans. Technical team lead to coach and assist with the implementation, administration, support of enterprise DevSecOps tooling for security automation and identification of security vulnerabilities within the CI/CD pipeline. Work close with software development teams to implement SDL (Secure Development Lifecycle) across the organization.*Required Job Qualifications: Bachelors Degree with 4 years IT security experience OR 6 years of experience.Fluent in English. All interviews will be conducted in English.Experience and understanding of DevSecOps and securing CI/CD pipelineProficient with penetration testing and web application assessment tools to discover security vulnerabilitiesExperience in the following: software development, web applications, cybersecurity, networking protocols and their related implementations.Experience with and understanding of compiled and interpreted programs and the types of security issues possible in each; database systems, web servers, application servers, firewalls, and different types of middleware.Understanding of Application Security and the OWASP top 10 principlesProficiency in education engineers on application securityUnderstanding of the current threat and vulnerability landscapeMust be detail-oriented, possess programming skills in at least of of these languages: Java, Python, C/C++, .Net, JavaScript, Go.Verbal communications skills and concise written communication skillsPreferred Job Qualifications: Bachelors OR Masters Degree in Computer Science, Information Systems, or other related field. Or equivalent work experience preferredExperience with implementation, configuration, and administration of enterprise SAST, DAST, and triaging SAST/DAST reports.Expertise with technical delivery of identified security vulnerabilities and their risks to technology and application ownersAbility to recommend mitigating solutions to remediate risk associated with vulnerabilitiesStrong ability to conduct verification and validation tasks for remediation and mitigation controlsAbility to review, evaluate, and scope project engagements for vulnerability assessmentsExperience and desire in coaching the vulnerability operations team on technical skills for evaluating, testing, rating, and discovering security vulnerabilitiesAbility to conduct ad-hoc dynamic web and mobile application testingAssist with SAST, DAST, and secrets management tooling implementation, administration, support, and tool evaluation/recommendationsExperience working as a subject matter expert for reviewing static and dynamic code analysis findings, confirming false positives, creating new audit queries to fine tune toolsExperience partnering with devops and development teams with threat modeling and assist in grooming security championsAbility to write reports including recommendations, root cause analysis, security summary analysis, and project roadmapsExperience maintaining knowledge of cybersecurity trends and changing technologies, and providing recommendations for adaptation of new technologies for vulnerability monitoringSecurity certifications (such as OSCP or CISSP) and published CVEs would be a plus.This is a 100% remote position where ideal candidates will work directly with one of our clients in the USA central time zone. Black River is a leader in cybersecurity in Chicago area helping companies to build their Secure Development Lifecycle with the right staff.*Job Types: Full-time, ContractSalary: R$8,000.00 - R$10,000.00 per monthExperience:cybersecurity (Preferred)software development (Required)Language:Ingles (Required)Work Remotely:Yes"

nlp_en = English()
doc_en = nlp_en(JobDescript)

nlp_pt = Portuguese()
doc_pt = nlp_pt(JobDescript)

# print(doc_en.text)
# print(doc_pt.text)

for token_en in doc_en:
    print(token_en)

# for token_pt in doc_pt:
#     print(token_pt)
import pandas as pd
from sklearn import preprocessing
from spacy.lang.en import English
from spacy.lang.es import Spanish
from spacy.lang.pt import Portuguese

# Constants - hyperparameters
interactions_scores_dict = {
    'VIEW': 1,
    'BOOKMARK': 2,
    'FOLLOW': 3,
    'LIKE': 4,
    'COMMENT CREATED': 5
}

# Global objects
interactions_df = pd.read_csv('interactions.csv')
articles_df = pd.read_csv('articles.csv')
person_le = preprocessing.LabelEncoder()
tokens_le = preprocessing.LabelEncoder()
hidden_dimensions = 250
language_objects = {"en": English(), "pt": Portuguese(), "es": Spanish()}
tokenizers = {}
summaries = {}
filter_regex = "[^A-Za-z0-9]+"
batch_size = 10000
max_iterations = 100000
l2_lambda = 0.001


# We summarize each article with spaCy's TextRank implementation. This eliminates most of the noisy
# information in the texts. Then we apply tf-idf analysis to the article summaries. For every unique
# token in the obtained corpus of summaries, we calculate the expected tf-idf score over all articles.
# Then we sort the tokens in descending order of their expected tf-idf scores. The first 5000 tokens
# will constitute the representing tokens of our article corpus.
def create_article_tokens():
    def identity_tokenizer(text):
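# Standalone hedged sketch (not the project's actual code) of the token-selection idea
# described in the comment above: tf-idf over the article summaries, mean score per token
# across articles, keep the top 5000. `summaries` is assumed to map article ids to summary
# strings, and `select_top_tokens` is a hypothetical helper name.
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

def select_top_tokens(summaries, top_k=5000):
    texts = list(summaries.values())
    vectorizer = TfidfVectorizer(token_pattern=r"[A-Za-z0-9]+")
    tfidf = vectorizer.fit_transform(texts)               # (n_articles, n_tokens)
    mean_scores = np.asarray(tfidf.mean(axis=0)).ravel()  # expected tf-idf per token
    vocab = np.array(vectorizer.get_feature_names_out())
    order = np.argsort(mean_scores)[::-1][:top_k]         # highest expected tf-idf first
    return vocab[order].tolist()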
import re

import spacy
from nltk.tokenize import word_tokenize
from spacy.lang.pt import Portuguese

pa_2caracter = re.compile(
    r'(?<=\s)[a-zA-Za-záàâãéèêíïóôõöúçñÁÀÂÃÉÈÍÏÓÔÕÖÚÇÑ]{0,2}(?=\s|,|;)')
pa_generico = re.compile(
    r'(fls|folha[s]?|peça[s]?|grifo[s]? nosso[s]?|-?[X]-?|CPF|CNPJ|LTDA)', re.I)
pa_espacos = re.compile(r'(\s{2,})')
re_espacos = ' '
pa_new_line = re.compile(r'(<br>|<\/p>)')
re_new_line = '\n'

nlp = spacy.load('pt_core_news_lg')
parser = Portuguese()


def apply_tokenize(sentence):
    return word_tokenize(sentence)


def remove_stopwords(sentence):
    all_stopwords = nlp.Defaults.stop_words
    # PortugueseStopWords is a project-specific helper whose import is not shown in this excerpt
    stopwordnez = PortugueseStopWords().OnlyStopWords()
    stopwordnez += all_stopwords
    stopwordnez = set(stopwordnez)
    return [
        word for word in sentence
        if word not in stopwordnez and len(word) > 2
    ]
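# Hedged usage sketch (not from the original source): tokenize with NLTK (requires the
# 'punkt' data) and then drop stop words and short tokens with the helpers above; assumes
# the pt_core_news_lg model and the PortugueseStopWords helper are available.
sample = "O processo foi arquivado nas folhas citadas pelo CPF informado."
print(remove_stopwords(apply_tokenize(sample)))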
import json
from pathlib import Path

from spacy.lang.pt import Portuguese

nlp = Portuguese()
ruler = nlp.add_pipe("entity_ruler")
with open("data/states/states_label.json") as f:
    patterns = json.load(f)
ruler.add_patterns(patterns)

Path("models/").mkdir(parents=True, exist_ok=True)
nlp.to_disk("models/pt_core_news_sm_addresses")
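# Hedged usage sketch (not from the original source): reload the serialized pipeline and
# run the entity ruler over a sample address; the entity labels depend on the patterns JSON.
import spacy

nlp = spacy.load("models/pt_core_news_sm_addresses")
doc = nlp("Rua das Flores, 123, São Paulo, SP")
print([(ent.text, ent.label_) for ent in doc.ents])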