Example #1
    def init_resources(self):
        # Build the language-specific tokenizer, stemmer, and stop word list.
        self.punctuation_pattern = re.compile("|".join(PUNCTUATION))
        self.stemmer = None
        stopwords_path = os.path.join(
            os.path.dirname(assistant_dialog_skill_analysis.__file__),
            "resources",
            self.language_code,
            "stopwords",
        )
        if self.language_code == "en":
            from spacy.lang.en import English

            self.tokenizer = Tokenizer(English().vocab)
            self.stemmer = SnowballStemmer(language="english")
            self.stop_words = self.load_stop_words(stopwords_path)

        elif self.language_code == "fr":
            from spacy.lang.fr import French

            self.tokenizer = Tokenizer(French().vocab)
            self.stemmer = SnowballStemmer(language="french")
            self.stop_words = self.load_stop_words(stopwords_path)

        elif self.language_code == "de":
            from spacy.lang.de import German

            self.tokenizer = Tokenizer(German().vocab)
            self.stemmer = SnowballStemmer(language="german")
            self.stop_words = self.load_stop_words(stopwords_path)

        elif self.language_code == "it":
            from spacy.lang.it import Italian

            self.tokenizer = Tokenizer(Italian().vocab)
            self.stemmer = SnowballStemmer(language="italian")
            self.stop_words = self.load_stop_words(stopwords_path)

        elif self.language_code == "cs":
            from spacy.lang.cs import Czech

            self.tokenizer = Tokenizer(Czech().vocab)
            self.stop_words = self.load_stop_words(stopwords_path)

        elif self.language_code == "pt":
            from spacy.lang.pt import Portuguese

            self.tokenizer = Tokenizer(Portuguese().vocab)
            self.stemmer = SnowballStemmer(language="portuguese")
            self.stop_words = self.load_stop_words(stopwords_path)

        elif self.language_code == "es":
            from spacy.lang.es import Spanish

            self.tokenizer = Tokenizer(Spanish().vocab)
            self.stemmer = SnowballStemmer(language="spanish")
            self.stop_words = self.load_stop_words(stopwords_path)
        else:
            raise Exception(
                "language code %s is not supported" % self.language_code
            )
Example #2
def chunkstring_spacy(text):
    # Split the text into sentences and prefix each one with the
    # '>>en<<' target-language marker.
    chunk_sentences = []
    nlp = Portuguese()
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
    doc = nlp(text)
    for sent in doc.sents:
        chunk_sentences.append('>>en<<' + ' ' + sent.text)

    return chunk_sentences
Example #3
def preprocess_test(df):
    # Spacy Tokenizers
    nlp_es = Spanish()
    nlp_pt = Portuguese()
    # Spanish and Portuguese masks to use corresponding language tokenizer
    mask_spanish    = df["language"] == 'spanish'
    mask_portuguese = df["language"] == 'portuguese'
    df.loc[mask_spanish, "tokens"] = df["title"].apply(normalize_text,args=(nlp_es,))
    df.loc[mask_portuguese, "tokens"] = df["title"].apply(normalize_text,args=(nlp_pt,))
    # Test file only needs id and tokens
    return df
Example #4
def preprocess(df):
    # Spacy Tokenizers
    nlp_es = Spanish()
    nlp_pt = Portuguese()
    # Spanish and Portuguese masks to use corresponding language tokenizer
    mask_spanish    = df["language"] == 'spanish'
    mask_portuguese = df["language"] == 'portuguese'
    df.loc[mask_spanish, "tokens"] = df["title"].apply(normalize_text,args=(nlp_es,))
    df.loc[mask_portuguese, "tokens"] = df["title"].apply(normalize_text,args=(nlp_pt,))
    # Training and validation df need to have __label__ string before category 
    df["label"] = df["category"].apply(lambda x: '__label__'+ x)
    return df
Example #5
def most_words(df_tweets):
    def get_all_text(tweet_text):
        # Concatenate all tweets into a single string.
        return ''.join(tweet_text)

    all_text = get_all_text(df_tweets.tweet).lower()

    # TEXT CLEANING
    ### Special Replacement
    all_text = all_text.replace('inteligencia', 'inteligência')
    all_text = all_text.replace('inteligência artificial', 'ia')
    all_text = all_text.replace('inteligencia artificial', 'ia')
    all_text = all_text.replace('artificial intelligence', 'ia')
    # str.replace expects a single substring, so strip punctuation character by character
    for p in punctuation:
        all_text = all_text.replace(p, ' ')
    ###
    sub_text = re.sub(r'http\S+', '', all_text)
    sub_text = re.sub('[-|0-9]', ' ', sub_text)
    sub_text = re.findall('\\w+', sub_text)
    sub_text = ' '.join(sub_text)

    # STOPWORDS REMOVAL

    spacy_stopwords = set(STOP_WORDS)  # copy so the global STOP_WORDS set is not mutated
    nlp = Portuguese()

    stopwords_1 = [
        'pra', 'pro', 'tb', 'tbm', 'vc', 'aí', 'tá', 'ah', 'oq', 'ta',
        'eh', 'oh', 'msm', 'q', 'r', 'lá', 'ue', 'ué', 'pq', 'ti', 'tu',
        'rn', 'mt', 'n', 'mais', 'menos', 'pode', 'vai', 'da', 'de', 'do',
        'uau', 'estao'
    ]

    stopwords_2 = ['a', 'as', 'e', 'es', 'i', 'o', 'os', 'u']

    stopwords_externo = pd.read_csv('portuguese_stopwords.txt', header=None)
    stopwords_3 = stopwords_externo.values.tolist()

    stopwords_4 = []
    for i in stopwords_3:
        stopwords_4.append(i[0])

    stopword_list = set(stopwords_1 + stopwords_2 + stopwords_4)

    spacy_stopwords.update(stopword_list)

    doc = nlp.tokenizer(sub_text)
    words = [token.text for token in doc if not token.is_stop]
    final_words = [w for w in words if w not in spacy_stopwords]

    return final_words
Example #6
def lang_change(language):
    if language == 'en':
        from spacy.lang.en import English
        from spacy.lang.en.stop_words import STOP_WORDS
        parser = English()
        file = "\config_files\config_spacy_en.yaml"
        configfile_path = os.getcwd() + file
    elif language == 'de':
        from spacy.lang.de import German
        from spacy.lang.de.stop_words import STOP_WORDS
        parser = German()
        file = "\config_files\config_spacy_de.yaml"
        configfile_path = os.getcwd() + file
    elif language == 'es':
        from spacy.lang.es import Spanish
        from spacy.lang.es.stop_words import STOP_WORDS
        parser = Spanish()
        file = "\config_files\config_spacy_es.yaml"
        configfile_path = os.getcwd() + file
    elif language == 'pt':
        from spacy.lang.pt import Portuguese
        from spacy.lang.pt.stop_words import STOP_WORDS
        parser = Portuguese()
        file = "\config_files\config_spacy_pt.yaml"
        configfile_path = os.getcwd() + file
    elif language == 'fr':
        from spacy.lang.fr import French
        from spacy.lang.fr.stop_words import STOP_WORDS
        parser = French()
        file = "\config_files\config_spacy_fr.yaml"
        configfile_path = os.getcwd() + file
    elif language == 'it':
        from spacy.lang.it import Italian
        from spacy.lang.it.stop_words import STOP_WORDS
        parser = Italian()
        file = "\config_files\config_spacy_it.yaml"
        configfile_path = os.getcwd() + file
    elif language == 'nl':
        from spacy.lang.nl import Dutch
        from spacy.lang.nl.stop_words import STOP_WORDS
        parser = Dutch()
        file = "\config_files\config_spacy_nl.yaml"
        configfile_path = os.getcwd() + file
    else:
        raise ValueError("unsupported language code: %s" % language)

    return parser, STOP_WORDS, configfile_path
Example #7
def spacy_tokenizer(sentence):
    parser = Portuguese()
    punctuations = string.punctuation
    stop_words = spacy.lang.pt.stop_words.STOP_WORDS
    # Parse the sentence into a spaCy Doc (a blank pipeline only tokenizes).
    mytokens = parser(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [
        word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_
        for word in mytokens
    ]

    # Removing stop words
    mytokens = [
        word for word in mytokens
        if word not in stop_words and word not in punctuations
    ]

    # return preprocessed list of tokens
    return mytokens
Example #8
    def get_nlp(self, language):

        """"
        this method returns the corresponding spacy language model when 
        provided with a language. To do so it also does the required 
        import. This is certainly not the standard approach. 
        But as this endpoint will be deployed to Heroku (space limitation)
        and only be invoked rarely it is the fastest approach.
        """

        if language == "en":

            from spacy.lang.en import English
            return English()

        elif language == "fr":

            from spacy.lang.fr import French
            return French()

        elif language == "de":

            from spacy.lang.de import German
            return German()

        elif language == "es":

            from spacy.lang.es import Spanish
            return Spanish()

        elif language == "pt":

            from spacy.lang.pt import Portuguese
            return Portuguese()

        else:

            return {"error": "invalid or not supported language entered"}
Example #9
        phrase = phrase.replace(punct, ' ')
    for o, r in RM:
        phrase = re.sub(o, r, phrase, flags=re.MULTILINE)

    # Limpeza extra
    phrase = word_tokenize(phrase)
    clean_frase = []
    clfa = clean_frase.append
    for palavra in phrase:
        if not is_number(palavra) and len(palavra) > 2:
            clfa(palavra)
    return ' '.join(clean_frase) if join else clean_frase


# GLOBALS
NLP = Portuguese()
# STEMMER = nltk.stem.RSLPStemmer()
STEMMER = nltk.stem.SnowballStemmer('portuguese')
STOPWORDS, PUNCT = _get_stopwords()
RM = [
    (r'(http[s]*?:\/\/)+.*[\r\n]*', r''),
    (r'@', r''),
    (r'\n+', r' . '),
    (r'"', r' '),
    (r'\'', r' '),
    (r'#', r''),
    (r'(RT)', r''),
    (r'[…]', ' . '),
    (r'[0-9]*', r''),
    (r'“', r''),
    (r'”', ''),
Example #10
from spacy.lang.pt import Portuguese
from spacy.lang.en import English

JobDescript = "Job Purpose:  Serve as an advanced technical expert for analyzing and identifying security vulnerabilities across penetration testing services, and automated SAST/DAST scans. Technical team lead to coach and assist with the implementation, administration, support of enterprise DevSecOps tooling for security automation and identification of security vulnerabilities within the CI/CD pipeline. Work close with software development teams to implement SDL (Secure Development Lifecycle) across the organization.*Required Job Qualifications: Bachelors Degree with 4 years IT security experience OR 6 years of experience.Fluent in English. All interviews will be conducted in English.Experience and understanding of DevSecOps and securing CI/CD pipelineProficient with penetration testing and web application assessment tools to discover security vulnerabilitiesExperience in the following: software development, web applications, cybersecurity, networking protocols and their related implementations.Experience with and understanding of compiled and interpreted programs and the types of security issues possible in each; database systems, web servers, application servers, firewalls, and different types of middleware.Understanding of Application Security and the OWASP top 10 principlesProficiency in education engineers on application securityUnderstanding of the current threat and vulnerability landscapeMust be detail-oriented, possess programming skills in at least of of these languages: Java, Python, C/C++, .Net, JavaScript, Go.Verbal communications skills and concise written communication skillsPreferred Job Qualifications: Bachelors OR Masters Degree in Computer Science, Information Systems, or other related field. Or equivalent work experience preferredExperience with implementation, configuration, and administration of enterprise SAST, DAST, and triaging SAST/DAST reports.Expertise with technical delivery of identified security vulnerabilities and their risks to technology and application ownersAbility to recommend mitigating solutions to remediate risk associated with vulnerabilitiesStrong ability to conduct verification and validation tasks for remediation and mitigation controlsAbility to review, evaluate, and scope project engagements for vulnerability assessmentsExperience and desire in coaching the vulnerability operations team on technical skills for evaluating, testing, rating, and discovering security vulnerabilitiesAbility to conduct ad-hoc dynamic web and mobile application testingAssist with SAST, DAST, and secrets management tooling implementation, administration, support, and tool evaluation/recommendationsExperience working as a subject matter expert for reviewing static and dynamic code analysis findings, confirming false positives, creating new audit queries to fine tune toolsExperience partnering with devops and development teams with threat modeling and assist in grooming security championsAbility to write reports including recommendations, root cause analysis, security summary analysis, and project roadmapsExperience maintaining knowledge of cybersecurity trends and changing technologies, and providing recommendations for adaptation of new technologies for vulnerability monitoringSecurity certifications (such as OSCP or CISSP) and published CVEs would be a plus.This is a 100% remote position where ideal candidates will work directly with one of our clients in the USA central time zone. 
Black River is a leader in cybersecurity in Chicago area helping companies to build their Secure Development Lifecycle with the right staff.*Job Types: Full-time, ContractSalary: R$8,000.00 - R$10,000.00 per monthExperience:cybersecurity (Preferred)software development (Required)Language:Ingles (Required)Work Remotely:Yes"

nlp_en = English()
doc_en = nlp_en(JobDescript)

nlp_pt = Portuguese()
doc_pt = nlp_pt(JobDescript)

#print(doc_en.text)
#print(doc_pt.text)

for token_en in doc_en:
    print(token_en)

#for token_pt in doc_pt:
#    print(doc_pt)
Example #11
# Constants - Hyperparameters
interactions_scores_dict = {
    'VIEW': 1,
    'BOOKMARK': 2,
    'FOLLOW': 3,
    'LIKE': 4,
    'COMMENT CREATED': 5
}

# Global objects
interactions_df = pd.read_csv('interactions.csv')
articles_df = pd.read_csv('articles.csv')
person_le = preprocessing.LabelEncoder()
tokens_le = preprocessing.LabelEncoder()
hidden_dimensions = 250
language_objects = {"en": English(), "pt": Portuguese(), "es": Spanish()}
tokenizers = {}
summaries = {}
filter_regex = "[^A-Za-z0-9]+"
batch_size = 10000
max_iterations = 100000
l2_lambda = 0.001

# We summarize each article with spaCy's TextRank implementation, which removes most of the noisy information
# in the texts. We then apply tf-idf analysis to the article summaries: for every unique token in the resulting corpus
# of summaries, we compute its expected tf-idf score over all articles and sort the tokens in descending order
# of that score. The first 5000 tokens constitute the representative tokens of our article corpus
# (a standalone sketch of this selection step follows this example).


def create_article_tokens():
    def identity_tokenizer(text):
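
The comment in Example #11 describes picking representative tokens by their expected tf-idf score over the article summaries. A minimal sketch of that selection step, assuming the summaries already exist as plain strings (the function name and the use of scikit-learn's TfidfVectorizer are illustrative; only the descending sort and the 5000-token cutoff come from the comment above):

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer


def select_representative_tokens(summaries, top_k=5000):
    # Fit tf-idf over the corpus of article summaries.
    vectorizer = TfidfVectorizer()
    tfidf = vectorizer.fit_transform(summaries)            # shape: (n_articles, n_tokens)
    # Expected (mean) tf-idf score of each token across all articles.
    expected_scores = np.asarray(tfidf.mean(axis=0)).ravel()
    tokens = np.array(vectorizer.get_feature_names_out())
    # Keep the top_k tokens with the highest expected scores.
    order = np.argsort(expected_scores)[::-1]
    return tokens[order][:top_k].tolist()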
Example #12
pa_2caracter = re.compile(
    r'(?<=\s)[a-zA-Za-záàâãéèêíïóôõöúçñÁÀÂÃÉÈÍÏÓÔÕÖÚÇÑ]{0,2}(?=\s|,|;)')

pa_generico = re.compile(
    r'(fls|folha[s]?|peça[s]?|grifo[s]? nosso[s]?|-?[X]-?|CPF|CNPJ|LTDA)',
    re.I)

pa_espacos = re.compile(r'(\s{2,})')
re_espacos = ' '

pa_new_line = re.compile(r'(<br>|<\/p>)')
re_new_line = '\n'

nlp = spacy.load('pt_core_news_lg')

parser = Portuguese()


def apply_tokenize(sentence):
    return word_tokenize(sentence)


def remove_stopwords(sentence):
    all_stopwords = nlp.Defaults.stop_words
    stopwordnez = PortugueseStopWords().OnlyStopWords()

    stopwordnez += all_stopwords
    stopwordnez = set(stopwordnez)
    return [
        word for word in sentence
        if word not in stopwordnez and len(word) > 2
    ]
Example #13
import json
from pathlib import Path

from spacy.lang.pt import Portuguese

nlp = Portuguese()

ruler = nlp.add_pipe("entity_ruler")
with open("data/states/states_label.json") as f:
    patterns = json.load(f)

ruler.add_patterns(patterns)

Path("models/").mkdir(parents=True, exist_ok=True)
nlp.to_disk("models/pt_core_news_sm_addresses")