def __init__(self, grammar: Grammar, reduction: Reduction = None, verbose: bool = False):
    """
    Initialize a parser with some global parameters.

    :param grammar: A CFG grammar driving acceptable transitions.
    :param reduction: A mapping of a complex grammar to a simpler one.
    :param verbose: Enables additional output.
    """
    self._grammar = grammar
    self.__parse_stack = []  # A stack of parsed Symbols
    self.__input_stack = []  # A stack of raw input strings and reduced Symbols.
    self._reduction = reduction
    self._needs_prune = False
    self.verbose = verbose

    # Check that the necessary NLTK resources are available; download them if not.
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        print('Missing NLTK "punkt" package, downloading...')
        nltk.download('punkt')
    try:
        nltk.data.find('taggers/averaged_perceptron_tagger')
    except LookupError:
        print('Missing NLTK "Perceptron Tagger" package, downloading...')
        nltk.download('averaged_perceptron_tagger')

    # Create an artificial state frame to serve as the parse tree root.
    root_frame = StateFrame((Grammar.ROOT_SYM, 0, 0))
    root_frame.to_sym = [grammar.start_symbol]
    self.__state = root_frame

    # Generate the initial rule set as all rules accessible from the start symbol.
    self._set_looking_for(root_frame, create_all=True)
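# The check-and-download pattern above recurs throughout the snippets below.
# A minimal sketch of a reusable helper that factors it out; the name
# `ensure_nltk_resource` is hypothetical and only the documented
# nltk.data.find / nltk.download API is assumed.
import nltk


def ensure_nltk_resource(resource_path: str, package: str) -> None:
    """Download an NLTK package only if its data is not already installed.

    resource_path: path used by nltk.data.find, e.g. 'tokenizers/punkt'.
    package: package name used by nltk.download, e.g. 'punkt'.
    """
    try:
        nltk.data.find(resource_path)
    except LookupError:
        print(f'Missing NLTK "{package}" package, downloading...')
        nltk.download(package)


# Usage mirroring the checks in the constructor above:
ensure_nltk_resource('tokenizers/punkt', 'punkt')
ensure_nltk_resource('taggers/averaged_perceptron_tagger', 'averaged_perceptron_tagger')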
def test_word_stemming_filter():
    stim = ComplexTextStim(join(TEXT_DIR, 'sample_text.txt'),
                           columns='to', default_duration=1)

    # With all defaults (porter stemmer)
    filt = WordStemmingFilter()
    assert isinstance(filt.stemmer, nls.PorterStemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    target = ['some', 'sampl', 'text', 'for', 'test', 'annot']
    assert stems == target

    # Try a different stemmer
    filt = WordStemmingFilter(stemmer='snowball', language='english')
    assert isinstance(filt.stemmer, nls.SnowballStemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    assert stems == target

    # Handles a StemmerI instance passed directly
    stemmer = nls.SnowballStemmer(language='english')
    filt = WordStemmingFilter(stemmer=stemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    assert stems == target

    # Try lemmatization filter
    try:
        nltk.data.find('taggers/universal_tagset')
    except LookupError:
        nltk.download('universal_tagset')
    try:
        nltk.data.find('corpora/wordnet')
    except LookupError:
        nltk.download('wordnet')
    stim = ComplexTextStim(text='These are tests for Stemming filters')
    filt = WordStemmingFilter(stemmer='wordnet')
    lemmatized = filt.transform(stim)
    lemmas = [l.text for l in lemmatized]
    target = ['these', 'be', 'test', 'for', 'stem', 'filter']
    assert lemmas == target

    # Try case-sensitive lemmatization
    filt = WordStemmingFilter(stemmer='wordnet', case_sensitive=True)
    lemmatized = filt.transform(stim)
    lemmas = [l.text for l in lemmatized]
    target = ['These', 'be', 'test', 'for', 'Stemming', 'filter']
    assert lemmas == target

    # Fails on invalid values
    with pytest.raises(ValueError):
        filt = WordStemmingFilter(stemmer='nonexistent_stemmer')

    # Try a long text stim
    stim2 = TextStim(text='theres something happening here')
    filt = WordStemmingFilter()
    assert filt.transform(stim2).text == 'there someth happen here'
def data_preparation(self):
    """
    Loads sentences and POS-tagged sentences from one of the Brown, BNC News,
    or Indian corpora, for the subsequent train/test split.

    Returns:
    --------
    sentences (list): Sentences without POS-tags
    tagged_sentences (list): Sentences with POS-tags
    """
    if self.corpus == 'brown':
        tagged_sentences = brown.tagged_sents(categories='news')
        sentences = brown.sents(categories='news')
    elif self.corpus == 'bnc':
        root = find('corpora/bnc')
        bncnews = TaggedCorpusReader(root, 'bnc-news-wtp.txt',
                                     tagset='en-claws')
        if self.tagset is None:
            tagged_sentences = bncnews.tagged_sents()
        elif self.tagset == 'universal':
            tagged_sentences = bncnews.tagged_sents(tagset=self.tagset)
        sentences = bncnews.sents()
    elif self.corpus == 'indian':
        if self.lang in ['telugu', 'hindi', 'marathi', 'bangla']:
            tagged_sentences = indian.tagged_sents(f'{self.lang}.pos')
            sentences = indian.sents(f'{self.lang}.pos')
        else:
            raise ValueError(f'Language {self.lang!r} is not part of the Indian corpus.')
    return sentences, tagged_sentences
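# The actual train/test split happens downstream of data_preparation().
# A minimal sketch of that step; the `tagger` instance, the 0.8 ratio, and
# the variable names are illustrative assumptions, not from this codebase:
sentences, tagged_sentences = tagger.data_preparation()
cutoff = int(0.8 * len(tagged_sentences))
train_sents = tagged_sentences[:cutoff]   # tagged sentences used for training
test_sents = tagged_sentences[cutoff:]    # held-out tagged sentences for evaluation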
def __init__(self):
    # Make sure the NLTK stopwords corpus is installed; find() raises
    # LookupError (rather than returning a falsy value) when it is missing.
    try:
        find("corpora/stopwords.zip")
    except LookupError:
        download('stopwords')
    self.link_regex = re.compile(
        r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+'
        r'[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})',
        re.IGNORECASE)
    self.account_regex = re.compile(r"@\w*", re.IGNORECASE)
    self.low_fre_words = defaultdict(int)
    self.model = None
    self.labeled_data = None
    self.get_labeled_data()
import requests
from bs4 import BeautifulSoup
from operator import itemgetter
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import copy
from nltk.corpus import movie_reviews
import random
from nltk.tokenize import word_tokenize

try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')


def scraper():
    work = "https://api.nytimes.com/svc/movies/v2/reviews/all.json?api-key=rPRXhYeMN9E6OCRWs7704hENbvHAGmyK"
    res = requests.get(work)
    payload = res.json()
    movie_data = payload["results"]
    myLs = []
    # Adds the first 20 reviews to a list of dicts.
    for i in range(20):
        link = movie_data[i]["link"]
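# The snippet above is truncated before the imported VADER analyzer is used.
# A minimal sketch of how it is typically applied to scraped review text;
# the `review_text` value is illustrative, not from the original code:
sia = SentimentIntensityAnalyzer()
review_text = "A gripping, beautifully shot film."
scores = sia.polarity_scores(review_text)  # {'neg': ..., 'neu': ..., 'pos': ..., 'compound': ...}
print(scores['compound'])  # overall sentiment score in [-1, 1]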
import pickle
import random
import re
from collections import defaultdict

from nltk import download, word_tokenize
from nltk.data import find

try:
    find("corpora/stopwords.zip")
except LookupError:
    download('stopwords')

from nltk.classify import NaiveBayesClassifier
from nltk.corpus import stopwords

import rootpath

rootpath.append()
from backend.data_preparation.connection import Connection


class NLTKTest:
    def __init__(self):
        # Re-check the stopwords corpus inside the constructor as well;
        # find() raises LookupError when the resource is missing.
        try:
            find("corpora/stopwords.zip")
        except LookupError:
            download('stopwords')
        self.link_regex = re.compile(
            r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+'
            r'[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})',
            re.IGNORECASE)
        self.account_regex = re.compile(r"@\w*", re.IGNORECASE)
        self.low_fre_words = defaultdict(int)
        self.model = None
import os
import sys

import nltk
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

sys.path.append("src")
from reader import DSReader

try:
    nltk.data.find("corpora/wordnet")
except LookupError:
    nltk.download("wordnet")
try:
    nltk.data.find("corpora/stopwords")
except LookupError:
    nltk.download("stopwords")
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download('punkt')

dataset_path = os.path.abspath("tests/datasets/emails.csv")
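# A minimal sketch of how the sklearn imports above are typically combined;
# the 'text'/'spam' column names and loading `dataset_path` with pandas are
# assumptions, and any DSReader-specific preprocessing is omitted:
emails = pd.read_csv(dataset_path)  # assumed columns: 'text', 'spam'
X_train, X_test, y_train, y_test = train_test_split(
    emails['text'], emails['spam'], test_size=0.2, random_state=42)

pipeline = Pipeline([
    ('vect', CountVectorizer()),    # raw token counts
    ('tfidf', TfidfTransformer()),  # re-weight counts by TF-IDF
    ('clf', MultinomialNB()),       # Naive Bayes classifier
])
pipeline.fit(X_train, y_train)
print(confusion_matrix(y_test, pipeline.predict(X_test)))
print(cross_val_score(pipeline, emails['text'], emails['spam'], cv=5).mean())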
import sys
import random
import traceback

import spacy
import nltk

try:
    nlp = spacy.load("en_core_web_md")
except Exception:
    traceback.print_exc()
    print("Error loading Spacy", file=sys.stderr)
    print("Please run the following command:", file=sys.stderr)
    print("python -m spacy download en_core_web_md", file=sys.stderr)

try:
    nltk.data.find("corpora/wordnet")
except LookupError:
    nltk.download("wordnet")
try:
    nltk.data.find("taggers/averaged_perceptron_tagger")
except LookupError:
    nltk.download("averaged_perceptron_tagger")
try:
    nltk.data.find("sentiment/vader_lexicon.zip")
except LookupError:
    nltk.download("vader_lexicon")

from emora_stdm.state_transition_dialogue_manager.wordnet import related_synsets, wordnet_knowledge_base  # , lemmas_of
from nltk.corpus import wordnet
import regex
import re


def _process_args_set(args, vars):
    for i, e in enumerate(args):
        if isinstance(e, str) and "$" == e[0]:
#!/bin/python
import nltk

try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')

try:
    from nltk.corpus import wordnet as wn
except LookupError:
    nltk.download('wordnet', force=True)
    from nltk.corpus import wordnet as wn

import json
import segment
import convert
import data
import numpy as np
import re
from collections import defaultdict

# Load dictionary for use in making annotations:
dictionary = np.load("dict/epsd.npz", allow_pickle=True)["dictionary"].item()


def get_noun_hypernyms(word):
    """
    word: An English word
    returns: A set of Synset objects representing the given word's hypernyms.
             Only returns noun synsets.
    """
    all_hypernyms = wn.synsets(word)
import spacy
import sys
import random
import traceback

import nltk

try:
    nlp = spacy.load('en_core_web_md')
except Exception:
    traceback.print_exc()
    print('Error loading Spacy', file=sys.stderr)
    print('Please run the following command:', file=sys.stderr)
    print('python -m spacy download en_core_web_md', file=sys.stderr)

try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')
try:
    nltk.data.find('taggers/averaged_perceptron_tagger')
except LookupError:
    nltk.download('averaged_perceptron_tagger')
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')

from emora_stdm.state_transition_dialogue_manager.wordnet import \
    related_synsets, wordnet_knowledge_base, lemmas_of
from nltk.corpus import wordnet
import regex
import re


def _process_args_set(args, vars):
    for i, e in enumerate(args):
import pronouncing
import nltk

# Download the POS tagger if it is not already installed.
try:
    nltk.data.find("taggers/averaged_perceptron_tagger")
except LookupError:
    nltk.download('averaged_perceptron_tagger')

# Load common nouns.
with open("nounlist.txt") as f:
    common_nouns = set(f.read().split("\n"))

with open("bad-words.txt") as f:
    bad_words = set(f.read().split("\n"))


def make_sentence(query):
    """Creates a sentence using the "elf on the shelf" template."""
    global common_nouns
    rhymes = set(pronouncing.rhymes(query))
    # Candidate nouns are rhymes of the query with blacklisted words removed.
    choice_nouns = list(rhymes - bad_words)
    return choice_nouns
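# Example call: candidate rhymes for the template, filtered against the
# blacklist (output depends on the pronouncing CMU dictionary; "shelf" is
# only an illustrative query):
print(make_sentence("shelf"))  # e.g. ['elf', 'self', ...]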