def get_data():
    """Download the IT-staff salary table and return it as a DataFrame.

    The table is fetched from stats191.stanford.edu and parsed with
    ``pd.read_table``.  The first rows are printed as a quick look at the
    data before the frame is returned.

    Variables
    ---------
    S : outcome, salaries for IT staff in a corporation
    X : predictor, experience (years)
    E : predictor, education (1=Bachelor's, 2=Master's, 3=Ph.D)
    M : predictor, management (1=management, 0=not management)

    Sample rows::

            S  X  E  M
        13876  1  1  1
        11608  1  3  0
        18701  1  3  1
        11283  1  2  0
        11767  1  3  0

    Returns
    -------
    pandas.DataFrame
        The parsed salary table.
    """
    # Local import keeps this function self-contained.
    from contextlib import closing

    url = 'http://stats191.stanford.edu/data/salary.table'
    # closing() releases the HTTP response even when parsing raises, and
    # works with file-like objects that are not context managers themselves.
    with closing(urlopen(url)) as fh:
        df = pd.read_table(fh)
    print(df.head())
    return df
def get_data():
    """Download the job-test table and return it as a DataFrame.

    The table is fetched from stats191.stanford.edu and parsed with
    ``pd.read_table``.  The first rows are printed as a quick look at the
    data before the frame is returned.

    Variable    Description
    =====================================================================
    TEST        Job aptitude test score
    MINORITY    1 if applicant could be considered minority, 0 otherwise
    JPERF       Job performance evaluation

    Sample rows::

           TEST  MINORITY  JPERF
        0  0.28         1   1.83
        1  0.97         1   4.59
        2  1.25         1   2.97
        3  2.46         1   8.14
        4  2.51         1   8.00

    Returns
    -------
    pandas.DataFrame
        The parsed job-test table.
    """
    # Local import keeps this function self-contained.
    from contextlib import closing

    url = 'http://stats191.stanford.edu/data/jobtest.table'
    # closing() releases the HTTP response even when parsing raises, and
    # works with file-like objects that are not context managers themselves.
    with closing(urlopen(url)) as fh:
        df = pd.read_table(fh)
    print(df.head())
    return df
from statsmodels.compat import urlopen import numpy as np np.set_printoptions(precision=4, suppress=True) import pandas as pd pd.set_option("display.width", 100) import matplotlib.pyplot as plt from statsmodels.formula.api import ols from statsmodels.graphics.api import interaction_plot, abline_plot from statsmodels.stats.anova import anova_lm try: salary_table = pd.read_csv('salary.table') except: # recent pandas can read URL without urlopen url = 'http://stats191.stanford.edu/data/salary.table' fh = urlopen(url) salary_table = pd.read_table(fh) salary_table.to_csv('salary.table') E = salary_table.E M = salary_table.M X = salary_table.X S = salary_table.S # Take a look at the data: plt.figure(figsize=(6, 6)) symbols = ['D', '^'] colors = ['r', 'g', 'blue'] factor_groups = salary_table.groupby(['E', 'M']) for values, group in factor_groups:
from __future__ import print_function from statsmodels.compat import urlopen import numpy as np np.set_printoptions(precision=4, suppress=True) import statsmodels.api as sm import pandas as pd import matplotlib.pyplot as plt from statsmodels.formula.api import ols from statsmodels.graphics.api import interaction_plot, abline_plot from statsmodels.stats.anova import anova_lm try: salary_table = pd.read_csv('salary.table') except: # recent pandas can read URL without urlopen url = 'http://stats191.stanford.edu/data/salary.table' fh = urlopen(url) salary_table = pd.read_table(fh) salary_table.to_csv('salary.table') E = salary_table.E M = salary_table.M X = salary_table.X S = salary_table.S # Take a look at the data: plt.figure(figsize=(6,6)) symbols = ['D', '^'] colors = ['r', 'g', 'blue'] factor_groups = salary_table.groupby(['E','M'])
from nltk.tokenize import sent_tokenize from bs4 import BeautifulSoup from serial.request import Request from statsmodels.compat import urlopen from nltk.corpus import stopwords from string import punctuation from nltk.probability import FreqDist from collections import defaultdict from heapq import nlargest stp = set(stopwords.words('portuguese') + list(punctuation)) link = Request( 'http://ultimosegundo.ig.com.br/politica/2017-04-25/reforma-da-previdencia.html', headers={'User-Agent': 'Mozilla/5.0'}) pagina = urlopen(link).read().decode('utf-8', 'ignore') soup = BeautifulSoup(pagina, "lxml") texto = soup.find(id="noticia").text sentencas = sent_tokenize(texto) palavras = word_tokenize(texto.lower()) stopwords = stp palavras_sem_stopwords = [ palavra for palavra in palavras if palavra not in stopwords ] frequencia = FreqDist(palavras_sem_stopwords) sentencas_importantes = defaultdict(int) for i, sentenca in enumerate(sentencas): for palavra in word_tokenize(sentenca.lower()): if palavra in frequencia: