Beispiel #1
0
def get_data():
    """
    Download the salary table and return it as a DataFrame.

    Outcome: S, salaries for IT staff in a corporation.
    Predictors:
        X, experience (years)
        E, education (1=Bachelor's, 2=Master's, 3=Ph.D)
        M, management (1=management, 0=not management)

    Sample of the data:

        S  X  E  M
    13876  1  1  1
    11608  1  3  0
    18701  1  3  1
    11283  1  2  0
    11767  1  3  0

    Returns the DataFrame produced by ``pd.read_table`` on the remote file.
    The first rows (``df.head()``) are also printed to stdout.
    """
    url = 'http://stats191.stanford.edu/data/salary.table'
    fh = urlopen(url)
    try:
        df = pd.read_table(fh)
    finally:
        # Release the network handle even when parsing raises.
        fh.close()
    print(df.head())
    return df
def get_data():
    """
    Download the job-test table and return it as a DataFrame.

    Variable    Description
    =====================================================================
    TEST        Job aptitude test score
    MINORITY    1 if applicant could be considered minority, 0 otherwise
    JPERF       Job performance evaluation

    Sample of the data:

    TEST     MINORITY  JPERF
    0  0.28         1   1.83
    1  0.97         1   4.59
    2  1.25         1   2.97
    3  2.46         1   8.14
    4  2.51         1   8.00

    Returns the DataFrame produced by ``pd.read_table`` on the remote file.
    The first rows (``df.head()``) are also printed to stdout.
    """
    # NOTE(review): an earlier ``get_data`` in this file loads salary.table;
    # if both live in one module, this later definition replaces it.
    url = 'http://stats191.stanford.edu/data/jobtest.table'
    fh = urlopen(url)
    try:
        df = pd.read_table(fh)
    finally:
        # Release the network handle even when parsing raises.
        fh.close()
    print(df.head())
    return df
from statsmodels.compat import urlopen
import numpy as np
np.set_printoptions(precision=4, suppress=True)

import pandas as pd
pd.set_option("display.width", 100)
import matplotlib.pyplot as plt
from statsmodels.formula.api import ols
from statsmodels.graphics.api import interaction_plot, abline_plot
from statsmodels.stats.anova import anova_lm

# Load the salary data, preferring the local cache file over the network.
try:
    salary_table = pd.read_csv('salary.table')
except Exception:
    # Cache missing or unreadable: download the table instead. Catching
    # Exception (not a bare ``except``) lets KeyboardInterrupt and
    # SystemExit propagate.
    # NOTE: recent pandas can read a URL directly, without urlopen.
    url = 'http://stats191.stanford.edu/data/salary.table'
    fh = urlopen(url)
    try:
        salary_table = pd.read_table(fh)
    finally:
        # Release the network handle even when parsing raises.
        fh.close()
    # index=False keeps the cached columns identical to the downloaded ones;
    # writing the index would add an "Unnamed: 0" column on the next read.
    salary_table.to_csv('salary.table', index=False)

# Expose the individual columns as module-level Series.
E = salary_table.E
M = salary_table.M
X = salary_table.X
S = salary_table.S

# Take a look at the data:

plt.figure(figsize=(6, 6))
# presumably one marker per M level and one color per E level -- confirm
# against the plotting loop that consumes them
symbols = ['D', '^']
colors = ['r', 'g', 'blue']
# One group per (E, M) combination present in the table.
factor_groups = salary_table.groupby(['E', 'M'])
for values, group in factor_groups:
from __future__ import print_function
from statsmodels.compat import urlopen
import numpy as np
np.set_printoptions(precision=4, suppress=True)
import statsmodels.api as sm
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.formula.api import ols
from statsmodels.graphics.api import interaction_plot, abline_plot
from statsmodels.stats.anova import anova_lm

# Load the salary data, preferring the local cache file over the network.
try:
    salary_table = pd.read_csv('salary.table')
except Exception:
    # Cache missing or unreadable: download the table instead. Catching
    # Exception (not a bare ``except``) lets KeyboardInterrupt and
    # SystemExit propagate.
    # NOTE: recent pandas can read a URL directly, without urlopen.
    url = 'http://stats191.stanford.edu/data/salary.table'
    fh = urlopen(url)
    try:
        salary_table = pd.read_table(fh)
    finally:
        # Release the network handle even when parsing raises.
        fh.close()
    # index=False keeps the cached columns identical to the downloaded ones;
    # writing the index would add an "Unnamed: 0" column on the next read.
    salary_table.to_csv('salary.table', index=False)

# Expose the individual columns as module-level Series.
E = salary_table.E
M = salary_table.M
X = salary_table.X
S = salary_table.S


# Take a look at the data:

plt.figure(figsize=(6, 6))
# presumably one marker per M level and one color per E level -- confirm
# against the plotting loop that consumes them
symbols = ['D', '^']
colors = ['r', 'g', 'blue']
# One group per (E, M) combination present in the table.
factor_groups = salary_table.groupby(['E', 'M'])
from nltk.tokenize import sent_tokenize
from bs4 import BeautifulSoup
from serial.request import Request
from statsmodels.compat import urlopen
from nltk.corpus import stopwords
from string import punctuation
from nltk.probability import FreqDist
from collections import defaultdict
from heapq import nlargest

# word_tokenize is called below but the import block above only brings in
# sent_tokenize; without this import the script fails with a NameError.
from nltk.tokenize import word_tokenize

# Tokens to discard: Portuguese stopwords plus every punctuation character.
stp = set(stopwords.words('portuguese') + list(punctuation))

# Build the request with an explicit browser-style User-Agent header.
link = Request(
    'http://ultimosegundo.ig.com.br/politica/2017-04-25/reforma-da-previdencia.html',
    headers={'User-Agent': 'Mozilla/5.0'})
resposta = urlopen(link)
try:
    # 'ignore' drops any bytes that are not valid UTF-8 instead of raising.
    pagina = resposta.read().decode('utf-8', 'ignore')
finally:
    # Release the HTTP connection even when reading or decoding raises.
    resposta.close()

soup = BeautifulSoup(pagina, "lxml")
# Text of the element whose id is "noticia".
# NOTE(review): find() returns None when no such element exists, which
# would make ``.text`` raise AttributeError -- confirm the page layout.
texto = soup.find(id="noticia").text
sentencas = sent_tokenize(texto)
palavras = word_tokenize(texto.lower())
# NOTE: this rebinds the imported ``stopwords`` corpus name to the plain set
# built above; from here on ``stopwords.words(...)`` is no longer available.
stopwords = stp
palavras_sem_stopwords = [
    palavra for palavra in palavras if palavra not in stopwords
]

# Frequency of each remaining (lower-cased) token in the article.
frequencia = FreqDist(palavras_sem_stopwords)
# Integer-valued mapping with default 0; presumably filled with per-sentence
# scores by the loop that follows -- confirm.
sentencas_importantes = defaultdict(int)
for i, sentenca in enumerate(sentencas):
    for palavra in word_tokenize(sentenca.lower()):
        if palavra in frequencia: