Example #1
    def test_hidden(self):
        paths = Root(__file__, ignore_hidden=False).paths()
        hidden = paths.testdir.Turing.filepaths
        paths = Root(__file__, ignore_hidden=True).paths()
        not_hidden = paths.testdir.Turing.files

        self.assertEqual(len(hidden), 3)
        self.assertEqual(len(not_hidden), 2)
Example #2
    def test_bad_dir(self):
        with self.assertRaises(Exception):
            Root(__file__, 2).paths()
Example #3
    def test_files(self):
        paths = Root(__file__).paths()
        files = paths.testdir.Jetsons.files
        self.assertEqual(set(files), {'George', 'Jane'})
Example #4
    def test_no_paths(self):
        with self.assertRaises(Exception):
            Root(depth=1).paths()
Example #5
    def test_dir_path(self):
        paths = Root(__file__).paths()
        testdir = paths.testdir.path
        truth = os.path.abspath(os.path.dirname(__file__)) + '/testdir/'
        self.assertEqual(testdir, truth)
Example #6
import pandas as pd
from filepaths import Root
from spacy.lang.en import English


paths = Root(__file__, 1).paths()
clean = paths.data.clean.path

pd.set_option('display.max_columns', 100)

my_stops = ['>', '<', 'p', '/p', 's', 'o', 't', ', ', 'd', '444444',
            '0pt', '1pt', '2pt', '4pt', '10pt', '12pt', '14pt', '15pt',
            '0px', '1px', '2px', '4px', '10px', '12px', '14px', '15px',
            'rgb', '255', '0', 'li', 'div', 'u', 'b', '0001pt', '39', '51',
            'meta', 'font', 'size', 'arial', 'nbsp', 'align', 'justify',
            'href', 'style', 'quot', 'msonormal', 'serif', 'text', 'ldquo',
            'rdquo', 'height', 'text', 'mso', 'san', 'margin', 'class', 'tab',
            'roman', 'times', 'http', 'www', 'html', 'background', 'pad',
            'bidi', 'color', 'bidi', 'san', 'rsquo', 'br', 'spin', 'letter',
            'spacing', 'space', 'hyphenate', 'place', 'line', 'placename',
            'placetype', 'border', 'box', 'normal', 'com', 'url', 'link',
            'publish', 'lsdexception', '00', '000', '000000', 'river',
            'family', 'water', 'boat', 'stay', 'helvetica', 'st', 'inherit',
            'width', 'false', 'face', 'non', '51', 'say', 'raft', 'rapid',
            'year', '1', '2', '3', 'rescue', 'true', 'paddle', 'w',
            'lock', 'priority', 'accent', 'semihidden', 'unhidewhenused',
            'table', 'list', 'lock', 'semihidden', 'amp', 'bt', 'grid',
            'layout', 'mode', 'narrative', 'initial', 'variant', 'weight',
            'outline', 'baseline', 'datum', 'vertical', 'leave', 'image',
            'max', 'position', 'display', '68', 'https', 'right', 'ligature',
            'stockticker', '08', '11', '06', '12', 'pa', 'source', '11pt',
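# (The my_stops list is truncated in this listing.) Given the spacy.lang.en
# English import above, these custom stops are presumably folded into spaCy's
# default stop words before tokenizing -- a minimal sketch of that pattern,
# not the original code:
#
#     nlp = English()
#     nlp.Defaults.stop_words |= set(my_stops)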
Example #7
    def test_two_paths(self):
        with self.assertRaises(Exception):
            Root(__file__, 0, alt_path='Jetsons/').paths()
Example #8
import numpy as np
import pandas as pd
from filepaths import Root
from statsmodels.tools import add_constant
from statsmodels.discrete.discrete_model import Logit
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score

paths = Root(__file__, depth=1).paths()
train = paths.data.train.path
holdout = paths.data.holdout.path

train_df = pd.read_pickle(train + 'train.pkl')
train_df = train_df[train_df['age'] != 0]

holdout_df = pd.read_pickle(holdout + 'holdout.pkl')
holdout_df = holdout_df[holdout_df['age'] != 0]

ss = StandardScaler()

columns = ['rellevel', 'difficulty', 'experience', 'F']
names = ['const', 'rellevel', 'difficulty', 'experience']


def get_X_y(df):
    # Keep only the modeling columns and drop rows with missing values.
    df = df[columns]
    df = df.dropna()
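

# The listing is cut off above; the rest of get_X_y is not shown. Given the
# add_constant import and the names list ('const' first), a helper like this
# typically ends by splitting off the 'F' target and prepending an intercept
# column -- a hypothetical sketch, not the original code:
def get_X_y_sketch(df):
    df = df[columns].dropna()
    y = df['F'].values                       # binary target
    X = add_constant(df.drop(columns='F'))   # features plus 'const' column
    return X, y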
Example #9
import numpy as np
import pandas as pd
from filepaths import Root
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.feature_extraction.text import TfidfVectorizer
from tokenator import tokenize_and_lemmatize

paths = Root(1).paths()
clean = paths.data.clean.path

df = pd.read_pickle(clean + '/clean.pkl')

data = df['description']

vectorizer = TfidfVectorizer(ngram_range=(1, 2),
                             max_df=0.55,
                             max_features=100000,
                             token_pattern=None,
                             tokenizer=tokenize_and_lemmatize)

X = vectorizer.fit_transform(data)

range_n_clusters = [2, 3, 4, 5, 6, 8, 10]

scores = []
for n_clusters in range_n_clusters:

    clusterer = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = clusterer.fit_predict(X)
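    # The loop body is truncated in this listing. Given the silhouette_score
    # import and the scores list above, the remainder presumably scores each
    # clustering -- a minimal sketch of that pattern, not the original code:
    score = silhouette_score(X, cluster_labels)
    scores.append(score)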
Example #10
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from filepaths import Root
import sys
src = Root(__file__, 1).paths().src.path
sys.path.append(src)
from nlp_scorer import *
from tokenator import tokenize_and_lemmatize

ada = AdaBoostClassifier(n_estimators=50)

vectorizer = TfidfVectorizer(ngram_range=(1, 2),
                             max_df=0.55,
                             max_features=100000,
                             token_pattern=None,
                             tokenizer=tokenize_and_lemmatize)

binary(ada, vectorizer)
X_test, X_train, y_test, y_train = split_data(X, y_binary)

importances = np.mean([tree.feature_importances_ for tree in ada.estimators_],
                      axis=0)

important_idx = importances.argsort()[-1:-16:-1]
important_val = importances[important_idx]
important_wrd = []
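
# The listing stops here. A common next step is to map the top feature indices
# back to vocabulary terms -- a minimal sketch, assuming `vectorizer` was
# fitted inside the imported binary()/split_data() helpers (not shown):
features = vectorizer.get_feature_names_out()         # scikit-learn >= 1.0
important_wrd = [features[i] for i in important_idx]  # top 15 terms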
Example #11
import numpy as np
from filepaths import Root
import matplotlib.pyplot as plt

paths = Root(0).paths()
images = paths.images.path

inertia = [
    2422.489275321806, 2401.0146509817514, 2376.4660170141583,
    2358.2659294524606, 2342.375437232583, 2331.928273536303,
    2323.2218893501063, 2317.7068108677245, 2307.625672205139,
    2299.8464666019386
]

range_n_clusters = [2, 3, 4, 5, 6, 8, 10]

silhouette = [
    0.021031147139082912, 0.010593812029318254, 0.011169450306382025,
    0.013266511447292354, 0.01345184252350024, 0.01673858640961575,
    0.017921311429581137
]

silhouette_2 = [
    0.012350957951900533, 0.00781858377484346, 0.008483841502261651,
    0.009262873575787358, 0.009541112171262496, 0.010146434809065342,
    0.010636419349471717
]

x = np.arange(0, 10, 1) + 1

fig, ax = plt.subplots(figsize=(10, 6))
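
# The listing is truncated after the subplots call. Plotting the inertia elbow
# against the cluster counts in `x` would look like this -- a sketch, not the
# original code; the output filename is an assumption:
ax.plot(x, inertia, marker='o')
ax.set_xlabel('number of clusters')
ax.set_ylabel('KMeans inertia')
fig.savefig(images + 'elbow.png')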
Example #12
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from filepaths import Root
import sys
paths = Root(__file__, 0).paths()
images = paths.images.path
clean = paths.data.clean.path
src = paths.src.path
sys.path.append(src)
from tokenator import tokenize_and_lemmatize


df = pd.read_pickle(clean + '/clean.pkl')

X = df['description']
y = np.array(df['F'])

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    shuffle=True,
                                                    random_state=42)

vectorizer = TfidfVectorizer(ngram_range=(1, 2),
                             max_df=0.55,
                             max_features=100000,
                             token_pattern=None,