def test_hidden(self):
    paths = Root(__file__, ignore_hidden=False).paths()
    hidden = paths.testdir.Turing.filepaths
    paths = Root(__file__, ignore_hidden=True).paths()
    not_hidden = paths.testdir.Turing.files
    self.assertEqual(len(hidden), 3)
    self.assertEqual(len(not_hidden), 2)
def test_bad_dir(self):
    with self.assertRaises(Exception):
        Root(__file__, 2).paths()
def test_files(self):
    paths = Root(__file__).paths()
    files = paths.testdir.Jetsons.files
    self.assertEqual(set(files), {'George', 'Jane'})
def test_no_paths(self):
    with self.assertRaises(Exception):
        Root(depth=1).paths()
def test_dir_path(self):
    paths = Root(__file__).paths()
    testdir = paths.testdir.path
    truth = os.path.abspath(os.path.dirname(__file__)) + '/testdir/'
    self.assertEqual(testdir, truth)
import pandas as pd
from filepaths import Root
from spacy.lang.en import English

paths = Root(__file__, 1).paths()
clean = paths.data.clean.path

pd.set_option('display.max_columns', 100)

# Custom stop words: mostly HTML/CSS/MSO markup debris left over from
# scraping, plus domain words too common here to be informative.
my_stops = [
    '>', '<', 'p', '/p', 's', 'o', 't', ', ', 'd', '444444',
    '0pt', '1pt', '2pt', '4pt', '10pt', '12pt', '14pt', '15pt',
    '0px', '1px', '2px', '4px', '10px', '12px', '14px', '15px',
    'rgb', '255', '0', 'li', 'div', 'u', 'b', '0001pt', '39',
    '51', 'meta', 'font', 'size', 'arial', 'nbsp', 'align',
    'justify', 'href', 'style', 'quot', 'msonormal', 'serif',
    'text', 'ldquo', 'rdquo', 'height', 'text', 'mso', 'san',
    'margin', 'class', 'tab', 'roman', 'times', 'http', 'www',
    'html', 'background', 'pad', 'bidi', 'color', 'bidi', 'san',
    'rsquo', 'br', 'spin', 'letter', 'spacing', 'space',
    'hyphenate', 'place', 'line', 'placename', 'placetype',
    'border', 'box', 'normal', 'com', 'url', 'link', 'publish',
    'lsdexception', '00', '000', '000000', 'river', 'family',
    'water', 'boat', 'stay', 'helvetica', 'st', 'inherit',
    'width', 'false', 'face', 'non', '51', 'say', 'raft',
    'rapid', 'year', '1', '2', '3', 'rescue', 'true', 'paddle',
    'w', 'lock', 'priority', 'accent', 'semihidden',
    'unhidewhenused', 'table', 'list', 'lock', 'semihidden',
    'amp', 'bt', 'grid', 'layout', 'mode', 'narrative',
    'initial', 'variant', 'weight', 'outline', 'baseline',
    'datum', 'vertical', 'leave', 'image', 'max', 'position',
    'display', '68', 'https', 'right', 'ligature', 'stockticker',
    '08', '11', '06', '12', 'pa', 'source', '11pt',
]  # list is cut off in the source; closed here so the snippet parses
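# The script above is cut off mid-list, so how my_stops is actually used is
# not shown. A plausible next step -- an assumption, not the original code --
# is merging the custom list with spaCy's built-in English stop words, which
# would explain the spacy.lang.en import:
from spacy.lang.en.stop_words import STOP_WORDS

all_stops = STOP_WORDS.union(my_stops)  # hypothetical name; combined stop-word set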
def test_two_paths(self):
    with self.assertRaises(Exception):
        Root(__file__, 0, alt_path='Jetsons/').paths()
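# The test methods above appear without their surrounding harness. A minimal
# sketch of the boilerplate they assume (the class and module names are
# guesses; the assertion style is standard unittest):
import os
import unittest

from filepaths import Root


class TestFilepaths(unittest.TestCase):
    # ... test methods from above go here ...
    pass


if __name__ == '__main__':
    unittest.main()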
import numpy as np
import pandas as pd
from filepaths import Root
from statsmodels.tools import add_constant
from statsmodels.discrete.discrete_model import Logit
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score

paths = Root(__file__, depth=1).paths()
train = paths.data.train.path
holdout = paths.data.holdout.path

train_df = pd.read_pickle(train + 'train.pkl')
train_df = train_df[train_df['age'] != 0]
holdout_df = pd.read_pickle(holdout + 'holdout.pkl')
holdout_df = holdout_df[holdout_df['age'] != 0]

ss = StandardScaler()
columns = ['rellevel', 'difficulty', 'experience', 'F']
names = ['const', 'rellevel', 'difficulty', 'experience']


def get_X_y(df):
    # select features/target, then drop rows with missing values;
    # reassigning avoids the chained-assignment warning that
    # dropna(inplace=True) raises on a column slice
    df = df[columns].dropna()
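    # The original function body is cut off above. Given the 'names' list
    # (with 'const' first) and the add_constant import, a plausible
    # completion -- an assumption -- is:
    y = df['F'].values
    X = add_constant(df[columns[:-1]].values)
    return X, y


# Fitting and checking multicollinearity with the imported statsmodels tools
# would then look something like this (a sketch, not the original script):
X, y = get_X_y(train_df)
model = Logit(y, X).fit()
print(model.summary(xname=names))
vifs = [variance_inflation_factor(X, i) for i in range(X.shape[1])]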
import numpy as np
import pandas as pd
from filepaths import Root
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.feature_extraction.text import TfidfVectorizer
from tokenator import tokenize_and_lemmatize

paths = Root(__file__, 1).paths()  # Root needs the calling file's path, as in the other scripts
clean = paths.data.clean.path

df = pd.read_pickle(clean + '/clean.pkl')
data = df['description']

vectorizer = TfidfVectorizer(ngram_range=(1, 2),
                             max_df=0.55,
                             max_features=100000,
                             token_pattern=None,
                             tokenizer=tokenize_and_lemmatize)
X = vectorizer.fit_transform(data)

range_n_clusters = [2, 3, 4, 5, 6, 8, 10]
scores = []

for n_clusters in range_n_clusters:
    clusterer = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = clusterer.fit_predict(X)
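    # The loop body is cut off above; given the silhouette_score import and
    # the empty scores list, a plausible continuation (an assumption) is:
    silhouette_avg = silhouette_score(X, cluster_labels)
    scores.append(silhouette_avg)
    print(f'n_clusters={n_clusters}: mean silhouette {silhouette_avg:.4f}')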
import sys

import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from filepaths import Root

src = Root(__file__, 1).paths().src.path
sys.path.append(src)

from nlp_scorer import *  # presumably provides binary, split_data, X, y_binary
from tokenator import tokenize_and_lemmatize

ada = AdaBoostClassifier(n_estimators=50)
vectorizer = TfidfVectorizer(ngram_range=(1, 2),
                             max_df=0.55,
                             max_features=100000,
                             token_pattern=None,
                             tokenizer=tokenize_and_lemmatize)

binary(ada, vectorizer)
X_test, X_train, y_test, y_train = split_data(X, y_binary)

# average feature importances across the boosted trees
importances = np.mean([tree.feature_importances_ for tree in ada.estimators_],
                      axis=0)
important_idx = importances.argsort()[-1:-16:-1]  # indices of the top 15
important_val = importances[important_idx]
important_wrd = []
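# The snippet is cut off above. Mapping the top indices back to n-gram
# strings would plausibly continue like this (a sketch, assuming binary()
# fit the vectorizer):
feature_names = vectorizer.get_feature_names_out()
for idx in important_idx:
    important_wrd.append(feature_names[idx])
print(important_wrd)  # the 15 highest-importance n-grams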
import numpy as np
import matplotlib.pyplot as plt
from filepaths import Root

paths = Root(__file__, 0).paths()  # Root needs the calling file's path, as in the other scripts
images = paths.images.path

inertia = [2422.489275321806, 2401.0146509817514, 2376.4660170141583,
           2358.2659294524606, 2342.375437232583, 2331.928273536303,
           2323.2218893501063, 2317.7068108677245, 2307.625672205139,
           2299.8464666019386]

range_n_clusters = [2, 3, 4, 5, 6, 8, 10]

silhouette = [0.021031147139082912, 0.010593812029318254,
              0.011169450306382025, 0.013266511447292354,
              0.01345184252350024, 0.01673858640961575,
              0.017921311429581137]

silhouette_2 = [0.012350957951900533, 0.00781858377484346,
                0.008483841502261651, 0.009262873575787358,
                0.009541112171262496, 0.010146434809065342,
                0.010636419349471717]

x = np.arange(0, 10, 1) + 1  # k = 1..10 for the inertia curve
fig, ax = plt.subplots(figsize=(10, 6))
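# The script is cut off above. A plausible continuation (a sketch; the
# labels and filename are guesses) plots the elbow curve and saves it:
ax.plot(x, inertia, marker='o')
ax.set_xlabel('number of clusters k')
ax.set_ylabel('inertia')
ax.set_title('KMeans elbow plot')
fig.savefig(images + 'elbow.png', dpi=150)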
import sys

import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from filepaths import Root

paths = Root(__file__, 0).paths()
images = paths.images.path
clean = paths.data.clean.path
src = paths.src.path
sys.path.append(src)

from tokenator import tokenize_and_lemmatize

df = pd.read_pickle(clean + '/clean.pkl')
X = df['description']
y = np.array(df['F'])

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    shuffle=True,
                                                    random_state=42)

vectorizer = TfidfVectorizer(ngram_range=(1, 2),
                             max_df=0.55,
                             max_features=100000,
                             token_pattern=None,
                             tokenizer=tokenize_and_lemmatize)  # call cut off in the source; completed to match the other scripts
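# The script is cut off above. Given the BaggingClassifier and joblib
# imports, a plausible continuation (a sketch; the filename is hypothetical)
# vectorizes, fits, scores, and persists the model:
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

bag = BaggingClassifier(n_estimators=50, random_state=42)
bag.fit(X_train_tfidf, y_train)
print('test accuracy:', bag.score(X_test_tfidf, y_test))
joblib.dump(bag, 'bagging_model.joblib')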