from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction import DictVectorizer

# new helpers:
from shared import dataset_local_path, bootstrap_accuracy, simple_boxplot, TODO

# stdlib:
from dataclasses import dataclass
import json
from typing import Dict, Any, List

#%% load up the data
examples = []
ys = []

with open(dataset_local_path("poetry_id.jsonl")) as fp:
    for line in fp:
        info = json.loads(line)
        # Note: the data contains a whole bunch of extra stuff; we just want the numeric features for now.
        keep = info["features"]
        # whether or not it's poetry is our label.
        ys.append(info["poetry"])
        # hold onto this single dictionary.
        examples.append(keep)

## CONVERT TO MATRIX:
feature_numbering = DictVectorizer(sort=True)
X = feature_numbering.fit_transform(examples)
print("Features as {} matrix.".format(X.shape))
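# (Assumed continuation, not the original code.) A minimal sketch of a next
# step: split the matrix, fit two of the imported linear models, and compare
# bootstrapped validation accuracy with the shared helpers. The exact
# signatures of bootstrap_accuracy/simple_boxplot are assumed here.
from sklearn.model_selection import train_test_split

X_tv, X_test, y_tv, y_test = train_test_split(X, ys, test_size=0.25, random_state=12345)
X_train, X_vali, y_train, y_vali = train_test_split(X_tv, y_tv, test_size=0.25, random_state=12345)

models = {
    "Perceptron": Perceptron(),
    "LogisticRegression": LogisticRegression(max_iter=1000),
}
accuracies = {}
for name, clf in models.items():
    clf.fit(X_train, y_train)
    accuracies[name] = bootstrap_accuracy(clf, X_vali, y_vali)

simple_boxplot(accuracies, ylabel="Validation Accuracy")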
import random
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import typing as T
import re
from dataclasses import dataclass

from shared import bootstrap_accuracy, bootstrap_auc, dataset_local_path, simple_boxplot

RAND = 123456
random.seed(RAND)

# Using 'pandas' to load data now:
df: pd.DataFrame = pd.read_json(dataset_local_path("lit-wiki-2020.jsonl.gz"), lines=True)

# Regular expressions to grab parts of the text:
WORDS = re.compile(r"(\w+)")
NUMBERS = re.compile(r"(\d+)")


def extract_features(row):
    """
    Given the title and body of a Wikipedia article,
    extract features that might be of use to the 'is literary' task.

    Return named features in a dictionary.
    """
    title = row["title"].lower()
    simple_boxplot,
)

# stdlib:
from dataclasses import dataclass, field
import json, gzip
from typing import Dict, List

#%% load up the data

# Try 'POETRY'
dataset = "WIKI"
examples: List[str] = []
ys: List[bool] = []

if dataset == "WIKI":
    with gzip.open(dataset_local_path("lit-wiki-2020.jsonl.gz"), "rt") as fp:
        for line in fp:
            info = json.loads(line)
            # Note: the data contains a whole bunch of extra stuff; here we just want the raw text body.
            keep = info["body"]
            # whether or not it's 'literary' is our label.
            ys.append(info["truth_value"])
            # hold onto this single string.
            examples.append(keep)
else:
    # take only one per book!
    by_book = {}
    with open(dataset_local_path("poetry_id.jsonl")) as fp:
        for line in fp:
            info = json.loads(line)
            # dictionary keeps this key unique:
from shared import dataset_local_path

import pandas as pd
import numpy as np
from typing import Tuple
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
import os, gzip
from tqdm import tqdm

clickbait = pd.read_csv(dataset_local_path("clickbait.csv.gz"))

# Load pretrained GloVe word vectors (50-dimensional, ~400k vocabulary):
glove = {}
with gzip.open(os.environ["HOME"] + "/data/glove.6B.50d.txt.gz", "rt") as vecf:
    for line in tqdm(vecf, total=400000):
        # each line is a word followed by 50 space-separated floats
        split = line.index(" ")
        word = line[:split]
        vector = np.fromstring(line[split + 1 :], dtype=np.float32, sep=" ")
        glove[word] = vector
        if word == "the":
            print(word, vector)

print(clickbait.head())

# skip citation row:
df = clickbait.iloc[1:]

RANDOM_SEED = 12345
tv_f, test_f = train_test_split(df, test_size=0.25, random_state=RANDOM_SEED)
train_f, vali_f = train_test_split(tv_f, test_size=0.25, random_state=RANDOM_SEED)
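# (Assumed continuation, not the original code.) A minimal sketch of turning
# each headline into the mean of its GloVe word vectors and fitting the
# imported SGDClassifier. The column names 'headline' and 'label' are
# hypothetical placeholders for whatever clickbait.csv.gz actually uses.
def glove_mean(text: str) -> np.ndarray:
    vectors = [glove[w] for w in text.lower().split() if w in glove]
    if not vectors:
        return np.zeros(50, dtype=np.float32)
    return np.mean(vectors, axis=0)

X_train = np.stack([glove_mean(t) for t in train_f["headline"]])
y_train = train_f["label"].values

clf = SGDClassifier(random_state=RANDOM_SEED)
clf.fit(X_train, y_train)
print("train accuracy:", clf.score(X_train, y_train))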
#%%
import pandas as pd
import numpy as np
import typing as T
import re
from tqdm import tqdm
from dataclasses import dataclass

from shared import bootstrap_auc, dataset_local_path, simple_boxplot

df: pd.DataFrame = pd.read_json(
    dataset_local_path("lit-wiki-2020.jsonl.gz"), lines=True
)

# Debug loading:
# df.head()

# Regular expressions to grab parts of the text:
WORDS = re.compile(r"(\w+)")
NUMBERS = re.compile(r"(\d+)")


def extract_features(row):
    """
    Given the title and body of a Wikipedia article,
    extract features that might be of use to the 'is literary' task.

    Return named features in a dictionary.
    """
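    # (Assumed continuation; the original feature list is truncated here.)
    # A minimal sketch of features one might compute from the title/body with
    # the WORDS and NUMBERS patterns above; the real feature set likely differs.
    title = row["title"].lower()
    body = row["body"]
    words = WORDS.findall(body)
    numbers = NUMBERS.findall(body)
    return {
        "title_length": len(title),
        "body_word_count": len(words),
        "body_number_count": len(numbers),
        "title_mentions_novel": float("novel" in title),
    }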
""" Problem 1: We have a copy of Wikipedia (I spared you the other 6 million pages). It is separate from our labels we collected. """ @dataclass class JustWikiPage: title: str wiki_id: str body: str # Load our pages into this pages list. pages: List[JustWikiPage] = [] with gzip.open(dataset_local_path("tiny-wiki.jsonl.gz"), "rt") as fp: for line in fp: entry = json.loads(line) pages.append(JustWikiPage(**entry)) @dataclass class JustWikiLabel: wiki_id: str is_literary: bool # Load our judgments/labels/truths/ys into this labels list: labels: List[JustWikiLabel] = [] with open(dataset_local_path("tiny-wiki-labels.jsonl")) as fp: for line in fp:
    dataset_local_path,
    bootstrap_r2,
    simple_boxplot,
)
import random
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# start off by seeding random number generators:
RANDOM_SEED = 12345
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Load the AirQualityUCI Dataset:
df = pd.read_csv(dataset_local_path("AirQualityUCI.csv"), sep=";", decimal=",")
print(df.shape)

# drop empty columns:
df = df.dropna(how="all", axis="columns")
print(df.shape)

PREDICT_COL = "CO(GT)"

# select only the rows where our 'y' is present:
# (-200 is this dataset's 'missing value' marker)
df = df[df[PREDICT_COL] > -200.0]
print(df.shape)

# delete Date/Time columns
df.pop("Date")
df.pop("Time")
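# (Assumed continuation, not the original code.) A minimal sketch of a
# plausible next step: split off the target column, impute the remaining
# -200.0 'missing' sentinel values, and standardize the sensor readings.
from sklearn.model_selection import train_test_split

y = df.pop(PREDICT_COL).values
X = df.replace(-200.0, np.nan).values  # -200.0 marks missing readings in this dataset

X_tv, X_test, y_tv, y_test = train_test_split(
    X, y, test_size=0.25, random_state=RANDOM_SEED
)
imputer = SimpleImputer(strategy="mean")
scaler = StandardScaler()
X_tv = scaler.fit_transform(imputer.fit_transform(X_tv))
X_test = scaler.transform(imputer.transform(X_test))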
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
import numpy as np
from scipy.spatial.distance import euclidean
from typing import List, Tuple
from tqdm import tqdm
import csv

from shared import dataset_local_path

ys = []
examples = []

with open(dataset_local_path("AirQualityUCI.csv")) as fp:
    # This is a CSV file where the separators are not commas!
    rows = csv.reader(fp, delimiter=";")
    header = next(rows)
    for row in rows:
        datapoint = {}
        # e.g., a raw row looks like:
        # {'Date': '10/03/2004', 'Time': '18.00.00',
        #  'CO(GT)': '2,6', 'PT08.S1(CO)': '1360', 'NMHC(GT)': '150', 'C6H6(GT)': '11,9',
        #  'PT08.S2(NMHC)': '1046', 'NOx(GT)': '166', 'PT08.S3(NOx)': '1056',
        #  'NO2(GT)': '113', 'PT08.S4(NO2)': '1692', 'PT08.S5(O3)': '1268',
        #  'T': '13,6', 'RH': '48,9', 'AH': '0,7578', '': ''}
        date = None
        time = None
        for (column_name, column_value) in zip(header, row):
            if column_value == "" or column_name == "":
                continue
            elif column_name == "Date":
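                # (Assumed continuation; the original loop body is truncated here.)
                date = column_value
            elif column_name == "Time":
                time = column_value
            else:
                # numeric columns use ',' as the decimal separator, e.g. '2,6' -> 2.6
                datapoint[column_name] = float(column_value.replace(",", "."))
        # Assumed target/feature split: predict CO(GT) from the other sensor
        # readings, skipping rows where the target is the -200 'missing' marker.
        target = datapoint.pop("CO(GT)", None)
        if target is None or target <= -200.0:
            continue
        ys.append(target)
        examples.append(datapoint)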
import numpy as np
from shared import (
    dataset_local_path,
)
from typing import Tuple, Dict

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline

# fixed seed so the train/validation/test splits are reproducible:
RANDOM_SEED = 12345

df: pd.DataFrame = pd.read_json(dataset_local_path("poetry_id.jsonl"), lines=True)

# flatten the nested 'features' dicts into columns, keeping the label and text:
features = pd.json_normalize(df.features)
features = features.join([df.poetry, df.words])

tv_f, test_f = train_test_split(features, test_size=0.25, random_state=RANDOM_SEED)
train_f, vali_f = train_test_split(tv_f, test_size=0.25, random_state=RANDOM_SEED)

textual = TfidfVectorizer(max_df=0.75, min_df=2, dtype=np.float32)
numeric = make_pipeline(DictVectorizer(sparse=False), StandardScaler())
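# (Assumed continuation, not the original code.) A minimal sketch of how these
# two vectorizers might be applied: fit on the training rows only, then reuse
# the fitted objects on validation data. This assumes the 'words' column holds
# the raw text of each example.
def prepare(frame: pd.DataFrame):
    y = frame["poetry"].values
    text = frame["words"]
    numeric_dicts = frame.drop(columns=["poetry", "words"]).to_dict("records")
    return y, text, numeric_dicts

y_train, text_train, num_train = prepare(train_f)
y_vali, text_vali, num_vali = prepare(vali_f)

X_train_text = textual.fit_transform(text_train)
X_vali_text = textual.transform(text_vali)

X_train_num = numeric.fit_transform(num_train)
X_vali_num = numeric.transform(num_vali)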