def get_sms_training_data():
    np.random.seed(42)
    sms = get_data('sms-spam')
    counter = CountVectorizer(tokenizer=casual_tokenize)
    index = ['sms{}{}'.format(i, '!' * j)
             for (i, j) in zip(range(len(sms)), sms.spam)]
    bow = pd.DataFrame(counter.fit_transform(raw_documents=sms.text).toarray(),
                       index=index)
    # Sort counter.vocabulary_ by its integer column ids (the values), then unzip
    # into the sorted ids and their terms (see _demo_sorted_vocabulary() below).
    cols, terms = zip(*sorted(zip(counter.vocabulary_.values(),
                                  counter.vocabulary_.keys())))
    bow.columns = terms

    from sklearn.decomposition import LatentDirichletAllocation as LDiA
    mdl = LDiA(n_components=16, learning_method='batch')
    mdl = mdl.fit(bow)

    pd.set_option('display.width', 75)
    col_names = ['topic' + str(i) for i in range(16)]
    comp = pd.DataFrame(mdl.components_.T, index=terms, columns=col_names)
    comp.round(2).head(3)
    comp.topic3.sort_values(ascending=False)[:10]

    topic_vecs = mdl.transform(bow)
    topic_vecs = pd.DataFrame(topic_vecs, index=index, columns=col_names)
    topic_vecs.round(2).head()
    return topic_vecs, sms.spam
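# Minimal illustration (not part of the original pipeline) of what the
# sorted/zip one-liner in get_sms_training_data() computes. The toy
# vocabulary_ dict below is an assumption for demonstration only.
def _demo_sorted_vocabulary():
    vocabulary_ = {'dog': 2, 'cat': 0, 'nyc': 1}
    # sort term->id pairs by the integer column id, then unzip ids and terms
    cols, terms = zip(*sorted(zip(vocabulary_.values(), vocabulary_.keys())))
    assert cols == (0, 1, 2)
    assert terms == ('cat', 'nyc', 'dog')
    return cols, terms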
def append_predictions():
    (vecs, _) = get_sms_training_data()
    mdl = train_LDA()
    # reload the raw SMS data so the predictions can be attached to it
    sms = get_data('sms-spam')
    sms['ldia_predict'] = mdl.predict(vecs)
    return sms
def get_sms_data():
    pd.options.display.width = 120
    sms = get_data('sms-spam')
    index = ['sms{}{}'.format(i, '!' * j)
             for (i, j) in zip(range(len(sms)), sms.spam)]
    sms.index = index
    sms.head(6)

    from sklearn.feature_extraction.text import TfidfVectorizer
    from nltk.tokenize.casual import casual_tokenize
    tfidf = TfidfVectorizer(tokenizer=casual_tokenize)
    tfidf_docs = tfidf.fit_transform(raw_documents=sms.text).toarray()
    tfidf_docs = pd.DataFrame(tfidf_docs)
    tfidf_docs = tfidf_docs - tfidf_docs.mean()  # mean-center the TF-IDF vectors
    return (tfidf_docs, sms, tfidf)
def embed_wordvecs(w2v=None, df=None, vocab='name', embedder=TSNE, **kwargs):
    w2v = os.path.join(DATA_PATH, 'GoogleNews-vectors-negative300.bin') if w2v is None else w2v
    try:
        model = KeyedVectors.load_word2vec_format(w2v, binary=True) if isinstance(w2v, str) else w2v
    except IOError:
        # fall back to a path relative to DATA_PATH
        model = os.path.join(DATA_PATH, w2v)
        model = KeyedVectors.load_word2vec_format(model, binary=True)
    if df is None:
        df = get_data('cities')
    if isinstance(vocab, str) and vocab in df.columns:
        # combine the named column with the country column, replacing spaces
        # with underscores to match word2vec's multi-word token format
        vocab = set([s.replace(' ', '_') for s in df[vocab]] +
                    [s.replace(' ', '_') for s in df.country])
    vocab = [word for word in vocab if word in model.wv]
    vectors = pd.DataFrame([model.wv[word] for word in vocab],
                           index=vocab, columns=range(300))
    tsne = embedder(**kwargs)
    tsne = tsne.fit(vectors)
    return pd.DataFrame(tsne.embedding_, columns=['x', 'y'])
def lsa_models(vocabulary='cat dog apple lion NYC love'.lower().split(),
               docs=11, verbosity=0):
    # vocabulary = 'cat dog apple lion NYC love big small bright'.lower().split()
    if isinstance(docs, int):
        docs = get_data('cats_and_dogs_sorted')[:docs]
    tdm, tfidfdm, tfidfer = docs_to_tdm(docs=docs, vocabulary=vocabulary)

    # not lsa(tdm - tdm.mean(axis=1)): SVD fails to converge if you center the matrix the way PCA does
    lsa_bow_model = lsa(tdm)
    lsa_bow_model['vocabulary'] = tdm.index.values
    lsa_bow_model['docs'] = docs
    err = accuracy_study(verbosity=verbosity, **lsa_bow_model)
    lsa_bow_model['err'] = err
    lsa_bow_model['accuracy'] = list(1. - np.array(err))

    lsa_tfidf_model = lsa(tdm=tfidfdm)
    lsa_tfidf_model['vocabulary'] = tfidfdm.index.values
    lsa_tfidf_model['docs'] = docs
    err = accuracy_study(verbosity=verbosity, **lsa_tfidf_model)
    lsa_tfidf_model['err'] = err
    lsa_tfidf_model['accuracy'] = list(1. - np.array(err))
    return lsa_bow_model, lsa_tfidf_model
print(sa.polarity_scores(text="Python is very readable and it's great for NLP."))

corpus = ["Absolutely perfect ! Love it! :-) :-) :-)",
          "Horrible! Completely useless. :(",
          "It was OK. Some good and some bad things."]
for doc in corpus:
    scores = sa.polarity_scores(doc)
    print("{:+}: {}".format(scores["compound"], doc))
print('-' * 99)

from nlpia.data.loaders import get_data
movies = get_data("hutto_movies")
print(movies.head().round(2))
print(movies.describe().round(2))

import pandas as pd
pd.set_option("display.width", 75)
from nltk.tokenize import casual_tokenize
from collections import Counter

# build a bag-of-words Counter for each movie review
bags_of_words = []
for text in movies.text:
    bags_of_words.append(Counter(casual_tokenize(text)))
df_bows = pd.DataFrame.from_records(bags_of_words)
df_bows = df_bows.fillna(0).astype(int)
print(df_bows.shape)
print(df_bows.head())
print(df_bows.head()[list(bags_of_words[0].keys())])

from sklearn.naive_bayes import MultinomialNB
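# A minimal sketch (not in the original listing) of how the MultinomialNB
# classifier imported above might be fit on these bags of words, using the
# sign of the sentiment score as the training label:
nb = MultinomialNB()
nb = nb.fit(df_bows, movies.sentiment > 0)
# rescale the predicted probability of the positive class to roughly the
# -4..4 range of the original sentiment scores
movies['predicted_sentiment'] = nb.predict_proba(df_bows)[:, 1] * 8 - 4
print((movies.predicted_sentiment - movies.sentiment).abs().mean().round(1))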
    4           1.47  If you sometimes like to go to the m...
    5           1.73  Emerges as something rare, an issue ...

    >>> movies.describe().round(2)
           sentiment
    count   10605.00
    mean        0.00
    std         1.92
    min        -3.88
    25%        -1.77
    50%        -0.08
    75%         1.83
    max         3.94
"""
from nlpia.data.loaders import get_data  # noqa

movies = get_data('hutto_movies')
movies.head().round(2)
#     sentiment                                     text
# id
# 1        2.27  The Rock is destined to be the 21st ...
# 2        3.53  The gorgeously elaborate continuatio...
# 3       -0.60           Effective but too tepid biopic
# 4        1.47  If you sometimes like to go to the m...
# 5        1.73  Emerges as something rare, an issue ...
movies.describe().round(2)
#        sentiment
# count   10605.00
# mean        0.00
# std         1.92
# min        -3.88
# 25%        -1.77
 ('Norbert_Wiener', 0.41063863039016724),
 ('Charles_Babbage', 0.40797877311706543)]

TODO: automate the search for synonyms with higher than 60% similarity, walking a shallow graph
"""
import os
from collections import OrderedDict

import pandas as pd
from nlpia.data.loaders import get_data, BIGDATA_PATH
from gensim.models import KeyedVectors

word_vectors = get_data('word2vec')  # not in book
# not in book: the reader is expected to compose this path themselves
wordvector_path = os.path.join(BIGDATA_PATH, 'GoogleNews-vectors-negative300.bin.gz')
if 'word_vectors' not in globals():  # not in book
    WV = word_vectors = get_data('word2vec')
word_vectors = KeyedVectors.load_word2vec_format(wordvector_path, binary=True)

###################################################
# Still need to create a class derived from gensim's Word2vec model instead of relying on word_vectors globals

COMPONENT_WORDS = OrderedDict([
    ('placeness', ('geography Geography geographic geographical geographical_location location '
                   'locale locations proximity').split()),
    ('peopleness', 'human Humans homo_sapiens peole people individuals humankind people men women'.split()),
# 0.5501352
1 - _  # <3>
# 0.4498648
# ----
# <1> Euclidean distance
# <2> Cosine similarity
# <3> Cosine distance

wv['Illini']
# array([ 0.15625   ,  0.18652344,  0.33203125,  0.55859375,  0.03637695,
#        -0.09375   , -0.05029297,  0.16796875, -0.0625    ,  0.09912109,
#        -0.0291748 ,  0.39257812,  0.05395508,  0.35351562, -0.02270508,

from nlpia.data.loaders import get_data
cities = get_data('cities')
cities.head(1).T
# geonameid                        3039154
# name                           El Tarter
# asciiname                      El Tarter
# alternatenames      Ehl Tarter,Эл Тартер
# latitude                         42.5795
# longitude                        1.65362
# feature_class                          P
# feature_code                         PPL
# country_code                          AD
# cc2                                  NaN
# admin1_code                           02
# admin2_code                          NaN
# admin3_code                          NaN
# admin4_code                          NaN
""" import pandas as pd import numpy as np import matplotlib matplotlib.use('TkAgg') # noqa import seaborn # noqa from matplotlib import pyplot as plt from sklearn.feature_extraction.text import TfidfVectorizer from nlpia.data.loaders import get_data pd.options.display.width = 120 pd.options.display.max_columns = 12 corpus = docs = get_data('cats_and_dogs_sorted')[:12] vocabulary = 'cat dog apple lion nyc love big small bright'.split() tfidfer = TfidfVectorizer(min_df=1, max_df=.99, stop_words=None, token_pattern=r'(?u)\b\w+\b', vocabulary=vocabulary) tfidf_dense = pd.DataFrame(tfidfer.fit_transform(docs).todense()) id_words = [(i, w) for (w, i) in tfidfer.vocabulary_.items()] tfidf_dense.columns = list(zip(*sorted(id_words)))[1] tfidfer.use_idf = False tfidfer.norm = None bow_dense = pd.DataFrame(tfidfer.fit_transform(docs).todense()) bow_dense.columns = list(zip(*sorted(id_words)))[1] bow_dense = bow_dense.astype(int)
import pandas as pd
from nlpia.data.loaders import get_data
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize.casual import casual_tokenize
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import train_test_split

sms = get_data('sms-spam')
index = [f"sms{i}{'!'*j}" for (i, j) in zip(range(len(sms)), sms.spam)]
sms = pd.DataFrame(sms.values, columns=sms.columns, index=index)
sms['spam'] = sms.spam.astype(int)

tfidf_model = TfidfVectorizer(tokenizer=casual_tokenize)
tfidf_docs = tfidf_model.fit_transform(raw_documents=sms.text).toarray()

X_train, X_test, y_train, y_test = train_test_split(tfidf_docs, sms.spam, test_size=0.33)
lda = LDA(n_components=1)
lda.fit(X_train, y_train)
print(lda.score(X_test, y_test))
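# A minimal usage sketch (an addition, not in the original script): classify a
# previously unseen message with the fitted TF-IDF vectorizer and LDA model.
# The example message text below is hypothetical.
new_sms = "WINNER! You have been selected for a free prize, reply to claim."
new_vec = tfidf_model.transform([new_sms]).toarray()
print(lda.predict(new_vec))  # 1 => predicted spam, 0 => predicted ham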
from nlpia.data.loaders import get_data

movies = get_data('hutto_movies')
print(movies.head().round(2))
word_vector['love'] = .2 * topic['pet'] - .1 * topic['animal'] + .1 * topic['city']

import pandas as pd
from sklearn.decomposition import PCA
import matplotlib
matplotlib.use('TkAgg')
import seaborn
from matplotlib import pyplot as plt
from nlpia.data.loaders import get_data

df = get_data('pointcloud').sample(1000)
pca = PCA(n_components=2)
df2d = pd.DataFrame(pca.fit_transform(df), columns=list('xy'))
df2d.plot(kind='scatter', x='x', y='y')
plt.show()

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize.casual import casual_tokenize

sms = get_data('sms-spam')
sms.head(3)
#    spam                                               text
# 0     0  Go until jurong point, crazy.. Available only ...
# 1     0                      Ok lar... Joking wif u oni...
# 2     1  Free entry in 2 a wkly comp to win FA Cup fina...
import logging
import sys
import os

import requests
import re
import pandas as pd
from sklearn.manifold import TSNE
from gensim.models import KeyedVectors
from pugnlp.futil import find_files

from nlpia.constants import secrets, DATA_PATH
from nlpia.data.loaders import get_data, read_csv

UTF8_TABLE = get_data('utf8')
UTF8_TO_MULTIASCII = dict(zip(UTF8_TABLE.char, UTF8_TABLE.multiascii))
UTF8_TO_ASCII = dict(zip(UTF8_TABLE.char, UTF8_TABLE.ascii))


def stdout_logging(loglevel=logging.INFO):
    """Setup basic logging

    Args:
        loglevel (int): minimum loglevel for emitting messages
    """
    logformat = "[%(asctime)s] %(levelname)s:%(name)s:%(lineno)d: %(message)s"
    logging.basicConfig(level=loglevel, stream=sys.stdout, format=logformat,
def get_sms_data():
    # the data is 4837 SMS messages, of which 638 are spam
    return get_data('sms-spam')
 ('brilliant_mathematician', 0.41744932532310486),
 ('Bertha_von_Suttner', 0.4144267439842224),
 ('Norbert_Wiener', 0.41063863039016724),
 ('Charles_Babbage', 0.40797877311706543)]

TODO: automate the search for synonyms with higher than 60% similarity, walking a shallow graph
"""
from collections import OrderedDict

import pandas as pd
from nlpia.data.loaders import get_data
# from gensim.models import KeyedVectors

if 'word_vectors' not in globals():
    WV = word_vectors = get_data('word2vec')
    # word_vectors = KeyedVectors.load_word2vec_format(wordvector_path, binary=True)

###################################################
# Still need to create a class derived from gensim's Word2vec model instead of relying on word_vectors globals

COMPONENT_WORDS = OrderedDict([
    ('placeness', ('geography Geography geographic geographical geographical_location location '
                   'locale locations proximity').split()),
    ('peopleness', 'human Humans homo_sapiens peole people individuals humankind people men women'.split()),
    ('animalness', 'animal mammal carnivore animals Animal animal_welfare dog pet cats ani_mal'.split()),
import sys

import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('TkAgg')  # noqa
import seaborn  # noqa
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from nlpia.data.loaders import get_data

pd.options.display.width = 120
pd.options.display.max_columns = 16

VOCABULARY = vocabulary = 'cat dog apple lion NYC love'.lower().split()
# 'cat dog apple lion NYC love big small bright'.lower().split()
DOCS = get_data('cats_and_dogs_sorted')


def docs_to_tdm(docs=DOCS, vocabulary=VOCABULARY, verbosity=0):
    tfidfer = TfidfVectorizer(min_df=1, max_df=.99, stop_words=None,
                              token_pattern=r'(?u)\b\w+\b',
                              vocabulary=vocabulary)
    tfidf_dense = pd.DataFrame(tfidfer.fit_transform(docs).todense())
    id_words = [(i, w) for (w, i) in tfidfer.vocabulary_.items()]
    tfidf_dense.columns = list(zip(*sorted(id_words)))[1]

    tfidfer.use_idf = False
    tfidfer.norm = None
    bow_dense = pd.DataFrame(tfidfer.fit_transform(docs).todense())
    bow_dense.columns = list(zip(*sorted(id_words)))[1]
    bow_dense = bow_dense.astype(int)
    tfidfer.use_idf = True
""" >>> import pandas as pd >>> pd.set_option('display.max_columns', 6) >>> from sklearn.decomposition import PCA >>> import seaborn >>> from matplotlib import pyplot as plt >>> from nlpia.data.loaders import get_data >> df = get_data('pointcloud').sample(1000) >> pca = PCA(n_components=2) >> df2d = pd.DataFrame(pca.fit_transform(df), columns=list('xy')) >> df2d.plot(kind='scatter', x='x', y='y') >> plt.show() """ import pandas as pd pd.set_option('display.max_columns', 6) from sklearn.decomposition import PCA import matplotlib # matplotlib.use('TkAgg') # noqa import seaborn from matplotlib import pyplot as plt from nlpia.data.loaders import get_data df = get_data('pointcloud').sample(1000) pca = PCA(n_components=2) df2d = pd.DataFrame(pca.fit_transform(df), columns=list('xy')) df2d.plot(kind='scatter', x='x', y='y') plt.show()
def horse_plot():
    """Scatter-plot the 3-D 'pointcloud' (horse) dataset projected onto its first 2 principal components"""
    df = get_data('pointcloud').sample(1000)
    pca = PCA(n_components=2)
    df2d = pd.DataFrame(pca.fit_transform(df), columns=list('xy'))
    df2d.plot(kind='scatter', x='x', y='y')
    plt.show()
# Manually load w2v
# import os
# from nlpia.data.loaders import BIGDATA_PATH
# from gensim.models import KeyedVectors
# path = os.path.join(BIGDATA_PATH, 'GoogleNews-vectors-negative300.bin.gz')
# wv = KeyedVectors.load_word2vec_format(path, binary=True)

# nlpia can now automatically download and load w2v
from nlpia.data.loaders import get_data
from gensim.models import KeyedVectors

wv = get_data('word2vec')
# wv = KeyedVectors.load_word2vec_format(path, binary=True)
len(wv.vocab)
# 3000000
wv.vectors.shape
# (3000000, 300)

import pandas as pd
vocab = pd.Series(wv.vocab)
vocab.iloc[100000:100006]
# different words for new KeyedVector format
# Illington_Fund         Vocab(count:447860, index:2552140)
# Illingworth            Vocab(count:2905166, index:94834)
# Illingworth_Halifax    Vocab(count:1984281, index:1015719)
# Illini                 Vocab(count:2984391, index:15609)
# IlliniBoard.com        Vocab(count:1481047, index:1518953)
# Illini_Bluffs          Vocab(count:2636947, index:363053)

import numpy as np
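# A short sketch (not part of the original listing) of how the numpy import
# above might be used to compare two of these word vectors; 'Illinois' and
# 'Illini' are chosen only as an illustration.
np.linalg.norm(wv['Illinois'] - wv['Illini'])  # Euclidean distance
cos_similarity = np.dot(wv['Illinois'], wv['Illini']) / (
    np.linalg.norm(wv['Illinois']) * np.linalg.norm(wv['Illini']))  # cosine similarity
1 - cos_similarity  # cosine distance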
    peopleness     0.35
    animalness     0.17
    conceptness   -0.32
    femaleness     0.26
    dtype: float64

TODO: automate the search for synonyms with higher than 60% similarity, walking a shallow graph
"""
import pandas as pd
from nlpia.data.loaders import get_data
from gensim.models.keyedvectors import KeyedVectors

word_vectors = get_data('word2vec')  # downloads and loads the GoogleNews KeyedVectors
# word_vectors = KeyedVectors.load_word2vec_format(wordvector_path, binary=True)

###################################################
# Still need to create a class derived from gensim's Word2vec model instead of relying on word_vectors global

COMPONENT_WORDS = [
    ('placeness', ('geography Geography geographic geographical geographical_location location '
                   'locale locations proximity').split()),
    ('peopleness', 'human Humans homo_sapiens peole people individuals humankind people men women'.split()),
    ('animalness', 'animal mammal carnivore animals Animal animal_welfare dog pet cats ani_mal'.split()),
from nlpia.data.loaders import get_data
from nltk.tokenize import casual_tokenize
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import PorterStemmer
# import nltk
# nltk.download('wordnet')  # noqa
# from nltk.stem.wordnet import WordNetLemmatizer

corpus = get_data('cats_and_dogs')

STOPWORDS = ('a an and or the do are with from for of on in by if at to into them' +
             ' I it its it\'s that than me my our you your ? , . !').split()
SYNONYMS = dict(zip(
    'wolv people person women woman man human he we her she him his hers'.split(),
    'wolf her her her her her her her her her her her her her'.split()))
SYNONYMS.update(dict(zip(
    'ate pat smarter have had isn\'t hasn\'t no got get become been was were wa be'.split(),
    'eat pet smart has has not not not has has is is is is is is'.split())))

tfidfer = TfidfVectorizer(min_df=2, max_df=.6)
docs = [doc.lower() for doc in corpus]
def plot_city_wordvectors():
    global wv
    if globals().get('wv') is None:
        path = get_data('wv')
        wv = KeyedVectors.load_word2vec_format(path, binary=True)
    cities = get_data('cities')
    cities.head(1).T
    # geonameid                        3039154
    # name                           El Tarter
    # asciiname                      El Tarter
    # alternatenames      Ehl Tarter,Эл Тартер
    # latitude                         42.5795
    # longitude                        1.65362
    # feature_class                          P
    # feature_code                         PPL
    # country_code                          AD
    # cc2                                  NaN
    # admin1_code                           02
    # admin2_code                          NaN
    # admin3_code                          NaN
    # admin4_code                          NaN
    # population                          1052
    # elevation                            NaN
    # dem                                 1721
    # timezone                  Europe/Andorra
    # modification_date             2012-11-03

    us = cities[(cities.country_code == 'US') & (cities.admin1_code.notnull())].copy()
    states = pd.read_csv('http://www.fonz.net/blog/wp-content/uploads/2008/04/states.csv')
    states = dict(zip(states.Abbreviation, states.State))
    us['city'] = us.name.copy()
    us['st'] = us.admin1_code.copy()
    us['state'] = us.st.map(states)
    us[us.columns[-3:]].head()
    #                      city  st    state
    # geonameid
    # 4046255       Bay Minette  AL  Alabama
    # 4046274              Edna  TX    Texas
    # 4046319    Bayou La Batre  AL  Alabama
    # 4046332         Henderson  TX    Texas
    # 4046430           Natalia  TX    Texas

    vocab = np.concatenate([us.city, us.st, us.state])
    vocab = np.array([word for word in vocab if word in wv.vocab])
    vocab[:10]
    # array(['Edna', 'Henderson', 'Natalia', 'Yorktown', 'Brighton', 'Berry',
    #        'Trinity', 'Villas', 'Bessemer', 'Aurora'], dtype='<U15')

    # >>> us_300D = pd.DataFrame([[i] + list(wv[c] + (wv[state] if state in vocab else wv[s]) + wv[s]) for i, c, state, s
    # ...                         in zip(us.index, us.city, us.state, us.st) if c in vocab])
    # augment each city vector with its state vector (falling back to the state abbreviation)
    city_plus_state = []
    for c, state, st in zip(us.city, us.state, us.st):
        if c not in vocab:
            continue
        row = []
        if state in vocab:
            row.extend(wv[c] + wv[state])
        else:
            row.extend(wv[c] + wv[st])
        city_plus_state.append(row)
    us_300D = pd.DataFrame(city_plus_state)

    from sklearn.decomposition import PCA
    pca = PCA(n_components=2)  # <1>
    us_300D = get_data('cities_us_wordvectors')  # precomputed 300-D city vectors
    us_2D = pca.fit_transform(us_300D.iloc[:, :300])  # <2>
            for words in docs]
    docs = [[synonyms.get(w, w) for w in words if w not in stopwords]
            for words in docs]
    docs = [' '.join(w for w in words if w not in stopwords) for words in docs]
    return docs


def tokenize(text, vocabulary, synonyms=SYNONYMS, stopwords=STOPWORDS):
    doc = normalize_corpus_words([text.lower()], synonyms=synonyms, stopwords=stopwords)[0]
    stems = [w for w in doc.split() if w in vocabulary]
    return stems


corpus = get_data('cats_and_dogs')
docs = normalize_corpus_words(corpus, stemmer=None)
tfidfer = TfidfVectorizer(min_df=2, max_df=.6, stop_words=None, token_pattern=r'(?u)\b\w+\b')
tfidf_dense = pd.DataFrame(tfidfer.fit_transform(docs).todense())
id_words = [(i, w) for (w, i) in tfidfer.vocabulary_.items()]
tfidf_dense.columns = list(zip(*sorted(id_words)))[1]

pd.options.display.width = 110
pd.options.display.max_columns = 14
pd.options.display.max_colwidth = 32

fun_words = vocabulary = 'cat dog apple lion nyc love big small'
fun_stems = normalize_corpus_words([fun_words])[0].split()
fun_words = fun_words.split()
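# A minimal usage sketch (an addition, not in the original script) of the
# tokenize() helper defined above, restricted to the fun_stems vocabulary.
# The example sentence is hypothetical, and the exact output depends on the
# stemmer defaults inside normalize_corpus_words().
tokenize('I love my big dog and my small cat!', vocabulary=fun_stems)
# expected to yield something like ['love', 'big', 'dog', 'small', 'cat']
# after synonym substitution and stopword removal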
    replies = []
    for i, record in tqdm(df.iterrows()):
        turns = list(split_turns(record.Context))
        statement = turns[-1] if len(turns) else '\n'  # <1>
        statements.append(statement)
        turns = list(split_turns(record.Utterance))
        reply = turns[-1] if len(turns) else '\n'
        replies.append(reply)
    df['statement'] = statements
    df['reply'] = replies
    return df


def format_ubuntu_dialog(df):
    """ Print statements paired with replies, formatted for easy review """
    s = ''
    for i, record in df.iterrows():
        statement = list(split_turns(record.Context))[-1]  # <1>
        reply = list(split_turns(record.Utterance))[-1]  # <2>
        s += 'Statement: {}\n'.format(statement)
        s += 'Reply: {}\n\n'.format(reply)
    return s
# <1> We need to use `list` to force iteration through the generator
# <2> The `[-1]` index retrieves the last "turn" in the sequence, discarding everything else


if __name__ == '__main__':
    df = get_data('ubuntu_dialog')
    df = preprocess_ubuntu_corpus(df)
    print(format_ubuntu_dialog(df.head(4)))
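# For reference, a hypothetical sketch of the `split_turns` generator used above
# (the real implementation lives elsewhere in this package and may differ).
# It assumes the Ubuntu Dialogue Corpus convention of marking turn boundaries
# with the `__eot__` token and utterance boundaries with `__eou__`.
def split_turns_sketch(dialog, turn_marker='__eot__'):
    for turn in dialog.split(turn_marker):
        turn = turn.replace('__eou__', '\n').strip()
        if turn:
            yield turn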