# Embed Word2Vec vectors for city and country names in 2D with t-SNE (or another embedder).
import os

import pandas as pd
from gensim.models import KeyedVectors
from sklearn.manifold import TSNE

from nlpia.constants import DATA_PATH
from nlpia.data import get_data


def embed_wordvecs(w2v=None, df=None, vocab='name', embedder=TSNE, **kwargs):
    """Project word vectors for the words in `vocab` down to 2D with `embedder`."""
    w2v = os.path.join(DATA_PATH, 'GoogleNews-vectors-negative300.bin') if w2v is None else w2v
    try:
        model = KeyedVectors.load_word2vec_format(w2v, binary=True) if isinstance(w2v, str) else w2v
    except IOError:
        # Fall back to interpreting w2v as a filename relative to DATA_PATH
        model = os.path.join(DATA_PATH, w2v)
        model = KeyedVectors.load_word2vec_format(model, binary=True)
    wv = model.wv if hasattr(model, 'wv') else model  # accept a Word2Vec model or bare KeyedVectors
    if df is None:
        df = get_data('cities')
    if isinstance(vocab, str) and vocab in df.columns:
        # GoogleNews tokens join multiword names with underscores, e.g. "New_York"
        vocab = set([s.replace(' ', '_') for s in df[vocab]] +
                    [s.replace(' ', '_') for s in df.country])
    vocab = [word for word in vocab if word in wv]
    vectors = pd.DataFrame([wv[word] for word in vocab], index=vocab, columns=range(300))
    tsne = embedder(**kwargs)
    tsne = tsne.fit(vectors)
    return pd.DataFrame(tsne.embedding_, columns=['x', 'y'])
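
# Example usage: a minimal sketch, assuming GoogleNews-vectors-negative300.bin has already
# been downloaded into DATA_PATH and the 'cities' dataset is available locally. The TSNE
# keyword arguments are illustrative, not required by embed_wordvecs.
if __name__ == '__main__':
    from matplotlib import pyplot as plt

    embedding = embed_wordvecs(n_components=2, random_state=0)
    embedding.plot(kind='scatter', x='x', y='y')
    plt.show()
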
# Classify the sms-spam messages with LDA on raw TF-IDF vectors, then compress the
# TF-IDF vectors to 256 "topic" dimensions with PCA.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from nltk.tokenize.casual import casual_tokenize
from matplotlib import pyplot as plt
import seaborn  # noqa
from nlpia.data import get_data
from nltk.sentiment import SentimentIntensityAnalyzer
from nlpia.models import LinearRegressor
from sklearn.linear_model import SGDRegressor

sms = get_data('sms-spam')
tfidf = TfidfVectorizer(tokenizer=casual_tokenize)
tfidf_docs = tfidf.fit_transform(raw_documents=sms.text).toarray()
# Label the columns with the vocabulary terms, sorted by their column index in the TF-IDF matrix
tfidf_docs = pd.DataFrame(
    tfidf_docs,
    columns=list(zip(*sorted([(v, k) for (k, v) in tfidf.vocabulary_.items()])))[1])

# TFIDF
tfidf_lda = LDA(n_components=1)
tfidf_lda.fit(tfidf_docs, sms.spam)
# UserWarning: Variables are collinear.
#   warnings.warn("Variables are collinear.")
sms['tfidf_lda_spam_prob'] = tfidf_lda.predict_proba(tfidf_docs)[:, 1]
# Almost all 00000...0001 or .9999999...

# TFIDF->PCA
pca = PCA(n_components=256)
pca = pca.fit(tfidf_docs)
pca_topic_vectors = pca.transform(tfidf_docs)
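
# A minimal follow-up sketch: report how much TF-IDF variance the 256 principal components
# retain, and fit the same kind of LDA classifier on the compressed topic vectors for comparison.
print('variance retained by 256 components:', pca.explained_variance_ratio_.sum())
pca_lda = LDA(n_components=1)
pca_lda.fit(pca_topic_vectors, sms.spam)
print('training accuracy on PCA topic vectors:', pca_lda.score(pca_topic_vectors, sms.spam))
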
# Hand-crafted topic vectors: each word's meaning expressed as a weighted combination of
# three topics (pet, animal, city). `topic` is assumed to hold the three topic scores and
# `word_vector` collects the word-topic weights (reused in the similarity sketch at the
# end of this listing).
word_vector = {}
word_vector['cat'] = .3 * topic['pet'] + .1 * topic['animal'] + 0 * topic['city']
word_vector['dog'] = .3 * topic['pet'] + .1 * topic['animal'] - .1 * topic['city']
word_vector['apple'] = 0 * topic['pet'] - .1 * topic['animal'] + .2 * topic['city']
word_vector['lion'] = 0 * topic['pet'] + .5 * topic['animal'] - .1 * topic['city']
word_vector['NYC'] = -.2 * topic['pet'] + .1 * topic['animal'] + .5 * topic['city']
word_vector['love'] = .2 * topic['pet'] - .1 * topic['animal'] + .1 * topic['city']

# Project the 3D point cloud down to 2D with PCA and scatter-plot it.
import pandas as pd
from sklearn.decomposition import PCA
import seaborn  # noqa
from matplotlib import pyplot as plt
from nlpia.data import get_data

df = get_data('pointcloud').sample(1000)
pca = PCA(n_components=2)
df2d = pd.DataFrame(pca.fit_transform(df), columns=list('xy'))
df2d.plot(kind='scatter', x='x', y='y')
plt.show()

# Load the SMS spam dataset and peek at the first few labeled messages.
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize.casual import casual_tokenize
from nlpia.data import get_data

sms = get_data('sms-spam')
sms.head(3)
#    spam                                               text
# 0     0  Go until jurong point, crazy.. Available only ...
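
# A minimal sketch, assuming the hand-crafted weights from the top of this listing: collect
# them into a DataFrame and compare words by cosine similarity in topic space. Words with
# similar meanings ('cat' and 'dog') come out close; unrelated words ('cat' and 'NYC') do not.
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

topic_weights = pd.DataFrame(
    [[.3, .1, 0], [.3, .1, -.1], [0, -.1, .2], [0, .5, -.1], [-.2, .1, .5], [.2, -.1, .1]],
    index='cat dog apple lion NYC love'.split(),
    columns='pet animal city'.split())
word_similarity = pd.DataFrame(cosine_similarity(topic_weights),
                               index=topic_weights.index, columns=topic_weights.index)
word_similarity.round(2)
# cosine similarity of cat vs. dog is about .95; cat vs. NYC is about -.29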