Example #1
def get_sms_training_data():
    np.random.seed(42)
    sms = get_data('sms-spam')
    counter = CountVectorizer(tokenizer=casual_tokenize)
    index = [
        'sms{}{}'.format(i, '!' * j)
        for (i, j) in zip(range(len(sms)), sms.spam)
    ]
    bow = pd.DataFrame(counter.fit_transform(raw_documents=sms.text).toarray(),
                       index=index)
    cols, terms = zip(
        *sorted(zip(counter.vocabulary_.values(), counter.vocabulary_.keys())))
    # this one-liner sorts counter.vocabulary_ by its values (the column indices), then unzips the result into the sorted indices and their terms
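    # e.g. if counter.vocabulary_ == {'go': 1, 'a': 0}, the zip/sort/zip round trip yields
    # cols == (0, 1) and terms == ('a', 'go'), i.e. the terms ordered by column index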
    bow.columns = terms

    from sklearn.decomposition import LatentDirichletAllocation as LDiA
    mdl = LDiA(n_components=16, learning_method='batch')
    mdl = mdl.fit(bow)
    pd.set_option('display.width', 75)
    col_names = ["topic" + str(i) for i in range(16)]
    comp = pd.DataFrame(mdl.components_.T, index=terms, columns=col_names)
    comp.round(2).head(3)
    comp.topic3.sort_values(ascending=False)[:10]
    topic_vecs = mdl.transform(bow)
    topic_vecs = pd.DataFrame(topic_vecs, index=index, columns=col_names)
    topic_vecs.round(2).head()
    return topic_vecs, sms.spam
Example #2
def append_predictions():
    (vecs, _) = get_sms_training_data()
    mdl = train_LDA()
    sms = get_data(
        'sms-spam'
    )  # importing this again...maybe the one giant script approach really is better...
    # ...
    # no, it's the children who are wrong
    sms['ldia_predict'] = mdl.predict(vecs)
    return sms
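
# Hedged sketch (not shown anywhere in this listing): append_predictions() relies on a
# train_LDA() helper that is never defined here. Assuming it fits an sklearn
# LinearDiscriminantAnalysis on the LDiA topic vectors and spam labels returned by
# get_sms_training_data(), it could look roughly like this:
def train_LDA():
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
    topic_vecs, spam_labels = get_sms_training_data()
    lda = LDA(n_components=1)
    return lda.fit(topic_vecs, spam_labels.astype(int))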
Example #3
def get_sms_data():
    pd.options.display.width = 120
    sms = get_data('sms-spam')
    index = [
        'sms{}{}'.format(i, '!' * j)
        for (i, j) in zip(range(len(sms)), sms.spam)
    ]
    sms.index = index
    sms.head(6)

    from sklearn.feature_extraction.text import TfidfVectorizer
    from nltk.tokenize.casual import casual_tokenize
    tfidf = TfidfVectorizer(tokenizer=casual_tokenize)
    tfidf_docs = tfidf.fit_transform(raw_documents=sms.text).toarray()
    tfidf_docs = pd.DataFrame(tfidf_docs)
    tfidf_docs = tfidf_docs - tfidf_docs.mean()  # mean centering
    return (tfidf_docs, sms, tfidf)
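
# Hedged usage sketch (an assumption, not part of the original example): the mean-centered
# TF-IDF matrix returned above is typically fed straight into PCA/LSA, e.g.:
from sklearn.decomposition import PCA
tfidf_docs, sms, tfidf = get_sms_data()
pca = PCA(n_components=16)
pca_topic_vectors = pca.fit_transform(tfidf_docs.values)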
Example #4
def embed_wordvecs(w2v=None, df=None, vocab='name', embedder=TSNE, **kwargs):
    w2v = os.path.join(DATA_PATH, 'GoogleNews-vectors-negative300.bin') if w2v is None else w2v
    try:
        model = KeyedVectors.load_word2vec_format(w2v, binary=True) if isinstance(w2v, str) else w2v
    except IOError:
        model = os.path.join(DATA_PATH, w2v)
        model = KeyedVectors.load_word2vec_format(model, binary=True)
    if df is None:
        df = get_data('cities')
    if isinstance(vocab, str) and vocab in df.columns:
        vocab = set([s.replace(' ', '_') for s in df[vocab]] + [s.replace(' ', '_') for s in df.country])

    vocab = [word for word in vocab if word in model.wv]
    vectors = pd.DataFrame([model.wv[word] for word in vocab], index=vocab, columns=range(300))
    tsne = embedder(**kwargs)
    tsne = tsne.fit(vectors)
    return pd.DataFrame(tsne.embedding_, columns=['x', 'y'])
Example #5
def lsa_models(vocabulary='cat dog apple lion NYC love'.lower().split(), docs=11, verbosity=0):
    # vocabulary = 'cat dog apple lion NYC love big small bright'.lower().split()
    if isinstance(docs, int):
        docs = get_data('cats_and_dogs_sorted')[:docs]
    tdm, tfidfdm, tfidfer = docs_to_tdm(docs=docs, vocabulary=vocabulary)
    lsa_bow_model = lsa(tdm)  # centering first, e.g. lsa(tdm - tdm.mean(axis=1)) as PCA does, can make the SVD fail to converge
    lsa_bow_model['vocabulary'] = tdm.index.values
    lsa_bow_model['docs'] = docs
    err = accuracy_study(verbosity=verbosity, **lsa_bow_model)
    lsa_bow_model['err'] = err
    lsa_bow_model['accuracy'] = list(1. - np.array(err))
    
    lsa_tfidf_model = lsa(tdm=tfidfdm)
    lsa_tfidf_model['vocabulary'] = tfidfdm.index.values
    lsa_tfidf_model['docs'] = docs
    err = accuracy_study(verbosity=verbosity, **lsa_tfidf_model)
    lsa_tfidf_model['err'] = err
    lsa_tfidf_model['accuracy'] = list(1. - np.array(err))

    return lsa_bow_model, lsa_tfidf_model
Example #6
# `sa` is not defined in this truncated snippet; it is assumed to be a VADER
# SentimentIntensityAnalyzer:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sa = SentimentIntensityAnalyzer()
print(
    sa.polarity_scores(
        text="Python is very readable and its great for NLP . "))
corpus = [
    "Absolutely perfect ! Love it! :-) :-) :-)",
    "Horrible! Completely useless. :(",
    "It was OK. Some good and some bad things."
]
for doc in corpus:
    scores = sa.polarity_scores(doc)
    print("{:+}: {}".format(scores["compound"], doc))
print(
    "---------------------------------------------------------------------------------------------------"
)
from nlpia.data.loaders import get_data
movies = get_data("hutto_movies")
print(movies.head().round(2))
print(movies.describe().round(2))
import pandas as pd
pd.set_option("display.width", 75)
from nltk.tokenize import casual_tokenize
bags_of_words = []
from collections import Counter
for text in movies.text:
    bags_of_words.append(Counter(casual_tokenize(text)))
df_bows = pd.DataFrame.from_records(bags_of_words)
df_bows = df_bows.fillna(0).astype(int)
print(df_bows.shape)
print(df_bows.head())
print(df_bows.head()[list(bags_of_words[0].keys())])
from sklearn.naive_bayes import MultinomialNB
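
# Hedged continuation (the original snippet is cut off at the import above): presumably the
# next step fits a Naive Bayes classifier on the bag-of-words counts against a binarized
# sentiment target; a minimal sketch under that assumption:
nb = MultinomialNB()
nb = nb.fit(df_bows, movies.sentiment > 0)  # treat positive sentiment scores as the "1" class
print(nb.predict(df_bows.head()))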
Example #7
4        1.47  If you sometimes like to go to the m...
5        1.73  Emerges as something rare, an issue ...
>>> movies.describe().round(2)
       sentiment
count   10605.00
mean        0.00
std         1.92
min        -3.88
25%        -1.77
50%        -0.08
75%         1.83
max         3.94
"""

from nlpia.data.loaders import get_data  # noqa
movies = get_data('hutto_movies')
movies.head().round(2)
#     sentiment                                     text
# id
# 1        2.27  The Rock is destined to be the 21st ...
# 2        3.53  The gorgeously elaborate continuatio...
# 3       -0.60           Effective but too tepid biopic
# 4        1.47  If you sometimes like to go to the m...
# 5        1.73  Emerges as something rare, an issue ...
movies.describe().round(2)
#        sentiment
# count   10605.00
# mean        0.00
# std         1.92
# min        -3.88
# 25%        -1.77
# 50%        -0.08
# 75%         1.83
# max         3.94
Example #8
 ('Norbert_Wiener', 0.41063863039016724),
 ('Charles_Babbage', 0.40797877311706543)]

TODO:
automate the search for synonyms with higher than 60% similarity, walking a shallow graph
"""
import os

from collections import OrderedDict

import pandas as pd
from nlpia.data.loaders import get_data, BIGDATA_PATH
from gensim.models import KeyedVectors


word_vectors = get_data('word2vec')  # not in book

wordvector_path = os.path.join(BIGDATA_PATH, 'GoogleNews-vectors-negative300.bin.gz')    # not in book, reader required to compose this path

if 'word_vectors' not in globals():  # not in book
    WV = word_vectors = get_data('word2vec')
    word_vectors = KeyedVectors.load_word2vec_format(wordvector_path, binary=True)
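
# Hedged sketch for the TODO above (an assumption, not code from the book): walk a shallow
# graph of most_similar() neighbors, keeping only links above 60% similarity.
def walk_synonym_graph(seed, depth=2, threshold=0.6, topn=10):
    seen = {seed}
    frontier = [seed]
    for _ in range(depth):
        next_frontier = []
        for word in frontier:
            for neighbor, similarity in word_vectors.most_similar(word, topn=topn):
                if similarity > threshold and neighbor not in seen:
                    seen.add(neighbor)
                    next_frontier.append(neighbor)
        frontier = next_frontier
    return seen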


###################################################
# Still need to create a class derived from gensim's Word2vec model instead of relying on word_vectors globals

COMPONENT_WORDS = OrderedDict([
    ('placeness', ('geography Geography geographic geographical geographical_location location ' +
                   'locale locations proximity').split()),
    ('peopleness', 'human Humans homo_sapiens peole people individuals humankind people men women'.split()),
Example #9
# 0.5501352
1 - _  # <3>
# 0.4498648
# ----
# <1> Euclidean distance
# <2> Cosine similarity
# <3> Cosine distance
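
# Hedged mini-example (assumes wv is the GoogleNews KeyedVectors loaded earlier, which this
# truncated snippet does not show): the three quantities labelled above, computed with numpy.
import numpy as np
a, b = wv['Illinois'], wv['Illini']
euclidean_distance = np.linalg.norm(a - b)                               # <1>
cosine_similarity = a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b))   # <2>
cosine_distance = 1 - cosine_similarity                                  # <3>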

wv['Illini']
# array([ 0.15625   ,  0.18652344,  0.33203125,  0.55859375,  0.03637695,
#        -0.09375   , -0.05029297,  0.16796875, -0.0625    ,  0.09912109,
#        -0.0291748 ,  0.39257812,  0.05395508,  0.35351562, -0.02270508,

from nlpia.data.loaders import get_data

cities = get_data('cities')
cities.head(1).T
# geonameid                       3039154
# name                          El Tarter
# asciiname                     El Tarter
# alternatenames     Ehl Tarter,Эл Тартер
# latitude                        42.5795
# longitude                       1.65362
# feature_class                         P
# feature_code                        PPL
# country_code                         AD
# cc2                                 NaN
# admin1_code                          02
# admin2_code                         NaN
# admin3_code                         NaN
# admin4_code                         NaN
"""
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('TkAgg')  # noqa
import seaborn  # noqa
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

from nlpia.data.loaders import get_data

pd.options.display.width = 120
pd.options.display.max_columns = 12

corpus = docs = get_data('cats_and_dogs_sorted')[:12]
vocabulary = 'cat dog apple lion nyc love big small bright'.split()
tfidfer = TfidfVectorizer(min_df=1,
                          max_df=.99,
                          stop_words=None,
                          token_pattern=r'(?u)\b\w+\b',
                          vocabulary=vocabulary)
tfidf_dense = pd.DataFrame(tfidfer.fit_transform(docs).todense())
id_words = [(i, w) for (w, i) in tfidfer.vocabulary_.items()]
tfidf_dense.columns = list(zip(*sorted(id_words)))[1]

# turning off IDF weighting and normalization makes the same vectorizer return raw term counts (a plain bag-of-words matrix)
tfidfer.use_idf = False
tfidfer.norm = None
bow_dense = pd.DataFrame(tfidfer.fit_transform(docs).todense())
bow_dense.columns = list(zip(*sorted(id_words)))[1]
bow_dense = bow_dense.astype(int)
Example #11
import pandas as pd
from nlpia.data.loaders import get_data
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize.casual import casual_tokenize
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import train_test_split

sms = get_data('sms-spam')
index = [f"sms{i}{'!'*j}" for (i, j) in zip(range(len(sms)), sms.spam)]

sms = pd.DataFrame(sms.values, columns=sms.columns, index=index)

sms['spam'] = sms.spam.astype(int)

tfidf_model = TfidfVectorizer(tokenizer=casual_tokenize)
tfidf_docs = tfidf_model.fit_transform(raw_documents=sms.text).toarray()

X_train, X_test, y_train, y_test = train_test_split(tfidf_docs,
                                                    sms.spam,
                                                    test_size=0.33)

lda = LDA(n_components=1)
lda.fit(X_train, y_train)

print(lda.score(X_test, y_test))
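
# Optional follow-up sketch (not in the original example): a confusion matrix on the
# held-out split shows where the LDA spam classifier errs.
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, lda.predict(X_test)))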
Example #12
from nlpia.data.loaders import get_data

movies = get_data('hutto_movies')

print(movies.head().round(2))

Example #13
word_vector['love'] = .2 * topic['pet'] - .1 * topic['animal'] + .1 * topic['city']
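# Hedged illustration (hypothetical weights): `topic` and `word_vector` are assumed to be
# plain dicts of floats, as in the book's toy example, so the line above is simple
# arithmetic: with topic = {'pet': .5, 'animal': .3, 'city': .1} it gives
# word_vector['love'] == .2*.5 - .1*.3 + .1*.1 ≈ 0.08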

import pandas as pd
from sklearn.decomposition import PCA

import matplotlib
matplotlib.use('TkAgg')
import seaborn

from matplotlib import pyplot as plt
from nlpia.data.loaders import get_data

df = get_data('pointcloud').sample(1000)
pca = PCA(n_components=2)
df2d = pd.DataFrame(pca.fit_transform(df), columns=list('xy'))
df2d.plot(kind='scatter', x='x', y='y')
plt.show()

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize.casual import casual_tokenize
from nlpia.data.loaders import get_data

sms = get_data('sms-spam')
sms.head(3)
#    spam                                               text
# 0     0  Go until jurong point, crazy.. Available only ...
# 1     0                      Ok lar... Joking wif u oni...
# 2     1  Free entry in 2 a wkly comp to win FA Cup fina...
Example #14
import logging
import sys
import os
import requests
import re

import pandas as pd
from sklearn.manifold import TSNE
from gensim.models import KeyedVectors

from pugnlp.futil import find_files

from nlpia.constants import secrets, DATA_PATH
from nlpia.data.loaders import get_data, read_csv

UTF8_TABLE = get_data('utf8')
UTF8_TO_MULTIASCII = dict(zip(UTF8_TABLE.char, UTF8_TABLE.multiascii))
UTF8_TO_ASCII = dict(zip(UTF8_TABLE.char, UTF8_TABLE.ascii))


def stdout_logging(loglevel=logging.INFO):
    """Setup basic logging

    Args:
      loglevel (int): minimum loglevel for emitting messages
    """
    logformat = "[%(asctime)s] %(levelname)s:%(name)s:%(lineno)d: %(message)s"

    logging.basicConfig(level=loglevel,
                        stream=sys.stdout,
                        format=logformat,
Example #15
def get_sms_data():
    # the data is 4837 sms messages of which 638 are spam
    return get_data('sms-spam')
Example #16
 ('brilliant_mathematician', 0.41744932532310486),
 ('Bertha_von_Suttner', 0.4144267439842224),
 ('Norbert_Wiener', 0.41063863039016724),
 ('Charles_Babbage', 0.40797877311706543)]

TODO:
automate the search for synonyms with higher than 60% similarity, walking a shallow graph
"""
from collections import OrderedDict

import pandas as pd
from nlpia.data.loaders import get_data
# from gensim.models import KeyedVectors

if 'word_vectors' not in globals():
    WV = word_vectors = get_data('word2vec')
    # word_vectors = KeyedVectors.load_word2vec_format(wordvector_path, binary=True)

###################################################
# Still need to create a class derived from gensim's Word2vec model instead of relying on word_vectors globals

COMPONENT_WORDS = OrderedDict([
    ('placeness',
     ('geography Geography geographic geographical geographical_location location '
      + 'locale locations proximity').split()),
    ('peopleness',
     'human Humans homo_sapiens peole people individuals humankind people men women'
     .split()),
    ('animalness',
     'animal mammal carnivore animals Animal animal_welfare dog pet cats ani_mal'
     .split()),
Example #17
import sys
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('TkAgg')  # noqa
import seaborn  # noqa
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

from nlpia.data.loaders import get_data

pd.options.display.width = 120
pd.options.display.max_columns = 16

VOCABULARY = vocabulary='cat dog apple lion NYC love'.lower().split()  # 'cat dog apple lion NYC love big small bright'.lower().split()
DOCS = get_data('cats_and_dogs_sorted')


def docs_to_tdm(docs=DOCS, vocabulary=VOCABULARY, verbosity=0):
    tfidfer = TfidfVectorizer(min_df=1, max_df=.99, stop_words=None, token_pattern=r'(?u)\b\w+\b',
                              vocabulary=vocabulary)
    tfidf_dense = pd.DataFrame(tfidfer.fit_transform(docs).todense())
    id_words = [(i, w) for (w, i) in tfidfer.vocabulary_.items()]
    tfidf_dense.columns = list(zip(*sorted(id_words)))[1]

    tfidfer.use_idf = False
    tfidfer.norm = None
    bow_dense = pd.DataFrame(tfidfer.fit_transform(docs).todense())
    bow_dense.columns = list(zip(*sorted(id_words)))[1]
    bow_dense = bow_dense.astype(int)
    tfidfer.use_idf = True
"""
>>> import pandas as pd
>>> pd.set_option('display.max_columns', 6)
>>> from sklearn.decomposition import PCA
>>> import seaborn
>>> from matplotlib import pyplot as plt
>>> from nlpia.data.loaders import get_data

>> df = get_data('pointcloud').sample(1000)
>> pca = PCA(n_components=2)
>> df2d = pd.DataFrame(pca.fit_transform(df), columns=list('xy'))
>> df2d.plot(kind='scatter', x='x', y='y')
>> plt.show()
"""
import pandas as pd
pd.set_option('display.max_columns', 6)
from sklearn.decomposition import PCA
import matplotlib

# matplotlib.use('TkAgg')  # noqa
import seaborn
from matplotlib import pyplot as plt
from nlpia.data.loaders import get_data

df = get_data('pointcloud').sample(1000)
pca = PCA(n_components=2)
df2d = pd.DataFrame(pca.fit_transform(df), columns=list('xy'))
df2d.plot(kind='scatter', x='x', y='y')
plt.show()
Example #19
def horse_plot():
    df = get_data('pointcloud').sample(1000)
    pca = PCA(n_components=2)
    df2d = pd.DataFrame(pca.fit_transform(df), columns=list('xy'))
    df2d.plot(kind='scatter', x='x', y='y')
    plt.show()
Example #20
# Manually load w2v
# import os
# from nlpia.data.loaders import BIGDATA_PATH
# from gensim.models import KeyedVectors
# path = os.path.join(BIGDATA_PATH, 'GoogleNews-vectors-negative300.bin.gz')
# wv = KeyedVectors.load_word2vec_format(path, binary=True)

# nlpia can now automatically download and load w2v
from nlpia.data.loaders import get_data
from gensim.models import KeyedVectors

wv = get_data('word2vec')
# wv = KeyedVectors.load_word2vec_format(path, binary=True)
len(wv.vocab)
# 3000000
wv.vectors.shape
# (3000000, 300)

import pandas as pd

vocab = pd.Series(wv.vocab)
vocab.iloc[100000:100006]  # different words for new KeyedVector format
# Illington_Fund             Vocab(count:447860, index:2552140)
# Illingworth                 Vocab(count:2905166, index:94834)
# Illingworth_Halifax       Vocab(count:1984281, index:1015719)
# Illini                      Vocab(count:2984391, index:15609)
# IlliniBoard.com           Vocab(count:1481047, index:1518953)
# Illini_Bluffs              Vocab(count:2636947, index:363053)

import numpy as np
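
# Hedged continuation sketch (the original snippet is truncated at the numpy import above):
# the loaded KeyedVectors can be queried directly, e.g. a nearest-neighbor lookup:
wv.most_similar(positive=['cooking', 'potatoes'], topn=5)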
Example #21
peopleness     0.35
animalness     0.17
conceptness   -0.32
femaleness     0.26
dtype: float64


TODO:
automate the search for synonyms with higher than 60% similarity, walking a shallow graph
"""

import pandas as pd
from nlpia.data.loaders import get_data
from gensim.models.keyedvectors import KeyedVectors

wordvector_path = get_data('word2vec')
word_vectors = KeyedVectors.load_word2vec_format(wordvector_path, binary=True)

###################################################
# Still need to create a class derived from gensim's Word2vec model instead of relying on word_vectors global

COMPONENT_WORDS = [
    ('placeness',
     ('geography Geography geographic geographical geographical_location location '
      + 'locale locations proximity').split()),
    ('peopleness',
     'human Humans homo_sapiens peole people individuals humankind people men women'
     .split()),
    ('animalness',
     'animal mammal carnivore animals Animal animal_welfare dog pet cats ani_mal'
     .split()),
Example #22
from nlpia.data.loaders import get_data
from nltk.tokenize import casual_tokenize
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import PorterStemmer

# import nltk
# nltk.download('wordnet')  # noqa
# from nltk.stem.wordnet import WordNetLemmatizer

corpus = get_data('cats_and_dogs')

STOPWORDS = (
    'a an and or the do are with from for of on in by if at to into them' +
    ' I it its it\'s that than me my our you your ? , . !').split()
SYNONYMS = dict(zip(
    'wolv people person women woman man human he  we  her she him his hers'.split(),
    'wolf her    her    her   her   her her   her her her her her her her'.split()))
SYNONYMS.update(dict(zip(
    'ate pat smarter have had isn\'t hasn\'t no  got get become been was were wa be'.split(),
    'eat pet smart   has  has not    not     not has has is     is   is  is   is is'.split())))

tfidfer = TfidfVectorizer(min_df=2, max_df=.6)
docs = [doc.lower() for doc in corpus]
Example #23
def plot_city_wordvectors():
    global wv
    path = get_data('wv')
    wv = KeyedVectors.load_word2vec_format(path,
                                           binary=True) if wv is None else wv
    cities = get_data('cities')
    cities.head(1).T
    # geonameid                       3039154
    # name                          El Tarter
    # asciiname                     El Tarter
    # alternatenames     Ehl Tarter,Эл Тартер
    # latitude                        42.5795
    # longitude                       1.65362
    # feature_class                         P
    # feature_code                        PPL
    # country_code                         AD
    # cc2                                 NaN
    # admin1_code                          02
    # admin2_code                         NaN
    # admin3_code                         NaN
    # admin4_code                         NaN
    # population                         1052
    # elevation                           NaN
    # dem                                1721
    # timezone                 Europe/Andorra
    # modification_date            2012-11-03

    us = cities[(cities.country_code == 'US')
                & (cities.admin1_code.notnull())].copy()
    states = pd.read_csv(
        'http://www.fonz.net/blog/wp-content/uploads/2008/04/states.csv')
    states = dict(zip(states.Abbreviation, states.State))
    us['city'] = us.name.copy()
    us['st'] = us.admin1_code.copy()
    us['state'] = us.st.map(states)
    us[us.columns[-3:]].head()
    #                      city  st    state
    # geonameid
    # 4046255       Bay Minette  AL  Alabama
    # 4046274              Edna  TX    Texas
    # 4046319    Bayou La Batre  AL  Alabama
    # 4046332         Henderson  TX    Texas
    # 4046430           Natalia  TX    Texas

    vocab = np.concatenate([us.city, us.st, us.state])
    vocab = np.array([word for word in vocab if word in wv.vocab])
    vocab[:10]
    # array(['Edna', 'Henderson', 'Natalia', 'Yorktown', 'Brighton', 'Berry',
    #        'Trinity', 'Villas', 'Bessemer', 'Aurora'], dtype='<U15')

    # >>> us_300D = pd.DataFrame([[i] + list(wv[c] + (wv[state] if state in vocab else wv[s]) + wv[s]) for i, c, state, s
    # ...                         in zip(us.index, us.city, us.state, us.st) if c in vocab])
    city_plus_state = []
    for c, state, st in zip(us.city, us.state, us.st):
        if c not in vocab:
            continue
        row = []
        if state in vocab:
            row.extend(wv[c] + wv[state])
        else:
            row.extend(wv[c] + wv[st])
        city_plus_state.append(row)
    us_300D = pd.DataFrame(city_plus_state)

    from sklearn.decomposition import PCA
    pca = PCA(n_components=2)  # <1>
    us_300D = get_data('cities_us_wordvectors')
    us_2D = pca.fit_transform(us_300D.iloc[:, :300])  # <2>
Example #24
                for words in docs]
    docs = [[synonyms.get(w, w) for w in words if w not in stopwords]
            for words in docs]
    docs = [' '.join(w for w in words if w not in stopwords) for words in docs]
    return docs


def tokenize(text, vocabulary, synonyms=SYNONYMS, stopwords=STOPWORDS):
    doc = normalize_corpus_words([text.lower()],
                                 synonyms=synonyms,
                                 stopwords=stopwords)[0]
    stems = [w for w in doc.split() if w in vocabulary]
    return stems


corpus = get_data('cats_and_dogs')
docs = normalize_corpus_words(corpus, stemmer=None)
tfidfer = TfidfVectorizer(min_df=2,
                          max_df=.6,
                          stop_words=None,
                          token_pattern=r'(?u)\b\w+\b')
tfidf_dense = pd.DataFrame(tfidfer.fit_transform(docs).todense())
id_words = [(i, w) for (w, i) in tfidfer.vocabulary_.items()]
tfidf_dense.columns = list(zip(*sorted(id_words)))[1]

pd.options.display.width = 110
pd.options.display.max_columns = 14
pd.options.display.max_colwidth = 32
fun_words = vocabulary = 'cat dog apple lion nyc love big small'
fun_stems = normalize_corpus_words([fun_words])[0].split()
fun_words = fun_words.split()
Example #25
    replies = []
    for i, record in tqdm(df.iterrows()):
        turns = list(split_turns(record.Context))
        statement = turns[-1] if len(turns) else '\n'  # <1>
        statements.append(statement)
        turns = list(split_turns(record.Utterance))
        reply = turns[-1] if len(turns) else '\n'
        replies.append(reply)
    df['statement'] = statements
    df['reply'] = replies
    return df


def format_ubuntu_dialog(df):
    """ Print statements paired with replies, formatted for easy review """
    s = ''
    for i, record in df.iterrows():
        statement = list(split_turns(record.Context))[-1]  # <1>
        reply = list(split_turns(record.Utterance))[-1]  # <2>
        s += 'Statement: {}\n'.format(statement)
        s += 'Reply: {}\n\n'.format(reply)
    return s
    # <1> We need to use `list` to force iteration through the generator
    # <2> The `[-1]` index retrieves the last "turn" in the sequence, discarding everything else


if __name__ == '__main__':
    df = get_data('ubuntu_dialog')
    df = preprocess_ubuntu_corpus(df)
    print(format_ubuntu_dialog(df.head(4)))