Example no. 1
def compute_entropy_first100():
    conn = db_conn('map')
    cur = conn.cursor()
    # select all unique observations
    sql = 'SELECT DISTINCT(observation) FROM utterances'
    cur.execute(sql)
    unique_observs = [t[0] for t in cur.fetchall()]
    # for each obsv, compute the entropy of its first 100 sentences (or fewer),
    # using the corresponding sentences from the other obsvs to train the language model
    for i, obsv in enumerate(unique_observs):
        # select maximum utterID
        cur.execute('SELECT MAX(utterID) FROM utterances WHERE observation = %s', [obsv])
        max_uid = cur.fetchone()[0]
        # iterate over the first 100 utterances (assumes utterIDs are 1-based and contiguous)
        for j in range(1, min(100, max_uid)+1):
            # select text for train
            sql = 'SELECT tokens FROM utterances WHERE observation != %s AND utterID = %s'
            cur.execute(sql, (obsv, j))
            train_text = [t[0].split() for t in cur.fetchall()]
            # train the model
            lm = NgramModel(3, train_text)
            # compute the entropy and update
            sql = 'SELECT tokens FROM utterances WHERE observation = %s AND utterID = %s'
            cur.execute(sql, (obsv, j))
            test_text = cur.fetchone()[0].split()
            if len(test_text) == 0:
                ent = None
            else:
                ent = lm.entropy(test_text)
            sql = 'UPDATE utterances SET ent = %s WHERE observation = %s AND utterID = %s'
            cur.execute(sql, (ent, obsv, j))
        # print progress
        sys.stdout.write('\r{}/{} conversations done'.format(i+1, len(unique_observs)))
        sys.stdout.flush()
        conn.commit()
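
The db_conn helper used above is defined elsewhere in the codebase. A minimal sketch of what it might look like, assuming a MySQL backend accessed through MySQLdb (the %s placeholder style is consistent with both MySQLdb and psycopg2); the connection parameters below are placeholders, not values from the original code:

import MySQLdb

def db_conn(db_name):
    # hypothetical helper: connect to a local MySQL server and return the connection
    return MySQLdb.connect(host='localhost', user='user', passwd='password', db=db_name)
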
Example no. 2
def proc_swbd_shuffle():
    data = read_data_disk("swbd_shuffle_sents100.dat")
    conn = db_conn("swbd")
    cur = conn.cursor()
    # select all convIds
    query = "select distinct convId from entropy_shuffle"
    cur.execute(query)
    conv_ids = [t[0] for t in cur.fetchall()]
    # shuffle and make folds
    random.shuffle(conv_ids)
    fold_num = 10
    fold_size = len(conv_ids) // fold_num
    conv_ids_folds = []
    for i in range(0, fold_num):
        if i < fold_num - 1:
            conv_ids_folds.append(conv_ids[i * fold_size : (i + 1) * fold_size])
        else:
            conv_ids_folds.append(conv_ids[i * fold_size :])
    # cross validation
    results = []
    for i in range(0, fold_num):
        print("fold {} begins".format(i))
        test_ids = conv_ids_folds[i]
        train_ids = []
        for j in range(0, fold_num):
            if j != i:
                train_ids += conv_ids_folds[j]
        # from sentence position 1 to 100
        for sid in range(1, 101):
            train_sents = get_train_sents(data, train_ids, sid)
            lm = NgramModel(3, train_sents)
            for cid in test_ids:
                sent = []
                if sid in data[cid]:
                    sent = data[cid][sid]
                if len(sent) > 0:
                    ent = lm.entropy(sent)
                    results.append((cid, sid, ent))
            sys.stdout.write("\r{}/{} done".format(sid, 100))
            sys.stdout.flush()
        print("fold {} done".format(i))
    # write results to file
    with open("swbd_shuffle_sents100_res.dat", "w") as fw:
        for item in results:
            row = ", ".join(map(str, item)) + "\n"
            fw.write(row)
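
The helpers read_data_disk and get_train_sents are defined elsewhere. Judging from the data[cid][sid] lookups above, get_train_sents plausibly collects the sentence at position sid from every training conversation; a minimal sketch under that assumption (not the original implementation):

def get_train_sents(data, train_ids, sid):
    # hypothetical reconstruction: gather the non-empty sentence at position sid
    # from each training conversation
    sents = []
    for cid in train_ids:
        if cid in data and sid in data[cid]:
            sent = data[cid][sid]
            if len(sent) > 0:
                sents.append(sent)
    return sents
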
Example no. 3
def train():
    keys, text = read_tokens()
    sents = []
    for t in text:
        if t is None:
            sents.append([])
        else:
            sents.append(t.strip().split())
    lm = NgramModel(3, sents)
    with open('lm.txt', 'wb') as fw:
        pickle.dump(lm, fw)
    return lm
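
A short usage sketch for reading the pickled model back; lm.txt is the file written above, and entropy is the method used in the other examples (the test sentence here is an arbitrary illustration):

import pickle

with open('lm.txt', 'rb') as fr:
    lm = pickle.load(fr)
ent = lm.entropy('okay so what do we do next'.split())
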
Example no. 4
def compute_entropy_first100():
    conn = db_conn('map')
    cur = conn.cursor()
    # select all unique observations
    sql = 'SELECT DISTINCT(observation) FROM utterances'
    cur.execute(sql)
    unique_observs = [t[0] for t in cur.fetchall()]
    # for each obsv, compute the entropy of its first 100 sentences (or fewer),
    # using the corresponding sentences from the other obsvs to train the language model
    for i, obsv in enumerate(unique_observs):
        # select maximum utterID
        cur.execute(
            'SELECT MAX(utterID) FROM utterances WHERE observation = %s',
            [obsv])
        max_uid = cur.fetchone()[0]
        # iterate over the first 100 utterances (assumes utterIDs are 1-based and contiguous)
        for j in range(1, min(100, max_uid) + 1):
            # select text for train
            sql = 'SELECT tokens FROM utterances WHERE observation != %s AND utterID = %s'
            cur.execute(sql, (obsv, j))
            train_text = [t[0].split() for t in cur.fetchall()]
            # train the model
            lm = NgramModel(3, train_text)
            # compute the entropy and update
            sql = 'SELECT tokens FROM utterances WHERE observation = %s AND utterID = %s'
            cur.execute(sql, (obsv, j))
            test_text = cur.fetchone()[0].split()
            if len(test_text) == 0:
                ent = None
            else:
                ent = lm.entropy(test_text)
            sql = 'UPDATE utterances SET ent = %s WHERE observation = %s AND utterID = %s'
            cur.execute(sql, (ent, obsv, j))
        # print progress
        sys.stdout.write('\r{}/{} conversations done'.format(
            i + 1, len(unique_observs)))
        sys.stdout.flush()
        conn.commit()
Example no. 5

# read sentences from Switchboard
def read_swbd():
    conn = db_conn('swbd')
    cur = conn.cursor()
    query = 'SELECT rawWord FROM entropy'
    cur.execute(query)
    sents = [t[0].strip().split() for t in cur.fetchall()]
    return sents


# read train set from file
def read_train_file(filename):
    """
    filename: the text file that contains the sentences for training, one sentence per line
    """
    with open(filename, 'r') as fr:
        sents = []
        for line in fr:
            s = line.strip().split()
            sents.append(s)
        return sents


# main
if __name__ == '__main__':
    sents = read_train_file('train.txt')
    # train the model
    lm = NgramModel(3, sents)
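    # hypothetical follow-up (not in the original snippet): score held-out sentences
    # with the trained model, mirroring the lm.entropy calls in the other examples;
    # 'test.txt' is an assumed file in the same one-sentence-per-line format
    test_sents = read_train_file('test.txt')
    for s in test_sents:
        if len(s) > 0:
            print(lm.entropy(s))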