import sys
import random
import pickle

# NgramModel shipped with NLTK 2.x and was removed in NLTK 3;
# this code assumes the old 2.x API
from nltk.model import NgramModel


def compute_entropy_first100():
    conn = db_conn('map')
    cur = conn.cursor()
    # select all unique observations
    sql = 'SELECT DISTINCT(observation) FROM utterances'
    cur.execute(sql)
    unique_observs = [t[0] for t in cur.fetchall()]
    # for each obsv, compute the entropy of its first 100 sentences (or fewer),
    # using the corresponding sentences from other obsvs to train the language model
    for i, obsv in enumerate(unique_observs):
        # select the maximum utterID
        cur.execute('SELECT MAX(utterID) FROM utterances WHERE observation = %s', [obsv])
        max_uid = cur.fetchone()[0]
        for j in range(1, min(100, max_uid) + 1):
            # select text for training
            sql = 'SELECT tokens FROM utterances WHERE observation != %s AND utterID = %s'
            cur.execute(sql, (obsv, j))
            train_text = [t[0].split() for t in cur.fetchall()]
            # train the model
            lm = NgramModel(3, train_text)
            # compute the entropy and update
            sql = 'SELECT tokens FROM utterances WHERE observation = %s AND utterID = %s'
            cur.execute(sql, (obsv, j))
            test_text = cur.fetchone()[0].split()
            if len(test_text) == 0:
                ent = None
            else:
                ent = lm.entropy(test_text)
            sql = 'UPDATE utterances SET ent = %s WHERE observation = %s AND utterID = %s'
            cur.execute(sql, (ent, obsv, j))
        # print progress
        sys.stdout.write('\r{}/{} conversations done'.format(i + 1, len(unique_observs)))
        sys.stdout.flush()
    conn.commit()
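
# db_conn is a project helper not shown in this file. Given the %s parameter
# placeholders used above, it presumably wraps a MySQL driver such as pymysql;
# a minimal sketch under that assumption (host, user, and password are
# placeholders, not values from the source):
def db_conn(db_name):
    import pymysql
    # open a connection to the named database on localhost
    return pymysql.connect(host='localhost', user='user',
                           password='password', db=db_name)
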
def proc_swbd_shuffle():
    data = read_data_disk("swbd_shuffle_sents100.dat")
    conn = db_conn("swbd")
    cur = conn.cursor()
    # select all convIds
    query = "SELECT DISTINCT convId FROM entropy_shuffle"
    cur.execute(query)
    conv_ids = [t[0] for t in cur.fetchall()]
    # shuffle and split the conversations into folds
    random.shuffle(conv_ids)
    fold_num = 10
    fold_size = int(len(conv_ids) / fold_num)
    conv_ids_folds = []
    for i in range(0, fold_num):
        if i < fold_num - 1:
            conv_ids_folds.append(conv_ids[i * fold_size : (i + 1) * fold_size])
        else:
            # the last fold takes any remainder
            conv_ids_folds.append(conv_ids[i * fold_size :])
    # cross-validation
    results = []
    for i in range(0, fold_num):
        print("fold {} begins".format(i))
        test_ids = conv_ids_folds[i]
        train_ids = []
        for j in range(0, fold_num):
            if j != i:
                train_ids += conv_ids_folds[j]
        # from sentence position 1 to 100
        for sid in range(1, 101):
            train_sents = get_train_sents(data, train_ids, sid)
            lm = NgramModel(3, train_sents)
            for cid in test_ids:
                sent = []
                if sid in data[cid]:
                    sent = data[cid][sid]
                if len(sent) > 0:
                    ent = lm.entropy(sent)
                    results.append((cid, sid, ent))
            sys.stdout.write("\r{}/{} done".format(sid, 100))
            sys.stdout.flush()
        print("fold {} done".format(i))
    # write results to file
    with open("swbd_shuffle_sents100_res.dat", "w") as fw:
        for item in results:
            row = ", ".join(map(str, item)) + "\n"
            fw.write(row)
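
# read_data_disk and get_train_sents are project helpers not shown in this
# file. From the way data[cid][sid] is indexed above, `data` maps a convId to
# a dict from sentence position to a token list; minimal sketches under that
# assumption (treating the .dat file as a pickled dict is a guess):
def read_data_disk(filename):
    # assumed layout: {convId: {sentencePosition: [token, ...]}}
    with open(filename, 'rb') as f:
        return pickle.load(f)


def get_train_sents(data, train_ids, sid):
    # collect the sentence at position sid from every training conversation,
    # skipping conversations that have no (or an empty) sentence there
    sents = []
    for cid in train_ids:
        if sid in data[cid] and len(data[cid][sid]) > 0:
            sents.append(data[cid][sid])
    return sents
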
def train():
    keys, text = read_tokens()
    sents = []
    for t in text:
        if t is None:
            sents.append([])
        else:
            sents.append(t.strip().split())
    lm = NgramModel(3, sents)
    # pickle the trained model to disk
    with open('lm.txt', 'wb') as f:
        pickle.dump(lm, f)
    return lm
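
# usage sketch for train(): reload the pickled model and score a tokenized
# sentence; demo_entropy and the example sentence are illustrative additions,
# not part of the original code
def demo_entropy():
    with open('lm.txt', 'rb') as f:
        lm = pickle.load(f)
    sent = 'okay go straight down past the old mill'.split()
    print('entropy: {}'.format(lm.entropy(sent)))
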
# read sentences from Switchboard
def read_swbd():
    conn = db_conn('swbd')
    cur = conn.cursor()
    query = 'SELECT rawWord FROM entropy'
    cur.execute(query)
    sents = [t[0].strip().split() for t in cur.fetchall()]
    return sents


# read train set from file
def read_train_file(filename):
    """
    filename: the text file that contains sentences for training,
    one sentence per line
    """
    with open(filename, 'r') as fr:
        sents = []
        for line in fr:
            s = line.strip().split()
            sents.append(s)
    return sents


# main
if __name__ == '__main__':
    sents = read_train_file('train.txt')
    # train the model
    lm = NgramModel(3, sents)
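    # e.g. (illustrative, not from the original code): score a held-out
    # sentence with the freshly trained model; the sentence below is made up
    test_sent = 'okay so you want to head straight up towards the lake'.split()
    print('test sentence entropy: {}'.format(lm.entropy(test_sent)))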