def cross_validate(result_file, root_dir, folds, read_func, sent_num): if os.path.isfile(result_file): os.system('rm ' + result_file) print 'cross-validation started' for i, fold in enumerate(folds): print 'fold %d/%d started' % (i+1, len(folds)) # read test test = read_func(root_dir, fold, sent_num, local_idx = True, full_info = True) # read train train_files = reduce(lambda a,b: a+b, folds[:i] + folds[i+1:]) train = read_func(root_dir, train_files, sent_num) # open result file fw = open(result_file, 'ab') # train and compute count = 1 for key in test.iterkeys(): lm = NgramModel(3, train[key]) for s in test[key]: if len(s[-1]) > 0: e = lm.entropy(s[-1]) # s[0] is xml id # s[1] is div index # key is global index # s[2] is turn index # s[3] is local index fw.write(','.join(map(str, (s[0], s[1], key, s[2], s[3], e))) + '\n') # print progress sys.stdout.write('\rkey = %d/%d done' % (count, len(test))) sys.stdout.flush() count += 1 print 'fold %d/%d done' % (i+1, len(folds))
def get_similarity(q, idx, tf_idf):
    """Rank candidate documents by language-model similarity to query q.

    Builds a trigram model (Lidstone-smoothed, gamma=0.2) over each distinct
    candidate document and scores the tokenized query's cross-entropy under
    it; lower entropy = better fit, so the best matches come first.

    q      -- query string
    idx    -- unused here (kept for interface compatibility)
    tf_idf -- iterable of (weight, ds_list) pairs; each ds has .text_id,
              .text, and gets .weight set as a side effect

    Returns the distinct ds objects sorted by ascending entropy.
    """
    COUNT_WORDS_IN_TEXT = 100
    # Collect one candidate document per text_id.
    # Fix: the original ranking loop iterated over bare text ids but kept
    # scoring the stale `ds` left over from this collection loop, so only
    # the last document was ever tokenized and ranked (repeatedly).
    docs_by_id = {}
    for info in tf_idf:
        ds2_lst = info[1]
        for ds in ds2_lst:
            docs_by_id[ds.text_id] = ds
    rank_lm = []
    est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    # Tokenize the query once; it is scored against every candidate model.
    q_sequence = wordpunct_tokenize(q)
    for ds in docs_by_id.itervalues():
        sequence = wordpunct_tokenize(ds.text)
        lm = NgramModel(3, sequence, estimator=est)
        ds.weight = lm.entropy(q_sequence)
        # if len(ds.text) > COUNT_WORDS_IN_TEXT:
        rank_lm.append(ds)
    # Ascending entropy: most similar documents first.
    return sorted(rank_lm, key=lambda d: d.weight)
SAMPLE_SENTS_NUM = 100 for fold_idx, fold in enumerate(folds): # takes 37 min # print progress print 'fold %d/%d started' % (fold_idx + 1, len(folds)) # read test sents test_sents = read_firstN_sents(cur, fold, SAMPLE_SENTS_NUM, include_loc=True) # read the train sents train_convIDs = reduce(lambda a, b: a + b, folds[:fold_idx] + folds[fold_idx + 1:]) train_sents = read_firstN_sents(cur, train_convIDs, SAMPLE_SENTS_NUM) # open the file to write the results fw = open(res_file_name, 'ab') # train and compute key_count = 1 for key in test_sents.iterkeys(): lm = NgramModel(3, train_sents[key]) for s in test_sents[key]: e = lm.entropy(s[2]) fw.write(','.join((str(s[0]), str(key), str(s[1]), str(e))) + '\n') # print progress sys.stdout.write('\rkey = %d/%d done' % (key_count, len(test_sents))) sys.stdout.flush() key_count += 1 # print progress print '\nfold %d/%d done\n' % (fold_idx + 1, len(folds))
# $$ # P(X_i|X_{i-1},..,X_{i-n+1}) # $$ # <codecell> from nltk.model.ngram import NgramModel from nltk.probability import WittenBellProbDist, LidstoneProbDist train_words = brown.words()[:-500] test_words = brown.words()[-500:] lm = NgramModel(2, train_words, lambda fd, b: LidstoneProbDist(fd, 0.2)) # <codecell> lm.entropy(test_words) # <markdowncell> # ### Counting # # For example, how many words in a corpus are not in WordNet? # <codecell> from nltk.corpus import wordnet from nltk.probability import ConditionalFreqDist cfd = ConditionalFreqDist( (pos, len(wordnet.synsets(word)) > 0) for word,pos in treebank.tagged_words() )
folds = tmp # remove the existing result file res_file_name = 'results_swbd_nltk_CV.txt' if os.path.isfile(res_file_name): os.system('rm ' + res_file_name) SAMPLE_SENTS_NUM = 100 for fold_idx, fold in enumerate(folds): # takes 37 min # print progress print 'fold %d/%d started' % (fold_idx+1, len(folds)) # read test sents test_sents = read_firstN_sents(cur, fold, SAMPLE_SENTS_NUM, include_loc = True) # read the train sents train_convIDs = reduce(lambda a,b: a+b, folds[:fold_idx] + folds[fold_idx+1:]) train_sents = read_firstN_sents(cur, train_convIDs, SAMPLE_SENTS_NUM) # open the file to write the results fw = open(res_file_name, 'ab') # train and compute key_count = 1 for key in test_sents.iterkeys(): lm = NgramModel(3, train_sents[key]) for s in test_sents[key]: e = lm.entropy(s[2]) fw.write(','.join((str(s[0]), str(key), str(s[1]), str(e))) + '\n') # print progress sys.stdout.write('\rkey = %d/%d done' % (key_count, len(test_sents))) sys.stdout.flush() key_count += 1 # print progress print '\nfold %d/%d done\n' % (fold_idx+1, len(folds))