import os
import sys

from nltk.model.ngram import NgramModel


def cross_validate(result_file, root_dir, folds, read_func, sent_num):
    # remove any existing result file so new results are not appended to stale output
    if os.path.isfile(result_file):
        os.remove(result_file)
    print 'cross-validation started'

    for i, fold in enumerate(folds):
        print 'fold %d/%d started' % (i+1, len(folds))
        # read test
        test = read_func(root_dir, fold, sent_num, local_idx = True, full_info = True)
        # read train
        train_files = reduce(lambda a,b: a+b, folds[:i] + folds[i+1:])
        train = read_func(root_dir, train_files, sent_num)
        # open result file
        fw = open(result_file, 'ab')
        # train and compute
        count = 1
        for key in test.iterkeys():
            lm = NgramModel(3, train[key])
            for s in test[key]:
                if len(s[-1]) > 0:
                    e = lm.entropy(s[-1])
                    # s[0] is xml id
                    # s[1] is div index
                    # key is global index
                    # s[2] is turn index
                    # s[3] is local index
                    fw.write(','.join(map(str, (s[0], s[1], key, s[2], s[3], e))) + '\n')
            # print progress
            sys.stdout.write('\rkey = %d/%d done' % (count, len(test)))
            sys.stdout.flush()
            count += 1
        fw.close()
        print 'fold %d/%d done' % (i+1, len(folds))

def get_similarity(q, idx, tf_idf):
    COUNT_WORDS_IN_TEXT = 100
    # collect one candidate document per text_id
    docs_for_sim = {}
    for info in tf_idf:
        ds2_lst = info[1]
        for ds in ds2_lst:
            docs_for_sim[ds.text_id] = ds

    rank_lm = []
    est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    q_sequence = wordpunct_tokenize(q)
    for ds in docs_for_sim.itervalues():
        # train a trigram model on the document text and score it by the cross-entropy
        # of the query under that model: lower entropy means a closer match
        sequence = wordpunct_tokenize(ds.text)
        lm = NgramModel(3, sequence, estimator=est)
        ds.weight = lm.entropy(q_sequence)
        # if len(ds.text) > COUNT_WORDS_IN_TEXT:
        rank_lm.append(ds)

    return sorted(rank_lm, key=lambda ds: ds.weight)
Example #3
    SAMPLE_SENTS_NUM = 100
    for fold_idx, fold in enumerate(folds):  # takes 37 min
        # print progress
        print 'fold %d/%d started' % (fold_idx + 1, len(folds))
        # read test sents
        test_sents = read_firstN_sents(cur,
                                       fold,
                                       SAMPLE_SENTS_NUM,
                                       include_loc=True)
        # read the train sents
        train_convIDs = reduce(lambda a, b: a + b,
                               folds[:fold_idx] + folds[fold_idx + 1:])
        train_sents = read_firstN_sents(cur, train_convIDs, SAMPLE_SENTS_NUM)
        # open the file to write the results
        fw = open(res_file_name, 'ab')
        # train and compute
        key_count = 1
        for key in test_sents.iterkeys():
            lm = NgramModel(3, train_sents[key])
            for s in test_sents[key]:
                e = lm.entropy(s[2])
                fw.write(','.join((str(s[0]), str(key), str(s[1]), str(e))) +
                         '\n')
            # print progress
            sys.stdout.write('\rkey = %d/%d done' %
                             (key_count, len(test_sents)))
            sys.stdout.flush()
            key_count += 1
        fw.close()
        # print progress
        print '\nfold %d/%d done\n' % (fold_idx + 1, len(folds))
Example #4

# <markdowncell>

# $$
# P(X_i \mid X_{i-1}, \ldots, X_{i-n+1})
# $$
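
# <markdowncell>

# For the bigram model trained in the next cell (n = 2), this conditional is simply
# $P(X_i \mid X_{i-1})$; the Lidstone estimator supplied below smooths each conditional
# distribution by adding 0.2 to every count (add-gamma smoothing).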

# <codecell>

from nltk.corpus import brown
from nltk.model.ngram import NgramModel
from nltk.probability import WittenBellProbDist, LidstoneProbDist

# hold out the last 500 words of the Brown corpus for evaluation
train_words = brown.words()[:-500]
test_words = brown.words()[-500:]
# bigram model with Lidstone (add-0.2) smoothing
lm = NgramModel(2, train_words, lambda fd, b: LidstoneProbDist(fd, 0.2))

# <codecell>

lm.entropy(test_words)
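
# <markdowncell>

# Roughly, `entropy` returns the average negative log2-probability per word that the model
# assigns to the held-out text,
#
# $$
# H \approx -\frac{1}{N} \sum_{i} \log_2 P(w_i \mid w_{i-1}),
# $$
#
# so lower values mean the bigram model predicts the test words better.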

# <markdowncell>

# ### Counting
# 
# For example, how many words in a corpus are not in WordNet?

# <codecell>

from nltk.corpus import wordnet, treebank
from nltk.probability import ConditionalFreqDist

# for each POS tag, count tokens that do / do not have at least one WordNet synset
cfd = ConditionalFreqDist(
    (pos, len(wordnet.synsets(word)) > 0) for word, pos in treebank.tagged_words()
)
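
# <markdowncell>

# A rough way to read the answer off the distribution above: the `False` bucket under each POS
# tag counts tokens whose word has no WordNet synset, so summing it over all tags gives the
# total number of tagged tokens not covered by WordNet.

# <codecell>

sum(cfd[pos][False] for pos in cfd.conditions())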
Example #5
    folds = tmp

    # remove the existing result file
    res_file_name = 'results_swbd_nltk_CV.txt'
    if os.path.isfile(res_file_name):
        os.remove(res_file_name)

    SAMPLE_SENTS_NUM = 100
    for fold_idx, fold in enumerate(folds): # takes 37 min
        # print progress
        print 'fold %d/%d started' % (fold_idx+1, len(folds))
        # read test sents
        test_sents = read_firstN_sents(cur, fold, SAMPLE_SENTS_NUM, include_loc = True)
        # read the train sents
        train_convIDs = reduce(lambda a,b: a+b, folds[:fold_idx] + folds[fold_idx+1:])
        train_sents = read_firstN_sents(cur, train_convIDs, SAMPLE_SENTS_NUM)
        # open the file to write the results
        fw = open(res_file_name, 'ab')
        # train and compute
        key_count = 1
        for key in test_sents.iterkeys():
            lm = NgramModel(3, train_sents[key])
            for s in test_sents[key]:
                e = lm.entropy(s[2])
                fw.write(','.join((str(s[0]), str(key), str(s[1]), str(e))) + '\n')
            # print progress
            sys.stdout.write('\rkey = %d/%d done' % (key_count, len(test_sents)))
            sys.stdout.flush()
            key_count += 1
        fw.close()
        # print progress
        print '\nfold %d/%d done\n' % (fold_idx+1, len(folds))