Code Example #1
def decode(model, istream, ostream, estream=sys.stderr):

    # reads in the model
    logging.info('Loading model: %s', model)
    T, vocab = load_model(model)
    logging.info('%d patterns and %d entries', len(vocab), T.size)

    # detect whether document boundary tokens were used in the model
    boundaries = '<doc>' in vocab
    # reads in the test documents
    logging.info('Reading test documents in (boundaries=%s) ...', boundaries)
    documents = read_documents(istream, boundaries)  
    logging.info('%d test documents read', len(documents))
   
    # encode test documents using the model's vocabulary
    test = encode_test_documents(documents, vocab)

    # computes the log likelihood of each document
    L = loglikelihood(test, T)

    # dumps scores
    print >> ostream, '#doc\t#logprob\t#sentences\t#s_normalised\t#patterns\t#p_normalised'
    for i, ll in enumerate(L):
        num_sentences = len(test[i])
        num_patterns = sum(len(row) for row in test[i])
        print >> ostream, '{0}\t{1}\t{2}\t{3}\t{4}\t{5}'.format(i, ll, num_sentences, 
                ll/num_sentences, num_patterns, ll/num_patterns)
    print >> estream, '#sum\t#mean'
    print >> estream, '{0}\t{1}'.format(L.sum(), np.mean(L))
Code Example #2
def decode(model, istream, ostream, estream=sys.stderr):

    # reads in the model
    logging.info('Loading model: %s', model)
    T, vocab = load_model(model)
    logging.info('%d patterns and %d entries', len(vocab), T.size)

    # detect whether document boundary tokens were used in the model
    boundaries = '<doc>' in vocab
    # reads in the test documents
    logging.info('Reading test documents in (boundaries=%s) ...', boundaries)
    documents = read_documents(istream, boundaries)
    logging.info('%d test documents read', len(documents))

    # encode test documents using the model's vocabulary
    test = encode_test_documents(documents, vocab)

    # computes the log likelihood of each document
    L = loglikelihood(test, T)

    # dumps scores
    print >> ostream, '#doc\t#logprob\t#sentences\t#s_normalised\t#patterns\t#p_normalised'
    for i, ll in enumerate(L):
        num_sentences = len(test[i])
        num_patterns = sum(len(row) for row in test[i])
        print >> ostream, '{0}\t{1}\t{2}\t{3}\t{4}\t{5}'.format(
            i, ll, num_sentences, ll / num_sentences, num_patterns,
            ll / num_patterns)
    print >> estream, '#sum\t#mean'
    print >> estream, '{0}\t{1}'.format(L.sum(), np.mean(L))
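For reference, a minimal usage sketch of the decode() function above, assuming it sits in a module together with the helpers it calls (load_model, read_documents, encode_test_documents, loglikelihood); the file names below are placeholders, not paths from the project.

# Hypothetical invocation: decode() writes per-document scores to ostream
# and a sum/mean summary line to estream (stderr by default).
import sys

with open('test_documents.txt') as istream, open('scores.tsv', 'w') as ostream:
    decode('patterns.model', istream, ostream, estream=sys.stderr)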
Code Example #3
def decode(unigrams, bigrams, c, istream, ostream, estream=sys.stderr):

    # reads in the model
    logging.info('Loading model: %s and %s', unigrams, bigrams)
    U, B, vocab = load_model(unigrams, bigrams)
    logging.info('%d unigrams and %d bigrams', U.shape[0], B.shape[0])

    # detect whether document boundary tokens were used in the model
    boundaries = '<doc>' in vocab
    # detect whether insertion was switched on (row 0 belongs to the reserved <null> symbol)
    insertion = B[0, :].sum() > 0
    # reads in the test documents
    logging.info('Reading test documents in (boundaries=%s) ...', boundaries)
    documents = read_documents(istream, boundaries)
    logging.info('%d test documents read', len(documents))

    # encode test documents using the model's vocabulary
    test = encode_test_documents(documents, vocab)

    # computes the log likelihood of each document
    L = loglikelihood(test, U, B, c, insertion)

    # dumps scores
    print >> ostream, '#doc\t#logprob\t#sentences\t#s_normalised\t#patterns\t#p_normalised'
    for i, ll in enumerate(L):
        num_sentences = len(test[i])
        num_patterns = sum(len(row) for row in test[i])
        print >> ostream, '{0}\t{1}\t{2}\t{3}\t{4}\t{5}'.format(
            i, ll, num_sentences, ll / num_sentences, num_patterns,
            ll / num_patterns)
    print >> estream, '#sum\t#mean'
    print >> estream, '{0}\t{1}'.format(L.sum(), np.mean(L))
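The constant c forwarded to loglikelihood() is the smoothing parameter (the training script below logs it as such). As an illustration only, one common way such a constant enters a bigram model is additive (add-c) smoothing; the sketch below is generic and is not the project's own loglikelihood, which also has to walk the document structure and handle insertion.

import numpy as np

def addc_bigram_prob(u, v, U, B, c):
    # Generic add-c smoothed estimate of p(pattern v | trigger u):
    # every bigram receives a pseudo-count of c, normalised over the vocabulary.
    V = U.shape[0]
    return (B[u, v] + c) / (U[u] + c * V)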
Code Example #4
def decode_many(model, ipaths, opaths, jobs, estream=sys.stderr):

    # reads in the model
    logging.info('Loading model: %s', model)
    T, vocab = load_model(model)
    logging.info('%d patterns and %d entries', len(vocab), T.size)

    # detect whether document boundary tokens were used in the model
    boundaries = '<doc>' in vocab

    # reads in the test documents
    logging.info('Reading test documents in (boundaries=%s) ...', boundaries)

    tests = [None] * len(ipaths)
    for i, ipath in enumerate(ipaths):
        documents = read_documents(smart_open(ipath), boundaries)  
        logging.info('%s: %d test documents read', ipath, len(documents))
        # encode test documents using the model's vocabulary
        tests[i] = encode_test_documents(documents, vocab)

    # computes the log likelihood of each document in each test file
    pool = Pool(jobs)
    all_L = pool.map(partial(wrapped_loglikelihood, T=T), tests)

    print >> estream, '#file\t#sum\t#mean'
    for ipath, opath, test, L in izip(ipaths, opaths, tests, all_L):
        with smart_open(opath, 'w') as ostream:
            # dumps scores
            print >> ostream, '#doc\t#logprob\t#sentences\t#s_normalised\t#patterns\t#p_normalised'
            for i, ll in enumerate(L):
                num_sentences = len(test[i])
                num_patterns = sum(len(row) for row in test[i])
                print >> ostream, '{0}\t{1}\t{2}\t{3}\t{4}\t{5}'.format(i, ll, num_sentences, 
                        ll/num_sentences, num_patterns, ll/num_patterns)
            print >> estream, '{0}\t{1}\t{2}'.format(opath, L.sum(), np.mean(L))
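Pool.map pickles the callable it sends to worker processes, so the per-file scoring goes through a plain module-level function, with the model bound via functools.partial, rather than a lambda or closure. A guess at the minimal shape of wrapped_loglikelihood (the project's version may add error handling or logging):

# Hypothetical stand-in for the wrapper used above.
def wrapped_loglikelihood(test, T):
    return loglikelihood(test, T)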
Code Example #5
def decode(unigrams, bigrams, c, istream, ostream, estream=sys.stderr):

    # reads in the model
    logging.info('Loading model: %s and %s', unigrams, bigrams)
    U, B, vocab = load_model(unigrams, bigrams)
    logging.info('%d unigrams and %d bigrams', U.shape[0], B.shape[0])

    # detect whether document boundary tokens were used in the model
    boundaries = '<doc>' in vocab
    # detect whether insertion was switched on (row 0 belongs to the reserved <null> symbol)
    insertion = B[0, :].sum() > 0
    # reads in the test documents
    logging.info('Reading test documents in (boundaries=%s) ...', boundaries)
    documents = read_documents(istream, boundaries)  
    logging.info('%d test documents read', len(documents))
   
    # encode test documents using the model's vocabulary
    test = encode_test_documents(documents, vocab)

    # computes the log likelihood of each document
    L = loglikelihood(test, U, B, c, insertion)

    # dumps scores
    print >> ostream, '#doc\t#logprob\t#sentences\t#s_normalised\t#patterns\t#p_normalised'
    for i, ll in enumerate(L):
        num_sentences = len(test[i])
        num_patterns = sum(len(row) for row in test[i])
        print >> ostream, '{0}\t{1}\t{2}\t{3}\t{4}\t{5}'.format(i, ll, num_sentences, 
                ll/num_sentences, num_patterns, ll/num_patterns)
    print >> estream, '#sum\t#mean'
    print >> estream, '{0}\t{1}'.format(L.sum(), np.mean(L))
Code Example #6
def main(args):
    """load data and optimise the likelihood"""
    logging.basicConfig(
        level=(logging.DEBUG if args.verbose else logging.INFO),
        format='%(levelname)s %(message)s')

    # read in documents
    logging.info('Reading documents in ...')
    documents = read_documents(sys.stdin, args.boundary)
    logging.info('%d documents, on average %.2f sentences per document',
                 len(documents), np.mean([len(D) for D in documents]))

    least_common, min_count = find_least_common(documents) if args.unk else (
        frozenset(), 0)
    if args.unk:
        logging.info('Least common patterns: frequency=%d patterns=%d',
                     min_count, len(least_common))

    # decide whether or not there will be a null symbol
    # encode documents using numpy array of ids
    T, vocab = encode_documents(documents, ignore=least_common)

    # gather unigram and bigram counts
    logging.info('Counting ...')
    U, B = count(T, len(vocab), insertion=args.insertion)
    logging.info('%d unigrams, %d bigrams', U.size, B.size)

    # compute log likelihood
    logging.info('Computing likelihood ...')
    ll = loglikelihood(T, U, B, args.smoothing, args.insertion)
    logging.info('Negative log likelihood %f with c=%f and insertion=%s', -ll,
                 args.smoothing, args.insertion)

    # dumps U and B in a nice format
    tokens = [t for t, i in sorted(vocab.iteritems(), key=lambda (t, i): i)]
    V = len(tokens)
    logging.info('Writing unigrams to: %s', '{0}.unigrams'.format(args.output))
    with open('{0}.unigrams'.format(args.output), 'w') as fu:
        print >> fu, '#pattern\t#count'
        for u, n in sorted(enumerate(U), key=lambda (u, n): n, reverse=True):
            print >> fu, '{0}\t{1}'.format(tokens[u], n)
    logging.info('Writing bigrams to: %s', '{0}.bigrams'.format(args.output))
    with open('{0}.bigrams'.format(args.output), 'w') as fb:
        print >> fb, '#trigger\t#pattern\t#count'
        for u in xrange(V):
            # we iterate over triggers so that the most likely ones come first
            for v in sorted(itertools.ifilter(lambda v: B[u, v], xrange(V)),
                            key=lambda v: B[u, v],
                            reverse=True):
                print >> fb, '{0}\t{1}\t{2}'.format(
                    tokens[u], tokens[v], B[u, v])

    # legacy options: optimise likelihood
    if args.mle:
        logging.info('Minimising negative log likelihood')
        print minimize(T, U, B, args.insertion)
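main() reads verbose, boundary, unk, insertion, smoothing, output and mle off args, which suggests an argparse interface roughly like the sketch below; the option names, help strings and defaults here are guesses, not the script's actual definitions.

import argparse

def argparser_sketch():
    # Hypothetical parser consistent with the attributes used in main() above;
    # the real script's flags and defaults may differ.
    parser = argparse.ArgumentParser(description='train the pattern bigram coherence model')
    parser.add_argument('output', help='prefix for the .unigrams/.bigrams output files')
    parser.add_argument('--boundary', action='store_true', help='documents use <doc> boundary tokens')
    parser.add_argument('--insertion', action='store_true', help='allow <null> insertions')
    parser.add_argument('--unk', action='store_true', help='map the least common patterns to <unk>')
    parser.add_argument('--smoothing', type=float, default=1.0, help='additive smoothing constant c')
    parser.add_argument('--mle', action='store_true', help='legacy: minimise the negative log likelihood')
    parser.add_argument('--verbose', action='store_true', help='debug-level logging')
    return parser

if __name__ == '__main__':
    main(argparser_sketch().parse_args())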
Code Example #7
File: alouis.py  Project: karins/CoherenceFramework
def main(args):
    """load data and optimise the likelihood"""
    logging.basicConfig(
            level=(logging.DEBUG if args.verbose else logging.INFO), 
            format='%(levelname)s %(message)s')

    # read in documents
    logging.info('Reading documents in ...')
    documents = read_documents(sys.stdin, args.boundary)
    logging.info('%d documents, on average %.2f sentences per document',
                 len(documents), np.mean([len(D) for D in documents]))
    
    least_common, min_count = find_least_common(documents) if args.unk else (frozenset(), 0)
    if args.unk:
        logging.info('Least common patterns: frequency=%d patterns=%d', min_count, len(least_common))
   
    # decide whether or not there will be a null symbol
    # encode documents using numpy array of ids
    T, vocab = encode_documents(documents, ignore=least_common)
   
    # gather unigram and bigram counts
    logging.info('Counting ...')    
    U, B = count(T, len(vocab), insertion=args.insertion)
    logging.info('%d unigrams, %d bigrams', U.size, B.size)

    # compute log likelihood
    logging.info('Computing likelihood ...')
    ll = loglikelihood(T, U, B, args.smoothing, args.insertion)
    logging.info('Negative log likelihood %f with c=%f and insertion=%s', -ll, args.smoothing, args.insertion)
    
    # dumps U and B in a nice format
    tokens = [t for t, i in sorted(vocab.iteritems(), key=lambda (t, i): i)]
    V = len(tokens)
    logging.info('Writing unigrams to: %s', '{0}.unigrams'.format(args.output))
    with open('{0}.unigrams'.format(args.output), 'w') as fu:
        print >> fu, '#pattern\t#count'
        for u, n in sorted(enumerate(U), key=lambda (u, n): n, reverse=True):
            print >> fu, '{0}\t{1}'.format(tokens[u], n)
    logging.info('Writing bigrams to: %s', '{0}.bigrams'.format(args.output))
    with open('{0}.bigrams'.format(args.output), 'w') as fb:
        print >> fb, '#trigger\t#pattern\t#count'
        for u in xrange(V):
            # we iterate over triggers so that the most likely ones come first
            for v in sorted(itertools.ifilter(lambda v: B[u,v], xrange(V)), key=lambda v: B[u,v], reverse=True):
                print >> fb, '{0}\t{1}\t{2}'.format(tokens[u], tokens[v], B[u,v])

    # legacy options: optimise likelihood
    if args.mle:
        logging.info('Minimising negative log likelihood')
        print minimize(T, U, B, args.insertion)
Code Example #8
def decode_many(unigrams,
                bigrams,
                c,
                ipaths,
                opaths,
                jobs,
                estream=sys.stderr):

    # reads in the model
    logging.info('Loading model: %s and %s', unigrams, bigrams)
    U, B, vocab = load_model(unigrams, bigrams)
    logging.info('%d unigrams and %d bigrams', U.shape[0], B.shape[0])

    # detect whether document boundary tokens were used in the model
    boundaries = '<doc>' in vocab
    # detect whether insertion was switched on (row 0 belongs to the reserved <null> symbol)
    insertion = B[0, :].sum() > 0

    # reads in the test documents
    logging.info('Reading test documents in (boundaries=%s) ...', boundaries)

    tests = [None] * len(ipaths)
    for i, ipath in enumerate(ipaths):
        documents = read_documents(smart_open(ipath), boundaries)
        logging.info('%s: %d test documents read', ipath, len(documents))
        # encode test documents using the model's vocabulary
        tests[i] = encode_test_documents(documents, vocab)

    # computes the log likelihood of each document in each test file
    pool = Pool(jobs)
    all_L = pool.map(
        partial(wrapped_loglikelihood, U=U, B=B, c=c, insertion=insertion),
        tests)

    print >> estream, '#file\t#sum\t#mean'
    for ipath, opath, test, L in izip(ipaths, opaths, tests, all_L):
        with smart_open(opath, 'w') as ostream:
            # dumps scores
            print >> ostream, '#doc\t#logprob\t#sentences\t#s_normalised\t#patterns\t#p_normalised'
            for i, ll in enumerate(L):
                num_sentences = len(test[i])
                num_patterns = sum(len(row) for row in test[i])
                print >> ostream, '{0}\t{1}\t{2}\t{3}\t{4}\t{5}'.format(
                    i, ll, num_sentences, ll / num_sentences, num_patterns,
                    ll / num_patterns)
            print >> estream, '{0}\t{1}\t{2}'.format(opath, L.sum(),
                                                     np.mean(L))
Code Example #9
File: ibm1.py  Project: tony-hong/CoherenceFramework
def main(args):
    
    logging.basicConfig(
            level=(logging.DEBUG if args.verbose else logging.INFO), 
            format='%(asctime)s %(levelname)s %(message)s', datefmt='%m/%d/%Y %H:%M:%S')

    # read documents 
    logging.info('Reading documents in...')
    documents = read_documents(args.input, args.boundary)
    logging.info('%d documents read', len(documents))

    least_common, min_count = find_least_common(documents) if args.unk else (frozenset(), 0)
    if args.unk:
        logging.info('Least common patterns: frequency=%d patterns=%d', min_count, len(least_common))

    # maps tokens to integer ids (0 is reserved for a special <null> symbol)
    # and encodes the training data using numpy arrays of vocab ids
    logging.info('Making vocab')
    corpus, vocab = encode_documents(documents, ignore=least_common)
    logging.info('%d tokens read (including <null> and <unk>)', len(vocab))

    # estimates parameters T[f,e] = t(f|e)
    # where (e, f) are syntactic patterns occurring in adjacent sentences in a document
    T, LL = ibm1(corpus, len(vocab), args.max_iterations, args.min_gain, args.progress)
    T = np.nan_to_num(T)
    
    # store the log-likelihood values
    if args.ll:
        with open(args.ll, 'w') as fo:
            for ll in LL:
                fo.write('{0}\n'.format(ll))

    # dumps T in a nice format
    tokens = [t for t, i in sorted(vocab.iteritems(), key=lambda (t, i): i)]
    V = len(tokens)
    # we print a header so that the meaning of each column is clear
    print >> args.output, '#trigger\t#pattern\t#p(pattern|trigger)'  # note that e=trigger and f=pattern 
    # we iterate over f in no particular order (simply that of the vocabulary ids)
    for f in xrange(V):
        # we iterate over triggers so that the most likely ones come first
        for e in sorted(itertools.ifilter(lambda e: T[f, e], xrange(V)),
                        key=lambda e: T[f, e], reverse=True):
            print >> args.output, '{0}\t{1}\t{2}'.format(
                tokens[e], tokens[f], T[f, e])
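ibm1() estimates T[f, e] = t(f|e) by expectation-maximisation. A compact, generic IBM Model 1 EM loop consistent with that convention is sketched below for illustration; the project's implementation, which pairs patterns from adjacent sentences and reports progress, will differ in detail.

import numpy as np

def ibm1_em_sketch(pairs, V, max_iterations=50, min_gain=1e-4):
    # pairs: iterable of (e_row, f_row) arrays of ids in [0, V); T[f, e] = t(f|e).
    T = np.full((V, V), 1.0 / V)                # uniform initialisation
    LL = []
    for it in range(max_iterations):
        counts = np.zeros((V, V))
        ll = 0.0
        for e_row, f_row in pairs:
            e_row = np.asarray(e_row)
            for f in f_row:
                probs = T[f, e_row]             # t(f|e) for every candidate trigger e
                Z = probs.sum()
                ll += np.log(Z / len(e_row))
                # E-step: fractional counts (np.add.at handles repeated ids in e_row)
                np.add.at(counts, (f, e_row), probs / Z)
        # M-step: renormalise so that sum_f t(f|e) = 1 for every trigger e
        denom = counts.sum(axis=0, keepdims=True)
        denom[denom == 0.0] = 1.0
        T = counts / denom
        LL.append(ll)
        if it > 0 and LL[-1] - LL[-2] < min_gain:
            break
    return np.nan_to_num(T), LL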