Example #1
0
def compute(corpus_file, pvalue, use_perm, out_filename, stopw=None, min_count=5,
         min_bigram_count=5, min_char_count=3, encoding='utf-8'):

    """
    Recursively find collocations for a given corpus and write
    the marginal counts to a specified file.

    :param corpus_file: string with file name of the corpus (one doc per line)
    :param pvalue: significance threshold for the likelihood-ratio test
    :param use_perm: Boolean. Score by permutation
    :param out_filename: file name to write `term|count` lines into
    :param stopw: List of stopwords to apply to the analysis
    :param min_count: minimum marginal count passed to tt.nested_sig_bigrams
    :param min_bigram_count: minimum bigram count (accepted but not used below)
    :param min_char_count: minimum character count for the word filter
    :param encoding: Encoding of the corpus file
    :return: the counts object returned by tt.nested_sig_bigrams
    """

    sys.stdout.write("computing n-grams from %s\n" % corpus_file)

    if stopw is None:
        tt._stop_words = []
    else:
        assert isinstance(stopw, list)
        tt._stop_words = stopw

    ### read corpus
    with codecs.open(corpus_file, encoding=encoding) as f:
        corpus = f.readlines()

    ### set up recursive hypothesis tests
    lr = tt.LikelihoodRatio(pvalue=pvalue, use_perm=use_perm)
    def iter_gen():
        for doc in corpus:
            yield doc
    # note: some hidden defaults here, e.g., no numbers
    char_filter = tt.make_char_filter(min_char_count)
    def my_filter(w):
        # BUG FIX: the original omitted `return`, so this filter always
        # yielded None (falsy) regardless of the predicates.
        # NOTE(review): my_filter is defined but not passed to any call below —
        # confirm whether it was meant to be the root_filter in update_fun.
        return char_filter(w) and tt.stop_filter(w) and tt.digit_filter(w)
    def update_fun(count, doc):
        count.update_counts(doc, root_filter=tt.stop_filter)

    ### compute significant n-grams
    cnts = tt.nested_sig_bigrams(iter_gen, update_fun, lr, min_count)

    ### write n-grams to file
    sys.stdout.write("writing to %s\n" % out_filename)
    with codecs.open(out_filename, 'w', encoding='utf-8') as f:
        # sorted by descending marginal count; adjust to write any info needed
        for term, count in sorted(cnts.marg.items(), key=lambda x: -x[1]):
            f.write(u'{0:s}|{1:g}\n'.format(term, count))
    # was a Py2 `print` statement with a "seleced" typo; fixed both
    sys.stdout.write("Number of selected bigrams: %d\n" % len(cnts.vocab))
    tt.write_vocab(cnts.marg, 'ngram_counts.csv')
    return cnts
Example #2
0
def main(vocab_file, assign_file, corpus_file, ntopics, out, use_perm=True,
        min_count=25, pvalue=.001):
    """
    Run turbo_topic over each of `ntopics` topics and write one significant-
    bigram vocabulary file per topic.

    :param vocab_file: path to the vocabulary file (read by read_vocab)
    :param assign_file: path to the per-word topic assignment file
    :param corpus_file: path to the corpus, one document per line
    :param ntopics: number of topics to process
    :param out: filename prefix; output files are '%stopic%04d.txt' % (out, t)
    :param use_perm: Boolean. Score by permutation
    :param min_count: minimum count passed to turbo_topic as `min`
    :param pvalue: significance threshold for turbo_topic
    """
    vocab = read_vocab(vocab_file)
    assigns = parse_word_assignments(assign_file, vocab)
    # BUG FIX: `file()` is a Python-2-only builtin and the handle was never
    # closed; use open() in a context manager instead.
    with open(corpus_file) as f:
        corpus = f.readlines()

    for topic in range(ntopics):
        sys.stdout.write('writing topic %d\n' % topic)
        sig_bigrams = turbo_topic(corpus, assigns, topic,
                                  use_perm=use_perm,
                                  min=min_count,
                                  pvalue=pvalue)
        tt.write_vocab(sig_bigrams.marg,
                       '%stopic%04d.txt' % (out, topic), incl_stop=True)
Example #3
0
    from optparse import *

    parser = OptionParser()
    parser.add_option("--corpus", type="string", dest="corpus")
    parser.add_option("--assign", type="string", dest="assignments")
    parser.add_option("--vocab", type="string", dest="vocab")
    parser.add_option("--perm", action="store_true", dest="use_perm")
    parser.add_option("--pval", type="float", dest="pvalue")
    parser.add_option("--out", type="string", dest="out")
    parser.add_option("--min-count", type="float", dest="min_count")
    parser.add_option("--ntopics", type="int", dest="ntopics")
    parser.set_defaults(min_count=25, use_perm=False, pval=0.001)

    (opt, args) = parser.parse_args()

    vocab = read_vocab(opt.vocab)
    assigns = parse_word_assignments(opt.assignments, vocab)
    corpus = file(opt.corpus).readlines()

    for topic in range(opt.ntopics):
        stdout.write('writing topic %d\n' % topic)
        sig_bigrams = turbo_topic(corpus, assigns, topic,
                                  use_perm=opt.use_perm,
                                  min=opt.min_count,
                                  pvalue=opt.pvalue)
        tt.write_vocab(sig_bigrams.marg,
                       '%stopic%03d.txt' % (opt.out, topic))


# python lda_topics.py --assign=word-assignments.dat --corpus=corpus.txt --vocab=vocab.dat --out=tt --pval=0.001 --min-count=25 --perm
Example #4
0
def compute(corpus_file,
            pvalue,
            use_perm,
            out_filename,
            stopw=None,
            min_count=5,
            min_bigram_count=5,
            min_char_count=3,
            encoding='utf-8'):
    """
    Recursively find collocations for a given corpus and write
    the marginal counts to a specified file.

    :param corpus_file: string with file name of the corpus (one doc per line)
    :param pvalue: significance threshold for the likelihood-ratio test
    :param use_perm: Boolean. Score by permutation
    :param out_filename: file name to write `term|count` lines into
    :param stopw: List of stopwords to apply to the analysis
    :param min_count: minimum marginal count passed to tt.nested_sig_bigrams
    :param min_bigram_count: minimum bigram count (accepted but not used below)
    :param min_char_count: minimum character count for the word filter
    :param encoding: Encoding of the corpus file
    :return: the counts object returned by tt.nested_sig_bigrams
    """

    sys.stdout.write("computing n-grams from %s\n" % corpus_file)

    if stopw is None:
        tt._stop_words = []
    else:
        assert isinstance(stopw, list)
        tt._stop_words = stopw

    ### read corpus
    with codecs.open(corpus_file, encoding=encoding) as f:
        corpus = f.readlines()

    ### set up recursive hypothesis tests
    lr = tt.LikelihoodRatio(pvalue=pvalue, use_perm=use_perm)

    def iter_gen():
        for doc in corpus:
            yield doc

    # note: some hidden defaults here, e.g., no numbers
    char_filter = tt.make_char_filter(min_char_count)

    def my_filter(w):
        # BUG FIX: the original omitted `return`, so this filter always
        # yielded None (falsy) regardless of the predicates.
        # NOTE(review): my_filter is defined but not passed to any call below —
        # confirm whether it was meant to be the root_filter in update_fun.
        return char_filter(w) and tt.stop_filter(w) and tt.digit_filter(w)

    def update_fun(count, doc):
        count.update_counts(doc, root_filter=tt.stop_filter)

    ### compute significant n-grams
    cnts = tt.nested_sig_bigrams(iter_gen, update_fun, lr, min_count)

    ### write n-grams to file
    sys.stdout.write("writing to %s\n" % out_filename)
    with codecs.open(out_filename, 'w', encoding='utf-8') as f:
        # sorted by descending marginal count; adjust to write any info needed
        for term, count in sorted(cnts.marg.items(), key=lambda x: -x[1]):
            f.write(u'{0:s}|{1:g}\n'.format(term, count))
    # was a Py2 `print` statement with a "seleced" typo; fixed both
    sys.stdout.write("Number of selected bigrams: %d\n" % len(cnts.vocab))
    tt.write_vocab(cnts.marg, 'ngram_counts.csv')
    return cnts
Example #5
0
    from optparse import *

    parser = OptionParser()
    parser.add_option("--corpus", type="string", dest="corpus")
    parser.add_option("--assign", type="string", dest="assignments")
    parser.add_option("--vocab", type="string", dest="vocab")
    parser.add_option("--perm", action="store_true", dest="use_perm")
    parser.add_option("--pval", type="float", dest="pvalue")
    parser.add_option("--out", type="string", dest="out")
    parser.add_option("--min-count", type="float", dest="min_count")
    parser.add_option("--ntopics", type="int", dest="ntopics")
    parser.set_defaults(min_count=25, use_perm=False, pval=0.001)

    (opt, args) = parser.parse_args()

    vocab = read_vocab(opt.vocab)
    assigns = parse_word_assignments(opt.assignments, vocab)
    corpus = file(opt.corpus).readlines()

    for topic in range(opt.ntopics):
        stdout.write('writing topic %d\n' % topic)
        sig_bigrams = turbo_topic(corpus,
                                  assigns,
                                  topic,
                                  use_perm=opt.use_perm,
                                  min=opt.min_count,
                                  pvalue=opt.pvalue)
        tt.write_vocab(sig_bigrams.marg, '%stopic%03d.txt' % (opt.out, topic))

# python lda_topics.py --assign=word-assignments.dat --corpus=corpus.txt --vocab=vocab.dat --out=tt --pval=0.001 --min-count=25 --perm