def compute(corpus_file, pvalue, use_perm, out_filename, stopw=None,
            min_count=5, min_bigram_count=5, min_char_count=3,
            encoding='utf-8'):
    """Recursively find collocations for a given corpus and write the
    marginal counts to a specified file.

    :param corpus_file: string with the corpus file name (one doc per line)
    :param pvalue: significance level for the likelihood-ratio test
    :param use_perm: Boolean. Score by permutation
    :param out_filename: file name to write the term|count pairs into
    :param stopw: list of stopwords to apply to the analysis
    :param min_count: minimum count passed to the nested significance test
    :param min_bigram_count: NOTE(review): accepted but never used below —
        kept for backward compatibility; confirm intent with callers
    :param min_char_count: minimum characters per word for the char filter
    :param encoding: encoding of the corpus file
    :return: the counts object produced by tt.nested_sig_bigrams
    """
    sys.stdout.write("computing n-grams from %s\n" % corpus_file)

    # Install the stop-word list as module-global state on tt.
    if stopw is None:
        tt._stop_words = []
    else:
        assert isinstance(stopw, list)
        tt._stop_words = stopw

    ### read corpus
    with codecs.open(corpus_file, encoding=encoding) as f:
        corpus = f.readlines()

    ### set up recursive hypothesis tests
    lr = tt.LikelihoodRatio(pvalue=pvalue, use_perm=use_perm)

    def iter_gen():
        for doc in corpus:
            yield doc

    # note: some hidden defaults here, e.g., no numbers
    char_filter = tt.make_char_filter(min_char_count)

    def my_filter(w):
        # BUG FIX: the original omitted `return`, so this always returned
        # None (falsy) and would have rejected every word if used.
        return char_filter(w) and tt.stop_filter(w) and tt.digit_filter(w)

    def update_fun(count, doc):
        count.update_counts(doc, root_filter=tt.stop_filter)

    ### compute significant n-grams
    cnts = tt.nested_sig_bigrams(iter_gen, update_fun, lr, min_count)

    ### write n-grams to file
    sys.stdout.write("writing to %s\n" % out_filename)
    with codecs.open(out_filename, 'w', encoding='utf-8') as f:
        # This can be adjusted to write out any information you need.
        # (A plain loop, not a side-effect list comprehension.)
        for term, count in sorted(cnts.marg.items(), key=lambda x: -x[1]):
            f.write(u'{0:s}|{1:g}\n'.format(term, count))

    # Fixed typo ("seleced") and replaced the Py2-only print statement.
    sys.stdout.write("Number of selected bigrams: %d\n" % len(cnts.vocab))
    tt.write_vocab(cnts.marg, 'ngram_counts.csv')
    return cnts
def main(vocab_file, assign_file, corpus_file, ntopics, out,
         use_perm=True, min_count=25, pvalue=.001):
    """Compute significant n-grams per topic and write one vocab file each.

    :param vocab_file: vocabulary file read by read_vocab
    :param assign_file: per-word topic assignment file (LDA output)
    :param corpus_file: corpus file, one document per line
    :param ntopics: number of topics to process (0 .. ntopics-1)
    :param out: output filename prefix; files are named '<out>topic%04d.txt'
    :param use_perm: Boolean. Score by permutation
    :param min_count: minimum count passed to turbo_topic
    :param pvalue: significance level for the hypothesis test
    """
    vocab = read_vocab(vocab_file)
    assigns = parse_word_assignments(assign_file, vocab)
    # BUG FIX: the original used the Py2-only file() builtin and never
    # closed the handle; open() in a with-block fixes the leak.
    with open(corpus_file) as f:
        corpus = f.readlines()
    for topic in range(ntopics):
        sys.stdout.write('writing topic %d\n' % topic)
        sig_bigrams = turbo_topic(corpus, assigns, topic,
                                  use_perm=use_perm, min=min_count,
                                  pvalue=pvalue)
        tt.write_vocab(sig_bigrams.marg,
                       '%stopic%04d.txt' % (out, topic), incl_stop=True)
# Command-line driver: run turbo_topic over every topic of an LDA fit.
# Explicit import instead of the original `from optparse import *`.
from optparse import OptionParser

import sys

parser = OptionParser()
parser.add_option("--corpus", type="string", dest="corpus")
parser.add_option("--assign", type="string", dest="assignments")
parser.add_option("--vocab", type="string", dest="vocab")
parser.add_option("--perm", action="store_true", dest="use_perm")
parser.add_option("--pval", type="float", dest="pvalue")
parser.add_option("--out", type="string", dest="out")
parser.add_option("--min-count", type="float", dest="min_count")
parser.add_option("--ntopics", type="int", dest="ntopics")
# BUG FIX: the default must be keyed on the option *dest* ("pvalue");
# the original used "pval", so omitting --pval left opt.pvalue = None.
parser.set_defaults(min_count=25, use_perm=False, pvalue=0.001)
(opt, args) = parser.parse_args()

vocab = read_vocab(opt.vocab)
assigns = parse_word_assignments(opt.assignments, vocab)
# BUG FIX: replaced the leaking Py2-only file() call with a with-block.
with open(opt.corpus) as f:
    corpus = f.readlines()

for topic in range(opt.ntopics):
    # BUG FIX: bare `stdout` was an undefined name (NameError);
    # `from optparse import *` does not provide it.
    sys.stdout.write('writing topic %d\n' % topic)
    sig_bigrams = turbo_topic(corpus, assigns, topic, use_perm=opt.use_perm,
                              min=opt.min_count, pvalue=opt.pvalue)
    tt.write_vocab(sig_bigrams.marg, '%stopic%03d.txt' % (opt.out, topic))

# Example invocation:
# python lda_topics.py --assign=word-assignments.dat --corpus=corpus.txt --vocab=vocab.dat --out=tt --pval=0.001 --min-count=25 --perm
# NOTE(review): this is a near-identical redefinition of an earlier
# `compute` in this file; whichever is defined last wins. Consider
# deleting one copy.
def compute(corpus_file, pvalue, use_perm, out_filename, stopw=None,
            min_count=5, min_bigram_count=5, min_char_count=3,
            encoding='utf-8'):
    """Recursively find collocations for a given corpus and write the
    marginal counts to a specified file.

    :param corpus_file: string with the corpus file name (one doc per line)
    :param pvalue: significance level for the likelihood-ratio test
    :param use_perm: Boolean. Score by permutation
    :param out_filename: file name to write the term|count pairs into
    :param stopw: list of stopwords to apply to the analysis
    :param min_count: minimum count passed to the nested significance test
    :param min_bigram_count: NOTE(review): accepted but never used below —
        kept for backward compatibility; confirm intent with callers
    :param min_char_count: minimum characters per word for the char filter
    :param encoding: encoding of the corpus file
    :return: the counts object produced by tt.nested_sig_bigrams
    """
    sys.stdout.write("computing n-grams from %s\n" % corpus_file)

    # Install the stop-word list as module-global state on tt.
    if stopw is None:
        tt._stop_words = []
    else:
        assert isinstance(stopw, list)
        tt._stop_words = stopw

    ### read corpus
    with codecs.open(corpus_file, encoding=encoding) as f:
        corpus = f.readlines()

    ### set up recursive hypothesis tests
    lr = tt.LikelihoodRatio(pvalue=pvalue, use_perm=use_perm)

    def iter_gen():
        for doc in corpus:
            yield doc

    # note: some hidden defaults here, e.g., no numbers
    char_filter = tt.make_char_filter(min_char_count)

    def my_filter(w):
        # BUG FIX: the original omitted `return`, so this always returned
        # None (falsy) and would have rejected every word if used.
        return char_filter(w) and tt.stop_filter(w) and tt.digit_filter(w)

    def update_fun(count, doc):
        count.update_counts(doc, root_filter=tt.stop_filter)

    ### compute significant n-grams
    cnts = tt.nested_sig_bigrams(iter_gen, update_fun, lr, min_count)

    ### write n-grams to file
    sys.stdout.write("writing to %s\n" % out_filename)
    with codecs.open(out_filename, 'w', encoding='utf-8') as f:
        # This can be adjusted to write out any information you need.
        # (A plain loop, not a side-effect list comprehension.)
        for term, count in sorted(cnts.marg.items(), key=lambda x: -x[1]):
            f.write(u'{0:s}|{1:g}\n'.format(term, count))

    # Fixed typo ("seleced") and replaced the Py2-only print statement.
    sys.stdout.write("Number of selected bigrams: %d\n" % len(cnts.vocab))
    tt.write_vocab(cnts.marg, 'ngram_counts.csv')
    return cnts