#!/usr/bin/env python
# -*- coding: utf-8 -*-

from nltk.probability import ConditionalFreqDist
from nltk.corpus import TaggedCorpusReader
from nltk.tag import simplify

# Print the frequency of every word carrying one simplified part-of-speech
# tag in the tagged corpus stored under ./data.

# NOTE(review): FIRST and END are defined but not referenced anywhere in the
# visible code -- presumably meant to bound the printed range; confirm.
FIRST = 0
END = 150

# Simplified tag whose word frequencies are printed.
# Other values that were tried here: "N", "ADJ".
POS = "V"

corpus_root = './data'
fileids = 'tagged_sent'

corpus = TaggedCorpusReader(corpus_root, fileids, encoding='utf-8')

# Collect (simplified tag, lowercased word) pairs: the tag is the condition
# and the word is the sample counted under it.
processing = []
for word, tag in corpus.tagged_words():
    processing.append((simplify.simplify_wsj_tag(tag), word.lower()))

cfd_corpus = ConditionalFreqDist(processing)

# One "<word> <count>" line per distinct word seen with the POS tag.
for term, freq in cfd_corpus[POS].items():
    print("%s %s" % (term.encode("utf-8"), freq))
# Example #2
# 0
from gmail_corpus.nltk_util.bigram_score import make_score_dict, save_score_dict
from nltk.corpus import TaggedCorpusReader
import numpy as np
from glob import glob
import os, sys

if __name__ == '__main__':
    # Usage: <script> CORPUS_DIR
    #
    # Deletes zero-byte *.txt files directly inside CORPUS_DIR, reads the
    # remaining *.txt files as a tagged corpus, computes the bigram score
    # dictionary from its tagged words and saves it to 'bigram_scores.pkl'
    # in the current working directory.
    if len(sys.argv) < 2:
        # Fail with a readable message instead of an IndexError traceback.
        sys.exit('usage: %s CORPUS_DIR' % sys.argv[0])
    corpus_path = sys.argv[1]

    # Remove empty files. NOTE: this is destructive -- the files are deleted
    # from the corpus directory itself, not merely skipped.
    for path in glob(os.path.join(corpus_path, '*.txt')):
        if os.path.getsize(path) == 0:
            os.remove(path)
            print('Removed empty file %s' % path)

    # Raw string so that `\.` reaches the regex engine as an escaped dot
    # rather than being treated as a (deprecated) string escape.
    corpus = TaggedCorpusReader(corpus_path, r'.*\.txt')
    score_dict = make_score_dict(corpus.tagged_words())
    save_score_dict(score_dict, 'bigram_scores.pkl')
    print('saved bigram_scores.pkl')