Example #1
0
terms = lib.get_freq(raw)

if '' in terms: terms.pop('')

sys.stderr.write("Extracted freq\n")
sys.stderr.write("Total: " + str(len(terms)) + " entries\n")

posfreq = dict([])

count = 0
tcount = len(terms)

for entry in terms:
	if count % 1000 == 0:
		sys.stderr.write("Tagging word " + str(count) + "/" + str(tcount) + "\n");
	count += 1
	pos = nltk.pos_tag(nltk.word_tokenize(entry))
	pos = [word[1] for word in pos]
	pos = lib.collapse_string(pos, '/')
	if pos in posfreq:
		posfreq[pos] += terms[entry]
	else:
		posfreq[pos] = terms[entry]

sys.stderr.write("End tagging\n")

for pos, freq in sorted(posfreq.items(), key = lambda entry: entry[1], reverse=True):
	print pos + '\t' + str(freq)


Example #2
0
wnl = nltk.stem.WordNetLemmatizer()

for entry in terms:
    if count % 1000 == 0:
        sys.stderr.write("Tagging and lemmatizing word " + str(count) + "/" +
                         str(tcount) + "\n")
    count += 1
    pos = nltk.pos_tag(nltk.word_tokenize(entry))
    lterm = []
    for word in pos:
        if word[1][0] == 'N':
            lterm.append(wnl.lemmatize(word[0]))
        elif word[1][0] == 'V':
            lterm.append(wnl.lemmatize(word[0], 'v'))
        elif word[1][0] == 'J':
            lterm.append(wnl.lemmatize(word[0], 'a'))
        else:
            lterm.append(word[0])
    lterm = lib.collapse_string(lterm, ' ')
    if lterm in lfreq:
        lfreq[lterm] += terms[entry]
    else:
        lfreq[lterm] = terms[entry]

sys.stderr.write("End tagging\n")

for lword, freq in sorted(lfreq.items(),
                          key=lambda entry: entry[1],
                          reverse=True):
    print lword + '\t' + str(freq)
Example #3
0
import re
import sys
import lib

argc = len(sys.argv)

terms = dict([])

for i in range(1,argc):
	f = open(sys.argv[i])
	sys.stderr.write("Opened " + sys.argv[i] + "\n")
	raw = f.read()
	new_terms = lib.get_multidict(raw)
	for entry in new_terms.items():
		if entry[0] in terms:
			v = terms[entry[0]]
			for j in range(len(v)):
				v[j] += entry[1][j]
			terms[entry[0]] = v
		else:
			terms[entry[0]] = entry[1]
	sys.stderr.write("Combined\n")

sys.stderr.write("Start writing output " + str(len(terms)) + " entries\n")
for word, freq in sorted(terms.items(), key=lambda entry: entry[1][0], reverse=True):
	print word + '\t' + lib.collapse_string([str(f) for f in freq], '\t')

Example #4
0
    count += 1
    add_text = [word for word in re.split("[()]", line[2]) if word != ""]
    for i in range(len(add_text)):
        if lemmatize:
            pos = nltk.pos_tag(add_text[i].split())
            lterm = []
            for word, p in pos:
                if word[0] == 'N':
                    lterm.append(wnl.lemmatize(word))
                elif word[0] == 'V':
                    lterm.append(wnl.lemmatize(word, 'v'))
                elif word[0] == 'J':
                    lterm.append(wnl.lemmatize(word, 'a'))
                else:
                    lterm.append(word)
            add_text[i] = lib.collapse_string(lterm, ' ')
    if lowercase:
        add_text = [word.lower() for word in add_text]
    if (line[1] in je_dict):
        for word in add_text:
            je_dict[line[1]].add(word)
    else:
        je_dict[line[1]] = set(add_text)
        

sys.stderr.write("Finished making dict\n")

sys.stderr.write(str(len(je_dict)))

for jw, ew in sorted(je_dict.items(), key=lambda entry: len(entry[1]), reverse=True):
	print jw + '\t' + str(len(ew)) + '\t' + lib.collapse_string(list(ew), '\t')
Example #5
0
tcount = len(terms)

wnl = nltk.stem.WordNetLemmatizer()

for entry in terms:
	if count %1000 == 0:
		sys.stderr.write("Tagging and lemmatizing word " + str(count) + "/" + str(tcount) + "\n")
	count += 1
	pos = nltk.pos_tag(nltk.word_tokenize(entry))
	lterm = []
	for word in pos:
		if word[1][0] == 'N':
			lterm.append(wnl.lemmatize(word[0]))
		elif word[1][0] == 'V':
			lterm.append(wnl.lemmatize(word[0], 'v'))
		elif word[1][0] == 'J':
			lterm.append(wnl.lemmatize(word[0], 'a'))
		else:
			lterm.append(word[0])
	lterm = lib.collapse_string(lterm, ' ')
	if lterm in lfreq:
		lfreq[lterm] += terms[entry]
	else:
		lfreq[lterm] = terms[entry]

sys.stderr.write("End tagging\n")

for lword, freq in sorted(lfreq.items(), key = lambda entry: entry[1], reverse=True):
	print lword + '\t' + str(freq)
	
Example #6
0
    count += 1
    add_text = [word for word in re.split("[()]", line[2]) if word != ""]
    for i in range(len(add_text)):
        if lemmatize:
            pos = nltk.pos_tag(add_text[i].split())
            lterm = []
            for word, p in pos:
                if word[0] == 'N':
                    lterm.append(wnl.lemmatize(word))
                elif word[0] == 'V':
                    lterm.append(wnl.lemmatize(word, 'v'))
                elif word[0] == 'J':
                    lterm.append(wnl.lemmatize(word, 'a'))
                else:
                    lterm.append(word)
            add_text[i] = lib.collapse_string(lterm, ' ')
    if lowercase:
        add_text = [word.lower() for word in add_text]
    if (line[1] in je_dict):
        for word in add_text:
            je_dict[line[1]].add(word)
    else:
        je_dict[line[1]] = set(add_text)

sys.stderr.write("Finished making dict\n")

sys.stderr.write(str(len(je_dict)))

for jw, ew in sorted(je_dict.items(),
                     key=lambda entry: len(entry[1]),
                     reverse=True):
Example #7
0
terms = lib.get_freq(raw)

if "" in terms:
    terms.pop("")

sys.stderr.write("Extracted freq\n")
sys.stderr.write("Total: " + str(len(terms)) + " entries\n")

posfreq = dict([])

count = 0
tcount = len(terms)

for entry in terms:
    if count % 1000 == 0:
        sys.stderr.write("Tagging word " + str(count) + "/" + str(tcount) + "\n")
    count += 1
    pos = nltk.pos_tag(nltk.word_tokenize(entry))
    pos = [word[1] for word in pos]
    pos = lib.collapse_string(pos, "/")
    if pos in posfreq:
        posfreq[pos] += terms[entry]
    else:
        posfreq[pos] = terms[entry]

sys.stderr.write("End tagging\n")

for pos, freq in sorted(posfreq.items(), key=lambda entry: entry[1], reverse=True):
    print pos + "\t" + str(freq)
Example #8
0
# Map each distinct string in s to its '/'-joined POS-tag sequence.
posdict = {}

sys.stderr.write("Start tagging\n")

count = 0
tcount = len(s)

for entry in s:
    # Progress report every 1000 entries.
    if count % 1000 == 0:
        sys.stderr.write("Tagging word " + str(count) + "/" + str(tcount) +
                         "\n")
    count += 1
    tagged_entry = nltk.pos_tag(nltk.word_tokenize(entry))
    posdict[entry] = lib.collapse_string([pair[1] for pair in tagged_entry], '/')

sys.stderr.write("End tagging\n")

posset = set(posdict.values())

# Start every distinct tag string at zero occurrences.
posfreq = dict.fromkeys(posset, 0)

sys.stderr.write("Start counting for each POS tag string\n")

# NOTE(review): `list` here presumably names a sequence bound earlier in the
# file that shadows the builtin; iterating the builtin type itself would
# raise a TypeError — confirm against the missing part of the file.
for entry in list:
    if entry != '':
        posfreq[posdict[entry]] += 1

sys.stderr.write("Finished counting\n")
sys.stderr.write("Start writing output\n")
Example #9
0
# Ignore the empty string if it made it into the set.
s.discard('')

# Map each distinct string in s to its '/'-joined POS-tag sequence.
posdict = {}

sys.stderr.write("Start tagging\n")

count = 0
tcount = len(s)

for entry in s:
    # Progress report every 1000 entries.
    if count % 1000 == 0:
        sys.stderr.write("Tagging word " + str(count) + "/" + str(tcount) + "\n")
    count += 1
    tokens = nltk.word_tokenize(entry)
    tagged = nltk.pos_tag(tokens)
    posdict[entry] = lib.collapse_string([pair[1] for pair in tagged], '/')

sys.stderr.write("End tagging\n")

posset = set(posdict.values())

# Start every distinct tag string at zero occurrences.
posfreq = {}
for posent in posset:
    posfreq[posent] = 0

sys.stderr.write("Start counting for each POS tag string\n")

# NOTE(review): `list` here presumably names a sequence bound earlier in the
# file that shadows the builtin; iterating the builtin type itself would
# raise a TypeError — confirm against the missing part of the file.
for entry in list:
    if entry != '':
        posfreq[posdict[entry]] += 1

sys.stderr.write("Finished counting\n")
sys.stderr.write("Start writing output\n")