terms = lib.get_freq(raw) if '' in terms: terms.pop('') sys.stderr.write("Extracted freq\n") sys.stderr.write("Total: " + str(len(terms)) + " entries\n") posfreq = dict([]) count = 0 tcount = len(terms) for entry in terms: if count % 1000 == 0: sys.stderr.write("Tagging word " + str(count) + "/" + str(tcount) + "\n"); count += 1 pos = nltk.pos_tag(nltk.word_tokenize(entry)) pos = [word[1] for word in pos] pos = lib.collapse_string(pos, '/') if pos in posfreq: posfreq[pos] += terms[entry] else: posfreq[pos] = terms[entry] sys.stderr.write("End tagging\n") for pos, freq in sorted(posfreq.items(), key = lambda entry: entry[1], reverse=True): print pos + '\t' + str(freq)
wnl = nltk.stem.WordNetLemmatizer() for entry in terms: if count % 1000 == 0: sys.stderr.write("Tagging and lemmatizing word " + str(count) + "/" + str(tcount) + "\n") count += 1 pos = nltk.pos_tag(nltk.word_tokenize(entry)) lterm = [] for word in pos: if word[1][0] == 'N': lterm.append(wnl.lemmatize(word[0])) elif word[1][0] == 'V': lterm.append(wnl.lemmatize(word[0], 'v')) elif word[1][0] == 'J': lterm.append(wnl.lemmatize(word[0], 'a')) else: lterm.append(word[0]) lterm = lib.collapse_string(lterm, ' ') if lterm in lfreq: lfreq[lterm] += terms[entry] else: lfreq[lterm] = terms[entry] sys.stderr.write("End tagging\n") for lword, freq in sorted(lfreq.items(), key=lambda entry: entry[1], reverse=True): print lword + '\t' + str(freq)
import re import sys import lib argc = len(sys.argv) terms = dict([]) for i in range(1,argc): f = open(sys.argv[i]) sys.stderr.write("Opened " + sys.argv[i] + "\n") raw = f.read() new_terms = lib.get_multidict(raw) for entry in new_terms.items(): if entry[0] in terms: v = terms[entry[0]] for j in range(len(v)): v[j] += entry[1][j] terms[entry[0]] = v else: terms[entry[0]] = entry[1] sys.stderr.write("Combined\n") sys.stderr.write("Start writing output " + str(len(terms)) + " entries\n") for word, freq in sorted(terms.items(), key=lambda entry: entry[1][0], reverse=True): print word + '\t' + lib.collapse_string([str(f) for f in freq], '\t')
count += 1 add_text = [word for word in re.split("[()]", line[2]) if word != ""] for i in range(len(add_text)): if lemmatize: pos = nltk.pos_tag(add_text[i].split()) lterm = [] for word, p in pos: if word[0] == 'N': lterm.append(wnl.lemmatize(word)) elif word[0] == 'V': lterm.append(wnl.lemmatize(word, 'v')) elif word[0] == 'J': lterm.append(wnl.lemmatize(word, 'a')) else: lterm.append(word) add_text[i] = lib.collapse_string(lterm, ' ') if lowercase: add_text = [word.lower() for word in add_text] if (line[1] in je_dict): for word in add_text: je_dict[line[1]].add(word) else: je_dict[line[1]] = set(add_text) sys.stderr.write("Finished making dict\n") sys.stderr.write(str(len(je_dict))) for jw, ew in sorted(je_dict.items(), key=lambda entry: len(entry[1]), reverse=True): print jw + '\t' + str(len(ew)) + '\t' + lib.collapse_string(list(ew), '\t')
tcount = len(terms) wnl = nltk.stem.WordNetLemmatizer() for entry in terms: if count %1000 == 0: sys.stderr.write("Tagging and lemmatizing word " + str(count) + "/" + str(tcount) + "\n") count += 1 pos = nltk.pos_tag(nltk.word_tokenize(entry)) lterm = [] for word in pos: if word[1][0] == 'N': lterm.append(wnl.lemmatize(word[0])) elif word[1][0] == 'V': lterm.append(wnl.lemmatize(word[0], 'v')) elif word[1][0] == 'J': lterm.append(wnl.lemmatize(word[0], 'a')) else: lterm.append(word[0]) lterm = lib.collapse_string(lterm, ' ') if lterm in lfreq: lfreq[lterm] += terms[entry] else: lfreq[lterm] = terms[entry] sys.stderr.write("End tagging\n") for lword, freq in sorted(lfreq.items(), key = lambda entry: entry[1], reverse=True): print lword + '\t' + str(freq)
# Fragment: interior of a per-line loop, plus the start of the output
# pass.  NOTE(review): the enclosing `for line in ...:` header precedes
# this chunk and the body of the final `for jw, ew in ...:` loop follows
# it -- both are outside the visible region.  `count`, `line`,
# `lemmatize`, `lowercase`, `je_dict`, `wnl`, `nltk`, `lib`, `re` and
# `sys` are bound earlier in the file.
count += 1
# Split the parenthesized glosses out of line[2], dropping the empty
# pieces re.split produces at the delimiters.
add_text = [word for word in re.split("[()]", line[2]) if word != ""]
for i in range(len(add_text)):
    if lemmatize:
        pos = nltk.pos_tag(add_text[i].split())
        lterm = []
        for word, p in pos:
            # NOTE(review): these branches test word[0] (first character
            # of the TOKEN); the parallel lemmatizing code elsewhere in
            # this file tests the POS tag (word[1][0]).  This looks like
            # a bug -- confirm whether p[0] was intended.
            if word[0] == 'N':
                lterm.append(wnl.lemmatize(word))
            elif word[0] == 'V':
                lterm.append(wnl.lemmatize(word, 'v'))
            elif word[0] == 'J':
                lterm.append(wnl.lemmatize(word, 'a'))
            else:
                lterm.append(word)
        add_text[i] = lib.collapse_string(lterm, ' ')
if lowercase:
    add_text = [word.lower() for word in add_text]
# Accumulate glosses under the headword line[1].
if (line[1] in je_dict):
    for word in add_text:
        je_dict[line[1]].add(word)
else:
    je_dict[line[1]] = set(add_text)
sys.stderr.write("Finished making dict\n")
sys.stderr.write(str(len(je_dict)))
# Output pass: entries sorted by gloss-set size, largest first.  The loop
# body is past the end of the visible chunk.
for jw, ew in sorted(je_dict.items(), key=lambda entry: len(entry[1]), reverse=True):
terms = lib.get_freq(raw) if "" in terms: terms.pop("") sys.stderr.write("Extracted freq\n") sys.stderr.write("Total: " + str(len(terms)) + " entries\n") posfreq = dict([]) count = 0 tcount = len(terms) for entry in terms: if count % 1000 == 0: sys.stderr.write("Tagging word " + str(count) + "/" + str(tcount) + "\n") count += 1 pos = nltk.pos_tag(nltk.word_tokenize(entry)) pos = [word[1] for word in pos] pos = lib.collapse_string(pos, "/") if pos in posfreq: posfreq[pos] += terms[entry] else: posfreq[pos] = terms[entry] sys.stderr.write("End tagging\n") for pos, freq in sorted(posfreq.items(), key=lambda entry: entry[1], reverse=True): print pos + "\t" + str(freq)
# Build posdict: distinct term -> its '/'-joined POS-tag string, then
# count how many raw occurrences fall under each tag string.
# NOTE(review): `s` (the collection of distinct terms -- presumably a
# set) and `list` (the raw token sequence; a variable that SHADOWS the
# builtin, defined before this chunk) come from earlier in the file.
# Confirm `list` is the intended iterable -- iterating the builtin type
# would raise a TypeError.
posdict = dict([])
sys.stderr.write("Start tagging\n")
count = 0
tcount = len(s)
for entry in s:
    # Progress line every 1000 terms (POS tagging dominates runtime).
    if count % 1000 == 0:
        sys.stderr.write("Tagging word " + str(count) + "/" + str(tcount) + "\n")
    count += 1
    tagged_entry = nltk.pos_tag(nltk.word_tokenize(entry))
    tag_list = [word[1] for word in tagged_entry]
    posdict[entry] = lib.collapse_string(tag_list, '/')
sys.stderr.write("End tagging\n")
# One zero-initialized counter per distinct tag string.
posset = set(posdict.values())
posfreq = dict([(posent, 0) for posent in posset])
sys.stderr.write("Start counting for each POS tag string\n")
for entry in list:
    if entry != '':
        posfreq[posdict[entry]] += 1
sys.stderr.write("Finished counting\n")
sys.stderr.write("Start writing output\n")
# Map each distinct term to its '/'-joined POS-tag string, then count raw
# token occurrences per tag string (same pipeline as the other tagging
# pass in this file).
# NOTE(review): `s` and `list` (a variable shadowing the builtin) are
# bound before this chunk; `posset` is kept as a named variable in case
# later (unseen) code reads it.
s.discard('')
posdict = dict([])
sys.stderr.write("Start tagging\n")
count = 0
tcount = len(s)
for term in s:
    if count % 1000 == 0:
        sys.stderr.write("Tagging word " + str(count) + "/" + str(tcount) + "\n")
    count += 1
    tags = [pair[1] for pair in nltk.pos_tag(nltk.word_tokenize(term))]
    posdict[term] = lib.collapse_string(tags, '/')
sys.stderr.write("End tagging\n")
posset = set(posdict.values())
# Zero a counter for every distinct tag string.
posfreq = dict([])
for posent in posset:
    posfreq[posent] = 0
sys.stderr.write("Start counting for each POS tag string\n")
for entry in list:
    if entry != '':
        posfreq[posdict[entry]] += 1
sys.stderr.write("Finished counting\n")
sys.stderr.write("Start writing output\n")