import re import sys import lib f = open(sys.argv[1]) raw = f.read() lines = lib.get_dat(raw) list = lib.get_eterms(lines) list = [entry.lower() for entry in list] terms = [entry.strip() for entry in list] words = lib.collapse([entry.split() for entry in list]) termset = set(terms) wordset = set(words) worddict = dict([(el, (0,0,0,0,0)) for el in wordset]) for term in terms: term = term.split() for i in range(len(term)): f,a,b,c,d = worddict[term[i]] if len(term) == 1: worddict[term[i]] = f+1,a+1,b,c,d elif i == 0: worddict[term[i]] = f+1,a,b+1,c,d elif i == (len(term)-1): worddict[term[i]] = f+1,a,b,c,d+1 else:
"""Print per-word frequencies, tab-separated, most frequent first.

Reads the data file named by argv[1], gets term -> frequency counts from
lib.get_freq, and credits each word of a term with that term's frequency.
"""
import re
import sys
import lib

# Read the whole input file; close the handle deterministically.
with open(sys.argv[1]) as datafile:
    raw = datafile.read()

terms = lib.get_freq(raw)
# get_freq can yield an empty term; drop it before word-splitting.
if '' in terms:
    terms.pop('')

# Every distinct word appearing in any term.  (Original shadowed the
# builtin `list` here; renamed.)
wordset = set(lib.collapse([term.split() for term in terms.keys()]))
wfreq = dict((word, 0) for word in wordset)

# A word occurring twice in one term is credited twice, as before.
for term, freq in terms.items():
    for word in term.split():
        wfreq[word] += freq

# Tab-separated output, sorted by descending frequency.
# print(x) with a single argument behaves identically under Python 2 and 3.
for word, freq in sorted(wfreq.items(), key=lambda entry: entry[1], reverse=True):
    print(word + '\t' + str(freq))
# Count how many string pairs lie within a small Levenshtein distance.
# NOTE(review): this chunk starts mid-script -- max_dist, args, lowercase and
# byword are bound above this excerpt (presumably by option parsing), as are
# the sys / itertools / lib imports.  Confirm against the full file.
sys.stderr.write("Max dist: " + str(max_dist) + "\n")
# Read from the positional file argument if given, otherwise from stdin.
if len(args) == 0:
    raw = sys.stdin.read()
else:
    f = open(args[0])
    raw = f.read()
lines = lib.get_dat(raw)
# Field [2] of each parsed record holds the text being compared.
lines = [line[2] for line in lines]
if lowercase:
    lines = [line.lower() for line in lines]
if byword:
    # Compare individual words instead of whole lines.
    lines = lib.collapse([line.split() for line in lines])
wordset = set(lines)
sys.stderr.write(str(len(wordset)) + "\n")
allcount = 0  # pairs examined so far
scount = 0    # pairs found within max_dist
# O(n^2) scan over all unordered pairs, with a progress report to stderr
# every 10000 pairs.
for str1, str2 in itertools.combinations(wordset, 2):
    if allcount % 10000 == 0:
        sys.stderr.write("allcount: " + str(allcount) + "\n")
        sys.stderr.write(" scount: " + str(scount) + "\n")
    allcount += 1
    dist = lib.levenshtein(str1, str2)
    if dist <= max_dist:
        scount += 1
"""Per-word positional frequencies within multi-word terms.

For each word, accumulates five frequency-weighted counters:
total occurrences, occurrences as a one-word term, as the first word,
as a middle word, and as the last word of a term.  Output is
tab-separated in that order, sorted by descending total.
"""
import re
import sys
import lib

# Fix: the original reused `f` both as this file handle and as the first
# element of the counter tuple, clobbering the handle and never closing it.
with open(sys.argv[1]) as datafile:
    raw = datafile.read()

terms = lib.get_freq(raw)
# get_freq can yield an empty term; drop it before word-splitting.
if '' in terms:
    terms.pop('')

# Every distinct word appearing in any term.
wordset = set(lib.collapse([term.split() for term in terms.keys()]))
# word -> (total, alone, first, middle, last), all weighted by term count.
counts = dict((word, (0, 0, 0, 0, 0)) for word in wordset)

for term, count in terms.items():
    words = term.split()
    last_index = len(words) - 1
    for i, word in enumerate(words):
        total, alone, head, mid, tail = counts[word]
        total += count
        if last_index == 0:      # single-word term
            alone += count
        elif i == 0:             # first word of a longer term
            head += count
        elif i == last_index:    # last word
            tail += count
        else:                    # interior word
            mid += count
        counts[word] = (total, alone, head, mid, tail)

# Tab-separated output, sorted by descending total frequency.
# print(x) with a single argument behaves identically under Python 2 and 3.
for word, (total, alone, head, mid, tail) in sorted(
        counts.items(), key=lambda entry: entry[1][0], reverse=True):
    print('\t'.join([word, str(total), str(alone), str(head), str(mid), str(tail)]))
"""Count individual (lowercased) words across all English terms.

Reads the data file named by argv[1], extracts English terms via
lib.get_eterms, splits them into words, and prints each word with its
occurrence count, tab-separated, most frequent first.
"""
import re
import sys
import lib

# Read the whole input file; close the handle deterministically.
with open(sys.argv[1]) as datafile:
    raw = datafile.read()

lines = lib.get_dat(raw)
terms = lib.get_eterms(lines)
# Lowercase and split in one pass, then flatten to a single word list.
# (Original shadowed the builtin `list` here; renamed.)
words = lib.collapse([term.lower().split() for term in terms])

# Tally occurrences of each distinct word.
counts = dict((word, 0) for word in set(words))
for word in words:
    counts[word] += 1

# Tab-separated output, sorted by descending count.
# print(x) with a single argument behaves identically under Python 2 and 3.
for word, count in sorted(counts.items(), key=lambda entry: entry[1], reverse=True):
    print(word + '\t' + str(count))
# Count how many string pairs lie within a small Levenshtein distance.
# NOTE(review): near-duplicate of another chunk in this file; this one starts
# mid-script -- max_dist, args, lowercase and byword are bound above this
# excerpt (presumably by option parsing), as are the sys / itertools / lib
# imports.  Confirm against the full file.
sys.stderr.write("Max dist: " + str(max_dist) + "\n")
# Read from the positional file argument if given, otherwise from stdin.
if len(args) == 0:
    raw = sys.stdin.read()
else:
    f = open(args[0])
    raw = f.read()
lines = lib.get_dat(raw)
# Field [2] of each parsed record holds the text being compared.
lines = [line[2] for line in lines]
if lowercase:
    lines = [line.lower() for line in lines]
if byword:
    # Compare individual words instead of whole lines.
    lines = lib.collapse([line.split() for line in lines])
wordset = set(lines)
sys.stderr.write(str(len(wordset)) + "\n")
allcount = 0  # pairs examined so far
scount = 0    # pairs found within max_dist
# O(n^2) scan over all unordered pairs, with a progress report to stderr
# every 10000 pairs.
for str1, str2 in itertools.combinations(wordset, 2):
    if allcount%10000 == 0:
        sys.stderr.write("allcount: " + str(allcount) + "\n")
        sys.stderr.write(" scount: " + str(scount) + "\n")
    allcount += 1
    dist = lib.levenshtein(str1, str2)
    if dist <= max_dist:
        scount += 1
# Locate keyword phrases inside abstracts and pair each keyword token with
# its in-context POS tag.
# NOTE(review): this chunk starts mid-script -- `lines` plus
# PunktSentenceTokenizer, word_tokenize, pos_tag and `tag` are bound above
# this excerpt (presumably NLTK imports and lib.get_dat); the chunk is also
# truncated at the end (the code consuming this_word/this_pos/out is cut off).
sys.stderr.write(str(len(lines)) + " entries\n")
p = PunktSentenceTokenizer()
# Simplified-tagset classes of interest; unused in the visible portion --
# presumably consulted in the truncated tail.  TODO confirm.
taking_pos = set(["ADJ", "ADV", "FW", "N", "NP", "NUM", "VG", "VN"])
for i in range(len(lines)):
    if i % 100 == 0:  # progress report to stderr every 100 entries
        sys.stderr.write(str(i) + "/" + str(len(lines)) + "\n")
    line = dict(lines[i])
    # Only process entries that carry both English keywords and an abstract.
    if "EKYWD" in line and "EABST" in line:
        abstract = line["EABST"]
        abstract = p.tokenize(abstract)                 # sentence-split
        abstract = [word_tokenize(sent) for sent in abstract]
        abstract = lib.collapse(abstract)               # flat token list
        pos_abstract = pos_tag(abstract)
        # Reduce WSJ-style tags to NLTK's simplified tagset.
        pos_abstract = [(word, tag.simplify.simplify_wsj_tag(t)) for word, t in pos_abstract]
        # Keywords come tab-separated; tokenize each the same way as the text.
        keywords = re.split("\t", line["EKYWD"])
        keywords = [word_tokenize(keyword) for keyword in keywords]
        j = 0
        # Slide over the abstract, testing every keyword at position j.
        while j < len(abstract):
            found = False
            for k in range(len(keywords)):
                keyword = keywords[k]
                keyword_len = len(keyword)
                # Token-exact phrase match of the keyword at offset j.
                if keyword_len > 0 and keyword == abstract[j:j+keyword_len]:
                    for l in range(keyword_len):
                        this_word = keyword[l]
                        this_pos = pos_abstract[j+l][1]
                        out = ""
import re import sys import lib f = open(sys.argv[1]) raw = f.read() terms = lib.get_freq(raw) if '' in terms: terms.pop('') wordset = set(lib.collapse([term.split() for term in terms.keys()])) worddict = dict([(el, (0, 0, 0, 0, 0)) for el in wordset]) for term, count in terms.items(): term = term.split() for i in range(len(term)): f, a, b, c, d = worddict[term[i]] if len(term) == 1: worddict[term[i]] = f + count, a + count, b, c, d elif i == 0: worddict[term[i]] = f + count, a, b + count, c, d elif i == (len(term) - 1): worddict[term[i]] = f + count, a, b, c, d + count else: worddict[term[i]] = f + count, a, b, c + count, d for word, (f, a, b, c, d) in sorted(worddict.items(), key=lambda entry: entry[1][0], reverse=True):
# Locate keyword phrases inside abstracts and pair each keyword token with
# its in-context POS tag.
# NOTE(review): near-duplicate of another chunk in this file; it starts
# mid-script -- `lines` plus PunktSentenceTokenizer, word_tokenize, pos_tag
# and `tag` are bound above this excerpt (presumably NLTK imports and
# lib.get_dat) -- and is truncated at the end.
sys.stderr.write(str(len(lines)) + " entries\n")
p = PunktSentenceTokenizer()
# Simplified-tagset classes of interest; unused in the visible portion --
# presumably consulted in the truncated tail.  TODO confirm.
taking_pos = set(["ADJ", "ADV", "FW", "N", "NP", "NUM", "VG", "VN"])
for i in range(len(lines)):
    if i % 100 == 0:  # progress report to stderr every 100 entries
        sys.stderr.write(str(i) + "/" + str(len(lines)) + "\n")
    line = dict(lines[i])
    # Only process entries that carry both English keywords and an abstract.
    if "EKYWD" in line and "EABST" in line:
        abstract = line["EABST"]
        abstract = p.tokenize(abstract)                 # sentence-split
        abstract = [word_tokenize(sent) for sent in abstract]
        abstract = lib.collapse(abstract)               # flat token list
        pos_abstract = pos_tag(abstract)
        # Reduce WSJ-style tags to NLTK's simplified tagset.
        pos_abstract = [(word, tag.simplify.simplify_wsj_tag(t)) for word, t in pos_abstract]
        # Keywords come tab-separated; tokenize each the same way as the text.
        keywords = re.split("\t", line["EKYWD"])
        keywords = [word_tokenize(keyword) for keyword in keywords]
        j = 0
        # Slide over the abstract, testing every keyword at position j.
        while j < len(abstract):
            found = False
            for k in range(len(keywords)):
                keyword = keywords[k]
                keyword_len = len(keyword)
                # Token-exact phrase match of the keyword at offset j.
                if keyword_len > 0 and keyword == abstract[j:j + keyword_len]:
                    for l in range(keyword_len):
                        this_word = keyword[l]
                        this_pos = pos_abstract[j + l][1]