def main():
    """Interactively score words from a list against the Collins dictionary.

    Reads one word per line from sys.argv[1], skips the first
    int(sys.argv[2]) words, and prompts the user for a 0-5 score per word.

    Returns a list of (word, label) pairs where label is the user's raw
    input, 'NOT FOUND' (no Collins entry) or 'NOT NOUN' (no noun sense,
    or unparsable/out-of-range input).
    """
    with open("/home/beka/thesis/resources/collins/collins.pickle", "rb") as f:
        col = collins.CollinsDictionary(pickle.load(f))
    with open(sys.argv[1]) as f:
        words = [line.strip() for line in f if line.strip()]
    words = words[int(sys.argv[2]):]
    output = []
    for word in words:
        entries = col.by_form(word)
        if not entries:
            output.append((word, 'NOT FOUND'))
            continue
        if all(s.pos != POSTags.Noun for e in entries for s in e.senses):
            output.append((word, 'NOT NOUN'))
            continue
        for entry in entries:
            print("\n\n\n{}#{}".format(entry.key, entry.context))
            print("\n===\n".join(str(s) for s in entry.senses))
        user = input(
            "{}\nselect 0-5, Q or anything else to skip: ".format(word))
        if user == 'Q':
            return output
        try:
            score = int(user)
            # explicit raise instead of `assert` (asserts vanish under -O)
            if not 0 <= score <= 5:
                raise ValueError('score out of range: {}'.format(score))
            output.append((word, user))
        except ValueError:
            # Narrowed from a bare `except:`, which also swallowed
            # KeyboardInterrupt and genuine bugs; only bad user input
            # should land here.
            # NOTE(review): skipped/invalid input is labelled 'NOT NOUN' —
            # confirm that is intended rather than a distinct 'SKIPPED'.
            output.append((word, 'NOT NOUN'))
    return output
def main():
    """Interactively score random Collins noun entries into a score file.

    Loads already-scored "<key>\\t<score>" pairs from sys.argv[1] so those
    nouns are never offered again, then repeatedly shows a random unseen
    noun entry and asks for a 0-5 score.  Accepted scores are written,
    normalized to score / 5, to sys.argv[1] + '.new' (flushed per entry
    so progress survives an abort).  'Q' quits; anything unparsable skips
    the entry for this session.
    """
    with open("/home/beka/thesis/resources/collins/collins.pickle", "rb") as f:
        col = collins.CollinsDictionary(pickle.load(f))
    with open(sys.argv[1]) as f:
        data = [line.strip().split('\t') for line in f if line.strip()]
    nouns = {k: float(v) for k, v in data}
    with open(sys.argv[1] + '.new', 'wt') as f:
        skipped = set()
        while True:
            entry = col.random_entry(POSTags.Noun)
            # re-roll until we get a noun not yet scored or skipped
            while entry.key in nouns or entry.key in skipped:
                entry = col.random_entry(POSTags.Noun)
            print("\n\n\n{}#{}".format(entry.key, entry.context))
            print("\n===\n".join(str(s) for s in entry.senses))
            user = input("select 0-5, Q or anything else to skip: ")
            if user == 'Q':
                break
            try:
                score = int(user)
                # explicit raise instead of `assert` (asserts vanish under -O)
                if not 0 <= score <= 5:
                    raise ValueError('score out of range: {}'.format(score))
                nouns[entry.key] = score
            except ValueError:
                # Narrowed from a bare `except:` so Ctrl-C and real bugs
                # are not silently treated as a skip.
                skipped.add(entry.key)
                continue
            f.write("{}\t{}\n".format(entry.key, score / 5))
            f.flush()
def __init__(self, dixon_path, collins_path, wikt_path):
    """Load the Dixon verb XML, the pickled Collins dictionary and the
    raw Wiktionary definitions, and set up an English stemmer."""
    with open(dixon_path) as dixon_file:
        tree = ETree.ElementTree().parse(dixon_file)
    self.dixon = DixonVerbs(tree)
    with open(collins_path, 'rb') as collins_file:
        raw_dict = pickle.load(collins_file)
    self.collins = collins.CollinsDictionary(raw_dict)
    with open(wikt_path) as wikt_file:
        # the file ends with a newline, so the final split item is an
        # empty string — drop it
        definitions = wikt_file.read().split('\n')[:-1]
    self.wikt = wikt.Wiktionary(definitions)
    self.stemmer = nltk.stem.snowball.EnglishStemmer()
def extract_dict_features(targets, collins_path):
    """Binary features: does a marker word appear early in a Collins noun sense?

    For each target (a tuple of tokens) that has exactly one Collins entry,
    collect its noun-sense descriptions.  For each of the markers
    ('activity', 'process', 'act') emit one space-joined string of 0/1
    flags — one flag per target — where 1 means the marker occurs among
    words 2-6 of at least one description.
    """
    phrases = [' '.join(tokens) for tokens in targets]  # tuples to strings
    with open(collins_path, 'rb') as fh:
        coll = collins.CollinsDictionary(pickle.load(fh))
    descriptions = []
    for phrase in phrases:
        matches = coll.by_key(phrase)
        if len(matches) == 1:
            noun_descs = [sense.desc for sense in matches[0].senses
                          if sense.pos == POSTags.Noun]
        else:
            # context-dependent (or missing) entries are not handled
            noun_descs = []
        descriptions.append(noun_descs)
    markers = ('activity', 'process', 'act')
    return [' '.join(str(int(any(marker in desc.split()[1:6]
                                 for desc in descs)))
                     for descs in descriptions)
            for marker in markers]
def extract_hfw_dict_features(targets, collins_path, hfw):
    """Binary features: does a high-frequency word occur in a Collins noun sense?

    For each target (a tuple of tokens) with exactly one Collins entry,
    collect its noun-sense descriptions.  For each word in `hfw` emit one
    space-joined string of 0/1 flags — one flag per target — where 1 means
    the word occurs (delimited by non-word characters) in at least one
    description.
    """
    targets = [' '.join(target) for target in targets]  # tuples to strings
    with open(collins_path, 'rb') as f:
        raw_dict = pickle.load(f)
    coll = collins.CollinsDictionary(raw_dict)
    feats = []
    descriptions = []
    for target in targets:
        entries = coll.by_key(target)
        if len(entries) != 1:  # we don't handle context-dependent entries
            descriptions.append([])
            continue
        descriptions.append(
            [s.desc for s in entries[0].senses if s.pos == POSTags.Noun])
    for word in hfw:
        # raw string: the original '\W' was an invalid escape sequence
        # (DeprecationWarning, an error in future CPython); re.escape so
        # regex metacharacters in `word` cannot corrupt the pattern
        rx = re.compile(r'\W{}\W'.format(re.escape(word)))
        feats.append(' '.join(
            str(int(any(rx.search(d) for d in tdesc)))
            for tdesc in descriptions))
    return feats
def __init__(self, collins_path, wikt_path):
    """Load the pickled Collins dictionary and the raw Wiktionary definitions."""
    with open(collins_path, 'rb') as fh:
        self.collins = collins.CollinsDictionary(pickle.load(fh))
    with open(wikt_path) as fh:
        text = fh.read()
    # the file ends with a newline, so drop the trailing empty split item
    self.wikt = wikt.Wiktionary(text.split('\n')[:-1])