Beispiel #1
0
def main():
    """Interactively collect 0-5 ratings for words listed in a file.

    Reads one word per line from the file named by ``sys.argv[1]`` (blank
    lines are ignored), skips the first ``int(sys.argv[2])`` words, and for
    every remaining word looks up its dictionary entries, prints them and
    prompts the user for a rating.

    Returns:
        A list of ``(word, label)`` tuples.  ``label`` is ``'NOT FOUND'`` when
        the dictionary has no entry for the word, ``'NOT NOUN'`` when no sense
        is tagged as a noun or when the user typed anything that is not an
        integer in 0..5, and otherwise the raw text the user typed.  Typing
        ``Q`` stops immediately and returns what has been collected so far.
    """
    # NOTE: pickle.load can execute arbitrary code; only load a trusted dump.
    with open("/home/beka/thesis/resources/collins/collins.pickle", "rb") as f:
        col = collins.CollinsDictionary(pickle.load(f))
    with open(sys.argv[1]) as f:
        words = [line.strip() for line in f if line.strip()]
    words = words[int(sys.argv[2]):]
    output = []
    for word in words:
        entries = col.by_form(word)
        if not entries:
            output.append((word, 'NOT FOUND'))
            continue
        if all(s.pos != POSTags.Noun for e in entries for s in e.senses):
            output.append((word, 'NOT NOUN'))
            continue
        for entry in entries:
            print("\n\n\n{}#{}".format(entry.key, entry.context))
            print("\n===\n".join(str(s) for s in entry.senses))
        user = input(
            "{}\nselect 0-5, Q or anything else to skip: ".format(word))
        if user == 'Q':
            return output
        # Validate explicitly: an ``assert`` would vanish under ``python -O``
        # and a bare ``except`` would also hide unrelated errors.
        try:
            score = int(user)
        except ValueError:
            score = None
        if score is not None and 0 <= score <= 5:
            output.append((word, user))
        else:
            # Skipped words are recorded with the same label as non-nouns.
            output.append((word, 'NOT NOUN'))
    return output
Beispiel #2
0
def main():
    """Interactively rate dictionary noun entries and log the ratings.

    Loads already-rated nouns from the tab-separated ``key<TAB>value`` file
    named by ``sys.argv[1]``, then repeatedly fetches an entry with
    ``col.random_entry(POSTags.Noun)`` that is neither rated nor skipped,
    prints it and asks for a 0-5 score.  Each accepted score is written as
    ``key<TAB>score/5`` to ``sys.argv[1] + '.new'`` and flushed immediately.
    Typing ``Q`` ends the session; any other non-0..5 input skips the entry
    for the rest of the session.
    """
    # NOTE: pickle.load can execute arbitrary code; only load a trusted dump.
    with open("/home/beka/thesis/resources/collins/collins.pickle", "rb") as f:
        col = collins.CollinsDictionary(pickle.load(f))
    with open(sys.argv[1]) as f:
        data = [line.strip().split('\t') for line in f if line.strip()]
        nouns = {k: float(v) for k, v in data}
    with open(sys.argv[1] + '.new', 'wt') as f:
        skipped = set()
        while True:
            entry = col.random_entry(POSTags.Noun)
            # Redraw until we hit an entry the user has not seen yet.
            while entry.key in nouns or entry.key in skipped:
                entry = col.random_entry(POSTags.Noun)
            print("\n\n\n{}#{}".format(entry.key, entry.context))
            print("\n===\n".join(str(s) for s in entry.senses))
            user = input("select 0-5, Q or anything else to skip: ")
            if user == 'Q':
                break
            # Validate explicitly: an ``assert`` would vanish under
            # ``python -O`` and a bare ``except`` would hide unrelated errors.
            try:
                score = int(user)
            except ValueError:
                score = None
            if score is None or not 0 <= score <= 5:
                skipped.add(entry.key)
                continue
            nouns[entry.key] = score
            f.write("{}\t{}\n".format(entry.key, score / 5))
            # Flush per rating so an interrupted session loses nothing.
            f.flush()
Beispiel #3
0
 def __init__(self, dixon_path, collins_path, wikt_path):
     """Load the lexical resources and create the stemmer.

     Args:
         dixon_path: Path to the XML file parsed and passed to ``DixonVerbs``.
         collins_path: Path to the pickle passed to ``CollinsDictionary``.
         wikt_path: Path to a text file whose lines are passed to
             ``Wiktionary`` as a list of strings.
     """
     with open(dixon_path) as f:
         self.dixon = DixonVerbs(ETree.ElementTree().parse(f))
     # NOTE: pickle.load can execute arbitrary code; only load a trusted dump.
     with open(collins_path, 'rb') as f:
         self.collins = collins.CollinsDictionary(pickle.load(f))
     with open(wikt_path) as f:
         raw_defs = f.read().split('\n')
     # A trailing newline leaves one empty string at the end; drop only that,
     # so a file without a final newline does not lose its last definition.
     if raw_defs and not raw_defs[-1]:
         raw_defs.pop()
     self.wikt = wikt.Wiktionary(raw_defs)
     self.stemmer = nltk.stem.snowball.EnglishStemmer()
Beispiel #4
0
def extract_dict_features(targets, collins_path):
    """Build marker-word features from dictionary noun descriptions.

    Args:
        targets: Iterable of token sequences; each is joined with spaces to
            form the dictionary key.
        collins_path: Path to the pickle passed to ``CollinsDictionary``.

    Returns:
        A list with one string per marker (``'activity'``, ``'process'``,
        ``'act'``, in that order).  Each string holds one space-separated
        ``0``/``1`` per target: ``1`` when the marker is among the 2nd to 6th
        whitespace tokens of any noun-sense description of that target.
        Targets whose key does not yield exactly one entry always get ``0``.
    """
    keys = [' '.join(parts) for parts in targets]
    with open(collins_path, 'rb') as handle:
        unpickled = pickle.load(handle)
    dictionary = collins.CollinsDictionary(unpickled)

    # One list of noun-sense descriptions per target, in target order.
    noun_descs = []
    for key in keys:
        found = dictionary.by_key(key)
        if len(found) == 1:
            noun_descs.append([sense.desc for sense in found[0].senses
                               if sense.pos == POSTags.Noun])
        else:
            # Keys with zero or several entries contribute no descriptions.
            noun_descs.append([])

    def flag(mark, descs):
        """Return '1' if *mark* is in tokens 1..5 of any description."""
        for desc in descs:
            if mark in desc.split()[1:6]:
                return '1'
        return '0'

    rows = []
    for mark in ('activity', 'process', 'act'):
        rows.append(' '.join(flag(mark, descs) for descs in noun_descs))
    return rows
Beispiel #5
0
def extract_hfw_dict_features(targets, collins_path, hfw):
    """Build word-occurrence features from dictionary noun descriptions.

    Args:
        targets: Iterable of token sequences; each is joined with spaces to
            form the dictionary key.
        collins_path: Path to the pickle passed to ``CollinsDictionary``.
        hfw: Iterable of words to look for, matched literally.

    Returns:
        A list with one string per word in ``hfw``.  Each string holds one
        space-separated ``0``/``1`` per target: ``1`` when the word, with a
        non-word character on both sides, occurs in any noun-sense
        description of that target.  Targets whose key does not yield exactly
        one entry always get ``0``.
    """
    targets = [' '.join(target) for target in targets]  # tuples to strings
    # NOTE: pickle.load can execute arbitrary code; only load a trusted dump.
    with open(collins_path, 'rb') as f:
        raw_dict = pickle.load(f)
    coll = collins.CollinsDictionary(raw_dict)
    feats = []
    descriptions = []
    for target in targets:
        entries = coll.by_key(target)
        if len(entries) != 1:  # we don't handle context-dependent entries
            descriptions.append([])
            continue
        descriptions.append(
            [s.desc for s in entries[0].senses if s.pos == POSTags.Noun])
    for word in hfw:
        # Raw string avoids the invalid "\W" escape warning; re.escape keeps
        # regex metacharacters inside a word from altering the pattern.
        # NOTE(review): \W on both sides means a word at the very start or
        # end of a description is not matched; kept as in the original.
        rx = re.compile(r'\W{}\W'.format(re.escape(word)))
        feats.append(' '.join(
            str(int(any(rx.search(d) for d in tdesc)))
            for tdesc in descriptions))
    return feats
Beispiel #6
0
 def __init__(self, collins_path, wikt_path):
     """Load the Collins dictionary and the Wiktionary definitions.

     Args:
         collins_path: Path to the pickle passed to ``CollinsDictionary``.
         wikt_path: Path to a text file whose lines are passed to
             ``Wiktionary`` as a list of strings.
     """
     # NOTE: pickle.load can execute arbitrary code; only load a trusted dump.
     with open(collins_path, 'rb') as f:
         self.collins = collins.CollinsDictionary(pickle.load(f))
     with open(wikt_path) as f:
         raw_defs = f.read().split('\n')
     # A trailing newline leaves one empty string at the end; drop only that,
     # so a file without a final newline does not lose its last definition.
     if raw_defs and not raw_defs[-1]:
         raw_defs.pop()
     self.wikt = wikt.Wiktionary(raw_defs)