Example #1
from collections import Counter
import codecs
import json


def build_target_dicts(targetdictFP, unifreqFP,
                       dataFP='../scoring/cv.train.sentences.json'):
    uniFreq = Counter()
    targets = Counter()
    with codecs.open(dataFP, 'r', 'utf-8') as inF:
        for ln in inF:
            sentence = json.loads(ln)
            
            # unigrams
            #print(sentence['tokens'],sentence['pos'], file=sys.stderr)
            if len(sentence['tokens']) != len(sentence['pos']):
                # sometimes the data is buggy and missing a POS tag :( 
                # stopgap: insert ? as the tag
                # TODO: fix this in preprocessing, make an assert here
                posmap = {stuff['start']: stuff for stuff in sentence['pos']}
                sentence['pos'] = [posmap.get(i, {'start': i, 'end': i + 1, 'name': '?',
                                                  'text': sentence['tokens'][i]})
                                   for i in range(len(sentence['tokens']))]
            
            tokenOffsets = set()    # tokens of interest: either part of a target or present in the WSL
            
            for wslentry in sentence['wsl']:
                tokenOffsets |= set(range(wslentry['start'], wslentry['end']))
            
            # targets
            for frame in sentence['frames']:
                target_toks = {i for span in frame['target']['spans']
                               for i in range(span['start'], span['end'])}
                tokenOffsets |= target_toks
                # entry is bound to sentence['pos'][i] via the one-element-list trick
                lemmas = [get_lemma(entry['text'], entry['name']) + '_' + entry['name'].upper()[0]
                          for i in sorted(target_toks)
                          for entry in [sentence['pos'][i]]]
                targets[' '.join(lemmas)] += 1
            
            
            # unigram counts
            # ignore tokens tagged as proper nouns (consequence: if there are proper noun unigram targets, 
            # the target/unigram ratio can be greater than 1)
            uniFreq.update(get_lemma(entry['text'], entry['name']) + '_' + entry['name'].upper()[0]
                           for entry in sentence['pos']
                           if entry['start'] in tokenOffsets
                           and not entry['name'].startswith('NP')
                           and not entry['name'].startswith('NNP'))
            
            
    
    with codecs.open(unifreqFP, 'w', 'utf-8') as outF:
        for w,n in uniFreq.items():
            outF.write(w+'\t'+str(n)+'\n')
    with codecs.open(targetdictFP, 'w', 'utf-8') as outF:
        for ww,n in targets.items():
            outF.write(ww+'\t'+str(n)+'\n')
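A minimal usage sketch for the example above; the output file names are placeholders (not from the original project), and the default dataFP is assumed to exist on disk:

if __name__ == '__main__':
    # hypothetical output paths
    build_target_dicts('targets.tsv', 'unigrams.tsv')
    # targets.tsv:  space-joined "<lemma>_<POS initial>" target strings, tab, count
    # unigrams.tsv: single "<lemma>_<POS initial>" entries, tab, count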
Example #2
def extract_features(conll_tokens):
    """Yield one feature dict per token: its own lemma and coarse POS plus
    the lemma and coarse POS of the previous and next tokens (sentence
    boundaries are padded with anchor tokens)."""
    for t in conll_tokens:
        t.lemma = get_lemma(t.form, t.postag)
        t.cpostag = get_coarse_pos(t.postag)
    with_walls = [LEFT_ANCHOR] + conll_tokens + [RIGHT_ANCHOR]
    trigrams = ngrams(with_walls, 3)
    for trigram in trigrams:
        before, this, after = trigram
        yield {
            'lemma_%s' % this.lemma: True,
            'pos_%s' % this.cpostag: True,
            'prev_lemma_%s' % before.lemma: True,
            'prev_pos_%s' % before.cpostag: True,
            'next_lemma_%s' % after.lemma: True,
            'next_pos_%s' % after.cpostag: True,
        }
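One way these per-token feature dicts could be consumed is with a vectorizer; the sketch below uses scikit-learn's DictVectorizer, which is an assumption and not part of the original example:

from sklearn.feature_extraction import DictVectorizer

def featurize(sentences):
    # sentences: an iterable of token lists, e.g. as produced by read_conll below
    dicts = [feats for sent in sentences for feats in extract_features(sent)]
    return DictVectorizer(sparse=True).fit_transform(dicts)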
Example #3
def read_conll(lines, lookup_lemmas=False):
    """
    If no lemma is present and lookup_lemmas is True, consults WordNet by
    calling get_lemma().
    """
    result = []
    for line in lines:
        line = line.strip()
        if line == '':
            yield result
            result = []
        else:
            token = ConllToken.from_line(line)
            if lookup_lemmas and token.lemma in (None, '', '_'):
                # only fill in a lemma when the CoNLL field is empty
                token.lemma = get_lemma(token.form, token.postag)
            result.append(token)
    if result:
        yield result
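A usage sketch for read_conll, assuming a standard CoNLL file with one token per line and blank lines between sentences; the path is hypothetical:

with open('dev.conll', encoding='utf-8') as f:  # hypothetical path
    for sent in read_conll(f, lookup_lemmas=True):
        print(' '.join(tok.lemma for tok in sent))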