def build_target_dicts(targetdictFP, unifreqFP, dataFP='../scoring/cv.train.sentences.json'):
    """Count gold target lemma sequences and unigram lemma frequencies over the
    training sentences, then write each Counter to a tab-separated file
    (entry, then count). Entries have the form '<lemma>_<coarse POS letter>'."""
    uniFreq = Counter()
    targets = Counter()
    with codecs.open(dataFP, 'r', 'utf-8') as inF:
        for ln in inF:
            sentence = json.loads(ln)
            #print(sentence['tokens'], sentence['pos'], file=sys.stderr)
            if len(sentence['tokens']) != len(sentence['pos']):
                # Sometimes the data is buggy and missing a POS tag :(
                # Stopgap: insert '?' as the tag for any token without one.
                # TODO: fix this in preprocessing and make this an assert.
                posmap = {stuff['start']: stuff for stuff in sentence['pos']}
                sentence['pos'] = [posmap.get(i, {'start': i, 'end': i+1, 'name': '?',
                                                  'text': sentence['tokens'][i]})
                                   for i in range(len(sentence['tokens']))]

            # Tokens of interest: either part of a target or present in the WSL.
            tokenOffsets = set()
            for wslentry in sentence['wsl']:
                tokenOffsets |= {i for i in range(wslentry['start'], wslentry['end'])}

            # Targets: one space-separated lemma entry per (possibly multiword) target.
            for frame in sentence['frames']:
                target_toks = {i for span in frame['target']['spans']
                               for i in range(span['start'], span['end'])}
                tokenOffsets |= target_toks
                lemmas = [get_lemma(entry['text'], entry['name']) + '_' + entry['name'].upper()[0]
                          for i in sorted(target_toks) for entry in [sentence['pos'][i]]]
                targets[' '.join(lemmas)] += 1

            # Unigram counts. Ignore tokens tagged as proper nouns (consequence:
            # if there are proper noun unigram targets, the target/unigram ratio
            # can be greater than 1).
            uniFreq.update(get_lemma(entry['text'], entry['name']) + '_' + entry['name'].upper()[0]
                           for entry in sentence['pos']
                           if entry['start'] in tokenOffsets
                           and not entry['name'].startswith('NP')
                           and not entry['name'].startswith('NNP'))

    with codecs.open(unifreqFP, 'w', 'utf-8') as outF:
        for w, n in uniFreq.items():
            outF.write(w + '\t' + str(n) + '\n')
    with codecs.open(targetdictFP, 'w', 'utf-8') as outF:
        for ww, n in targets.items():
            outF.write(ww + '\t' + str(n) + '\n')
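
# A small companion sketch, not part of the original module: it reloads the
# tab-separated files that build_target_dicts() writes (one '<entry>\t<count>'
# record per line) back into a Counter. The function name and any path passed
# to it are hypothetical; it assumes the module-level codecs and Counter
# imports used above.
def load_count_tsv(path):
    counts = Counter()
    with codecs.open(path, 'r', 'utf-8') as inF:
        for ln in inF:
            entry, n = ln.rstrip('\n').split('\t')
            counts[entry] = int(n)
    return counts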
def extract_features(conll_tokens):
    """Yield one binary feature dict per token: the token's lemma and coarse
    POS plus those of its immediate neighbors, with anchor tokens padding the
    sentence boundaries."""
    for t in conll_tokens:
        t.lemma = get_lemma(t.form, t.postag)
        t.cpostag = get_coarse_pos(t.postag)
    with_walls = [LEFT_ANCHOR] + conll_tokens + [RIGHT_ANCHOR]
    for before, this, after in ngrams(with_walls, 3):
        yield {
            'lemma_%s' % this.lemma: True,
            'pos_%s' % this.cpostag: True,
            'prev_lemma_%s' % before.lemma: True,
            'prev_pos_%s' % before.cpostag: True,
            'next_lemma_%s' % after.lemma: True,
            'next_pos_%s' % after.cpostag: True,
        }
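
# A hedged sketch, not in the original source: the feature dicts yielded above
# are already in the {feature_name: True} shape that scikit-learn's
# DictVectorizer consumes, so vectorizing a corpus for a classifier can look
# like the helper below. The helper name is hypothetical, and scikit-learn is
# an assumed extra dependency.
def _vectorize_features(sentences):
    from sklearn.feature_extraction import DictVectorizer  # assumed installed
    vec = DictVectorizer()
    # One sparse row per token, across all sentences.
    X = vec.fit_transform(feats for sent in sentences
                          for feats in extract_features(sent))
    return vec, X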
def read_conll(lines, lookup_lemmas=False):
    """
    If no lemma is present and lookup_lemmas is True, consults WordNet
    by calling get_lemma().
    """
    result = []
    for line in lines:
        line = line.strip()
        if line == '':
            if result:  # guard so consecutive blank lines don't yield empty sentences
                yield result
                result = []
        else:
            token = ConllToken.from_line(line)
            if lookup_lemmas:
                token.lemma = get_lemma(token.form, token.postag)
            result.append(token)
    if result:
        yield result
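
# An illustrative end-to-end sketch, not in the original source: read sentences
# from a CoNLL-style file (one token per line, blank line between sentences)
# and featurize each one with extract_features() above. The function name and
# the 'dev.conll' path are hypothetical.
def _demo_featurize(conll_path='dev.conll'):
    with codecs.open(conll_path, 'r', 'utf-8') as inF:
        for sent in read_conll(inF, lookup_lemmas=True):
            for feats in extract_features(sent):
                print(feats)  # one {feature_name: True} dict per token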