from __future__ import print_function
import sys

# NB: Python 2 code (relies on the `basestring` and `unicode` builtins);
# `morph` (stemmer) and `uintern` (string interning) are project-internal
# helpers assumed to be in scope.

def _read_entry(self, entry):
    '''Normalize a lexicon entry (mapping its POSes to the Penn Treebank
    tagset and filling in missing lemmas), then index it under its
    lowercased lemma signature.'''
    ds = entry["datasource"].lower()
    if ds == 'wikimwe':
        words = entry["lemmas"]  # not actually always lemmatized
        del entry["lemmas"]
        entry["words"] = words
    if ds in self.POS_2_PENN:  # map POSes to Penn Treebank tagset
        for i, p in enumerate(entry["poses"]):
            info = self.POS_2_PENN[ds].get(p, p)
            entry["poses"][i] = info if isinstance(info, basestring) \
                else (info.get(entry["lemmas" if ds == 'baldwin vpc' else "words"][i])
                      or info[None])
    if "lemmas" not in entry:
        if 'lvc' in ds:
            entry["lemmas"] = [entry["verblemma"], morph.stem(entry["noun"], 'NN')]
        else:
            assert "words" in entry, entry
            words = entry["words"]
            poses = [None] * len(words)
            if "poses" in entry and entry["poses"]:
                assert ds in {'said', 'semcor', 'wikimwe'}, entry
                poses = entry["poses"]
            elif ds in {'phrases.net', "oyz's idioms"}:
                pass  # no POS information available for these sources
            elif entry["label"].startswith('NNP') or entry["label"].startswith('NE:'):
                poses = ['NNP'] * len(words)  # named entity: treat all words as proper nouns
            entry["lemmas"] = [morph.stem(w, p) for w, p in zip(words, poses)]
    try:
        # signature: lowercased lemmas, skipping tokens wrapped in underscores
        sig = tuple(l.lower() for l in entry["lemmas"] if not l[0] == l[-1] == '_')
        if not sig or sig[-1] == 'the' or not any(l for l in sig if len(l) > 2):
            return  # probably garbage entry
        if len(sig) > 1:
            self._entries[sig] = entry
            self._bylast[sig[-1]].add(sig)
    except:
        print(entry, file=sys.stderr)
        raise
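# Illustrative walkthrough (hypothetical data, not taken from any real lexicon):
# a wikimwe-style entry as _read_entry might receive it.
#
#   entry = {"datasource": "WikiMwe",
#            "poses": ["VB", "DT", "NN"],
#            "lemmas": ["kick", "the", "bucket"]}   # renamed to "words" above
#
# "lemmas" is deleted for wikimwe and rebuilt by stemming each word with its
# POS, so sig == ('kick', 'the', 'bucket'). len(sig) > 1 and the final lemma
# is not 'the', so the entry lands in self._entries[sig] and is indexed under
# self._bylast['bucket'].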
def _read_nonblank_line(self, ln, sent):
    '''Tab-separated format: word pos tag sentId
    tag and sentId are optional.
    '''
    parts = ln[:-1].split('\t')[:4]
    if len(parts) == 4:
        token, pos, tag, sentId = parts
        sent.sentId = sentId
    elif len(parts) == 3:
        token, pos, tag = parts
        if not tag.strip():
            tag = None
    else:
        token, pos = parts
        tag = None
    if tag is not None:
        if self._labels is None:
            pass
        elif tag == '0' and self._legacy0:
            assert 'O' in self._labels, self._labels
            tag = 'O'
        elif tag not in self._labels:
            tag = 'O'
        tag = uintern(unicode(tag))
    pos = uintern(unicode(pos))
    stemS = uintern(unicode(morph.stem(token, pos)))
    sent.addToken(token=token, stem=stemS, pos=pos, goldTag=tag)
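# Example inputs (hypothetical) for the reader above; one token per nonblank
# line, with the gold tag defaulting to None when only two fields are present
# (the tag value assumes 'O' is in the label inventory):
#
#   ln = 'world\tNN\tO\tsent0001\n'  ->  token='world', pos='NN', tag='O',
#                                        and sent.sentId is set to 'sent0001'
#   ln = 'Hello\tUH\n'               ->  token='Hello', pos='UH', tag=None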
def _read_nonblank_line(self, ln, sent):
    '''Tab-separated format: offset word lemma POS tag parent strength label sentId
    lemma will (for now) be ignored in favor of the automatic stemmer.
    label may be the empty string; sentId is optional.
    '''
    parts = ln[:-1].split('\t')
    if len(parts) == 9:
        offset, token, _, pos, tag, parent, strength, label, sentId = parts
        sent.sentId = sentId
    else:
        offset, token, _, pos, tag, parent, strength, label = parts
    offset = int(offset)
    parent = int(parent)
    assert len(sent) + 1 == offset  # offsets are 1-based and contiguous
    assert parent < offset  # a parent token must precede its dependent
    if tag is not None:
        if self._labels is None:
            pass
        elif tag == '0' and self._legacy0:
            assert 'O' in self._labels, self._labels
            tag = 'O'  # legacy files use '0' (zero) for the 'O' tag
        elif tag not in self._labels:
            tag = 'O'
        tag = uintern(unicode(tag))
    pos = uintern(unicode(pos))
    stemS = uintern(unicode(morph.stem(token, pos)))
    sent.addToken(token=token, stem=stemS, pos=pos, goldTag=tag,
                  goldparent=parent,
                  goldstrength=uintern(unicode(strength)),
                  goldlabel=uintern(unicode(label)))
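# Example input (hypothetical) for the nine-column reader above: a token at
# 1-based offset 3 attached to the token at offset 2. The tag 'I' and
# strength '_' are invented placeholders ('I' assumed to be in the label
# inventory); the empty eighth field is the label:
#
#   ln = '3\tup\tup\tRP\tI\t2\t_\t\tsent42\n'
#   ->  offset=3, token='up', pos='RP', tag='I', parent=2, strength='_',
#       label='', sent.sentId='sent42'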