def read_conlldoc(self, inputdoc):
    """Convert a CoNLL-formatted string into a spaCy Doc.

    The gold XPOS tags are registered in the vocab's string store and
    attached to the tokens, and the gold sentence boundaries are
    transferred onto the resulting Doc.
    """
    token_texts = []
    is_first_in_sent = []
    tag_ids = []
    for sentence in string2doc(inputdoc, hide_fields=HIDDEN_FIELDS):
        for idx, token in enumerate(sentence):
            is_first_in_sent.append(idx == 0)
            token_texts.append(token.word)
            tag_ids.append(self.nlp.vocab.strings.add(token.xpos))
    tag_array = np.array(tag_ids, dtype="uint64")
    sdoc = Doc(self.nlp.vocab, words=token_texts).from_array([TAG], tag_array)
    for idx, starts_sentence in enumerate(is_first_in_sent):
        # Non-initial tokens must be set to False explicitly: if left
        # as None, spaCy will add further sentence boundaries itself.
        sdoc[idx].is_sent_start = starts_sentence
    if any(tag_ids):
        sdoc.is_tagged = True
    return sdoc
def myprocessor(myinput):
    """Run the (Java) parser on each pre-tagged sentence of *myinput*
    and return the list of per-sentence predictions."""
    predictions = []
    for sentence in string2doc(myinput, hide_fields=HIDDEN_FIELDS):
        tagged_sent = jpype.java.util.ArrayList()
        for token in sentence:
            tagged_sent.add(ling.TaggedWord(token.word, token.xpos))
        predictions.append(self.parser.predict(tagged_sent))
    return predictions
def myprocessor(myinput):
    """Lemmatize every token of *myinput* sentence by sentence, hide
    the gold fields, and return the updated document."""
    parsed = string2doc(myinput)
    for sentence in parsed:
        word_forms = [token.word for token in sentence]
        pos_tags = [token.xpos for token in sentence]
        predicted_lemmas = lemmatize_sentence(word_forms, pos_tags)
        for token, lemma in zip(sentence, predicted_lemmas):
            token.hide_fields(HIDDEN_FIELDS)
            token.lemma = lemma
    return parsed
def myprocessor(myinput):
    """Re-serialize *myinput* as tab-separated ``word<TAB>tag`` lines
    (sentences separated by blank lines) and run the parser on it in
    'tagged' input / CoNLL output mode."""
    sentence_blocks = []
    for sentence in string2doc(myinput, hide_fields=HIDDEN_FIELDS):
        lines = [token.word + "\t" + token.xpos for token in sentence]
        sentence_blocks.append("\n".join(lines))
    reformatted_input = "\n\n".join(sentence_blocks)
    return self.parser.main(
        reformatted_input, inputformat="tagged", outputformat="conll")
def myprocessor(myinput):
    """Look up a lemma for every token of *myinput*; tokens whose POS
    the lemmatizer rejects get the placeholder lemma '_'."""
    parsed = string2doc(myinput)
    for sentence in parsed:
        for token in sentence:
            try:
                token.lemma = self.lemmatizer.find_lemma(token.word, token.xpos)
            except ValueError:
                # unsupported POS -> fall back to the empty lemma
                token.lemma = "_"
            # don't repeat gold pos in output
            token.hide_fields(HIDDEN_FIELDS)
    return parsed
def myprocessor(myinput):
    """Lemmatize each token, mapping its gold XPOS through conv_table.

    When the lemmatizer yields no candidates (or raises ValueError),
    the placeholder lemma '_' is used instead.
    """
    parsed = string2doc(myinput)
    for sentence in parsed:
        for token in sentence:
            try:
                candidates = self.lemmatizer.lemmatize(
                    token.word, conv_table.get(token.xpos))
                if candidates is None:
                    token.lemma = "_"
                else:
                    # It is unclear how to pick the best alternative,
                    # so simply take the first candidate in the list.
                    token.lemma = candidates[0]
            except ValueError:
                token.lemma = "_"
            # don't repeat gold pos in output
            token.hide_fields(HIDDEN_FIELDS)
    return parsed
def postprocess(self):
    """Parse the tool's CoNLL-formatted output string back into a
    document and store it on the instance."""
    conll_string = self.output_data.conll_file.conll_as_string()
    self.data = string2doc(conll_string)