def _process(self, input_pack: MultiPack):
    query = input_pack.get_pack(self.in_pack_name).text

    # Build the Translator v3.0 request URL with source and target
    # language codes.
    params = '?' + urlencode(
        {'api-version': '3.0',
         'from': self.src_language,
         'to': [self.target_language]},
        doseq=True)
    microsoft_constructed_url = self.microsoft_translate_url + params

    response = requests.post(
        microsoft_constructed_url,
        headers=self.microsoft_headers,
        json=[{"text": query}])
    if response.status_code != 200:
        raise RuntimeError(response.json()['error']['message'])

    # The response is a list with one entry per input text; each entry
    # carries one translation per requested target language.
    text = response.json()[0]["translations"][0]["text"]
    pack = DataPack()

    # Cover the full translated text with a Document and an Utterance.
    document = Document(pack, 0, len(text))
    utterance = Utterance(pack, 0, len(text))
    pack.add_or_get_entry(document)
    pack.add_or_get_entry(utterance)

    pack.set_text(text=text)
    input_pack.update_pack({self.out_pack_name: pack})
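
# A minimal standalone sketch of the Translator call the processor above
# makes, useful for checking credentials and the response shape outside
# the pipeline. The endpoint, headers, and JSON layout follow the public
# Microsoft Translator v3.0 REST API; the subscription key and language
# codes below are placeholders, not values from this repository (some
# resource types also require an Ocp-Apim-Subscription-Region header).
from urllib.parse import urlencode

import requests

endpoint = "https://api.cognitive.microsofttranslator.com/translate"
params = '?' + urlencode(
    {'api-version': '3.0', 'from': 'en', 'to': ['de']}, doseq=True)
headers = {
    "Ocp-Apim-Subscription-Key": "<your-subscription-key>",  # placeholder
    "Content-Type": "application/json",
}
response = requests.post(
    endpoint + params, headers=headers, json=[{"text": "Hello, world."}])
response.raise_for_status()
# One list entry per input text; each holds one translation per target.
print(response.json()[0]["translations"][0]["text"])
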
def _process(self, input_pack: DataPack):
    doc = input_pack.text
    end_pos = 0

    # Sentence parsing.
    sentences = self.nlp(doc).sentences  # type: ignore

    # Iterate through stanfordnlp sentence objects, locating each
    # sentence span in the original document text.
    for sentence in sentences:
        begin_pos = doc.find(sentence.words[0].text, end_pos)
        end_pos = doc.find(sentence.words[-1].text, begin_pos) + len(
            sentence.words[-1].text)
        sentence_entry = Sentence(input_pack, begin_pos, end_pos)
        input_pack.add_or_get_entry(sentence_entry)

        tokens: List[Token] = []
        if "tokenize" in self.processors:
            offset = sentence_entry.span.begin
            end_pos_word = 0

            # Iterate through stanfordnlp word objects, locating each
            # word within the sentence text.
            for word in sentence.words:
                begin_pos_word = sentence_entry.text.find(
                    word.text, end_pos_word)
                end_pos_word = begin_pos_word + len(word.text)
                token = Token(input_pack,
                              begin_pos_word + offset,
                              end_pos_word + offset)

                if "pos" in self.processors:
                    token.set_fields(pos=word.pos)
                    token.set_fields(upos=word.upos)
                    token.set_fields(xpos=word.xpos)

                if "lemma" in self.processors:
                    token.set_fields(lemma=word.lemma)

                token = input_pack.add_or_get_entry(token)
                tokens.append(token)

        # For each sentence, get the dependency relations among tokens.
        if "depparse" in self.processors:
            # Iterate through token entries in the current sentence.
            for token, word in zip(tokens, sentence.words):
                child = token  # current token
                # `governor` is the 1-based index of the head word
                # within the sentence (0 denotes the root).
                parent = tokens[word.governor - 1]
                relation_entry = Dependency(input_pack, parent, child)
                relation_entry.set_fields(
                    rel_type=word.dependency_relation)
                input_pack.add_or_get_entry(relation_entry)
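
# A small sketch of the stanfordnlp objects the processor above consumes,
# assuming the English models have been downloaded (e.g. via
# stanfordnlp.download('en')). It prints the same word attributes the
# processor reads: text, upos/xpos, lemma, governor, and
# dependency_relation.
import stanfordnlp

nlp = stanfordnlp.Pipeline(
    processors='tokenize,pos,lemma,depparse', lang='en')
doc = nlp("The quick brown fox jumps over the lazy dog.")
for sentence in doc.sentences:
    for word in sentence.words:
        # `governor` is 1-based; 0 marks the sentence root.
        print(word.text, word.upos, word.xpos, word.lemma,
              word.governor, word.dependency_relation)
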
def _process(self, input_pack: DataPack):
    # pylint: disable=no-self-use
    text = input_pack.text

    begin_pos = 0
    while begin_pos < len(text):
        # Find the next period; treat the end of the text as a
        # sentence boundary when no period remains.
        end_pos = text.find('.', begin_pos)
        if end_pos == -1:
            end_pos = len(text) - 1
        sentence_entry = Sentence(input_pack, begin_pos, end_pos + 1)
        input_pack.add_or_get_entry(sentence_entry)

        # Skip whitespace between sentences.
        begin_pos = end_pos + 1
        while begin_pos < len(text) and text[begin_pos] == " ":
            begin_pos += 1
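
# The same naive period-splitting loop as a standalone function, handy
# for inspecting the spans the processor above would produce.
# `split_sentences` is an illustrative helper, not part of the
# processor's API.
from typing import List, Tuple

def split_sentences(text: str) -> List[Tuple[int, int]]:
    spans = []
    begin_pos = 0
    while begin_pos < len(text):
        end_pos = text.find('.', begin_pos)
        if end_pos == -1:
            end_pos = len(text) - 1
        spans.append((begin_pos, end_pos + 1))
        begin_pos = end_pos + 1
        while begin_pos < len(text) and text[begin_pos] == " ":
            begin_pos += 1
    return spans

sample = "First sentence. Second one. Third"
for begin, end in split_sentences(sample):
    print(repr(sample[begin:end]))
# Prints 'First sentence.', 'Second one.', 'Third'
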