def labeled_json_to_labeled_instances(self, json_dict: JsonDict) -> Dict[int, Instance]:
    """
    Turn a labelled JSON mapping into indexed ``Instance`` objects.

    The keys of ``json_dict`` are integers written as strings; entries are visited in
    ascending numeric order. Entries are consumed in consecutive runs: the first entry of a
    run fixes the run length (the number of items in its ``'words'`` list) and supplies the
    token field shared by the whole run. Every entry yields a duplicate of the run's base
    instance with a ``'tags'`` field attached, indexed against a vocabulary that is created
    fresh on each call.

    Returns a dict mapping each integer key to the instance built for it.
    """
    vocab = Vocabulary()
    labeled = {}
    run_start = 0
    run_length = -1  # -1 means "the next entry opens a new run"

    ordered_keys = sorted((int(key), key) for key in json_dict.keys())
    for index, raw_key in ordered_keys:
        entry = json_dict[raw_key]

        if run_length == -1:
            # First entry of a run: build the token field and base instance once.
            words = entry['words']
            run_length = len(words)
            text_field = TextField([Token(word['text']) for word in words], {})
            instance = Instance({'tokens': text_field})

        categories = [json_to_cat(tag) for tag in entry['tags']]
        # The last argument is this entry's position relative to the start of its run.
        tags_field = ConstructiveSupertagField(categories, text_field, [index - run_start])
        vocab.add_tokens_to_namespace(tags_field.labels, 'labels')

        labeled_instance = instance.duplicate()
        labeled_instance.add_field('tags', tags_field)
        labeled_instance.index_fields(vocab)
        labeled[index] = labeled_instance

        # Once the run is complete, advance the offset and wait for the next run.
        if index + 1 - run_start == run_length:
            run_start += run_length
            run_length = -1

    return labeled
def dump_line(self, outputs: JsonDict) -> str:  # pylint: disable=no-self-use
    """
    Render one prediction as plain text instead of the default JSON-lines format.

    The value under ``'predicted_sql_query'`` is written on its own line. When ``outputs``
    also has a ``'beam_sql_query'`` entry, that value follows on a second line.
    """
    has_beam = 'beam_sql_query' in outputs
    rendered = outputs['predicted_sql_query'] + "\n"
    if has_beam:
        rendered = rendered + outputs['beam_sql_query'] + "\n"
    return rendered
def _sentence_to_srl_instances(self, json_dict: JsonDict) -> List[Instance]:
    """
    Build instances for the sentence stored under ``json_dict["sentence"]``.

    Without a ``"verbs"`` entry the sentence is tokenized by ``self._tokenizer``. With one,
    the sentence is split on whitespace and each word becomes a ``Token`` tagged ``"VERB"``
    when its position equals ``json_dict["verbs"]`` and ``"NOUN"`` otherwise. Either way the
    tokens are handed to ``self.tokens_to_instances`` and its result is returned.
    """
    sentence = json_dict["sentence"]

    if "verbs" not in json_dict:
        return self.tokens_to_instances(self._tokenizer.tokenize(sentence))

    words = sentence.split()
    # Compared with ``==`` against a word position, so presumably a single integer
    # index rather than a list -- TODO confirm against callers.
    verb_position = json_dict["verbs"]

    tokens = []
    for position, word in enumerate(words):
        tag = "VERB" if position == verb_position else "NOUN"
        # NOTE(review): the third positional argument is the word position plus the
        # number of words in the sentence; confirm this is the intended value.
        tokens.append(Token(word, position, position + len(words), pos_=tag))
    return self.tokens_to_instances(tokens)
def align_entities(extracted: List[str],
                   literals: JsonDict,
                   stemmer: NltkPorterStemmer) -> List[str]:
    """
    Align each extracted world mention with one of the given world literals via stemming.

    ``get_stem_overlaps`` scores every mention against the literal values. A mention is
    assigned the first literal's key when its first score is larger, the second literal's
    key when its second score is larger, and ``None`` when the two scores are equal.

    Returns one entry per element of ``extracted``, in the same order.
    """
    world_names = list(literals.keys())
    world_texts = list(literals.values())

    # Score every mention before resolving any of them.
    scores = [get_stem_overlaps(mention, world_texts, stemmer) for mention in extracted]

    return [
        world_names[0] if hits[0] > hits[1]
        else world_names[1] if hits[0] < hits[1]
        else None
        for hits in scores
    ]