Beispiel #1
0
    def labeled_json_to_labeled_instances(
            self, json_dict: JsonDict) -> Dict[int, Instance]:
        """
        Build one tagged, indexed ``Instance`` per entry of ``json_dict``.

        The keys of ``json_dict`` are converted with ``int`` and visited in
        ascending order. Entries are consumed in consecutive runs: the first
        entry of a run supplies the ``'words'`` list, which yields both the
        shared token field and the run length; the run is finished once as many
        entries as there are words have been visited. Every entry contributes
        its own ``'tags'`` list, converted element-wise with ``json_to_cat``.

        Returns a dict mapping each integer key to its instance, indexed
        against a vocabulary that is created fresh for this call.
        """
        vocab = Vocabulary()
        labeled = {}
        run_start = 0
        run_length = -1  # -1 means "the next entry opens a new run"

        ordered_keys = sorted((int(key), key) for key in json_dict.keys())
        for index, raw_key in ordered_keys:
            entry = json_dict[raw_key]

            if run_length == -1:
                # First entry of a run: build the token field shared by the run.
                words = entry['words']
                run_length = len(words)
                text_field = TextField(
                    [Token(word['text']) for word in words], {})
                instance = Instance({'tokens': text_field})

            tagged = instance.duplicate()

            # Position of this entry inside the current run.
            position = index - run_start
            categories = [json_to_cat(tag) for tag in entry['tags']]
            tags_field = ConstructiveSupertagField(
                categories, text_field, [position])
            vocab.add_tokens_to_namespace(tags_field.labels, 'labels')
            tagged.add_field('tags', tags_field)
            tagged.index_fields(vocab)

            labeled[index] = tagged

            if position + 1 == run_length:
                # Run complete: the following entry starts the next one.
                run_start += run_length
                run_length = -1

        return labeled
Beispiel #2
0
 def dump_line(self, outputs: JsonDict) -> str:  # pylint: disable=no-self-use
     """
     Render ``outputs`` as plain text instead of a JSON line.

     The result is the ``'predicted_sql_query'`` value followed by a newline.
     When ``outputs`` also has a ``'beam_sql_query'`` entry, that value and
     another newline are appended.
     """
     predicted_line = outputs['predicted_sql_query'] + "\n"
     if 'beam_sql_query' not in outputs:
         return predicted_line
     return predicted_line + outputs['beam_sql_query'] + "\n"
Beispiel #3
0
 def _sentence_to_srl_instances(self, json_dict: JsonDict) -> List[Instance]:
     """
     Convert the ``"sentence"`` entry of ``json_dict`` into instances.

     Without a ``"verbs"`` entry the sentence is tokenized by
     ``self._tokenizer``. With one, the sentence is split on whitespace and
     tokens are built by hand: the token whose position equals
     ``json_dict["verbs"]`` is tagged ``"VERB"``, every other token ``"NOUN"``.
     Either way the tokens are handed to ``self.tokens_to_instances``.
     """
     sentence = json_dict["sentence"]
     if "verbs" not in json_dict:
         return self.tokens_to_instances(self._tokenizer.tokenize(sentence))

     words = sentence.split()
     word_count = len(words)
     verb_marker = json_dict["verbs"]

     # NOTE(review): "verbs" is compared for equality with a single token
     # position, and the third positional Token argument is the position plus
     # the number of words in the sentence -- confirm both are intended.
     tokens = []
     for position, word in enumerate(words):
         pos_tag = "VERB" if position == verb_marker else "NOUN"
         tokens.append(Token(word, position, position + word_count, pos_=pos_tag))
     return self.tokens_to_instances(tokens)
Beispiel #4
0
def align_entities(extracted: List[str],
                   literals: JsonDict,
                   stemmer: NltkPorterStemmer) -> List[str]:
    """
    Assign each extracted string to one of the worlds in ``literals`` by stem overlap.

    ``get_stem_overlaps`` is called once per extracted string with the world
    literal values and the stemmer. The first two entries of its result are
    compared: the world with the strictly larger overlap wins and its key is
    recorded; on a tie ``None`` is recorded instead (so the returned list may
    contain ``None`` entries).

    Returns one entry per element of ``extracted``, in the same order.
    """
    world_names = list(literals.keys())
    world_texts = list(literals.values())

    def pick_world(scores):
        # Only positions 0 and 1 are consulted -- presumably ``literals``
        # holds exactly two worlds; confirm against callers.
        if scores[0] > scores[1]:
            return world_names[0]
        if scores[0] < scores[1]:
            return world_names[1]
        return None

    # Compute every overlap first, then resolve the winners.
    all_scores = [get_stem_overlaps(entity, world_texts, stemmer) for entity in extracted]
    return [pick_world(scores) for scores in all_scores]