class SentenceTaggerPredictor(Predictor):
    """
    Wrapper for any model that takes in a sentence and returns a single set
    of tags for it.  In particular, it can be used with the
    :class:`~allennlp.models.crf_tagger.CrfTagger` model and also the
    :class:`~allennlp.models.simple_tagger.SimpleTagger` model.
    """

    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = JustSpacesWordSplitter()

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Tuple[Instance, JsonDict]:
        """
        Expects JSON that looks like ``{"sentence": "..."}``.
        Runs the underlying model, and adds the ``"words"`` to the output.
        """
        sentence = json_dict["sentence"]
        tokens = self._tokenizer.split_words(sentence)
        instance = self._dataset_reader.text_to_instance(tokens)
        return_dict: JsonDict = {"words": [token.text for token in tokens]}
        return instance, return_dict
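# Usage sketch (an addition, not part of the original snippet): loading a
# trained tagger from an archive and running this predictor on one sentence.
# The archive path and the registered predictor name "sentence-tagger" are
# assumptions for illustration.
from allennlp.models.archival import load_archive
from allennlp.predictors.predictor import Predictor

archive = load_archive("/path/to/crf_tagger.tar.gz")  # hypothetical path
predictor = Predictor.from_archive(archive, "sentence-tagger")  # hypothetical name
print(predictor.predict_json({"sentence": "Time flies like an arrow ."})["tags"])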
class PairsDatasetReader(DatasetReader):
    def __init__(self, token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__()
        self._tokenizer = JustSpacesWordSplitter()
        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}

    @overrides
    def _read(self, file_path):
        with open(cached_path(file_path), "r") as data_file:
            for line in data_file.readlines():
                line_json = json.loads(line)
                if not line_json:
                    continue
                query_paper = line_json["query_paper"]
                candidate_paper = line_json["candidate_paper"]
                relevance = line_json["relevance"]
                instance = self.text_to_instance(query_paper=query_paper,
                                                 candidate_paper=candidate_paper,
                                                 relevance=relevance)
                if instance is not None:
                    yield instance

    @overrides
    def text_to_instance(self,
                         query_paper: str,
                         candidate_paper: str,
                         relevance: str = None) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        query_tokens = self._tokenizer.split_words(query_paper)
        fields['query_paper'] = TextField(query_tokens, self._token_indexers)
        candidate_tokens = self._tokenizer.split_words(candidate_paper)
        fields['candidate_paper'] = TextField(candidate_tokens, self._token_indexers)
        if relevance is not None:
            fields['label'] = LabelField(relevance)
        return Instance(fields)
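# Input sketch (an illustrative addition): ``_read`` expects one JSON object
# per line with "query_paper", "candidate_paper", and "relevance" keys. The
# values below are invented; only the key names come from the reader:
#
#   {"query_paper": "scaling laws for citation ranking",
#    "candidate_paper": "learning to rank scholarly documents",
#    "relevance": "1"}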
class LegalDatasetReader(DatasetReader):
    def __init__(self,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 lazy: bool = False) -> None:
        super().__init__(lazy=lazy)
        self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        self._word_splitter = JustSpacesWordSplitter()

    @overrides
    def text_to_instance(self,
                         graf_tokens: List[Token],
                         labels: List[str] = None) -> Instance:
        graf_field = TextField(graf_tokens, self.token_indexers)
        metadata = MetadataField({"graf_words": graf_tokens})
        fields = {"graf": graf_field, "metadata": metadata}
        if labels is not None:
            fields["label"] = MultiLabelField(labels)
        return Instance(fields)

    def _read(self, file_path: str) -> Iterator[Instance]:
        """
        Reads a tab-separated file that has been created by json2lines:
        each line holds a paragraph and a comma-separated label string.
        """
        counts = {"pos": 0, "neg": 0}
        with open(file_path) as f:
            for line in f:
                graf_str, label_str = line.strip().split("\t")
                if label_str == "unmatched":
                    counts["neg"] += 1
                else:
                    counts["pos"] += 1
                yield self.text_to_instance(
                    self._word_splitter.split_words(graf_str),
                    label_str.split(","))
        print(counts)
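# Input sketch (an illustrative addition): ``_read`` expects one tab-separated
# record per line -- the paragraph text, then a comma-separated label string,
# where the label "unmatched" marks a negative example. The sample line below
# is invented:
#
#   The court granted the motion to dismiss .\tholding,procedure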
def _read(self, file_path: str):
    file_path = cached_path(file_path)
    tokenizer = JustSpacesWordSplitter()
    with open(file_path, 'r') as f:
        while True:
            headline = f.readline()
            if not headline:
                break
            parts = headline.strip().split('\t')
            step_count = int(parts[2])
            sent_list = []                    # sentences of each paragraph
            sent_anno_list = []               # list of sentence annotations (P C F)
            word_pos_list = []                # word positions, -2 -1 0 1 2, marking distance to the participant
            part_mask_list = []               # participant masks, 0 0 1 1 0 0, used to obtain the participant embeddings
            before_category_status_list = []  # "before" category annotations: 0-known, 1-unknown, 2-null
            before_category_mask_list = []    # "before" category masks: 0-unknown, 1-known
            before_loc_start_list = []        # start positions of "before" locations
            before_loc_end_list = []          # end positions of "before" locations
            after_category_status_list = []   # "after" category annotations: 0-known, 1-unknown, 2-null
            after_category_mask_list = []     # "after" category masks: 0-unknown, 1-known
            after_loc_start_list = []         # start positions of "after" locations
            after_loc_end_list = []           # end positions of "after" locations
            for _ in range(step_count):
                paraline = f.readline().lower()
                words = tokenizer.split_words(paraline.strip())
                sent_anno_line = f.readline()
                sent_annos = tokenizer.split_words(sent_anno_line.strip())
                anno_line = f.readline()
                anno_parts = anno_line.split('\t')
                part_start = int(anno_parts[1])
                part_end = int(anno_parts[2])
                participant_mask = []
                before_loc_start = int(anno_parts[4])
                before_loc_end = int(anno_parts[5])
                after_loc_start = int(anno_parts[8])
                after_loc_end = int(anno_parts[9])
                word_pos_line = ""
                for m in range(0, part_start):
                    pos = m - part_start
                    word_pos_line += " " + str(pos)
                    participant_mask.append(0)
                for m in range(part_start, part_end):
                    pos = 0
                    word_pos_line += " " + str(pos)
                    participant_mask.append(1)
                for m in range(part_end, len(words)):
                    pos = m - part_end
                    word_pos_line += " " + str(pos)
                    participant_mask.append(0)
                input_length = len(words)
                word_pos = tokenizer.split_words(word_pos_line.strip())
                assert input_length == len(participant_mask)
                # category status -- 0: known, 1: unknown, 2: null
                before_category_status = 0
                after_category_status = 0
                # category mask -- 1: known, 0: category
                before_category_mask = 1
                after_category_mask = 1
                category_index = 0
                # sentinel spans: -2 -- null, -1 -- unknown
                if before_loc_start == -2 and before_loc_end == -2:
                    before_category_status = 2
                    before_category_mask = 0
                    before_loc_start = category_index
                    before_loc_end = category_index
                elif before_loc_start == -1 and before_loc_end == -1:
                    before_category_status = 1
                    before_category_mask = 0
                    before_loc_start = category_index
                    before_loc_end = category_index
                if after_loc_start == -2 and after_loc_end == -2:
                    after_category_status = 2
                    after_category_mask = 0
                    after_loc_start = category_index
                    after_loc_end = category_index
                elif after_loc_start == -1 and after_loc_end == -1:
                    after_category_status = 1
                    after_category_mask = 0
                    after_loc_start = category_index
                    after_loc_end = category_index
                sent_list.append(words)
                sent_anno_list.append(sent_annos)
                word_pos_list.append(word_pos)
                part_mask_list.append(participant_mask)
                before_category_status_list.append(before_category_status)
                before_category_mask_list.append(before_category_mask)
                before_loc_start_list.append(before_loc_start)
                before_loc_end_list.append(before_loc_end)
                after_category_status_list.append(after_category_status)
                after_category_mask_list.append(after_category_mask)
                after_loc_start_list.append(after_loc_start)
                after_loc_end_list.append(after_loc_end)
            yield self.text_to_instance([sent_list, sent_anno_list, word_pos_list,
                                         part_mask_list,
                                         before_category_status_list,
                                         before_category_mask_list,
                                         before_loc_start_list, before_loc_end_list,
                                         after_category_status_list,
                                         after_category_mask_list,
                                         after_loc_start_list, after_loc_end_list])
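# File-layout sketch (inferred from the parsing code above; no concrete data
# is shown because none appears in the source). Each paragraph block is:
#   a headline:  <para_id>\t<participant>\t<step_count>
#   then, per step, three lines:
#     the paragraph text (lowercased on read),
#     one P/C/F annotation per sentence, and
#     a tab-separated annotation line whose fields [1], [2] give the
#     participant span and whose fields [4], [5] / [8], [9] give the
#     before/after location spans (-1 marks "unknown", -2 marks "null").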
class ChatDatasetReader(DatasetReader):
    """
    How to design a personal dataset reader.
    ---------- ---------- ---------- ----------
    Functions to override:
        ( __init__: set up the tokenizer and token indexers )
        _read: read the dataset file and yield data instances
        text_to_instance: turn raw text fields into an Instance
    ---------- ---------- ---------- ----------
    Usage:
        1. Get a reader instance:   reader = ChatDatasetReader()
        2. Get the data instances:  ensure_list(reader.read('dataset.json'))
        3. Get the data content -- the structure of a single instance:
            instance.fields:         the text and label fields of an instance
            instance.fields['key']:  the contents of the 'key' field
            For a TextField:
                instance.fields['key'].tokens:          the 'key' field's tokens
                instance.fields['key'].tokens[i].text:  one token's text content
            For a LabelField:
                instance.fields['key'].label:  the 'key' field's label
    ---------- ---------- ---------- ----------
    Reuse:
        Change the _read and text_to_instance functions to adapt the reader.
    """

    def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy)
        # self._tokenizer = tokenizer or WordTokenizer()
        self._tokenizer = JustSpacesWordSplitter()
        self._token_indexers = token_indexers or {"sentence": SingleIdTokenIndexer()}

    @overrides
    def _read(self, file_path):
        # Implements DatasetReader._read.
        # cached_path: the input can be a local path or a URL; a URL is
        # downloaded first and then read.
        with open(cached_path(file_path), "r", encoding='utf-8') as data_file:
            json_data = json.loads(data_file.read())
            for line in json_data:
                sentence = line['sentence']
                label = line['label']
                # a generator that yields one Instance per record
                yield self.text_to_instance(sentence, label)

    @overrides
    def text_to_instance(self, sentence: str, label: int = None) -> Instance:  # type: ignore
        # There can be more than one text/label field.
        # tokenized_sentence = self._tokenizer.tokenize(sentence)
        # Step 1: text to token list.
        tokenized_sentence = self._tokenizer.split_words(sentence)
        # Step 2: token list to a sentence field.
        sentence_field = TextField(tokenized_sentence, self._token_indexers)
        # tokenized_example = self._tokenizer.tokenize(example)
        # example_field = TextField(tokenized_example, self._token_indexers)
        fields = {
            'sentence': sentence_field,
            # 'example': example_field,
        }
        if label is not None:  # when used in a predictor, label stays None
            fields['label'] = LabelField(str(label))  # LabelField needs a string input
        return Instance(fields)
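# Data sketch (an illustrative addition): ``_read`` parses the whole file as
# one JSON array of objects with "sentence" and "label" keys. The records
# below are invented:
#
#   [{"sentence": "hello there", "label": 1},
#    {"sentence": "see you tomorrow", "label": 0}]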
class TextClassificationDatasetReader(DatasetReader):
    """
    Parameters
    ----------
    lazy : ``bool`` (optional, default=False)
        Passed to ``DatasetReader``.  If this is ``True``, training will start
        sooner, but will take longer per batch.  This also allows training
        with datasets that are too large to fit in memory.
    tokenizer : ``Tokenizer``, optional
        Tokenizer to use to split the sentence into words or other kinds of
        tokens.  Defaults to ``WordTokenizer()``.
    token_indexers : ``Dict[str, TokenIndexer]``, optional
        Indexers used to define input token representations.  Defaults to
        ``{"tokens": SingleIdTokenIndexer()}``.
    """

    def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()
        self._word_splitter = JustSpacesWordSplitter()
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}

    @overrides
    def _read(self, file_path):
        text_id = 0
        with open(cached_path(file_path), "r", encoding="utf8") as data_file:
            logger.info("Reading instances from lines in file at: %s", file_path)
            for line in data_file:
                line = line.strip("\n")
                if not line:
                    continue
                segments = line.split('\t')
                if len(segments) == 2:
                    label, sentence = segments
                else:
                    continue
                text_id += 1
                yield self.text_to_instance(sentence, label, text_id)

    @overrides
    def text_to_instance(self,
                         text: str,
                         label: str = None,
                         text_id: int = None) -> Instance:  # type: ignore
        """
        Parameters
        ----------
        text : ``str``, required.
            The text to classify.
        label : ``str``, optional, (default = None).
            The label for this text.
        text_id : ``int``, optional, (default = None).
            A running id for the text, stored unindexed.

        Returns
        -------
        An ``Instance`` containing the following fields:
            tokens : ``TextField``
                The characters of the sentence or phrase.
            text_id : ``LabelField``
                The running id of the text.
            label : ``LabelField``
                The label of the sentence or phrase.
        """
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        # tokens = self._tokenizer.tokenize(text)
        # Split the text into single characters and treat each as a token.
        text = " ".join(list(text))
        tokens = self._word_splitter.split_words(text)
        fields['tokens'] = TextField(tokens, self._token_indexers)
        if text_id is not None:
            fields["text_id"] = LabelField(text_id, skip_indexing=True)
        if label is not None:
            fields['label'] = LabelField(label)
        return Instance(fields)
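# Input sketch (an illustrative addition): one ``<label>\t<sentence>`` record
# per line. Because the text is joined with spaces character by character
# before splitting, "abc" becomes the tokens ["a", "b", "c"], which suits
# character-level classification. The sample line below is invented:
#
#   positive\tthe movie was great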
class TruecaserPredictor(Predictor):
    """
    This is basically a copy of the SentenceTagger from allennlp. It is
    modified to dump output in a more sensible manner.
    """

    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = JustSpacesWordSplitter()
        self.model = model

    def predict(self, sent):
        js = {"sentence": sent}
        return self.predict_instance(self._json_to_instance(js))

    @overrides
    def predict_instance(self, sent: Instance) -> JsonDict:
        output = super().predict_instance(sent)
        # output["chars"] = sent["tokens"]
        output["words"] = sent["tokens"].tokens
        return output

    @overrides
    def predict_batch_instance(self, sents: List[Instance]) -> List[JsonDict]:
        outputs = super().predict_batch_instance(sents)
        for i, sent in enumerate(sents):
            # outputs[i]["chars"] = sent["tokens"]
            outputs[i]["words"] = sent["tokens"].tokens
        return outputs

    def load_line(self, line: str) -> JsonDict:
        """
        This will usually be overridden with use_dataset_reader = True
        on the command line.
        """
        return {"sentence": line}

    def dump_line(self, outputs: JsonDict) -> str:
        tags = outputs["tags"]
        chars = outputs["words"]
        # All chars are lower case by default; uppercase those tagged "U".
        out = []
        for token, tag in zip(chars, tags):
            c = token.text
            if tag == "U":
                c = c.upper()
            out.append(c)
        return "".join(out) + "\n"

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like ``{"sentence": "..."}``.  Tokenizes and
        lowercases the sentence, then builds a character-level instance.
        """
        sentence = json_dict["sentence"]
        tokenized_sent = " ".join(map(str, self._tokenizer.split_words(sentence)))
        chars = [Token(c) for c in tokenized_sent.lower()]
        return self._dataset_reader.text_to_instance(chars)
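# Usage sketch (an addition; ``model`` and ``dataset_reader`` are assumed to be
# a trained character-level truecasing model and its matching reader). Tags of
# "U" mark the characters to uppercase when dumping:
#
#   predictor = TruecaserPredictor(model, dataset_reader)
#   output = predictor.predict("barack obama visited paris .")
#   print(predictor.dump_line(output))  # e.g. "Barack Obama visited Paris ."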
def predict_json(self, inputs: JsonDict, cuda_device: int = -1) -> JsonDict:
    instance_text = inputs["instance"]
    step_list = instance_text.split("####")
    tokenizer = JustSpacesWordSplitter()
    headline = step_list[0]
    parts = headline.strip().split('\t')
    para_id = parts[0]
    paragraph = ""
    sent_list = []                    # sentences of each paragraph
    sent_anno_list = []               # list of sentence annotations (P C F)
    word_pos_list = []                # word positions, -2 -1 0 1 2, marking distance to the participant
    part_mask_list = []               # participant masks, 0 0 1 1 0 0, used to obtain the participant embeddings
    before_category_status_list = []  # "before" category annotations: 0-known, 1-unknown, 2-null
    before_category_mask_list = []    # "before" category masks: 0-unknown, 1-known
    before_loc_start_list = []        # start positions of "before" locations
    before_loc_end_list = []          # end positions of "before" locations
    after_category_status_list = []   # "after" category annotations: 0-known, 1-unknown, 2-null
    after_category_mask_list = []     # "after" category masks: 0-unknown, 1-known
    after_loc_start_list = []         # start positions of "after" locations
    after_loc_end_list = []           # end positions of "after" locations
    i = 1
    while i < len(step_list):
        para_line = step_list[i].lower()
        words = tokenizer.split_words(para_line.strip())
        paragraph = para_line
        i += 1
        sent_anno_line = step_list[i]
        sent_annos = tokenizer.split_words(sent_anno_line.strip())
        i += 1
        anno_line = step_list[i]
        anno_parts = anno_line.split('\t')
        part_start = int(anno_parts[1])
        part_end = int(anno_parts[2])
        i += 1
        participant_mask = []
        before_loc_start = int(anno_parts[4])
        before_loc_end = int(anno_parts[5])
        after_loc_start = int(anno_parts[8])
        after_loc_end = int(anno_parts[9])
        # collapse the -3 sentinel into the "null" sentinel -2
        if before_loc_start == -3:
            before_loc_start = -2
            before_loc_end = -2
        if after_loc_start == -3:
            after_loc_start = -2
            after_loc_end = -2
        word_pos_line = ""
        for m in range(0, part_start):
            pos = m - part_start
            word_pos_line += " " + str(pos)
            participant_mask.append(0)
        for m in range(part_start, part_end):
            pos = 0
            word_pos_line += " " + str(pos)
            participant_mask.append(1)
        for m in range(part_end, len(words)):
            pos = m - part_end
            word_pos_line += " " + str(pos)
            participant_mask.append(0)
        input_length = len(words)
        word_pos = tokenizer.split_words(word_pos_line.strip())
        assert input_length == len(participant_mask)
        # category status -- 0: known, 1: unknown, 2: null
        before_category_status = 0
        after_category_status = 0
        # category mask -- 1: known, 0: category
        before_category_mask = 1
        after_category_mask = 1
        category_index = 0
        # sentinel spans: -2 -- null, -1 -- unknown
        if before_loc_start == -2 and before_loc_end == -2:
            before_category_status = 2
            before_category_mask = 0
            before_loc_start = category_index
            before_loc_end = category_index
        elif before_loc_start == -1 and before_loc_end == -1:
            before_category_status = 1
            before_category_mask = 0
            before_loc_start = category_index
            before_loc_end = category_index
        if after_loc_start == -2 and after_loc_end == -2:
            after_category_status = 2
            after_category_mask = 0
            after_loc_start = category_index
            after_loc_end = category_index
        elif after_loc_start == -1 and after_loc_end == -1:
            after_category_status = 1
            after_category_mask = 0
            after_loc_start = category_index
            after_loc_end = category_index
        sent_list.append(words)
        sent_anno_list.append(sent_annos)
        word_pos_list.append(word_pos)
        part_mask_list.append(participant_mask)
        before_category_status_list.append(before_category_status)
        before_category_mask_list.append(before_category_mask)
        before_loc_start_list.append(before_loc_start)
        before_loc_end_list.append(before_loc_end)
        after_category_status_list.append(after_category_status)
        after_category_mask_list.append(after_category_mask)
        after_loc_start_list.append(after_loc_start)
        after_loc_end_list.append(after_loc_end)
    instance = self._dataset_reader.text_to_instance([
        sent_list, sent_anno_list, word_pos_list, part_mask_list,
        before_category_status_list, before_category_mask_list,
        before_loc_start_list, before_loc_end_list,
        after_category_status_list, after_category_mask_list,
        after_loc_start_list, after_loc_end_list
    ])
    outputs = self._model.forward_on_instance(instance=instance)
    predictions = {}
    predictions["paraid"] = para_id
    predictions["entity"] = parts[1]
    predictions["paragraph"] = paragraph
    predictions["best_span"] = str(outputs["best_span"].numpy())
    predictions["true_span"] = str(outputs["true_span"].numpy())
    return predictions
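# Input sketch (inferred from the parsing above; no concrete data appears in
# the source): ``inputs["instance"]`` is a "####"-separated string whose first
# segment is the headline ``<para_id>\t<entity>\t...`` and whose remaining
# segments come in triples of paragraph text, per-sentence P/C/F annotations,
# and a tab-separated annotation line with the participant span in fields
# [1], [2] and the before/after location spans in fields [4], [5] / [8], [9].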