class TestSentenceSplitter(AllenNlpTestCase):
    """
    Exercises ``SpacySentenceSplitter`` with ``rule_based`` switched on and
    off, both on single documents and on batches of documents.
    """

    def setUp(self):
        super().setUp()
        self.dep_parse_splitter = SpacySentenceSplitter(rule_based=False)
        self.rule_based_splitter = SpacySentenceSplitter(rule_based=True)

    @staticmethod
    def _assert_batch_matches_individual(splitter, documents):
        """Assert that batch splitting agrees with splitting each document alone."""
        batched = splitter.batch_split_sentences(documents)
        one_by_one = [splitter.split_sentences(document) for document in documents]
        assert len(batched) == len(one_by_one)
        for from_batch, from_single in zip(batched, one_by_one):
            assert len(from_batch) == len(from_single)
            for batch_sentence, single_sentence in zip(from_batch, from_single):
                assert batch_sentence == single_sentence

    def test_rule_based_splitter_passes_through_correctly(self):
        # The input ends with a space that is absent from the expected sentences.
        document = "This is the first sentence. This is the second sentence! "
        sentences = self.rule_based_splitter.split_sentences(document)
        assert sentences == [
            "This is the first sentence.",
            "This is the second sentence!",
        ]

    def test_dep_parse_splitter_passes_through_correctly(self):
        # Same fixture as the rule-based test, run through the other splitter.
        document = "This is the first sentence. This is the second sentence! "
        sentences = self.dep_parse_splitter.split_sentences(document)
        assert sentences == [
            "This is the first sentence.",
            "This is the second sentence!",
        ]

    def test_batch_rule_based_sentence_splitting(self):
        documents = [
            "This is a sentence. This is a second sentence.",
            "This isn't a sentence. This is a second sentence! This is a third sentence.",
        ]
        self._assert_batch_matches_individual(self.rule_based_splitter, documents)

    def test_batch_dep_parse_sentence_splitting(self):
        documents = [
            "This is a sentence. This is a second sentence.",
            "This isn't a sentence. This is a second sentence! This is a third sentence.",
        ]
        self._assert_batch_matches_individual(self.dep_parse_splitter, documents)
class TestSentenceSplitter(AllenNlpTestCase):
    """
    Tests for ``SpacySentenceSplitter`` in rule-based and dependency-parse
    mode, on single documents and on batches.
    """

    # Fixture shared by both "passes through" tests: four sentences that
    # contain quotes, a dash, commas and a semicolon.
    _PASS_THROUGH_TEXT = (
        "This is the first sentence. This is the second sentence! "
        "Here's the '3rd' sentence - yes, it is. And yes; this is a fourth sentence?"
    )
    _PASS_THROUGH_SENTENCES = [
        "This is the first sentence.",
        "This is the second sentence!",
        "Here's the '3rd' sentence - yes, it is.",
        "And yes; this is a fourth sentence?",
    ]

    # Fixture shared by both batch tests.
    _BATCH_DOCUMENTS = [
        "This is a sentence. This is a second sentence.",
        "This isn't a sentence. This is a second sentence! This is a third sentence.",
        "This is the 3rd sentence?",
        "Here's the 'fourth' sentence - yes, it is. And this is a second sentence.",
    ]

    def setUp(self):
        super().setUp()
        self.dep_parse_splitter = SpacySentenceSplitter(rule_based=False)
        self.rule_based_splitter = SpacySentenceSplitter(rule_based=True)

    def _check_batch_against_single(self, splitter):
        """Assert that ``batch_split_sentences`` agrees with per-document ``split_sentences``."""
        batch_split = splitter.batch_split_sentences(self._BATCH_DOCUMENTS)
        separately_split = [splitter.split_sentences(doc) for doc in self._BATCH_DOCUMENTS]
        assert len(batch_split) == len(separately_split)
        for batch_doc, separate_doc in zip(batch_split, separately_split):
            assert len(batch_doc) == len(separate_doc)
            for batch_sentence, separate_sentence in zip(batch_doc, separate_doc):
                assert batch_sentence == separate_sentence

    def test_rule_based_splitter_passes_through_correctly(self):
        tokens = self.rule_based_splitter.split_sentences(self._PASS_THROUGH_TEXT)
        assert tokens == self._PASS_THROUGH_SENTENCES

    # Compare (major, minor) numerically: comparing the raw version strings is
    # lexicographic and would, for instance, rank "10.0" below "2.1".
    # Non-numeric components are ignored rather than allowed to raise at
    # collection time.
    @pytest.mark.skipif(
        tuple(
            int(part) for part in spacy.__version__.split(".")[:2] if part.isdigit()
        ) < (2, 1),
        reason="this model changed from 2.0 to 2.1",
    )
    def test_dep_parse_splitter_passes_through_correctly(self):
        tokens = self.dep_parse_splitter.split_sentences(self._PASS_THROUGH_TEXT)
        assert tokens == self._PASS_THROUGH_SENTENCES

    def test_batch_rule_based_sentence_splitting(self):
        self._check_batch_against_single(self.rule_based_splitter)

    def test_batch_dep_parse_sentence_splitting(self):
        self._check_batch_against_single(self.dep_parse_splitter)
def test_read_from_file_reuters_corpus_and_segments_sentences_properly(
        self, lazy, max_sequence_length):
    """
    Read the Reuters fixture with sentence segmentation enabled and compare
    every instance with an expectation rebuilt from the same sentence
    splitter and spaCy tokenizer.
    """
    reader = MultiLabelTextClassificationJsonReader(
        lazy=lazy,
        segment_sentences=True,
        max_sequence_length=max_sequence_length)
    reuters_path = Path("tests/fixtures") / "data" / "reuters-21578" / "train.jsonl"
    instances = ensure_list(reader.read(reuters_path))

    splitter = SpacySentenceSplitter()
    spacy_tokenizer = get_spacy_model("en_core_web_sm", False, False, False)

    expected_records = [
        {
            "text": (
                "U.K. GROWING IMPATIENT WITH JAPAN - THATCHER Prime Minister Margaret Thatcher said the"
                " U.K. Was growing more impatient with Japanese trade barriers and warned that it would"
                " soon have new powers against countries not offering reciprocal access to their"
                " markets."),
            "labels": ["acq", "trade"],
        },
        {
            "text": (
                "CANADA OIL EXPORTS RISE 20 PCT IN 1986 Canadian oil exports rose 20 pct in 1986 over"
                " the previous year to 33.96 mln cubic meters, while oil imports soared 25.2 pct to"
                " 20.58 mln cubic meters, Statistics Canada said. Production, meanwhile, was unchanged"
                " from the previous year at 91.09 mln cubic feet."),
            "labels": ["nat-gas", "crude"],
        },
        {
            "text": (
                "COFFEE, SUGAR AND COCOA EXCHANGE NAMES CHAIRMAN The New York Coffee, Sugar and Cocoa"
                " Exchange (CSCE) elected former first vice chairman Gerald Clancy to a two-year term"
                " as chairman of the board of managers, replacing previous chairman Howard Katz. Katz,"
                " chairman since 1985, will remain a board member."),
            "labels": ["sugar", "cocoa", "coffee"],
        },
    ]

    # Build the expected per-sentence token lists, truncated the same way the
    # reader is asked to truncate.
    for record in expected_records:
        per_sentence = []
        for sentence in splitter.split_sentences(record["text"]):
            words = [token.text for token in spacy_tokenizer(sentence)]
            if max_sequence_length:
                words = words[:max_sequence_length]
            per_sentence.append(words)
        record["tokens"] = per_sentence

    assert len(instances) == 3
    for instance, record in zip(instances, expected_records):
        fields = instance.fields
        text = [[token.text for token in sentence.tokens]
                for sentence in fields["tokens"]]
        assert text == record["tokens"]
        assert fields["labels"].labels == record["labels"]
class TestSentenceSplitter(AllenNlpTestCase):
    """
    Tests for ``SpacySentenceSplitter`` in rule-based and dependency-parse
    mode, on single documents and on batches.
    """

    def setUp(self):
        # Zero-argument super() binds to the class being defined; the
        # two-argument form looks the class up by its global name at call
        # time and fails if that name has since been rebound.
        super().setUp()
        self.dep_parse_splitter = SpacySentenceSplitter(rule_based=False)
        self.rule_based_splitter = SpacySentenceSplitter(rule_based=True)

    def _check_batch_against_single(self, splitter, documents):
        """Assert that ``batch_split_sentences`` agrees with per-document ``split_sentences``."""
        batch_split = splitter.batch_split_sentences(documents)
        separately_split = [splitter.split_sentences(doc) for doc in documents]
        assert len(batch_split) == len(separately_split)
        for batch_doc, separate_doc in zip(batch_split, separately_split):
            assert len(batch_doc) == len(separate_doc)
            for batch_sentence, separate_sentence in zip(batch_doc, separate_doc):
                assert batch_sentence == separate_sentence

    def test_rule_based_splitter_passes_through_correctly(self):
        text = ("This is the first sentence. This is the second sentence! "
                "Here's the '3rd' sentence - yes, it is. And yes; this is a fourth sentence?")
        tokens = self.rule_based_splitter.split_sentences(text)
        expected_tokens = ["This is the first sentence.",
                           "This is the second sentence!",
                           "Here's the '3rd' sentence - yes, it is.",
                           "And yes; this is a fourth sentence?"]
        assert tokens == expected_tokens

    def test_dep_parse_splitter_passes_through_correctly(self):
        text = ("This is the first sentence. This is the second sentence! "
                "Here's the '3rd' sentence - yes, it is. And yes; this is a fourth sentence?")
        tokens = self.dep_parse_splitter.split_sentences(text)
        # Unlike the rule-based expectation, this one breaks the third
        # sentence in two after the dash.
        expected_tokens = ["This is the first sentence.",
                           "This is the second sentence!",
                           "Here's the '3rd' sentence -",
                           "yes, it is.",
                           "And yes; this is a fourth sentence?"]
        assert tokens == expected_tokens

    def test_batch_rule_based_sentence_splitting(self):
        text = ["This is a sentence. This is a second sentence.",
                "This isn't a sentence. This is a second sentence! This is a third sentence.",
                "This is the 3rd sentence?",
                "Here's the 'fourth' sentence - yes, it is. And this is a second sentence."]
        self._check_batch_against_single(self.rule_based_splitter, text)

    def test_batch_dep_parse_sentence_splitting(self):
        text = ["This is a sentence. This is a second sentence.",
                "This isn't a sentence. This is a second sentence! This is a third sentence.",
                "This is the 3rd sentence?",
                "Here's the 'fourth' sentence - yes, it is. And this is a second sentence."]
        self._check_batch_against_single(self.dep_parse_splitter, text)
class TWTCDatasetReader(DatasetReader):
    """
    Reads a JSON-lines file from the TWTC dataset.

    Expected format for each input line: {"text": "text", "label": int}

    The output of ``read`` is a list of ``Instance`` s with the fields:
        tokens: ``ListField`` of ``TextField`` (one ``TextField`` per sentence)
        label: ``LabelField``

    Parameters
    ----------
    lazy : ``bool`` (optional, default=False)
        Passed to ``DatasetReader``.  If this is ``True``, training will start sooner, but will
        take longer per batch.  This also allows training with datasets that are too large to fit
        in memory.
    tokenizer : ``Tokenizer``, optional
        Tokenizer to use to split each sentence of the text into words or other kinds of tokens.
        Defaults to ``WordTokenizer()``.
    token_indexers : ``Dict[str, TokenIndexer]``, optional
        Indexers used to define input token representations. Defaults to ``{"tokens":
        SingleIdTokenIndexer()}``.
    """

    def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()
        self._sentence_splitter = SpacySentenceSplitter()
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
        self.cache_data(os.path.expanduser('~/.allennlp/cache/datasets'))

    @overrides
    def _read(self, file_path):
        """Yield one ``Instance`` per record, using its ``text`` and ``label`` columns."""
        file_path = cached_path(file_path)
        data = pd.read_json(file_path, lines=True,
                            orient='records')[['text', 'label']].values
        for text, label in data:
            assert isinstance(label, int)
            # text_to_instance takes the label as a string and converts it back.
            inst = self.text_to_instance(text, str(label))
            yield inst

    @overrides
    def text_to_instance(self, document: str, label: str = None) -> Instance:
        """
        Split ``document`` into sentences, tokenize each one, and wrap the
        result in an ``Instance``.

        Parameters
        ----------
        document : ``str``
            Raw text of the record.
        label : ``str``, optional
            Integer label in string form.  A falsy value (``None`` or ``""``)
            yields an instance without a ``label`` field.

        Returns
        -------
        An ``Instance`` with a ``tokens`` ``ListField`` and, when a label was
        given, a ``label`` ``LabelField`` created with ``skip_indexing=True``.
        """
        sentences: List[str] = self._sentence_splitter.split_sentences(
            document)
        # Materialised as a list; the original lazy generator was annotated
        # as List[List[str]], which it was not.
        tokenized_sents = [self._tokenizer.tokenize(sent) for sent in sentences]
        fields = {
            'tokens':
            ListField(
                [TextField(s, self._token_indexers) for s in tokenized_sents])
        }
        if label:
            fields['label'] = LabelField(int(label), skip_indexing=True)
        return Instance(fields)
def test_read_from_file_ag_news_corpus_and_segments_sentences_properly(
        self, lazy: bool, label_name: str, max_sequence_length: Optional[int]):
    """
    Read the AG News fixture with sentence segmentation enabled and compare
    every instance with an expectation rebuilt from the same sentence
    splitter and spaCy tokenizer.
    """
    reader = TextSentimentReader(lazy=lazy,
                                 segment_sentences=True,
                                 label_name=label_name,
                                 max_sequence_length=max_sequence_length)
    # The fixture file is chosen from the label name.
    fixture_name = ('ag_news_corpus.jsonl' if label_name == 'text_sentiment'
                    else 'ag_news_corpus_original.jsonl')
    ag_path = Path(DATA_DIR, fixture_name)
    instances = ensure_list(reader.read(ag_path))

    splitter = SpacySentenceSplitter()
    spacy_tokenizer = get_spacy_model("en_core_web_sm", False, False, False)

    expected_records = [
        {
            "text": ("Memphis Rout Still Stings for No. 14 Louisville; Coach "
                     "Petrino Vows to Have Team Better Prepared. NASHVILLE, "
                     "Tenn. Nov 3, 2004 - Louisville #39;s 30-point loss "
                     "at home to Memphis last season is still a painful memory "
                     "for the Cardinals."),
            "label": "2",
        },
        {
            "text": ("AP - Eli Manning has replaced Kurt Warner as the New York"
                     " Giants' starting quarterback."),
            "label": "2",
        },
        {
            "text": ("A conference dedicated to online journalism explores the "
                     "effect blogs have on news reporting. Some say they draw "
                     "attention to under-reported stories. Others struggle to "
                     "establish the credibility enjoyed by professionals."),
            "label": "4",
        },
    ]

    # Build the expected per-sentence token lists, truncated the same way the
    # reader is asked to truncate.
    for record in expected_records:
        per_sentence = []
        for sentence in splitter.split_sentences(record['text']):
            words = [token.text for token in spacy_tokenizer(sentence)]
            if max_sequence_length:
                words = words[:max_sequence_length]
            per_sentence.append(words)
        record["tokens"] = per_sentence

    assert len(instances) == 3
    for instance, record in zip(instances, expected_records):
        fields = instance.fields
        text = [[token.text for token in sentence.tokens]
                for sentence in fields["tokens"]]
        assert text == record["tokens"]
        assert fields["label"].label == record["label"]
class DoGDatasetReader(DatasetReader):
    """
    Reads dialogues from a JSON-lines file and pairs each one with a document
    taken from the ``documents.json`` file in the same directory.

    With ``split_sentence_in_doc`` set, a document becomes a ``ListField`` of
    per-sentence ``TextField`` s; otherwise it is a single ``TextField``.
    """

    # Each document is stored as four text sections under these keys.
    _SECTION_KEYS = ('0', '1', '2', '3')

    def __init__(self,
                 lazy: bool = True,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 split_sentence_in_doc: bool = False):
        super().__init__(lazy)
        self.tokenizer = tokenizer or WordTokenizer()
        self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
        self.sentence_splitter = SpacySentenceSplitter() if split_sentence_in_doc else None

    @overrides
    def _read(self, file_path: str) -> Iterable[Instance]:
        """Yield one ``Instance`` per line of ``file_path``."""
        documents_path = os.path.join(os.path.split(file_path)[0], 'documents.json')
        with open(documents_path, 'r') as doc_file:
            doc_json = json.load(doc_file)
        doc_field_dict = self.get_doc_field_dict(doc_json)

        with open(file_path, 'r') as data_file:
            for raw_line in data_file:
                record = json.loads(raw_line.strip())
                yield self.text_to_instance(record['dialogue'],
                                            doc_field_dict[record['docId']],
                                            record['whoSawDoc'])

    @overrides
    def text_to_instance(self, dialogs: List[str], doc_field: Field, who_saw_doc: int):
        """
        Build an ``Instance`` with a ``dialogue`` field (one ``TextField`` per
        turn, wrapped in start/end symbols) and the given ``document`` field.

        ``who_saw_doc`` is accepted but not stored in the instance.
        """
        turn_fields = []
        for dialog in dialogs:
            turn_tokens = self.tokenizer.tokenize(dialog)
            turn_tokens.insert(0, Token(START_SYMBOL))
            turn_tokens.append(Token(END_SYMBOL))
            turn_fields.append(TextField(turn_tokens, self.token_indexers))
        return Instance({'dialogue': ListField(turn_fields), 'document': doc_field})

    def get_doc_field_dict(self, doc_json: Dict) -> Dict[int, Field]:
        """Map every document id (converted to ``int``) to its pre-built field."""
        fields_by_id = {}
        for doc_id, doc in doc_json.items():
            if self.sentence_splitter is None:
                # No sentence splitting: one TextField over the joined sections.
                joined = ' '.join(doc[key] for key in self._SECTION_KEYS)
                field = TextField(self.tokenizer.tokenize(joined), self.token_indexers)
            else:
                sentences: List[str] = []
                for key in self._SECTION_KEYS:
                    sentences.extend(self.sentence_splitter.split_sentences(doc[key]))
                field = ListField([
                    TextField(self.tokenizer.tokenize(sentence), self.token_indexers)
                    for sentence in sentences
                ])
            fields_by_id[int(doc_id)] = field
        return fields_by_id
def entity_extraction_wikihop(args):
    """
    Annotate a WikiHop-style JSON file with named entities.

    Loads the examples from ``args.path``. For each example, every supporting
    paragraph is split into sentences and tagged with two NER predictors, and
    the per-sentence union of their entities is kept. Each example is then
    rewritten in place: ``supports`` becomes the sentence lists, ``ners`` the
    entity lists, ``question_entities`` the query minus its first word, and
    ``query`` the query with underscores turned into spaces. The result is
    written to ``args.output``.
    """
    predictor_conll = AllenNER(
        "https://s3-us-west-2.amazonaws.com/allennlp/models/ner-model-2018.12.18.tar.gz")
    predictor_onto_note = AllenNER(
        "https://s3-us-west-2.amazonaws.com/allennlp/models/fine-grained-ner-model-elmo-2018.12.21.tar.gz")
    sentence_splitter = SpacySentenceSplitter(rule_based=True)

    with open(args.path, 'r') as f:
        data = json.load(f)

    for d in tqdm(data):
        raw_query = d['query'].strip().replace("\n", "")
        # Everything after the first whitespace-separated word of the query.
        question_entity = " ".join(raw_query.split()[1:])
        question = raw_query.replace("_", " ")

        golden_ners = []
        passage = []
        for para in d['supports']:
            sentences = sentence_splitter.split_sentences(para)
            outputs_conll = predictor_conll.predict_batch_raw(sentences)
            outputs_onto_note = predictor_onto_note.predict_batch_raw(sentences)
            para_ners = []
            for conll_out, onto_out in zip(outputs_conll, outputs_onto_note):
                conll_entities = entity_extraction_(conll_out['words'], conll_out['tags'])
                onto_entities = entity_extraction_(onto_out['words'], onto_out['tags'])
                para_ners.append(list(set(conll_entities) | set(onto_entities)))
            golden_ners.append(para_ners)
            passage.append(sentences)

        d['supports'] = passage
        d['question_entities'] = [question_entity]
        d['ners'] = golden_ners
        d['query'] = question

    with open(args.output, 'w') as f:
        json.dump(data, f)
json.dump(named_entities_frequency_table, f) instances = create_nabert_reader( data_path='../../data/drop_dataset/drop_dataset_train.json') ner_tagger = fine_grained_named_entity_recognition_with_elmo_peters_2018() sentences_splitter = SpacySentenceSplitter() named_entities = defaultdict(list) with torch.no_grad(): for instance_idx, instance in enumerate(instances): original_question = instance.fields['metadata'].metadata[ 'original_question'] original_passage = instance.fields['metadata'].metadata[ 'original_passage'] aggregate_named_entities(original_question, named_entities) # NER tagger is more accurate when single sentences are fed as input passage_sentences = sentences_splitter.split_sentences( original_passage) for passage_sentence in passage_sentences: aggregate_named_entities(passage_sentence, named_entities) if instance_idx % 501 == 500: dump_frequency_table(named_entities, 'ner_frequencies_latest.json') dump_frequency_table(named_entities, 'ner_frequencies.json') print('Done.')
class TestSentenceSplitter(AllenNlpTestCase):
    """
    Exercises ``SpacySentenceSplitter`` with ``rule_based`` switched on and
    off: single documents, batches, and serialisation through ``to_params``.
    """

    def setup_method(self):
        super().setup_method()
        self.dep_parse_splitter = SpacySentenceSplitter(rule_based=False)
        self.rule_based_splitter = SpacySentenceSplitter(rule_based=True)

    @staticmethod
    def _assert_batch_matches_individual(splitter, documents):
        """Assert that batch splitting agrees with splitting each document alone."""
        batched = splitter.batch_split_sentences(documents)
        one_by_one = [splitter.split_sentences(document) for document in documents]
        assert len(batched) == len(one_by_one)
        for from_batch, from_single in zip(batched, one_by_one):
            assert len(from_batch) == len(from_single)
            for batch_sentence, single_sentence in zip(from_batch, from_single):
                assert batch_sentence == single_sentence

    def test_rule_based_splitter_passes_through_correctly(self):
        # The input ends with a space that is absent from the expected sentences.
        document = "This is the first sentence. This is the second sentence! "
        sentences = self.rule_based_splitter.split_sentences(document)
        assert sentences == [
            "This is the first sentence.",
            "This is the second sentence!",
        ]

    def test_dep_parse_splitter_passes_through_correctly(self):
        # Same fixture as the rule-based test, run through the other splitter.
        document = "This is the first sentence. This is the second sentence! "
        sentences = self.dep_parse_splitter.split_sentences(document)
        assert sentences == [
            "This is the first sentence.",
            "This is the second sentence!",
        ]

    def test_batch_rule_based_sentence_splitting(self):
        documents = [
            "This is a sentence. This is a second sentence.",
            "This isn't a sentence. This is a second sentence! This is a third sentence.",
        ]
        self._assert_batch_matches_individual(self.rule_based_splitter, documents)

    def test_batch_dep_parse_sentence_splitting(self):
        documents = [
            "This is a sentence. This is a second sentence.",
            "This isn't a sentence. This is a second sentence! This is a third sentence.",
        ]
        self._assert_batch_matches_individual(self.dep_parse_splitter, documents)

    def test_to_params(self):
        splitter = self.dep_parse_splitter
        params = splitter.to_params()
        assert isinstance(params, Params)
        # The serialised form carries the registered type plus both settings.
        expected = {
            "type": "spacy",
            "language": splitter._language,
            "rule_based": splitter._rule_based,
        }
        assert params.params == expected
class TextClassificationJsonReader(DatasetReader):
    """
    Reads tokens and their labels from a labeled text classification dataset.

    The output of `read` is a list of `Instance` s with the fields:
        tokens : `TextField` (or a `ListField` of `TextField` s when
            `segment_sentences` is set) and
        label : `LabelField`

    Registered as a `DatasetReader` with name "text_classification_json".

    [0]: https://www.cs.cmu.edu/~hovy/papers/16HLT-hierarchical-attention-networks.pdf

    # Parameters

    token_indexers : `Dict[str, TokenIndexer]`, optional
        (default=`{"tokens": SingleIdTokenIndexer()}`)
        We use this to define the input representation for the text.
        See :class:`TokenIndexer`.
    tokenizer : `Tokenizer`, optional (default = `SpacyTokenizer()`)
        Tokenizer to use to split the input text into words or other kinds of tokens.
    segment_sentences : `bool`, optional (default = `False`)
        If True, we will first segment the text into sentences using SpaCy and then tokenize words.
        Necessary for some models that require pre-segmentation of sentences, like [the Hierarchical
        Attention Network][0].
    max_sequence_length : `int`, optional (default = `None`)
        If specified, will truncate tokens to specified maximum length.
    skip_label_indexing : `bool`, optional (default = `False`)
        Whether or not to skip label indexing. You might want to skip label indexing if your
        labels are numbers, so the dataset reader doesn't re-number them starting from 0.
    text_key: `str`, optional (default=`"text"`)
        The key name of the source field in the JSON data file.
    label_key: `str`, optional (default=`"label"`)
        The key name of the target field in the JSON data file.
    """

    def __init__(
        self,
        token_indexers: Dict[str, TokenIndexer] = None,
        tokenizer: Tokenizer = None,
        segment_sentences: bool = False,
        max_sequence_length: int = None,
        skip_label_indexing: bool = False,
        text_key: str = "text",
        label_key: str = "label",
        **kwargs,
    ) -> None:
        super().__init__(
            manual_distributed_sharding=True, manual_multiprocess_sharding=True, **kwargs
        )
        self._tokenizer = tokenizer or SpacyTokenizer()
        self._segment_sentences = segment_sentences
        self._max_sequence_length = max_sequence_length
        self._skip_label_indexing = skip_label_indexing
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        self._text_key = text_key
        self._label_key = label_key
        # The sentence segmenter is only created when it will be used.
        if self._segment_sentences:
            self._sentence_segmenter = SpacySentenceSplitter()

    @overrides
    def _read(self, file_path):
        """
        Yield one `Instance` per non-blank JSON line of `file_path`.

        Raises `ValueError` when `skip_label_indexing` is set and a label
        cannot be converted to `int`.
        """
        with open(cached_path(file_path), "r") as data_file:
            for line in self.shard_iterable(data_file.readlines()):
                # readlines() keeps the trailing newline, so a blank line is
                # "\n" (truthy); strip before testing or it reaches json.loads.
                if not line.strip():
                    continue
                items = json.loads(line)
                text = items[self._text_key]
                label = items.get(self._label_key)
                if label is not None:
                    if self._skip_label_indexing:
                        try:
                            label = int(label)
                        except ValueError as err:
                            raise ValueError(
                                "Labels must be integers if skip_label_indexing is True."
                            ) from err
                    else:
                        label = str(label)
                yield self.text_to_instance(text=text, label=label)

    def _truncate(self, tokens):
        """
        truncate a set of tokens using the provided sequence length

        Returns `tokens` unchanged when no maximum length is configured or
        the sequence already fits.
        """
        limit = self._max_sequence_length
        if limit is not None and len(tokens) > limit:
            tokens = tokens[:limit]
        return tokens

    @overrides
    def text_to_instance(
        self, text: str, label: Union[str, int] = None
    ) -> Instance:  # type: ignore
        """
        # Parameters

        text : `str`, required.
            The text to classify
        label : `str`, optional, (default = `None`).
            The label for this text.

        # Returns

        An `Instance` containing the following fields:
            - tokens (`TextField`, or a `ListField` of `TextField` s when
              sentence segmentation is enabled) :
              The tokens in the sentence or phrase.
            - label (`LabelField`) :
              The label of the sentence or phrase. Only present when
              `label` is not `None`.
        """
        fields: Dict[str, Field] = {}
        if self._segment_sentences:
            sentences: List[Field] = []
            sentence_splits = self._sentence_segmenter.split_sentences(text)
            for sentence in sentence_splits:
                word_tokens = self._tokenizer.tokenize(sentence)
                if self._max_sequence_length is not None:
                    word_tokens = self._truncate(word_tokens)
                # Token indexers are attached later by apply_token_indexers.
                sentences.append(TextField(word_tokens))
            fields["tokens"] = ListField(sentences)
        else:
            tokens = self._tokenizer.tokenize(text)
            if self._max_sequence_length is not None:
                tokens = self._truncate(tokens)
            fields["tokens"] = TextField(tokens)
        if label is not None:
            fields["label"] = LabelField(label, skip_indexing=self._skip_label_indexing)
        return Instance(fields)

    @overrides
    def apply_token_indexers(self, instance: Instance) -> None:
        """Attach this reader's token indexers to every `TextField` of `instance`."""
        if self._segment_sentences:
            for text_field in instance.fields["tokens"]:  # type: ignore
                text_field._token_indexers = self._token_indexers
        else:
            instance.fields["tokens"]._token_indexers = self._token_indexers  # type: ignore
class ExampleLoader(object): def __init__(self): self.label_list = None self.sentence_splitter = SpacySentenceSplitter() def get_loss_weights(): # Calculate loss weights as the inverse of label occurrence. loss_weights = {} for label in self.label_list: loss_weights[label] = 0 for ex in train_examples: loss_weights[ex.str_label] += 1 num_examples = len(train_examples) for key in loss_weights: loss_weights[key] = num_examples / loss_weights[key] weights_list = [ float("%3.f" % loss_weights[key]) for key in loader.label_list ] return weights_list def get_text_from_element(self, node): if node.nodeType == node.TEXT_NODE: if node.data.isspace(): return "" else: return node.data.replace("\n", " ") else: text = "" for child in node.childNodes: text += " " + self.get_text_from_element(child) + " " return text def process_node(self, node, events, times, full_text): if node.nodeName == "EVENT": eid = node.attributes['eid'].value cls = node.attributes['class'].value event = Event(eid=eid, cls=cls, sentence=None, pos_in_sentence=None) event.idx_in_doc = len(full_text) events[eid] = event return event if node.nodeName == "TIMEX3": tid = node.attributes['tid'].value type = node.attributes['type'].value time = TimeX3(tid=tid, sentence=None, pos_in_sentence=None) time.idx_in_doc = len(full_text) times[tid] = time return time def get_instances(self, instance_elts, event_instances, events, input_file): for instance in instance_elts: eiid = instance.attributes["eiid"].value eventID = instance.attributes["eventID"].value tense = instance.attributes["tense"].value aspect = instance.attributes["aspect"].value polarity = instance.attributes["polarity"].value pos = instance.attributes["pos"].value if eventID not in events: print(eventID, input_file) continue event = events[eventID] sentence = event.sentence pos_in_sentence = event.pos_in_sentence instance = EventInstance(eiid, event, tense, aspect, polarity, pos, sentence, pos_in_sentence) event_instances[eiid] = instance def 
parse_node(self, root, events, times, full_text): # print(full_text) for node in root.childNodes: if node.nodeType == node.TEXT_NODE and not node.data.isspace(): text = re.sub(r"\n+", " ", node.data) text = re.sub(r"_", "", node.data) text = re.sub(r"&UR;", "", node.data) text = re.sub(r"&LR;", "", node.data) split_space = text.split() full_text += split_space elif node.nodeName == "TEXT": self.parse_node(node, events, times, full_text) else: el = self.process_node(node, events, times, full_text) text = self.get_text_from_element(node) if el: el.text = text.strip() full_text += text.split() def get_full_text_to_sentences(self, full_text, sentences): split_sentences = [s.split() for s in sentences] def next_position(split_sentences, sent_num, sent_idx): cur_sent = split_sentences[sent_num] if sent_idx < len(cur_sent) - 1: sent_idx += 1 else: sent_idx = 0 sent_num += 1 if sent_num < len(split_sentences): cur_sent = split_sentences[sent_num] return sent_num, sent_idx split_sentences = [s.split() for s in sentences] full_text_to_sentences = [] sent_num = 0 sent_idx = 0 for i, tok in enumerate(full_text): sent_tok = split_sentences[sent_num][sent_idx] # print(tok, sent_tok) assert tok.startswith( sent_tok), str(i) + " " + tok + " " + sent_tok + "\n" + str( split_sentences[sent_num]) full_text_to_sentences.append(tuple([sent_num, sent_idx])) while len(tok) > len(sent_tok): tok = tok[len(sent_tok):] sent_num, sent_idx = next_position(split_sentences, sent_num, sent_idx) sent_tok = split_sentences[sent_num][sent_idx] # print("WHILE", tok, sent_tok) assert tok.startswith(sent_tok), str( i) + " " + tok + " " + sent_tok + "\n" + str( split_sentences[sent_num]) # print(tok) sent_num, sent_idx = next_position(split_sentences, sent_num, sent_idx) return full_text_to_sentences def convert_doc_idx_to_sentences(self, sentences, full_text_to_sentences, its): for key, obj in its.items(): idx = obj.idx_in_doc sentence, pos_in_sentence = full_text_to_sentences[idx] # print(idx, 
sentence, pos_in_sentence) text = sentences[sentence].split()[pos_in_sentence] assert text == obj.text.split()[0], text + " " + obj.text obj.sentence = sentence obj.pos_in_sentence = pos_in_sentence def read_file(self, input_file): """ Parameters ---------- input_file: str, path to input file Returns ------- TimeMLFile containing sentences, events, eventInstances, times, and tlinks. """ doc = dom.parse(input_file) root = doc.childNodes[0] events = {} times = {} full_text = [] self.parse_node(root, events, times, full_text) # print(full_text) sentences = self.sentence_splitter.split_sentences(" ".join(full_text)) full_text_to_sentences = self.get_full_text_to_sentences( full_text, sentences) self.convert_doc_idx_to_sentences(sentences, full_text_to_sentences, events) self.convert_doc_idx_to_sentences(sentences, full_text_to_sentences, times) event_instances = {} instanceElts = root.getElementsByTagName("MAKEINSTANCE") self.get_instances(instanceElts, event_instances, events, input_file) tlinks = [] tlinkElts = root.getElementsByTagName("TLINK") for tlinkElt in tlinkElts: if tlinkElt.hasAttribute("relatedToEventInstance") and \ tlinkElt.hasAttribute("eventInstanceID"): lid = tlinkElt.attributes["lid"].value relType = tlinkElt.attributes["relType"].value eiid = tlinkElt.attributes["eventInstanceID"].value relatedToEventInstance = tlinkElt.attributes[ "relatedToEventInstance"].value if eiid not in event_instances or relatedToEventInstance not in event_instances: continue tlink = Tlink(lid, relType, event_instances[eiid], event_instances[relatedToEventInstance]) tlinks.append(tlink) if tlinkElt.hasAttribute("eventInstanceID") and \ tlinkElt.hasAttribute("relatedToTime"): lid = tlinkElt.attributes["lid"].value relType = tlinkElt.attributes["relType"].value eiid = tlinkElt.attributes["eventInstanceID"].value relatedToTime = tlinkElt.attributes["relatedToTime"].value if eiid not in event_instances or relatedToTime not in times: continue tlink = Tlink(lid, relType, 
event_instances[eiid], times[relatedToTime]) tlinks.append(tlink) if tlinkElt.hasAttribute("timeID") and \ tlinkElt.hasAttribute("relatedToEventInstance"): lid = tlinkElt.attributes["lid"].value relType = tlinkElt.attributes["relType"].value tid = tlinkElt.attributes["timeID"].value eiid = tlinkElt.attributes["relatedToEventInstance"].value if tid not in times or eiid not in event_instances: continue tlink = Tlink(lid, relType, times[tid], event_instances[eiid]) tlinks.append(tlink) if tlinkElt.hasAttribute("timeID") and \ tlinkElt.hasAttribute("relatedToTime"): lid = tlinkElt.attributes["lid"].value relType = tlinkElt.attributes["relType"].value tid = tlinkElt.attributes["timeID"].value relatedToTime = tlinkElt.attributes["relatedToTime"].value if tid not in times or relatedToTime not in times: continue tlink = Tlink(lid, relType, times[tid], times[relatedToTime]) tlinks.append(tlink) return TimeMLFile(sentences, events, event_instances, times, tlinks, input_file) def read_examples(self, input_file): file_data = self.read_file(input_file) examples = [] for tlink in file_data.tlinks: #print(tlink.lid, tlink.relType, tlink.e1, tlink.e2) sent1 = tlink.e1.sentence sent2 = tlink.e2.sentence #print(sent1, sent2) example = None if sent1 >= len(file_data.sentences) or sent2 >= len( file_data.sentences): continue if sent1 == sent2: text = file_data.sentences[sent1] example = TimeMLExample(text, tlink.e1.pos_in_sentence, tlink.e2.pos_in_sentence, tlink.relType) elif sent1 < sent2: sents = file_data.sentences[sent1:sent2 + 1] text = " [SEP] ".join(sents) e1_pos = tlink.e1.pos_in_sentence e2_pos = sum([len(s.split()) + 1 for s in sents[:-1]]) + tlink.e2.pos_in_sentence example = TimeMLExample(text, e1_pos, e2_pos, tlink.relType) elif sent1 > sent2: sents = file_data.sentences[sent2:sent1 + 1] text = " [SEP] ".join(sents) e1_pos = sum([len(s.split()) + 1 for s in sents[:-1]]) + tlink.e1.pos_in_sentence e2_pos = tlink.e2.pos_in_sentence example = TimeMLExample(text, e1_pos, 
e2_pos, tlink.relType) if example: examples.append(example) #print(example) return examples def antithetics(self, all_examples): new_exs = [] for ex in all_examples: new_ex = None if ex.str_label == "AFTER": new_ex = TimeMLExample(ex.text, ex.e2_pos, ex.e1_pos, "BEFORE") new_ex.int_label = self.label_list.index("BEFORE") new_exs.append(new_ex) if ex.str_label == "BEFORE": new_ex = TimeMLExample(ex.text, ex.e2_pos, ex.e1_pos, "AFTER") new_ex.int_label = self.label_list.index("AFTER") new_exs.append(new_ex) if ex.str_label == "DURING": new_ex = TimeMLExample(ex.text, ex.e2_pos, ex.e1_pos, "DURING") new_ex.int_label = self.label_list.index("DURING") new_exs.append(new_ex) if new_ex != None: new_ex.sentences = ex.sentences new_ex.e1_sentence_num = ex.e2_sentence_num new_ex.e1_sentence_pos = ex.e2_sentence_pos new_ex.e2_sentence_num = ex.e1_sentence_num new_ex.e2_sentence_pos = ex.e1_sentence_pos all_examples.extend(new_exs) def assign_num_labels(self, all_examples): if not self.label_list: labels = set() for ex in all_examples: labels.add(ex.str_label) labels = list(labels) labels.sort() print(labels) print(len(labels)) self.label_list = labels for ex in all_examples: ex.int_label = self.label_list.index(ex.str_label) def read_examples_from_directory(self, dir_path): #os.chdir(dir_path) examples_list = [] for file in glob.glob(dir_path + "*.tml"): #file_path = dir_path + file examples = self.read_examples(file) examples_list.append(examples) all_examples = list(itertools.chain.from_iterable(examples_list)) #antithetics(all_examples) print(len(all_examples)) self.assign_num_labels(all_examples) return all_examples def read_example_files(self, dir_path): all_files = glob.glob(dir_path + "*.tml") train_files = all_files[:-4] dev_files = all_files[-4:] train_examples_list = [] for file in train_files: examples = self.read_examples(file) train_examples_list.append(examples) train = list(itertools.chain.from_iterable(train_examples_list)) dev_examples_list = [] for file in 
dev_files: examples = self.read_examples(file) dev_examples_list.append(examples) dev = list(itertools.chain.from_iterable(dev_examples_list)) self.assign_num_labels(train + dev) return train, dev def read_dense_examples(self, td_path, extra=False, window_size=None): class DenseExample(object): def __init__(self, file_name, e1, e2, label): self.file_name = file_name self.e1 = e1 self.e2 = e2 self.label = self.parse_label(label) def parse_label(self, label): labels = { "a": "AFTER", "b": "BEFORE", "i": "INCLUDES", "ii": "IS_INCLUDED", "s": "SIMULTANEOUS", "v": "VAGUE" } return labels[label] DEV_DOCS = { "APW19980227.0487", "CNN19980223.1130.0960", "NYT19980212.0019", "PRI19980216.2000.0170", "ed980111.1130.0089" } TEST_DOCS = { "APW19980227.0489", "APW19980227.0494", "APW19980308.0201", "APW19980418.0210", "CNN19980126.1600.1104", "CNN19980213.2130.0155", "NYT19980402.0453", "PRI19980115.2000.0186", "PRI19980306.2000.1675" } files_to_exs = {} f = open(td_path, "r") for line in f.readlines(): split = line.split() ex = DenseExample(split[0], split[1], split[2], split[3]) if ex.file_name not in files_to_exs: files_to_exs[ex.file_name] = [ex] else: files_to_exs[ex.file_name].append(ex) files = set(files_to_exs.keys()) train_files = files - DEV_DOCS - TEST_DOCS dev_files = DEV_DOCS train_examples = [] for file_name in train_files: file = self.read_extra_file(EXTRA_FILE_DIR + "/" + file_name + ".tml") \ if extra \ else self.read_file(FILE_DIR + "/" + file_name + ".tml") for ex in files_to_exs[file_name]: e1 = file.get_element(ex.e1) e2 = file.get_element(ex.e2) if e1 == None or e2 == None: #print("oops", file_name, ex.e1, ex.e2) continue example = file.get_example(e1, e2, ex.label, window_size) if not example: print("o no") else: train_examples.append(example) self.assign_num_labels(train_examples) dev_examples = [] for file_name in dev_files: file = self.read_extra_file(EXTRA_FILE_DIR + "/" + file_name + ".tml") \ if extra \ else self.read_file(FILE_DIR + "/" + file_name 
+ ".tml") for ex in files_to_exs[file_name]: e1 = file.get_element(ex.e1) e2 = file.get_element(ex.e2) if e1 == None or e2 == None: #print("oops", file_name, ex.e1, ex.e2) continue example = file.get_example(e1, e2, ex.label, window_size) if not example: print("o no") else: dev_examples.append(example) self.assign_num_labels(dev_examples) return train_examples, dev_examples def read_dense_test_examples(self, td_path, extra=False, window_size=None): class DenseExample(object): def __init__(self, file_name, e1, e2, label): self.file_name = file_name self.e1 = e1 self.e2 = e2 self.label = self.parse_label(label) def parse_label(self, label): labels = { "a": "AFTER", "b": "BEFORE", "i": "INCLUDES", "ii": "IS_INCLUDED", "s": "SIMULTANEOUS", "v": "VAGUE" } return labels[label] TEST_DOCS = { "APW19980227.0489", "APW19980227.0494", "APW19980308.0201", "APW19980418.0210", "CNN19980126.1600.1104", "CNN19980213.2130.0155", "NYT19980402.0453", "PRI19980115.2000.0186", "PRI19980306.2000.1675" } files_to_exs = {} f = open(td_path, "r") for line in f.readlines(): split = line.split() ex = DenseExample(split[0], split[1], split[2], split[3]) if ex.file_name not in files_to_exs: files_to_exs[ex.file_name] = [ex] else: files_to_exs[ex.file_name].append(ex) test_examples = [] for file_name in TEST_DOCS: file = self.read_extra_file(EXTRA_FILE_DIR + "/" + file_name + ".tml") \ if extra \ else self.read_file(FILE_DIR + "/" + file_name + ".tml") for ex in files_to_exs[file_name]: e1 = file.get_element(ex.e1) e2 = file.get_element(ex.e2) if e1 == None or e2 == None: #print("oops", file_name, ex.e1, ex.e2) continue example = file.get_example(e1, e2, ex.label, window_size) if not example: print("o no") else: test_examples.append(example) self.assign_num_labels(test_examples) return test_examples def read_tempeval3_examples(): return None, None
class IssueReaderSiamese(DatasetReader):
    """
    Reads issue dialogs (an issue report followed by its comments) from a
    JSON-lines file and yields pair instances.

    Every input line is a JSON object from which the keys ``body``,
    ``comments`` (a list of objects with ``user`` and ``body``), ``label`` and,
    optionally, ``id`` are read.

    Parameters
    ----------
    lazy : ``bool`` (optional, default=False)
        Passed to ``DatasetReader``. If this is ``True``, training will start
        sooner, but will take longer per batch. This also allows training with
        datasets that are too large to fit in memory.
    tokenizer : ``Tokenizer``, optional
        Tokenizer used to split every dialog line into tokens. Defaults to a
        ``WordTokenizer`` built with ``SpacyWordSplitter(pos_tags=True)`` and
        ``PorterStemmer()``.
    segment_sentences : ``bool``, optional (default=True)
        If ``True`` the issue report is split into sentences with
        ``SpacySentenceSplitter``; otherwise the report is kept as a single
        dialog entry.
    token_indexers : ``Dict[str, TokenIndexer]``, optional
        Indexers used to define input token representations. Defaults to
        ``{"tokens": SingleIdTokenIndexer()}``.
    """

    def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 segment_sentences: bool = True,
                 token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer(
            word_splitter=SpacyWordSplitter(pos_tags=True),
            word_stemmer=PorterStemmer())
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
        # Always define the attribute: it used to be missing when
        # ``segment_sentences`` was False, which made ``read_dataset`` fail
        # with an AttributeError.
        self._segment_sentences = SpacySentenceSplitter() if segment_sentences else None
        self._class_cnt = defaultdict(int)

    @staticmethod
    def _line_text(line):
        """Return the text of a dialog entry (a plain sentence or a ``(user, text)`` pair)."""
        return line if isinstance(line, str) else line[1]

    def read_dataset(self, file_path):
        """
        Parse ``file_path`` into two lists of ``(id, dialog, label)`` triples.

        ``dialog`` is the list of report sentences (plain strings) followed by
        one ``(user_name, text)`` tuple per non-empty comment.

        Returns
        -------
        ``(features, others)``: the triples labelled ``"feature"`` and all the
        remaining ones (label ``"other"``, or ``None`` when the record carries
        no labels).
        """
        features = []
        others = []
        with open(cached_path(file_path), "r") as data_file:
            logger.info("Reading instances from lines in file at: %s", file_path)
            for line in data_file:
                # A blank line read from a file is "\n", not "", so it has to be
                # stripped before the emptiness test or json.loads() raises.
                if not line.strip():
                    continue
                record = json.loads(line)
                d_id = record.get("id", "")

                report = split_issue_template(record['body'])
                if self._segment_sentences is not None:
                    report = self._segment_sentences.split_sentences(report)
                else:
                    # assumes split_issue_template returns a string - TODO confirm
                    report = [report] if report else []

                comments = []
                for comment in record['comments']:
                    user_name = comment['user']
                    body = replace_tokens(comment['body'])
                    if len(body) == 0:
                        continue
                    comments.append((user_name, body))

                dialog = report + comments
                if len(dialog) == 0:
                    continue

                labels = record['label']
                if len(labels) == 0:
                    label = None
                elif "feature" in labels or "type: feature" in labels:
                    label = "feature"
                else:
                    label = "other"

                target = features if label == "feature" else others
                target.append((d_id, dialog, label))
        return features, others

    @overrides
    def _read(self, file_path):
        features, others = self.read_dataset(file_path)
        all_data = features + others
        random.shuffle(all_data)
        same_num = 0
        diff_num = 0
        if "unlabel" in file_path:
            logger.info("Begin predict------")
            # NOTE(review): the "{}" placeholder in this path is never
            # formatted, so the literal file name is opened - confirm which
            # value was meant to be substituted.
            features, others = self.read_dataset(
                "frmodel/data/{}_target_train.txt")
            for sample in features + others:
                yield self.text_to_instance((sample, sample), is_gold=True)
            for sample in all_data:
                yield self.text_to_instance((sample, sample))
            logger.info(f"Predict sample num is {len(all_data)}")
        else:
            logger.info("Begin training-------")
            iter_num = 1
            if "test" in file_path:
                # Positive / negative partners for test samples are drawn from
                # the matching train file.
                features, others = self.read_dataset(
                    re.sub("test", "train", file_path))
            for _ in range(iter_num):
                # plain balance data
                if "train" in file_path:
                    for _ in range(len(others) - len(features)):
                        all_data.append(random.choice(features))
                for sample in all_data:
                    positive = random.choice(features)
                    negative = random.choice(others)
                    yield self.text_to_instance((sample, positive))
                    yield self.text_to_instance((sample, negative))
                    same_num += 1
                    diff_num += 1
            logger.info(
                f"Dataset Count: Same : {same_num} / Diff : {diff_num}")

    @overrides
    def text_to_instance(self, p, is_gold=False) -> Instance:  # type: ignore
        """
        Build an ``Instance`` from a pair of ``(id, dialog, label)`` triples.

        Only the dialog of the first element is turned into fields
        (``dialog1`` and ``pos_tags1``); the second element contributes its
        label to ``label_tags`` and is kept in the metadata.
        """
        fields: Dict[str, Field] = {}
        ins1, ins2 = p

        # Tokenize every line once and reuse the tokens for both the text field
        # and its POS tags. Report sentences are plain strings while comments
        # are (user, text) tuples; ``_line_text`` handles both, where indexing
        # a sentence with [1] would only have picked its second character.
        tokenized_lines = [
            list(self._tokenizer.tokenize(self._line_text(line)))
            for line in ins1[1]
        ]
        text_fields = [
            TextField(tokens, self._token_indexers)
            for tokens in tokenized_lines
        ]
        fields['dialog1'] = ListField(text_fields)
        fields["pos_tags1"] = ListField([
            SequenceLabelField([word.tag_ for word in tokens],
                               text_field,
                               label_namespace="pos")
            for tokens, text_field in zip(tokenized_lines, text_fields)
        ])

        if ins1[-1] is not None and ins2[-1] is not None:
            if ins1[-1] == ins2[-1]:
                fields['label'] = LabelField("same")
            else:
                fields['label'] = LabelField("diff")
            fields['label_tags'] = LabelField("@".join([ins1[-1], ins2[-1]]),
                                              label_namespace="label_tags")
            # NOTE(review): this assignment replaces the same/diff label set
            # just above, so the emitted ``label`` is the first element's own
            # label. The nesting was reconstructed from whitespace-collapsed
            # source - confirm it belongs inside this ``if``.
            fields['label'] = LabelField(ins1[-1])
        fields['metadata'] = MetadataField({
            "is_gold": is_gold,
            "pair_instance": p
        })
        return Instance(fields)
class TextCatReader(DatasetReader):
    """
    Reads tokens and their labels from a labeled text classification dataset.
    Expects a "tokens" field and a "category" field in JSON format.

    The output of ``read`` is a list of ``Instance`` s with the fields:
        tokens: ``TextField`` and
        label: ``LabelField``

    Parameters
    ----------
    token_indexers : ``Dict[str, TokenIndexer]``, optional
        optional (default=``{"tokens": SingleIdTokenIndexer()}``)
        We use this to define the input representation for the text.
        See :class:`TokenIndexer`.
    tokenizer : ``Tokenizer``, optional (default = ``WordTokenizer()``)
        Tokenizer to use to split the input text into words or other kinds of tokens.
    unrestricted_tokenizer : ``Tokenizer``, optional (default = ``None``)
        If given, the text is tokenized a second time with this tokenizer and
        the result is stored in a ``filtered_tokens`` field.
    segment_sentences: ``bool``, optional (default = ``False``)
        If True, we will first segment the text into sentences using SpaCy and then tokenize words.
        Necessary for some models that require pre-segmentation of sentences,
        like the Hierarchical Attention Network.
    sequence_length: ``int``, optional (default = ``None``)
        If specified, will truncate tokens to specified maximum length.
    ignore_labels: ``bool``, optional (default = ``False``)
        Stored on the reader; the code in this class does not consult it.
    skip_label_indexing: ``bool``, optional (default = ``False``)
        Whether or not to skip label indexing. You might want to skip label indexing if your
        labels are numbers, so the dataset reader doesn't re-number them starting from 0.
    sample : ``int``, optional (default = ``None``)
        If specified, only this many randomly chosen lines of the data file are read.
    unlabeled_data_path : ``str``, optional (default = ``None``)
        Path of an additional file whose lines are read as unlabeled examples.
    lazy : ``bool``, optional, (default = ``False``)
        Whether or not instances can be read lazily.
    """

    def __init__(self,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 tokenizer: Tokenizer = None,
                 unrestricted_tokenizer: Tokenizer = None,
                 segment_sentences: bool = False,
                 sequence_length: int = None,
                 ignore_labels: bool = False,
                 skip_label_indexing: bool = False,
                 sample: int = None,
                 unlabeled_data_path: str = None,
                 lazy: bool = False) -> None:
        super().__init__(lazy=lazy)
        self._tokenizer = tokenizer or WordTokenizer()
        self._unrestricted_tokenizer = unrestricted_tokenizer
        self._sample = sample
        self._segment_sentences = segment_sentences
        self._sequence_length = sequence_length
        self._ignore_labels = ignore_labels
        self._skip_label_indexing = skip_label_indexing
        self._token_indexers = token_indexers or {
            'tokens': SingleIdTokenIndexer()
        }
        self._unlabeled_data_path = unlabeled_data_path
        if self._segment_sentences:
            self._sentence_segmenter = SpacySentenceSplitter()

    def _reservoir_sampling(self, file_):
        """
        Reservoir sampling for reading random lines from a file without
        loading the entire file into memory.

        See here for explanation of algorithm:
        https://stackoverflow.com/questions/35680236/select-100-random-lines-from-a-file-with-a-1-million-which-cant-be-read-into-me

        Parameters
        ----------
        file_ : iterable of ``str``
            The open file (or any iterable of lines) to sample from. The sample
            size is taken from ``self._sample``.

        Returns
        -------
        result : ``List[str]``
            ``self._sample`` lines of the file, in random order.

        Raises
        ------
        ValueError
            If the file holds fewer than ``self._sample`` lines.
        """
        file_iterator = iter(file_)
        try:
            result = [next(file_iterator) for _ in range(self._sample)]
        except StopIteration:
            raise ValueError("Sample larger than population") from None

        for index, item in enumerate(file_iterator, start=self._sample):
            # ``np.random.randint`` excludes its upper bound (unlike the stdlib
            # ``random.randint`` used by the referenced recipe), so the bound
            # must be ``index + 1`` for the draw to be uniform over [0, index].
            # With ``index`` as the bound, the first line after the reservoir
            # always replaced an entry and the sample was biased.
            sample_index = np.random.randint(0, index + 1)
            if sample_index < self._sample:
                result[sample_index] = item

        np.random.shuffle(result)
        return result

    @overrides
    def _read(self, file_path):
        with open(cached_path(file_path), "r") as data_file:
            if self._sample is not None:
                # NOTE(review): sampled lines are flagged ``is_labeled=False``
                # while a full read flags them ``True`` - kept as found,
                # confirm this is intended.
                lines = [(item, False)
                         for item in self._reservoir_sampling(data_file)]
            else:
                lines = [(item, True) for item in data_file.readlines()]

        if self._unlabeled_data_path:
            with open(cached_path(self._unlabeled_data_path)) as data_file:
                lines += [(item, False) for item in data_file.readlines()]

        for line, is_labeled in lines:
            # Blank lines (e.g. a trailing newline at the end of the file) are
            # not JSON; skip them instead of failing in json.loads().
            if not line.strip():
                continue
            items = json.loads(line)
            text = items["tokens"]
            label = str(items['category'])
            instance = self.text_to_instance(text=text,
                                             label=label,
                                             is_labeled=is_labeled)
            if instance is not None:
                yield instance

    def _truncate(self, tokens):
        """
        Truncate a list of tokens to at most ``self._sequence_length`` items.
        """
        if len(tokens) > self._sequence_length:
            tokens = tokens[:self._sequence_length]
        return tokens

    @overrides
    def text_to_instance(self,
                         text: str,
                         label: str = None,
                         is_labeled: bool = False) -> Instance:  # type: ignore
        """
        Parameters
        ----------
        text : ``str``, required.
            The text to classify
        label : ``str``, optional, (default = None).
            The label for this text.
        is_labeled : ``bool``, optional, (default = False).
            Stored in the instance metadata under ``"is_labeled"``.

        Returns
        -------
        An ``Instance`` containing the following fields:
            tokens : ``TextField`` (a ``ListField`` of ``TextField`` s when
                sentences are segmented)
                The tokens in the sentence or phrase.
            filtered_tokens : ``TextField``
                Only when an unrestricted tokenizer is configured.
            label : ``LabelField``
                The label of the sentence or phrase, when one is given.
            metadata : ``MetadataField``
                ``{"is_labeled": is_labeled}``.
        """
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        if self._segment_sentences:
            sentences: List[Field] = []
            sentence_splits = self._sentence_segmenter.split_sentences(text)
            for sentence in sentence_splits:
                word_tokens = self._tokenizer.tokenize(sentence)
                if self._sequence_length is not None:
                    word_tokens = self._truncate(word_tokens)
                sentences.append(TextField(word_tokens, self._token_indexers))
            fields['tokens'] = ListField(sentences)
        else:
            tokens = self._tokenizer.tokenize(text)
            if self._sequence_length is not None:
                tokens = self._truncate(tokens)
            fields['tokens'] = TextField(tokens, self._token_indexers)
            # NOTE(review): the nesting of this branch was reconstructed from
            # whitespace-collapsed source; confirm ``filtered_tokens`` is only
            # meant for the non-segmented path.
            if self._unrestricted_tokenizer:
                unrestricted_tokens = self._unrestricted_tokenizer.tokenize(
                    text)
                if self._sequence_length is not None:
                    unrestricted_tokens = self._truncate(unrestricted_tokens)
                fields['filtered_tokens'] = TextField(unrestricted_tokens,
                                                      self._token_indexers)
        # TODO: Document 'default' unsupervised label as pre-condition.
        if label is not None:
            fields['label'] = LabelField(
                label, skip_indexing=self._skip_label_indexing)
        fields['metadata'] = MetadataField({"is_labeled": is_labeled})
        return Instance(fields)
def main(args):
    """
    Bucket story contexts by a numeric attribute and write, for every story
    that has candidates in all buckets, one random candidate per bucket.

    Keys read from ``args``: ``attribute_to_use``, ``buckets`` (strings of the
    form ``"lower:upper"``), ``source_json``, ``target_json`` and
    ``min_word_length``.
    """
    print(f"Arguments: {args}")

    attribute_key = args["attribute_to_use"]
    tokenizer = WordTokenizer()
    splitter = SpacySentenceSplitter()

    # Parse the "lower:upper" specifications into half-open ranges.
    bounds = []
    for spec in args['buckets']:
        lower, upper = spec.split(':')
        bounds.append(
            BucketTuple(lower_bound=float(lower), upper_bound=float(upper)))
    buckets_map = OrderedDict(enumerate(bounds))

    story_buckets_map = defaultdict(lambda: defaultdict(list))
    story_text_map = defaultdict(lambda: defaultdict(list))

    with jsonlines.open(args["source_json"], mode='r') as reader:
        for json_obj in reader:
            metadata = json_obj["metadata"]
            story_id = metadata["story_id"]
            source_text = metadata["source_text"]
            target_text = metadata["target_text"]

            source_len = len(tokenizer.tokenize(source_text))
            target_len = len(tokenizer.tokenize(target_text))

            # Reconstruct the complete text of the story for completeness.
            story_text = story_text_map[story_id]["text"]
            if json_obj["metadata"]["absolute_position"] == 1:
                story_text.extend(splitter.split_sentences(source_text))
            # NOTE(review): nesting reconstructed from whitespace-collapsed
            # source; this append is taken to run for every record.
            story_text.append(source_text)

            value = float(json_obj[attribute_key])

            # TODO: Restrict to in length.
            for index, bucket in buckets_map.items():
                if not (bucket.lower_bound <= value < bucket.upper_bound):
                    continue
                if source_len < args["min_word_length"] \
                        or target_len < args["min_word_length"]:
                    continue
                story_buckets_map[story_id][index].append(json_obj)

    with jsonlines.open(args["target_json"], mode='w') as writer:
        for story_id, candidates in story_buckets_map.items():
            # If at least one from each of the buckets is in the story then randomly select one.
            empty_buckets = [
                b for b in buckets_map.keys() if len(candidates[b]) == 0
            ]
            if empty_buckets:
                continue

            selection = []
            for index, contexts in candidates.items():
                picked = random.choice(contexts)
                picked["bucket"] = index
                selection.append(picked)
            random.shuffle(selection)

            task_map = {
                "story_id": story_id,
                "all_story_text": story_text_map[story_id],
                "selection": selection
            }
            print(task_map)
            writer.write(task_map)
class ICCDatasetReader(DatasetReader):
    """
    Reads a text classification dataset stored as one ``text<TAB>label`` pair
    per line.

    The output of ``read`` is a list of ``Instance`` s with the fields:
        tokens : ``TextField`` (a ``ListField`` of ``TextField`` s when
            ``segment_sentences`` is set) and
        label : ``LabelField``

    Parameters
    ----------
    token_indexers : ``Dict[str, TokenIndexer]``, optional
        optional (default=``{"tokens": SingleIdTokenIndexer()}``)
    tokenizer : ``Tokenizer``, optional
        Accepted but not used: the reader always tokenizes with
        ``DummyTokenizer()``.
    segment_sentences : ``bool``, optional (default = ``False``)
        If True, the text is first split into sentences with
        ``SpacySentenceSplitter`` and every sentence is tokenized separately.
    max_sequence_length : ``int``, optional (default = ``None``)
        If specified, will truncate tokens to specified maximum length.
    skip_label_indexing : ``bool``, optional (default = ``False``)
        Whether or not to skip label indexing. When set, the labels read from
        the file must be integers.
    lazy : ``bool``, optional, (default = ``False``)
        Whether or not instances can be read lazily.
    """

    def __init__(
            self,
            token_indexers: Dict[str, TokenIndexer] = None,
            tokenizer: Tokenizer = None,
            segment_sentences: bool = False,
            max_sequence_length: int = None,
            skip_label_indexing: bool = False,
            lazy: bool = False,
    ) -> None:
        super().__init__(lazy=lazy)
        self._tokenizer = DummyTokenizer()  # assumes our tokens unchanged
        self._segment_sentences = segment_sentences
        self._max_sequence_length = max_sequence_length
        self._skip_label_indexing = skip_label_indexing
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
        if self._segment_sentences:
            self._sentence_segmenter = SpacySentenceSplitter()

    @overrides
    def _read(self, file_path):
        with open(cached_path(file_path)) as data_file:
            for line in data_file:
                try:
                    text, label = line.strip().split("\t")
                except ValueError:
                    # The line does not hold exactly two tab-separated columns
                    # (blank line, missing label, extra tab): skip it.
                    continue
                if self._skip_label_indexing:
                    # A label read from the file is always a string, but an
                    # un-indexed ``LabelField`` needs an integer; without this
                    # conversion ``skip_label_indexing=True`` could never work.
                    try:
                        label = int(label)
                    except ValueError as err:
                        raise ValueError(
                            "Labels must be integers if skip_label_indexing is True."
                        ) from err
                instance = self.text_to_instance(text=text, label=label)
                if instance is not None:
                    yield instance

    def _truncate(self, tokens):
        """
        Truncate a list of tokens to at most ``self._max_sequence_length`` items.
        """
        if len(tokens) > self._max_sequence_length:
            tokens = tokens[:self._max_sequence_length]
        return tokens

    @overrides
    def text_to_instance(self, text: str,
                         label: Union[str, int] = None) -> Instance:
        """
        Parameters
        ----------
        text : ``str``, required.
            The text to classify
        label : ``str`` or ``int``, optional, (default = None).
            The label for this text.

        Returns
        -------
        An ``Instance`` with a ``tokens`` field and, when ``label`` is given,
        a ``label`` field.
        """
        fields: Dict[str, Field] = {}
        if self._segment_sentences:
            sentences: List[Field] = []
            sentence_splits = self._sentence_segmenter.split_sentences(text)
            for sentence in sentence_splits:
                word_tokens = self._tokenizer.tokenize(sentence)
                if self._max_sequence_length is not None:
                    word_tokens = self._truncate(word_tokens)
                sentences.append(TextField(word_tokens, self._token_indexers))
            fields["tokens"] = ListField(sentences)
        else:
            tokens = self._tokenizer.tokenize(text)
            if self._max_sequence_length is not None:
                tokens = self._truncate(tokens)
            fields["tokens"] = TextField(tokens, self._token_indexers)
        if label is not None:
            fields["label"] = LabelField(
                label, skip_indexing=self._skip_label_indexing)
        return Instance(fields)
class TextClassificationJsonReader(DatasetReader):
    """
    Reads tokens and their labels from a labeled text classification dataset.
    Expects a "text" field and a "label" field in JSON format.

    The output of ``read`` is a list of ``Instance`` s with the fields:
        tokens : ``TextField`` and
        label : ``LabelField``

    # Parameters

    token_indexers : ``Dict[str, TokenIndexer]``, optional
        optional (default=``{"tokens": SingleIdTokenIndexer()}``)
        We use this to define the input representation for the text.
        See :class:`TokenIndexer`.
    tokenizer : ``Tokenizer``, optional (default = ``SpacyTokenizer()``)
        Tokenizer to use to split the input text into words or other kinds of tokens.
    segment_sentences : ``bool``, optional (default = ``False``)
        If True, we will first segment the text into sentences using SpaCy and then tokenize words.
        Necessary for some models that require pre-segmentation of sentences, like the Hierarchical
        Attention Network (https://www.cs.cmu.edu/~hovy/papers/16HLT-hierarchical-attention-networks.pdf).
    max_sequence_length : ``int``, optional (default = ``None``)
        If specified, will truncate tokens to specified maximum length.
    skip_label_indexing : ``bool``, optional (default = ``False``)
        Whether or not to skip label indexing. You might want to skip label indexing if your
        labels are numbers, so the dataset reader doesn't re-number them starting from 0.
    lazy : ``bool``, optional, (default = ``False``)
        Whether or not instances can be read lazily.
    """

    def __init__(
            self,
            token_indexers: Dict[str, TokenIndexer] = None,
            tokenizer: Tokenizer = None,
            segment_sentences: bool = False,
            max_sequence_length: int = None,
            skip_label_indexing: bool = False,
            lazy: bool = False,
    ) -> None:
        super().__init__(lazy=lazy)
        self._tokenizer = tokenizer or SpacyTokenizer()
        self._segment_sentences = segment_sentences
        self._max_sequence_length = max_sequence_length
        self._skip_label_indexing = skip_label_indexing
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
        if self._segment_sentences:
            self._sentence_segmenter = SpacySentenceSplitter()

    @overrides
    def _read(self, file_path):
        with open(cached_path(file_path), "r") as data_file:
            # Iterate the file directly instead of readlines() so the whole
            # file is not held in memory at once.
            for line in data_file:
                # A blank line is read as "\n", which is truthy; strip it
                # before the test so it is skipped rather than handed to
                # json.loads().
                if not line.strip():
                    continue
                items = json.loads(line)
                text = items["text"]
                label = items.get("label", None)
                if label is not None:
                    if self._skip_label_indexing:
                        try:
                            label = int(label)
                        except ValueError as err:
                            raise ValueError(
                                "Labels must be integers if skip_label_indexing is True."
                            ) from err
                    else:
                        label = str(label)
                instance = self.text_to_instance(text=text, label=label)
                if instance is not None:
                    yield instance

    def _truncate(self, tokens):
        """
        Truncate a list of tokens to at most ``self._max_sequence_length`` items.
        """
        if len(tokens) > self._max_sequence_length:
            tokens = tokens[:self._max_sequence_length]
        return tokens

    @overrides
    def text_to_instance(
            self, text: str,
            label: Union[str, int] = None) -> Instance:  # type: ignore
        """
        # Parameters

        text : ``str``, required.
            The text to classify
        label : ``str`` or ``int``, optional, (default = None).
            The label for this text.

        # Returns

        An ``Instance`` containing the following fields:
            tokens : ``TextField`` (a ``ListField`` of ``TextField`` s when
                ``segment_sentences`` is set)
                The tokens in the sentence or phrase.
            label : ``LabelField``
                The label of the sentence or phrase, when one is given.
        """
        fields: Dict[str, Field] = {}
        if self._segment_sentences:
            sentences: List[Field] = []
            sentence_splits = self._sentence_segmenter.split_sentences(text)
            for sentence in sentence_splits:
                word_tokens = self._tokenizer.tokenize(sentence)
                if self._max_sequence_length is not None:
                    word_tokens = self._truncate(word_tokens)
                sentences.append(TextField(word_tokens, self._token_indexers))
            fields["tokens"] = ListField(sentences)
        else:
            tokens = self._tokenizer.tokenize(text)
            if self._max_sequence_length is not None:
                tokens = self._truncate(tokens)
            fields["tokens"] = TextField(tokens, self._token_indexers)
        if label is not None:
            fields["label"] = LabelField(
                label, skip_indexing=self._skip_label_indexing)
        return Instance(fields)
class SquadReader(DatasetReader):
    """
    Reads a JSON-formatted SQuAD file and yields one ``Instance`` per question.

    The paragraph of every question is split into sentences, each sentence is
    tokenized, and the sentences are concatenated (without separators) into
    the passage text. Every sentence that produced tokens gets a token span
    and a 0/1 label telling whether any gold answer text occurs in it. The
    instance itself is assembled by ``make_reading_comprehension_instance``
    from the tokenized question, the passage tokens, the sentence spans, the
    sentence labels, the answer texts and the passage token offsets.

    Parameters
    ----------
    tokenizer : ``Tokenizer``, optional (default=``WordTokenizer()``)
        We use this ``Tokenizer`` for both the question and the passage.  See :class:`Tokenizer`.
    token_indexers : ``Dict[str, TokenIndexer]``, optional
        We similarly use this for both the question and the passage.  See :class:`TokenIndexer`.
        Default is ``{"tokens": SingleIdTokenIndexer()}``.
    lazy : ``bool``, optional (default=False)
        Passed to ``DatasetReader``.
    passage_length_limit : ``int``, optional (default=None)
        Stored on the reader; the code in this class does not apply it.
    question_length_limit : ``int``, optional (default=None)
        Stored on the reader; the code in this class does not apply it.
    skip_invalid_examples: ``bool``, optional (default=False)
        Stored on the reader; the code in this class does not apply it.
    """

    def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 lazy: bool = False,
                 passage_length_limit: int = None,
                 question_length_limit: int = None,
                 skip_invalid_examples: bool = False) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()
        self._sentence_splitter = SpacySentenceSplitter()
        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
        self.passage_length_limit = passage_length_limit
        self.question_length_limit = question_length_limit
        self.skip_invalid_examples = skip_invalid_examples

    def _tokenize_sentences(self, sentences):
        """
        Tokenize ``sentences`` and lay them out back to back.

        Returns ``(passage_text, passage_tokens, passage_offsets, spans)``
        where ``spans`` holds one ``(sentence, char_start, char_end)`` triple
        for every sentence that produced at least one token.
        """
        pieces = []
        passage_tokens = []
        passage_offsets = []
        spans = []
        shift = 0  # length of the passage text built so far
        for sent in sentences:
            tokens = [Token(text=tk.text, idx=tk.idx)
                      for tk in self._tokenizer.tokenize(sent)]
            # Token indices are relative to the sentence; shift them so that
            # the offsets address the concatenated passage text.
            offsets = [(tk.idx + shift, tk.idx + len(tk.text) + shift)
                       for tk in tokens]
            passage_tokens.extend(tokens)
            passage_offsets.extend(offsets)
            pieces.append(sent)
            shift += len(sent)
            if offsets:
                spans.append((sent, offsets[0][0], offsets[-1][1]))
        return "".join(pieces), passage_tokens, passage_offsets, spans

    @overrides
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        logger.info("Reading file at %s", file_path)
        with open(file_path) as dataset_file:
            dataset_json = json.load(dataset_file)
            dataset = dataset_json['data']
        logger.info("Reading the dataset")

        total = 0  # labelled sentences seen, counted once per question
        find = 0   # ... of which contain a gold answer text
        for article in dataset:
            for paragraph_json in article['paragraphs']:
                paragraph = paragraph_json["context"]
                sentences = self._sentence_splitter.split_sentences(paragraph)
                # The passage only depends on the paragraph, so it is
                # tokenized once here instead of once per question.
                passage_text, passage_tokens, passage_offsets, spans = \
                    self._tokenize_sentences(sentences)
                char_spans = [(start, end) for _, start, end in spans]

                for question_answer in paragraph_json['qas']:
                    question_text = question_answer["question"].strip().replace("\n", "")
                    answer_texts = [answer['text'] for answer in question_answer['answers']]

                    # NOTE(review): only sentences that produced tokens are
                    # labelled, which keeps labels aligned with the spans; the
                    # nesting was reconstructed from whitespace-collapsed
                    # source - confirm.
                    sent_labels = [
                        1 if any(ans in sent for ans in answer_texts) else 0
                        for sent, _, _ in spans
                    ]
                    find += sum(sent_labels)
                    total += len(sent_labels)

                    # Every instance gets its own list objects; the Token
                    # objects themselves are shared between the questions of
                    # one paragraph.
                    instance = self.text_to_instance(question_text,
                                                     passage_text,
                                                     list(char_spans),
                                                     sent_labels,
                                                     answer_texts,
                                                     list(passage_tokens),
                                                     list(passage_offsets))
                    if instance is not None:
                        yield instance
        # An empty dataset has no sentences; skip the ratio instead of
        # dividing by zero.
        if total:
            print("percentage:", float(find) / float(total))

    @overrides
    def text_to_instance(self,  # type: ignore
                         question_text: str,
                         passage_text: str,
                         char_spans_sent: List[Tuple[int, int]] = None,
                         sent_labels: List[int] = None,
                         answer_texts: List[str] = None,
                         passage_tokens: List[Token] = None,
                         passage_offsets: List[Tuple] = None) -> Instance:
        """
        Build the instance for one question.

        Parameters
        ----------
        question_text : ``str``
            The question; it is tokenized here.
        passage_text : ``str``
            The passage text the offsets refer to.
        char_spans_sent : iterable of ``(start, end)`` character spans, optional
            One span per labelled sentence; converted to token spans with
            ``util.char_span_to_token_span``.
        sent_labels, answer_texts, passage_tokens, passage_offsets
            Passed through to ``make_reading_comprehension_instance``.
        """
        token_spans_sent: List[Tuple[int, int]] = []
        # ``None`` (the default) is treated as "no sentence spans".
        for char_span in (char_spans_sent or []):
            char_start, char_end = char_span
            (span_start_sent, span_end_sent), _ = util.char_span_to_token_span(
                passage_offsets, (char_start, char_end))
            token_spans_sent.append((span_start_sent, span_end_sent))

        tokenized_ques = [Token(text=tk.text, idx=tk.idx)
                          for tk in self._tokenizer.tokenize(question_text)]

        return make_reading_comprehension_instance(tokenized_ques,
                                                   passage_tokens,
                                                   self._token_indexers,
                                                   passage_text,
                                                   token_spans_sent,
                                                   sent_labels,
                                                   answer_texts,
                                                   passage_offsets)