def consolidate_NER_results(final_sequences, text):
    """
    Aligns the sequences returned by the NER function with character spans in
    the original text.

    :param final_sequences: sequences returned from the NER function; each
        sequence is a list of (token, label) tuples.
    :param text: the full text of the article
    :return: a list of tuples in the format (token, label, span_begin, span_end)
    """
    tokens = []
    for sequence in final_sequences:
        for token_label in sequence:
            tokens.append(token_label[0])
    spans = align_tokens(tokens, text)
    fin = []
    multiplier = 0
    for i in range(len(final_sequences)):
        if i > 0:
            multiplier += len(final_sequences[i - 1])
        for j in range(len(final_sequences[i])):
            token = final_sequences[i][j][0]
            label = final_sequences[i][j][1]
            span_min = spans[multiplier + j][0]
            span_max = spans[multiplier + j][1]
            fin.append((token, label, span_min, span_max))
    return fin

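# Usage sketch for consolidate_NER_results (not from the original source): the
# NER output below is hypothetical, and align_tokens is assumed to be
# nltk.tokenize.util.align_tokens, as in the other snippets here.
from nltk.tokenize.util import align_tokens

text = "Barack Obama visited Paris."
final_sequences = [
    [("Barack", "B-PER"), ("Obama", "I-PER")],
    [("visited", "O"), ("Paris", "B-LOC"), (".", "O")],
]
for token, label, begin, end in consolidate_NER_results(final_sequences, text):
    # Each recovered span maps back onto the original text.
    assert text[begin:end] == token
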
def span_tokenize(self, text):
    """
    Uses the post-hoc nltk.tokens.align_tokens to return the offset spans.

        >>> from nltk.tokenize import TreebankWordTokenizer
        >>> s = '''Good muffins cost $3.88\\nin New (York).  Please (buy) me\\ntwo of them.\\n(Thanks).'''
        >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
        ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
        ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
        ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
        >>> TreebankWordTokenizer().span_tokenize(s) == expected
        True
        >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
        ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
        ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
        >>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected
        True
    """
    raw_tokens = self.tokenize(text)

    # Convert converted quotes back to original double quotes.
    # Do this only if the original text contains double quote(s) or double
    # single-quotes. Note: the original snippet used the character-class
    # regex r'[(``)(\'\')(")]+', which also matches parentheses and can pop
    # the wrong match; the alternation below is the corrected pattern.
    if ('"' in text) or ("''" in text):
        # Find double quotes and converted quotes
        matched = [m.group() for m in re.finditer(r"``|'{2}|\"", text)]
        # Replace converted quotes back to double quotes
        tokens = [matched.pop(0) if tok in ['"', "``", "''"] else tok
                  for tok in raw_tokens]
    else:
        tokens = raw_tokens

    return align_tokens(tokens, text)

def getWhitespaceTokens(file_path):
    with open(file_path, "r") as file:
        raw_text = file.read()
    ## Replace all "=" signs with spaces before tokenizing. Since this is a
    ## one-to-one character mapping, offsets into `text` remain valid for
    ## `raw_text`.
    text = raw_text.translate(str.maketrans("=", ' '))
    ## Tokenize the sentences
    sentences = sent_tokenize(text)
    ## Get spans of the sentences
    sent_spans = align_tokens(sentences, text)
    ## Create empty arrays for whitespace tokens and sentence delimiters
    tokenized_text = []
    text_spans = []
    ## Loop through each sentence and get the tokens and token spans
    for s in range(len(sentences)):
        # Get the tokens and token spans within the sentence
        toks = WhitespaceTokenizer().tokenize(sentences[s])
        span_generator = WhitespaceTokenizer().span_tokenize(sentences[s])
        rel_spans = [span for span in span_generator]
        # Convert the relative spans into absolute spans
        abs_spans = []
        for start, end in rel_spans:
            abs_spans.append((sent_spans[s][0] + start, sent_spans[s][0] + end))
        tokenized_text.extend(toks)
        text_spans.extend(abs_spans)
    ## Now we have the token list and the spans; continue finding sentence
    ## boundaries as before.
    tags = nltk.pos_tag(tokenized_text)
    sent_boundaries = [0] * len(tokenized_text)
    ## Figure out which tokens are at the end of a sentence
    tok_counter = 0
    for s in range(len(sentences)):
        sent = sentences[s]
        if "\n" in sent:
            sent_newline = sent.split("\n")
            for sn in sent_newline:
                sent_split = WhitespaceTokenizer().tokenize(sn)
                nw_idx = len(sent_split) + tok_counter - 1
                sent_boundaries[nw_idx] = 1
                tok_counter += len(sent_split)
        else:
            sent_split = WhitespaceTokenizer().tokenize(sent)
            nw_idx = len(sent_split) + tok_counter - 1
            sent_boundaries[nw_idx] = 1
            tok_counter += len(sent_split)
    return raw_text, text, tokenized_text, text_spans, tags, sent_boundaries

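# Why the "=" replacement above is span-safe (a quick check, not from the
# original source): str.maketrans("=", " ") maps one character to one
# character, so the cleaned text keeps the length of raw_text and any span
# computed on `text` indexes correctly into `raw_text` as well.
raw = "dose=5mg"
clean = raw.translate(str.maketrans("=", " "))
assert len(clean) == len(raw) and clean == "dose 5mg"
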
def tokenize(self):
    sents = self.sent_tokenizer.tokenize(self.text)
    sent_spans = self.sent_tokenizer.span_tokenize(self.text)
    tokens = [self.tokenizer.tokenize(sent) for sent in sents]
    idxs = [
        align_tokens(['"' if x in ['``', "''"] else x for x in toks], sent)
        for sent, toks in zip(sents, tokens)
    ]
    return sents, tokens, idxs, sent_spans

def custom_span_tokenize(text, language='english', preserve_line=False):
    tokens = custom_word_tokenize(text)
    tokens = ['"' if tok in ['``', "''"] else tok for tok in tokens]
    return align_tokens(tokens, text)

# print(custom_span_tokenize("He was a 47-year-old man born on 10/12/1975. His phone number is 170-574-2276"))

def span_tokenize(self, string):
    if self.__tokenizer == 'nltk':
        raw_tokens = nltk.word_tokenize(string)
        if ('"' in string) or ("''" in string):
            matched = [m.group() for m in re.finditer(r"``|'{2}|\"", string)]
            tokens = [matched.pop(0) if tok in ['"', "``", "''"] else tok
                      for tok in raw_tokens]
        else:
            tokens = raw_tokens
        spans = align_tokens(tokens, string)
        return spans

def span_tokenize(self, text):
    """
    Uses the post-hoc nltk.tokens.align_tokens to return the offset spans.

        >>> from nltk.tokenize import TreebankWordTokenizer
        >>> s = '''Good muffins cost $3.88\\nin New (York).  Please (buy) me\\ntwo of them.\\n(Thanks).'''
        >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
        ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
        ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
        ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
        >>> list(TreebankWordTokenizer().span_tokenize(s)) == expected
        True
        >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
        ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
        ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
        >>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected
        True

    Additional example:

        >>> from nltk.tokenize import TreebankWordTokenizer
        >>> s = '''I said, "I'd like to buy some ''good muffins" which cost $3.88\\n each in New (York)."'''
        >>> expected = [(0, 1), (2, 6), (6, 7), (8, 9), (9, 10), (10, 12),
        ... (13, 17), (18, 20), (21, 24), (25, 29), (30, 32), (32, 36),
        ... (37, 44), (44, 45), (46, 51), (52, 56), (57, 58), (58, 62),
        ... (64, 68), (69, 71), (72, 75), (76, 77), (77, 81), (81, 82),
        ... (82, 83), (83, 84)]
        >>> list(TreebankWordTokenizer().span_tokenize(s)) == expected
        True
        >>> expected = ['I', 'said', ',', '"', 'I', "'d", 'like', 'to',
        ... 'buy', 'some', "''", "good", 'muffins', '"', 'which', 'cost',
        ... '$', '3.88', 'each', 'in', 'New', '(', 'York', ')', '.', '"']
        >>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected
        True
    """
    raw_tokens = self.tokenize(text)

    # Convert converted quotes back to original double quotes.
    # Do this only if the original text contains double quote(s) or double
    # single-quotes (because '' might be transformed to `` if it is
    # treated as starting quotes).
    if ('"' in text) or ("''" in text):
        # Find double quotes and converted quotes
        matched = [m.group() for m in re.finditer(r"``|'{2}|\"", text)]
        # Replace converted quotes back to double quotes
        tokens = [
            matched.pop(0) if tok in ['"', "``", "''"] else tok
            for tok in raw_tokens
        ]
    else:
        tokens = raw_tokens

    for tok in align_tokens(tokens, text):
        yield tok

def __call__(self, doc, **kwargs):
    if doc.text is None:
        return doc
    if self.has_span_tokenize:
        spans = self.tokenizer.span_tokenize(doc.text)
    else:
        tks = self.tokenizer.tokenize(doc.text)
        spans = align_tokens(tks, doc.text)
    annset = doc.annset(self.out_set)
    for span in spans:
        annset.add(span[0], span[1], self.token_type)
    return doc

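# A minimal sketch of the fallback branch above, assuming an NLTK-style
# tokenizer without span_tokenize: tokenize() plus align_tokens recovers
# offsets, provided each token occurs verbatim and in order in the text.
from nltk.tokenize import WhitespaceTokenizer
from nltk.tokenize.util import align_tokens

text = "spans from plain tokens"
tks = WhitespaceTokenizer().tokenize(text)
print(align_tokens(tks, text))  # [(0, 5), (6, 10), (11, 16), (17, 23)]
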
def custom_span_tokenize(text, language='english', preserve_line=True):
    """
    Returns the spans of tokens in text.

    :param text: text to split into words
    :param language: the model name in the Punkt corpus
    :type language: str
    :param preserve_line: an option to preserve the sentence rather than
        sentence-tokenize it.
    :type preserve_line: bool
    """
    tokens = custom_word_tokenize(text)
    tokens = ['"' if tok in ['``', "''"] else tok for tok in tokens]
    return align_tokens(tokens, text)

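# Hedged usage sketch: custom_word_tokenize is not shown in these snippets, so
# this example substitutes nltk.word_tokenize, which performs the same
# `` / '' quote conversion that the back-substitution above undoes.
from nltk import word_tokenize
from nltk.tokenize.util import align_tokens

text = 'He said "hello" twice.'
tokens = ['"' if tok in ['``', "''"] else tok for tok in word_tokenize(text)]
spans = align_tokens(tokens, text)
assert all(text[b:e] == tok for tok, (b, e) in zip(tokens, spans))
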
def tokenize(text):
    sents = sent_tokenizer.tokenize(text)
    sent_spans = sent_tokenizer.span_tokenize(text)
    tokens = [tokenizer.tokenize(sent) for sent in sents]
    # Note: replacing the two-character `` / '' with the one-character "
    # shortens the sentence, so these spans index into the normalized
    # sentence, not the original one.
    idxs = [
        align_tokens(
            [x.replace('``', '"').replace("''", '"')
             if '``' in x or "''" in x else x
             for x in toks],
            sent.replace('``', '"').replace("''", '"'))
        for sent, toks in zip(sents, tokens)
    ]
    return sents, tokens, idxs, sent_spans

def _process(self, input_pack: DataPack):
    inputs = self.tokenizer(input_pack.text, return_tensors="pt")
    tokens = self.tokenizer.convert_ids_to_tokens(
        inputs['input_ids'][0].tolist())[1:-1]
    tokens_clean = [token.replace('##', '') if token.startswith('##') else token
                    for token in tokens]
    for i, (begin, end) in enumerate(
            align_tokens(tokens_clean, input_pack.text.lower())):
        subword = Subword(input_pack, begin, end)
        subword.is_subword = tokens[i].startswith('##')

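# Hedged sketch of the WordPiece alignment trick above, with a hand-written
# token list standing in for a real HuggingFace tokenizer: '##' prefixes are
# stripped so the pieces match the lowercased text, then align_tokens recovers
# character offsets for every subword.
from nltk.tokenize.util import align_tokens

text = "Tokenization rocks"
wordpieces = ['token', '##ization', 'rock', '##s']  # hypothetical BERT output
clean = [t[2:] if t.startswith('##') else t for t in wordpieces]
for piece, (begin, end) in zip(wordpieces, align_tokens(clean, text.lower())):
    print(piece, (begin, end), piece.startswith('##'))
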
def span_tokenize(self, text: str) -> Iterator[Tuple[int, int]]:
    r"""
    Returns the spans of the tokens in ``text``.
    Uses the post-hoc nltk.tokens.align_tokens to return the offset spans.

        >>> from nltk.tokenize import TreebankWordTokenizer
        >>> s = '''Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks).'''
        >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
        ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
        ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
        ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
        >>> list(TreebankWordTokenizer().span_tokenize(s)) == expected
        True
        >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
        ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
        ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
        >>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected
        True

    :param text: A string with a sentence or sentences.
    :type text: str
    :yield: Tuple[int, int]
    """
    raw_tokens = self.tokenize(text)

    # Convert converted quotes back to original double quotes.
    # Do this only if the original text contains double quote(s) or double
    # single-quotes (because '' might be transformed to `` if it is
    # treated as starting quotes).
    if ('"' in text) or ("''" in text):
        # Find double quotes and converted quotes
        matched = [m.group() for m in re.finditer(r"``|'{2}|\"", text)]
        # Replace converted quotes back to double quotes
        tokens = [
            matched.pop(0) if tok in ['"', "``", "''"] else tok
            for tok in raw_tokens
        ]
    else:
        tokens = raw_tokens

    yield from align_tokens(tokens, text)

def span_tokenize(self, text):
    """
    Uses the post-hoc nltk.tokens.align_tokens to return the offset spans.

        >>> from nltk.tokenize import TreebankWordTokenizer
        >>> s = '''Good muffins cost $3.88\\nin New (York).  Please (buy) me\\ntwo of them.\\n(Thanks).'''
        >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
        ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
        ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
        ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
        >>> TreebankWordTokenizer().span_tokenize(s) == expected
        True
        >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
        ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
        ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
        >>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected
        True
    """
    tokens = self.tokenize(text)
    return align_tokens(tokens, text)

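# Note on the variant above (illustration, not from the original source): it
# skips the quote back-conversion, so it fails on text containing double
# quotes. The tokenizer emits `` and '' tokens that never occur in the
# original string, and align_tokens raises ValueError:
from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize.util import align_tokens

text = 'She said "hi".'
tokens = TreebankWordTokenizer().tokenize(text)  # ['She', 'said', '``', 'hi', "''", '.']
try:
    align_tokens(tokens, text)
except ValueError as e:
    print("alignment failed:", e)
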
def span_tokenize(self, text):
    """
    Uses the post-hoc nltk.tokens.align_tokens to return the offset spans.

        >>> from nltk.tokenize import TreebankWordTokenizer
        >>> s = '''Good muffins cost $3.88\\nin New (York).  Please (buy) me\\ntwo of them.\\n(Thanks).'''
        >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
        ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
        ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
        ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
        >>> TreebankWordTokenizer().span_tokenize(s) == expected
        True
        >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
        ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
        ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
        >>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected
        True
    """
    raw_tokens = self.tokenize(text)

    # Convert converted quotes back to original double quotes.
    # Do this only if the original text contains double quote(s) or double
    # single-quotes. Note: the original snippet used the character-class
    # regex r'[(``)(\'\')(")]+', which also matches parentheses and can pop
    # the wrong match; the alternation below is the corrected pattern.
    if ('"' in text) or ("''" in text):
        # Find double quotes and converted quotes
        matched = [
            m.group() for m in re.finditer(r"``|'{2}|\"", text)
        ]
        # Replace converted quotes back to double quotes
        tokens = [
            matched.pop(0) if tok in ['"', "``", "''"] else tok
            for tok in raw_tokens
        ]
    else:
        tokens = raw_tokens

    return align_tokens(tokens, text)

def main(args):
    if len(args) < 3:
        sys.stderr.write(
            "Required arguments: <input directory> <rest host> <output directory>\n")
        sys.exit(-1)

    hostname = args[1]

    # initialize rest server
    init_url = 'http://%s:8000/temporal/initialize' % hostname
    process_url = 'http://%s:8000/temporal/process' % hostname

    # sentence segmenter
    rush = RuSH('conf/rush_rules.tsv')
    # tokenizer
    tokenizer = TreebankWordTokenizer()

    r = requests.post(init_url)
    if r.status_code != 200:
        sys.stderr.write('Error: rest init call was not successful\n')
        sys.exit(-1)

    combine_sentences = True
    token_threshold = 100

    for sub_dir, text_name, xml_names in anafora.walk(args[0], xml_name_regex):
        print("Processing filename: %s" % (text_name))
        if len(xml_names) > 1:
            sys.stderr.write('There were multiple valid xml files for file %s\n'
                             % (text_name))
            filtered_names = [xml_name for xml_name in xml_names
                              if 'Relation' in xml_name]
            if len(filtered_names) == 1:
                sys.stderr.write('Picking the file with "Relation" in the title: %s\n'
                                 % (filtered_names[0]))
                xml_names = filtered_names
            else:
                sys.exit(-1)
        xml_name = xml_names[0]

        section_texts = []
        sentences = []
        text = ''
        with open(os.path.join(args[0], sub_dir, text_name)) as f:
            cur_section = []
            cur_ind = 0
            section_start = 0
            for line in f.readlines():
                text += line
                line_len = len(line)
                line = line.rstrip()
                if (line.startswith('[meta')
                        or line.startswith('[start section')
                        or line.startswith('[end section')):
                    if len(cur_section) > 0:
                        section_text = '\n'.join(cur_section)
                        section_texts.append(section_text)
                        section_sents = rush.segToSentenceSpans(section_text)
                        if len(section_sents) > 0:
                            section_sents[0].text = '<section>'
                            #section_sents[-1].text = '</section>'
                        for section_sent in section_sents:
                            section_sent.begin += section_start
                            section_sent.end += section_start
                        sentences.extend(section_sents)
                    cur_section = []
                    section_start = cur_ind + line_len
                else:
                    cur_section.append(line)
                cur_ind += line_len
        #sentences = rush.segToSentenceSpans(text)

        sent_tokens = []
        merged_sentences = []
        if combine_sentences:
            for sentence_ind, sentence in enumerate(sentences):
                sent_txt = text[sentence.begin:sentence.end]
                if tb_tokenize:
                    raw_tokens = tokenizer.tokenize(sent_txt)
                    # From https://www.nltk.org/_modules/nltk/tokenize/treebank.html#TreebankWordTokenizer.span_tokenize
                    # Convert converted quotes back to original double quotes.
                    # Do this only if the original text contains double quote(s)
                    # or double single-quotes (because '' might be transformed
                    # to `` if it is treated as starting quotes).
                    if ('"' in sent_txt) or ("''" in sent_txt):
                        # Find double quotes and converted quotes
                        matched = [m.group()
                                   for m in re.finditer(r"``|'{2}|\"", sent_txt)]
                        # Replace converted quotes back to double quotes
                        tokens = [matched.pop(0) if tok in ['"', "``", "''"] else tok
                                  for tok in raw_tokens]
                    else:
                        tokens = raw_tokens
                else:
                    tokens = tokenize(sent_txt)

                # fix apostrophe s ('s) to be one token
                def fix_simple_tokenize(tokens):
                    new_tokens = []
                    ind = 0
                    while ind < len(tokens):
                        if (tokens[ind] == "'" and ind + 1 < len(tokens)
                                and tokens[ind + 1] == 's'):
                            new_tokens.append("'s")
                            ind += 2
                        else:
                            new_tokens.append(tokens[ind])
                            ind += 1
                    return new_tokens

                tokens = fix_simple_tokenize(tokens)
                if text[sentence.end] == '\n':
                    tokens.append('<cr>')
                # print("Sentence number %d has %d tokens" % (sentence_ind, len(tokens)))
                if (len(sent_tokens) > 0
                        and (len(sent_tokens[-1]) + len(tokens)) < token_threshold
                        and sentence.text == ''):
                    sent_tokens[-1].extend(tokens)
                    merged_sentences[-1].end = sentence.end
                else:
                    sent_tokens.append(tokens)
                    merged_sentences.append(sentence)

            for tokens in sent_tokens:
                while tokens[-1] == "<cr>":
                    tokens.pop()
            sentences = merged_sentences
        else:
            for sentence in sentences:
                sent_txt = text[sentence.begin:sentence.end]
                sent_tokens.append(tokenize(sent_txt))

        r = requests.post(process_url,
                          json={'sent_tokens': sent_tokens, 'metadata': text_name})
        if r.status_code != 200:
            sys.stderr.write('Error: rest call was not successful\n')
            sys.exit(-1)

        json = r.json()
        anafora_data = AnaforaData()
        cur_id = 0
        rel_id = 0

        for sent_ind, sentence in enumerate(sentences):
            sent_txt = text[sentence.begin:sentence.end]
            sent_events = json['events'][sent_ind]
            sent_timexes = json['timexes'][sent_ind]
            sent_rels = json['relations'][sent_ind]
            event_ids = []
            timex_ids = []

            meta_rev_loc = sent_txt.find('[meta rev_date')
            if meta_rev_loc >= 0:
                meta_rev_end = sent_txt.find(']', meta_rev_loc)
                meta_rev_loc += sentence.begin
                meta_rev_end += sentence.begin

            # Replace <cr> with the empty string so that tokens align again,
            # then after alignment add them back in so token offsets from the
            # classifier are correct.
            cr_token_inds = []
            num_crs_at_position = []
            for ind in range(len(sent_tokens[sent_ind])):
                num_crs_at_position.append(len(cr_token_inds))
                if sent_tokens[sent_ind][ind] == '<cr>':
                    cr_token_inds.append(ind)
                    sent_tokens[sent_ind][ind] = ''

            try:
                token_spans = align_tokens(sent_tokens[sent_ind], sent_txt)
            except Exception as e:
                sys.stderr.write(
                    'In document %s, error \n%s\n processing sentence:\n*****\n%s\n******\n'
                    % (text_name, str(e), sent_txt))
                sys.exit(-1)

            for event in sent_events:
                begin_token_ind = event['begin']
                end_token_ind = event['end']
                dtr = event['dtr']
                event_start_offset = token_spans[
                    begin_token_ind + num_crs_at_position[begin_token_ind]][0] + sentence.begin
                event_end_offset = token_spans[
                    end_token_ind + num_crs_at_position[end_token_ind]][1] + sentence.begin
                event_text = text[event_start_offset:event_end_offset]

                annot = AnaforaEntity()
                annot.id = str(cur_id) + "@e@" + text_name
                if event_text.endswith('_date'):
                    annot.properties['datesectiontime'] = 'True'
                    event_ids.append(-1)
                else:
                    event_ids.append(annot.id)
                annot.spans = ((event_start_offset, event_end_offset),)
                annot.type = "EVENT"
                annot.properties['DocTimeRel'] = dtr
                anafora_data.annotations.append(annot)
                cur_id += 1
                #print("Found event %s" % (event_text))

            for timex in sent_timexes:
                begin_token_ind = timex['begin']
                end_token_ind = timex['end']
                time_class = timex['timeClass']
                timex_start_offset = token_spans[
                    begin_token_ind + num_crs_at_position[begin_token_ind]][0] + sentence.begin
                timex_end_offset = token_spans[
                    end_token_ind + num_crs_at_position[end_token_ind]][1] + sentence.begin
                timex_text = text[timex_start_offset:timex_end_offset]

                if (meta_rev_loc >= 0 and timex_start_offset > meta_rev_loc
                        and timex_end_offset < meta_rev_end):
                    timex_ids.append(-1)
                elif time_class == 'SECTIONTIME':
                    timex_ids.append(-1)
                elif re.match(r'\d{5}', timex_text) is not None:
                    timex_ids.append(-1)
                else:
                    # create anafora entry
                    annot = AnaforaEntity()
                    annot.id = str(cur_id) + "@e@" + text_name
                    timex_ids.append(annot.id)
                    cur_id += 1
                    annot.spans = ((timex_start_offset, timex_end_offset),)
                    annot.type = "TIMEX3"
                    annot.properties['Class'] = time_class
                    anafora_data.annotations.append(annot)
                    #print("Found timex %s" % (timex_text))

            if 'path' not in text_name.lower():
                # no relations in pathology notes, so if we find any they are false positives.
                for rel in sent_rels:
                    arg1_type, arg1_ind = rel['arg1'].split('-')
                    arg2_type, arg2_ind = rel['arg2'].split('-')
                    if arg1_type == 'EVENT':
                        arg1 = event_ids[int(arg1_ind)]
                    elif arg1_type == 'TIMEX':
                        arg1 = timex_ids[int(arg1_ind)]
                    if arg1 == -1:
                        continue
                    if arg2_type == 'EVENT':
                        arg2 = event_ids[int(arg2_ind)]
                    elif arg2_type == 'TIMEX':
                        arg2 = timex_ids[int(arg2_ind)]
                    if arg2 == -1:
                        continue
                    reln = AnaforaRelation()
                    reln.id = str(rel_id) + '@r@' + text_name
                    rel_id += 1
                    reln.type = 'TLINK'
                    reln.properties['Type'] = rel['category']
                    reln.properties['Source'] = arg1
                    reln.properties['Target'] = arg2
                    anafora_data.annotations.append(reln)

        anafora_data.indent()
        os.makedirs(os.path.join(args[2], sub_dir), exist_ok=True)
        anafora_data.to_file(os.path.join(args[2], sub_dir, xml_name))

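# Hedged sketch of the <cr> bookkeeping in the script above: sentinel tokens
# are blanked to '' rather than removed, so align_tokens still succeeds, the
# blanks get zero-width spans, and the indices of the real tokens are
# preserved. (The num_crs_at_position counters in the script additionally
# shift indices that the classifier computed without the sentinels.)
from nltk.tokenize.util import align_tokens

sent_txt = "one two three"
sent_tokens = ['one', 'two', '<cr>', 'three']
aligned_input = ['' if t == '<cr>' else t for t in sent_tokens]
print(align_tokens(aligned_input, sent_txt))  # [(0, 3), (4, 7), (7, 7), (8, 13)]
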
def span_tokenize(self, string):
    if self.__tokenizer == 'jieba':
        tokens = self.tokenize(string)
        spans = align_tokens(tokens, string)
        return spans

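# Sketch for the jieba branch above (hypothetical input; requires the jieba
# package): jieba's default segmentation keeps every character of the input in
# order, so align_tokens can map the tokens straight back to offsets.
import jieba
from nltk.tokenize.util import align_tokens

text = "我来到北京"
print(align_tokens(jieba.lcut(text), text))  # spans depend on jieba's dictionary
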
def align_tokens(self):
    # Cache the computed spans so repeated calls do not realign.
    if self.tokens_spans is None:
        self.tokens_spans = align_tokens([t.word for t in self.tokens], self.text)
    return self.tokens_spans

def main(args):
    if len(args) < 3:
        sys.stderr.write(
            "Required arguments: <input directory> <rest host> <output directory>\n")
        sys.exit(-1)

    hostname = args[1]

    # initialize rest server
    init_url = 'http://%s:8000/temporal/initialize' % hostname
    process_url = 'http://%s:8000/temporal/process' % hostname

    # sentence segmenter
    rush = RuSH('conf/rush_rules.tsv')
    # tokenizer
    # tokenizer = TreebankWordTokenizer()

    r = requests.post(init_url)
    if r.status_code != 200:
        sys.stderr.write('Error: rest init call was not successful\n')
        sys.exit(-1)

    for sub_dir, text_name, xml_names in anafora.walk(args[0], xml_name_regex):
        print("Processing filename: %s" % (text_name))
        if len(xml_names) > 1:
            sys.stderr.write('There were multiple valid xml files for file %s\n'
                             % (text_name))
            sys.exit(-1)
        xml_name = xml_names[0]

        with open(os.path.join(args[0], sub_dir, text_name)) as f:
            text = f.read()

        sentences = rush.segToSentenceSpans(text)
        sent_tokens = []
        for sentence in sentences:
            sent_txt = text[sentence.begin:sentence.end]
            sent_tokens.append(tokenize(sent_txt))

        r = requests.post(process_url, json={'sent_tokens': sent_tokens})
        if r.status_code != 200:
            sys.stderr.write('Error: rest call was not successful\n')
            sys.exit(-1)

        json = r.json()
        anafora_data = AnaforaData()
        cur_id = 0

        for sent_ind, sentence in enumerate(sentences):
            sent_txt = text[sentence.begin:sentence.end]
            sent_events = json['events'][sent_ind]
            sent_timexes = json['timexes'][sent_ind]
            try:
                token_spans = align_tokens(sent_tokens[sent_ind], sent_txt)
            except Exception as e:
                sys.stderr.write(
                    'In document %s, error \n%s\n processing sentence:\n*****\n%s\n******\n'
                    % (text_name, str(e), sent_txt))
                sys.exit(-1)

            for event in sent_events:
                begin_token_ind = event['begin']
                end_token_ind = event['end']
                dtr = event['dtr']
                event_start_offset = token_spans[begin_token_ind][0] + sentence.begin
                event_end_offset = token_spans[end_token_ind][1] + sentence.begin
                event_text = text[event_start_offset:event_end_offset]

                annot = AnaforaEntity()
                annot.id = str(cur_id) + "@e@" + text_name
                cur_id += 1
                annot.spans = ((event_start_offset, event_end_offset),)
                annot.type = "EVENT"
                annot.properties['DocTimeRel'] = dtr
                anafora_data.annotations.append(annot)
                #print("Found event %s" % (event_text))

            for timex in sent_timexes:
                begin_token_ind = timex['begin']
                end_token_ind = timex['end']
                time_class = timex['timeClass']
                timex_start_offset = token_spans[begin_token_ind][0] + sentence.begin
                timex_end_offset = token_spans[end_token_ind][1] + sentence.begin
                timex_text = text[timex_start_offset:timex_end_offset]

                # create anafora entry
                annot = AnaforaEntity()
                annot.id = str(cur_id) + "@e@" + text_name
                cur_id += 1
                annot.spans = ((timex_start_offset, timex_end_offset),)
                annot.type = "TIMEX3"
                annot.properties['Class'] = time_class
                anafora_data.annotations.append(annot)
                #print("Found timex %s" % (timex_text))

        anafora_data.indent()
        os.makedirs(os.path.join(args[2], sub_dir), exist_ok=True)
        anafora_data.to_file(os.path.join(args[2], sub_dir, xml_name))

def get_usable_tokens(sent: str) -> List[TaggedToken]:
    # TODO: complexity is bad!
    # TODO: stricter linting rules?
    tokens = word_tokenize(sent)
    quote_pattern = r"``|''|\""
    quotes = [m.group() for m in re.finditer(quote_pattern, sent)]
    restored_tokens = [quotes.pop(0) if re.match(quote_pattern, tok) else tok
                       for tok in tokens]
    token_spans = align_tokens(restored_tokens, sent)
    # Note: the original annotation List[str, POS] is invalid typing syntax;
    # pos_tag returns a list of (token, tag) pairs.
    tagged: List[Tuple[str, POS]] = pos_tag(tokens)
    regulars: Dict[Span, POS] = {}
    for i, span in enumerate(token_spans):
        regulars[span] = (tagged[i][1]
                          if tagged[i][1] not in literalised_pos else 'LITERAL')
    word_spans = Corpifier.span_word(sent)
    irregulars = {}  # irregular span to index
    for i, s in enumerate(token_spans):
        if s not in word_spans:
            irregulars[s] = i

    def merge_irregular_spans(spans: Dict[Span, int]) -> Set[Span]:
        spans_t = spans.items()
        shift: List[Tuple[Span, List[int]]] = []  # spans of words and their indices
        for s, i in spans_t:
            if len(shift) == 0:
                shift.append((s, [i]))
                continue
            if s[0] == shift[len(shift) - 1][0][1]:
                prev = shift.pop(len(shift) - 1)
                prev_span, ind = prev[0], prev[1]
                new_span = (prev_span[0], s[1])
                ind.append(i)
                shift.append((new_span, ind))
            else:
                shift.append((s, [i]))
        ret: Set[Span] = set()
        for span, _ in shift:
            ret.add(span)
        return ret

    merged_irregulars = merge_irregular_spans(irregulars)
    ret: List[TaggedToken] = []
    for i, w_span in enumerate(word_spans):
        tag: POS
        word: str = sent[w_span[0]:w_span[1]]
        spaced = not (i < len(word_spans) - 1 and w_span[1] == word_spans[i + 1][0])
        if w_span in merged_irregulars:
            if word.lower() in apostrophised:
                tag = apostrophised[word.lower()]
            else:
                tag = 'LITERAL'
        elif w_span in regulars:
            tag = regulars[w_span]
        else:
            tag = 'LITERAL'
            # print("unexpected literal: " + word + " from " + sent)
        ret.append(TaggedToken(word, tag, spaced))
    return ret