def main(args):
    args = parser.parse_args()
    tagger = SequenceTagger.load_from_file(
        join(args.model_dir, 'best-model.pt'))
    txt_files = glob.glob(join(args.data_dir, '*.txt'))
    sent_splitter = PunktSentenceTokenizer()
    # tokenizer = TreebankWordTokenizer()
    for txt_fn in txt_files:
        print("Processing %s" % (basename(txt_fn)))
        ann_fn = join(args.output_dir, basename(txt_fn)[:-3] + 'ann')
        with open(txt_fn, 'r') as myfile:
            text = myfile.read()
        ann_out = open(ann_fn, 'w')
        ent_id = 0
        sent_spans = sent_splitter.span_tokenize(text)
        raw_offset = 0
        for sent_span in sent_spans:
            raw_offset = sent_span[0]
            sent = text[sent_span[0]:sent_span[1]]
            # tokens = tokenizer.tokenize(sent)
            # tagged = pos_tag(tokens)
            # flair_sent = Sentence(' '.join(tokens))
            flair_sent = Sentence(sent, use_tokenizer=True)
            ade_tagged = tagger.predict(flair_sent)
            cmap = raw_flair_charmap(sent, flair_sent.to_tokenized_string())
            # print('Sent is %s' % (sent))
            # print(ade_tagged[0].to_tagged_string())
            # Check for annotated drugs:
            drug_found = False
            for entity in ade_tagged[0].to_dict(tag_type='ner')['entities']:
                if entity['type'] == 'Drug':
                    drug_found = True
                    break
            if drug_found or not args.conservative:
                for entity in ade_tagged[0].to_dict(
                        tag_type='ner')['entities']:
                    start = entity['start_pos']
                    end = entity['end_pos']
                    raw_start = start  # cmap[start]
                    raw_end = end  # cmap[end]
                    # print('Mapped entity type %s(%s):(%d, %d) => (%d, %d)' %
                    #       (entity['type'], entity['text'], start, end,
                    #        raw_offset + raw_start, raw_offset + raw_end))
                    ann_out.write(
                        'T%d\t%s %d %d\t%s\n' %
                        (ent_id, entity['type'], raw_offset + raw_start,
                         raw_offset + raw_end, entity['text']))
                    ent_id += 1
            raw_offset += len(sent) + 1
def __get_setences_boundaries(self):
    """
    Tokenize the text into sentences and return the character boundaries
    of each sentence.

    :return: list of (start, end) tuples, one per sentence
    """
    tokenizer = PunktSentenceTokenizer(punkt_param)
    sentences = list(tokenizer.span_tokenize(self.text))
    return sentences
def analyse_entry(self, entry, params):
    chunker_type = params["delimiter"]
    original_text = entry['nif:isString']
    if chunker_type == "sentence":
        tokenizer = PunktSentenceTokenizer()
    if chunker_type == "paragraph":
        tokenizer = LineTokenizer()
    chars = list(tokenizer.span_tokenize(original_text))
    for i, chunk in enumerate(tokenizer.tokenize(original_text)):
        print(chunk)
        e = Entry()
        e['nif:isString'] = chunk
        if entry.id:
            e.id = entry.id + "#char={},{}".format(chars[i][0], chars[i][1])
        yield e
def split_into_sentences(text):
    # splits the text into sentences and also preserves the corresponding
    # starting and ending indices
    startIndices = []
    endIndices = []
    corpus = []
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(
        ['dr', 'doc', 'mr', 'mrs', 'prof', 'inc', 'mgr', 'ing', 'st'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    for start, end in sentence_splitter.span_tokenize(text):
        startIndices.append(start)
        endIndices.append(end)
        token = text[start:end]
        corpus.append(token)
    return startIndices, endIndices, corpus
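# Usage sketch (not part of the original code): the three lists returned by
# split_into_sentences() above are parallel, so each (start, end) pair slices
# the original text back to the matching sentence. A minimal check, assuming
# NLTK is installed and split_into_sentences() is defined as above:
if __name__ == '__main__':
    sample = "Dr. Smith arrived early. He left an hour later."
    starts, ends, sents = split_into_sentences(sample)
    for s, e, sent in zip(starts, ends, sents):
        # every span should reproduce the tokenized sentence exactly
        assert sample[s:e] == sent
        print((s, e), sent)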
def analyse_entry(self, entry, activity):
    yield entry
    chunker_type = activity.params["delimiter"]
    original_text = entry['nif:isString']
    if chunker_type == "sentence":
        tokenizer = PunktSentenceTokenizer()
    if chunker_type == "paragraph":
        tokenizer = LineTokenizer()
    chars = list(tokenizer.span_tokenize(original_text))
    if len(chars) == 1:
        # This sentence was already split
        return
    for i, chunk in enumerate(chars):
        start, end = chunk
        e = Entry()
        e['nif:isString'] = original_text[start:end]
        if entry.id:
            e.id = entry.id + "#char={},{}".format(start, end)
        yield e
def analyse_entry(self, entry, params):
    yield entry
    chunker_type = params["delimiter"]
    original_text = entry['nif:isString']
    if chunker_type == "sentence":
        tokenizer = PunktSentenceTokenizer()
    if chunker_type == "paragraph":
        tokenizer = LineTokenizer()
    chars = list(tokenizer.span_tokenize(original_text))
    if len(chars) == 1:
        # This sentence was already split
        return
    for i, chunk in enumerate(chars):
        start, end = chunk
        e = Entry()
        e['nif:isString'] = original_text[start:end]
        if entry.id:
            e.id = entry.id + "#char={},{}".format(start, end)
        yield e
def sent_tokenize(data, filter_threshold=None):
    '''
    Tokenizes a string into sentences and corresponding offsets

    Args:
        data(str): The document itself
        filter_threshold(int): if sentence length is less than this, it will
            be ignored (note: not applied in the current implementation)

    Returns:
        tuple(list(str), list(tuple)): tokenized sentences and corresponding
            offsets
    '''
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(
        ['dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'et', 'al', 'Fig', 'fig'])
    sent_detector = PunktSentenceTokenizer(punkt_param)
    sentences = sent_detector.tokenize(data)
    offsets = sent_detector.span_tokenize(data)
    return (sentences, offsets)
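# Note (not from the original source): in current NLTK releases,
# PunktSentenceTokenizer.span_tokenize() yields (start, end) pairs lazily, so
# callers that need to index the offsets or iterate them more than once may
# want to materialize them first. A minimal sketch of such a caller:
from nltk.tokenize.punkt import PunktParameters, PunktSentenceTokenizer

def sent_tokenize_materialized(data):
    params = PunktParameters()
    params.abbrev_types = {'dr', 'mr', 'mrs', 'prof', 'fig'}
    detector = PunktSentenceTokenizer(params)
    offsets = list(detector.span_tokenize(data))   # make it indexable/reusable
    sentences = [data[a:b] for a, b in offsets]    # slice, no re-tokenizing
    return sentences, offsets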
def marker_surr_patt(in_dir): """ Find most frequent POS tag patterns at the end of sentences. """ punkt_param = PunktParameters() abbreviation = ['al', 'fig', 'e.g', 'i.e', 'eq', 'cf', 'ref', 'refs'] punkt_param.abbrev_types = set(abbreviation) tokenizer = PunktSentenceTokenizer(punkt_param) file_names = os.listdir(in_dir) patt_comb_freq_map = {} patt_orig_freq_map = {} patt_comb_freq_map_cit = {} patt_orig_freq_map_cit = {} # num_sentences_total = 0 for file_idx, fn in enumerate(file_names): if file_idx % 100 == 0: print('{}/{}'.format(file_idx, len(file_names))) # print(num_sentences_total/(file_idx+1)) path = os.path.join(in_dir, fn) aid, ext = os.path.splitext(fn) if ext != '.txt' or aid == 'log': continue if re.search(r'[a-z]', aid): split = re.search(r'[a-z][0-9]', aid).span()[0] + 1 aid = aid[:split] + '/' + aid[split:] with open(path) as f: text = f.read() text = re.sub(E_G_PATT, 'e.g.', text) if not re.search(CITE_MULTI_PATT, text): continue marker = ' \u241F ' doc_len = len(text) for sent_idx, sent_edx in tokenizer.span_tokenize(text): cit_end = False sentence_orig = text[sent_idx:sent_edx] sentence = re.sub(CITE_MULTI_PATT, marker, sentence_orig) sentence = re.sub(QUOTE_PATT, ' {}.'.format(marker), sentence) words = pos_tag(sentence.split()) words = [w for w in words if re.search(r'[\w|\u241F]', w[0])] if len(words) == 0: continue if words[-1][0] == marker.strip(): cit_end = True words = words[:-1] if len(words) < 3: continue # num_sentences_total += 1 sent_len = len(words) patt_comb = [None, None, None, None] patt_orig = [None, None, None, None] for x_idx in range(sent_len - 3, sent_len + 1): patt_idx = x_idx - (sent_len - 3) if x_idx < 0 or \ x_idx >= len(words): patt_comb[patt_idx] = '<EOS>' patt_orig[patt_idx] = '<EOS>' continue wrd = words[x_idx][0] pos = words[x_idx][1] patt_orig[patt_idx] = pos if 'V' in pos: patt_comb[patt_idx] = 'V' elif pos in ['NN', 'NNS']: patt_comb[patt_idx] = 'NN' elif pos in ['NNP', 'NNPS']: patt_comb[patt_idx] = 'NNP' elif pos == 'IN': patt_comb[patt_idx] = 'IN' elif 'JJ' in pos: patt_comb[patt_idx] = 'JJ' elif 'W' in pos: patt_comb[patt_idx] = 'WH' elif 'RB' in pos: patt_comb[patt_idx] = 'ADV' elif 'PR' in pos: patt_comb[patt_idx] = 'PR' elif wrd == 'FORMULA': patt_comb[patt_idx] = 'FORMULA' elif wrd == 'FIGURE': patt_comb[patt_idx] = 'FIGURE' elif wrd == 'TABLE': patt_comb[patt_idx] = 'TABLE' else: patt_comb[patt_idx] = 'OTHER' comb_id = '¦'.join(patt_comb) orig_id = '¦'.join(patt_orig) if comb_id not in patt_comb_freq_map: patt_comb_freq_map[comb_id] = 0 patt_comb_freq_map[comb_id] += 1 if cit_end: if comb_id not in patt_comb_freq_map_cit: patt_comb_freq_map_cit[comb_id] = 0 patt_comb_freq_map_cit[comb_id] += 1 if orig_id not in patt_orig_freq_map: patt_orig_freq_map[orig_id] = 0 patt_orig_freq_map[orig_id] += 1 if cit_end: if orig_id not in patt_orig_freq_map_cit: patt_orig_freq_map_cit[orig_id] = 0 patt_orig_freq_map_cit[orig_id] += 1 if orig_id == 'RB¦JJ¦NNS¦<EOS>': print(sentence) print(fn) input() # if file_idx > 200: # break patt_comb_freq = sorted(patt_comb_freq_map.items(), key=operator.itemgetter(1), reverse=True) patt_orig_freq = sorted(patt_orig_freq_map.items(), key=operator.itemgetter(1), reverse=True) patt_comb_freq_cit = sorted(patt_comb_freq_map_cit.items(), key=operator.itemgetter(1), reverse=True) patt_orig_freq_cit = sorted(patt_orig_freq_map_cit.items(), key=operator.itemgetter(1), reverse=True) print('- - - C O M B - - -') for pid in patt_comb_freq[:25]: print(pid) print('- - - O R I G - - -') for pid in 
patt_orig_freq[:25]: print(pid) with open('sentence_comb.json', 'w') as f: json.dump(patt_comb_freq, f) with open('sentence_orig.json', 'w') as f: json.dump(patt_orig_freq, f) with open('marker_comb.json', 'w') as f: json.dump(patt_comb_freq_cit, f) with open('marker_orig.json', 'w') as f: json.dump(patt_orig_freq_cit, f)
def sent_tokenize(data):
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(
        ['dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'et', 'al', 'Fig', 'fig'])
    sent_detector = PunktSentenceTokenizer(punkt_param)
    sentences = sent_detector.tokenize(data)
    offsets = list(sent_detector.span_tokenize(data))
    new_sentences = deepcopy(sentences)
    new_offsets = deepcopy(offsets)
    for i, off in enumerate(offsets):
        # `tokenizer` is a module-level word tokenizer used only to count tokens
        if len(tokenizer.tokenize(sentences[i])) < 7:
            # Skip short sentences
            pass
        else:
            if i < len(offsets) - 1:
                if (offsets[i + 1][0] - offsets[i][1]) < 5:
                    new_sentences.append(sentences[i] + ' ' + sentences[i + 1])
                    new_offsets.append((offsets[i][0], offsets[i + 1][1]))
            if i < len(offsets) - 2:
                if ((offsets[i + 2][0] - offsets[i + 1][1]) < 5) and \
                        ((offsets[i + 1][0] - offsets[i][0]) < 5):
                    new_sentences.append(sentences[i] + ' ' + sentences[i + 1]
                                         + ' ' + sentences[i + 2])
                    new_offsets.append((offsets[i][0], offsets[i + 2][1]))
            # (disabled) merging of runs of four and five nearby sentences was
            # prototyped here in the same style
    print(new_offsets)
    return {'sentences': new_sentences, 'offsets': new_offsets}
def resolve_phrases(section, tokens, book, id): phrases = [] sentences = [] # find and resolve parantheses if book == "almizan_fa": if int(id.split("_")[0]) <= 2: html = section.html() replace = lambda start, end, oldtext, newtext: oldtext[:start] + newtext + oldtext[end:] # in chapter1, remove parantheses for ayas iter = re.finditer(r"(<span[^\n]*>)[ ]*\(([^\)s]*)\)[^\)]*(</span[^\n]*>)", html) for m in reversed(list(iter)): html = replace(m.start(), m.end(), html, m.group().replace("(", "").replace(")", "")) iter = re.finditer(r"\([^\)]{3,15}\)", html) for match in reversed(list(iter)): m = match.group()[1:-1] resolved = resolve_phrase(m, tokens, book[-2:]) if resolved: html = replace(match.start(), match.end(), html, '<em rel="{0}">{1}</em>'.format(resolved[0], m)) section.html(html) pst = PunktSentenceTokenizer() # resolve em elements for em in section.find("em").items(): resolved = resolve_phrase(em.text(), tokens, book[-2:]) if resolved: em.attr("rel", resolved[0]) phrases.append((em.text(), resolved[1], resolved[0])) paragraph = em.parent().html(method="html") for start, end in pst.span_tokenize(paragraph): if paragraph[start:end].find(em.outerHtml()) != -1: this_sentence = paragraph[start:end].lstrip() this_sentence = refine_sentence(this_sentence) while this_sentence.startswith("<code"): if this_sentence.find("</code>") != -1: new_start = this_sentence.find("</code>") + 7 this_sentence = this_sentence[new_start:].lstrip() this_sentence = refine_sentence(this_sentence) while this_sentence.startswith("<span"): new_start = this_sentence.find("</span>") + 7 this_sentence = this_sentence[new_start:].lstrip() this_sentence = refine_sentence(this_sentence) before = this_sentence.index(em.outerHtml()) after = len(this_sentence) - len(em.outerHtml()) - before em.attr("data-sentence", "{0}:{1}".format(before, after)) break sentences.append((em.text(), resolved[0], (before, after), [this_sentence])) else: phrases.append((em.text(),)) new_section = section.html(method="html") p = re.compile(r'<em rel="([^"]+)" data-sentence="([^"]+)">([^<]+)<\/em>') matched_list = [(m.start(0), m.end(0), m.group(1), m.group(2)) for m in re.finditer(p, new_section)] last_start = -1 for matched in reversed(matched_list): start_span = matched[0] - int(matched[3].split(":")[0]) end_span = matched[1] + int(matched[3].split(":")[1]) if start_span != last_start: new_section = ( new_section[:start_span] + '<span class="phrase">' + new_section[start_span:end_span] + "</span>" + new_section[end_span:] ) last_start = start_span section.html(new_section) return phrases, sentences
class Textualizer: """The Textualzer class.""" def __init__(self, abbrev=['dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'i.e']): """Initialize Textualizer. Usually, you need to create only one textualizer in your script. Args: abbrev (list): List of abbreviations """ punkt = PunktParameters() punkt.abbrev_types = set(abbrev) self.tokenizer = PunktSentenceTokenizer(punkt) def find_sentences(self, par): """Finding sentences from paragraph using nltk. Args: par (Paragraph): The input paragraph Returns: A list of sentences. """ text = ''.join([w.text for w in par.words]) word_iter = iter(par.words) word = next(word_iter) sent_ls = [] for i, (b, e) in enumerate(self.tokenizer.span_tokenize(text)): _id = 's-' + par._id[2:] + '-' + str(i) sent = Sentence(_id, par.sec_name, par.box_name, text[b:e], []) sent_ls.append(sent) while word is not None and word.start < b: word = next(word_iter) while word is not None and word.start < e: if word._id is not None: sent.words.append(word._id) try: word = next(word_iter) except StopIteration: word = None return sent_ls def textualize(self, doc, remove_pos=True): """Textualize the document. The main function of this methods are summarized: 1. Add sentence information to Document.tree (i.e., XML) 2. Put list of sentences to Document.sentences 3. Provide additional attributes (Documents.words, Documents.sentences) Additionally, some normalization, e.g., breaking ligatures, are applied. In order for the compatibility, it adds following attributes for now: - Document.maths - Document.cites These attributes should be added by other modules in the future. Args: doc (Document): The input document remove_pos (bool): Whether remove positions or not """ # basic variables ligatures = { '\ufb00': 'ff', '\ufb01': 'fi', '\ufb02': 'fl', '\ufb03': 'ffi', '\ufb04': 'ffl', '\ufb05': 'st', '\ufb06': 'st', } ns = {'x': 'http://www.w3.org/1999/xhtml'} end_token = re.compile(r'[?!.]$') tag_token = re.compile(r'(?<!\s)-$') pars = {} par_ls = [] last_par = None ref_pars = [] cite = None math = None ignore_math = False math_ls = [] word_nodes = {} cites = {} for sec in doc.tree.xpath('x:body/x:div', namespaces=ns): sec_id = sec.get('id') sec_name = sec.get('data-name') for box in sec.xpath('x:div', namespaces=ns): box_name = box.get('data-name') for par in box.xpath('x:p', namespaces=ns): math_par = False par_id = par.get('id') page_id = int(par.get('data-page')) # standalone equations should continue from last par if box_name == 'Equation' and last_par: p = pars[last_par] # TODO: a bit different from the original; please check # print(len(p.words)) if not end_token.match(p.words[-1].text): continued_from_id = last_par math_par = True else: continued_from_id = par.get('data-continued-from') # if not continued_from_id or par_ls: # text = '\n\n' # par_ls[-1].words.append(Word(None, text)) if not continued_from_id and box_name == 'Reference': ref_pars.append(par_id) nodes = list(par) tmp_ref = nodes[0].get('data-refid', False) tmp_ref = nodes[0].get('data-refid', False) if tmp_ref and tmp_ref != nodes[0].get('id'): del nodes[0] if continued_from_id: par = pars[continued_from_id] else: par = Paragraph(sec_id, par_id, sec_name, box_name, [], []) par_ls.append(par) pars[par_id] = par for node in filter( lambda n: n.get('data-refid') is None or n.get( 'id') == n.get('data-refid'), nodes): sp_val = node.get('data-space') if sp_val == 'nospace' or ( sp_val == 'bol' and (not par.words or tag_token.search(par.words[-1].text))): space = None else: space = Word(None, ' ') text = 
node.get('data-fullform') or node.text or '' text = re.sub(r'\s+', ' ', text) text = ''.join([ ligatures[c] if ligatures.get(c, False) else c for c in text ]) _id = node.get('id') word_nodes[_id] = node # inside a citation; skip everything if cite: math = None space = None word = Word(_id, '') if cite == node.get('id'): cite = None # starting a citation; make a dummy word elif node.get('data-cite-end', False): if math: par.words = par.backup_words math = None ignore_math = True cite = node.get('data-cite-end') cids = node.get('data-cite-id').split(',') text = ', '.join(map(lambda c: 'CITE-' + c, cids)) cites.update({c: _id for c in cids}) word = Word(_id, text) # starting an equation elif not ignore_math and ( node.get('data-math') == 'B-Math' or (node.get('data-math') == 'I-Math' and not math) or (math_par and not math)): par.backup_words = par.words.copy() if space: par.backup_words.append(space) par.backup_words.append(Word(_id, text, node)) if math_par: mid = 'MATH-' + par_id else: mid = 'MATH-' + _id word = Word(_id, mid) math = [mid, _id, _id, page_id] + [ float(a) for a in node.get('data-bdr').split(',') ] math_ls.append(math) # inside an equation: skip while calculating bbox elif not ignore_math and (node.get('data-math') == 'I-Math' or math_par): if space: par.backup_words.append(space) par.backup_words.append(Word(_id, text, node)) space = None word = Word(_id, '') math[2] = _id new = [ float(a) for a in node.get('data-bdr').split(',') ] math[4] = min(math[4], new[0]) math[5] = min(math[5], new[1]) math[6] = max(math[6], new[2]) math[7] = max(math[7], new[3]) # normal texts else: math = None ignore_math = False word = Word(_id, text, node) # finish the loop if space is not None: par.words.append(space) if word is not None: par.words.append(word) # set last_par if box_name != 'Body': last_par = None elif continued_from_id: last_par = continued_from_id else: last_par = par_id par_pos = 0 for p in par_ls: p.words.append(Word(None, '\n\n')) pos = 0 for w in p.words: w.start = pos next_pos = pos + len(w.text) if w.node is not None: w.node.set('data-from', str(par_pos + pos)) w.node.set('data-to', str(par_pos + next_pos)) pos = next_pos par_pos += pos # textualize sent_ls = [] for p in par_ls: sent_ls.extend(self.find_sentences(p)) for s in sent_ls: for w in s.words: word_nodes[w].set('data-sent-id', str(s._id)) # collect the data word_ls = [(w.get('id'), int(w.get('data-from', 0)), int(w.get('data-to', 0))) for w in doc.tree.xpath('//x:span', namespaces=ns)] doc.words = sorted(word_ls, key=lambda w: (w[1], w[2])) doc.text = ''.join([w.text for p in par_ls for w in p.words]) doc.sentences = sent_ls doc.maths = math_ls doc.cites = [('CITE-' + p, ''.join([w.text for w in pars[p].words]).strip(), cites.get(p, [])) for p in ref_pars] # scrub the tree; remove positions if remove_pos: for w in doc.tree.xpath('//x:span', namespaces=ns): if w.get('class', None) == 'word': w.attrib.pop('data-from', None)
def sent_tokenize(data, filter_short=False, filter_verbless=False):
    """
    Tokenize sentences

    Tokenize `data` into two arrays: sentences and offsets.

    Returns a tuple (`sentences`, `offsets`)
    """
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(
        ['dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'et', 'al', 'Fig', 'fig'])
    sent_detector = PunktSentenceTokenizer(punkt_param)
    sentences = sent_detector.tokenize(data)
    offsets = list(sent_detector.span_tokenize(data))
    new_sentences = []
    new_offsets = []
    to_del = []
    if filter_verbless:
        pos = pos_tagger.extract_nlp_batch()
        for i in range(len(sentences)):
            okay = False
            for word in pos['sentences'][i]['words']:
                if word[1]['PartOfSpeech'] in verbs:
                    okay = True
                    break
            if not okay:
                # the sentence doesn't have a verb,
                to_del.append(i)  # mark for deletion
        sentences = multi_delete(sentences, to_del)
        offsets = multi_delete(offsets, to_del)
    if filter_short and not filter_verbless:
        for i in range(len(sentences)):
            if len(sentences[i]) >= filter_short:
                new_sentences.append(sentences[i])
                new_offsets.append(offsets[i])
    new_sentences = [s for s in sentences if s]
    # (disabled) merging of runs of two to five nearby sentences into longer
    # spans was prototyped here in the same style
    print(new_offsets)
    return {'sentences': new_sentences, 'offsets': new_offsets}
        shard.append(json.loads(line))

for paper in shard:
    manuscript_id = paper["paper_id"]
    full_text = paper["body_text"]
    for paragraph in full_text:
        section_name = paragraph["section"].lower()
        if "discuss" not in section_name and "conclu" not in section_name:
            continue
        if not paragraph["cite_spans"]:
            continue
        paragraph_text = paragraph["text"]
        endpoints = list(tokenizer.span_tokenize(paragraph_text))
        j = 0
        for cite_span in paragraph["cite_spans"]:
            cite_id = cite_span["cite_id"]
            if cite_id not in paper_ids:
                continue
            cite_text = cite_span["text"]
            start, end = cite_span["start"], cite_span["end"]
            a, b = endpoints[j]
            while start >= b:
                j += 1
                a, b = endpoints[j]
def resolve_phrases(section, tokens, book, id): phrases = [] sentences = [] # find and resolve parantheses if book == 'almizan_fa': if int(id.split('_')[0]) <= 2: html = section.html() replace = lambda start, end, oldtext, newtext: oldtext[:start ] + newtext + oldtext[ end:] # in chapter1, remove parantheses for ayas iter = re.finditer( r'(<span[^\n]*>)[ ]*\(([^\)s]*)\)[^\)]*(</span[^\n]*>)', html) for m in reversed(list(iter)): html = replace(m.start(), m.end(), html, m.group().replace('(', '').replace(')', '')) iter = re.finditer(r'\([^\)]{3,15}\)', html) for match in reversed(list(iter)): m = match.group()[1:-1] resolved = resolve_phrase(m, tokens, book[-2:]) if resolved: html = replace( match.start(), match.end(), html, '<em rel="{0}">{1}</em>'.format(resolved[0], m)) section.html(html) pst = PunktSentenceTokenizer() # resolve em elements for em in section.find('em').items(): resolved = resolve_phrase(em.text(), tokens, book[-2:]) if resolved: em.attr('rel', resolved[0]) phrases.append((em.text(), resolved[1], resolved[0])) paragraph = em.parent().html(method='html') for start, end in pst.span_tokenize(paragraph): if paragraph[start:end].find(em.outerHtml()) != -1: this_sentence = paragraph[start:end].lstrip() this_sentence = refine_sentence(this_sentence) while this_sentence.startswith("<code"): if this_sentence.find('</code>') != -1: new_start = this_sentence.find('</code>') + 7 this_sentence = this_sentence[new_start:].lstrip() this_sentence = refine_sentence(this_sentence) while this_sentence.startswith('<span'): new_start = this_sentence.find('</span>') + 7 this_sentence = this_sentence[new_start:].lstrip() this_sentence = refine_sentence(this_sentence) before = this_sentence.index(em.outerHtml()) after = len(this_sentence) - len(em.outerHtml()) - before em.attr('data-sentence', '{0}:{1}'.format(before, after)) break sentences.append( (em.text(), resolved[0], (before, after), [this_sentence])) else: phrases.append((em.text(), )) new_section = section.html(method='html') p = re.compile(r'<em rel="([^"]+)" data-sentence="([^"]+)">([^<]+)<\/em>') matched_list = [(m.start(0), m.end(0), m.group(1), m.group(2)) for m in re.finditer(p, new_section)] last_start = -1 for matched in reversed(matched_list): start_span = matched[0] - int(matched[3].split(':')[0]) end_span = matched[1] + int(matched[3].split(':')[1]) if start_span != last_start: new_section = new_section[:start_span] + '<span class="phrase">' + new_section[ start_span:end_span] + '</span>' + new_section[end_span:] last_start = start_span section.html(new_section) return phrases, sentences
def marker_surr_patt(in_dir): """ Find most frequent POS tag patterns surrounding citation marker """ punkt_param = PunktParameters() abbreviation = ['al', 'fig', 'e.g', 'i.e', 'eq', 'cf', 'ref', 'refs'] punkt_param.abbrev_types = set(abbreviation) tokenizer = PunktSentenceTokenizer(punkt_param) file_names = os.listdir(in_dir) patt_comb_freq_map = {} patt_orig_freq_map = {} for file_idx, fn in enumerate(file_names): if file_idx%100 == 0: print('{}/{}'.format(file_idx, len(file_names))) path = os.path.join(in_dir, fn) aid, ext = os.path.splitext(fn) if ext != '.txt' or aid == 'log': continue if re.search(r'[a-z]', aid): split = re.search(r'[a-z][0-9]', aid).span()[0] + 1 aid = aid[:split] + '/' + aid[split:] with open(path) as f: text = f.read() text = re.sub(E_G_PATT, 'e.g.', text) marker = ' \u241F ' doc_len = len(text) for sent_idx, sent_edx in tokenizer.span_tokenize(text): sentence_orig = text[sent_idx:sent_edx] sentence = re.sub(CITE_MULTI_PATT, marker, sentence_orig) sentence = re.sub(QUOTE_PATT, ' {}.'.format(marker), sentence) if marker in sentence: words = pos_tag(sentence.split()) words = [w for w in words if re.search(r'[\w|\u241F]', w[0])] sent_len = len(words) indices = [i for i, tup in enumerate(words) if tup[0] == marker.strip()] for word_idx in indices: word = words[word_idx][0] if word == marker.strip(): patt_comb = [None, None, None, '[]', None, None, None] patt_orig = [None, None, None, '[]', None, None, None] for shift in range(-3, 4): x_idx = shift+3 if shift == 0: # marker itself continue if word_idx+shift < 0 or \ word_idx+shift >= len(words): patt_comb[x_idx] = '<EOS>' patt_orig[x_idx] = '<EOS>' continue wrd = words[word_idx+shift][0] pos = words[word_idx+shift][1] patt_orig[x_idx] = pos if 'V' in pos: patt_comb[x_idx] = 'V' elif pos in ['NN', 'NNS']: patt_comb[x_idx] = 'NN' elif pos in ['NNP', 'NNPS']: patt_comb[x_idx] = 'NNP' elif pos == 'IN': patt_comb[x_idx] = 'IN' elif 'JJ' in pos: patt_comb[x_idx] = 'JJ' elif 'W' in pos: patt_comb[x_idx] = 'WH' elif 'RB' in pos: patt_comb[x_idx] = 'ADV' elif 'PR' in pos: patt_comb[x_idx] = 'PR' elif wrd == 'FORMULA': patt_comb[x_idx] = 'FORMULA' elif wrd == 'FIGURE': patt_comb[x_idx] = 'FIGURE' elif wrd == 'TABLE': patt_comb[x_idx] = 'TABLE' else: patt_comb[x_idx] = 'OTHER' comb_id = '¦'.join(patt_comb) orig_id = '¦'.join(patt_orig) # # look at examples # if orig_id == 'VBN¦IN¦NNP¦[]¦<EOS>¦<EOS>¦<EOS>': # print(sentence) # input() # print('.') if comb_id not in patt_comb_freq_map: patt_comb_freq_map[comb_id] = 0 patt_comb_freq_map[comb_id] += 1 if orig_id not in patt_orig_freq_map: patt_orig_freq_map[orig_id] = 0 patt_orig_freq_map[orig_id] += 1 # if file_idx > 200: # break patt_comb_freq = sorted(patt_comb_freq_map.items(), key=operator.itemgetter(1), reverse=True) patt_orig_freq = sorted(patt_orig_freq_map.items(), key=operator.itemgetter(1), reverse=True) print('- - - C O M B - - -') for pid in patt_comb_freq[:25]: print(pid) print('- - - O R I G - - -') for pid in patt_orig_freq[:25]: print(pid) store_comb = [] for tup in patt_comb_freq: pid = tup[0] freq = tup[1] if '[]¦<EOS>¦<EOS>¦<EOS>' in pid: new_pid = pid.replace('[]¦<EOS>¦<EOS>¦<EOS>', '<EOS>') store_comb.append((new_pid, freq)) with open('marker_comb.json', 'w') as f: json.dump(store_comb, f) store_orig = [] for tup in patt_orig_freq: pid = tup[0] freq = tup[1] if '[]¦<EOS>¦<EOS>¦<EOS>' in pid: new_pid = pid.replace('[]¦<EOS>¦<EOS>¦<EOS>', '<EOS>') store_orig.append((new_pid, freq)) with open('marker_orig.json', 'w') as f: json.dump(store_orig, f)
def sent_pos(in_dir): """ Positions of citation markers in sentences, relatve to where in doc """ arxiv_base_url = 'http://export.arxiv.org/api/query?search_query=id:' arxiv_ns = { 'atom': 'http://www.w3.org/2005/Atom', 'opensearch': 'http://a9.com/-/spec/opensearch/1.1/', 'arxiv': 'http://arxiv.org/schemas/atom' } punkt_param = PunktParameters() abbreviation = ['al', 'fig', 'e.g', 'i.e', 'eq', 'cf'] punkt_param.abbrev_types = set(abbreviation) tokenizer = PunktSentenceTokenizer(punkt_param) with open('hedge_words') as f: hedge_words = [l.strip() for l in f.readlines()] x_all = list(range(-5, 6)) y_verb = [] y_noun = [] y_propnoun = [] y_prepos = [] y_adj = [] y_wh = [] y_adv = [] y_pr = [] y_form = [] y_fig = [] y_tab = [] for x in x_all: y_verb.append(0) y_noun.append(0) y_propnoun.append(0) y_prepos.append(0) y_adj.append(0) y_wh.append(0) y_adv.append(0) y_pr.append(0) y_form.append(0) y_fig.append(0) y_tab.append(0) file_names = os.listdir(in_dir) for file_idx, fn in enumerate(file_names): if file_idx % 100 == 0: print('{}/{}'.format(file_idx, len(file_names))) path = os.path.join(in_dir, fn) aid, ext = os.path.splitext(fn) if ext != '.txt' or aid == 'log': continue phys_cat = [ 'hep-th', 'hep-ph', 'hep-lat', 'hep-ex', 'cond-mat', 'astro-ph', 'physics', 'nucl', 'gr-qc', 'quant-ph', 'nlin' ] math_cat = ['math', 'math-ph'] cs_cat = ['cs'] if re.search(r'[a-z]', aid): split = re.search(r'[a-z][0-9]', aid).span()[0] + 1 aid = aid[:split] + '/' + aid[split:] resp = requests.get('{}{}&start=0&max_results=1'.format( arxiv_base_url, aid)) xml_root = etree.fromstring(resp.text.encode('utf-8')) result_elems = xml_root.xpath('/atom:feed/atom:entry', namespaces=arxiv_ns) result = result_elems[0] cat = result.find('arxiv:primary_category', namespaces=arxiv_ns).get('term') high_cat = None for pc in phys_cat: if pc in cat: high_cat = 'phys' break if not high_cat: for mc in math_cat: if pc in cat: high_cat = 'math' break if not high_cat: if 'cs' in cat: high_cat = 'cs' if not high_cat: continue if high_cat != 'phys': continue with open(path) as f: text = f.read() marker = ' \u241F ' doc_len = len(text) for sent_idx, sent_edx in tokenizer.span_tokenize(text): sentence_orig = text[sent_idx:sent_edx] sentence = re.sub(CITE_MULTI_PATT, marker, sentence_orig) sentence = re.sub(QUOTE_PATT, ' {}.'.format(marker), sentence) if marker in sentence: words = pos_tag(sentence.split()) words = [w for w in words if re.search(r'[\w|\u241F]', w[0])] sent_len = len(words) indices = [ i for i, tup in enumerate(words) if tup[0] == marker.strip() ] for word_idx in indices: word = words[word_idx][0] if word == marker.strip(): for shift in x_all: x_idx = shift + 5 if shift == 0: # marker itself continue if word_idx+shift < 0 or \ word_idx+shift >= len(words): # out of range continue wrd = words[word_idx + shift][0] pos = words[word_idx + shift][1] if 'V' in pos: y_verb[x_idx] += 1 if pos in ['NN', 'NNS']: y_noun[x_idx] += 1 if pos in ['NNP', 'NNPS']: y_propnoun[x_idx] += 1 if pos == 'IN': y_prepos[x_idx] += 1 if 'JJ' in pos: y_adj[x_idx] += 1 if 'W' in pos: y_wh[x_idx] += 1 if 'RB' in pos: y_adv[x_idx] += 1 if 'PR' in pos: y_pr[x_idx] += 1 if wrd == 'FORMULA': y_form[x_idx] += 1 if wrd == 'FIGURE': y_fig[x_idx] += 1 if wrd == 'TABLE': y_tab[x_idx] += 1 if file_idx > 200: break for idx, y in enumerate([(y_verb, 'verb'), (y_noun, 'noun'), (y_propnoun, 'proper noun'), (y_prepos, 'preposition'), (y_adj, 'adjective'), (y_wh, 'wh-det./-adv./-pron.'), (y_adv, 'adverb'), (y_pr, 'pers./pos. 
pronoun'), (y_form, 'formula')]): color = list(mpl.rcParams['axes.prop_cycle'])[idx]['color'] plt.plot(x_all, y[0], marker='', linestyle='-', linewidth=.5, alpha=0.3, color=color) plt.plot(x_all, y[0], label=y[1], marker='D', linestyle='', color=color) plt.xlabel('word position relative to citation') plt.ylabel('number of words') plt.legend() ax = plt.gca() ax.xaxis.grid(True) plt.xticks(np.arange(min(x_all), max(x_all), 1.0)) plt.show()
class nltk_tokenizer(IncrementalTransform): ''' a streamcorpus_pipeline IncrementalTransform that converts a chunk into a new chunk with Sentence objects generated using NLTK tokenizers ''' config_name = 'nltk_tokenizer' tagger_id = 'nltk_tokenizer' def __init__(self, *args, **kwargs): super(nltk_tokenizer, self).__init__(*args, **kwargs) self.sentence_tokenizer = PunktSentenceTokenizer() self.word_tokenizer = WhitespaceTokenizer() #PunktWordTokenizer() def _sentences(self, clean_visible): 'generate strings identified as sentences' previous_end = 0 clean_visible = clean_visible.decode('utf8') assert isinstance(clean_visible, unicode) for start, end in self.sentence_tokenizer.span_tokenize(clean_visible): ## no need to check start, because the first byte of text ## is always first byte of first sentence, and we will ## have already made the previous sentence longer on the ## end if there was an overlap. if start < previous_end: start = previous_end if start > end: ## skip this sentence... because it was eaten by ## an earlier sentence with a label continue try: label = self.label_index.find_le(end) except ValueError: label = None if label: off = label.offsets[OffsetType.BYTES] end = max(off.first + off.length, end) previous_end = end sent_str = clean_visible[start:end] yield start, end, sent_str def make_label_index(self, stream_item): 'make a sortedcollection on body.labels' labels = stream_item.body.labels.get(self.config.get('annotator_id')) if not labels: labels = [] self.label_index = SortedCollection( labels, key=lambda label: label.offsets[OffsetType.BYTES].first) def make_sentences(self, stream_item): 'assemble Sentence and Token objects' self.make_label_index(stream_item) sentences = [] token_num = 0 new_mention_id = 0 for sent_start, sent_end, sent_str in self._sentences(stream_item.body.clean_visible): assert isinstance(sent_str, unicode) sent = Sentence() sentence_pos = 0 for start, end in self.word_tokenizer.span_tokenize(sent_str): token_str = sent_str[start:end].encode('utf8') tok = Token( token_num=token_num, token=token_str, sentence_pos=sentence_pos, ) tok.offsets[OffsetType.BYTES] = Offset( type=OffsetType.BYTES, first=sent_start + start, length = end - start, ) ## whitespace tokenizer will never get a token ## boundary in the middle of an 'author' label try: #logger.debug('searching for %d in %r', sent_start + start, self.label_index._keys) label = self.label_index.find_le(sent_start + start) except ValueError: label = None if label: off = label.offsets[OffsetType.BYTES] if off.first + off.length > sent_start + start: logger.info('overlapping label: %r' % label.target.target_id) ## overlaps streamcorpus.add_annotation(tok, label) assert label.annotator.annotator_id in tok.labels logger.info('adding label to tok: %r has %r', tok.token, label.target.target_id) if label in self.label_to_mention_id: mention_id = self.label_to_mention_id[label] else: mention_id = new_mention_id new_mention_id += 1 self.label_to_mention_id[label] = mention_id tok.mention_id = mention_id token_num += 1 sentence_pos += 1 sent.tokens.append(tok) sentences.append(sent) return sentences def process_item(self, stream_item, context=None): if not hasattr(stream_item.body, 'clean_visible') or not stream_item.body.clean_visible: return stream_item self.label_index = None self.label_to_mention_id = dict() stream_item.body.sentences[self.tagger_id] = self.make_sentences(stream_item) return stream_item def __call__(self, stream_item, context=None): ## support the legacy callable API return 
self.process_item(stream_item, context)
class nltk_tokenizer(IncrementalTransform): """ a streamcorpus_pipeline IncrementalTransform that converts a chunk into a new chunk with Sentence objects generated using NLTK tokenizers """ tagger_id = "nltk_tokenizer" def __init__(self, config): self.config = config self.sentence_tokenizer = PunktSentenceTokenizer() self.word_tokenizer = WhitespaceTokenizer() # PunktWordTokenizer() def _sentences(self, clean_visible): "generate strings identified as sentences" previous_end = 0 clean_visible = clean_visible.decode("utf8") assert isinstance(clean_visible, unicode) for start, end in self.sentence_tokenizer.span_tokenize(clean_visible): ## no need to check start, because the first byte of text ## is always first byte of first sentence, and we will ## have already made the previous sentence longer on the ## end if there was an overlap. if start < previous_end: start = previous_end if start > end: ## skip this sentence... because it was eaten by ## an earlier sentence with a label continue try: label = self.label_index.find_le(end) except ValueError: label = None if label: off = label.offsets[OffsetType.BYTES] end = max(off.first + off.length, end) previous_end = end sent_str = clean_visible[start:end] yield start, end, sent_str def make_label_index(self, stream_item): "make a sortedcollection on body.labels" labels = stream_item.body.labels.get(self.config.get("annotator_id")) if not labels: labels = [] self.label_index = SortedCollection(labels, key=lambda label: label.offsets[OffsetType.BYTES].first) def make_sentences(self, stream_item): "assemble Sentence and Token objects" self.make_label_index(stream_item) sentences = [] token_num = 0 new_mention_id = 0 for sent_start, sent_end, sent_str in self._sentences(stream_item.body.clean_visible): assert isinstance(sent_str, unicode) sent = Sentence() sentence_pos = 0 for start, end in self.word_tokenizer.span_tokenize(sent_str): try: token_str = sent_str[start:end].encode("utf8") except Exception, exc: logger.critical("died on sent_str[%d:%d].encode('utf8')", start, end, exc_info=True) sys.exit("failed to cope with %r in %r" % (sent_str[start:end], sent_str)) tok = Token(token_num=token_num, token=token_str, sentence_pos=sentence_pos) tok.offsets[OffsetType.BYTES] = Offset( type=OffsetType.BYTES, first=sent_start + start, length=end - start ) ## whitespace tokenizer will never get a token ## boundary in the middle of an 'author' label try: # logger.debug('searching for %d in %r', sent_start + start, self.label_index._keys) label = self.label_index.find_le(sent_start + start) except ValueError: label = None if label: off = label.offsets[OffsetType.BYTES] if off.first + off.length > sent_start + start: logger.info("overlapping label: %r" % label.target.target_id) ## overlaps streamcorpus.add_annotation(tok, label) assert label.annotator.annotator_id in tok.labels logger.info("adding label to tok: %r has %r", tok.token, label.target.target_id) if label in self.label_to_mention_id: mention_id = self.label_to_mention_id[label] else: mention_id = new_mention_id new_mention_id += 1 self.label_to_mention_id[label] = mention_id tok.mention_id = mention_id token_num += 1 sentence_pos += 1 sent.tokens.append(tok) sentences.append(sent) return sentences
    # Retrieve annotated features for the currently-open training file
    keywords = get_kw(config.train_folder, f)
    # origin_offs = get_offs(train_folder, f)

    # Extracting positive examples of keywords from the paragraph,
    # given the ground truth offsets
    '''for k in keywords:
        start = k[2]
        end = k[3]
        print(text[264:349])'''

    # Split each sentence on a separate line
    toktext = sentence_splitter.tokenize(text)
    s_spans = sentence_splitter.span_tokenize(text)
    sentence_spans = []
    for ss in s_spans:
        sss = []
        start = ss[0]
        end = ss[1]
        sss.append(start)
        sss.append(end)
        sentence_spans.append(sss)

    # Create output files with a similar name as the input files
    outputfile = f.split(".")[0] + "__output.txt"
    with io.open(os.path.join(config.output_folder, outputfile), 'w',
                 encoding="utf-8") as outf:
def sent_tokenize(data):
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(
        ['dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'et', 'al', 'Fig', 'fig'])
    sent_detector = PunktSentenceTokenizer(punkt_param)
    sentences = sent_detector.tokenize(data)
    offsets = list(sent_detector.span_tokenize(data))
    new_sentences = deepcopy(sentences)
    new_offsets = deepcopy(offsets)
    for i, off in enumerate(offsets):
        # `tokenizer` is a module-level word tokenizer used only to count tokens
        if len(tokenizer.tokenize(sentences[i])) < 7:
            # Skip short sentences
            pass
        else:
            if i < len(offsets) - 1:
                if (offsets[i + 1][0] - offsets[i][1]) < 5:
                    new_sentences.append(sentences[i] + ' ' + sentences[i + 1])
                    new_offsets.append((offsets[i][0], offsets[i + 1][1]))
            if i < len(offsets) - 2:
                if ((offsets[i + 2][0] - offsets[i + 1][1]) < 5) and \
                        ((offsets[i + 1][0] - offsets[i][0]) < 5):
                    new_sentences.append(sentences[i] + ' ' + sentences[i + 1]
                                         + ' ' + sentences[i + 2])
                    new_offsets.append((offsets[i][0], offsets[i + 2][1]))
            # (disabled) merging of runs of four and five nearby sentences was
            # prototyped here in the same style
    print(new_offsets)
    return {'sentences': new_sentences, 'offsets': new_offsets}
class nltk_tokenizer(IncrementalTransform): ''' a streamcorpus_pipeline IncrementalTransform that converts a chunk into a new chunk with Sentence objects generated using NLTK tokenizers ''' config_name = 'nltk_tokenizer' tagger_id = 'nltk_tokenizer' def __init__(self, *args, **kwargs): super(nltk_tokenizer, self).__init__(*args, **kwargs) self.sentence_tokenizer = PunktSentenceTokenizer() self.word_tokenizer = WhitespaceTokenizer() #PunktWordTokenizer() def _sentences(self, clean_visible): 'generate strings identified as sentences' previous_end = 0 clean_visible = clean_visible.decode('utf8') assert isinstance(clean_visible, unicode) for start, end in self.sentence_tokenizer.span_tokenize(clean_visible): ## no need to check start, because the first byte of text ## is always first byte of first sentence, and we will ## have already made the previous sentence longer on the ## end if there was an overlap. if start < previous_end: start = previous_end if start > end: ## skip this sentence... because it was eaten by ## an earlier sentence with a label continue try: label = self.label_index.find_le(end) except ValueError: label = None if label: off = label.offsets[OffsetType.BYTES] end = max(off.first + off.length, end) previous_end = end sent_str = clean_visible[start:end] yield start, end, sent_str def make_label_index(self, stream_item): 'make a sortedcollection on body.labels' labels = stream_item.body.labels.get(self.config.get('annotator_id')) if not labels: labels = [] self.label_index = SortedCollection( labels, key=lambda label: label.offsets[OffsetType.BYTES].first) def make_sentences(self, stream_item): 'assemble Sentence and Token objects' self.make_label_index(stream_item) sentences = [] token_num = 0 new_mention_id = 0 for sent_start, sent_end, sent_str in self._sentences( stream_item.body.clean_visible): assert isinstance(sent_str, unicode) sent = Sentence() sentence_pos = 0 for start, end in self.word_tokenizer.span_tokenize(sent_str): try: token_str = sent_str[start:end].encode('utf8') except Exception, exc: logger.critical("died on sent_str[%d:%d].encode('utf8')", start, end, exc_info=True) sys.exit('failed to cope with %r in %r' % (sent_str[start:end], sent_str)) tok = Token( token_num=token_num, token=token_str, sentence_pos=sentence_pos, ) tok.offsets[OffsetType.BYTES] = Offset( type=OffsetType.BYTES, first=sent_start + start, length=end - start, ) ## whitespace tokenizer will never get a token ## boundary in the middle of an 'author' label try: #logger.debug('searching for %d in %r', sent_start + start, self.label_index._keys) label = self.label_index.find_le(sent_start + start) except ValueError: label = None if label: off = label.offsets[OffsetType.BYTES] if off.first + off.length > sent_start + start: logger.info('overlapping label: %r' % label.target.target_id) ## overlaps streamcorpus.add_annotation(tok, label) assert label.annotator.annotator_id in tok.labels logger.info('adding label to tok: %r has %r', tok.token, label.target.target_id) if label in self.label_to_mention_id: mention_id = self.label_to_mention_id[label] else: mention_id = new_mention_id new_mention_id += 1 self.label_to_mention_id[label] = mention_id tok.mention_id = mention_id token_num += 1 sentence_pos += 1 sent.tokens.append(tok) sentences.append(sent) return sentences
def main(args): if len(args) < 3: sys.stderr.write('3 required arguments: <input anafora dir> <output brat dir> <tsv out dir>\n') sys.exit(-1) sent_tokenizer = PunktSentenceTokenizer() neg_out = open( join(args[2], 'negation.tsv'), 'wt') dtr_out = open( join(args[2], 'dtr.tsv'), 'wt') alink_out = open( join(args[2], 'alink.tsv'), 'wt') for sub_dir, text_name, xml_names in anafora.walk(args[0], "ADE_entity.dave.completed.xml"): textfile_path = join( join(args[0],text_name), text_name) with open(textfile_path, 'r') as tf: text = tf.read() sent_spans = list(sent_tokenizer.span_tokenize(text)) shutil.copyfile(textfile_path, join(args[1], '%s.txt' % (text_name))) brat_out = open( join(args[1], '%s.ann' % (text_name)), 'wt') for xml_name in xml_names: xml_path = os.path.join(args[0], sub_dir, xml_name) xml_parts = xml_name.split('.') annotator = xml_parts[2] status = xml_parts[3] data = anafora.AnaforaData.from_file(xml_path) alink_map = {} for rel in data.annotations.select_type('ALINK'): cat = rel.properties['Type'] tgt = rel.properties['Target'] alink_map[tgt.id] = cat for annot_ind, annot in enumerate(data.annotations.select_type('Medications/Drugs')): id = annot.id span = annot.spans[0] span_text = text[span[0]:span[1]] neg = annot.properties['negation_indicator'] neg_status = "-1" if neg is None else "1" dtr = annot.properties['DocTimeRel'] if annot.id in alink_map: alink = alink_map[annot.id] else: alink = 'None' # Write Brat format: brat_out.write('T%d\tDrug %d %d\t%s\n' % (annot_ind, span[0], span[1], span_text)) #print("File:%s\tID:%s\tSpan:(%d,%d)\tAnnotatedText:%s\tNegated:%s\tDTR:%s\tAlink:%s" % # (text_name, annot.id, span[0], span[1], span_text, neg_status, dtr, alink)) # Write some ad-hoc format: print("File:%s\tSpan:(%d,%d)\tAnnotatedText:%s\tNegated:%s\tDTR:%s\tAlink:%s" % (text_name, span[0], span[1], span_text, neg_status, dtr, alink)) # Write bert-style tsv for Neg, DTR, ALink: covering_sent_span = find_sentence_for_drug(sent_spans, span) inst_text = insert_delimiter_for_entity(text, covering_sent_span, span) neg_out.write('%s\t%s\n' % (neg_status, inst_text)) dtr_out.write('%s\t%s\n' % (dtr, inst_text)) alink_out.write('%s\t%s\n' % (alink, inst_text)) brat_out.close() neg_out.close() dtr_out.close() alink_out.close()
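# find_sentence_for_drug() and insert_delimiter_for_entity() are called above
# but not defined in this snippet. The sketch below is one plausible reading,
# shown purely for illustration; the project's real helpers may differ.
def find_sentence_for_drug(sent_spans, span):
    # return the (start, end) of the first sentence that fully contains the
    # entity span, or None if no sentence covers it
    for sent_start, sent_end in sent_spans:
        if span[0] >= sent_start and span[1] <= sent_end:
            return (sent_start, sent_end)
    return None

def insert_delimiter_for_entity(text, sent_span, span, delimiter='@'):
    # surround the entity with a delimiter inside its covering sentence and
    # flatten newlines so the result fits on one tsv line
    start, end = sent_span
    a, b = span
    marked = text[start:a] + delimiter + text[a:b] + delimiter + text[b:end]
    return marked.replace('\n', ' ')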
class ACEParser: def __init__(self): self.sent_tokenizer = PunktSentenceTokenizer() # self.word_tokenizer = RegexpTokenizer('\w+|\S+') self.word_tokenizer = WhitespaceTokenizer() self.root = None self.sentence_offsets = [] self.df = pd.DataFrame( columns=["doc_id", "sentence", "tokens", "events", "entities"]) def get_text(self, sgm_file): with open(sgm_file, "r", encoding="utf-8") as f: text = f.read() # Gets rid of lines with only tags text = re.sub(r"<(.|\s|\n)*?>", r"", text) sentence_offsets = list(self.sent_tokenizer.span_tokenize(text)) sentences = [] for offset in sentence_offsets: sentence_text = text[offset[0]:offset[1]] sentences.append(sentence_text) self.sentence_offsets = sentence_offsets return text def create_tree(self, apf_file): with open(apf_file, "r", encoding="utf-8") as f: xml_text = f.read() root = etree.fromstring(xml_text) self.root = root def get_extents(self): extent_nodes = self.root.xpath("//extent/charseq") return [ self.get_offset_tuple(extent_node) for extent_node in extent_nodes ] def get_offset_tuple(self, extent_node): return (int(extent_node.get("START")), int(extent_node.get("END")) + 1 ) # +1 makes them exclusive def get_sentences(self): sentences = [] for offset in self.sentence_offsets: sentence_text = text[offset[0]:offset[1]] sentences.append(sentence_text) return sentences def find_sentence_index(self, offset): for i, sent_offset in enumerate(self.sentence_offsets): if offset[0] >= sent_offset[0] and offset[1] <= sent_offset[1]: return i def offset_to_token(self, start, end, token_offsets, normalize=0): # normalize is making start and end relatable to token_offsets start -= normalize end -= normalize # TODO: change this to if end == offset[1]. In the case that end < offset[1] use startswith and extend token_offsets list for i, offset in enumerate(token_offsets): if end <= offset[1]: for j in range(i, -1, -1): if start >= token_offsets[j][0]: return j, i + 1 # Make it exclusive raise Exception( "Error while converting offset to token indexes. Start offset : %d , End offset : %d Norm : %d, Token offsets : %s" % (start, end, normalize, str(token_offsets))) def create_json_output(self, doc_text, filename): # doc_id = self.root.xpath("document")[0].get("DOCID") doc_id = filename event_nodes = self.root.xpath("//event") # TODO: We lose coreference information doing it this way. For now it is ok, but need to accomodate the other way too !!! event_mentions = [] for event_node in event_nodes: event_type = event_node.get("TYPE") event_subtype = event_node.get("SUBTYPE") event_id = event_node.get("ID") event_mention_nodes = event_node.xpath("event_mention") for mention_node in event_mention_nodes: # You actually don't need these two for finding which sentence we are talking about. # Because we already made sure that all of our extents are covered by sentence offsets. # extent_node = mention.xpath("/extent/charseq")[0] # extent = get_offset_tuple(extent_node) trigger_offset = self.get_offset_tuple( mention_node.xpath("anchor/charseq")[0]) # find which sentence this belongs. Only need to do this once. sent_idx = self.find_sentence_index(trigger_offset) event_arguments = [] arguments = mention_node.xpath("event_mention_argument") for argument in arguments: arg_role = argument.get("ROLE") arg_offset = self.get_offset_tuple( argument.xpath("extent/charseq")[0]) # TODO: NEED TO ADD ENTITY TYPES, getting them from refids !!! 
event_arguments.append({ "role": arg_role, "start": arg_offset[0], "end": arg_offset[1] }) event_mentions.append({ "event_id": event_id, "event_type": event_type, "event_subtype": event_subtype, "trigger": { "start": trigger_offset[0], "end": trigger_offset[1] }, "arguments": event_arguments, "sent_idx": sent_idx }) # For printing later # old_event_mentions = copy.deepcopy(event_mentions) tokens_list_for_printing = [] for i, sentence_offset in enumerate(self.sentence_offsets): sentence_text = doc_text[sentence_offset[0]:sentence_offset[1]] token_offsets = list( self.word_tokenizer.span_tokenize(sentence_text)) tokens = [ sentence_text[offset[0]:offset[1]] for offset in token_offsets ] tokens_list_for_printing.append(tokens) entity_mentions = [] curr_event_mentions = [] for j in range(len(event_mentions)): mention = event_mentions[j] if mention["sent_idx"] == i: # ipdb.set_trace() start_idx, end_idx = self.offset_to_token( mention["trigger"]["start"], mention["trigger"]["end"], token_offsets, normalize=sentence_offset[0]) event_mentions[j]["trigger"]["start"] = start_idx event_mentions[j]["trigger"]["end"] = end_idx for k, argument in enumerate(mention["arguments"]): start_idx, end_idx = self.offset_to_token( argument["start"], argument["end"], token_offsets, normalize=sentence_offset[0]) event_mentions[j]["arguments"][k]["start"] = start_idx event_mentions[j]["arguments"][k]["end"] = end_idx curr_event_mentions.append(event_mentions[j]) self.df = self.df.append( { "doc_id": doc_id, "sentence": sentence_text, "tokens": tokens, "events": curr_event_mentions, "entities": entity_mentions }, ignore_index=True) # Printing stuff # for mention, old_mention in zip(event_mentions, old_event_mentions): # tokens = tokens_list_for_printing[mention["sent_idx"]] # print("Offset version trigger : %s , Tokens version trigger : %s" %(doc_text[old_mention["trigger"]["start"]:old_mention["trigger"]["end"]], tokens[mention["trigger"]["start"]:mention["trigger"]["end"]])) # for argument, old_argument in zip(mention["arguments"], old_mention["arguments"]): # print("Offset version argument : %s , Tokens version argument : %s" %(doc_text[old_argument["start"]:old_argument["end"]], tokens[argument["start"]:argument["end"]])) # print("===========") # TODO: Remove debug stuff def fix_offsets(self, extents): offsets = self.sentence_offsets assert (len(offsets) > 1) # print(offsets) # print("*************") after_count = 0 before_count = 0 for extent in extents: # Check stuff for printing if len([ offset for offset in offsets if extent[0] >= offset[0] and extent[1] <= offset[1] ]) == 0: before_count += 1 if extent[1] <= offsets[0][1]: continue for idx in range(1, len(offsets)): offset = offsets[idx] if extent[1] <= offset[1]: # Ends before this sentence. 
                    if extent[0] < offset[0]:  # Starts before this sentence
                        # Fixing
                        # print("-------")
                        # print(extent)
                        # print(offsets)
                        for j in range(idx - 1, -1, -1):
                            # For all sentences' offsets before this offset
                            del offsets[j + 1]
                            if extent[0] >= offsets[j][0]:
                                offsets[j] = (offsets[j][0], offset[1])
                                break
                        # print(offsets)
                        break
                    else:
                        # Nothing wrong with this extent
                        break

            # Check stuff for printing
            if len([
                    offset for offset in offsets
                    if extent[0] >= offset[0] and extent[1] <= offset[1]
            ]) == 0:
                ipdb.set_trace()
                # MISSES some due to spaces between sentences
                # print(extent)
                # print(text[extent[0]:extent[1]])
                after_count += 1

        # print("Before : %d -> After : %d" % (before_count, after_count))
        # print("================================================================================================================")
        self.sentence_offsets = offsets
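
# Minimal usage sketch for ACEParser (not part of the original snippet). The
# file names are placeholders and assume the usual ACE 2005 layout in which
# each .sgm source document has a matching .apf.xml annotation file.
if __name__ == "__main__":
    ace = ACEParser()
    doc_text = ace.get_text("example.sgm")        # strips tags, caches sentence offsets
    ace.create_tree("example.apf.xml")            # parses the APF annotation XML
    ace.fix_offsets(ace.get_extents())            # merges sentences that split an annotated extent
    ace.create_json_output(doc_text, "example")   # one DataFrame row per sentence
    print(ace.df.head())
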
def main(args):
    if len(args) < 2:
        sys.stderr.write('Required arguments: <input dir> <output dir>\n')
        sys.exit(-1)

    sent_tokenizer = PunktSentenceTokenizer()
    sentence_lookahead = 0

    # get all .txt files from the chqa directory:
    txt_files = glob.glob(join(args[0], '*.txt'))
    rel_out = open(join(args[1], 'ade-all-relations.flair'), 'w')

    for txt_fn in txt_files:
        ann_fn = txt_fn[:-3] + 'ann'
        if not isfile(ann_fn):
            continue
        print('Processing file %s which has corresponding file %s' % (txt_fn, ann_fn))
        with open(txt_fn, 'r') as myfile:
            text = myfile.read()
        ents, rels = read_brat_file(ann_fn)
        sent_spans = list(sent_tokenizer.span_tokenize(text))
        for sent_ind in range(len(sent_spans)):
            primary_sent_span = sent_spans[sent_ind]
            end_window_ind = min(sent_ind + sentence_lookahead, len(sent_spans) - 1)
            end_sent_span = sent_spans[end_window_ind]
            sent = text[primary_sent_span[0]:end_sent_span[1]].replace('\n', ' ')
            drug_ents, att_ents = get_span_ents(primary_sent_span, end_sent_span, ents)
            for att_ent in att_ents:
                for drug_ent in drug_ents:
                    ## Make sure one of the ents is in the first sentence
                    ## (otherwise we'll get to it later)
                    if att_ent.start > primary_sent_span[1] and drug_ent.start > primary_sent_span[1]:
                        continue
                    label = get_label(rels, ents, att_ent, drug_ent)

                    ## Get index of ents into sent:
                    a1_start = att_ent.start - primary_sent_span[0]
                    a1_end = att_ent.end - primary_sent_span[0]
                    a2_start = drug_ent.start - primary_sent_span[0]
                    a2_end = drug_ent.end - primary_sent_span[0]

                    if a1_start < a2_start:
                        # arg1 occurs before arg2
                        rel_text = (sent[:a1_start] +
                                    " %sStart %s %sEnd " % (att_ent.cat, sent[a1_start:a1_end], att_ent.cat) +
                                    sent[a1_end:a2_start] +
                                    " DrugStart %s DrugEnd " % (sent[a2_start:a2_end]) +
                                    sent[a2_end:])
                    else:
                        rel_text = (sent[:a2_start] +
                                    " DrugStart %s DrugEnd " % (sent[a2_start:a2_end]) +
                                    sent[a2_end:a1_start] +
                                    " %sStart %s %sEnd " % (att_ent.cat, sent[a1_start:a1_end], att_ent.cat) +
                                    sent[a1_end:])

                    ## lookup flair classification format
                    rel_out.write('__label__%s %s \n' % (label, rel_text))

    rel_out.close()
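
# Hedged sketch (not in the original code): each line written to
# ade-all-relations.flair above follows the fastText-style text-classification
# format that flair readers expect, i.e. "__label__<LABEL> <marked-up sentence>";
# the actual label names come from get_label() and are not shown here. The
# helper below only inspects the label distribution of a generated file.
from collections import Counter

def label_counts(flair_file='ade-all-relations.flair'):
    counts = Counter()
    with open(flair_file) as f:
        for line in f:
            if line.startswith('__label__'):
                # first token is "__label__<LABEL>"
                counts[line.split()[0][len('__label__'):]] += 1
    return counts
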
def sent_pos(in_dir):
    """ Positions of citation markers within sentences, relative to where the sentence occurs in the document. """

    punkt_param = PunktParameters()
    abbreviation = ['al', 'fig', 'e.g', 'i.e', 'eq', 'cf', 'ref', 'refs']
    punkt_param.abbrev_types = set(abbreviation)
    tokenizer = PunktSentenceTokenizer(punkt_param)

    with open('hedge_words') as f:
        hedge_words = [l.strip() for l in f.readlines()]

    x = []
    y = []
    file_names = os.listdir(in_dir)
    buckets = []
    for foo in range(10):
        buckets.append([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

    for file_idx, fn in enumerate(file_names):
        if file_idx % 100 == 0:
            print('{}/{}'.format(file_idx, len(file_names)))
        path = os.path.join(in_dir, fn)
        aid, ext = os.path.splitext(fn)
        if ext != '.txt' or aid == 'log':
            continue
        with open(path) as f:
            text = f.read()
        text = re.sub(E_G_PATT, 'e.g.', text)

        # annot_fn = '{}_annot.json'.format(aid)
        # annot_path = os.path.join(in_dir, annot_fn)
        # if not os.path.isfile(annot_path):
        #     continue
        # with open(annot_path) as f:
        #     annots = json.load(f)

        marker = ' \u241F '
        doc_len = len(text)

        # ↓ word wise
        for sent_idx, sent_edx in tokenizer.span_tokenize(text):
            sentence_orig = text[sent_idx:sent_edx]
            sentence = re.sub(CITE_MULTI_PATT, marker, sentence_orig)
            sentence = re.sub(QUOTE_PATT, ' {}.'.format(marker), sentence)

            # determine contained annotations
            # annotated_words = []
            # for annot in annots:
            #     start = annot[0]
            #     end = annot[1]
            #     dbp_id = annot[2]
            #     annot_len = end - start
            #     in_sent_idx = start - sent_idx
            #     if start >= sent_idx and end <= sent_edx:
            #         disp = sentence_orig[in_sent_idx:in_sent_idx+annot_len]
            #         annotated_words.append(disp)

            if marker in sentence:
                doc_pos = 1 - (sent_idx / doc_len)
                buck_y_idx = math.floor(doc_pos * 10)
                if buck_y_idx == 10:
                    buck_y_idx = 9
                words = pos_tag(sentence.split())
                words = [w for w in words if re.search(r'[\w|\u241F]', w[0])]
                sent_len = len(words)
                sent_tags_str = ' '.join([tup[1] for tup in words])
                indices = [
                    i for i, tup in enumerate(words)
                    if tup[0] == marker.strip()
                ]
                # if 'JJS' not in sent_tags_str:
                #     continue
                for word_idx in indices:
                    word = words[word_idx][0]
                    # if word == marker.strip() and \
                    #         words[word_idx-1][1] == 'IN':
                    # if word == marker.strip() and \
                    #         ((word_idx > 0 and \
                    #           'FORMULA' not in words[word_idx-1][0] and \
                    #           words[word_idx-1][1] in ['NNP', 'NNPS']) or \
                    #          (word_idx > 1 and \
                    #           words[word_idx-1][1] in ['NN', 'NNS'] and \
                    #           'FORMULA' not in words[word_idx-2][0] and \
                    #           words[word_idx-2][1] in ['NNP', 'NNPS'])):
                    # if word == marker.strip() and \
                    #         (word_idx > 0 and \
                    #          words[word_idx-1][0] in annotated_words and \
                    #          words[word_idx-1][1] in ['NNP', 'NNPS']):
                    # if word == marker.strip() and \
                    #         word_idx+1 < len(words) and \
                    #         'VB' in words[word_idx+1][1]:
                    if word == marker.strip():
                        # print(words)
                        # print('doc pos: {}'.format((sent_idx/doc_len)))
                        # print('sent pos: {}/{}'.format((word_idx+1), sent_len))
                        # input()
                        sent_pos = (word_idx + 1) / sent_len
                        y.append(doc_pos)
                        x.append(sent_pos)
                        buck_x_idx = math.floor(sent_pos * 10)
                        if buck_x_idx == 10:
                            buck_x_idx = 9
                        buckets[buck_y_idx][buck_x_idx] += 1
        # if file_idx > 1000:
        #     break

        # # ↓ character wise
        # for sent_idx, sentence in enumerate(sentences):
        #     # has_hw = False
        #     # for hw in hedge_words:
        #     #     if hw in sentence:
        #     #         has_hw = True
        #     #         break
        #     # if not has_hw:
        #     #     continue
        #     sent_len = len(sentence)
        #     doc_pos = 1 - (sent_idx/doc_len)
        #     buck_y_idx = math.floor(doc_pos*10)
        #     if buck_y_idx == 10:
        #         buck_y_idx = 9
        #     for cit_mark in re.finditer(marker, sentence):
        #         cm_idx = cit_mark.end()
        #         sent_pos = cm_idx/sent_len
        #         y.append(doc_pos)
        #         x.append(sent_pos)
        #         buck_x_idx = math.floor(sent_pos*10)
        #         if buck_x_idx == 10:
        #             buck_x_idx = 9
        #         buckets[buck_y_idx][buck_x_idx] += 1

    print('normalized row distributions:')
    for line in buckets:
        print(' '.join(['{:.2f}'.format(x / sum(line)) for x in line]))

    plt.xlabel('citation marker position in sentence')
    plt.ylabel('sentence position in document')
    heatmap, xedges, yedges = np.histogram2d(x, y, bins=(50))
    extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
    plt.imshow(heatmap.T, extent=extent, origin='lower', norm=LogNorm())
    # plt.imshow(heatmap.T, extent=extent, origin='lower')
    plt.colorbar()
    plt.show()
    plt.clf()

    plt.xlabel('citation marker position in sentence')
    plt.ylabel('sentence position in document')
    heatmap, xedges, yedges = np.histogram2d(x, y, bins=(50))
    extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
    # plt.imshow(heatmap.T, extent=extent, origin='lower', norm=LogNorm())
    plt.imshow(heatmap.T, extent=extent, origin='lower')
    plt.colorbar()
    plt.show()
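
# Hedged helper sketch (not in the original code): the repeated bucketing
# pattern above maps a relative position in [0, 1] onto one of ten bins and
# clamps a value of exactly 1.0 into the last bin.
import math

def to_bucket(pos, n_bins=10):
    idx = math.floor(pos * n_bins)
    return n_bins - 1 if idx == n_bins else idx

assert to_bucket(0.0) == 0
assert to_bucket(0.37) == 3
assert to_bucket(1.0) == 9
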
def main(args):
    args = parser.parse_args()

    # Loading classifier model:
    print("Loading classifier model")
    classifier = TextClassifier.load_from_file(join(args.model_dir, 'best-model.pt'))

    txt_files = glob.glob(join(args.data_dir, '*.txt'))
    sent_splitter = PunktSentenceTokenizer()
    tokenizer = TreebankWordTokenizer()
    sentence_lookahead = 0

    for txt_fn in txt_files:
        print("Processing %s" % (txt_fn))
        ann_input_fn = join(args.data_dir, basename(txt_fn)[:-3] + 'ann')
        ents, _ = read_brat_file(ann_input_fn)
        ann_output_fn = join(args.output_dir, basename(txt_fn)[:-3] + 'ann')
        with open(txt_fn, 'r') as myfile:
            text = myfile.read()

        ann_out = open(ann_output_fn, 'w')

        # Write entities right away:
        for ent_id in ents.keys():
            ent = ents[ent_id]
            ent_text = text[ent.start:ent.end].replace('\n', ' ')
            ann_out.write('%s\t%s %d %d\t%s\n' % (ent_id, ent.cat, ent.start, ent.end, ent_text))

        sent_spans = list(sent_splitter.span_tokenize(text))
        rel_ind = 0
        rel_attempts = 0
        for sent_ind in range(len(sent_spans)):
            primary_sent_span = sent_spans[sent_ind]
            end_window_ind = min(sent_ind + sentence_lookahead, len(sent_spans) - 1)
            end_sent_span = sent_spans[end_window_ind]
            sent = text[primary_sent_span[0]:end_sent_span[1]].replace('\n', ' ')
            drug_ents, att_ents = get_span_ents(primary_sent_span, end_sent_span, ents)
            for att_ent in att_ents:
                for drug_ent in drug_ents:
                    ## Get index of ents into sent:
                    a1_start = att_ent.start - primary_sent_span[0]
                    a1_end = att_ent.end - primary_sent_span[0]
                    a1_text = sent[a1_start:a1_end]
                    a2_start = drug_ent.start - primary_sent_span[0]
                    a2_end = drug_ent.end - primary_sent_span[0]
                    a2_text = sent[a2_start:a2_end]

                    if a1_start < a2_start:
                        # arg1 occurs before arg2
                        rel_text = (sent[:a1_start] +
                                    " %sStart %s %sEnd " % (att_ent.cat, a1_text, att_ent.cat) +
                                    sent[a1_end:a2_start] +
                                    " DrugStart %s DrugEnd" % (a2_text) +
                                    sent[a2_end:])
                    else:
                        rel_text = (sent[:a2_start] +
                                    " DrugStart %s DrugEnd " % (a2_text) +
                                    sent[a2_end:a1_start] +
                                    " %sStart %s %sEnd " % (att_ent.cat, a1_text, att_ent.cat) +
                                    sent[a1_end:])

                    # if att_ent.cat == 'Dosage':
                    #     print("working with Dosage ent")

                    sentence = Sentence(rel_text, use_tokenizer=True)
                    labels = classifier.predict(sentence)[0].labels
                    if len(labels) > 1:
                        print(' This relation has more than one output label')
                    label = labels[0].value
                    # print("Comparing ent %s and ent %s and got %s" % (att_ent.id, drug_ent.id, label))
                    rel_attempts += 1
                    if not label == 'None':
                        # Make sure label corresponds to entity type:
                        if label.find(att_ent.cat) < 0:
                            # print("  Skipping found relation where label %s doesn't match arg type %s" % (label, att_ent.cat))
                            continue
                        ann_out.write('R%d\t%s Arg1:%s Arg2:%s\n' % (rel_ind, label, att_ent.id, drug_ent.id))
                        rel_ind += 1

        # print("Finished: Found %d relations while making %d classification attempts" % (rel_ind, rel_attempts))
        ann_out.close()
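
# Hedged illustration (not in the original code): the ann_out lines above are
# brat standoff format. Entity lines look like "T<id>\t<type> <start> <end>\t<text>"
# and relation lines like "R<idx>\t<label> Arg1:<att_id> Arg2:<drug_id>". The
# IDs and the label in the assertion below are made up for the example.
def format_relation_line(rel_ind, label, att_id, drug_id):
    return 'R%d\t%s Arg1:%s Arg2:%s\n' % (rel_ind, label, att_id, drug_id)

assert format_relation_line(0, 'Dosage-Drug', 'T2', 'T1') == 'R0\tDosage-Drug Arg1:T2 Arg2:T1\n'
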