def main(args):
    args = parser.parse_args()

    tagger = SequenceTagger.load_from_file(
        join(args.model_dir, 'best-model.pt'))

    txt_files = glob.glob(join(args.data_dir, '*.txt'))

    sent_splitter = PunktSentenceTokenizer()
    #tokenizer = TreebankWordTokenizer()

    for txt_fn in txt_files:
        print("Processing %s" % (basename(txt_fn)))
        ann_fn = join(args.output_dir, basename(txt_fn)[:-3] + 'ann')
        with open(txt_fn, 'r') as myfile:
            text = myfile.read()

        ann_out = open(ann_fn, 'w')
        ent_id = 0

        sent_spans = sent_splitter.span_tokenize(text)

        raw_offset = 0
        for sent_span in sent_spans:
            raw_offset = sent_span[0]
            sent = text[sent_span[0]:sent_span[1]]
            #tokens = tokenizer.tokenize(sent)
            # tagged = pos_tag(tokens)
            #flair_sent = Sentence(' '.join(tokens))
            flair_sent = Sentence(sent, use_tokenizer=True)
            ade_tagged = tagger.predict(flair_sent)

            cmap = raw_flair_charmap(sent, flair_sent.to_tokenized_string())
            #print('Sent is %s' % (sent) )
            #print(ade_tagged[0].to_tagged_string())

            # Check for annotated drugs:
            drug_found = False
            for entity in ade_tagged[0].to_dict(tag_type='ner')['entities']:
                if entity['type'] == 'Drug':
                    drug_found = True
                    break

            if drug_found or not args.conservative:
                for entity in ade_tagged[0].to_dict(
                        tag_type='ner')['entities']:
                    start = entity['start_pos']
                    end = entity['end_pos']

                    raw_start = start  #cmap[start]
                    raw_end = end  #cmap[end]

                    #                print('Mapped entity type %s(%s):(%d, %d) => (%d, %d)' % (entity['type'], entity['text'], start, end, raw_offset+raw_start, raw_offset+raw_end) )
                    ann_out.write(
                        'T%d\t%s %d %d\t%s\n' %
                        (ent_id, entity['type'], raw_offset + raw_start,
                         raw_offset + raw_end, entity['text']))
                    ent_id += 1

            raw_offset += len(sent) + 1

        ann_out.close()
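The function above assumes a module-level argparse parser exposing model_dir, data_dir, output_dir, and a conservative flag (all of which are read from args). A minimal sketch of that setup, with argument names inferred from the function body rather than taken from the original script:

import argparse

# Hypothetical reconstruction of the parser referenced in main(); the real
# script may use different flag names or defaults.
parser = argparse.ArgumentParser(
    description='Tag *.txt files with a flair model and write brat .ann output')
parser.add_argument('model_dir', help='directory containing best-model.pt')
parser.add_argument('data_dir', help='directory with the input .txt files')
parser.add_argument('output_dir', help='directory for the generated .ann files')
parser.add_argument('--conservative', action='store_true',
                    help='only emit entities for sentences containing a Drug mention')

if __name__ == '__main__':
    main(parser.parse_args())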
Example #2
    def __get_setences_boundaries(self):
        """
        function to tokenize sentences and return
        sentence boundaries of each sentence using a tokenizer.
        :return:
        """

        tokenizer = PunktSentenceTokenizer(punkt_param)
        sentences = list(tokenizer.span_tokenize(self.text))
        return sentences
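punkt_param is not defined in this snippet; elsewhere on this page it is a PunktParameters object carrying custom abbreviations, so presumably something similar is set up in the same class. A small sketch (an assumption, not the original code):

from nltk.tokenize.punkt import PunktParameters, PunktSentenceTokenizer

# Build Punkt parameters with a few known abbreviations so that, e.g.,
# "Dr." does not end a sentence.
punkt_param = PunktParameters()
punkt_param.abbrev_types = set(['dr', 'mr', 'mrs', 'prof', 'inc'])

tokenizer = PunktSentenceTokenizer(punkt_param)
print(list(tokenizer.span_tokenize("Dr. Smith arrived. He was late.")))
# expected: two (start, end) spans, one per sentence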
Example #3
 def analyse_entry(self, entry, params):
     chunker_type = params["delimiter"]
     original_text = entry['nif:isString']
     if chunker_type == "sentence":
         tokenizer = PunktSentenceTokenizer()
     if chunker_type == "paragraph":
         tokenizer = LineTokenizer()
     chars = list(tokenizer.span_tokenize(original_text))
     for i, chunk in enumerate(tokenizer.tokenize(original_text)):
         print(chunk)
         e = Entry()
         e['nif:isString'] = chunk
         if entry.id:
             e.id = entry.id + "#char={},{}".format(chars[i][0], chars[i][1])
         yield e
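A quick comparison of the two delimiter choices handled above; both tokenizers expose span_tokenize, which is what the plugin relies on (a small sketch):

from nltk.tokenize import PunktSentenceTokenizer, LineTokenizer

text = "One sentence. Another sentence.\nA second paragraph on its own line."
print(list(PunktSentenceTokenizer().span_tokenize(text)))  # sentence-level spans
print(list(LineTokenizer().span_tokenize(text)))           # one span per line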
def split_into_sentences(text):
    # splits the text into sentences and also preserves the corresponding starting and ending indices
    startIndices = []
    endIndices = []
    corpus = []
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(
        ['dr', 'doc', 'mr', 'mrs', 'prof', 'inc', 'mgr', 'ing', 'st'])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)

    for start, end in sentence_splitter.span_tokenize(text):
        startIndices.append(start)
        endIndices.append(end)
        token = text[start:end]
        corpus.append(token)
    return startIndices, endIndices, corpus
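A minimal check that the indices returned by split_into_sentences line up with the returned sentences (each sentence is sliced directly from the input, so the spans are exact):

text = "Mr. Novak met Dr. Brown. They talked for an hour."
starts, ends, sentences = split_into_sentences(text)
for s, e, sent in zip(starts, ends, sentences):
    # each returned sentence is exactly the corresponding slice of the input
    assert sent == text[s:e]
print(sentences)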
Example #5
 def analyse_entry(self, entry, activity):
     yield entry
     chunker_type = activity.params["delimiter"]
     original_text = entry['nif:isString']
     if chunker_type == "sentence":
         tokenizer = PunktSentenceTokenizer()
     if chunker_type == "paragraph":
         tokenizer = LineTokenizer()
     chars = list(tokenizer.span_tokenize(original_text))
     if len(chars) == 1:
         # This sentence was already split
         return
     for i, chunk in enumerate(chars):
         start, end = chunk
         e = Entry()
         e['nif:isString'] = original_text[start:end]
         if entry.id:
             e.id = entry.id + "#char={},{}".format(start, end)
         yield e
Example #6
 def analyse_entry(self, entry, params):
     yield entry
     chunker_type = params["delimiter"]
     original_text = entry['nif:isString']
     if chunker_type == "sentence":
         tokenizer = PunktSentenceTokenizer()
     if chunker_type == "paragraph":
         tokenizer = LineTokenizer()
     chars = list(tokenizer.span_tokenize(original_text))
     if len(chars) == 1:
         # This sentence was already split
         return
     for i, chunk in enumerate(chars):
         start, end = chunk
         e = Entry()
         e['nif:isString'] = original_text[start:end]
         if entry.id:
             e.id = entry.id + "#char={},{}".format(start, end)
         yield e
Example #7
def sent_tokenize(data, filter_threshold=None):
    '''
    Tokenizes a string into sentences and corresponding offsets

    Args:
        data(str): The document itself
        filter_threshold(int): sentences shorter than this are meant to
            be ignored (the parameter is not used in this version)

    Returns:
        tuple(list(str), list(list))): tokenized
            sentences and corresponding offsets
    '''
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(
        ['dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'et', 'al', 'Fig', 'fig'])
    sent_detector = PunktSentenceTokenizer(punkt_param)
    sentences = sent_detector.tokenize(data)
    offsets = sent_detector.span_tokenize(data)
    return (sentences, offsets)
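A small usage sketch for the function above. In NLTK 3, span_tokenize returns a generator, so the offsets are materialized before being reused:

sentences, offsets = sent_tokenize("Dr. Smith spoke first. Then Prof. Jones replied.")
offsets = list(offsets)
for sent, (start, end) in zip(sentences, offsets):
    print(start, end, sent)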
def marker_surr_patt(in_dir):
    """ Find most frequent POS tag patterns at the end of sentences.
    """

    punkt_param = PunktParameters()
    abbreviation = ['al', 'fig', 'e.g', 'i.e', 'eq', 'cf', 'ref', 'refs']
    punkt_param.abbrev_types = set(abbreviation)
    tokenizer = PunktSentenceTokenizer(punkt_param)

    file_names = os.listdir(in_dir)
    patt_comb_freq_map = {}
    patt_orig_freq_map = {}
    patt_comb_freq_map_cit = {}
    patt_orig_freq_map_cit = {}
    # num_sentences_total = 0
    for file_idx, fn in enumerate(file_names):
        if file_idx % 100 == 0:
            print('{}/{}'.format(file_idx, len(file_names)))
            # print(num_sentences_total/(file_idx+1))
        path = os.path.join(in_dir, fn)
        aid, ext = os.path.splitext(fn)
        if ext != '.txt' or aid == 'log':
            continue

        if re.search(r'[a-z]', aid):
            split = re.search(r'[a-z][0-9]', aid).span()[0] + 1
            aid = aid[:split] + '/' + aid[split:]

        with open(path) as f:
            text = f.read()
        text = re.sub(E_G_PATT, 'e.g.', text)

        if not re.search(CITE_MULTI_PATT, text):
            continue

        marker = ' \u241F '
        doc_len = len(text)
        for sent_idx, sent_edx in tokenizer.span_tokenize(text):
            cit_end = False
            sentence_orig = text[sent_idx:sent_edx]
            sentence = re.sub(CITE_MULTI_PATT, marker, sentence_orig)
            sentence = re.sub(QUOTE_PATT, ' {}.'.format(marker), sentence)
            words = pos_tag(sentence.split())
            words = [w for w in words if re.search(r'[\w|\u241F]', w[0])]
            if len(words) == 0:
                continue
            if words[-1][0] == marker.strip():
                cit_end = True
                words = words[:-1]
            if len(words) < 3:
                continue
            # num_sentences_total += 1
            sent_len = len(words)
            patt_comb = [None, None, None, None]
            patt_orig = [None, None, None, None]
            for x_idx in range(sent_len - 3, sent_len + 1):
                patt_idx = x_idx - (sent_len - 3)
                if x_idx < 0 or \
                        x_idx >= len(words):
                    patt_comb[patt_idx] = '<EOS>'
                    patt_orig[patt_idx] = '<EOS>'
                    continue
                wrd = words[x_idx][0]
                pos = words[x_idx][1]
                patt_orig[patt_idx] = pos
                if 'V' in pos:
                    patt_comb[patt_idx] = 'V'
                elif pos in ['NN', 'NNS']:
                    patt_comb[patt_idx] = 'NN'
                elif pos in ['NNP', 'NNPS']:
                    patt_comb[patt_idx] = 'NNP'
                elif pos == 'IN':
                    patt_comb[patt_idx] = 'IN'
                elif 'JJ' in pos:
                    patt_comb[patt_idx] = 'JJ'
                elif 'W' in pos:
                    patt_comb[patt_idx] = 'WH'
                elif 'RB' in pos:
                    patt_comb[patt_idx] = 'ADV'
                elif 'PR' in pos:
                    patt_comb[patt_idx] = 'PR'
                elif wrd == 'FORMULA':
                    patt_comb[patt_idx] = 'FORMULA'
                elif wrd == 'FIGURE':
                    patt_comb[patt_idx] = 'FIGURE'
                elif wrd == 'TABLE':
                    patt_comb[patt_idx] = 'TABLE'
                else:
                    patt_comb[patt_idx] = 'OTHER'
            comb_id = '¦'.join(patt_comb)
            orig_id = '¦'.join(patt_orig)
            if comb_id not in patt_comb_freq_map:
                patt_comb_freq_map[comb_id] = 0
            patt_comb_freq_map[comb_id] += 1
            if cit_end:
                if comb_id not in patt_comb_freq_map_cit:
                    patt_comb_freq_map_cit[comb_id] = 0
                patt_comb_freq_map_cit[comb_id] += 1

            if orig_id not in patt_orig_freq_map:
                patt_orig_freq_map[orig_id] = 0
            patt_orig_freq_map[orig_id] += 1
            if cit_end:
                if orig_id not in patt_orig_freq_map_cit:
                    patt_orig_freq_map_cit[orig_id] = 0
                patt_orig_freq_map_cit[orig_id] += 1
                if orig_id == 'RB¦JJ¦NNS¦<EOS>':
                    print(sentence)
                    print(fn)
                    input()
        # if file_idx > 200:
        #    break

    patt_comb_freq = sorted(patt_comb_freq_map.items(),
                            key=operator.itemgetter(1),
                            reverse=True)
    patt_orig_freq = sorted(patt_orig_freq_map.items(),
                            key=operator.itemgetter(1),
                            reverse=True)
    patt_comb_freq_cit = sorted(patt_comb_freq_map_cit.items(),
                                key=operator.itemgetter(1),
                                reverse=True)
    patt_orig_freq_cit = sorted(patt_orig_freq_map_cit.items(),
                                key=operator.itemgetter(1),
                                reverse=True)
    print('- - - C O M B - - -')
    for pid in patt_comb_freq[:25]:
        print(pid)
    print('- - - O R I G - - -')
    for pid in patt_orig_freq[:25]:
        print(pid)

    with open('sentence_comb.json', 'w') as f:
        json.dump(patt_comb_freq, f)
    with open('sentence_orig.json', 'w') as f:
        json.dump(patt_orig_freq, f)
    with open('marker_comb.json', 'w') as f:
        json.dump(patt_comb_freq_cit, f)
    with open('marker_orig.json', 'w') as f:
        json.dump(patt_orig_freq_cit, f)
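Each JSON file written above holds a frequency-sorted list of (pattern, count) pairs, so inspecting the most common patterns afterwards is straightforward (a sketch using the file names from the function):

import json

with open('sentence_comb.json') as f:
    patt_comb_freq = json.load(f)  # list of [pattern, count] pairs
for pattern, count in patt_comb_freq[:5]:
    print('{}\t{}'.format(pattern, count))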
Example #10
def sent_tokenize(data):
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(
        ['dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'et', 'al', 'Fig', 'fig'])
    sent_detector = PunktSentenceTokenizer(punkt_param)
    sentences = sent_detector.tokenize(data)
    offsets = sent_detector.span_tokenize(data)
    new_sentences = deepcopy(sentences)
    new_offsets = deepcopy(offsets)
    for i, off in enumerate(offsets):
        if len(tokenizer.tokenize(sentences[i])) < 7:  # Skip short sentences
            pass
        else:
            if i < len(offsets) - 1:
                if ((offsets[i + 1][0] - offsets[i][1]) < 5):
                    new_sentences.append(sentences[i] + ' ' + sentences[i + 1])
                    new_offsets.append((offsets[i][0], offsets[i + 1][1]))
            if i < len(offsets) - 2:
                if ((offsets[i + 2][0] - offsets[i + 1][1]) < 5) and\
                        ((offsets[i + 1][0] - offsets[i][0]) < 5):
                    new_sentences.append(sentences[i] + ' ' +
                                         sentences[i + 1] + ' ' +
                                         sentences[i + 2])
                    new_offsets.append((offsets[i][0], offsets[i + 2][1]))
    #         (further commented-out variants extended the same merging
    #          pattern to windows of 4, 5, and 6 adjacent sentences)
    print(new_offsets)
    return {'sentences': new_sentences, 'offsets': new_offsets}
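The length check above calls tokenizer.tokenize(...) on a word-level tokenizer that is defined elsewhere in the module; a typical choice, echoing the commented-out import in the first example, might be the following (an assumption, not the original code):

from nltk.tokenize import TreebankWordTokenizer

# Assumed word tokenizer backing the len(tokenizer.tokenize(...)) < 7 check.
tokenizer = TreebankWordTokenizer()
print(len(tokenizer.tokenize("This sentence has more than seven words in it.")))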
Example #11
def resolve_phrases(section, tokens, book, id):
    phrases = []
    sentences = []
    # find and resolve parentheses
    if book == "almizan_fa":
        if int(id.split("_")[0]) <= 2:
            html = section.html()
            replace = lambda start, end, oldtext, newtext: oldtext[:start] + newtext + oldtext[end:]

            # in chapter 1, remove parentheses for ayas
            iter = re.finditer(r"(<span[^\n]*>)[ ]*\(([^\)s]*)\)[^\)]*(</span[^\n]*>)", html)
            for m in reversed(list(iter)):
                html = replace(m.start(), m.end(), html, m.group().replace("(", "").replace(")", ""))

            iter = re.finditer(r"\([^\)]{3,15}\)", html)
            for match in reversed(list(iter)):
                m = match.group()[1:-1]
                resolved = resolve_phrase(m, tokens, book[-2:])
                if resolved:
                    html = replace(match.start(), match.end(), html, '<em rel="{0}">{1}</em>'.format(resolved[0], m))

            section.html(html)

    pst = PunktSentenceTokenizer()
    # resolve em elements
    for em in section.find("em").items():
        resolved = resolve_phrase(em.text(), tokens, book[-2:])
        if resolved:
            em.attr("rel", resolved[0])
            phrases.append((em.text(), resolved[1], resolved[0]))
            paragraph = em.parent().html(method="html")
            for start, end in pst.span_tokenize(paragraph):
                if paragraph[start:end].find(em.outerHtml()) != -1:
                    this_sentence = paragraph[start:end].lstrip()
                    this_sentence = refine_sentence(this_sentence)

                    while this_sentence.startswith("<code"):
                        if this_sentence.find("</code>") != -1:
                            new_start = this_sentence.find("</code>") + 7
                        this_sentence = this_sentence[new_start:].lstrip()

                    this_sentence = refine_sentence(this_sentence)

                    while this_sentence.startswith("<span"):
                        new_start = this_sentence.find("</span>") + 7
                        this_sentence = this_sentence[new_start:].lstrip()

                    this_sentence = refine_sentence(this_sentence)

                    before = this_sentence.index(em.outerHtml())
                    after = len(this_sentence) - len(em.outerHtml()) - before
                    em.attr("data-sentence", "{0}:{1}".format(before, after))
                    break
            sentences.append((em.text(), resolved[0], (before, after), [this_sentence]))
        else:
            phrases.append((em.text(),))

    new_section = section.html(method="html")
    p = re.compile(r'<em rel="([^"]+)" data-sentence="([^"]+)">([^<]+)<\/em>')
    matched_list = [(m.start(0), m.end(0), m.group(1), m.group(2)) for m in re.finditer(p, new_section)]
    last_start = -1
    for matched in reversed(matched_list):
        start_span = matched[0] - int(matched[3].split(":")[0])
        end_span = matched[1] + int(matched[3].split(":")[1])
        if start_span != last_start:
            new_section = (
                new_section[:start_span]
                + '<span class="phrase">'
                + new_section[start_span:end_span]
                + "</span>"
                + new_section[end_span:]
            )
            last_start = start_span

    section.html(new_section)
    return phrases, sentences
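A minimal sketch of the sentence-location step used above: Punkt spans over the paragraph's HTML are scanned until one contains the em element's outer HTML.

from nltk.tokenize import PunktSentenceTokenizer

pst = PunktSentenceTokenizer()
paragraph = 'Intro text. The phrase <em rel="x">foo</em> appears here. More text.'
needle = '<em rel="x">foo</em>'
for start, end in pst.span_tokenize(paragraph):
    if needle in paragraph[start:end]:
        print(paragraph[start:end])  # the sentence containing the phrase
        break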
class Textualizer:
    """The Textualzer class."""
    def __init__(self, abbrev=['dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'i.e']):
        """Initialize Textualizer.

        Usually, you need to create only one textualizer in your script.

        Args:
            abbrev (list): List of abbreviations
        """
        punkt = PunktParameters()
        punkt.abbrev_types = set(abbrev)
        self.tokenizer = PunktSentenceTokenizer(punkt)

    def find_sentences(self, par):
        """Finding sentences from paragraph using nltk.

        Args:
            par (Paragraph): The input paragraph

        Returns:
            A list of sentences.
        """
        text = ''.join([w.text for w in par.words])

        word_iter = iter(par.words)
        word = next(word_iter)
        sent_ls = []

        for i, (b, e) in enumerate(self.tokenizer.span_tokenize(text)):
            _id = 's-' + par._id[2:] + '-' + str(i)
            sent = Sentence(_id, par.sec_name, par.box_name, text[b:e], [])

            sent_ls.append(sent)

            while word is not None and word.start < b:
                word = next(word_iter)

            while word is not None and word.start < e:
                if word._id is not None:
                    sent.words.append(word._id)

                try:
                    word = next(word_iter)
                except StopIteration:
                    word = None

        return sent_ls

    def textualize(self, doc, remove_pos=True):
        """Textualize the document.

        The main functions of this method are summarized as follows:

            1. Add sentence information to Document.tree (i.e., XML)
            2. Put the list of sentences into Document.sentences
            3. Provide additional attributes (Document.words, Document.sentences)

        Additionally, some normalization, e.g., breaking ligatures, is applied.

        For compatibility, it adds the following attributes for now:

            - Document.maths
            - Document.cites

        These attributes should be added by other modules in the future.

        Args:
            doc (Document): The input document
            remove_pos (bool): Whether to remove positions or not
        """

        # basic variables
        ligatures = {
            '\ufb00': 'ff',
            '\ufb01': 'fi',
            '\ufb02': 'fl',
            '\ufb03': 'ffi',
            '\ufb04': 'ffl',
            '\ufb05': 'st',
            '\ufb06': 'st',
        }
        ns = {'x': 'http://www.w3.org/1999/xhtml'}
        end_token = re.compile(r'[?!.]$')
        tag_token = re.compile(r'(?<!\s)-$')

        pars = {}
        par_ls = []
        last_par = None
        ref_pars = []
        cite = None
        math = None
        ignore_math = False
        math_ls = []
        word_nodes = {}
        cites = {}

        for sec in doc.tree.xpath('x:body/x:div', namespaces=ns):
            sec_id = sec.get('id')
            sec_name = sec.get('data-name')

            for box in sec.xpath('x:div', namespaces=ns):
                box_name = box.get('data-name')

                for par in box.xpath('x:p', namespaces=ns):
                    math_par = False
                    par_id = par.get('id')
                    page_id = int(par.get('data-page'))

                    # standalone equations should continue from last par
                    if box_name == 'Equation' and last_par:
                        p = pars[last_par]
                        # TODO: a bit different from the original; please check
                        # print(len(p.words))

                        if not end_token.match(p.words[-1].text):
                            continued_from_id = last_par
                            math_par = True

                    else:
                        continued_from_id = par.get('data-continued-from')

                    # if not continued_from_id or par_ls:
                    #    text = '\n\n'
                    #    par_ls[-1].words.append(Word(None, text))

                    if not continued_from_id and box_name == 'Reference':
                        ref_pars.append(par_id)

                    nodes = list(par)
                    tmp_ref = nodes[0].get('data-refid', False)
                    if tmp_ref and tmp_ref != nodes[0].get('id'):
                        del nodes[0]

                    if continued_from_id:
                        par = pars[continued_from_id]
                    else:
                        par = Paragraph(sec_id, par_id, sec_name, box_name, [],
                                        [])
                        par_ls.append(par)

                    pars[par_id] = par

                    for node in filter(
                            lambda n: n.get('data-refid') is None or n.get(
                                'id') == n.get('data-refid'), nodes):

                        sp_val = node.get('data-space')
                        if sp_val == 'nospace' or (
                                sp_val == 'bol' and
                            (not par.words
                             or tag_token.search(par.words[-1].text))):
                            space = None
                        else:
                            space = Word(None, ' ')

                        text = node.get('data-fullform') or node.text or ''

                        text = re.sub(r'\s+', ' ', text)
                        text = ''.join([
                            ligatures[c] if ligatures.get(c, False) else c
                            for c in text
                        ])

                        _id = node.get('id')
                        word_nodes[_id] = node

                        # inside a citation; skip everything
                        if cite:
                            math = None
                            space = None
                            word = Word(_id, '')
                            if cite == node.get('id'):
                                cite = None

                        # starting a citation; make a dummy word
                        elif node.get('data-cite-end', False):
                            if math:
                                par.words = par.backup_words
                                math = None
                                ignore_math = True

                            cite = node.get('data-cite-end')
                            cids = node.get('data-cite-id').split(',')
                            text = ', '.join(map(lambda c: 'CITE-' + c, cids))
                            cites.update({c: _id for c in cids})
                            word = Word(_id, text)

                        # starting an equation
                        elif not ignore_math and (
                                node.get('data-math') == 'B-Math' or
                            (node.get('data-math') == 'I-Math' and not math) or
                            (math_par and not math)):
                            par.backup_words = par.words.copy()
                            if space:
                                par.backup_words.append(space)
                            par.backup_words.append(Word(_id, text, node))

                            if math_par:
                                mid = 'MATH-' + par_id
                            else:
                                mid = 'MATH-' + _id
                            word = Word(_id, mid)
                            math = [mid, _id, _id, page_id] + [
                                float(a)
                                for a in node.get('data-bdr').split(',')
                            ]
                            math_ls.append(math)

                        # inside an equation: skip while calculating bbox
                        elif not ignore_math and (node.get('data-math')
                                                  == 'I-Math' or math_par):
                            if space:
                                par.backup_words.append(space)
                            par.backup_words.append(Word(_id, text, node))

                            space = None
                            word = Word(_id, '')
                            math[2] = _id
                            new = [
                                float(a)
                                for a in node.get('data-bdr').split(',')
                            ]
                            math[4] = min(math[4], new[0])
                            math[5] = min(math[5], new[1])
                            math[6] = max(math[6], new[2])
                            math[7] = max(math[7], new[3])

                        # normal texts
                        else:
                            math = None
                            ignore_math = False
                            word = Word(_id, text, node)

                        # finish the loop
                        if space is not None:
                            par.words.append(space)
                        if word is not None:
                            par.words.append(word)

                    # set last_par
                    if box_name != 'Body':
                        last_par = None
                    elif continued_from_id:
                        last_par = continued_from_id
                    else:
                        last_par = par_id

        par_pos = 0
        for p in par_ls:
            p.words.append(Word(None, '\n\n'))
            pos = 0
            for w in p.words:
                w.start = pos
                next_pos = pos + len(w.text)
                if w.node is not None:
                    w.node.set('data-from', str(par_pos + pos))
                    w.node.set('data-to', str(par_pos + next_pos))
                pos = next_pos
            par_pos += pos

        # textualize
        sent_ls = []
        for p in par_ls:
            sent_ls.extend(self.find_sentences(p))

        for s in sent_ls:
            for w in s.words:
                word_nodes[w].set('data-sent-id', str(s._id))

        # collect the data
        word_ls = [(w.get('id'), int(w.get('data-from',
                                           0)), int(w.get('data-to', 0)))
                   for w in doc.tree.xpath('//x:span', namespaces=ns)]
        doc.words = sorted(word_ls, key=lambda w: (w[1], w[2]))

        doc.text = ''.join([w.text for p in par_ls for w in p.words])
        doc.sentences = sent_ls
        doc.maths = math_ls
        doc.cites = [('CITE-' + p, ''.join([w.text
                                            for w in pars[p].words]).strip(),
                      cites.get(p, [])) for p in ref_pars]

        # scrub the tree; remove positions
        if remove_pos:
            for w in doc.tree.xpath('//x:span', namespaces=ns):
                if w.get('class', None) == 'word':
                    w.attrib.pop('data-from', None)
Example #13
def sent_tokenize(data, filter_short=False, filter_verbless=False):
    """
    Tokenize sentences 

    Tokenize `data` into two arrays: sentences and offsets
    Returns a tuple (`sentences`,`offsets`)
    """
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(
        ['dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'et', 'al', 'Fig', 'fig'])
    sent_detector = PunktSentenceTokenizer(punkt_param)
    sentences = sent_detector.tokenize(data)
    offsets = sent_detector.span_tokenize(data)
    new_sentences = []
    new_offsets = []
    to_del = []
    if filter_verbless:
        pos = pos_tagger.extract_nlp_batch()
        for i in range(len(sentences)):
            okay = False
            for word in pos['sentences'][i]['words']:
                if word[1]['PartOfSpeech'] in verbs:
                    okay = True
                    break
            if not okay:  # the sentence doesn't have a verb,
                to_del.append(i)  # mark for deletion
        sentences = multi_delete(sentences, to_del)
        offsets = multi_delete(offsets, to_del)
    if filter_short and not filter_verbless:
        for i in range(len(sentences)):
            if len(sentences[i]) >= filter_short:
                new_sentences.append(sentences[i])
                new_offsets.append(offsets[i])


#     (further commented-out code merged windows of 2 to 6 adjacent
#      sentences, following the same pattern as the active code in Example #10)
    print(new_offsets)
    return {'sentences': new_sentences, 'offsets': new_offsets}
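multi_delete, pos_tagger, and verbs are defined elsewhere in the original module; a plausible multi_delete helper (an assumption, not the original implementation) looks like this:

def multi_delete(items, indices):
    """Return a copy of `items` with the positions listed in `indices` removed."""
    index_set = set(indices)
    return [item for i, item in enumerate(items) if i not in index_set]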
                shard.append(json.loads(line))

        for paper in shard:
            manuscript_id = paper["paper_id"]
            full_text = paper["body_text"]

            for paragraph in full_text:
                section_name = paragraph["section"].lower()
                if "discuss" not in section_name and "conclu" not in section_name:
                    continue

                if not paragraph["cite_spans"]:
                    continue

                paragraph_text = paragraph["text"]
                endpoints = list(tokenizer.span_tokenize(paragraph_text))

                j = 0
                for cite_span in paragraph["cite_spans"]:
                    cite_id = cite_span["cite_id"]
                    if cite_id not in paper_ids:
                        continue

                    cite_text = cite_span["text"]
                    start, end = cite_span["start"], cite_span["end"]

                    a, b = endpoints[j]
                    while start >= b:
                        j += 1
                        a, b = endpoints[j]
Example #15
def resolve_phrases(section, tokens, book, id):
    phrases = []
    sentences = []
    # find and resolve parentheses
    if book == 'almizan_fa':
        if int(id.split('_')[0]) <= 2:
            html = section.html()
            replace = lambda start, end, oldtext, newtext: oldtext[:start
                                                                   ] + newtext + oldtext[
                                                                       end:]

            # in chapter 1, remove parentheses for ayas
            iter = re.finditer(
                r'(<span[^\n]*>)[ ]*\(([^\)s]*)\)[^\)]*(</span[^\n]*>)', html)
            for m in reversed(list(iter)):
                html = replace(m.start(), m.end(), html,
                               m.group().replace('(', '').replace(')', ''))

            iter = re.finditer(r'\([^\)]{3,15}\)', html)
            for match in reversed(list(iter)):
                m = match.group()[1:-1]
                resolved = resolve_phrase(m, tokens, book[-2:])
                if resolved:
                    html = replace(
                        match.start(), match.end(), html,
                        '<em rel="{0}">{1}</em>'.format(resolved[0], m))

            section.html(html)

    pst = PunktSentenceTokenizer()
    # resolve em elements
    for em in section.find('em').items():
        resolved = resolve_phrase(em.text(), tokens, book[-2:])
        if resolved:
            em.attr('rel', resolved[0])
            phrases.append((em.text(), resolved[1], resolved[0]))
            paragraph = em.parent().html(method='html')
            for start, end in pst.span_tokenize(paragraph):
                if paragraph[start:end].find(em.outerHtml()) != -1:
                    this_sentence = paragraph[start:end].lstrip()
                    this_sentence = refine_sentence(this_sentence)

                    while this_sentence.startswith("<code"):
                        if this_sentence.find('</code>') != -1:
                            new_start = this_sentence.find('</code>') + 7
                        this_sentence = this_sentence[new_start:].lstrip()

                    this_sentence = refine_sentence(this_sentence)

                    while this_sentence.startswith('<span'):
                        new_start = this_sentence.find('</span>') + 7
                        this_sentence = this_sentence[new_start:].lstrip()

                    this_sentence = refine_sentence(this_sentence)

                    before = this_sentence.index(em.outerHtml())
                    after = len(this_sentence) - len(em.outerHtml()) - before
                    em.attr('data-sentence', '{0}:{1}'.format(before, after))
                    break
            sentences.append(
                (em.text(), resolved[0], (before, after), [this_sentence]))
        else:
            phrases.append((em.text(), ))

    new_section = section.html(method='html')
    p = re.compile(r'<em rel="([^"]+)" data-sentence="([^"]+)">([^<]+)<\/em>')
    matched_list = [(m.start(0), m.end(0), m.group(1), m.group(2))
                    for m in re.finditer(p, new_section)]
    last_start = -1
    for matched in reversed(matched_list):
        start_span = matched[0] - int(matched[3].split(':')[0])
        end_span = matched[1] + int(matched[3].split(':')[1])
        if start_span != last_start:
            new_section = new_section[:start_span] + '<span class="phrase">' + new_section[
                start_span:end_span] + '</span>' + new_section[end_span:]
            last_start = start_span

    section.html(new_section)
    return phrases, sentences
def marker_surr_patt(in_dir):
    """ Find most frequent POS tag patterns surrounding citation marker
    """

    punkt_param = PunktParameters()
    abbreviation = ['al', 'fig', 'e.g', 'i.e', 'eq', 'cf', 'ref', 'refs']
    punkt_param.abbrev_types = set(abbreviation)
    tokenizer = PunktSentenceTokenizer(punkt_param)

    file_names = os.listdir(in_dir)
    patt_comb_freq_map = {}
    patt_orig_freq_map = {}
    for file_idx, fn in enumerate(file_names):
        if file_idx%100 == 0:
            print('{}/{}'.format(file_idx, len(file_names)))
        path = os.path.join(in_dir, fn)
        aid, ext = os.path.splitext(fn)
        if ext != '.txt' or aid == 'log':
            continue

        if re.search(r'[a-z]', aid):
            split = re.search(r'[a-z][0-9]', aid).span()[0] + 1
            aid = aid[:split] + '/' + aid[split:]

        with open(path) as f:
            text = f.read()
        text = re.sub(E_G_PATT, 'e.g.', text)

        marker = ' \u241F '
        doc_len = len(text)
        for sent_idx, sent_edx in tokenizer.span_tokenize(text):
            sentence_orig = text[sent_idx:sent_edx]
            sentence = re.sub(CITE_MULTI_PATT, marker, sentence_orig)
            sentence = re.sub(QUOTE_PATT, ' {}.'.format(marker), sentence)
            if marker in sentence:
                words = pos_tag(sentence.split())
                words = [w for w in words if re.search(r'[\w|\u241F]', w[0])]
                sent_len = len(words)
                indices = [i for i, tup in enumerate(words)
                           if tup[0] == marker.strip()]
                for word_idx in indices:
                    word = words[word_idx][0]
                    if word == marker.strip():
                        patt_comb = [None, None, None, '[]', None, None, None]
                        patt_orig = [None, None, None, '[]', None, None, None]
                        for shift in range(-3, 4):
                            x_idx = shift+3
                            if shift == 0:
                                # marker itself
                                continue
                            if word_idx+shift < 0 or \
                                    word_idx+shift >= len(words):
                                patt_comb[x_idx] = '<EOS>'
                                patt_orig[x_idx] = '<EOS>'
                                continue
                            wrd = words[word_idx+shift][0]
                            pos = words[word_idx+shift][1]
                            patt_orig[x_idx] = pos
                            if 'V' in pos:
                                patt_comb[x_idx] = 'V'
                            elif pos in ['NN', 'NNS']:
                                patt_comb[x_idx] = 'NN'
                            elif pos in ['NNP', 'NNPS']:
                                patt_comb[x_idx] = 'NNP'
                            elif pos == 'IN':
                                patt_comb[x_idx] = 'IN'
                            elif 'JJ' in pos:
                                patt_comb[x_idx] = 'JJ'
                            elif 'W' in pos:
                                patt_comb[x_idx] = 'WH'
                            elif 'RB' in pos:
                                patt_comb[x_idx] = 'ADV'
                            elif 'PR' in pos:
                                patt_comb[x_idx] = 'PR'
                            elif wrd == 'FORMULA':
                                patt_comb[x_idx] = 'FORMULA'
                            elif wrd == 'FIGURE':
                                patt_comb[x_idx] = 'FIGURE'
                            elif wrd == 'TABLE':
                                patt_comb[x_idx] = 'TABLE'
                            else:
                                patt_comb[x_idx] = 'OTHER'
                        comb_id = '¦'.join(patt_comb)
                        orig_id = '¦'.join(patt_orig)
                        # # look at examples
                        # if orig_id == 'VBN¦IN¦NNP¦[]¦<EOS>¦<EOS>¦<EOS>':
                        #     print(sentence)
                        #     input()
                        #     print('.')
                        if comb_id not in patt_comb_freq_map:
                            patt_comb_freq_map[comb_id] = 0
                        patt_comb_freq_map[comb_id] += 1
                        if orig_id not in patt_orig_freq_map:
                            patt_orig_freq_map[orig_id] = 0
                        patt_orig_freq_map[orig_id] += 1
        # if file_idx > 200:
        #    break

    patt_comb_freq = sorted(patt_comb_freq_map.items(),
                            key=operator.itemgetter(1), reverse=True)
    patt_orig_freq = sorted(patt_orig_freq_map.items(),
                            key=operator.itemgetter(1), reverse=True)
    print('- - - C O M B - - -')
    for pid in patt_comb_freq[:25]:
        print(pid)
    print('- - - O R I G - - -')
    for pid in patt_orig_freq[:25]:
        print(pid)

    store_comb = []
    for tup in patt_comb_freq:
        pid = tup[0]
        freq = tup[1]
        if '[]¦<EOS>¦<EOS>¦<EOS>' in pid:
            new_pid = pid.replace('[]¦<EOS>¦<EOS>¦<EOS>', '<EOS>')
            store_comb.append((new_pid, freq))
    with open('marker_comb.json', 'w') as f:
        json.dump(store_comb, f)

    store_orig = []
    for tup in patt_orig_freq:
        pid = tup[0]
        freq = tup[1]
        if '[]¦<EOS>¦<EOS>¦<EOS>' in pid:
            new_pid = pid.replace('[]¦<EOS>¦<EOS>¦<EOS>', '<EOS>')
            store_orig.append((new_pid, freq))
    with open('marker_orig.json', 'w') as f:
        json.dump(store_orig, f)
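CITE_MULTI_PATT, QUOTE_PATT, and E_G_PATT are module-level regular expressions that are not part of these snippets; judging from how they are used, they match citation markers, quotation markers, and spaced variants of "e.g." respectively. Purely illustrative placeholders (assumptions, not the original patterns) might look like:

import re

# Hypothetical stand-ins for the module-level patterns used above.
E_G_PATT = re.compile(r'e\.\s?g\.')                  # normalize spaced "e. g." to "e.g."
CITE_MULTI_PATT = re.compile(r'\[\d+(,\s*\d+)*\]')   # e.g. "[3]" or "[1, 2]"
QUOTE_PATT = re.compile(r'\[\?\]')                   # placeholder for quote markers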
Example #18
def sent_pos(in_dir):
    """ Positions of citation markers in sentences, relatve to where in doc
    """

    arxiv_base_url = 'http://export.arxiv.org/api/query?search_query=id:'
    arxiv_ns = {
        'atom': 'http://www.w3.org/2005/Atom',
        'opensearch': 'http://a9.com/-/spec/opensearch/1.1/',
        'arxiv': 'http://arxiv.org/schemas/atom'
    }

    punkt_param = PunktParameters()
    abbreviation = ['al', 'fig', 'e.g', 'i.e', 'eq', 'cf']
    punkt_param.abbrev_types = set(abbreviation)
    tokenizer = PunktSentenceTokenizer(punkt_param)

    with open('hedge_words') as f:
        hedge_words = [l.strip() for l in f.readlines()]

    x_all = list(range(-5, 6))
    y_verb = []
    y_noun = []
    y_propnoun = []
    y_prepos = []
    y_adj = []
    y_wh = []
    y_adv = []
    y_pr = []
    y_form = []
    y_fig = []
    y_tab = []
    for x in x_all:
        y_verb.append(0)
        y_noun.append(0)
        y_propnoun.append(0)
        y_prepos.append(0)
        y_adj.append(0)
        y_wh.append(0)
        y_adv.append(0)
        y_pr.append(0)
        y_form.append(0)
        y_fig.append(0)
        y_tab.append(0)
    file_names = os.listdir(in_dir)
    for file_idx, fn in enumerate(file_names):
        if file_idx % 100 == 0:
            print('{}/{}'.format(file_idx, len(file_names)))
        path = os.path.join(in_dir, fn)
        aid, ext = os.path.splitext(fn)
        if ext != '.txt' or aid == 'log':
            continue

        phys_cat = [
            'hep-th', 'hep-ph', 'hep-lat', 'hep-ex', 'cond-mat', 'astro-ph',
            'physics', 'nucl', 'gr-qc', 'quant-ph', 'nlin'
        ]
        math_cat = ['math', 'math-ph']
        cs_cat = ['cs']
        if re.search(r'[a-z]', aid):
            split = re.search(r'[a-z][0-9]', aid).span()[0] + 1
            aid = aid[:split] + '/' + aid[split:]
        resp = requests.get('{}{}&start=0&max_results=1'.format(
            arxiv_base_url, aid))
        xml_root = etree.fromstring(resp.text.encode('utf-8'))
        result_elems = xml_root.xpath('/atom:feed/atom:entry',
                                      namespaces=arxiv_ns)
        result = result_elems[0]
        cat = result.find('arxiv:primary_category',
                          namespaces=arxiv_ns).get('term')
        high_cat = None
        for pc in phys_cat:
            if pc in cat:
                high_cat = 'phys'
                break
        if not high_cat:
            for mc in math_cat:
                if mc in cat:
                    high_cat = 'math'
                    break
        if not high_cat:
            if 'cs' in cat:
                high_cat = 'cs'
        if not high_cat:
            continue

        if high_cat != 'phys':
            continue

        with open(path) as f:
            text = f.read()

        marker = ' \u241F '
        doc_len = len(text)
        for sent_idx, sent_edx in tokenizer.span_tokenize(text):
            sentence_orig = text[sent_idx:sent_edx]
            sentence = re.sub(CITE_MULTI_PATT, marker, sentence_orig)
            sentence = re.sub(QUOTE_PATT, ' {}.'.format(marker), sentence)
            if marker in sentence:
                words = pos_tag(sentence.split())
                words = [w for w in words if re.search(r'[\w|\u241F]', w[0])]
                sent_len = len(words)
                indices = [
                    i for i, tup in enumerate(words)
                    if tup[0] == marker.strip()
                ]
                for word_idx in indices:
                    word = words[word_idx][0]
                    if word == marker.strip():
                        for shift in x_all:
                            x_idx = shift + 5
                            if shift == 0:
                                # marker itself
                                continue
                            if word_idx+shift < 0 or \
                                    word_idx+shift >= len(words):
                                # out of range
                                continue
                            wrd = words[word_idx + shift][0]
                            pos = words[word_idx + shift][1]
                            if 'V' in pos:
                                y_verb[x_idx] += 1
                            if pos in ['NN', 'NNS']:
                                y_noun[x_idx] += 1
                            if pos in ['NNP', 'NNPS']:
                                y_propnoun[x_idx] += 1
                            if pos == 'IN':
                                y_prepos[x_idx] += 1
                            if 'JJ' in pos:
                                y_adj[x_idx] += 1
                            if 'W' in pos:
                                y_wh[x_idx] += 1
                            if 'RB' in pos:
                                y_adv[x_idx] += 1
                            if 'PR' in pos:
                                y_pr[x_idx] += 1
                            if wrd == 'FORMULA':
                                y_form[x_idx] += 1
                            if wrd == 'FIGURE':
                                y_fig[x_idx] += 1
                            if wrd == 'TABLE':
                                y_tab[x_idx] += 1
        if file_idx > 200:
            break

    for idx, y in enumerate([(y_verb, 'verb'), (y_noun, 'noun'),
                             (y_propnoun, 'proper noun'),
                             (y_prepos, 'preposition'), (y_adj, 'adjective'),
                             (y_wh, 'wh-det./-adv./-pron.'), (y_adv, 'adverb'),
                             (y_pr, 'pers./pos. pronoun'),
                             (y_form, 'formula')]):
        color = list(mpl.rcParams['axes.prop_cycle'])[idx]['color']
        plt.plot(x_all,
                 y[0],
                 marker='',
                 linestyle='-',
                 linewidth=.5,
                 alpha=0.3,
                 color=color)
        plt.plot(x_all,
                 y[0],
                 label=y[1],
                 marker='D',
                 linestyle='',
                 color=color)

    plt.xlabel('word position relative to citation')
    plt.ylabel('number of words')
    plt.legend()

    ax = plt.gca()
    ax.xaxis.grid(True)
    plt.xticks(np.arange(min(x_all), max(x_all), 1.0))

    plt.show()
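
# --- Hedged usage sketch (not part of the example above) ---
# Illustrates why abbreviations are registered with PunktParameters before
# span-tokenizing: without them, Punkt may end a sentence after "al." or "Fig.".
# The text and abbreviation list here are made up for illustration; only nltk's
# punkt module is assumed to be available.
from nltk.tokenize.punkt import PunktParameters, PunktSentenceTokenizer

_params = PunktParameters()
_params.abbrev_types = set(['et', 'al', 'fig', 'e.g'])
_tok = PunktSentenceTokenizer(_params)
_text = 'Smith et al. report a small gain. See Fig. 2 for details.'
print(list(_tok.span_tokenize(_text)))  # expected: two spans, not four
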
class nltk_tokenizer(IncrementalTransform):
    '''
    a streamcorpus_pipeline IncrementalTransform that converts a chunk into a new
    chunk with Sentence objects generated using NLTK tokenizers
    '''
    config_name = 'nltk_tokenizer'
    tagger_id = 'nltk_tokenizer'
    def __init__(self, *args, **kwargs):
        super(nltk_tokenizer, self).__init__(*args, **kwargs)
        self.sentence_tokenizer = PunktSentenceTokenizer()
        self.word_tokenizer = WhitespaceTokenizer() #PunktWordTokenizer()
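        # span_tokenize() over whitespace tokens keeps character offsets directly
        # mappable back into clean_visible (see the offset arithmetic in
        # make_sentences)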

    def _sentences(self, clean_visible):
        'generate strings identified as sentences'
        previous_end = 0
        clean_visible = clean_visible.decode('utf8')
        assert isinstance(clean_visible, unicode)
        for start, end in self.sentence_tokenizer.span_tokenize(clean_visible):
            ## no need to check start, because the first byte of text
            ## is always first byte of first sentence, and we will
            ## have already made the previous sentence longer on the
            ## end if there was an overlap.
            if  start < previous_end:
                start = previous_end
                if start > end:
                    ## skip this sentence... because it was eaten by
                    ## an earlier sentence with a label
                    continue
            try:
                label = self.label_index.find_le(end)
            except ValueError:
                label = None
            if label:
                off = label.offsets[OffsetType.BYTES]
                end = max(off.first + off.length, end)
            previous_end = end
            sent_str = clean_visible[start:end]
            yield start, end, sent_str

    def make_label_index(self, stream_item):
        'make a sortedcollection on body.labels'
        labels = stream_item.body.labels.get(self.config.get('annotator_id'))
        if not labels:
            labels = []

        self.label_index = SortedCollection(
            labels,
            key=lambda label: label.offsets[OffsetType.BYTES].first)

    def make_sentences(self, stream_item):
        'assemble Sentence and Token objects'
        self.make_label_index(stream_item)
        sentences = []
        token_num = 0
        new_mention_id = 0
        for sent_start, sent_end, sent_str in self._sentences(stream_item.body.clean_visible):
            assert isinstance(sent_str, unicode)
            sent = Sentence()
            sentence_pos = 0
            for start, end in self.word_tokenizer.span_tokenize(sent_str):
                token_str = sent_str[start:end].encode('utf8')
                tok = Token(
                    token_num=token_num,
                    token=token_str,
                    sentence_pos=sentence_pos,
                )
                tok.offsets[OffsetType.BYTES] = Offset(
                    type=OffsetType.BYTES, 
                    first=sent_start + start,
                    length = end - start,
                )
                ## whitespace tokenizer will never get a token
                ## boundary in the middle of an 'author' label
                try:
                    #logger.debug('searching for %d in %r', sent_start + start, self.label_index._keys)
                    label = self.label_index.find_le(sent_start + start)
                except ValueError:
                    label = None
                if label:
                    off = label.offsets[OffsetType.BYTES]
                    if off.first + off.length > sent_start + start:
                        logger.info('overlapping label: %r' % label.target.target_id)
                        ## overlaps
                        streamcorpus.add_annotation(tok, label)
                        assert label.annotator.annotator_id in tok.labels

                        logger.info('adding label to tok: %r has %r',
                                     tok.token, label.target.target_id)

                        if label in self.label_to_mention_id:
                            mention_id = self.label_to_mention_id[label]
                        else:
                            mention_id = new_mention_id
                            new_mention_id += 1
                            self.label_to_mention_id[label] = mention_id

                        tok.mention_id = mention_id

                token_num += 1
                sentence_pos += 1
                sent.tokens.append(tok)
            sentences.append(sent)
        return sentences

    def process_item(self, stream_item, context=None):
        if not hasattr(stream_item.body, 'clean_visible') or not stream_item.body.clean_visible:
            return stream_item
            
        self.label_index = None
        self.label_to_mention_id = dict()
        stream_item.body.sentences[self.tagger_id] = self.make_sentences(stream_item)

        return stream_item

    def __call__(self, stream_item, context=None):
        ## support the legacy callable API
        return self.process_item(stream_item, context)
class nltk_tokenizer(IncrementalTransform):
    """
    a streamcorpus_pipeline IncrementalTransform that converts a chunk into a new
    chunk with Sentence objects generated using NLTK tokenizers
    """

    tagger_id = "nltk_tokenizer"

    def __init__(self, config):
        self.config = config
        self.sentence_tokenizer = PunktSentenceTokenizer()
        self.word_tokenizer = WhitespaceTokenizer()  # PunktWordTokenizer()

    def _sentences(self, clean_visible):
        "generate strings identified as sentences"
        previous_end = 0
        clean_visible = clean_visible.decode("utf8")
        assert isinstance(clean_visible, unicode)
        for start, end in self.sentence_tokenizer.span_tokenize(clean_visible):
            ## no need to check start, because the first byte of text
            ## is always first byte of first sentence, and we will
            ## have already made the previous sentence longer on the
            ## end if there was an overlap.
            if start < previous_end:
                start = previous_end
                if start > end:
                    ## skip this sentence... because it was eaten by
                    ## an earlier sentence with a label
                    continue
            try:
                label = self.label_index.find_le(end)
            except ValueError:
                label = None
            if label:
                off = label.offsets[OffsetType.BYTES]
                end = max(off.first + off.length, end)
            previous_end = end
            sent_str = clean_visible[start:end]
            yield start, end, sent_str

    def make_label_index(self, stream_item):
        "make a sortedcollection on body.labels"
        labels = stream_item.body.labels.get(self.config.get("annotator_id"))
        if not labels:
            labels = []

        self.label_index = SortedCollection(labels, key=lambda label: label.offsets[OffsetType.BYTES].first)

    def make_sentences(self, stream_item):
        "assemble Sentence and Token objects"
        self.make_label_index(stream_item)
        sentences = []
        token_num = 0
        new_mention_id = 0
        for sent_start, sent_end, sent_str in self._sentences(stream_item.body.clean_visible):
            assert isinstance(sent_str, unicode)
            sent = Sentence()
            sentence_pos = 0
            for start, end in self.word_tokenizer.span_tokenize(sent_str):
                try:
                    token_str = sent_str[start:end].encode("utf8")
                except Exception, exc:
                    logger.critical("died on sent_str[%d:%d].encode('utf8')", start, end, exc_info=True)
                    sys.exit("failed to cope with %r in %r" % (sent_str[start:end], sent_str))
                tok = Token(token_num=token_num, token=token_str, sentence_pos=sentence_pos)
                tok.offsets[OffsetType.BYTES] = Offset(
                    type=OffsetType.BYTES, first=sent_start + start, length=end - start
                )
                ## whitespace tokenizer will never get a token
                ## boundary in the middle of an 'author' label
                try:
                    # logger.debug('searching for %d in %r', sent_start + start, self.label_index._keys)
                    label = self.label_index.find_le(sent_start + start)
                except ValueError:
                    label = None
                if label:
                    off = label.offsets[OffsetType.BYTES]
                    if off.first + off.length > sent_start + start:
                        logger.info("overlapping label: %r" % label.target.target_id)
                        ## overlaps
                        streamcorpus.add_annotation(tok, label)
                        assert label.annotator.annotator_id in tok.labels

                        logger.info("adding label to tok: %r has %r", tok.token, label.target.target_id)

                        if label in self.label_to_mention_id:
                            mention_id = self.label_to_mention_id[label]
                        else:
                            mention_id = new_mention_id
                            new_mention_id += 1
                            self.label_to_mention_id[label] = mention_id

                        tok.mention_id = mention_id

                token_num += 1
                sentence_pos += 1
                sent.tokens.append(tok)
            sentences.append(sent)
        return sentences
Exemple #21
0
     
     #Retrieve annotated features for the currently-open training file
     keywords=get_kw(config.train_folder,f)
     #origin_offs=get_offs(train_folder,f)  
     
     #Extracting positive examples of keywords from the paragraph, given the ground truth offsets
     '''for k in keywords:
         start=k[2]
         end=k[3]
         
     
     print(text[264:349])'''
     
     #Split each sentence on a separate line 
     toktext=sentence_splitter.tokenize(text)
     s_spans=sentence_splitter.span_tokenize(text)
     sentence_spans=[]
     for ss in s_spans:
         sss=[]
         start=ss[0]
         end=ss[1]
         
         sss.append(start)
         sss.append(end)
         sentence_spans.append(sss)
               
     #Create output files with a similar name as the input files
     outputfile = f.split(".")[0] + "__output.txt"
 
 with io.open(os.path.join(config.output_folder,outputfile),'w', encoding="utf-8") as outf:
     
Exemple #22
0
def sent_tokenize(data):
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(
        ['dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'et', 'al', 'Fig', 'fig'])
    sent_detector = PunktSentenceTokenizer(punkt_param)
    sentences = sent_detector.tokenize(data)
    offsets = list(sent_detector.span_tokenize(data))
    new_sentences = deepcopy(sentences)
    new_offsets = deepcopy(offsets)
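    # Besides the original sentences, append merged 2- and 3-sentence windows
    # whenever consecutive sentence spans lie fewer than 5 characters apart
    # (sentences shorter than 7 tokens are skipped as merge anchors).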
    for i, off in enumerate(offsets):
        if len(tokenizer.tokenize(sentences[i])) < 7:  # Skip short sentences
            pass
        else:
            if i < len(offsets) - 1:
                if ((offsets[i + 1][0] - offsets[i][1]) < 5):
                    new_sentences.append(sentences[i] + ' ' + sentences[i + 1])
                    new_offsets.append((offsets[i][0], offsets[i + 1][1]))
            if i < len(offsets) - 2:
                if ((offsets[i + 2][0] - offsets[i + 1][1]) < 5) and\
                        ((offsets[i + 1][0] - offsets[i][0]) < 5):
                    new_sentences.append(
                        sentences[i] + ' ' + sentences[i + 1] + ' ' + sentences[i + 2])
                    new_offsets.append((offsets[i][0], offsets[i + 2][1]))
    #         if i < len(offsets) - 3:
    #             if (((offsets[i + 3][0] - offsets[i + 2][1]) < 5) and
    #                     ((offsets[i + 2][0] - offsets[i + 1][0]) < 5) and
    #                     ((offsets[i + 1][0] - offsets[i][0]) < 5)):
    #                 new_sentences.append(sentences[i] + ' ' + sentences[i + 1] +
    #                                      ' ' + sentences[i + 2] + ' ' + sentences[i + 3])
    #                 new_offsets.append((offsets[i][0], offsets[i + 3][1]))
    #         if i < len(offsets) - 4:
    #             if (((offsets[i + 4][0] - offsets[i + 3][1]) < 5) and
    #                     ((offsets[i + 3][0] - offsets[i + 2][1]) < 5) and
    #                     ((offsets[i + 2][0] - offsets[i + 1][0]) < 5) and
    #                     ((offsets[i + 1][0] - offsets[i][0]) < 5)):
    #                 new_sentences.append(sentences[i] + ' ' + sentences[i + 1] +
    #                                      ' ' + sentences[i + 2] + ' ' + sentences[i + 3] +
    #                                      ' ' + sentences[i + 4])
    #                 new_offsets.append((offsets[i][0], offsets[i + 4][1]))
    print new_offsets
    return {'sentences': new_sentences, 'offsets': new_offsets}
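
# Hedged usage note for sent_tokenize above: the bare name 'tokenizer' used inside
# the function is assumed to be a module-level word tokenizer defined elsewhere in
# the original script, e.g. (hypothetical):
#     from nltk.tokenize import TreebankWordTokenizer
#     tokenizer = TreebankWordTokenizer()
# The returned dict contains the original sentences and offsets plus the merged
# 2- and 3-sentence windows appended at the end of each list.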
class nltk_tokenizer(IncrementalTransform):
    '''
    a streamcorpus_pipeline IncrementalTransform that converts a chunk into a new
    chunk with Sentence objects generated using NLTK tokenizers
    '''
    config_name = 'nltk_tokenizer'
    tagger_id = 'nltk_tokenizer'

    def __init__(self, *args, **kwargs):
        super(nltk_tokenizer, self).__init__(*args, **kwargs)
        self.sentence_tokenizer = PunktSentenceTokenizer()
        self.word_tokenizer = WhitespaceTokenizer()  #PunktWordTokenizer()

    def _sentences(self, clean_visible):
        'generate strings identified as sentences'
        previous_end = 0
        clean_visible = clean_visible.decode('utf8')
        assert isinstance(clean_visible, unicode)
        for start, end in self.sentence_tokenizer.span_tokenize(clean_visible):
            ## no need to check start, because the first byte of text
            ## is always first byte of first sentence, and we will
            ## have already made the previous sentence longer on the
            ## end if there was an overlap.
            if start < previous_end:
                start = previous_end
                if start > end:
                    ## skip this sentence... because it was eaten by
                    ## an earlier sentence with a label
                    continue
            try:
                label = self.label_index.find_le(end)
            except ValueError:
                label = None
            if label:
                off = label.offsets[OffsetType.BYTES]
                end = max(off.first + off.length, end)
            previous_end = end
            sent_str = clean_visible[start:end]
            yield start, end, sent_str

    def make_label_index(self, stream_item):
        'make a sortedcollection on body.labels'
        labels = stream_item.body.labels.get(self.config.get('annotator_id'))
        if not labels:
            labels = []

        self.label_index = SortedCollection(
            labels, key=lambda label: label.offsets[OffsetType.BYTES].first)

    def make_sentences(self, stream_item):
        'assemble Sentence and Token objects'
        self.make_label_index(stream_item)
        sentences = []
        token_num = 0
        new_mention_id = 0
        for sent_start, sent_end, sent_str in self._sentences(
                stream_item.body.clean_visible):
            assert isinstance(sent_str, unicode)
            sent = Sentence()
            sentence_pos = 0
            for start, end in self.word_tokenizer.span_tokenize(sent_str):
                try:
                    token_str = sent_str[start:end].encode('utf8')
                except Exception, exc:
                    logger.critical("died on sent_str[%d:%d].encode('utf8')",
                                    start,
                                    end,
                                    exc_info=True)
                    sys.exit('failed to cope with %r in %r' %
                             (sent_str[start:end], sent_str))
                tok = Token(
                    token_num=token_num,
                    token=token_str,
                    sentence_pos=sentence_pos,
                )
                tok.offsets[OffsetType.BYTES] = Offset(
                    type=OffsetType.BYTES,
                    first=sent_start + start,
                    length=end - start,
                )
                ## whitespace tokenizer will never get a token
                ## boundary in the middle of an 'author' label
                try:
                    #logger.debug('searching for %d in %r', sent_start + start, self.label_index._keys)
                    label = self.label_index.find_le(sent_start + start)
                except ValueError:
                    label = None
                if label:
                    off = label.offsets[OffsetType.BYTES]
                    if off.first + off.length > sent_start + start:
                        logger.info('overlapping label: %r' %
                                    label.target.target_id)
                        ## overlaps
                        streamcorpus.add_annotation(tok, label)
                        assert label.annotator.annotator_id in tok.labels

                        logger.info('adding label to tok: %r has %r',
                                    tok.token, label.target.target_id)

                        if label in self.label_to_mention_id:
                            mention_id = self.label_to_mention_id[label]
                        else:
                            mention_id = new_mention_id
                            new_mention_id += 1
                            self.label_to_mention_id[label] = mention_id

                        tok.mention_id = mention_id

                token_num += 1
                sentence_pos += 1
                sent.tokens.append(tok)
            sentences.append(sent)
        return sentences
def main(args):

    if len(args) < 3:
        sys.stderr.write('3 required arguments: <input anafora dir> <output brat dir> <tsv out dir>\n')
        sys.exit(-1)

    sent_tokenizer = PunktSentenceTokenizer()
    neg_out = open( join(args[2], 'negation.tsv'), 'wt')
    dtr_out = open( join(args[2], 'dtr.tsv'), 'wt')
    alink_out = open( join(args[2], 'alink.tsv'), 'wt')

    for sub_dir, text_name, xml_names in anafora.walk(args[0], "ADE_entity.dave.completed.xml"):
        textfile_path = join( join(args[0],text_name), text_name)
        with open(textfile_path, 'r') as tf:
            text = tf.read()

        sent_spans = list(sent_tokenizer.span_tokenize(text))
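        # sentence spans are used below to locate the sentence covering each drug
        # mention (find_sentence_for_drug) and to mark the entity within it
        # (insert_delimiter_for_entity) for the BERT-style TSV outputs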
        shutil.copyfile(textfile_path, join(args[1], '%s.txt' % (text_name)))
        brat_out = open( join(args[1], '%s.ann' % (text_name)), 'wt')

        for xml_name in xml_names:
            xml_path = os.path.join(args[0], sub_dir, xml_name)
            xml_parts = xml_name.split('.')
            annotator = xml_parts[2]
            status = xml_parts[3]
            data = anafora.AnaforaData.from_file(xml_path)

            alink_map = {}
            for rel in data.annotations.select_type('ALINK'):
                cat = rel.properties['Type']
                tgt = rel.properties['Target']
                alink_map[tgt.id] = cat

            for annot_ind, annot in enumerate(data.annotations.select_type('Medications/Drugs')):
                id = annot.id
                span = annot.spans[0]
                span_text = text[span[0]:span[1]]
                neg = annot.properties['negation_indicator']
                neg_status = "-1" if neg is None else "1"
                dtr = annot.properties['DocTimeRel']
                if annot.id in alink_map:
                    alink = alink_map[annot.id]
                else:
                    alink = 'None'

                # Write Brat format:
                brat_out.write('T%d\tDrug %d %d\t%s\n' % 
                    (annot_ind, span[0], span[1], span_text))
                #print("File:%s\tID:%s\tSpan:(%d,%d)\tAnnotatedText:%s\tNegated:%s\tDTR:%s\tAlink:%s" %
                #        (text_name, annot.id, span[0], span[1], span_text, neg_status, dtr, alink))
                # Write some ad-hoc format:
                print("File:%s\tSpan:(%d,%d)\tAnnotatedText:%s\tNegated:%s\tDTR:%s\tAlink:%s" %
                        (text_name, span[0], span[1], span_text, neg_status, dtr, alink))

                # Write bert-style tsv for Neg, DTR, ALink:
                covering_sent_span = find_sentence_for_drug(sent_spans, span)
                inst_text = insert_delimiter_for_entity(text, covering_sent_span, span)
 
                neg_out.write('%s\t%s\n' % (neg_status, inst_text))
                dtr_out.write('%s\t%s\n' % (dtr, inst_text))
                alink_out.write('%s\t%s\n' % (alink, inst_text))

        brat_out.close()
    neg_out.close()
    dtr_out.close()
    alink_out.close()
Exemple #25
0
class ACEParser:
    def __init__(self):
        self.sent_tokenizer = PunktSentenceTokenizer()
        # self.word_tokenizer = RegexpTokenizer('\w+|\S+')
        self.word_tokenizer = WhitespaceTokenizer()
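        # whitespace tokenization keeps per-sentence token offsets directly
        # comparable to the <charseq> character offsets in the APF annotations
        # (see offset_to_token)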
        self.root = None
        self.text = None
        self.sentence_offsets = []
        self.df = pd.DataFrame(
            columns=["doc_id", "sentence", "tokens", "events", "entities"])

    def get_text(self, sgm_file):
        with open(sgm_file, "r", encoding="utf-8") as f:
            text = f.read()

        # Strip all SGML/XML markup tags from the raw text
        text = re.sub(r"<(.|\s|\n)*?>", r"", text)
        sentence_offsets = list(self.sent_tokenizer.span_tokenize(text))
        sentences = []
        for offset in sentence_offsets:
            sentence_text = text[offset[0]:offset[1]]
            sentences.append(sentence_text)

        self.sentence_offsets = sentence_offsets
        self.text = text
        return text

    def create_tree(self, apf_file):
        with open(apf_file, "r", encoding="utf-8") as f:
            xml_text = f.read()

        root = etree.fromstring(xml_text)
        self.root = root

    def get_extents(self):
        extent_nodes = self.root.xpath("//extent/charseq")
        return [
            self.get_offset_tuple(extent_node) for extent_node in extent_nodes
        ]

    def get_offset_tuple(self, extent_node):
        return (int(extent_node.get("START")), int(extent_node.get("END")) + 1
                )  # +1 makes them exclusive

    def get_sentences(self):
        sentences = []
        for offset in self.sentence_offsets:
            sentence_text = self.text[offset[0]:offset[1]]
            sentences.append(sentence_text)

        return sentences

    def find_sentence_index(self, offset):

        for i, sent_offset in enumerate(self.sentence_offsets):
            if offset[0] >= sent_offset[0] and offset[1] <= sent_offset[1]:
                return i

    def offset_to_token(self, start, end, token_offsets, normalize=0):
        # normalize is making start and end relatable to token_offsets
        start -= normalize
        end -= normalize

        # TODO: change this to if end == offset[1]. In the case that end < offset[1] use startswith and extend token_offsets list
        for i, offset in enumerate(token_offsets):
            if end <= offset[1]:
                for j in range(i, -1, -1):
                    if start >= token_offsets[j][0]:
                        return j, i + 1  # Make it exclusive

        raise Exception(
            "Error while converting offset to token indexes. Start offset : %d , End offset : %d Norm : %d, Token offsets : %s"
            % (start, end, normalize, str(token_offsets)))

    def create_json_output(self, doc_text, filename):
        # doc_id = self.root.xpath("document")[0].get("DOCID")
        doc_id = filename
        event_nodes = self.root.xpath("//event")

        # TODO: We lose coreference information doing it this way. For now it is ok, but need to accomodate the other way too !!!
        event_mentions = []
        for event_node in event_nodes:
            event_type = event_node.get("TYPE")
            event_subtype = event_node.get("SUBTYPE")
            event_id = event_node.get("ID")
            event_mention_nodes = event_node.xpath("event_mention")
            for mention_node in event_mention_nodes:
                # You actually don't need these two for finding which sentence we are talking about.
                # Because we already made sure that all of our extents are covered by sentence offsets.
                # extent_node = mention.xpath("/extent/charseq")[0]
                # extent = get_offset_tuple(extent_node)

                trigger_offset = self.get_offset_tuple(
                    mention_node.xpath("anchor/charseq")[0])

                # find which sentence this belongs. Only need to do this once.
                sent_idx = self.find_sentence_index(trigger_offset)

                event_arguments = []
                arguments = mention_node.xpath("event_mention_argument")
                for argument in arguments:
                    arg_role = argument.get("ROLE")
                    arg_offset = self.get_offset_tuple(
                        argument.xpath("extent/charseq")[0])
                    # TODO: NEED TO ADD ENTITY TYPES, getting them from refids !!!
                    event_arguments.append({
                        "role": arg_role,
                        "start": arg_offset[0],
                        "end": arg_offset[1]
                    })

                event_mentions.append({
                    "event_id": event_id,
                    "event_type": event_type,
                    "event_subtype": event_subtype,
                    "trigger": {
                        "start": trigger_offset[0],
                        "end": trigger_offset[1]
                    },
                    "arguments": event_arguments,
                    "sent_idx": sent_idx
                })

        # For printing later
        # old_event_mentions = copy.deepcopy(event_mentions)

        tokens_list_for_printing = []
        for i, sentence_offset in enumerate(self.sentence_offsets):
            sentence_text = doc_text[sentence_offset[0]:sentence_offset[1]]
            token_offsets = list(
                self.word_tokenizer.span_tokenize(sentence_text))
            tokens = [
                sentence_text[offset[0]:offset[1]] for offset in token_offsets
            ]
            tokens_list_for_printing.append(tokens)
            entity_mentions = []
            curr_event_mentions = []

            for j in range(len(event_mentions)):
                mention = event_mentions[j]
                if mention["sent_idx"] == i:
                    # ipdb.set_trace()
                    start_idx, end_idx = self.offset_to_token(
                        mention["trigger"]["start"],
                        mention["trigger"]["end"],
                        token_offsets,
                        normalize=sentence_offset[0])
                    event_mentions[j]["trigger"]["start"] = start_idx
                    event_mentions[j]["trigger"]["end"] = end_idx

                    for k, argument in enumerate(mention["arguments"]):
                        start_idx, end_idx = self.offset_to_token(
                            argument["start"],
                            argument["end"],
                            token_offsets,
                            normalize=sentence_offset[0])
                        event_mentions[j]["arguments"][k]["start"] = start_idx
                        event_mentions[j]["arguments"][k]["end"] = end_idx

                    curr_event_mentions.append(event_mentions[j])

            self.df = self.df.append(
                {
                    "doc_id": doc_id,
                    "sentence": sentence_text,
                    "tokens": tokens,
                    "events": curr_event_mentions,
                    "entities": entity_mentions
                },
                ignore_index=True)

        # Printing stuff
        # for mention, old_mention in zip(event_mentions, old_event_mentions):
        #     tokens = tokens_list_for_printing[mention["sent_idx"]]
        #     print("Offset version trigger : %s , Tokens version trigger : %s" %(doc_text[old_mention["trigger"]["start"]:old_mention["trigger"]["end"]], tokens[mention["trigger"]["start"]:mention["trigger"]["end"]]))
        #     for argument, old_argument in zip(mention["arguments"], old_mention["arguments"]):
        #         print("Offset version argument : %s , Tokens version argument : %s" %(doc_text[old_argument["start"]:old_argument["end"]], tokens[argument["start"]:argument["end"]]))

        #     print("===========")

    # TODO: Remove debug stuff
    def fix_offsets(self, extents):
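        # Merges consecutive sentence spans whenever an annotation extent crosses
        # a sentence boundary, so every extent ends up fully contained in a single
        # (possibly widened) sentence offset.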
        offsets = self.sentence_offsets
        assert (len(offsets) > 1)
        # print(offsets)
        # print("*************")

        after_count = 0
        before_count = 0
        for extent in extents:
            # Check stuff for printing
            if len([
                    offset for offset in offsets
                    if extent[0] >= offset[0] and extent[1] <= offset[1]
            ]) == 0:
                before_count += 1

            if extent[1] <= offsets[0][1]:
                continue

            for idx in range(1, len(offsets)):
                offset = offsets[idx]
                if extent[1] <= offset[1]:  # Ends before this sentence.
                    if extent[0] < offset[0]:  # Starts before this sentence
                        # Fixing
                        # print("-------")
                        # print(extent)
                        # print(offsets)
                        for j in range(
                                idx - 1, -1, -1
                        ):  # For all sentences' offsets before this offset
                            del offsets[j + 1]
                            if extent[0] >= offsets[j][0]:
                                offsets[j] = (offsets[j][0], offset[1])
                                break

                        # print(offsets)
                        break

                    else:  # Nothing wrong with this extent
                        break

            # Check stuff for printing
            if len([
                    offset for offset in offsets
                    if extent[0] >= offset[0] and extent[1] <= offset[1]
            ]) == 0:
                ipdb.set_trace()
                # MISSES some due to spaces between sentences
                # print(extent)
                # print(text[extent[0]:extent[1]])
                after_count += 1

        # print("Before : %d -> After : %d" %(before_count, after_count))
        # print("================================================================================================================")

        self.sentence_offsets = offsets
Exemple #26
0
def main(args):
    if len(args) < 2:
        sys.stderr.write('Required arguments: <input dir> <output dir>\n')
        sys.exit(-1)

    sent_tokenizer = PunktSentenceTokenizer()
    sentence_lookahead = 0

    # get all .txt files from the chqa directory:
    txt_files = glob.glob(join(args[0], '*.txt'))
    rel_out = open(join(args[1], 'ade-all-relations.flair'), 'w')

    for txt_fn in txt_files:
        ann_fn = txt_fn[:-3] + 'ann'
        if not isfile(ann_fn): continue

        print('Processing file %s which has corresponding file %s' % (txt_fn, ann_fn))

        with open(txt_fn, 'r') as myfile:
            text = myfile.read()
        ents,rels = read_brat_file(ann_fn)

        sent_spans = list(sent_tokenizer.span_tokenize(text))

        for sent_ind in range(len(sent_spans)):
            primary_sent_span = sent_spans[sent_ind]
            end_window_ind = min(sent_ind+sentence_lookahead, len(sent_spans)-1)
            end_sent_span = sent_spans[end_window_ind]

            sent = text[primary_sent_span[0]:end_sent_span[1]].replace('\n', ' ')
            drug_ents, att_ents = get_span_ents(primary_sent_span, end_sent_span, ents)
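            # For every attribute/drug mention pair in this window, mark both
            # spans with typed delimiters (e.g. DosageStart ... DosageEnd,
            # DrugStart ... DrugEnd) and emit one flair classification line per
            # pair: '__label__<RELATION> <marked sentence>'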

            for att_ent in att_ents:
                for drug_ent in drug_ents:
                    ## Make sure one of the ents is in the first sentence (otherwise we'll get to it later)
                    if att_ent.start > primary_sent_span[1] and drug_ent.start > primary_sent_span[1]:
                        continue

                    label = get_label(rels, ents, att_ent, drug_ent)

                    ## Get index of ents into sent:
                    a1_start = att_ent.start - primary_sent_span[0]
                    a1_end = att_ent.end - primary_sent_span[0]

                    a2_start = drug_ent.start - primary_sent_span[0]
                    a2_end = drug_ent.end - primary_sent_span[0]

                    if a1_start < a2_start:
                        # arg1 occurs before arg2
                        rel_text = (sent[:a1_start] + 
                                    " %sStart %s %sEnd " % (att_ent.cat, sent[a1_start:a1_end], att_ent.cat) +
                                    sent[a1_end:a2_start] +
                                    " DrugStart %s DrugEnd " % (sent[a2_start:a2_end]) +
                                    sent[a2_end:])
                    else:
                        rel_text = (sent[:a2_start] +
                                    " DrugStart %s DrugEnd " % (sent[a2_start:a2_end]) +
                                    sent[a2_end:a1_start] +
                                    " %sStart %s %sEnd " % (att_ent.cat, sent[a1_start:a1_end], att_ent.cat) +
                                    sent[a1_end:])
                    ## lookup flair classification format
                    rel_out.write('__label__%s %s \n' % (label, rel_text))
    
    rel_out.close()
Exemple #27
0
def sent_pos(in_dir):
    """ Positions of citation markers in sentences, relatve to where in doc
    """

    punkt_param = PunktParameters()
    abbreviation = ['al', 'fig', 'e.g', 'i.e', 'eq', 'cf', 'ref', 'refs']
    punkt_param.abbrev_types = set(abbreviation)
    tokenizer = PunktSentenceTokenizer(punkt_param)

    with open('hedge_words') as f:
        hedge_words = [l.strip() for l in f.readlines()]

    x = []
    y = []
    file_names = os.listdir(in_dir)
    buckets = []
    for foo in range(10):
        buckets.append([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
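    # buckets is a 10x10 grid: rows = sentence position in the document
    # (deciles of 1 - sent_idx/doc_len), columns = citation marker position
    # within the sentence (deciles of (word_idx + 1)/sent_len)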
    for file_idx, fn in enumerate(file_names):
        if file_idx % 100 == 0:
            print('{}/{}'.format(file_idx, len(file_names)))
        path = os.path.join(in_dir, fn)
        aid, ext = os.path.splitext(fn)
        if ext != '.txt' or aid == 'log':
            continue
        with open(path) as f:
            text = f.read()
        text = re.sub(E_G_PATT, 'e.g.', text)
        # annot_fn = '{}_annot.json'.format(aid)
        # annot_path = os.path.join(in_dir, annot_fn)
        # if not os.path.isfile(annot_path):
        #     continue
        # with open(annot_path) as f:
        #     annots = json.load(f)

        marker = ' \u241F '
        doc_len = len(text)
        # ↓ word wise
        for sent_idx, sent_edx in tokenizer.span_tokenize(text):
            sentence_orig = text[sent_idx:sent_edx]
            sentence = re.sub(CITE_MULTI_PATT, marker, sentence_orig)
            sentence = re.sub(QUOTE_PATT, ' {}.'.format(marker), sentence)
            # determine contained annotations
            # annotated_words = []
            # for annot in annots:
            #     start = annot[0]
            #     end = annot[1]
            #     dbp_id = annot[2]
            #     annot_len = end - start
            #     in_sent_idx = start - sent_idx
            #     if start >= sent_idx and end <= sent_edx:
            #         disp = sentence_orig[in_sent_idx:in_sent_idx+annot_len]
            #         annotated_words.append(disp)
            if marker in sentence:
                doc_pos = 1 - (sent_idx / doc_len)
                buck_y_idx = math.floor(doc_pos * 10)
                if buck_y_idx == 10:
                    buck_y_idx = 9
                words = pos_tag(sentence.split())
                words = [w for w in words if re.search(r'[\w|\u241F]', w[0])]
                sent_len = len(words)
                sent_tags_str = ' '.join([tup[1] for tup in words])
                indices = [
                    i for i, tup in enumerate(words)
                    if tup[0] == marker.strip()
                ]
                # if 'JJS' not in sent_tags_str:
                #     continue
                for word_idx in indices:
                    word = words[word_idx][0]
                    # if word == marker.strip() and \
                    #     words[word_idx-1][1] == 'IN':

                    # if word == marker.strip() and \
                    #     ((word_idx > 0 and \
                    #       'FORMULA' not in words[word_idx-1][0] and \
                    #       words[word_idx-1][1] in ['NNP', 'NNPS']) or \
                    #      (word_idx > 1 and \
                    #       words[word_idx-1][1] in ['NN', 'NNS'] and \
                    #       'FORMULA' not in words[word_idx-2][0] and \
                    #       words[word_idx-2][1] in ['NNP', 'NNPS'])):

                    # if word == marker.strip() and \
                    #     (word_idx > 0 and \
                    #      words[word_idx-1][0] in annotated_words and \
                    #      words[word_idx-1][1] in ['NNP', 'NNPS']):

                    # if word == marker.strip() and \
                    #     word_idx+1 < len(words) and \
                    #     'VB' in words[word_idx+1][1]:

                    if word == marker.strip():
                        # print(words)
                        # print('doc pos:  {}'.format((sent_idx/doc_len)))
                        # print('sent pos: {}/{}'.format((word_idx+1),sent_len))
                        # input()
                        sent_pos = (word_idx + 1) / sent_len
                        y.append(doc_pos)
                        x.append(sent_pos)
                        buck_x_idx = math.floor(sent_pos * 10)
                        if buck_x_idx == 10:
                            buck_x_idx = 9
                        buckets[buck_y_idx][buck_x_idx] += 1
        # if file_idx > 1000:
        #     break

        # # ↓ character wise
        # for sent_idx, sentence in enumerate(sentences):
        #     # has_hw = False
        #     # for hw in hedge_words:
        #     #     if hw in sentence:
        #     #         has_hw = True
        #     #         break
        #     # if not has_hw:
        #     #     continue
        #     sent_len = len(sentence)
        #     doc_pos = 1 - (sent_idx/doc_len)
        #     buck_y_idx = math.floor(doc_pos*10)
        #     if buck_y_idx == 10:
        #         buck_y_idx = 9
        #     for cit_mark in re.finditer(marker, sentence):
        #         cm_idx = cit_mark.end()
        #         sent_pos = cm_idx/sent_len
        #         y.append(doc_pos)
        #         x.append(sent_pos)
        #         buck_x_idx = math.floor(sent_pos*10)
        #         if buck_x_idx == 10:
        #             buck_x_idx = 9
        #         buckets[buck_y_idx][buck_x_idx] += 1

    print('normalized row distributions:')
    for line in buckets:
        print(' '.join(['{:.2f}'.format(x / sum(line)) for x in line]))

    plt.xlabel('citation marker position in sentence')
    plt.ylabel('sentence position in document')

    heatmap, xedges, yedges = np.histogram2d(x, y, bins=(50))
    extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
    plt.imshow(heatmap.T, extent=extent, origin='lower', norm=LogNorm())
    # plt.imshow(heatmap.T, extent=extent, origin='lower')
    plt.colorbar()
    plt.show()

    plt.clf()

    plt.xlabel('citation marker position in sentence')
    plt.ylabel('sentence position in document')

    heatmap, xedges, yedges = np.histogram2d(x, y, bins=(50))
    extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
    # plt.imshow(heatmap.T, extent=extent, origin='lower', norm=LogNorm())
    plt.imshow(heatmap.T, extent=extent, origin='lower')
    plt.colorbar()
    plt.show()
Exemple #28
0
def main(args):
    args = parser.parse_args()

    # Loading classifier model:
    print("Loading classifier model")
    classifier = TextClassifier.load_from_file(join(args.model_dir, 'best-model.pt'))

    txt_files = glob.glob(join(args.data_dir, '*.txt'))
    
    sent_splitter = PunktSentenceTokenizer()
    tokenizer = TreebankWordTokenizer()
    sentence_lookahead = 0

    for txt_fn in txt_files:
        print("Processing %s" % (txt_fn))
        ann_input_fn = join(args.data_dir, basename(txt_fn)[:-3]+'ann')
        ents, _ = read_brat_file(ann_input_fn)

        ann_output_fn = join(args.output_dir, basename(txt_fn)[:-3]+'ann')
        with open(txt_fn, 'r') as myfile:
            text = myfile.read()

        ann_out = open(ann_output_fn, 'w')
        
        # Write entities right away:
        for ent_id in ents.keys():
            ent = ents[ent_id]
            ent_text = text[ent.start:ent.end].replace('\n', ' ')
            ann_out.write('%s\t%s %d %d\t%s\n' % (ent_id, ent.cat, ent.start, ent.end, ent_text))

        sent_spans = list(sent_splitter.span_tokenize(text))

        rel_ind = 0
        rel_attempts = 0
        for sent_ind in range(len(sent_spans)):
            primary_sent_span = sent_spans[sent_ind]
            end_window_ind = min(sent_ind+sentence_lookahead, len(sent_spans)-1)
            end_sent_span = sent_spans[end_window_ind]

            sent = text[primary_sent_span[0]:end_sent_span[1]].replace('\n', ' ')
            drug_ents, att_ents = get_span_ents(primary_sent_span, end_sent_span, ents)
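            # Same delimiter-marking scheme as at training time: wrap the
            # attribute and drug spans, classify the marked sentence with the
            # flair TextClassifier, and keep a predicted relation only if its
            # label matches the attribute entity's category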

            for att_ent in att_ents:
                for drug_ent in drug_ents:
                    ## Get index of ents into sent:
                    a1_start = att_ent.start - primary_sent_span[0]
                    a1_end = att_ent.end - primary_sent_span[0]
                    a1_text = sent[a1_start:a1_end]

                    a2_start = drug_ent.start - primary_sent_span[0]
                    a2_end = drug_ent.end - primary_sent_span[0]
                    a2_text = sent[a2_start:a2_end]

                    if a1_start < a2_start:
                        # arg1 occurs before arg2
                        rel_text = (sent[:a1_start] + 
                                    " %sStart %s %sEnd " % (att_ent.cat, a1_text, att_ent.cat) +
                                    sent[a1_end:a2_start] +
                                    " DrugStart %s DrugEnd" % (a2_text) +
                                    sent[a2_end:])
                    else:
                        rel_text = (sent[:a2_start] +
                                    " DrugStart %s DrugEnd " % (a2_text) +
                                    sent[a2_end:a1_start] +
                                    " %sStart %s %sEnd " % (att_ent.cat, a1_text, att_ent.cat) +
                                    sent[a1_end:])

                    # if att_ent.cat == 'Dosage':
                        # print("working with Dosage ent")
                    sentence = Sentence(rel_text, use_tokenizer=True)
                    labels = classifier.predict(sentence)[0].labels
                    if len(labels) > 1:
                        print('  This relation has more than one output label')
                    label = labels[0].value
                    # print("Comparing ent %s and ent %s and got %s" % (att_ent.id, drug_ent.id, label))
                    rel_attempts += 1
                    if not label == 'None':
                        # Make sure label corresponds to entity type:
                        if label.find(att_ent.cat) < 0:
                            # print("  Skipping found relation where label %s doesn't match arg type %s" % (label, att_ent.cat))
                            continue
                        ann_out.write('R%d\t%s Arg1:%s Arg2:%s\n' % (rel_ind, label, att_ent.id, drug_ent.id))
                        rel_ind += 1

        # print("Finished: Found %d relations while making %d classification attempts" % (rel_ind, rel_attempts))
        ann_out.close()