Example #1
File: search.py Project: dmcc/brat
def _split_and_tokenize(s):
    """
    Helper, sentence-splits and tokenizes, returns array comparable to
    what you would get from re.split(r'(\s+)', s).
    """
    from ssplit import en_sentence_boundary_gen
    from tokenise import en_token_boundary_gen

    tokens = []

    sprev = 0
    for sstart, send in en_sentence_boundary_gen(s):
        if sprev != sstart:
            # between-sentence space
            tokens.append(s[sprev:sstart])
        stext = s[sstart:send]
        tprev = 0  # token offsets from the generator are relative to stext
        for tstart, tend in en_token_boundary_gen(stext):
            if tprev != tstart:
                # between-token space
                tokens.append(s[sstart+tprev:sstart+tstart])
            tokens.append(s[sstart+tstart:sstart+tend])
            tprev = tend
        sprev = send

    if sprev != len(s):
        # document-final space
        tokens.append(s[sprev:])

    assert "".join(tokens) == s, "INTERNAL ERROR\n'%s'\n'%s'" % ("".join(tokens),s)

    return tokens
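
A minimal usage sketch (assuming brat's server/src directory is on sys.path so that the ssplit and tokenise modules import; exact token boundaries depend on the tokeniser):

text = "First sentence. Second one."
parts = _split_and_tokenize(text)
# Tokens alternate with the exact whitespace between them, so the
# concatenation round-trips, mirroring re.split(r'(\s+)', text).
assert "".join(parts) == text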
Example #2
def _enrich_json_with_text(j_dic, txt_file_path, raw_text=None):
    if raw_text is not None:
        # looks like somebody read this already; nice
        text = raw_text
    else:
        # need to read raw text
        try:
            with open_textfile(txt_file_path, 'r') as txt_file:
                text = txt_file.read()
        except IOError:
            raise UnableToReadTextFile(txt_file_path)
        except UnicodeDecodeError:
            Messager.error(
                'Error reading text file: nonstandard encoding or binary?', -1)
            raise UnableToReadTextFile(txt_file_path)

    # TODO XXX huge hack, sorry: the client currently crashes on
    # Chrome with two or more consecutive spaces, so replace every
    # second one with a literal non-breaking space. Note that this is
    # just for the client display -- server-side storage is not
    # affected. NOTE: it might be possible to fix this in a principled
    # way by having xml:space="preserve" on the relevant elements.
    text = text.replace("  ", ' ' + unichr(0x00A0))

    j_dic['text'] = text

    from logging import info as log_info

    # First, generate tokenisation
    if JAPANESE:
        from tokenise import jp_token_boundary_gen
        token_offsets = [o for o in jp_token_boundary_gen(text)]
    else:
        from tokenise import en_token_boundary_gen
        token_offsets = [o for o in en_token_boundary_gen(text)]
    j_dic['token_offsets'] = token_offsets

    if NEWLINE_SS:
        from ssplit import newline_sentence_boundary_gen
        sentence_offsets = [o for o in newline_sentence_boundary_gen(text)]
    elif JAPANESE:
        from ssplit import jp_sentence_boundary_gen
        sentence_offsets = [o for o in jp_sentence_boundary_gen(text)]
        #log_info('offsets: ' + str(offsets))
    else:
        from ssplit import en_sentence_boundary_gen
        sentence_offsets = [o for o in en_sentence_boundary_gen(text)]
        #log_info('offsets: ' + str(sentence_offsets))
    j_dic['sentence_offsets'] = sentence_offsets

    return True
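
A hedged usage sketch (assumes brat's JAPANESE and NEWLINE_SS config flags and the module's other helpers are in scope, and Python 2 for unichr; passing raw_text skips the file read, so no text file is needed):

j_dic = {}
_enrich_json_with_text(j_dic, None, raw_text=u"One sentence. Two.")
# j_dic now carries the display text plus offset lists:
#   j_dic['text']             - text, with doubled spaces patched for display
#   j_dic['token_offsets']    - [(start, end), ...] per token
#   j_dic['sentence_offsets'] - [(start, end), ...] per sentence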
Example #3
def sentencebreaks_to_newlines(text):
    offsets = [o for o in en_sentence_boundary_gen(text)]

    # adjust to include any initial space skipped by the
    # boundary generator. (TODO: fix generator instead.)
    if offsets and offsets[0][0] > 0:
        offsets.insert(0, (0, offsets[0][0]))

    # break into sentences
    sentences = [s for s in _text_by_offsets_gen(text, offsets)]

    # join up, adding a newline for space where possible
    orig_parts = []
    new_parts = []

    sentnum = len(sentences)
    for i in range(sentnum):
        sent = sentences[i]
        orig_parts.append(sent)
        new_parts.append(sent)

        if i < sentnum - 1:
            orig_parts.append(text[offsets[i][1]:offsets[i + 1][0]])

            if (offsets[i][1] < offsets[i + 1][0]
                    and text[offsets[i][1]].isspace()):
                # intervening space; can add newline
                new_parts.append('\n' +
                                 text[offsets[i][1] + 1:offsets[i + 1][0]])
            else:
                new_parts.append(text[offsets[i][1]:offsets[i + 1][0]])

    if len(offsets) and offsets[-1][1] < len(text):
        orig_parts.append(text[offsets[-1][1]:])
        new_parts.append(text[offsets[-1][1]:])

    # sanity check
    assert text == ''.join(
        orig_parts), "INTERNAL ERROR:\n    '%s'\nvs\n    '%s'" % (
            text, ''.join(orig_parts))

    splittext = ''.join(new_parts)

    # sanity
    assert len(text) == len(splittext), "INTERNAL ERROR"
    assert _normspace(text) == _normspace(
        splittext), "INTERNAL ERROR:\n    '%s'\nvs\n    '%s'" % (
            _normspace(text), _normspace(splittext))

    return splittext
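
For illustration (same import assumptions for en_sentence_boundary_gen and _text_by_offsets_gen, and assuming the splitter reports the two sentences), the transformation is length-preserving: one space between sentences is traded for the newline, which is exactly what the final asserts check:

text = "First sentence. Second sentence."
result = sentencebreaks_to_newlines(text)
# result == "First sentence.\nSecond sentence."
assert len(result) == len(text)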
Example #4
def split_sentences(text):
    offsets = [o for o in en_sentence_boundary_gen(text)]

    # adjust to include any initial space skipped by the
    # boundary generator. (TODO: fix generator instead.)
    if offsets and offsets[0][0] > 0:
        offsets.insert(0, (0, offsets[0][0]))

    # adjust to include any intervening space
    adjusted = []
    for i in range(len(offsets)-1):
        adjusted.append((offsets[i][0], offsets[i+1][0]))
    if offsets:
        adjusted.append((offsets[-1][0], len(text)))
    offsets = adjusted

    return [s for s in _text_by_offsets_gen(text, offsets)]
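
A small usage sketch (same import assumptions; exact boundaries depend on the splitter):

sentences = split_sentences("One. Two. Three.")
# Each element runs from its sentence start to the next sentence start
# (the last one to the end of the text), so the pieces cover the input
# without gaps: ''.join(sentences) == "One. Two. Three."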
Example #5
def sentencebreaks_to_newlines(text):
    offsets = [o for o in en_sentence_boundary_gen(text)]

    # break into sentences
    sentences = [s for s in _text_by_offsets_gen(text, offsets)]

    # join up, adding a newline for space where possible
    orig_parts = []
    new_parts = []

    sentnum = len(sentences)
    for i in range(sentnum):
        sent = sentences[i]
        orig_parts.append(sent)
        new_parts.append(sent)

        if i < sentnum - 1:
            orig_parts.append(text[offsets[i][1]:offsets[i + 1][0]])

            if (offsets[i][1] < offsets[i + 1][0]
                    and text[offsets[i][1]].isspace()):
                # intervening space; can add newline
                new_parts.append('\n' +
                                 text[offsets[i][1] + 1:offsets[i + 1][0]])
            else:
                new_parts.append(text[offsets[i][1]:offsets[i + 1][0]])

    if len(offsets) and offsets[-1][1] < len(text):
        orig_parts.append(text[offsets[-1][1]:])
        new_parts.append(text[offsets[-1][1]:])

    # sanity check
    assert text == ''.join(
        orig_parts), "INTERNAL ERROR:\n    '%s'\nvs\n    '%s'" % (
            text, ''.join(orig_parts))

    splittext = ''.join(new_parts)

    # sanity
    assert len(text) == len(splittext), "INTERNAL ERROR"
    assert _normspace(text) == _normspace(
        splittext), "INTERNAL ERROR:\n    '%s'\nvs\n    '%s'" % (
            _normspace(text), _normspace(splittext))

    return splittext
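
This variant omits the initial-offset adjustment of Example #3. Judging by the comment there ("any initial space skipped by the boundary generator"), input with leading whitespace would go missing here; a hedged illustration:

# sentencebreaks_to_newlines("  Leading space. Next.")
# If the generator skips the leading "  ", orig_parts omits it and the
# assert text == ''.join(orig_parts) fails; the adjusted variant in
# Example #3 handles this case.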
Example #6
File: search.py Project: dmcc/brat
def _get_offset_sentence_map(s):
    """
    Helper, sentence-splits and returns a mapping from character
    offsets to sentence number.
    """
    from ssplit import en_sentence_boundary_gen

    m = {} # TODO: why is this a dict and not an array?
    sprev, snum = 0, 1 # note: sentences indexed from 1
    for sstart, send in en_sentence_boundary_gen(s):
        # if there are extra newlines (i.e. more than one) in between
        # the previous end and the current start, those need to be
        # added to the sentence number
        snum += max(0, s[sprev:sstart].count("\n") - 1)
        for o in range(sprev, send):
            m[o] = snum
        sprev = send
        snum += 1
    return m
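
A usage sketch (assuming ssplit resolves as in Example #1 and the splitter reports the two sentences at (0, 4) and (6, 10)):

m = _get_offset_sentence_map("One.\n\nTwo.")
# m maps each character offset to a 1-based sentence number; the blank
# line between the sentences bumps the count, so offsets inside "Two."
# map to sentence 3 rather than 2, e.g. m[6] == 3.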