Example #1
def text_to_conll(f):
    """Convert plain text into CoNLL format."""
    global options
    # print(f)
    if options.nosplit:
        sentences = f.readlines()
        # print(sentences)
    else:
        sentences = []
        for l in f:
            l = sentencebreaks_to_newlines(l)

            sentences.extend([s for s in NEWLINE_TERM_REGEX.split(l) if s])

    lines = []

    offset = 0
    # print(sentences)
    for s in sentences:
        nonspace_token_seen = False
        # print(s)

        # tokens = s.split(" ")
        tokens = word_tokenize(s)

        token_w_pos = map_text_to_char(s, tokens, offset)
        # print("token_w_pos: ",token_w_pos)

        for (t, pos) in token_w_pos:
            if not t.isspace():
                l1 = ['O', pos, pos + len(t), t]
                lines.append(l1)
                # print(l1)

        lines.append([])

        offset += len(s)

        # tokens = [t for t in TOKENIZATION_REGEX.split(s) if t] # JT : Dec 6
        # for t in tokens:
        #     if not t.isspace():
        #         lines.append(['O', offset, offset + len(t), t])
        #         nonspace_token_seen = True
        #     offset += len(t)

        # # sentences delimited by empty lines
        # if nonspace_token_seen:
        #     lines.append([])

    # add labels (other than 'O') from standoff annotation if specified
    if options.annsuffix:
        textbounds, dict_of_entity, list_of_relns = get_annotations(f.name)
        lines = relabel(lines, textbounds, dict_of_entity, list_of_relns, f)
        # print(lines)

    # lines = [[l[0], str(l[1]), str(l[2]), l[3]] if l else l for l in lines] #JT: Dec 6
    lines = [[l[3], l[0]] if l else l for l in lines]  #JT: Dec 6
    # lines = [[l[3],l[0],l[4],l[5],l[6]] if l else l for l in lines] #JT: Dec 6

    return StringIO('\n'.join(('\t'.join(l) for l in lines)))
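Example #1 calls a `map_text_to_char` helper that isn't shown. A minimal sketch of what it plausibly does: pair each token with its absolute character offset, resuming the search after each match so repeated tokens map to successive positions. The fallback for tokens the tokenizer has normalised is an assumption.

# Hedged sketch of the assumed map_text_to_char helper.
def map_text_to_char(sentence, tokens, offset):
    pairs = []
    cursor = 0
    for t in tokens:
        idx = sentence.find(t, cursor)
        if idx < 0:          # token was normalised (e.g. quotes); best effort
            idx = cursor
        pairs.append((t, offset + idx))
        cursor = idx + len(t)
    return pairs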
Example #2
def preproc_document(doc_id,inp_dir,interm_dir,out_dir,abbreviations,taggers):
	"""
	Returns:

	language, number of sentences, number of tokens

	"""
	lang, no_sentences, no_tokens = np.nan,np.nan,np.nan
	try:
		intermediate_out_file = "%s%s"%(interm_dir,doc_id)
		iob_out_file = "%s%s"%(out_dir,doc_id)
		text = codecs.open("%s%s"%(inp_dir,doc_id),'r','utf-8').read()
		intermediate_text = sentencebreaks_to_newlines(text)
		recovered_text= recover_segmentation_errors(intermediate_text,abbreviations,verbose=False)
		codecs.open(intermediate_out_file,'w','utf-8').write(recovered_text)
		logger.info("Written intermediate output to %s"%intermediate_out_file)
		lang = detect_language(text)
		logger.info("Language detected=\"%s\""%lang)
		sentences = recovered_text.split('\n')
		logger.info("Document \"%s\" has %i sentences"%(doc_id,len(sentences)))
		tagged_sentences = taggers[lang].tag_sents(sentences)
		tokenised_text = [[token[:2] for token in line] for line in tagged_sentences]
		IO.write_iob_file(tokenised_text,iob_out_file)
		logger.info("Written IOB output to %s"%iob_out_file)
		no_sentences = len(sentences)
		no_tokens = IO.count_tokens(tokenised_text)
	except Exception as e:
		logger.error("The pre-processing of document %s (lang='%s') failed with error \"%s\"" % (doc_id, lang, e))
	return lang, no_sentences, no_tokens
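Example #2's `preproc_document` is a per-document worker. A hedged driver sketch that runs it over a flat input directory; the directory layout and file naming are assumptions carried over from the function's parameters.

# Hypothetical driver (names illustrative): preprocess every document in
# inp_dir and collect the (lang, no_sentences, no_tokens) triples returned.
import os

def preproc_all(inp_dir, interm_dir, out_dir, abbreviations, taggers):
    stats = {}
    for doc_id in sorted(os.listdir(inp_dir)):
        stats[doc_id] = preproc_document(doc_id, inp_dir, interm_dir,
                                         out_dir, abbreviations, taggers)
    return stats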
Example #3
def _apply_tagger(text):

    try:
        splittext = sentencebreaks_to_newlines(text)
    except Exception:
        print("Warning: sentence splitting failed for input:\n'%s'" % text,
              file=stderr)
        splittext = text

    sentences = splittext.split('\n')
    all_tagged = []
    baseoffset = 0
    for s in sentences:
        tagged = _apply_tagger_to_sentence(s)

        for t in tagged:
            t.startOff += baseoffset
            t.endOff += baseoffset

        all_tagged.extend(tagged)
        baseoffset += len(s) + 1

    anns = {}

    idseq = 1
    for t in all_tagged:
        anns["T%d" % idseq] = {
            'type': t.eType,
            'offsets': ((t.startOff, t.endOff), ),
            'texts': (t.eText, ),
        }
        idseq += 1

    return anns
def _apply_tagger(text):
    global tagger_process, tagger_queue

    # the tagger expects a sentence per line, so do basic splitting
    try:
        splittext = sentencebreaks_to_newlines(text)
    except BaseException:
        # if anything goes wrong, just go with the
        # original text instead
        print("Warning: sentence splitting failed for input:\n'%s'" % text,
              file=stderr)
        splittext = text

    print(splittext, file=tagger_process.stdin)
    print(DOCUMENT_BOUNDARY, file=tagger_process.stdin)
    tagger_process.stdin.flush()

    response_lines = []
    while True:
        l = tagger_process.stdout.readline()
        l = l.rstrip('\n')

        if l == DOCUMENT_BOUNDARY:
            break

        response_lines.append(l)

    try:
        tagged_entities = BIO_lines_to_standoff(response_lines, text)
    except BaseException:
        # if anything goes wrong, bail out
        print("Warning: BIO-to-standoff conversion failed for BIO:\n'%s'" %
              '\n'.join(response_lines),
              file=stderr)
        return {}

    anns = {}

    for t in tagged_entities:
        anns["T%d" % t.idNum] = {
            'type': t.eType,
            'offsets': ((t.startOff, t.endOff), ),
            'texts': (t.eText, ),
        }

    return anns
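Both `_apply_tagger` variants return brat-style standoff annotations. For reference, a minimal illustration of the returned `anns` structure for a two-entity document; the types, offsets, and texts are made up for the example.

# Illustrative only: the shape of the dict _apply_tagger returns.
anns = {
    'T1': {'type': 'Protein', 'offsets': ((0, 5),), 'texts': ('BRCA1',)},
    'T2': {'type': 'Disease', 'offsets': ((19, 25),), 'texts': ('cancer',)},
}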
Example #5
def text_to_conll(f):
    """Convert plain text into CoNLL format."""
    global options

    if options.nosplit:
        sentences = f.readlines()
    else:
        sentences = []
        for l in f:

            l, junk_mark = sentencebreaks_to_newlines(l)

            if not junk_mark:

                sentences.extend([s for s in NEWLINE_TERM_REGEX.split(l) if s])

            else:
                JUNKS_FILE.append(f)
                break

    lines = []

    offset = 0
    for s in sentences:
        nonspace_token_seen = False

        tokens = [t for t in TOKENIZATION_REGEX.split(s) if t]

        for t in tokens:
            if not t.isspace():
                lines.append(['O', offset, offset + len(t), t])
                nonspace_token_seen = True
            offset += len(t)

        # sentences delimited by empty lines
        if nonspace_token_seen:
            lines.append([])

    # add labels (other than 'O') from standoff annotation if specified
    if options.annsuffix:
        lines = relabel(lines, get_annotations(f.name))

    # TODO: is the column order wrong?!
    # lines = [[l[0], str(l[1]), str(l[2]), l[3]] if l else l for l in lines]
    lines = [[l[3], str(l[1]), str(l[2]), l[0]] if l else l for l in lines]
    return StringIO('\n'.join(('\t'.join(l) for l in lines)))
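Example #5 assumes a modified `sentencebreaks_to_newlines` that returns a junk flag alongside the split text. A hedged sketch of such a wrapper over the stock splitter; the flag semantics are inferred from the call site, not taken from the original code.

# Hypothetical wrapper: report failure via a flag instead of raising, so the
# caller can divert unusable input to JUNKS_FILE and stop processing.
from sentencesplit import sentencebreaks_to_newlines as _stock_split

def sentencebreaks_to_newlines_flagged(line):
    try:
        return _stock_split(line), False
    except Exception:
        return line, True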
Example #6
def text_to_conll(f):
    """Convert plain text into CoNLL format."""
    sentences = []
    for l in f:
        l = sentencebreaks_to_newlines(l)
        sentences.extend([s for s in NEWLINE_TERM_REGEX.split(l) if s])

    lines = []
    for s in sentences:
        nonspace_token_seen = False
        tokens = [t for t in s.split() if t]
        for i, t in enumerate(tokens):
            if not t.isspace():
                # pre label rules designed by Deheng
                #if API_pattern.match(t) is not None:
                #    lines.append([t, 'B-API'])
                if 0 < i < len(tokens) - 1:
                    # join with the neighbouring tokens; the i > 0 guard keeps
                    # tokens[i - 1] from wrapping around to the last token
                    comp = (tokens[i - 1] + t + tokens[i + 1]).lower()
                else:
                    comp = ""

                if t.endswith("()"):
                    #print t
                    t_nobracket = t[:-2]
                    if t_nobracket.lower() in api_list:
                        lines.append([t, 'B-API'])
                    else:
                        lines.append([t, 'O'])
                elif t.lower() in api_list:
                    #print t
                    lines.append([t, 'B-API'])
                elif comp in api_list:
                    print(comp)
                    lines.append([t, 'B-API'])
                else:
                    lines.append([t, 'O'])
                nonspace_token_seen = True
        # sentences delimited by empty lines
        if nonspace_token_seen:
            lines.append([])

    lines = [[l[0], l[1]] if l else l for l in lines]
    return StringIO('\n'.join(('\t'.join(l) for l in lines)))
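Example #6 tests `api_list` membership up to three times per token, and with a list each test is a linear scan. A sketch that loads the gazetteer into a set instead; the `real_amb.txt` file name is borrowed from Example #16, and lower-casing on load matches the lower-cased lookups in the loop above.

# Sketch: load the API gazetteer into a set for O(1) membership tests.
api_list = set()
with open('real_amb.txt', 'r') as gaz:
    for line in gaz:
        api_list.add(line.strip().lower())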
Example #7
def _apply_tagger(text):
    global tagger_process, tagger_queue

    # the tagger expects a sentence per line, so do basic splitting
    try:
        splittext = sentencebreaks_to_newlines(text)
    except BaseException:
        # if anything goes wrong, just go with the
        # original text instead
        print("Warning: sentence splitting failed for input:\n'%s'" % text, file=stderr)
        splittext = text

    print(splittext, file=tagger_process.stdin)
    print(DOCUMENT_BOUNDARY, file=tagger_process.stdin)
    tagger_process.stdin.flush()

    response_lines = []
    while True:
        l = tagger_process.stdout.readline()
        l = l.rstrip('\n')

        if l == DOCUMENT_BOUNDARY:
            break

        response_lines.append(l)

    try:
        tagged_entities = BIO_lines_to_standoff(response_lines, text)
    except BaseException:
        # if anything goes wrong, bail out
        print("Warning: BIO-to-standoff conversion failed for BIO:\n'%s'" % '\n'.join(
            response_lines), file=stderr)
        return {}

    anns = {}

    for t in tagged_entities:
        anns["T%d" % t.idNum] = {
            'type': t.eType,
            'offsets': ((t.startOff, t.endOff), ),
            'texts': (t.eText, ),
        }

    return anns
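Examples #3, #7, and #10 assume a long-lived `tagger_process` speaking a line protocol terminated by `DOCUMENT_BOUNDARY`. A hedged sketch of how such a process might be started; the command line and sentinel value are placeholders, not the real tagger invocation.

# Hypothetical setup for the tagger subprocess used by _apply_tagger.
import subprocess

DOCUMENT_BOUNDARY = '-DOC-BOUNDARY-'  # assumed sentinel, echoed back by the tagger

tagger_process = subprocess.Popen(
    ['./tagger', '--stdin'],          # placeholder command
    stdin=subprocess.PIPE,
    stdout=subprocess.PIPE,
    universal_newlines=True,          # text mode so print()/readline() work
    bufsize=1,                        # line-buffered
)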
Example #8
def process_rels(terms_dict: Dict, text: str, rels: List):
    sentences = sentencebreaks_to_newlines(text)
    sent_stops = [m.start() for m in re.finditer('\n', sentences)]  # find sentence-break positions
    assert len(sentences) == len(text)  # splitting replaces break characters in place, so length is preserved

    sent_bounds: List = []

    for ent1, relation, ent2 in rels:
        arg1_bounds: List = BRATHelper.get_ent_bounds(ent1)
        rel_bounds: List = BRATHelper.get_ent_bounds(relation)
        arg2_bounds: List = BRATHelper.get_ent_bounds(ent2)

        sent_min = min(arg1_bounds + rel_bounds + arg2_bounds)
        sent_max = max(arg1_bounds + rel_bounds + arg2_bounds)
        sent_min, sent_max = BRATHelper.get_sent_bounds(sent_stops, sent_min, sent_max)  # bounds of the enclosing sentence

        sent_bounds.append((sent_min, sent_max))

    return sent_bounds
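`BRATHelper.get_sent_bounds` is not shown. A minimal sketch of the lookup the call site implies: expand an entity span to the bounds of its enclosing sentence(s) by binary search over the newline offsets. This is an assumption about its behaviour, not the actual implementation.

import bisect

def get_sent_bounds(sent_stops, span_min, span_max):
    # sent_stops: sorted offsets of the '\n' sentence terminators in the text.
    i = bisect.bisect_left(sent_stops, span_min)
    start = sent_stops[i - 1] + 1 if i > 0 else 0   # one past the previous break
    j = bisect.bisect_left(sent_stops, span_max)
    end = sent_stops[j] if j < len(sent_stops) else span_max
    return start, end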
Example #9
def text_to_conll(f):
    """Convert plain text into CoNLL format."""
    global options

    if options.nosplit:
        sentences = f.readlines()
    else:
        sentences = []
        for l in f:
            l = sentencebreaks_to_newlines(l)
            sentences.extend([s for s in NEWLINE_TERM_REGEX.split(l) if s])

    lines = []

    offset = 0
    for s in sentences:

        nonspace_token_seen = False
        # pdb.set_trace()
        tokens = nltk.word_tokenize(s)
        # tokens = [t for t in TOKENIZATION_REGEX.split(s) if t]
        for t in tokens:
            if not t.isspace():
                # pdb.set_trace()

                lines.append(['O', offset, offset + len(t), t])
                nonspace_token_seen = True
            offset += len(t)

        # sentences delimited by empty lines
        if nonspace_token_seen:
            # pdb.set_trace()
            lines.append([])

    # add labels (other than 'O') from standoff annotation if specified
    if options.annsuffix:
        lines = relabel(lines, get_annotations(f.name))

    lines = [[l[0], str(l[1]), str(l[2]), l[3]] if l else l for l in lines]
    # pdb.set_trace()
    return StringIO('\n'.join(('\t'.join(l) for l in lines)))
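Note that `nltk.word_tokenize` discards the whitespace between tokens, so the `offset += len(t)` bookkeeping above under-counts and the emitted spans drift within each sentence. A hedged realignment helper in the spirit of Example #1's `map_text_to_char`, locating each token in the source sentence instead of summing token lengths.

# Sketch: character-accurate spans via substring search.
def tokens_with_offsets(sentence, tokens, base):
    cursor = 0
    for t in tokens:
        idx = sentence.find(t, cursor)
        if idx < 0:          # tokenizer normalised the token; best effort
            idx = cursor
        yield t, base + idx, base + idx + len(t)
        cursor = idx + len(t)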
Example #10
def _apply_tagger(text):
    global tagger_process, tagger_queue

    try:
        splittext = sentencebreaks_to_newlines(text)
    except Exception:
        print("Warning: sentence splitting failed for input:\n'%s'" % text,
              file=stderr)
        splittext = text

    print(splittext, file=tagger_process.stdin)
    print(DOCUMENT_BOUNDARY, file=tagger_process.stdin)
    tagger_process.stdin.flush()

    response_lines = []
    while True:
        l = tagger_process.stdout.readline()
        l = l.rstrip('\n')

        if l == DOCUMENT_BOUNDARY:
            break

        response_lines.append(l)

    try:
        tagged_entities = BIO_lines_to_standoff(response_lines, text)
    except Exception:
        print("Warning: BIO-to-standoff conversion failed for BIO:\n'%s'" %
              '\n'.join(response_lines),
              file=stderr)
        return {}

    anns = {}

    for t in tagged_entities:
        anns["T%d" % t.idNum] = {
            'type': t.eType,
            'offsets': ((t.startOff, t.endOff), ),
            'texts': (t.eText, ),
        }

    return anns
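If the tagger subprocess exits, `readline()` returns an empty string forever and the read loop above never terminates. A hedged variant of the loop with an EOF guard:

# Sketch: the same read loop, guarded against the subprocess dying.
response_lines = []
while True:
    l = tagger_process.stdout.readline()
    if l == '':              # EOF: the tagger process has terminated
        raise RuntimeError('tagger closed its output before the boundary')
    l = l.rstrip('\n')
    if l == DOCUMENT_BOUNDARY:
        break
    response_lines.append(l)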
Example #11
def text_to_conll_lines(f):
    """Convert plain text into CoNLL format."""
    global options

    if not options:
        options = argparser_internal().parse_args(None)

    if options.nosplit:
        sentences = f.readlines()
    else:
        sentences = []
        for l in f:
            l = sentencebreaks_to_newlines(l)
            sentences.extend([s for s in NEWLINE_TERM_REGEX.split(l) if s])

    lines = []

    offset = 0
    for s in sentences:
        nonspace_token_seen = False

        tokens = [t for t in TOKENIZATION_REGEX.split(s) if t]

        for t in tokens:
            if not t.isspace():
                lines.append(['O', offset, offset+len(t), t])
                nonspace_token_seen = True
            offset += len(t)

        # sentences delimited by empty lines
        if nonspace_token_seen:
            lines.append([])

    # add labels (other than 'O') from standoff annotation if specified
    if options.annsuffix:
        lines = relabel(lines, get_annotations(f.name))

    lines = [[l[0], str(l[1]), str(l[2]), l[3]] if l else l for l in lines]
    lines = conll_to_standford(lines)
    lines = strip_xml_tag(lines)
    return lines
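For reference, the rows accumulated above are `[tag, start, end, token]`; after stringification they are tab-joined, one token per line, with an empty line between sentences, e.g. (offsets illustrative):

O	0	5	Hello
O	6	11	world

`conll_to_standford` and `strip_xml_tag` then reshape these rows; their implementations are not shown here.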
Example #13
def text_to_conll(f):
    """Convert plain text into CoNLL format."""
    global options
    if options.nosplit:
        sentences = f.readlines()
    else:
        sentences = []
        for l in f:
            l = sentencebreaks_to_newlines(l)
            sentences.extend([s for s in NEWLINE_TERM_REGEX.split(l) if s])
    lines = []

    offset = 0
    for s in sentences:
        nonspace_token_seen = False

        #tokens = [t for t in TOKENIZATION_REGEX.split(s) if t]
        tokens = [t for t in non_ascii_tokenizer(s) if t]

        # " " single-space is appended to token list
        # to fix the multi-line issue
        # Changed by Oyesh
        #tokens.append("")

        for t in tokens:
            if not t.isspace():
                lines.append(['O', offset, offset + len(t), t])
                nonspace_token_seen = True
            offset += len(t)

        # sentences delimited by empty lines
        if nonspace_token_seen:
            lines.append([])
    
    # add labels (other than 'O') from standoff annotation if specified
    if options.annsuffix:
        lines = relabel(lines, get_annotations(f.name))

    #lines = [[l[0], str(l[1]), str(l[2]), l[3]] if l else l for l in lines]
    lines = [[l[3], str(l[1]), str(l[2]), l[0]] if l else l for l in lines]
    return StringIO('\n'.join(('\t'.join(l) for l in lines)))
def _apply_tagger(text):
    # MetaMap isn't too happy with large outputs, so process a
    # sentence per invocation

    try:
        splittext = sentencebreaks_to_newlines(text)
    except BaseException:
        # if anything goes wrong, just go with the
        # original text instead
        print("Warning: sentence splitting failed for input:\n'%s'" % text, file=stderr)
        splittext = text

    sentences = splittext.split('\n')
    all_tagged = []
    baseoffset = 0
    for s in sentences:
        tagged = _apply_tagger_to_sentence(s)

        # adjust offsets
        for t in tagged:
            t.startOff += baseoffset
            t.endOff += baseoffset

        all_tagged.extend(tagged)
        baseoffset += len(s) + 1

    anns = {}

    idseq = 1
    for t in all_tagged:
        anns["T%d" % idseq] = {
            'type': t.eType,
            'offsets': ((t.startOff, t.endOff), ),
            'texts': (t.eText, ),
        }
        idseq += 1

    return anns
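The MetaMap variant relies on `_apply_tagger_to_sentence`, which isn't shown. A hedged stub documenting the interface the loop above expects: objects carrying `eType`, `eText`, and sentence-relative `startOff`/`endOff`. The real helper invokes MetaMap.

# Interface sketch only; the actual implementation calls out to MetaMap.
class TaggedEntity(object):
    def __init__(self, eType, eText, startOff, endOff):
        self.eType = eType
        self.eText = eText
        self.startOff = startOff
        self.endOff = endOff

def _apply_tagger_to_sentence(sentence):
    """Return TaggedEntity objects with offsets relative to `sentence`."""
    return []  # placeholder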
Example #15
def text_to_conll(f):
    """Convert plain text into CoNLL format."""
    global options

    if options.nosplit:
        sentences = f.readlines()
    else:
        sentences = []
        for l in f:
            l = sentencebreaks_to_newlines(l)
            sentences.extend([s for s in NEWLINE_TERM_REGEX.split(l) if s])

    lines = []

    offset = 0
    for s in sentences:
        nonspace_token_seen = False
        """fake_tokens = [t for t in TOKENIZATION_REGEX.split(s) if t]
        tokens = []
        quote_count = 0

        for i, t in enumerate(fake_tokens):
            if quote(t):
                quote_count += 1

            if i == 0:
                tokens.append(t)
            if i == len(fake_tokens)-1:
                tokens.append(t)

            if i > 0 and i < len(fake_tokens)-1:
                if fake_tokens[i]==' ':
                    if re.match(r'^[\(]$', fake_tokens[i-1]):
                        continue
                    if re.match(r'^[\.,\)\?\!]$', fake_tokens[i+1]):
                        continue
                    if quote(fake_tokens[i-1]) and quote_count is not None and quote_count%2==1:
                        continue
                    if quote(fake_tokens[i+1]) and quote_count is not None and quote_count%2==1:
                        continue
                tokens.append(t)
        #print tokens

        tokens2 = [t for t in s.split('\s') if t]
        #print tokens2
        """
        tokens = [t for t in TOKENIZATION_REGEX.split(s) if t]
        for t in tokens:
            if not t.isspace():
                lines.append(['O', offset, offset + len(t), t])
                nonspace_token_seen = True
            offset += len(t)

        # sentences delimited by empty lines
        if nonspace_token_seen:
            lines.append([])

    # add labels (other than 'O') from standoff annotation if specified
    if options.annsuffix:
        lines = relabel(lines, get_annotations(f.name))

    lines = [[l[0], str(l[1]), str(l[2]), l[3]] if l else l for l in lines]
    return StringIO('\n'.join(('\t'.join(l) for l in lines)))
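The commented-out experiment in Example #15 calls a `quote` predicate that isn't defined here. A plausible one-liner, assuming it matches straight and typographic double quotes:

def quote(token):
    # Assumption: a token counts as a quote if it is a straight or curly
    # double-quote character.
    return token in ('"', '\u201c', '\u201d')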
Example #16
# -*- coding: utf-8 -*-
import sys
import re
import os
from os.path import basename
from io import StringIO

sys.path.append(os.path.join(os.path.dirname(__file__), 'mylib'))
sys.path.append('.')
from sentencesplit import sentencebreaks_to_newlines
NEWLINE_TERM_REGEX = re.compile(r'(.*?\n)')

api_list = []
with open('real_amb.txt', 'r') as gaz:
    for line in gaz:
        line = str(line.strip())
        #line = line.lower()
        api_list.append(line)

fin = open(sys.argv[1], 'r')
fout = open(sys.argv[2], 'w')
for l in fin:
    sentences = []
    l = sentencebreaks_to_newlines(l)
    sentences.extend([s for s in NEWLINE_TERM_REGEX.split(l) if s])
    for s in sentences:
        #if any(api in s for api in api_list):
        #	fout.write(s)
        fout.write(s)
def text_to_conll(f):
    """Convert plain text into CoNLL format."""
    global options

    if options.nosplit:
        sentences = f.readlines()
    else:
        sentences = []
        for l in f:
            l = sentencebreaks_to_newlines(l)
            sentences.extend([s for s in NEWLINE_TERM_REGEX.split(l) if s])

    lines = []

    offset = 0
    # print(sentences)
    #JT: Feb 19: added it for resolving char encoding issues
    fixed_sentences = []
    for s in sentences:
        # print(s)
        # fixed_s = ftfy.fix_text(s)
        # # print(fixed_s)
        # fixed_sentences.append(fixed_s)
        fixed_sentences.append(s)

    # for s in sentences:
    for s in fixed_sentences:
        nonspace_token_seen = False
        # print(s)

        try:
            tokens = stokenizer.tokenize(s)
        except stokenizer.TimedOutExc as e:
            try:
                print("***********using ark tokenizer")
                tokens = ark_twokenize.tokenizeRawTweetText(s)
            except Exception as e:
                print(e)
        # print("tokens: ", tokens)
        token_w_pos = map_text_to_char(s, tokens, offset)
        # print("token_w_pos: ",token_w_pos)

        for (t, pos) in token_w_pos:
            if not t.isspace():
                lines.append(['O', pos, pos + len(t), t])

        lines.append([])

        offset += len(s)

        # tokens = [t for t in TOKENIZATION_REGEX.split(s) if t] # JT : Dec 6
        # for t in tokens:
        #     if not t.isspace():
        #         lines.append(['O', offset, offset + len(t), t])
        #         nonspace_token_seen = True
        #     offset += len(t)

        # # sentences delimited by empty lines
        # if nonspace_token_seen:
        #     lines.append([])

    # add labels (other than 'O') from standoff annotation if specified
    if options.annsuffix:
        lines = relabel(lines, get_annotations(f.name), f)

    # lines = [[l[0], str(l[1]), str(l[2]), l[3]] if l else l for l in lines] #JT: Dec 6
    lines = [[l[3], l[0]] if l else l for l in lines]  #JT: Dec 6
    return StringIO('\n'.join(('\t'.join(l) for l in lines)))
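Example #16's `stokenizer` raises `TimedOutExc` when tokenisation hangs. A hedged sketch of one common way to build such a guard (SIGALRM-based, Unix-only); the real module's internals may differ, and the tokenizer call is a placeholder.

# Sketch of a SIGALRM timeout guard matching the TimedOutExc name used above.
import signal

class TimedOutExc(Exception):
    pass

def _on_alarm(signum, frame):
    raise TimedOutExc('tokenization timed out')

def tokenize(s, seconds=5):
    signal.signal(signal.SIGALRM, _on_alarm)
    signal.alarm(seconds)
    try:
        return s.split()  # placeholder for the real tokenizer
    finally:
        signal.alarm(0)   # always cancel the pending alarm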