def text_to_conll(f):
    """Convert plain text into CoNLL format."""
    global options

    if options.nosplit:
        sentences = f.readlines()
    else:
        sentences = []
        for l in f:
            l = sentencebreaks_to_newlines(l)
            sentences.extend([s for s in NEWLINE_TERM_REGEX.split(l) if s])

    lines = []
    offset = 0
    for s in sentences:
        # nltk tokenization, mapped back to character offsets
        tokens = word_tokenize(s)
        token_w_pos = map_text_to_char(s, tokens, offset)
        for (t, pos) in token_w_pos:
            if not t.isspace():
                lines.append(['O', pos, pos + len(t), t])
        # sentences delimited by empty lines
        lines.append([])
        offset += len(s)

    # add labels (other than 'O') from standoff annotation if specified
    if options.annsuffix:
        textbounds, dict_of_entity, list_of_relns = get_annotations(f.name)
        lines = relabel(lines, textbounds, dict_of_entity, list_of_relns, f)

    # keep only the token and label columns  # JT: Dec 6
    lines = [[l[3], l[0]] if l else l for l in lines]

    return StringIO('\n'.join('\t'.join(l) for l in lines))
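# The function above depends on a helper, map_text_to_char, that is not shown.
# A minimal sketch of what it plausibly does -- align each token back to its
# character position and make the offsets document-absolute -- is given below;
# this is an assumption about its behaviour, not the original implementation:
def map_text_to_char(s, tokens, offset):
    """Return [(token, absolute_start_offset), ...] for tokens found in s."""
    result = []
    cursor = 0
    for t in tokens:
        pos = s.find(t, cursor)
        if pos == -1:
            # tokenizer normalised the token (e.g. nltk turns '"' into '``'); skip it
            continue
        result.append((t, offset + pos))
        cursor = pos + len(t)
    return result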
def preproc_document(doc_id, inp_dir, interm_dir, out_dir, abbreviations, taggers):
    """
    Returns: language, number of sentences, number of tokens
    """
    lang, no_sentences, no_tokens = np.nan, np.nan, np.nan
    try:
        intermediate_out_file = "%s%s" % (interm_dir, doc_id)
        iob_out_file = "%s%s" % (out_dir, doc_id)
        text = codecs.open("%s%s" % (inp_dir, doc_id), 'r', 'utf-8').read()
        intermediate_text = sentencebreaks_to_newlines(text)
        recovered_text = recover_segmentation_errors(intermediate_text, abbreviations, verbose=False)
        codecs.open(intermediate_out_file, 'w', 'utf-8').write(recovered_text)
        logger.info("Written intermediate output to %s" % intermediate_out_file)
        lang = detect_language(text)
        logger.info("Language detected=\"%s\"" % lang)
        sentences = recovered_text.split('\n')
        logger.info("Document \"%s\" has %i sentences" % (doc_id, len(sentences)))
        tagged_sentences = taggers[lang].tag_sents(sentences)
        tokenised_text = [[token[:2] for token in line] for line in tagged_sentences]
        IO.write_iob_file(tokenised_text, iob_out_file)
        logger.info("Written IOB output to %s" % iob_out_file)
        no_sentences = len(sentences)
        no_tokens = IO.count_tokens(tokenised_text)
    except Exception as e:
        logger.error("The pre-processing of document %s (lang='%s') failed with error \"%s\"" % (doc_id, lang, e))
    return lang, no_sentences, no_tokens
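# A hypothetical driver showing how preproc_document might be applied over a
# corpus directory; the function name and iteration scheme here are
# illustrative assumptions, not from the source:
import os

def preproc_corpus(inp_dir, interm_dir, out_dir, abbreviations, taggers):
    stats = {}
    for doc_id in sorted(os.listdir(inp_dir)):
        stats[doc_id] = preproc_document(doc_id, inp_dir, interm_dir,
                                         out_dir, abbreviations, taggers)
    return stats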
def _apply_tagger(text):
    # the tagger is applied one sentence per invocation, so split first
    try:
        splittext = sentencebreaks_to_newlines(text)
    except:
        print >> stderr, "Warning: sentence splitting failed for input:\n'%s'" % text
        splittext = text

    sentences = splittext.split('\n')
    all_tagged = []
    baseoffset = 0
    for s in sentences:
        tagged = _apply_tagger_to_sentence(s)
        # adjust sentence-local offsets to document-absolute ones
        for t in tagged:
            t.startOff += baseoffset
            t.endOff += baseoffset
        all_tagged.extend(tagged)
        baseoffset += len(s) + 1

    anns = {}
    idseq = 1
    for t in all_tagged:
        anns["T%d" % idseq] = {
            'type': t.eType,
            'offsets': ((t.startOff, t.endOff), ),
            'texts': (t.eText, ),
        }
        idseq += 1
    return anns
def _apply_tagger(text):
    global tagger_process, tagger_queue

    # the tagger expects a sentence per line, so do basic splitting
    try:
        splittext = sentencebreaks_to_newlines(text)
    except BaseException:
        # if anything goes wrong, just go with the original text instead
        print("Warning: sentence splitting failed for input:\n'%s'" % text,
              file=stderr)
        splittext = text

    print(splittext, file=tagger_process.stdin)
    print(DOCUMENT_BOUNDARY, file=tagger_process.stdin)
    tagger_process.stdin.flush()

    response_lines = []
    while True:
        l = tagger_process.stdout.readline()
        l = l.rstrip('\n')
        if l == DOCUMENT_BOUNDARY:
            break
        response_lines.append(l)

    try:
        tagged_entities = BIO_lines_to_standoff(response_lines, text)
    except BaseException:
        # if anything goes wrong, bail out
        print("Warning: BIO-to-standoff conversion failed for BIO:\n'%s'"
              % '\n'.join(response_lines), file=stderr)
        return {}

    anns = {}
    for t in tagged_entities:
        anns["T%d" % t.idNum] = {
            'type': t.eType,
            'offsets': ((t.startOff, t.endOff), ),
            'texts': (t.eText, ),
        }
    return anns
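# The function above assumes a global tagger_process speaking a simple line
# protocol: sentences in on stdin, BIO lines out on stdout, with
# DOCUMENT_BOUNDARY as a sentinel in both directions. A minimal sketch of how
# such a process might be started; the command line and sentinel value are
# placeholder assumptions, not from the source:
import subprocess

DOCUMENT_BOUNDARY = '-DOCSTART-'  # assumed sentinel value

def start_tagger(command=('./tagger', '--stdin')):
    return subprocess.Popen(list(command),
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE,
                            universal_newlines=True,  # text-mode pipes
                            bufsize=1)                # line-buffered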
def text_to_conll(f):
    """Convert plain text into CoNLL format."""
    global options

    if options.nosplit:
        sentences = f.readlines()
    else:
        sentences = []
        for l in f:
            # this variant of sentencebreaks_to_newlines also flags junk input
            l, junk_mark = sentencebreaks_to_newlines(l)
            if not junk_mark:
                sentences.extend([s for s in NEWLINE_TERM_REGEX.split(l) if s])
            else:
                JUNKS_FILE.append(f)
                break

    lines = []
    offset = 0
    for s in sentences:
        nonspace_token_seen = False
        tokens = [t for t in TOKENIZATION_REGEX.split(s) if t]
        for t in tokens:
            if not t.isspace():
                lines.append(['O', offset, offset + len(t), t])
                nonspace_token_seen = True
            offset += len(t)
        # sentences delimited by empty lines
        if nonspace_token_seen:
            lines.append([])

    # add labels (other than 'O') from standoff annotation if specified
    if options.annsuffix:
        lines = relabel(lines, get_annotations(f.name))

    # TODO: is this column order wrong?!
    # lines = [[l[0], str(l[1]), str(l[2]), l[3]] if l else l for l in lines]
    lines = [[l[3], str(l[1]), str(l[2]), l[0]] if l else l for l in lines]

    return StringIO('\n'.join('\t'.join(l) for l in lines))
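# Most snippets in this section assume two module-level regex constants.
# NEWLINE_TERM_REGEX appears verbatim in the standalone script further below;
# TOKENIZATION_REGEX is stated here as an assumption matching the pattern used
# by brat's annotation-to-CoNLL tooling (alphanumeric runs or single characters):
import re

NEWLINE_TERM_REGEX = re.compile(r'(.*?\n)')
TOKENIZATION_REGEX = re.compile(r'([0-9a-zA-Z]+|[^0-9a-zA-Z])')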
def text_to_conll(f):
    """Convert plain text into CoNLL format."""
    sentences = []
    for l in f:
        l = sentencebreaks_to_newlines(l)
        sentences.extend([s for s in NEWLINE_TERM_REGEX.split(l) if s])

    lines = []
    for s in sentences:
        nonspace_token_seen = False
        tokens = [t for t in s.split() if t]
        for i, t in enumerate(tokens):
            if not t.isspace():
                # pre-label rules designed by Deheng
                # build the neighbour compound only when both neighbours exist
                # (the original guard allowed i == 0, wrapping to tokens[-1])
                if 0 < i < len(tokens) - 1:
                    comp = (tokens[i - 1] + t + tokens[i + 1]).lower()
                else:
                    comp = ""
                if t.endswith("()"):
                    t_nobracket = t[:-2]
                    if t_nobracket.lower() in api_list:
                        lines.append([t, 'B-API'])
                    else:
                        lines.append([t, 'O'])
                elif t.lower() in api_list:
                    lines.append([t, 'B-API'])
                elif comp in api_list:
                    lines.append([t, 'B-API'])
                else:
                    lines.append([t, 'O'])
                nonspace_token_seen = True
        # sentences delimited by empty lines
        if nonspace_token_seen:
            lines.append([])

    lines = [[l[0], l[1]] if l else l for l in lines]
    return StringIO('\n'.join('\t'.join(l) for l in lines))
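# Worked example (illustrative) of the gazetteer rules above. Assuming
# api_list contains "sort" and "array.sort", the input line
#     "Use Array.sort() to sort the list."
# is tokenised by whitespace and labelled as:
#     Use           O
#     Array.sort()  B-API   (ends with "()", "array.sort" is in the gazetteer)
#     to            O
#     sort          B-API   (lower-cased exact match)
#     the           O
#     list.         O       (trailing punctuation is not stripped, so no match)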
def process_rels(terms_dict: Dict, text: str, rels: List):
    sentences = sentencebreaks_to_newlines(text)
    # find sentence breaks; BRAT-style sentence processing replaces break
    # characters with '\n' without changing the text length
    sent_stops = [m.start() for m in re.finditer('\n', sentences)]
    assert len(sentences) == len(text)

    sent_bounds: List = []
    for ent1, relation, ent2 in rels:
        arg1_bounds: List = BRATHelper.get_ent_bounds(ent1)
        rel_bounds: List = BRATHelper.get_ent_bounds(relation)
        arg2_bounds: List = BRATHelper.get_ent_bounds(ent2)
        sent_min = min(arg1_bounds + rel_bounds + arg2_bounds)
        sent_max = max(arg1_bounds + rel_bounds + arg2_bounds)
        # widen the relation span to full sentence boundaries
        sent_min, sent_max = BRATHelper.get_sent_bounds(sent_stops, sent_min, sent_max)
        sent_bounds.append((sent_min, sent_max))
    return sent_bounds
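# BRATHelper.get_sent_bounds is not shown. Given the '\n' positions computed
# above, it plausibly widens an annotation span to the enclosing sentence
# boundaries; a sketch of that lookup using bisect (assumed behaviour, not the
# original implementation):
from bisect import bisect_left
from typing import List, Tuple

def get_sent_bounds(sent_stops: List[int], span_min: int, span_max: int) -> Tuple[int, int]:
    """Expand [span_min, span_max] to the boundaries of the covering sentences."""
    i = bisect_left(sent_stops, span_min)   # first sentence stop at/after span_min
    start = sent_stops[i - 1] + 1 if i > 0 else 0
    j = bisect_left(sent_stops, span_max)   # first sentence stop at/after span_max
    end = sent_stops[j] if j < len(sent_stops) else span_max
    return start, end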
def text_to_conll(f):
    """Convert plain text into CoNLL format."""
    global options

    if options.nosplit:
        sentences = f.readlines()
    else:
        sentences = []
        for l in f:
            l = sentencebreaks_to_newlines(l)
            sentences.extend([s for s in NEWLINE_TERM_REGEX.split(l) if s])

    lines = []
    offset = 0
    for s in sentences:
        nonspace_token_seen = False
        # nltk tokenization instead of TOKENIZATION_REGEX.split(s)
        # NB: word_tokenize drops whitespace, so these offsets drift from the source text
        tokens = nltk.word_tokenize(s)
        for t in tokens:
            if not t.isspace():
                lines.append(['O', offset, offset + len(t), t])
                nonspace_token_seen = True
            offset += len(t)
        # sentences delimited by empty lines
        if nonspace_token_seen:
            lines.append([])

    # add labels (other than 'O') from standoff annotation if specified
    if options.annsuffix:
        lines = relabel(lines, get_annotations(f.name))

    lines = [[l[0], str(l[1]), str(l[2]), l[3]] if l else l for l in lines]
    return StringIO('\n'.join('\t'.join(l) for l in lines))
def _apply_tagger(text):
    global tagger_process, tagger_queue

    try:
        splittext = sentencebreaks_to_newlines(text)
    except:
        print >> stderr, "Warning: sentence splitting failed for input:\n'%s'" % text
        splittext = text

    print >> tagger_process.stdin, splittext
    print >> tagger_process.stdin, DOCUMENT_BOUNDARY
    tagger_process.stdin.flush()

    response_lines = []
    while True:
        l = tagger_process.stdout.readline()
        l = l.rstrip('\n')
        if l == DOCUMENT_BOUNDARY:
            break
        response_lines.append(l)

    try:
        tagged_entities = BIO_lines_to_standoff(response_lines, text)
    except:
        print >> stderr, "Warning: BIO-to-standoff conversion failed for BIO:\n'%s'" % '\n'.join(response_lines)
        return {}

    anns = {}
    for t in tagged_entities:
        anns["T%d" % t.idNum] = {
            'type': t.eType,
            'offsets': ((t.startOff, t.endOff), ),
            'texts': (t.eText, ),
        }
    return anns
def text_to_conll_lines(f):
    """Convert plain text into CoNLL format."""
    global options
    if not options:
        options = argparser_internal().parse_args(None)

    if options.nosplit:
        sentences = f.readlines()
    else:
        sentences = []
        for l in f:
            l = sentencebreaks_to_newlines(l)
            sentences.extend([s for s in NEWLINE_TERM_REGEX.split(l) if s])

    lines = []
    offset = 0
    for s in sentences:
        nonspace_token_seen = False
        tokens = [t for t in TOKENIZATION_REGEX.split(s) if t]
        for t in tokens:
            if not t.isspace():
                lines.append(['O', offset, offset + len(t), t])
                nonspace_token_seen = True
            offset += len(t)
        # sentences delimited by empty lines
        if nonspace_token_seen:
            lines.append([])

    # add labels (other than 'O') from standoff annotation if specified
    if options.annsuffix:
        lines = relabel(lines, get_annotations(f.name))

    lines = [[l[0], str(l[1]), str(l[2]), l[3]] if l else l for l in lines]
    lines = conll_to_standford(lines)
    lines = strip_xml_tag(lines)
    return lines
def text_to_conll(f):
    """Convert plain text into CoNLL format."""
    global options

    if options.nosplit:
        sentences = f.readlines()
    else:
        sentences = []
        for l in f:
            l = sentencebreaks_to_newlines(l)
            sentences.extend([s for s in NEWLINE_TERM_REGEX.split(l) if s])

    lines = []
    offset = 0
    for s in sentences:
        nonspace_token_seen = False
        # custom tokenizer instead of TOKENIZATION_REGEX.split(s)  # Oyesh
        tokens = [t for t in non_ascii_tokenizer(s) if t]
        for t in tokens:
            if not t.isspace():
                lines.append(['O', offset, offset + len(t), t])
                nonspace_token_seen = True
            offset += len(t)
        # sentences delimited by empty lines
        if nonspace_token_seen:
            lines.append([])

    # add labels (other than 'O') from standoff annotation if specified
    if options.annsuffix:
        lines = relabel(lines, get_annotations(f.name))

    # token first, label last (columns swapped from the usual order)
    # lines = [[l[0], str(l[1]), str(l[2]), l[3]] if l else l for l in lines]
    lines = [[l[3], str(l[1]), str(l[2]), l[0]] if l else l for l in lines]

    return StringIO('\n'.join('\t'.join(l) for l in lines))
def _apply_tagger(text):
    # MetaMap isn't too happy with large outputs, so process a
    # sentence per invocation
    try:
        splittext = sentencebreaks_to_newlines(text)
    except BaseException:
        # if anything goes wrong, just go with the original text instead
        print("Warning: sentence splitting failed for input:\n'%s'" % text,
              file=stderr)
        splittext = text

    sentences = splittext.split('\n')
    all_tagged = []
    baseoffset = 0
    for s in sentences:
        tagged = _apply_tagger_to_sentence(s)
        # adjust offsets
        for t in tagged:
            t.startOff += baseoffset
            t.endOff += baseoffset
        all_tagged.extend(tagged)
        baseoffset += len(s) + 1

    anns = {}
    idseq = 1
    for t in all_tagged:
        anns["T%d" % idseq] = {
            'type': t.eType,
            'offsets': ((t.startOff, t.endOff), ),
            'texts': (t.eText, ),
        }
        idseq += 1
    return anns
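# Why "baseoffset += len(s) + 1" in the loop above: str.split('\n') drops each
# newline, so one character per sentence must be added back to keep standoff
# offsets aligned with the original text. A quick check:
text = "One.\nTwo."
first, second = text.split('\n')
assert text.index(second) == len(first) + 1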
def text_to_conll(f):
    """Convert plain text into CoNLL format."""
    global options

    if options.nosplit:
        sentences = f.readlines()
    else:
        sentences = []
        for l in f:
            l = sentencebreaks_to_newlines(l)
            sentences.extend([s for s in NEWLINE_TERM_REGEX.split(l) if s])

    lines = []
    offset = 0
    for s in sentences:
        nonspace_token_seen = False
        tokens = [t for t in TOKENIZATION_REGEX.split(s) if t]
        for t in tokens:
            if not t.isspace():
                lines.append(['O', offset, offset + len(t), t])
                nonspace_token_seen = True
            offset += len(t)
        # sentences delimited by empty lines
        if nonspace_token_seen:
            lines.append([])

    # add labels (other than 'O') from standoff annotation if specified
    if options.annsuffix:
        lines = relabel(lines, get_annotations(f.name))

    lines = [[l[0], str(l[1]), str(l[2]), l[3]] if l else l for l in lines]
    return StringIO('\n'.join('\t'.join(l) for l in lines))
# -*- coding: utf-8 -*-
import sys
import re
import os

sys.path.append(os.path.join(os.path.dirname(__file__), 'mylib'))
sys.path.append('.')

from sentencesplit import sentencebreaks_to_newlines

NEWLINE_TERM_REGEX = re.compile(r'(.*?\n)')

# gazetteer of (ambiguous) API names, one per line
api_list = []
with open('real_amb.txt', 'r') as gaz:
    for line in gaz:
        api_list.append(str(line.strip()))

fin = open(sys.argv[1], 'r')
fout = open(sys.argv[2], 'w')

for l in fin:
    sentences = []
    l = sentencebreaks_to_newlines(l)
    sentences.extend([s for s in NEWLINE_TERM_REGEX.split(l) if s])
    for s in sentences:
        # the gazetteer filter is disabled; all sentences are written out
        # if any(api in s for api in api_list):
        #     fout.write(s)
        fout.write(s)

fin.close()
fout.close()
def text_to_conll(f):
    """Convert plain text into CoNLL format."""
    global options

    if options.nosplit:
        sentences = f.readlines()
    else:
        sentences = []
        for l in f:
            l = sentencebreaks_to_newlines(l)
            sentences.extend([s for s in NEWLINE_TERM_REGEX.split(l) if s])

    lines = []
    offset = 0
    # JT: Feb 19: hook kept for resolving char encoding issues
    # (ftfy.fix_text was tried here and is currently disabled)
    fixed_sentences = [s for s in sentences]

    for s in fixed_sentences:
        try:
            tokens = stokenizer.tokenize(s)
        except stokenizer.TimedOutExc:
            # fall back to the ark tokenizer when stokenizer times out
            try:
                print("***********using ark tokenizer")
                tokens = ark_twokenize.tokenizeRawTweetText(s)
            except Exception as e:
                print(e)
                tokens = []  # bug fix: tokens was left unbound on failure
        token_w_pos = map_text_to_char(s, tokens, offset)
        for (t, pos) in token_w_pos:
            if not t.isspace():
                lines.append(['O', pos, pos + len(t), t])
        # sentences delimited by empty lines
        lines.append([])
        offset += len(s)

    # add labels (other than 'O') from standoff annotation if specified
    if options.annsuffix:
        lines = relabel(lines, get_annotations(f.name), f)

    # keep only the token and label columns  # JT: Dec 6
    lines = [[l[3], l[0]] if l else l for l in lines]

    return StringIO('\n'.join('\t'.join(l) for l in lines))
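# stokenizer is not shown, but the TimedOutExc handling above suggests a
# tokenizer wrapped in a timeout. A minimal sketch of such a guard using
# SIGALRM (POSIX-only; the 5-second default and the use of nltk's tokenizer
# are assumptions, not from the source):
import signal
from nltk.tokenize import word_tokenize

class TimedOutExc(Exception):
    pass

def tokenize_with_timeout(s, seconds=5):
    def _handler(signum, frame):
        raise TimedOutExc("tokenization timed out")
    previous = signal.signal(signal.SIGALRM, _handler)
    signal.alarm(seconds)
    try:
        return word_tokenize(s)
    finally:
        signal.alarm(0)                      # cancel any pending alarm
        signal.signal(signal.SIGALRM, previous)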