def block_smart_extend(has_selection, start, end):
    """Compute a widened (start, end) pair around the given line range.

    `start` and `end` are iterator-like objects (they must provide `copy`,
    `is_start`/`is_end`, `get_line`, `equal` and `backward_lines`/
    `forward_lines`) -- presumably text-buffer iterators; confirm against
    the caller.  Neither argument is mutated: the function works on copies.

    The decision is driven by the leading-whitespace width of the first and
    last line compared with their neighbouring lines, and by whether those
    neighbours are empty (or missing at the buffer edges).  The actual
    extension is delegated to `extend_without_gap`, `extend_with_gap` and
    `extend_block_without_gap`, which are defined elsewhere.

    Returns a `(newstart, newend)` tuple.
    """
    # Work on a private copy; step it back one line unless it already sits
    # at the very end.  The final `forward_lines(1)` below undoes this step.
    last = end.copy()
    if not last.is_end():
        last.backward_lines(1)

    # Measurements for the first line and the line before it.
    start_ws = len(get_whitespace(start))
    prev_empty = start.is_start() or line_is_empty(prev_line(start))
    prev_ws = len(get_whitespace(prev_line(start)))

    # Measurements for the last line and the line after it.
    end_ws = len(get_whitespace(last))
    next_empty = last.is_end() or line_is_empty(next_line(last))
    next_ws = len(get_whitespace(next_line(last)))

    has_prev = not prev_empty
    has_next = not next_empty

    # NOTE: new_end starts out as the very same object as `last`.
    new_start = start.copy()
    new_end = last

    # A single unselected line whose neighbours are empty or less indented
    # is left as it is.
    lone_line = (
        not has_selection
        and start.get_line() == last.get_line()
        and (next_empty or next_ws < end_ws)
        and (prev_empty or prev_ws < start_ws)
    )

    if lone_line:
        pass
    elif has_prev and has_next and prev_ws == start_ws == next_ws == end_ws:
        # Same indentation on both sides: grow in both directions.
        new_start = extend_without_gap(start, start_ws, -1)
        new_end = extend_without_gap(last, end_ws, 1)
    elif prev_empty and has_next and next_ws == end_ws:
        new_end = extend_without_gap(last, end_ws, 1)
    elif has_prev and next_empty and prev_ws == start_ws:
        new_start = extend_without_gap(start, start_ws, -1)
    elif has_next and next_ws > start_ws:
        # The following line is indented deeper than the first line.
        new_end = extend_block_without_gap(last, start_ws, 1)
    elif (has_next and next_ws == start_ws) or (has_prev and prev_ws >= start_ws):
        if has_prev:
            new_start = extend_without_gap(start, start_ws, -1)
        if has_next:
            new_end = extend_without_gap(last, start_ws, 1)
    elif next_empty and prev_empty:
        # Empty lines on both sides: extend across the gaps.
        new_start = extend_with_gap(start, start_ws, -1)
        new_end = extend_with_gap(last, start_ws, 1)
    elif next_empty and has_prev and prev_ws < start_ws:
        new_end = extend_with_gap(last, start_ws, 1)

    # With an existing selection that did not grow at all, force progress.
    if has_selection and start.equal(new_start) and last.equal(new_end):
        if prev_empty:
            candidate = get_next_not_empty_line(start, -1)
            if candidate:
                new_start = candidate
        else:
            new_start.backward_lines(1)

        # Also take a short following line (fewer than 5 stripped chars).
        if has_next and len(line_text(next_line(last)).strip()) < 5:
            new_end.forward_lines(1)

    new_end.forward_lines(1)
    return new_start, new_end
def read_tokens_conll(cols, f, path=''):
    """Collect labelled tokens from selected columns of a tab-separated file.

    Lines are obtained through `next_line(f)` (defined elsewhere in this
    file); reading stops at the first falsy line.  A line equal to '\\n'
    increments the sentence counter, which starts at 1.  Every other line is
    stripped and split on tabs; column 0 must hold an integer token id.

    For each column index in `cols`, a trailing ``-<word chars>`` suffix is
    removed from the field, and unless the result is '_' the triple
    ``(sentence, token, label)`` is recorded.

    Args:
        cols: re-iterable sequence of column indexes to read.
        f: input handle passed through to `next_line`.
        path: unused here; kept for signature compatibility.

    Returns:
        A tuple with one set of ``(sentence, token, label)`` triples per
        entry of `cols`, in the same order.

    Raises:
        ValueError: if column 0 is not an integer.
        IndexError: if a line has fewer columns than requested.
    """
    # Raw string: '\w' inside a plain literal is an invalid escape sequence.
    # Compiled once instead of being looked up again for every field.
    suffix = re.compile(r'-\w+$')

    tokens = tuple(set() for _ in cols)
    sent = 1
    line = next_line(f)
    while line:
        if line == '\n':
            sent += 1
        else:
            fields = line.strip().split('\t')
            token = int(fields[0])
            for i, col in enumerate(cols):
                label = suffix.sub('', fields[col])
                if label != '_':
                    tokens[i].add((sent, token, label))
        line = next_line(f)
    return tokens
def read_event_spans_conll(f, path=''):
    """Read event spans from column 2 of a tab-separated file.

    Lines are obtained through `next_line(f)` (defined elsewhere in this
    file); reading stops at the first falsy line.  A line equal to '\\n'
    increments the sentence counter, which starts at 1.  On every other line,
    a column-2 value other than '_' must have the form
    ``<B|I>-E-<event id>``; the integer token id from column 0 is appended to
    the token list of that event id.

    Args:
        f: input handle passed through to `next_line`.
        path: file name, used only in the format-error message.

    Returns:
        A set of ``(sentence, token_ids)`` tuples, one per event id, where
        `token_ids` is a tuple of ints in reading order.

    Raises:
        AssertionError: on an unexpected tag type or label, on a 'B' tag for
            an id that already has tokens, or an 'I' tag for an id that has
            none (checks are skipped when Python runs with -O).
        ValueError: if the tag does not split into exactly three parts or
            column 0 is not an integer.
    """
    id2tokens = defaultdict(list)
    sent = 1
    line = next_line(f)
    while line:
        if line == '\n':
            sent += 1
        else:
            fields = line.strip().split('\t')
            if fields[2] != '_':
                type_, label, event_id = fields[2].split('-')
                assert type_ in ['I', 'B']
                assert label == 'E'
                seen = len(id2tokens[event_id])
                # 'B' must open a new span, 'I' must continue an open one.
                assert (type_ == 'B' and seen == 0) or \
                    (type_ == 'I' and seen > 0), \
                    "Format error in file %s, sentence %d, token %s: %s" \
                    % (path, sent, fields[0], fields[2])
                id2tokens[event_id].append(int(fields[0]))
        line = next_line(f)

    # NOTE(review): spans are built after the whole input has been read, so
    # every span carries the final value of `sent`, not the sentence in which
    # the event occurred.  Kept as-is to preserve the existing output.
    #
    # `.values()` exists on both Python 2 and 3; `.iteritems()` does not
    # exist on Python 3, and the keys were never used anyway.
    return set((sent, tuple(tokens)) for tokens in id2tokens.values())
def read_generic_spans_conll(col, f, path):
    """Read labelled spans from one column of a tab-separated file.

    Lines are obtained through `next_line(f)` (defined elsewhere in this
    file); reading stops at the first falsy line.  A line equal to '\\n'
    increments the sentence counter, which starts at 1.  On every other line,
    column 0 must hold an integer token id, and a value in column `col` other
    than '_' must have the form ``<B|I>-<label>-<id>``.  Tokens are grouped
    by the ``(label, id)`` pair.

    Args:
        col: index of the column holding the span tags.
        f: input handle passed through to `next_line`.
        path: file name, used only in the format-error message.

    Returns:
        A set of ``(sentence, token_ids, label)`` tuples, one per
        ``(label, id)`` pair, where `token_ids` is a tuple of ints in
        reading order.

    Raises:
        AssertionError: on an unexpected tag type, on a 'B' tag for a span
            that already has tokens, or an 'I' tag for a span that has none
            (checks are skipped when Python runs with -O).
        ValueError: if the tag does not split into exactly three parts or
            column 0 is not an integer.
    """
    id2tokens = defaultdict(list)
    sent = 1
    line = next_line(f)
    while line:
        if line == '\n':
            sent += 1
        else:
            fields = line.strip().split('\t')
            token = int(fields[0])
            label = fields[col]
            if label != '_':
                type_, label, id_ = label.split('-')
                assert type_ in ['I', 'B']
                seen = len(id2tokens[(label, id_)])
                # 'B' must open a new span, 'I' must continue an open one.
                assert (type_ == 'B' and seen == 0) or \
                    (type_ == 'I' and seen > 0), \
                    "Format error in file %s, sentence %d, token %s: %s" \
                    % (path, sent, fields[0], fields[col])
                id2tokens[(label, id_)].append(token)
        line = next_line(f)

    # NOTE(review): spans are built after the whole input has been read, so
    # every span carries the final value of `sent`, not the sentence in which
    # the span occurred.  Kept as-is to preserve the existing output.
    #
    # `.items()` exists on both Python 2 and 3; `.iteritems()` does not
    # exist on Python 3.
    return set(
        (sent, tuple(tokens), label)
        for (label, _), tokens in id2tokens.items()
    )
def read_spans_conll(f, path=''):
    """Read runs of consecutive tokens that share the same column-3 value.

    Lines are obtained through `next_line(f)` (defined elsewhere in this
    file); reading stops at the first falsy line.  A line equal to '\\n'
    increments the sentence counter, which starts at 1.  A token whose
    column-3 value is not '_' opens a span, which continues over the
    following lines while they are non-empty, not a sentence break, and carry
    the same column-3 value.

    `path` is accepted but not used.

    Returns a set of ``(sentence, first_token, last_token, value)`` tuples.
    """
    found = set()
    sentence_no = 1
    current = next_line(f)

    while current:
        # Sentence boundary: bump the counter and move on.
        if current == '\n':
            sentence_no += 1
            current = next_line(f)
            continue

        columns = current.strip().split('\t')
        first_token = int(columns[0])
        target = columns[3]

        # Unannotated token: nothing to record.
        if target == '_':
            current = next_line(f)
            continue

        # Consume lines for as long as they belong to the same span.  The
        # first pass re-examines the opening line itself.
        last_token = first_token
        while True:
            columns = current.strip().split('\t')
            same_span = (
                bool(current)
                and current != '\n'
                and columns[3] == target
            )
            if not same_span:
                break
            last_token = int(columns[0])
            current = next_line(f)

        found.add((sentence_no, first_token, last_token, target))

    return found