def load(self, filename):
    """Do the exception-prone loading"""
    # set the allowed tags for later
    self.allowed_tags = set(['org', 'per', 'loc', 'locorg'])
    self.mentions = {}
    for tag in self.allowed_tags:
        self.mentions[tag] = []
    # read the file that should consist of lines like
    # [TAG] [START_SYMBOL_INDEX] [LENGTH]
    with safeOpen(filename) as f:
        r = csv.reader(f, delimiter=' ', quotechar=Config.QUOTECHAR)
        for index, parts in enumerate(r):
            # skip the empty lines
            if len(parts) == 0:
                continue
            try:
                assert(len(parts) == 3)
                tag = normalize(parts[0])
                assert(tag in self.allowed_tags)
                self.mentions[tag].append(Interval(*parts[1:]))
            except Exception as e:
                line_descr = '[{}] [START_SYMBOL_INDEX] [LENGTH]'.format(
                    '/'.join(self.allowed_tags))
                raise Exception(
                    'Error: "{}", line {}.\nExpected: {}\nReceived: {}\nDetails: {}'.format(
                        filename, index, line_descr, ' '.join(parts), str(e)))
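# A minimal sketch of what an input file for load() might contain, assuming the
# space-separated "[TAG] [START_SYMBOL_INDEX] [LENGTH]" layout stated in the
# comments above.  The specific tags and offsets below are invented for
# illustration only:
#
#     org 10 5
#     per 31 12
#     loc 57 6
#
# For such a file, load() would leave self.mentions['org'] holding one
# Interval('10', '5'), and likewise for the other tags.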
def loadSpans(self, filename):
    """Load the data from a file with the provided name

    Raw span data should be loaded from one of the system export '.spans' files.

    Expected format:
        line  = <left> SPAN_FILE_SEPARATOR <right>
        left  = <span_id> <tag_name> <start_pos> <nchars> <start_token> <ntokens>
        right = [ <token_id>]+ [ <token_text>]+   # <ntokens> of each
    """
    self.spans = []
    with open(filename, 'r', encoding='utf-8') as f:
        for index, line in enumerate(f):
            if len(line.strip()) == 0:
                # skip the empty lines (a bare newline still has length 1)
                continue
            parts = line.split(Config.SPAN_FILE_SEPARATOR)
            if len(parts) != 2:
                # bad non-empty line
                raise Exception(
                    'Expected symbol "{}" missing in line {} of file {}'.format(
                        Config.SPAN_FILE_SEPARATOR, index, filename))
            left = parts[0]
            right = parts[1]
            filtered_left = [i for i in left.split(Config.DEFAULT_DELIMITER)
                             if len(i) > 0]
            if len(filtered_left) < 6:
                raise Exception(
                    'Missing left parts in line {} of file {}'.format(
                        index, filename))
            new_span = Span(*filtered_left)
            filtered_right = [i for i in right.split(Config.DEFAULT_DELIMITER)
                              if len(i) > 0]
            if len(filtered_right) != 2 * new_span.ntokens:
                raise Exception(
                    'Missing right parts in line {} of file {}'.format(
                        index, filename))
            token_ids = [x.strip() for x in filtered_right[:new_span.ntokens]]
            new_span.tokens = sorted([self._token_dict[x] for x in token_ids],
                                     key=lambda x: x.start)
            new_span.text = normalize(' '.join(filtered_right[new_span.ntokens:]))
            new_span.text = new_span.text.replace('\n', '')
            self.spans.append(new_span)
    # fill the span dictionary
    self._span_dict = dict([(x.id, x) for x in self.spans])
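# A sketch of a single '.spans' line under the format described in the docstring,
# assuming Config.SPAN_FILE_SEPARATOR is '#' and Config.DEFAULT_DELIMITER is a
# space (both are assumptions; the real values live in Config).  All ids, offsets
# and words below are invented:
#
#     12345 org 10 9 4 2 # 101 102 Acme Corp
#
# Here <ntokens> is 2, so the right-hand side carries two token ids (101, 102)
# followed by the two token texts ('Acme', 'Corp'); the token ids are looked up
# in self._token_dict and sorted by their start position.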
def __init__(self, id, start, length, text):
    """Create a new token with the given parameters"""
    self.id = id
    self.start = int(start)
    self.length = int(length)
    self.end = self.start + self.length - 1
    self.text = normalize(text)
    self.next = None
    self.prev = None
def fromTest(cls, line):
    """Load an attribute from the line representing it.
    This method corresponds to the test format of representation.

    Returns a new Attribute instance"""
    parts = line.split(':')
    assert (len(parts) >= 2)
    instance = cls()
    instance.name = parts[0].strip().lower()
    value = normalize(':'.join(parts[1:]))
    instance.values.add(value)
    return instance
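# In the test format an attribute is a single "name: value" line, for example
# (the concrete name and value are invented for illustration):
#
#     name: John Smith
#
# fromTest() lower-cases the part before the first ':' into instance.name and
# adds the normalized remainder to instance.values; rejoining parts[1:] with ':'
# keeps any colons that appear inside the value itself.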
def load(self, filename):
    """Do the exception-prone loading"""
    self.facts = []
    with safeOpen(filename) as f:
        buffer = ''
        for raw_line in f:
            line = normalize(raw_line)
            if len(line) == 0:
                # an empty line finishes the current fact block
                if len(buffer) > 0:
                    self.facts.append(Fact.fromTest(buffer))
                    buffer = ''
            else:
                buffer += line + '\n'
        # flush the last block if the file does not end with an empty line
        if len(buffer) > 0:
            self.facts.append(Fact.fromTest(buffer))
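# This loader only fixes the outer layout: facts are blocks of non-empty lines
# separated by one or more blank lines, and each block is handed to
# Fact.fromTest() as a single newline-joined buffer.  Schematically (the actual
# line contents depend on what Fact.fromTest expects):
#
#     <fact line 1>
#     <fact line 2>
#                       <- blank line ends the first fact
#     <fact line 1>
#     ...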
def fromStandard(cls, lines):
    """Load an attribute from the set of lines representing it.
    This method corresponds to the standard format of representation.

    Returns a new Attribute instance"""
    assert (len(lines) == 1)
    line = lines[0]
    parts = line.split(' ')
    instance = cls()
    instance.name = parts[0].strip().lower()
    value = normalize(' '.join(parts[1:]))
    instance.values.add(value)
    return instance
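# In the standard format an attribute occupies exactly one space-separated line,
# "name value...", for example (name and value invented for illustration):
#
#     surname Smith
#
# fromStandard() lower-cases the first field into instance.name and adds the
# normalized rest of the line to instance.values.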
def loadText(self, filename):
    """Load text from the associated text file"""
    with open(filename, 'r', encoding='utf-8') as f:
        self.text = normalize(f.read())