Beispiel #1
0
 def load(self, filename):
     """Load the mentions stored in the file `filename`.

     Every non-empty row of the file is expected to look like
         [TAG] [START_SYMBOL_INDEX] [LENGTH]
     The two trailing fields are passed to Interval and the result is
     appended to self.mentions under the normalized tag.

     Side effects: (re)initializes self.allowed_tags and self.mentions.

     Raises:
         Exception: if a non-empty row does not consist of exactly three
             fields, if its tag is not one of the allowed tags, or if
             anything else fails while the row is processed. The message
             names the file, the row index as counted by enumerate
             (starting at 0), the expected format and the received fields.
     """
     # set the allowed tags for later
     self.allowed_tags = {'org', 'per', 'loc', 'locorg'}
     self.mentions = {tag: [] for tag in self.allowed_tags}

     with safeOpen(filename) as f:
         r = csv.reader(f, delimiter=' ', quotechar=Config.QUOTECHAR)
         for index, parts in enumerate(r):
             # skip the empty lines
             if not parts:
                 continue

             try:
                 # Explicit checks instead of assert statements: asserts
                 # are removed when Python runs with -O, which would let
                 # malformed rows slip through unnoticed.
                 if len(parts) != 3:
                     raise ValueError(
                         'expected 3 fields, found {}'.format(len(parts)))
                 tag = normalize(parts[0])
                 if tag not in self.allowed_tags:
                     raise ValueError('unknown tag "{}"'.format(tag))
                 self.mentions[tag].append(Interval(*parts[1:]))
             except Exception as e:
                 # sorted() keeps the message stable; set order is arbitrary
                 line_descr = '[{}] [START_SYMBOL_INDEX] [LENGTH]'.format(
                             '/'.join(sorted(self.allowed_tags)))
                 raise Exception(
                     'Error: "{}", line {}.\nExpected: {}\nReceived: {}\nDetails: {}'.format(
                         filename, index, line_descr, ' '.join(parts), str(e))) from e
Beispiel #2
0
 def loadSpans(self, filename):
     """Load the spans from the file with the provided name

     Raw span data should be loaded from one of the system export '.spans' files

     Expected format:
     line = <left> SPAN_FILE_SEPARATOR <right>
     left = <span_id> <tag_name> <start_pos> <nchars> <start_token> <ntokens>
     right ::= [ <token>]+ [ <token_text>]+     // <ntokens> of each

     Fills self.spans with the loaded spans in file order and
     self._span_dict with the same spans keyed by their id. Token ids are
     resolved through self._token_dict, so that one has to be filled
     before this method is called.

     Raises:
         Exception: if a non-blank line does not split into exactly two
             parts around SPAN_FILE_SEPARATOR, has fewer than six left
             parts, or has a number of right parts other than 2 * ntokens.
         Whatever self._token_dict raises for an unknown token id
         (KeyError if it is a plain dict).
     """
     self.spans = []

     with open(filename, 'r', encoding='utf-8') as f:
         for index, line in enumerate(f):
             # Lines read from a file keep their trailing newline, so a
             # blank line is '\n' and never ''. Test the stripped text,
             # otherwise blank lines are reported as malformed.
             if not line.strip():
                 continue

             parts = line.split(Config.SPAN_FILE_SEPARATOR)
             if len(parts) != 2:
                 # bad non-empty line (separator missing or repeated)
                 raise Exception(
                     'Expected symbol "{}" missing in line {} of file {}'.format(
                         Config.SPAN_FILE_SEPARATOR, index, filename))

             left, right = parts

             filtered_left = [i
                  for i in left.split(Config.DEFAULT_DELIMITER)
                      if i]

             if len(filtered_left) < 6:
                 raise Exception(
                     'Missing left parts in line {} of file {}'.format(
                         index, filename))

             new_span = Span(*filtered_left)

             filtered_right = [i
                   for i in right.split(Config.DEFAULT_DELIMITER)
                         if i]
             # the right part lists <ntokens> token ids, then <ntokens> texts
             if len(filtered_right) != 2*new_span.ntokens:
                 raise Exception(
                     'Missing right parts in line {} of file {}'.format(
                         index, filename))

             token_ids = [x.strip() for x in filtered_right[:new_span.ntokens]]
             new_span.tokens = sorted([self._token_dict[x] for x in token_ids],
                                      key=lambda x: x.start)
             new_span.text = normalize(' '.join(filtered_right[new_span.ntokens:]))
             # drop the line break carried over from the end of the line
             new_span.text = new_span.text.replace('\n', '')

             self.spans.append(new_span)

     # fill the span dictionary
     self._span_dict = {x.id: x for x in self.spans}
Beispiel #3
0
 def loadSpans(self, filename):
     """Load the spans from the file with the provided name

     Raw span data should be loaded from one of the system export '.spans' files

     Expected format:
     line = <left> SPAN_FILE_SEPARATOR <right>
     left = <span_id> <tag_name> <start_pos> <nchars> <start_token> <ntokens>
     right ::= [ <token>]+ [ <token_text>]+     // <ntokens> of each

     Fills self.spans with the loaded spans in file order and
     self._span_dict with the same spans keyed by their id. Token ids are
     resolved through self._token_dict, so that one has to be filled
     before this method is called.

     Raises:
         Exception: if a non-blank line does not split into exactly two
             parts around SPAN_FILE_SEPARATOR, has fewer than six left
             parts, or has a number of right parts other than 2 * ntokens.
         Whatever self._token_dict raises for an unknown token id
         (KeyError if it is a plain dict).
     """
     self.spans = []

     with open(filename, 'r', encoding='utf-8') as f:
         for index, line in enumerate(f):
             # Lines read from a file keep their trailing newline, so a
             # blank line is '\n' and never ''. Test the stripped text,
             # otherwise blank lines are reported as malformed.
             if not line.strip():
                 continue

             parts = line.split(Config.SPAN_FILE_SEPARATOR)
             if len(parts) != 2:
                 # bad non-empty line (separator missing or repeated)
                 raise Exception(
                     'Expected symbol "{}" missing in line {} of file {}'.format(
                         Config.SPAN_FILE_SEPARATOR, index, filename))

             left, right = parts

             filtered_left = [i
                  for i in left.split(Config.DEFAULT_DELIMITER)
                      if i]

             if len(filtered_left) < 6:
                 raise Exception(
                     'Missing left parts in line {} of file {}'.format(
                         index, filename))

             new_span = Span(*filtered_left)

             filtered_right = [i
                   for i in right.split(Config.DEFAULT_DELIMITER)
                         if i]
             # the right part lists <ntokens> token ids, then <ntokens> texts
             if len(filtered_right) != 2*new_span.ntokens:
                 raise Exception(
                     'Missing right parts in line {} of file {}'.format(
                         index, filename))

             token_ids = [x.strip() for x in filtered_right[:new_span.ntokens]]
             new_span.tokens = sorted([self._token_dict[x] for x in token_ids],
                                      key=lambda x: x.start)
             new_span.text = normalize(' '.join(filtered_right[new_span.ntokens:]))
             # drop the line break carried over from the end of the line
             new_span.text = new_span.text.replace('\n', '')

             self.spans.append(new_span)

     # fill the span dictionary
     self._span_dict = {x.id: x for x in self.spans}
Beispiel #4
0
 def __init__(self, id, start, length, text):
     """Set up a token from its id, position, size and text.

     `start` and `length` are converted to int; `end` is derived from them
     as the inclusive last position (start + length - 1). The text is
     stored normalized, and the token starts out without neighbours.
     """
     self.id = id

     # position data, as integers
     self.start = int(start)
     self.length = int(length)
     last_position = self.start + self.length - 1
     self.end = last_position

     self.text = normalize(text)

     # links to the neighbouring tokens, none yet
     self.next = self.prev = None
Beispiel #5
0
    def fromTest(cls, line):
        """Build an attribute from a 'name: value' line of the test format.

        The text before the first ':' becomes the name (stripped and
        lower-cased); everything after it is normalized and added to the
        values of the new instance. A line without ':' fails the assertion.

        Returns a new instance of cls"""

        name_part, colon, value_part = line.partition(':')
        # a missing colon means the line has no value part at all
        assert colon

        attribute = cls()
        attribute.name = name_part.strip().lower()
        normalized_value = normalize(value_part)
        attribute.values.add(normalized_value)

        return attribute
Beispiel #6
0
    def load(self, filename):
        """Read the facts stored in the file `filename` into self.facts.

        Each line is normalized first. Consecutive non-empty lines form the
        description of one fact; an empty line (or the end of the file)
        closes the description, which is then handed to Fact.fromTest.
        """
        self.facts = []

        with safeOpen(filename) as f:
            chunk = []
            for raw_line in f:
                text = normalize(raw_line)
                if text:
                    chunk.append(text)
                    continue
                # empty line: close the current description, if any
                if chunk:
                    self.facts.append(Fact.fromTest('\n'.join(chunk) + '\n'))
                    chunk = []
            # the last description may not be followed by an empty line
            if chunk:
                self.facts.append(Fact.fromTest('\n'.join(chunk) + '\n'))
Beispiel #7
0
    def load(self, filename):
        """Read the facts stored in the file `filename` into self.facts.

        Each line is normalized first. Consecutive non-empty lines form the
        description of one fact; an empty line (or the end of the file)
        closes the description, which is then handed to Fact.fromTest.
        """
        self.facts = []

        with safeOpen(filename) as f:
            chunk = []
            for raw_line in f:
                text = normalize(raw_line)
                if text:
                    chunk.append(text)
                    continue
                # empty line: close the current description, if any
                if chunk:
                    self.facts.append(Fact.fromTest('\n'.join(chunk) + '\n'))
                    chunk = []
            # the last description may not be followed by an empty line
            if chunk:
                self.facts.append(Fact.fromTest('\n'.join(chunk) + '\n'))
Beispiel #8
0
    def fromStandard(cls, lines):
        """Build an attribute from its standard-format representation.

        `lines` has to contain exactly one line. The text before the first
        space becomes the name (stripped and lower-cased); the remainder is
        normalized and added to the values of the new instance.

        Returns a new instance of cls"""

        assert len(lines) == 1

        name_part, _, value_part = lines[0].partition(' ')

        attribute = cls()
        attribute.name = name_part.strip().lower()
        normalized_value = normalize(value_part)
        attribute.values.add(normalized_value)

        return attribute
Beispiel #9
0
 def loadText(self, filename):
     """Read the UTF-8 file `filename` and keep its normalized content in self.text"""
     with open(filename, 'r', encoding='utf-8') as source:
         # the whole file at once, line breaks included
         raw_text = source.read()
         self.text = normalize(raw_text)