Example #1
def main(args):
    argp = ARGPARSER.parse_args(args[1:])

    if not argp.no_cache:
        # With caching we can't stream stdin line by line; read it all up front
        doc = Document('<classify>', [], [], '<classify>')
        for _string in (l.rstrip('\n') for l in argp.input):
            doc.abstract.append(_string_to_ann_sent(_string))
        docs = (doc, )
    else:
        docs = (Document('Line: %s' % i, [], [_string_to_ann_sent(_string)],
            '<stdin>') for i, _string in enumerate(
                (l.rstrip('\n') for l in argp.input), start=1))

    # Cache the strings for speed
    if not argp.no_cache:
        cache_simstring((docs, ), verbose=argp.verbose)

    with open(argp.model_path, 'r') as model_file:
        classifier = pickle_load(model_file)

    # TODO: It would be faster to do this in a single batch
    for doc in docs:
        for sent in doc:
            for ann in sent:
                print '%s\t%s' % (sent.annotation_text(ann),
                        str(classifier.classify(doc, sent, ann, ranked=True)))
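
The helper _string_to_ann_sent used above is not part of the snippet. A minimal sketch of what it might look like, with the Sentence and Annotation constructor signatures inferred from Example #5 below; the None placeholder type is an assumption:

def _string_to_ann_sent(_string):
    # Wrap the whole line in a single annotation covering all of its text;
    # the type is left as None since classification assigns one later.
    return Sentence(_string, [Annotation(0, len(_string), None)])
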
Example #2
 def __handleDocument(self, xmlDocument):
     d = Document()
     root = xmlDocument.firstChild
     
     maxNumParag = 2
     counter = 0
     if root.hasChildNodes():
         for xmlNode in root.childNodes:
             #print 'xmlNode.nodeName '+str(xmlNode.nodeName)
             if xmlNode.nodeName in ('DESC', 'TIME'):
                 continue
             if xmlNode.nodeName == 'P':
                 counter += 1
                 
                 p = Paragraph()
                 print 'before handle_paragraph'
                 self.__handleParagraph(xmlNode, p)
                 print 'after handle_paragraph'
                 d.addParagraph(p)
                 p.document = d
                 
                 
                 print p.idx
                 if counter >= maxNumParag:
                     break
                 
     print d, d.paragraphs
     """
     # After all the paragraphs (and therefore sentences, words and
     # markables) have been parsed, each markable holds a slice expressed
     # in word indexes within its sentence, where the first word has
     # index 0 and the last has index numWords-1.

     # Transform those indexes into absolute ones, based on the
     # word objects' ID attributes
     for paragraph in d._paragraphs:
         for sentence in paragraph._sentences:
             for markable in sentence._markables:
                 sliceIndex = markable._slice
                 fromIndex, toIndex = sliceIndex.split(":")[0], sliceIndex.split(":")[1]
                 
                 try:
                     fromId = sentence._words[int(fromIndex)]._id
                     toId = sentence._words[int(toIndex)]._id
                     #print fromId + " " + toId
                     markable._slice = fromId + ":" + toId
                 except ValueError:
                     # A ValueError means the index couldn't be converted to an
                     # int; in that case it was already converted by another
                     # markable and can be used as-is
                     markable._slice = fromIndex + ":" + toIndex
     """
     return d
Example #3
 def add_document(self):
     schema = DocumentSchema()
     form = Form(schema, buttons=('submit',))
     if 'submit' in self.request.POST:
         # Make a new Document
         title = self.request.POST['title']
         content = self.request.POST['content']
         name = str(randint(0, 999999))
         new_document = Document(title, content)
         new_document.__name__ = name
         new_document.__parent__ = self.context
         self.context[name] = new_document
         # Redirect to the new document
         url = self.request.resource_url(new_document)
         return HTTPFound(location=url)
     return {"form": form.render()}
Example #4
 def add_document(self):
     schema = DocumentSchema()
     form = Form(schema, buttons=('submit', ))
     if 'submit' in self.request.POST:
         # Make a new Document
         title = self.request.POST['title']
         content = self.request.POST['content']
         name = str(randint(0, 999999))
         new_document = Document(title, content)
         new_document.__name__ = name
         new_document.__parent__ = self.context
         self.context[name] = new_document
         # Redirect to the new document
         url = self.request.resource_url(new_document)
         return HTTPFound(location=url)
     return {"form": form.render()}
Example #5
def _tab_separated_input_to_doc(input):
    # Create a dataset out of the input
    doc = Document(input.name, [], [], '<%s>' % input.name)
    for _string, _type in (l.rstrip('\n').split('\t') for l in input):
        doc.abstract.append(
            Sentence(_string, [
                Annotation(0, len(_string), _type),
            ]))
    return doc
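
A hypothetical way to call the helper above with in-memory input; the .name attribute is set by hand because the function uses it for the Document name, and the sample rows are made up:

from StringIO import StringIO

data = StringIO('aspirin\tdrug\nheadache\tsymptom\n')
data.name = 'example.tsv'
doc = _tab_separated_input_to_doc(data)
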
Example #6
    def add_document_view(self):
        # Make a new Document
        title = self.request.POST['document_title']
        name = str(randint(0, 999999))
        new_document = Document(name, self.context, title)
        self.context[name] = new_document

        # Redirect to the new document
        url = self.request.resource_url(new_document)
        return HTTPFound(location=url)
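
This variant passes the name and parent to the Document constructor instead of setting __name__ and __parent__ afterwards as the other examples do. A minimal sketch of a Document resource matching that call; the persistent.Persistent base class is an assumption borrowed from typical Pyramid traversal setups:

from persistent import Persistent

class Document(Persistent):
    def __init__(self, name, parent, title):
        # Location-aware resource: __name__/__parent__ let Pyramid build URLs
        self.__name__ = name
        self.__parent__ = parent
        self.title = title
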
Example #7
 def add_document(self):
     schema = DocumentSchema()
     form = Form(schema, buttons=('submit', ))
     if 'submit' in self.request.POST:
         # Make a new Document
         title = self.request.POST['title']
         content = self.request.POST['content']
         doc_id = self.document_map.new_docid()
         name = "document%s" % doc_id
         new_document = Document(title, content)
         new_document.__name__ = name
         new_document.__parent__ = self.context
         self.context[name] = new_document
         # map object path to catalog id
         path = resource_path(new_document)
         self.document_map.add(path, doc_id)
         # index the new document
         self.catalog.index_doc(doc_id, new_document)
         # Redirect to the new document
         url = self.request.resource_url(new_document)
         return HTTPFound(location=url)
     return {"form": form.render()}
Example #8
 def add_document(self):
     schema = DocumentSchema()
     form = Form(schema, buttons=('submit',))
     if 'submit' in self.request.POST:
         # Make a new Document
         title = self.request.POST['title']
         content = self.request.POST['content']
         doc_id = self.document_map.new_docid()
         name = "document%s" % doc_id
         new_document = Document(title, content)
         new_document.__name__ = name
         new_document.__parent__ = self.context
         self.context[name] = new_document
         # map object path to catalog id
         path = resource_path(new_document)
         self.document_map.add(path, doc_id)
         # index the new document
         self.catalog.index_doc(doc_id, new_document)
         # Redirect to the new document
         url = self.request.resource_url(new_document)
         return HTTPFound(location=url)
     return {"form": form.render()}
Example #9
def _get_documents(dir):
    for id, txt_path, ss_path, a1_path, a2_path in _get_aligned_resources(dir):
        #print id
        # First we align the text and the sentences, since we need to map the
        # stand-off offsets to the sentences in the sentence-split file
        #with open(txt_path, 'r') as txt_file, open(ss_path, 'r') as ss_file:
        with open(txt_path, 'r') as txt_file:
            if ENCODE_WRAP:
                txt_file = _encode_wrap(txt_file)
            with open(ss_path, 'r') as ss_file:
                if ENCODE_WRAP:
                    ss_file = _encode_wrap(ss_file)
                #sentences, s_offset_by_sentence = (
                s_starts_and_sentences = _get_sentences_and_offsets(
                    txt_file, ss_file)

        #XXX: HACK!
        if a2_path is None:
            a2_path = '/dev/null'

        #with open(a1_path, 'r') as a1_file, open(a2_path, 'r') as a2_file:
        with open(a1_path, 'r') as a1_file:
            if ENCODE_WRAP:
                a1_file = _encode_wrap(a1_file)
            with open(a2_path, 'r') as a2_file:
                if ENCODE_WRAP:
                    a2_file = _encode_wrap(a2_file)
                for line in (l.rstrip('\n') for l in chain(a1_file, a2_file)):
                    # We ignore everything apart from the text-bound annotations
                    match = TB_SO_REGEX.match(line)
                    if match is not None:
                        g_dict = match.groupdict()
                        ann_start = int(g_dict['start'])
                        ann_end = int(g_dict['end'])

                        # Find the sentence and its index containing the annotation
                        s_idx, sentence = _find_containing_idx(
                            ann_start, s_starts_and_sentences)

                        # XXX: There are cases where an annotation is cut off
                        #       by a sentence break. If this is the case, merge
                        #       the sentences.
                        if ann_end > s_idx + len(sentence.text):
                            next_s_idx, next_sentence = _find_containing_idx(
                                ann_end, s_starts_and_sentences)
                            # Merge the next sentence into this one
                            # XXX: Just assumes a space! May be wrong!
                            sentence = Sentence(
                                sentence.text + ' ' + next_sentence.text,
                                sentence.annotations +
                                next_sentence.annotations)
                            # Remove the old one
                            s_starts_and_sentences.remove(
                                (next_s_idx, next_sentence))

                        # Create an annotation object but adjust the indices to
                        # be relative to the sentence and not to the file
                        new_ann_start = ann_start - s_idx
                        assert 0 <= new_ann_start < len(
                            sentence.text
                        ), '0 <= {} < {} ({}, {}) {} "{}" {}'.format(
                            new_ann_start, len(sentence.text), s_idx,
                            g_dict['start'], id, g_dict['text'], s_idx)
                        new_ann_end = ann_end - s_idx
                        assert 0 < new_ann_end <= len(
                            sentence.text
                        ), '0 < {} <= {} ({}, {}) {} {}'.format(
                            new_ann_end, len(sentence.text), s_idx,
                            g_dict['end'], id, g_dict['text'])
                        assert new_ann_start < new_ann_end
                        annotation = Annotation(new_ann_start, new_ann_end,
                                                g_dict['type'])

                        # If we have a text span in the stand-off we sanity check
                        # it against what is in the sentence
                        #XXX: Run this again!
                        if g_dict['text'] is not None:
                            # XXX: The regex is not perfect; it leaves spaces around
                            g_dict['text'] = unicode(
                                g_dict['text'].strip('\r\n'),
                                encoding='utf-8')
                            target_ann_text = sentence.annotation_text(
                                annotation)
                            assert target_ann_text == g_dict['text'], (
                                'text span mismatch in {} '
                                'target: "{}" != source: "{}" {} "{}" {} {} {}'
                            ).format(id, target_ann_text, g_dict['text'],
                                     annotation, sentence.text, g_dict,
                                     type(target_ann_text),
                                     type(g_dict['text']))

                        sentence.add_annotation(annotation)
                    #else:
                    #    assert False, line.replace(' ', '\s').replace('\t', '\\t')

        yield Document(id, [],
                       [sentence for _, sentence in s_starts_and_sentences],
                       txt_path)
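
TB_SO_REGEX is referenced but never defined in the snippet. A guess at its shape, based on the group names used above and the brat-style text-bound stand-off format (T<id>, tab, type, start, end, optional tab and text); the real pattern may differ:

import re

TB_SO_REGEX = re.compile(
    r'^T\d+\t(?P<type>\S+) (?P<start>\d+) (?P<end>\d+)(?:\t(?P<text>.*))?$')
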