def read_bioc(request, pubmed_id, format_type):
    '''
        Serve a single document as a BioC collection.

        The document is looked up with document_id equal to `pubmed_id`
        through get_object_or_404. The collection is returned as JSON when
        `format_type` is 'json'; for any other value the writer itself is
        handed to the response with a text/xml content type.
    '''
    doc = get_object_or_404(Document, document_id=pubmed_id)

    # When fetching via pubmed, include no annotations.
    # The writer is built once, after the lookup, so a failed lookup does
    # not construct a writer that is immediately thrown away.
    writer = bioc_writer(request)
    bioc_document = doc.as_bioc_with_passages()
    writer.collection.add_document(bioc_document)

    if format_type == 'json':
        writer_json = bioc_as_json(writer)
        return HttpResponse(writer_json, content_type='application/json')

    return HttpResponse(writer, content_type='text/xml')
def group_pubtator_bioc(request, group_pk, format_type):
    '''
        Serve every document of one Group as a single BioC collection.

        The group is fetched by primary key through get_object_or_404 and
        each of its documents is exported with
        as_bioc_with_pubtator_annotations(). The response is JSON when
        `format_type` is 'json', otherwise the writer is returned with a
        text/xml content type.
    '''
    group = get_object_or_404(Group, pk=group_pk)

    # When fetching via pubmed, include all user annotations
    collection_writer = bioc_writer(request)
    for group_doc in group.get_documents():
        annotated_doc = group_doc.as_bioc_with_pubtator_annotations()
        collection_writer.collection.add_document(annotated_doc)

    if format_type != 'json':
        return HttpResponse(collection_writer, content_type='text/xml')

    json_payload = bioc_as_json(collection_writer)
    return HttpResponse(json_payload, content_type='application/json')
def as_writer(self, request=None):
    '''
        Wrap this object in a BioC writer.

        A writer is created with bioc_writer(request) and the result of
        self.as_bioc_with_passages() is added to its collection. The
        populated writer is returned.
    '''
    # Imported here rather than at module level, as in the original code.
    from mark2cure.common.formatter import bioc_writer

    bioc_out = bioc_writer(request)
    passages_doc = self.as_bioc_with_passages()
    bioc_out.collection.add_document(passages_doc)
    return bioc_out
def as_writer(self, documents=()):
    '''
        Return a BioC Writer holding one annotation-free BioC document for
        each requested Document.

        Documents that have stored pubtator content are rebuilt from that
        content (with every passage annotation cleared); the remaining
        ones are generated from the Document's own sections.

        documents: a sized collection whose items may be Document
            instances, decimal strings or integers. Items of any other
            kind, and strings that are not purely digits, are ignored.

        Raises ValueError when `documents` is empty or contains no usable
        primary key.

        Pros: Reusing the pubtator BioC prevents us from generating our
              own BioC file which may have inconsistencies
    '''
    # len() rather than truthiness: keeps working for any sized collection.
    if not len(documents):
        raise ValueError('No documents supplied to generator writer')

    from .models import Document

    # Python 2 has dedicated unicode / long types; they are gone in Python 3.
    try:
        text_types, integer_types = (str, unicode), (int, long)
    except NameError:
        text_types, integer_types = (str,), (int,)

    # Only decimal primary keys may end up in this set, because its content
    # is interpolated into the SQL statement below.
    pending_pks = set()
    for d in documents:
        if isinstance(d, Document):
            if d.pk is not None:
                pending_pks.add(str(d.pk))
        elif isinstance(d, text_types):
            # The digit check has to apply to every text type. The old
            # `a or b and c` condition let plain str values skip it.
            if d.isdigit():
                # int() normalises values such as "007" so they match the
                # ids coming back from the database.
                pending_pks.add(str(int(d)))
        elif isinstance(d, integer_types) and not isinstance(d, bool):
            pending_pks.add(str(d))

    if not pending_pks:
        # Nothing usable was supplied; avoid running a query with "IN ()".
        raise ValueError('No documents supplied to generator writer')

    with open('mark2cure/document/commands/get-pubtators.sql', 'r') as f:
        cmd_str = f.read()
    cmd_str = cmd_str.format(','.join(sorted(pending_pks)))

    c = connection.cursor()
    try:
        c.execute(cmd_str)
        # Each row is used as (document pk, BioC content, section id list)
        res = [(x[0], x[1], x[2]) for x in c.fetchall()]
    finally:
        c.close()

    section_names = ('title', 'paragraph')
    writer = bioc_writer(None)

    for document_pk, content, section_id_csv in res:
        # NOTE(review): assumes the section ids are listed in passage
        # order -- confirm against the SQL file.
        section_ids = section_id_csv.split(',')

        r = BioCReader(source=content)
        r.read()

        doc = r.collection.documents[0]
        doc.put_infon('document_pk', str(document_pk))

        for idx, passage in enumerate(doc.passages):
            passage.clear_annotations()
            passage.put_infon('section', section_names[idx])
            passage.put_infon('id', str(section_ids[idx]))

        writer.collection.add_document(doc)
        # discard() instead of list.remove(): an unexpected id from the
        # query must not abort the whole export.
        pending_pks.discard(str(document_pk))

    # Capture all the documents not available via pubtators
    for document_pk_str in sorted(pending_pks):
        # Can optimize this model retrieval but should rarely occur
        document_model = Document.objects.get(pk=document_pk_str)

        bioc_document = BioCDocument()
        bioc_document.id = str(document_model.document_id)
        bioc_document.put_infon('document_pk', document_pk_str)

        passage_offset = 0
        for idx, section in enumerate(document_model.available_sections()):
            passage = BioCPassage()
            passage.put_infon('section', section_names[idx])
            passage.put_infon('id', str(section.pk))
            # (TODO) Missing a "type" infon?
            passage.text = section.text
            passage.offset = str(passage_offset)
            # +1 leaves a one character gap between consecutive passages
            passage_offset += len(passage.text) + 1
            bioc_document.add_passage(passage)

        writer.collection.add_document(bioc_document)

    return writer
def as_writer(self, documents=()):
    '''
        Return a BioC Writer holding one annotation-free BioC document for
        each requested Document.

        Documents that have non-empty pubtator content are rebuilt from
        that content (with every passage annotation cleared); the
        remaining ones are generated from the Document's own sections.

        documents: a sized collection whose items may be Document
            instances, decimal strings or integers. Items of any other
            kind, and strings that are not purely digits, are ignored.

        Raises ValueError when `documents` is empty or contains no usable
        primary key.

        Pros: Reusing the pubtator BioC prevents us from generating our
              own BioC file which may have inconsistencies
    '''
    # len() rather than truthiness: keeps working for any sized collection.
    if not len(documents):
        raise ValueError('No documents supplied to generator writer')

    from .models import Document

    # Python 2 has dedicated unicode / long types; they are gone in Python 3.
    try:
        text_types, integer_types = (str, unicode), (int, long)
    except NameError:
        text_types, integer_types = (str,), (int,)

    # Only decimal primary keys may end up in this set, because its content
    # is interpolated into the SQL statement below.
    pending_pks = set()
    for d in documents:
        if isinstance(d, Document):
            if d.pk is not None:
                pending_pks.add(str(d.pk))
        elif isinstance(d, text_types):
            # The digit check has to apply to every text type. The old
            # `a or b and c` condition let plain str values skip it.
            if d.isdigit():
                # int() normalises values such as "007" so they match the
                # ids coming back from the database.
                pending_pks.add(str(int(d)))
        elif isinstance(d, integer_types) and not isinstance(d, bool):
            pending_pks.add(str(d))

    if not pending_pks:
        # Nothing usable was supplied; avoid running a query with "IN ()".
        raise ValueError('No documents supplied to generator writer')

    cmd_str = '''
        SELECT
            `document_pubtator`.`document_id`,
            ANY_VALUE(`document_pubtator`.`content`),
            GROUP_CONCAT(DISTINCT `document_section`.`id`) as `section_ids`

        FROM `document_pubtator`

        JOIN `document_section`
            ON `document_section`.`document_id` = `document_pubtator`.`document_id`

        WHERE `document_pubtator`.`content` != ''
            AND `document_pubtator`.`document_id` IN ({0})

        GROUP BY `document_pubtator`.`document_id`;
    '''.format(','.join(sorted(pending_pks)))

    c = connection.cursor()
    try:
        c.execute(cmd_str)
        # Rows are (document pk, pubtator BioC content, section id list)
        res = [(x[0], x[1], x[2]) for x in c.fetchall()]
    finally:
        c.close()

    section_names = ('title', 'paragraph')
    writer = bioc_writer(None)

    for document_pk, content, section_id_csv in res:
        # NOTE(review): GROUP_CONCAT has no ORDER BY here, so this assumes
        # the section ids come back in passage order -- confirm.
        section_ids = section_id_csv.split(',')

        r = BioCReader(source=content)
        r.read()

        doc = r.collection.documents[0]
        doc.put_infon('document_pk', str(document_pk))

        for idx, passage in enumerate(doc.passages):
            passage.clear_annotations()
            passage.put_infon('section', section_names[idx])
            passage.put_infon('id', str(section_ids[idx]))

        writer.collection.add_document(doc)
        # discard() instead of list.remove(): an unexpected id from the
        # query must not abort the whole export.
        pending_pks.discard(str(document_pk))

    # Capture all the documents not available via pubtators
    for document_pk_str in sorted(pending_pks):
        # Can optimize this model retrieval but should rarely occur
        document_model = Document.objects.get(pk=document_pk_str)

        bioc_document = BioCDocument()
        bioc_document.id = str(document_model.document_id)
        bioc_document.put_infon('document_pk', document_pk_str)

        passage_offset = 0
        for idx, section in enumerate(document_model.available_sections()):
            passage = BioCPassage()
            passage.put_infon('section', section_names[idx])
            passage.put_infon('id', str(section.pk))
            # (TODO) Missing a "type" infon?
            passage.text = section.text
            passage.offset = str(passage_offset)
            # +1 leaves a one character gap between consecutive passages
            passage_offset += len(passage.text) + 1
            bioc_document.add_passage(passage)

        writer.collection.add_document(bioc_document)

    return writer