def validate_pubtator(content, document):
    """
    Returns True if the provided str is a valid PubTator (BioC) response
    for the Document instance, otherwise False
    """
    try:
        r = BioCReader(source=content)
        r.read()

        # Check general Collection + Document attributes
        assert (len(r.collection.documents) == 1), 'The response included more than the provided Document'
        assert (document.document_id == int(r.collection.documents[0].id)), 'The response does not include the requested PMID'
        assert (len(r.collection.documents[0].passages) == 2), 'The response document does not include the correct number of sections'

        # Check the Title
        assert (int(r.collection.documents[0].passages[0].offset) == 0), 'The title does not start at 0'
        section = document.section_set.first()
        assert (section.text == r.collection.documents[0].passages[0].text), 'The response title does not equal the provided text'
        assert (section.id == int(r.collection.documents[0].passages[0].infons.get('id'))), 'The response title is not correctly identified'

        # Check the Abstract
        assert (int(r.collection.documents[0].passages[1].offset) >= 1), 'The abstract does not start after 0'
        section = document.section_set.last()
        assert (section.text == r.collection.documents[0].passages[1].text), 'The response abstract does not equal the provided text'
        assert (section.id == int(r.collection.documents[0].passages[1].infons.get('id'))), 'The response abstract is not correctly identified'

        return True

    except Exception:
        client.captureException()
        return False
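# A minimal usage sketch for validate_pubtator() (not part of the original
# module). Assumes a configured Django environment and at least one Pubtator
# row with fetched content; `client` above is presumed to be a Raven/Sentry
# client that records the failed assertion.
pubtator = Pubtator.objects.filter(content__isnull=False).first()
if pubtator and validate_pubtator(pubtator.content, pubtator.document):
    print 'PubTator response matches the stored Document sections'
else:
    print 'PubTator response failed validation'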
def test_document_as_bioc_with_pubtator(self):
    pub_query_set = Pubtator.objects.filter(
        document=self.doc,
        session_id='',
        content__isnull=False)

    response = self.client.get('/document/pubtator/{pmid}.json'.format(pmid=self.doc.document_id))
    json_string = response.content
    self.assertNotEqual(json_string, '', msg='API returned empty response for document BioC Pubtator Representation.')
    data = json.loads(json_string)

    # Make sure it's the same document in BioC as in the DB
    self.assertEqual(int(data.get('collection').get('document').get('id')), self.doc.document_id)
    self.assertEqual(len(data.get('collection').get('document').get('passage')), 2)
    self.assertEqual(data.get('collection').get('document').get('passage')[0].get('text'), self.doc.section_set.first().text)
    self.assertEqual(data.get('collection').get('document').get('passage')[1].get('text'), self.doc.section_set.last().text)

    # Make sure it contains annotations
    self.assertNotEqual(len(data.get('collection').get('document').get('passage')[0].get('annotation')), 0)
    self.assertNotEqual(len(data.get('collection').get('document').get('passage')[1].get('annotation')), 0)

    # We already validated everything in JSON b/c it's easier. Let's just
    # make sure the XML document passes too, without specific checks
    response = self.client.get('/document/pubtator/{pmid}.xml'.format(pmid=self.doc.document_id))
    r = BioCReader(source=response.content)
    r.read()
    self.assertEqual(len(r.collection.documents), 1)
    self.assertEqual(int(r.collection.documents[0].id), self.doc.document_id)
    self.assertEqual(len(r.collection.documents[0].passages), 2)
    self.assertNotEqual(len(r.collection.documents[0].passages[0].annotations), 0)
    self.assertNotEqual(len(r.collection.documents[0].passages[1].annotations), 0)
def handle(self, *args, **options):
    types_arr = []
    errors = 0

    if options['keys']:
        for pubtator in Pubtator.objects.filter(content__isnull=False).all():
            try:
                r = BioCReader(source=pubtator.content)
                r.read()
                for d_idx, document in enumerate(r.collection.documents):
                    for p_idx, passage in enumerate(document.passages):
                        for annotation in r.collection.documents[d_idx].passages[p_idx].annotations:
                            types_arr.append(annotation.infons['type'])
            except Exception:
                # Debugging aid (left disabled): check whether '%' characters
                # in the content or section text are breaking BioC parsing
                # print '%' in pubtator.content
                # for sec in pubtator.document.available_sections():
                #     print '%' in sec.text
                # print ' - - - '
                errors = errors + 1

        print 'Errors:', errors
        print Counter(types_arr)
def as_pubtator_annotation_df(self):
    # A document may have up to 3 PubTator responses:
    # "GNormPlus", "DNorm" and "tmChem"
    df_columns = ('uid', 'source', 'ann_type', 'text', 'offset', 'location')
    pubtator_dfs = []

    if self.valid_pubtator():
        pubtators = Pubtator.objects.filter(
            document=self,
            session_id='',
            content__isnull=False).all()

        for pubtator in pubtators:
            r = BioCReader(source=pubtator.content)
            r.read()

            pubtator_arr = []
            bioc_document = r.collection.documents[0]
            for passage in bioc_document.passages:
                for annotation in passage.annotations:
                    infons = annotation.infons

                    # Separate the annotation type from the UID infon
                    annotation_type = None
                    uid_type = None
                    uid = None
                    for key in infons.keys():
                        if key == 'type':
                            annotation_type = infons.get(key, None)
                        else:
                            uid_type = key
                            uid = infons.get(uid_type, None)

                    pubtator_arr.append({
                        'uid': uid,
                        'source': uid_type,
                        'ann_type': annotation_type,
                        'text': str(annotation.text),
                        'offset': int(passage.offset),
                        'location': str(annotation.locations[0])
                    })

            pubtator_dfs.append(pd.DataFrame(pubtator_arr, columns=df_columns))

    if len(pubtator_dfs):
        return pd.concat(pubtator_dfs)
    else:
        return pd.DataFrame([], columns=df_columns)
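# Hedged usage sketch (not part of the original module): the returned
# DataFrame uses the df_columns tuple above, so a quick per-type tally might
# look like this. `doc` is a hypothetical Document with valid PubTator content.
df = doc.as_pubtator_annotation_df()
print df.groupby('ann_type').size()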
def as_writer(self, request=None):
    r = BioCReader(source=self.content)
    r.read()

    bioc_writer = BioCWriter()
    bioc_writer.collection = r.collection
    return bioc_writer
def get_instance(self):
    """
    Returns the PubTator BioC reader instance if valid, otherwise False
    """
    try:
        r = BioCReader(source=self.content)
        r.read()
        return r
    except Exception:
        # If the content doesn't validate, bail out
        return False
def count_annotations(self):
    if self.valid():
        count = 0
        reader = BioCReader(source=self.content)
        reader.read()
        for doc in reader.collection.documents:
            for passage in doc.passages:
                count += len(passage.annotations)
        return count
    else:
        return 0
def count_annotations(self):
    """
    Returns an int count of all types of ER annotations in the Pubtator
    instance. If none are found or the document is invalid, returns 0
    """
    try:
        r = BioCReader(source=self.content)
        r.read()
        return sum([len(passage.annotations) for passage in r.collection.documents[0].passages])
    except Exception:
        return 0
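# Sketch of the BioC traversal both count_annotations() variants rely on
# (not part of the original module). Assumes `content` holds a BioC XML
# string with a single document and BioCReader is imported as elsewhere here.
r = BioCReader(source=content)
r.read()
for passage in r.collection.documents[0].passages:
    for annotation in passage.annotations:
        print annotation.text, annotation.infons.get('type'), annotation.locations[0]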
def test_document_as_bioc_with_m2c(self):
    # Submit Annotations (as User 1) so they show up when inspecting the M2C submissions
    self.assertEqual(Annotation.objects.count(), 0)
    self.client.login(username='******', password='******')
    response = self.client.get(reverse('common:quest-home', kwargs={'quest_pk': self.task.pk}), follow=True)
    doc = response.context['document']
    abstract = doc.available_sections().last()

    # Annotation submit URL
    url = reverse('document:create', kwargs={'task_pk': self.task.pk, 'section_pk': abstract.pk})
    self.assertEqual(self.client.post(url, {'type': 0, 'text': 'text annotation 0', 'start': 0}).status_code, 200)
    self.assertEqual(self.client.post(url, {'type': 1, 'text': 'text annotation 1', 'start': 10}).status_code, 200)
    self.assertEqual(self.client.post(url, {'type': 2, 'text': 'text annotation 2', 'start': 20}).status_code, 200)
    self.assertEqual(Annotation.objects.count(), 3)

    # Then submit the document for the Quest
    response = self.client.post(reverse('common:doc-quest-submit', kwargs={'quest_pk': self.task.pk, 'document_pk': doc.pk}), follow=True)
    self.client.logout()

    # Submit Annotations (as User 2) so they show up when inspecting the M2C submissions
    self.assertEqual(Annotation.objects.count(), 3)
    self.client.login(username='******', password='******')
    response = self.client.get(reverse('common:quest-home', kwargs={'quest_pk': self.task.pk}), follow=True)

    # Annotation submit URL
    url = reverse('document:create', kwargs={'task_pk': self.task.pk, 'section_pk': abstract.pk})
    self.assertEqual(self.client.post(url, {'type': 0, 'text': 'text annotation 3', 'start': 30}).status_code, 200)
    self.assertEqual(self.client.post(url, {'type': 1, 'text': 'text annotation 4', 'start': 40}).status_code, 200)
    self.assertEqual(self.client.post(url, {'type': 2, 'text': 'text annotation 5', 'start': 50}).status_code, 200)
    self.assertEqual(Annotation.objects.count(), 6)

    # Then submit the document for the Quest
    response = self.client.post(reverse('common:doc-quest-submit', kwargs={'quest_pk': self.task.pk, 'document_pk': doc.pk}), follow=True)
    self.client.logout()

    # As an anonymous user, export the document's submissions
    res = self.client.get(reverse('document:read-users-bioc', kwargs={'pubmed_id': doc.document_id, 'format_type': 'xml'}), follow=True)
    self.assertEqual(res.status_code, 200)
    bioc = BioCReader(source=res.content)
    bioc.read()

    # Make sure the BioC document has both users' info
    self.assertEqual(len(bioc.collection.documents), 1)
    self.assertEqual(int(bioc.collection.documents[0].id), doc.document_id)
    self.assertEqual(len(bioc.collection.documents[0].passages), 2)
    self.assertEqual(len(bioc.collection.documents[0].passages[0].annotations), 0)
    self.assertEqual(len(bioc.collection.documents[0].passages[1].annotations), 6)
def valid(self):
    # (TODO) This may return 2 different "types"; check on the
    # implications of this discrepancy
    if self.validate_cache:
        return True

    if self.session_id != '':
        return False

    if self.content is None:
        return False

    try:
        r = BioCReader(source=self.content)
        r.read()
        return r
    except Exception:
        # If the content doesn't validate, bail out
        return False
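# Illustration of the TODO above (not part of the original module): valid()
# may return True (cache hit), a BioCReader instance, or False. Callers that
# only branch on truthiness are safe; a hedged sketch for callers that need
# the reader itself, assuming `pubtator` is a Pubtator instance and
# get_instance() is the same model's method defined earlier:
result = pubtator.valid()
if result:
    reader = result if isinstance(result, BioCReader) else pubtator.get_instance()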
def test_group_for_all_user_annotations(self):
    self.load_fake_annotations()

    # Fetch the Group BioC as JSON to ensure it is online
    response = self.client.get(reverse('api:group-users-bioc', kwargs={'group_pk': self.group.pk, 'format_type': 'json'}))
    self.assertEqual(response.status_code, 200)

    # Fetch the Group BioC for all user annotations
    response = self.client.get(reverse('api:group-users-bioc', kwargs={'group_pk': self.group.pk, 'format_type': 'xml'}))
    self.assertEqual(response.status_code, 200)

    r = BioCReader(source=response.content)
    r.read()

    # Does BioC have the correct number of Group Documents?
    self.assertEqual(len(r.collection.documents), self.group.get_documents().count())

    # Does BioC have the correct number of Group Annotations?
    total_bioc_annotation_int = 0
    for bioc_doc in r.collection.documents:
        for bioc_passage in bioc_doc.passages:
            total_bioc_annotation_int += len(bioc_passage.annotations)
    self.assertEqual(Annotation.objects.count(), total_bioc_annotation_int)
def entity_recognition_df(self, documents=[], users=[], include_pubtator=True, writer=None):
    if len(documents):
        from .models import Document
        doc_arr = []
        for d in documents:
            if type(d) == Document:
                doc_arr.append(str(d.pk))
            elif (type(d) is str or type(d) is unicode) and d.isdigit():
                doc_arr.append(d)
            elif type(d) is int or type(d) is long:
                doc_arr.append(str(d))
        filter_doc_level = 'WHERE `document_section`.`document_id` IN ({0})'.format(','.join(doc_arr))
    else:
        filter_doc_level = ''

    if len(users):
        from django.contrib.auth.models import User
        user_arr = []
        for u in users:
            if type(u) == User:
                user_arr.append(str(u.pk))
            elif (type(u) is str or type(u) is unicode) and u.isdigit():
                user_arr.append(u)
            elif type(u) is int:
                user_arr.append(str(u))
        filter_user_level = '{0} `document_view`.`user_id` IN ({1})'.format(
            'WHERE' if filter_doc_level == '' else 'AND',
            ','.join(user_arr))
    else:
        filter_user_level = ''

    content_type_id = str(ContentType.objects.get_for_model(
        EntityRecognitionAnnotation.objects.first()).id)

    df_arr = []

    cmd_str = ""
    with open('mark2cure/document/commands/get-er-results.sql', 'r') as f:
        cmd_str = f.read()
    cmd_str = cmd_str.format(content_type_pk=content_type_id,
                             filter_doc_level=filter_doc_level,
                             filter_user_level=filter_user_level)

    c = connection.cursor()
    try:
        c.execute(cmd_str)

        # Get the full writer in advance!!
        if not writer:
            writer = Document.objects.as_writer(documents=documents)

        res = [x for x in c.fetchall()]
        # We group the response to reduce BioCDocument offset dict lookups
        for key, doc_group in groupby(res, lambda x: x[5]):
            bioc_documents = filter(lambda d: d.infons.get('document_pk') == str(key),
                                    writer.collection.documents)

            # If a pubtator doesn't exist for the document, we can't include any
            # annotations as the passage offsets need to come from Pubtator
            if len(bioc_documents) == 1:
                # Use the BioC pubtator file for the offset values
                offset_dict = {}
                for passage in bioc_documents[0].passages:
                    offset_dict[int(passage.infons.get('id'))] = passage.offset

                for x in doc_group:
                    df_arr.append(self._create_er_df_row(
                        uid=x[0],
                        source='db', user_id=x[8],
                        text=x[2], ann_type_idx=x[1],
                        document_pk=x[5],
                        section_id=x[7], section_offset=offset_dict[x[7]], offset_relative=True,
                        start_position=x[3], length=len(x[2])))
    finally:
        c.close()

    if include_pubtator:
        '''
        This is the component that merges the 3 different PubTator responses
        into 1 main file. It performs selective ordering and precedence for
        some annotation types / instances
        '''
        cmd_str = ""
        with open('mark2cure/document/commands/get-er-pubtator-results.sql', 'r') as f:
            cmd_str = f.read()
        cmd_str = cmd_str.format(','.join(doc_arr))

        c = connection.cursor()
        try:
            c.execute(cmd_str)
            res = [x for x in c.fetchall()]
        finally:
            c.close()

        # Counter({'Disease': 3676, 'Chemical': 2928, 'Species': 1553, 'Gene': 1544,
        #          'FamilyName': 536, 'DomainMotif': 20}) (Sampling from DB 11/30/2016)
        pubtator_types = ['Disease', 'Gene', 'Chemical']

        for pubtator_content in res:
            r = BioCReader(source=pubtator_content[2])
            r.read()
            bioc_document = r.collection.documents[0]
            section_ids = pubtator_content[3].split(',')

            # Iterate over all the annotations in both passages
            for p_idx, passage in enumerate(bioc_document.passages):
                for annotation in passage.annotations:
                    # Determine some meta-data (UID info) about the BioCAnnotation
                    annotation_type = None
                    uid_type = None
                    uid = None
                    for key in annotation.infons.keys():
                        if key == 'type':
                            annotation_type = annotation.infons.get(key, None)
                        else:
                            uid_type = key
                            uid = annotation.infons.get(uid_type, None)

                    # We're only interested in Pubtator Annotations that are
                    # the same concepts users highlight
                    if annotation_type in pubtator_types:
                        start, length = str(annotation.locations[0]).split(':')
                        df_arr.append(self._create_er_df_row(
                            uid=uid,
                            source=uid_type if uid_type else None, user_id=None,
                            text=annotation.text, ann_type_idx=pubtator_types.index(annotation_type),
                            document_pk=pubtator_content[1],
                            section_id=section_ids[p_idx], section_offset=passage.offset, offset_relative=False,
                            start_position=start, length=length))

    return pd.DataFrame(df_arr, columns=DF_COLUMNS)
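# Hedged usage sketch for entity_recognition_df() (not part of the original
# module). Assumes the method is exposed on the Document manager alongside
# as_writer(), and that DF_COLUMNS includes a 'user_id' column; PubTator-
# derived rows are appended with user_id=None, so they can be split out:
df = Document.objects.entity_recognition_df(documents=[doc], include_pubtator=True)
print df[df['user_id'].isnull()].head()  # PubTator-derived rows only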
def entity_recognition_df(self, documents=[], users=[], include_pubtator=True, writer=None):
    if len(documents):
        from .models import Document
        doc_arr = []
        for d in documents:
            if type(d) == Document:
                doc_arr.append(str(d.pk))
            elif (type(d) is str or type(d) is unicode) and d.isdigit():
                doc_arr.append(d)
            elif type(d) is int or type(d) is long:
                doc_arr.append(str(d))
        filter_doc_level = 'WHERE `document_section`.`document_id` IN ({0})'.format(','.join(doc_arr))
    else:
        filter_doc_level = ''

    if len(users):
        from django.contrib.auth.models import User
        user_arr = []
        for u in users:
            if type(u) == User:
                user_arr.append(str(u.pk))
            elif (type(u) is str or type(u) is unicode) and u.isdigit():
                user_arr.append(u)
            elif type(u) is int:
                user_arr.append(str(u))
        filter_user_level = '{0} `document_view`.`user_id` IN ({1})'.format(
            'WHERE' if filter_doc_level == '' else 'AND',
            ','.join(user_arr))
    else:
        filter_user_level = ''

    content_type_id = str(ContentType.objects.get_for_model(
        EntityRecognitionAnnotation.objects.first()).id)

    df_arr = []

    cmd_str = '''
        SELECT  `entity_recognition_entityrecognitionannotation`.`id`,
                `entity_recognition_entityrecognitionannotation`.`type`,
                `entity_recognition_entityrecognitionannotation`.`text`,
                `entity_recognition_entityrecognitionannotation`.`start`,
                `document_annotation`.`created`,
                `document_document`.`id` as `document_pk`,
                `document_document`.`document_id` as `pmid`,
                `document_view`.`section_id`,
                `document_view`.`user_id`
        FROM `entity_recognition_entityrecognitionannotation`

        INNER JOIN `document_annotation`
            ON `document_annotation`.`object_id` = `entity_recognition_entityrecognitionannotation`.`id`
                AND `document_annotation`.`content_type_id` = {content_type_pk}

        INNER JOIN `document_view`
            ON `document_annotation`.`view_id` = `document_view`.`id`

        INNER JOIN `document_section`
            ON `document_view`.`section_id` = `document_section`.`id`

        INNER JOIN `document_document`
            ON `document_document`.`id` = `document_section`.`document_id`

        {filter_doc_level}
        {filter_user_level}
    '''.format(content_type_pk=content_type_id,
               filter_doc_level=filter_doc_level,
               filter_user_level=filter_user_level)

    c = connection.cursor()
    try:
        c.execute(cmd_str)

        # Get the full writer in advance!!
        if not writer:
            writer = Document.objects.as_writer(documents=documents)

        res = [x for x in c.fetchall()]
        # We group the response to reduce BioCDocument offset dict lookups
        for key, doc_group in groupby(res, lambda x: x[5]):
            bioc_documents = filter(lambda d: d.infons.get('document_pk') == str(key),
                                    writer.collection.documents)

            # If a pubtator doesn't exist for the document, we can't include any
            # annotations as the passage offsets need to come from Pubtator
            if len(bioc_documents) == 1:
                # Use the BioC pubtator file for the offset values
                offset_dict = {}
                for passage in bioc_documents[0].passages:
                    offset_dict[int(passage.infons.get('id'))] = passage.offset

                for x in doc_group:
                    df_arr.append(self._create_er_df_row(
                        uid=x[0],
                        source='db', user_id=x[8],
                        text=x[2], ann_type=x[1],
                        document_pk=x[5],
                        section_id=x[7], section_offset=offset_dict[x[7]], offset_relative=True,
                        start_position=x[3], length=len(x[2])))
    finally:
        c.close()

    if include_pubtator:
        '''
        This is the component that merges the 3 different PubTator responses
        into 1 main file. It performs selective ordering and precedence for
        some annotation types / instances
        '''
        cmd_str = '''
            SELECT  `document_pubtator`.`id`,
                    `document_pubtator`.`document_id`,
                    `document_pubtator`.`content`,
                    GROUP_CONCAT(DISTINCT `document_section`.`id`) as `section_ids`
            FROM `document_pubtator`

            JOIN `document_section`
                ON `document_section`.`document_id` = `document_pubtator`.`document_id`

            WHERE `document_pubtator`.`content` != ''
                AND `document_pubtator`.`document_id` IN ({0})
            GROUP BY `document_pubtator`.`id`
        '''.format(','.join(doc_arr))

        c = connection.cursor()
        try:
            c.execute(cmd_str)
            res = [x for x in c.fetchall()]
        finally:
            c.close()

        for pubtator_content in res:
            r = BioCReader(source=pubtator_content[2])
            r.read()
            bioc_document = r.collection.documents[0]
            section_ids = pubtator_content[3].split(',')

            # Iterate over all the annotations in both passages
            for p_idx, passage in enumerate(bioc_document.passages):
                for annotation in passage.annotations:
                    # Determine some meta-data (UID info) about the BioCAnnotation
                    annotation_type = None
                    uid_type = None
                    uid = None
                    for key in annotation.infons.keys():
                        if key == 'type':
                            annotation_type = annotation.infons.get(key, None)
                        else:
                            uid_type = key
                            uid = annotation.infons.get(uid_type, None)

                    start, length = str(annotation.locations[0]).split(':')
                    df_arr.append(self._create_er_df_row(
                        uid=uid,
                        source=uid_type if uid_type else None, user_id=None,
                        text=annotation.text, ann_type=annotation_type if annotation_type else None,
                        document_pk=pubtator_content[1],
                        section_id=section_ids[p_idx], section_offset=passage.offset, offset_relative=False,
                        start_position=start, length=length))

    return pd.DataFrame(df_arr, columns=DF_COLUMNS)
def test_document_as_bioc_for_pairing(self):
    # Ensure the player views the Q but can't match b/c no Anns exist
    self.client.login(username='******', password='******')

    # Ensure the User info is showing up in the header
    response = self.client.get('/dashboard/')
    self.assertInHTML('<p>Level: Expert</p>', response.content)

    # Ensure no User >> Quest views until after viewed once
    self.assertEqual(UserQuestRelationship.objects.count(), 0)
    response = self.client.get(reverse('common:quest-home', kwargs={'quest_pk': self.task.pk}), follow=True)
    doc = response.context['document']
    self.assertEqual(UserQuestRelationship.objects.count(), 1)

    # Ensure this returns a 500 for the player b/c there are no submissions yet
    response = self.client.get(reverse('document:results-bioc', kwargs={'task_pk': self.task.pk, 'doc_pk': doc.pk, 'format_type': 'xml'}))
    self.assertEqual(response.status_code, 500)
    self.assertEqual(response.content, 'no_points_awarded')
    self.client.logout()

    #
    # Submit bogus Annotations as opponent to try match again for player
    #
    self.client.login(username='******', password='******')
    self.assertEqual(Annotation.objects.count(), 0)
    response = self.client.get(reverse('common:quest-home', kwargs={'quest_pk': self.task.pk}), follow=True)

    # Annotation submit URL
    abstract = doc.available_sections().last()
    url = reverse('document:create', kwargs={'task_pk': self.task.pk, 'section_pk': abstract.pk})
    self.assertEqual(self.client.post(url, {'type': 0, 'text': 'text annotation 0', 'start': 0}).status_code, 200)
    self.assertEqual(self.client.post(url, {'type': 1, 'text': 'text annotation 1', 'start': 10}).status_code, 200)
    self.assertEqual(self.client.post(url, {'type': 2, 'text': 'text annotation 2', 'start': 20}).status_code, 200)
    self.assertEqual(Annotation.objects.count(), 3)

    # Then submit the document for the Quest
    response = self.client.post(reverse('common:doc-quest-submit', kwargs={'quest_pk': self.task.pk, 'document_pk': doc.pk}), follow=True)
    self.client.logout()

    #
    # Try again as the player to see if comparison uses opponents
    #
    self.client.login(username='******', password='******')

    # Submit this Document without contributing any Annotations
    response = self.client.post(reverse('common:doc-quest-submit', kwargs={'quest_pk': self.task.pk, 'document_pk': doc.pk}), follow=True)

    # Fetch the BioC Document again
    response = self.client.get(reverse('document:results-bioc', kwargs={'task_pk': self.task.pk, 'doc_pk': doc.pk, 'format_type': 'xml'}))
    self.assertEqual(response.status_code, 200)

    r = BioCReader(source=response.content)
    r.read()

    # Make sure the BioC document has the opponent's info
    self.assertEqual(len(r.collection.documents), 1)
    self.assertEqual(int(r.collection.documents[0].id), doc.document_id)
    self.assertEqual(len(r.collection.documents[0].passages), 2)
    self.assertEqual(len(r.collection.documents[0].passages[0].annotations), 0)
    self.assertEqual(len(r.collection.documents[0].passages[1].annotations), 3)
    self.assertEqual(r.collection.documents[0].passages[1].annotations[0].infons['user_name'], 'opponent')
    self.assertEqual(int(r.collection.documents[0].passages[1].annotations[0].infons['type']), 0)
    self.assertEqual(r.collection.documents[0].passages[1].annotations[0].text, 'text annotation 0')
    self.assertEqual(int(r.collection.infons['points']), 0)
    self.assertEqual(r.collection.infons['partner'], 'opponent')
    self.client.logout()
def as_writer(self, documents=[]):
    '''
    Return a blank BioC writer that is based off the PubTator content.

    Problems: This requires every document to have at least 1 pubtator model
    Pros: This prevents us from generating our own BioC file, which may have
    inconsistencies
    '''
    if len(documents):
        from .models import Document
        doc_arr = []
        for d in documents:
            if type(d) == Document:
                doc_arr.append(str(d.pk))
            elif (type(d) is str or type(d) is unicode) and d.isdigit():
                doc_arr.append(d)
            elif type(d) is int or type(d) is long:
                doc_arr.append(str(d))
        str_doc_arr = list(set(doc_arr))
    else:
        raise ValueError('No documents supplied to generate writer')

    cmd_str = ""
    with open('mark2cure/document/commands/get-pubtators.sql', 'r') as f:
        cmd_str = f.read()
    cmd_str = cmd_str.format(','.join(str_doc_arr))

    c = connection.cursor()
    try:
        c.execute(cmd_str)
        res = [(x[0], x[1], x[2]) for x in c.fetchall()]
    finally:
        c.close()

    writer = bioc_writer(None)
    for pubtator_content in res:
        section_ids = pubtator_content[2].split(',')

        r = BioCReader(source=pubtator_content[1])
        r.read()

        doc = r.collection.documents[0]
        doc.put_infon('document_pk', str(pubtator_content[0]))
        for idx, passage in enumerate(doc.passages):
            passage.clear_annotations()
            passage.put_infon('section', ['title', 'paragraph'][idx])
            passage.put_infon('id', str(section_ids[idx]))

        writer.collection.add_document(doc)
        str_doc_arr.remove(str(pubtator_content[0]))

    # Capture all the documents not available via pubtators
    for document_pk_str in str_doc_arr:
        # Could optimize this model retrieval, but it should rarely occur
        document_model = Document.objects.get(pk=document_pk_str)

        bioc_document = BioCDocument()
        bioc_document.id = str(document_model.document_id)
        bioc_document.put_infon('document_pk', document_pk_str)

        passage_offset = 0
        for idx, section in enumerate(document_model.available_sections()):
            passage = BioCPassage()
            passage.put_infon('section', ['title', 'paragraph'][idx])
            passage.put_infon('id', str(section.pk))
            # (TODO) Missing a "type" infon?
            passage.text = section.text
            passage.offset = str(passage_offset)
            passage_offset += len(passage.text) + 1
            bioc_document.add_passage(passage)

        writer.collection.add_document(bioc_document)

    return writer
def as_writer(self, documents=[]):
    '''
    Return a blank BioC writer that is based off the PubTator content.

    Problems: This requires every document to have at least 1 pubtator model
    Pros: This prevents us from generating our own BioC file, which may have
    inconsistencies
    '''
    if len(documents):
        from .models import Document
        doc_arr = []
        for d in documents:
            if type(d) == Document:
                doc_arr.append(str(d.pk))
            elif (type(d) is str or type(d) is unicode) and d.isdigit():
                doc_arr.append(d)
            elif type(d) is int or type(d) is long:
                doc_arr.append(str(d))
        str_doc_arr = list(set(doc_arr))
    else:
        raise ValueError('No documents supplied to generate writer')

    cmd_str = '''
        SELECT  `document_pubtator`.`document_id`,
                ANY_VALUE(`document_pubtator`.`content`),
                GROUP_CONCAT(DISTINCT `document_section`.`id`) as `section_ids`
        FROM `document_pubtator`

        JOIN `document_section`
            ON `document_section`.`document_id` = `document_pubtator`.`document_id`

        WHERE `document_pubtator`.`content` != ''
            AND `document_pubtator`.`document_id` IN ({0})
        GROUP BY `document_pubtator`.`document_id`;
    '''.format(','.join(str_doc_arr))

    c = connection.cursor()
    try:
        c.execute(cmd_str)
        res = [(x[0], x[1], x[2]) for x in c.fetchall()]
    finally:
        c.close()

    writer = bioc_writer(None)
    for pubtator_content in res:
        section_ids = pubtator_content[2].split(',')

        r = BioCReader(source=pubtator_content[1])
        r.read()

        doc = r.collection.documents[0]
        doc.put_infon('document_pk', str(pubtator_content[0]))
        for idx, passage in enumerate(doc.passages):
            passage.clear_annotations()
            passage.put_infon('section', ['title', 'paragraph'][idx])
            passage.put_infon('id', str(section_ids[idx]))

        writer.collection.add_document(doc)
        str_doc_arr.remove(str(pubtator_content[0]))

    # Capture all the documents not available via pubtators
    for document_pk_str in str_doc_arr:
        # Could optimize this model retrieval, but it should rarely occur
        document_model = Document.objects.get(pk=document_pk_str)

        bioc_document = BioCDocument()
        bioc_document.id = str(document_model.document_id)
        bioc_document.put_infon('document_pk', document_pk_str)

        passage_offset = 0
        for idx, section in enumerate(document_model.available_sections()):
            passage = BioCPassage()
            passage.put_infon('section', ['title', 'paragraph'][idx])
            passage.put_infon('id', str(section.pk))
            # (TODO) Missing a "type" infon?
            passage.text = section.text
            passage.offset = str(passage_offset)
            passage_offset += len(passage.text) + 1
            bioc_document.add_passage(passage)

        writer.collection.add_document(bioc_document)

    return writer
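# Hedged usage sketch for the as_writer() manager method above (not part of
# the original module). `doc` is a hypothetical Document instance, and the
# serialization call is an assumption based on PyBioC's BioCWriter; adjust it
# if the installed API differs.
writer = Document.objects.as_writer(documents=[doc])
xml_str = writer.tostring('UTF-8')  # assumed PyBioC BioCWriter method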
def as_bioc_with_pubtator_annotations(self, request=None):
    '''
    This is a function that merges the 3 different PubTator responses into
    1 main file. It performs selective ordering and precedence for some
    annotation types / instances
    '''
    import collections

    approved_types = ['Disease', 'Gene', 'Chemical']

    self.init_pubtator()
    reader = self.as_writer(request)

    pub_query_set = Pubtator.objects.filter(
        document=self,
        session_id='',
        content__isnull=False)

    # Load up our various pubtator responses
    pub_readers = []
    for pubtator in pub_query_set.all():
        r = BioCReader(source=pubtator.content)
        r.read()
        pub_readers.append(r)

    for d_idx, document in enumerate(reader.collection.documents):
        for p_idx, passage in enumerate(document.passages):
            # For each passage in each document in the collection,
            # add the appropriate annotation
            for p in pub_readers:
                for annotation in p.collection.documents[d_idx].passages[p_idx].annotations:
                    ann_type = annotation.infons['type']
                    infons = annotation.infons

                    if ann_type in approved_types:
                        uid_type = None
                        uid = None
                        for key in infons.keys():
                            if key != 'type':
                                uid_type = key
                                uid = infons.get(uid_type, None)

                        annotation.clear_infons()
                        annotation.put_infon('type', str(approved_types.index(ann_type)))
                        annotation.put_infon('user', 'pubtator')
                        annotation.put_infon('uid', str(uid))
                        reader.collection.documents[d_idx].passages[p_idx].add_annotation(annotation)

            # Remove the shorter annotation if there are multiple
            # at the same start position
            anns = reader.collection.documents[d_idx].passages[p_idx].annotations
            ann_offsets = [x.locations[0].offset for x in anns]

            # For each of the offset positions where there are multiple annotations
            for offset in [x for x, y in collections.Counter(ann_offsets).items() if y > 1]:
                conflicting_anns = [x for x in anns if x.locations[0].offset == offset]
                longest_ann = max(conflicting_anns, key=lambda a: int(a.locations[0].length))
                for ann in conflicting_anns:
                    if ann is not longest_ann:
                        reader.collection.documents[d_idx].passages[p_idx].remove_annotation(ann)

            # Remove any annotations that overlap; prefer the longest
            anns = reader.collection.documents[d_idx].passages[p_idx].annotations
            for needle_ann in anns:
                needle_ann_offset = int(needle_ann.locations[0].offset)
                needle_ann_length = int(needle_ann.locations[0].length)

                for stack_ann in anns:
                    stack_ann_offset = int(stack_ann.locations[0].offset)
                    stack_ann_length = int(stack_ann.locations[0].length)

                    if needle_ann_offset >= stack_ann_offset and needle_ann_length < stack_ann_length:
                        try:
                            reader.collection.documents[d_idx].passages[p_idx].remove_annotation(needle_ann)
                        except Exception:
                            pass

    return reader.collection.documents[0]
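# Toy sketch of the two pruning rules applied above (not part of the original
# module), using plain (offset, length) tuples instead of BioCAnnotations to
# make the precedence explicit: at a shared start offset only the longest
# span survives, then any span contained in a longer span that starts at or
# before it is dropped.
import collections

spans = [(0, 4), (0, 9), (2, 3)]  # (offset, length) pairs

# Rule 1: where several spans share a start offset, keep only the longest
counts = collections.Counter(off for off, _ in spans)
for off in [o for o, n in counts.items() if n > 1]:
    ties = [s for s in spans if s[0] == off]
    ties.remove(max(ties, key=lambda s: s[1]))
    for s in ties:
        spans.remove(s)

# Rule 2: drop spans shadowed by a longer span starting at or before them,
# mirroring `needle_offset >= stack_offset and needle_length < stack_length`
spans = [n for n in spans
         if not any(n[0] >= s[0] and n[1] < s[1] for s in spans)]
print spans  # [(0, 9)]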