def validate_pubtator(content, document):
    """Return True if `content` is a valid PubTator (BioC) response for `document`.

    A valid response contains exactly one BioC document matching the
    Document's PMID, with exactly two passages (title + abstract) whose
    text and section ids match the Document's sections.

    Any failure (parse error or failed check) is reported to the error
    client and results in False.
    """
    # Use explicit checks instead of `assert` so validation still runs
    # under `python -O` (asserts are stripped with optimization enabled).
    def _check(condition, message):
        if not condition:
            raise ValueError(message)

    try:
        r = BioCReader(source=content)
        r.read()

        # Check general Collection + Document attributes
        _check(len(r.collection.documents) == 1,
               'The response included more than the provided Document')
        _check(document.document_id == int(r.collection.documents[0].id),
               'The response does not include the requested PMID')
        _check(len(r.collection.documents[0].passages) == 2,
               'The response document does not include the correct number of sections')

        # Check the Title
        _check(int(r.collection.documents[0].passages[0].offset) == 0,
               'The title does not start at 0')
        section = document.section_set.first()
        _check(section.text == r.collection.documents[0].passages[0].text,
               'The response title does not equal the provided text')
        _check(section.id == int(r.collection.documents[0].passages[0].infons.get('id')),
               'The response title is not correctly identified')

        # Check the Abstract
        _check(int(r.collection.documents[0].passages[1].offset) >= 1,
               'The abstract does not start after 0')
        section = document.section_set.last()
        _check(section.text == r.collection.documents[0].passages[1].text,
               'The response abstract does not equal the provided text')
        _check(section.id == int(r.collection.documents[0].passages[1].infons.get('id')),
               'The response abstract is not correctly identified')

        return True
    except Exception:
        client.captureException()
        return False
def get_instance(self):
    """Return the parsed PubTator BioC reader instance, or None if invalid.

    Parses `self.content` with BioCReader; any parse/read failure is
    treated as "invalid".
    """
    try:
        r = BioCReader(source=self.content)
        r.read()
        return r
    except Exception:
        # Bug fix: previously returned False, contradicting the documented
        # "or None" contract. Truthiness-based callers are unaffected.
        return None
def count_annotations(self):
    """Return an int count of all types of ER annotations in the Pubtator instance.

    If none are found or the document is invalid, return 0.
    """
    try:
        reader = BioCReader(source=self.content)
        reader.read()
        total = 0
        for passage in reader.collection.documents[0].passages:
            total += len(passage.annotations)
        return total
    except Exception:
        return 0
def entity_recognition_df(self, documents=[], users=[], include_pubtator=True, writer=None):
    """Return a pandas DataFrame (DF_COLUMNS) of entity-recognition results.

    Combines user-submitted ER annotations fetched via raw SQL with
    (optionally) the stored Pubtator annotations for the same documents.

    Args:
        documents: iterable of Document instances, digit strings or ints
            restricting the query; empty means "all documents".
        users: iterable of User instances, digit strings or ints
            restricting the query; empty means "all users".
        include_pubtator: when True, also merge the stored Pubtator
            responses (Disease / Gene / Chemical concepts) into the frame.
        writer: optional pre-built BioC writer; generated when omitted.

    Returns:
        pd.DataFrame with DF_COLUMNS columns (possibly empty).
    """
    # Hoisted out of the `if len(documents)` branch: Document is also needed
    # below for the writer fallback even when no document filter is given.
    from .models import Document

    # NOTE: the mutable [] defaults are never mutated here, so the shared
    # default instances are harmless.
    doc_arr = []
    if len(documents):
        for d in documents:
            if type(d) == Document:
                doc_arr.append(str(d.pk))
            # Parentheses fix: `and` binds tighter than `or`, so previously
            # ANY str was accepted without the isdigit() check -- these
            # values are interpolated into raw SQL below.
            elif (type(d) is str or type(d) is unicode) and d.isdigit():
                doc_arr.append(d)
            elif type(d) is int or type(d) is long:
                doc_arr.append(str(d))
        filter_doc_level = 'WHERE `document_section`.`document_id` IN ({0})'.format(
            ','.join(doc_arr))
    else:
        filter_doc_level = ''

    if len(users):
        from django.contrib.auth.models import User
        user_arr = []
        for u in users:
            if type(u) == User:
                user_arr.append(str(u.pk))
            # Bug fix: this previously tested `d.isdigit()` (the document
            # loop variable) instead of `u`, and had the same `or`/`and`
            # precedence problem as above.
            elif (type(u) is str or type(u) is unicode) and u.isdigit():
                user_arr.append(u)
            elif type(u) is int:
                user_arr.append(str(u))
        filter_user_level = '{0} `document_view`.`user_id` IN ({1})'.format(
            'WHERE' if filter_doc_level == '' else 'AND',
            ','.join(user_arr))
    else:
        filter_user_level = ''

    content_type_id = str(ContentType.objects.get_for_model(
        EntityRecognitionAnnotation.objects.first()).id)

    df_arr = []
    with open('mark2cure/document/commands/get-er-results.sql', 'r') as f:
        cmd_str = f.read()
    cmd_str = cmd_str.format(content_type_pk=content_type_id,
                             filter_doc_level=filter_doc_level,
                             filter_user_level=filter_user_level)

    c = connection.cursor()
    try:
        c.execute(cmd_str)

        # Get the full writer in advance!
        if not writer:
            writer = Document.objects.as_writer(documents=documents)

        res = list(c.fetchall())

        # Group the rows by document pk (column 5) to reduce BioCDocument
        # offset dict lookups
        for key, doc_group in groupby(res, lambda x: x[5]):
            bioc_documents = [bioc_doc for bioc_doc in writer.collection.documents
                              if bioc_doc.infons.get('document_pk') == str(key)]

            # If a pubtator doesn't exist for the document, we can't include
            # any annotations as the passage offsets need to come from Pubtator
            if len(bioc_documents) == 1:
                # Use the BioC pubtator file for the offset values
                offset_dict = {}
                for passage in bioc_documents[0].passages:
                    offset_dict[int(passage.infons.get('id'))] = passage.offset

                for x in doc_group:
                    df_arr.append(self._create_er_df_row(
                        uid=x[0], source='db', user_id=x[8],
                        text=x[2], ann_type_idx=x[1],
                        document_pk=x[5],
                        section_id=x[7], section_offset=offset_dict[x[7]],
                        offset_relative=True,
                        start_position=x[3], length=len(x[2])))
    finally:
        c.close()

    if include_pubtator:
        # Merge the 3 different pubtator responses into one frame. It
        # performs selective ordering and precedence for some annotation
        # types / instances.
        # NOTE(review): with an empty `documents` filter, doc_arr is empty
        # and the generated `IN ()` clause will fail at execute() --
        # previously this path raised a NameError on doc_arr instead.
        # Confirm callers always pass documents when include_pubtator=True.
        with open('mark2cure/document/commands/get-er-pubtator-results.sql', 'r') as f:
            cmd_str = f.read()
        cmd_str = cmd_str.format(','.join(doc_arr))

        c = connection.cursor()
        try:
            c.execute(cmd_str)
            res = list(c.fetchall())
        finally:
            c.close()

        # Counter({'Disease': 3676, 'Chemical': 2928, 'Species': 1553,
        #          'Gene': 1544, 'FamilyName': 536, 'DomainMotif': 20})
        # (Sampling from DB 11/30/2016)
        pubtator_types = ['Disease', 'Gene', 'Chemical']

        for pubtator_content in res:
            r = BioCReader(source=pubtator_content[2])
            r.read()
            bioc_document = r.collection.documents[0]
            section_ids = pubtator_content[3].split(',')

            # Iterate over all the annotations in both passages
            for p_idx, passage in enumerate(bioc_document.passages):
                for annotation in passage.annotations:
                    # Determine some meta-data (UID info) about the
                    # BioCAnnotation
                    annotation_type = None
                    uid_type = None
                    uid = None
                    for key in annotation.infons.keys():
                        if key == 'type':
                            annotation_type = annotation.infons.get(key, None)
                        else:
                            uid_type = key
                            uid = annotation.infons.get(uid_type, None)

                    # We're only interested in Pubtator Annotations that are
                    # the same concepts users highlight
                    if annotation_type in pubtator_types:
                        start, length = str(annotation.locations[0]).split(':')
                        df_arr.append(self._create_er_df_row(
                            uid=uid,
                            source=uid_type if uid_type else None,
                            user_id=None,
                            text=annotation.text,
                            ann_type_idx=pubtator_types.index(annotation_type),
                            document_pk=pubtator_content[1],
                            section_id=section_ids[p_idx],
                            section_offset=passage.offset,
                            offset_relative=False,
                            start_position=start,
                            length=length))

    return pd.DataFrame(df_arr, columns=DF_COLUMNS)
def as_writer(self, documents=[]):
    '''Return a blank (annotation-free) BioC writer seeded from pubtator content.

    Problems: This requires every document to have at least 1 pubtator model.
    Pros: This prevents us from generating our own BioC file which may
    have inconsistencies.

    Args:
        documents: non-empty iterable of Document instances, digit strings
            or ints.

    Raises:
        ValueError: when `documents` is empty.
    '''
    if len(documents):
        from .models import Document
        doc_arr = []
        for d in documents:
            if type(d) == Document:
                doc_arr.append(str(d.pk))
            # Parentheses fix: `and` binds tighter than `or`, so previously
            # ANY str was accepted without the isdigit() check -- these
            # values are interpolated into raw SQL below.
            elif (type(d) is str or type(d) is unicode) and d.isdigit():
                doc_arr.append(d)
            elif type(d) is int or type(d) is long:
                doc_arr.append(str(d))
        str_doc_arr = list(set(doc_arr))
    else:
        raise ValueError('No documents supplied to generator writer')

    with open('mark2cure/document/commands/get-pubtators.sql', 'r') as f:
        cmd_str = f.read()
    cmd_str = cmd_str.format(','.join(str_doc_arr))

    c = connection.cursor()
    try:
        c.execute(cmd_str)
        res = [(x[0], x[1], x[2]) for x in c.fetchall()]
    finally:
        c.close()

    writer = bioc_writer(None)
    for pubtator_content in res:
        section_ids = pubtator_content[2].split(',')

        r = BioCReader(source=pubtator_content[1])
        r.read()
        doc = r.collection.documents[0]
        doc.put_infon('document_pk', str(pubtator_content[0]))

        for idx, passage in enumerate(doc.passages):
            passage.clear_annotations()
            # NOTE(review): assumes exactly 2 passages (title + abstract);
            # a 3rd passage would IndexError here -- confirm upstream data.
            passage.put_infon('section', ['title', 'paragraph'][idx])
            passage.put_infon('id', str(section_ids[idx]))

        writer.collection.add_document(doc)
        str_doc_arr.remove(str(pubtator_content[0]))

    # Capture all the documents not available via pubtators
    for document_pk_str in str_doc_arr:
        # Can optimize this model retrieval but should rarely occur
        document_model = Document.objects.get(pk=document_pk_str)

        bioc_document = BioCDocument()
        bioc_document.id = str(document_model.document_id)
        bioc_document.put_infon('document_pk', document_pk_str)

        passage_offset = 0
        for idx, section in enumerate(document_model.available_sections()):
            passage = BioCPassage()
            passage.put_infon('section', ['title', 'paragraph'][idx])
            passage.put_infon('id', str(section.pk))
            # (TODO) Missing a "type" infon?
            passage.text = section.text
            passage.offset = str(passage_offset)
            # +1 accounts for the separator between concatenated sections
            passage_offset += len(passage.text) + 1
            bioc_document.add_passage(passage)

        writer.collection.add_document(bioc_document)

    return writer