def _get_single_entity_mentions(self, entity: str, pageSize: int = None):
    """
    Generator that yields each article (and its article id) that mentions
    the given entity.

    See https://europepmc.org/AnnotationsApi!/annotations45api45controller/getAnnotationsArticlesByEntityUsingGET
    The articles come up sorted by number of mentions.
    """
    if pageSize is None:
        pageSize = PMC_Europe_Service.MAX_PAGE_SIZE
    # Europe PMC cursor pagination: the final page returns the same
    # cursor it was queried with, which terminates the loop.
    prevCursorMark, cursorMark = -1, 0
    fetched = 0
    while cursorMark != prevCursorMark:
        page_url = PMC_Europe_Service.MENTION_URL.format(
            entity, 1, 'ID_LIST', cursorMark, pageSize)
        logger.info(
            f'{datetime.datetime.now()} Getting {fetched} to {fetched+pageSize}'
        )
        response = requests.get(page_url)
        assert response.ok
        logger.info(f'{datetime.datetime.now()} Ok')
        payload = json.loads(response.content.decode().strip())
        prevCursorMark, cursorMark = cursorMark, payload['nextCursorMark']
        for article in payload['articles']:
            fetched += 1
            yield article, article['extId']
def get_co_occurrences(self,
                       entity: str,
                       limit: int = 20,
                       types: List[str] = None) -> List[CoOccurrence]:
    """Returns a list of co-occurrences from a given entity.

    Not supported by this service: always raises
    TextMiningServiceOperationNotSupported.
    """
    # FIX: the docstring was originally a bare string expression placed
    # after the logger call, making it dead code invisible to help()/IDEs.
    logger.info('get co occurrences')
    raise TextMiningServiceOperationNotSupported
def _get_mentions_for_multiple_entities(self,
                                        entities: List[str],
                                        limit: int = 20
                                        ) -> List[Publication]:
    """
    Method for multiple entities retrieval: returns publications that
    mention the first entity AND every other entity, best-scored first.

    It's slow but a bit faster than _get_mentions_for_single_entity
    if there is a limit.
    """
    entities = list(map(str.lower, entities))
    first_entity = entities[0]
    rest_of_entities = entities[1:]
    # With a single entity every article matches, so `limit` bounds the
    # page size; otherwise fetch full pages since most are filtered out.
    if len(rest_of_entities) == 0:
        pageSize = min(PMC_Europe_Service.MAX_PAGE_SIZE, limit)
    else:
        pageSize = PMC_Europe_Service.MAX_PAGE_SIZE
    prevCursorMark = -1
    cursorMark = 0
    total_counter = 0
    yielded_counter = 0
    publications = []
    scores = []
    # Pagination ends when the cursor stops advancing (last page) or
    # enough matches have been collected.
    while cursorMark != prevCursorMark and len(publications) < limit:
        url = PMC_Europe_Service.MENTION_URL.format(
            first_entity, 0, 'JSON', cursorMark, pageSize)
        logger.info(
            f'{datetime.datetime.now()} Getting {total_counter} to {total_counter+pageSize}'
        )
        results = requests.get(url)
        assert results.ok
        logger.info(f'{datetime.datetime.now()} Ok')
        data = json.loads(results.content.decode().strip())
        prevCursorMark = cursorMark
        cursorMark = data['nextCursorMark']
        for article in data['articles']:
            # Per-article mention counts for the remaining entities.
            bool_table = dict(
                zip(rest_of_entities, [0] * len(rest_of_entities)))
            total_counter += 1
            for annotation in article['annotations']:
                other_entity = annotation['exact'].lower()
                # check if this entity is what we look for
                if other_entity in bool_table:
                    bool_table[other_entity] += 1
            # if the article includes all entities, then keep it
            if all(bool_table.values()):
                yielded_counter += 1
                # BUG FIX: the original referenced an undefined name
                # `article_id`; the id lives in article['extId'].
                # .get(...) also tolerates articles without a pmcid,
                # matching get_mentions' behavior.
                pub = Publication(pm_id=article.get('extId', None),
                                  pmc_id=article.get('pmcid', None))
                publications.append(pub)
                scores.append(sum(bool_table.values()))
    # Sort by descending mention score; BUG FIX: honor `limit` (the loop
    # may overshoot within the last fetched page) and return a list to
    # match the annotated List[Publication] return type.
    publications = np.array(publications)
    scores = np.array(scores)
    inds = scores.argsort()[::-1]
    return list(publications[inds][:limit])
def get_mentions(self, entities: List[str],
                 limit: int = 20) -> List[Publication]:
    """Return up to `limit` publications mentioning the given entities.

    Only single-entity queries are supported; anything else raises
    TextMiningServiceOperationNotSupported.
    """
    logger.info('get mentions')
    entities = [self._convert_entity(entity) for entity in entities]
    # Guard clause: multi-entity retrieval is not supported here.
    if len(entities) != 1:
        raise TextMiningServiceOperationNotSupported
    pageSize = min(limit + 1, PMC_Europe_Service.MAX_PAGE_SIZE)
    mention_stream = self._get_single_entity_mentions(entities[0],
                                                      pageSize=pageSize)
    publications = []
    for article, _ in mention_stream:
        if len(publications) == limit:
            break
        publications.append(
            Publication(pm_id=article.get('extId', None),
                        pmc_id=article.get('pmcid', None)))
    return publications
def _convert_entity(self, entity: str) -> str:
    """Resolve an ontology id (e.g. 'DOID:2841') to its human-readable
    label via the EBI OLS API.

    Returns the original `entity` unchanged whenever resolution is not
    possible: no ':' separator, HTTP error, or no term whose obo_id
    matches.
    """
    EBI_URL = 'https://www.ebi.ac.uk/ols/api/ontologies/{}/terms?short_form={}_{}'
    try:
        ontology, number = entity.split(':', 1)
    except ValueError:
        # FIX: "spliting" typo in the log message.
        logger.info(f'(_convert_entity) Error splitting entity {entity}')
        return entity
    url = EBI_URL.format(ontology, ontology, number)
    response = requests.get(url)
    # FIX: was `try: assert response.ok except AssertionError` — asserts
    # are stripped under `python -O`, silently disabling this fallback.
    if not response.ok:
        return entity
    data = json.loads(response.content.decode().strip())
    # FIX: .get(...) guards against OLS responses with no '_embedded'
    # section (unknown ontology) or terms lacking an 'obo_id' key.
    for term in data.get('_embedded', {}).get('terms', []):
        if term.get('obo_id') == entity:
            label = term['label']
            logger.info(
                f'(_convert_entity) Found label {label} for entity {entity}'
            )
            return label
    return entity
# NOTE(review): the three statements below look like an orphaned duplicate
# of the tail of _get_mentions_for_multiple_entities (this copy DOES apply
# the [:limit] slice, unlike the method defined earlier) — confirm against
# the full file; as free-standing statements a bare `return` is invalid
# outside a function.
scores = np.array(scores)
inds = scores.argsort()[::-1]
return publications[inds][:limit]

# NOTE(review): duplicate definition — an identical get_co_occurrences
# also appears earlier in this file; in Python the later binding wins.
# The bare string after logger.info is a no-op expression, not a docstring.
def get_co_occurrences(self,
                       entity: str,
                       limit: int = 20,
                       types: List[str] = None) -> List[CoOccurrence]:
    logger.info('get co occurrences')
    """Returns a list of co-occurrences from a given entity
    """
    raise TextMiningServiceOperationNotSupported


# Manual smoke test: exercises entity-label conversion and single-entity
# mention retrieval against the live Europe PMC / EBI OLS services.
if __name__ == "__main__":
    logger.info('PMC')
    pmc = PMC_Europe_Service()
    logger.info('Test convert entity')
    print(pmc._convert_entity('DOID:2841'))
    logger.info('get mentions for single entity PRDM1')
    for pub in pmc.get_mentions(['DOID:7148']):
        print(pub)
        print(datetime.datetime.now())
    # right now it raises TextMiningServiceOperationNotSupported
    # print('get mentions for multiple entities PRDM1, GFP')
    # for pub in pmc.get_mentions(['PRDM1', 'GFP']):
    #     print(pub)
    #     print(datetime.datetime.now())