class EuropePmcReportBuilder:
    """Class that builds the EuropePMC report. The report is the result of cross-referencing the publication
    information that submitters give us with externally sourced publication information found in EuropePMC. This
    allows us to check for discrepancies / differences."""

    def __init__(self, study_list: List[str], user_token: str, wsc: WsClient, iac: IsaApiClient):
        """Init method

        Sets up a headers register (as we are hitting the same endpoint twice, but with different formats) and a
        set of base parameters for requests to the EuropePMC API.

        :param study_list: A list of studies to iterate over, querying EuropePMC for each one.
        :param user_token: User token for use with the java webservice; must belong to a curator or the request
            will have failed prior.
        :param wsc: WsClient that interfaces with the java webservice.
        :param iac: IsaApiClient, used to get study information.
        """
        self.study_list = study_list
        self.user_token = user_token
        self.wsc = wsc
        self.iac = iac
        self.session = requests.Session()
        self.europe_pmc_url = 'https://www.ebi.ac.uk/europepmc/webservices/rest/search'
        self.headers_register = {
            'article': {'Accept': 'application/json'},
            'citation_ref': {'Accept': 'application/xml'}
        }
        self.base_params = CascaDict({
            'resultType': 'core',
            'format': 'JSON',
            'cursorMark': '*',
            'pageSize': '5',
            'fromSearchPost': False,
            'query': ''
        })

    def build(self, drive) -> str:
        """
        Get a list of result dicts (each of which represents a row) and try to build a dataframe out of them. If
        successful, save that dataframe as a csv file to our reporting directory and return a message indicating
        success. If not successful, log the error and return a message indicating failure.

        :param drive: Flag indicating whether to save the report to Google Drive.
        :return: A message as a string indicating success or failure.
        """
        list_of_result_dicts = [
            row for study in self.study_list for row in self.process(study)
        ]
        path = app.config.get('MTBLS_PRIVATE_FTP_ROOT') + '/' + app.config.get(
            'REPORTING_PATH') + 'global/europepmc.csv'
        try:
            report_dataframe = pandas.DataFrame(
                list_of_result_dicts,
                columns=[
                    'Identifier', 'Title', 'Submission Date', 'Status',
                    'Release Date', 'PubmedID', 'DOI', 'Author List',
                    'Publication Date', 'Citation Reference',
                    'Publication in MTBLS', 'Journal in EuropePMC',
                    'Released before curated?'
                ])
            if drive is False:
                report_dataframe.to_csv(path, sep='\t')
                msg = 'EuropePMC report successfully saved to {0}'.format(path)
                logger.info(msg)
            else:
                try:
                    setGoogleSheet(report_dataframe,
                                   app.config.get('EUROPE_PMC_REPORT'),
                                   'europe_pmc_report',
                                   app.config.get('GOOGLE_SHEET_TOKEN'))
                    msg = 'Saved report to google drive.'
                except Exception as e:
                    abort(500, str(e))
        except Exception as e:
            msg = 'Problem in building and saving europe pmc report: {0}'.format(e)
            logger.error(msg)
            abort(500, msg)
        return msg

    def process(self, study_id) -> List:
        """
        Process an individual study_id from the study list. First ping our java webservice to get some basic
        information about the study. Then we ping the IsaApi client so that we can get title and publication
        information. We then iterate over the publications from the IAC, pinging EuropePMC for each one and
        creating a dict for each.

        :param study_id: Current study_id to process.
        :return: List of dicts that each represent a row in the generated report.
        """
        row_dicts = []
        self.session.headers.update(self.headers_register['article'])
        # kind of unsavoury to unpack all of this, but it saves writing another method that does much the same thing
        is_curator, read_access, write_access, obfuscation_code, study_location, release_date, submission_date, \
            study_status = self.wsc.get_permissions(study_id, self.user_token)
        base_return_dict = CascaDict({
            'Identifier': study_id,
            'Title': 'N/A',
            'Submission Date': submission_date,
            'Status': study_status,
            'Release Date': release_date,
            'PubmedID': 'N/A',
            'DOI': 'N/A',
            'Author List': 'N/A',
            'Publication Date': 'N/A',
            'Citation Reference': 'N/A',
            'Publication in MTBLS': 'N/A',
            'Journal in EuropePMC': 'N/A',
            'Released before curated?': 'N/A'
        })
        isa_study, isa_inv, std_path = self.iac.get_isa_study(
            study_id,
            self.user_token,
            skip_load_tables=True,
            study_location=study_location,
            failing_gracefully=True)

        # if get_isa_study has failed, isa_study will come back as None, and so we won't have any publication
        # information to work with. So we just return the very basic dict.
        if isa_study is None:
            row_dicts.append(base_return_dict)
            return row_dicts

        title = isa_study.title
        publications = isa_study.publications
        fresh_params = self.base_params.cascade({
            'query': title,
            'format': 'JSON'
        })
        # here we just search the article title rather than the specific publication
        europepmc_study_search_results = self.session.get(
            self.europe_pmc_url, params=fresh_params).json()

        # if there is an issue with the query then just return the basic details dict.
        if 'resultList' not in europepmc_study_search_results:
            row_dicts.append(base_return_dict.cascade({'Title': title}))
            return row_dicts

        culled_results = [
            result for result in europepmc_study_search_results['resultList']['result']
            if fuzz.ratio(result['title'], title) > 80
        ]

        if len(culled_results) > 0:
            for pub in publications:
                logger.info(pub)
                result = self.has_mapping(pub, culled_results)
                if result:
                    logger.info('hit ' + str(result))
                    temp_dict = base_return_dict.cascade({
                        'Title': title,
                        'PubmedID': result['pmid'],
                        'DOI': pub.doi,
                        'Author List': pub.author_list,
                        'Publication Date': result['journalInfo']['printPublicationDate'],
                        'Citation Reference': self.get_citation_reference(title),
                        'Publication in MTBLS': pub.title,
                        'Journal in EuropePMC': result['journalInfo']['journal']['title'],
                        'Released before curated?': self.assess_if_transgressed(study_status, result['journalInfo'])
                    })
                else:
                    temp_dict = base_return_dict.cascade({
                        'Title': title,
                        'PubmedID': pub.pubmed_id,
                        'DOI': pub.doi,
                        'Author List': pub.author_list,
                        'Publication Date': 'N/A',
                        'Citation Reference': self.get_citation_reference(title),
                        'Publication in MTBLS': pub.title,
                        'Journal in EuropePMC': 'N/A',
                        'Publication the same?': False,
                        'Released before curated?': 'N/A'
                    })
                row_dicts.append(temp_dict)

        if len(publications) == 0:
            row_dicts.append(base_return_dict)
        return row_dicts

    @staticmethod
    def has_mapping(publication, resultset):
        """Check whether a given publication has a match in the EuropePMC resultset."""
        for result in resultset:
            logger.info(result['source'] + str(len(result['source'])))
            if result['source'] == 'PPR':
                # preprint, so it doesn't have an actual title.
                continue
            else:
                score = fuzz.ratio(result['title'], publication.title)
                logger.info('HASMAPPING: ' + str(score) + 'MTB: ' +
                            publication.title + '/PMC: ' + result['title'])
                if score > 80:
                    return result
        return None

    @staticmethod
    def assess_if_transgressed(status, europe_pmc_publication) -> Union[bool, str]:
        """Check whether the journal article has been published despite the study not being public."""
        logger.info('ASSESSIF' + str(europe_pmc_publication))
        if 'printPublicationDate' in europe_pmc_publication:
            journal_publication_date = datetime.strptime(
                europe_pmc_publication['printPublicationDate'], '%Y-%m-%d')
            logger.info('ASSESSIF' + str(journal_publication_date))
            now = datetime.now()
            return status.upper() != 'PUBLIC' and now > journal_publication_date
        else:
            return 'No publication date given.'

    def get_citation_reference(self, title) -> str:
        """Cascade a new param dict to use in the request and update the session headers to XML, as the search
        endpoint on the EuropePMC API only returns the bibliographicCitation information if you specify the DC
        format (which is a kind of XML). Turn the resulting XML string into a dict, and then return the citation
        from that dict.

        :param title: Article title to get the citation for.
        :return: Bibliographic citation as a string."""
        fresh_params = self.base_params.cascade({
            'format': 'DC',
            'query': title
        })
        self.session.headers.update(self.headers_register['citation_ref'])
        response = self.session.get(self.europe_pmc_url, params=fresh_params)
        response_xmldict = xmltodict.parse(response.text)
        # the type is infuriatingly not consistent in responses from EuropePMC, so we have to handle it ourselves.
        if type(response_xmldict['responseWrapper']['rdf:RDF']['rdf:Description']) is list:
            return response_xmldict['responseWrapper']['rdf:RDF'][
                'rdf:Description'][0]['dcterms:bibliographicCitation']
        else:
            return response_xmldict['responseWrapper']['rdf:RDF'][
                'rdf:Description']['dcterms:bibliographicCitation']
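
# A minimal, standalone sketch of the title-matching flow used in process(): query the EuropePMC
# search endpoint for a study title and keep only results whose title is a close fuzzy match.
# The example title is made up, and a plain dict stands in for the CascaDict-layered parameters
# the report builder uses; the endpoint, fields and the 80% cut-off mirror the code above.
def _europepmc_title_matches_sketch(title='A metabolomics study of something'):
    session = requests.Session()
    session.headers.update({'Accept': 'application/json'})
    params = {'resultType': 'core', 'format': 'JSON', 'cursorMark': '*',
              'pageSize': '5', 'query': title}
    results = session.get('https://www.ebi.ac.uk/europepmc/webservices/rest/search',
                          params=params).json()
    if 'resultList' not in results:
        return []
    # same 80% title-similarity cut-off the report builder applies before looking for a mapping
    return [r for r in results['resultList']['result'] if fuzz.ratio(r['title'], title) > 80]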
class TestCascaDict(unittest.TestCase):

    def setUp(self):
        self.cd_root = CascaDict([('name', 'Root'), ('color', 'Root color'), ('lvl', 0)])  # usual, args-based init
        self.cd_level1 = CascaDict(name='Lvl1', color='Lvl1 color', lvl=1, ancestor=self.cd_root)  # kwargs-based init
        self.cd_level2 = self.cd_level1.cascade()
        self.cd_level2.update({'name': 'Lvl2', 'color': 'Lvl2 color', 'lvl': 2,
                               'nest': {'name': 'nested_lvl_2', 'lvl': 22,
                                        'color': 'nested_color lvl2'}})  # cascade & update style init
        # Insert something into level1
        self.cd_level1['test_insert'] = 'contents'
        # Insert something only into level1
        self.cd_level1['test_insert_level1'] = 'contents-lvl1'
        # Insert something at root level
        self.cd_root['test_insert_root'] = 'contents-root'
        # Insert something at root level which has the same name as the level1 key
        self.cd_root['test_insert'] = 'contents_root_only'

    def test_insert(self):
        self.assertTrue(self.cd_level1['test_insert'] == 'contents')

    def test_insert_level(self):
        self.assertRaises(KeyError, access_key, self.cd_root, 'test_insert_level1')

    def test_getitem(self):
        self.assertTrue(self.cd_level1['test_insert_root'] == 'contents-root')

    def test_getitem_level(self):
        self.assertTrue(self.cd_root['test_insert'] == 'contents_root_only')

    def test_get(self):
        self.assertTrue(self.cd_level1.get('test_insert_root') == 'contents-root')

    def test_get_default(self):
        self.assertTrue(self.cd_level1.get('test_nonexistent', 'response') == 'response')

    def test_has_key(self):
        self.assertTrue(self.cd_level1.has_key('test_insert_root'))

    def test_contains(self):
        self.assertTrue('test_insert_root' in self.cd_level1)

    def test_final_dict(self):
        print(self.cd_level1.final_dict)

    def test_flatten_dict_top(self):
        temp = self.cd_level1.__flatten__()
        print(temp)
        self.assertTrue(temp['name'] == 'Lvl1')

    def test_flatten_dict_bottom(self):
        temp = self.cd_level1.__flatten__(level='bottom')
        print(temp)
        self.assertTrue(temp['name'] == 'Root')

    def test_get_cascaded(self):
        temp = self.cd_level2.get_cascaded('lvl')
        print(temp)
        self.assertTrue(temp == [2, 1, 0])

    def test_get_cascaded_default(self):
        temp = self.cd_level2.get_cascaded('lvl_nonexistent', 'nic')
        self.assertTrue(temp == 'nic')

    def test_items(self):
        print(self.cd_level2.items())

    def test_inherit(self):
        temp = self.cd_level2.cascade({'name': 'lvl3', 'lvl': 3})
        self.assertTrue(temp['name'] == 'lvl3')

    def test_repr(self):
        print(self.cd_level2)

    def test_delete_valid(self):
        del self.cd_level2['color']
        self.assertTrue(self.cd_level2['color'] == 'Lvl1 color')

    def test_delete_invalid(self):
        def delsomething():
            del self.cd_level2['color']
            del self.cd_level2['color']
        self.assertRaises(CascaDictError, delsomething)

    def test_pickle(self):
        self.cd_level2['nest']['lvl'] = 23
        ptemp = pickle.dumps(self.cd_level2)
        temp = pickle.loads(ptemp)
        print(temp['nest'].get_cascaded('lvl'))
        self.assertTrue(temp['nest'].get_cascaded('lvl') == [23, 22])

    def test_nesting(self):
        self.cd_level2['nest']['color'] = 'nested overriden color'
        print(self.cd_level2['nest'].get_cascaded('color'))
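
# A small illustrative sketch of the cascading behaviour the tests above exercise: a child
# CascaDict overrides keys locally, missing keys fall through to its ancestor, and
# get_cascaded() collects every level's value, child first. The keys and values here are
# made up for the example; only the CascaDict calls shown elsewhere in this file are used.
def _cascadict_demo_sketch():
    root = CascaDict({'name': 'Root', 'lvl': 0})
    child = root.cascade({'name': 'Child', 'lvl': 1})
    root['only_in_root'] = 'from root'
    assert child['name'] == 'Child'              # overridden locally in the child
    assert root['name'] == 'Root'                # the ancestor is left untouched
    assert child['only_in_root'] == 'from root'  # missing keys fall through to the ancestor
    assert child.get_cascaded('lvl') == [1, 0]   # child-first list of every level's value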