def __init__(self, study_list: List[str], user_token: str, wsc: WsClient,
                 iac: IsaApiClient):
        """Store the collaborators and prepare the EuropePMC request settings.

        The same EuropePMC endpoint is queried twice with different formats, so a register of ``Accept`` headers
        (one entry per request flavour) is kept alongside a set of base query parameters that individual requests
        cascade from.

        :param study_list: A list of studies to iterate over, throwing each at europePMC.
        :param user_token: User token for use with javawebservice, must be curator or will have failed prior.
        :param wsc: WsClient that interfaces with the java webservice.
        :param iac: IsaApiClient, used to get study information.
        """
        # Inputs and service clients handed in by the caller.
        self.study_list, self.user_token = study_list, user_token
        self.wsc, self.iac = wsc, iac

        # A single HTTP session is reused for every EuropePMC request.
        self.session = requests.Session()
        self.europe_pmc_url = 'https://www.ebi.ac.uk/europepmc/webservices/rest/search'

        # Accept headers keyed by the kind of request being made.
        json_headers = {'Accept': 'application/json'}
        xml_headers = {'Accept': 'application/xml'}
        self.headers_register = {
            'article': json_headers,
            'citation_ref': xml_headers
        }

        # Defaults shared by every query; callers cascade their own overrides on top.
        default_params = {
            'resultType': 'core',
            'format': 'JSON',
            'cursorMark': '*',
            'pageSize': '5',
            'fromSearchPost': False,
            'query': ''
        }
        self.base_params = CascaDict(default_params)
예제 #2
0
 def setUp(self):
     """Create a root -> level1 -> level2 chain of CascaDicts and seed it with test keys."""
     # Root level: the usual, positional-argument based init.
     root_items = [('name', 'Root'), ('color', 'Root color'), ('lvl', 0)]
     self.cd_root = CascaDict(root_items)

     # Level 1: keyword-argument based init, with the root as its ancestor.
     self.cd_level1 = CascaDict(name='Lvl1', color='Lvl1 color', lvl=1, ancestor=self.cd_root)

     # Level 2: cascade-then-update style init, including a nested dict.
     nested_values = {'name': 'nested_lvl_2', 'lvl': 22, 'color': 'nested_color lvl2'}
     level2_values = {'name': 'Lvl2', 'color': 'Lvl2 color', 'lvl': 2, 'nest': nested_values}
     self.cd_level2 = self.cd_level1.cascade()
     self.cd_level2.update(level2_values)

     # Keys inserted at level 1: one that the root will also define, and one
     # that exists at level 1 only.
     self.cd_level1['test_insert'] = 'contents'
     self.cd_level1['test_insert_level1'] = 'contents-lvl1'

     # Keys inserted at the root: one unique to the root, and one sharing its
     # name with the level 1 key above.
     self.cd_root['test_insert_root'] = 'contents-root'
     self.cd_root['test_insert'] = 'contents_root_only'
class EuropePmcReportBuilder:
    """Class that builds the EuropePMC Report. The report is a result of cross referencing the publication information
    that submitters give us with externally sourced publication information found in EuropePMC. This allows us to check
    for discrepancies / differences."""

    # Column headers of the generated report. Every row dict produced by `process` must use exactly these keys,
    # otherwise pandas silently drops the value and leaves the column empty.
    REPORT_COLUMNS = (
        'Identifier', 'Title', 'Submission Date', 'Status',
        'Release Date', 'PubmedID', 'DOI', 'Author List',
        'Publication Date', 'Citation Reference',
        'Publication in MTBLS', 'Journal in EuropePMC',
        'Released before curated?'
    )

    def __init__(self, study_list: List[str], user_token: str, wsc: WsClient,
                 iac: IsaApiClient):
        """Init method
        Sets up a headers register (as we are hitting the same endpoint twice, but with different formats) and a set of
        base parameters for requests to the europePMC API.

        :param study_list: A list of studies to iterate over, throwing each at europePMC.
        :param user_token: User token for use with javawebservice, must be curator or will have failed prior.
        :param wsc: WsClient that interfaces with the java webservice.
        :param iac: IsaApiClient, used to get study information.
        """
        self.study_list = study_list
        self.user_token = user_token
        self.wsc = wsc
        self.iac = iac
        self.session = requests.Session()
        self.europe_pmc_url = 'https://www.ebi.ac.uk/europepmc/webservices/rest/search'
        self.headers_register = {
            'article': {
                'Accept': 'application/json'
            },
            'citation_ref': {
                'Accept': 'application/xml'
            }
        }
        self.base_params = CascaDict({
            'resultType': 'core',
            'format': 'JSON',
            'cursorMark': '*',
            'pageSize': '5',
            'fromSearchPost': False,
            'query': ''
        })

    def build(self, drive) -> str:
        """
        Get a list of result dicts (each of which represent a row) and try to build a dataframe out of them. If
        successful, save that dataframe as a tab separated file to our reporting directory (or push it to a google
        sheet), and return a message indicating success. If not successful, log the error and abort the request
        with a 500.

        :param drive: flag to indicate whether to save the report to google drive. Only the literal ``False``
            selects the local file; any other value selects google drive.
        :return: A message as a string indicating success.
        """
        list_of_result_dicts = [
            row for study in self.study_list for row in self.process(study)
        ]
        path = app.config.get('MTBLS_PRIVATE_FTP_ROOT') + '/' + app.config.get(
            'REPORTING_PATH') + 'global/europepmc.csv'
        try:
            report_dataframe = pandas.DataFrame(
                list_of_result_dicts, columns=list(self.REPORT_COLUMNS))
            if drive is False:
                report_dataframe.to_csv(path, sep='\t')
                msg = 'EuropePMC report successfully saved to {0}'.format(path)
            else:
                # No dedicated try/abort here: a failure is handled (logged and turned into a 500) by the
                # handler below, which previously re-caught the inner abort and wrapped its message twice.
                setGoogleSheet(report_dataframe,
                               app.config.get('EUROPE_PMC_REPORT'),
                               'europe_pmc_report',
                               app.config.get('GOOGLE_SHEET_TOKEN'))
                msg = 'Saved report to google drive.'
            logger.info(msg)
        except Exception as e:
            msg = 'Problem in building and saving europe pmc report: {0}'.format(
                e)
            logger.error(msg)
            abort(500, msg)

        return msg

    def process(self, study_id) -> List:
        """
        Process an individual study_id from the study list. First ping our java webservice to get some basic information
        about the study. Then we ping the IsaApi client so that we can get title and publication information.
        We then search europePMC for the study title and try to map each publication from the IAC onto the search
        results, creating a dict for each publication.

        Every returned dict uses the keys listed in ``REPORT_COLUMNS``; at least one row is always returned so that
        no study silently disappears from the report.

        :param study_id: current study_id to process.
        :return: List of Dicts that each represent a row in the generated report.
        """
        row_dicts = []
        # The article search wants JSON; get_citation_reference switches the shared session over to XML, so the
        # header has to be reset at the start of every study.
        self.session.headers.update(self.headers_register['article'])
        # Only the last four values of the permissions tuple are needed for the report.
        _, _, _, _, study_location, release_date, submission_date, study_status = \
            self.wsc.get_permissions(study_id, self.user_token)

        base_return_dict = CascaDict({
            'Identifier': study_id,
            'Title': 'N/A',
            'Submission Date': submission_date,
            'Status': study_status,
            'Release Date': release_date,
            'PubmedID': 'N/A',
            'DOI': 'N/A',
            'Author List': 'N/A',
            'Publication Date': 'N/A',
            'Citation Reference': 'N/A',
            'Publication in MTBLS': 'N/A',
            'Journal in EuropePMC': 'N/A',
            'Released before curated?': 'N/A'
        })

        isa_study, _, _ = self.iac.get_isa_study(
            study_id,
            self.user_token,
            skip_load_tables=True,
            study_location=study_location,
            failing_gracefully=True)

        # if get_isa_study has failed, isa_study will come back as None, and so we won't have any publication
        # information to work with. So we just return the very basic dict.
        if isa_study is None:
            row_dicts.append(base_return_dict)
            return row_dicts

        title = isa_study.title
        publications = isa_study.publications

        fresh_params = self.base_params.cascade({
            'query': title,
            'format': 'JSON'
        })
        # here we just search the article title rather than the specific publication
        europepmc_study_search_results = self.session.get(
            self.europe_pmc_url, params=fresh_params).json()
        # if there is an issue with query then just return the basic details dict.
        if 'resultList' not in europepmc_study_search_results:
            row_dicts.append(base_return_dict.cascade({'Title': title}))
            return row_dicts

        # Keep only the search results whose title closely resembles the study title. A result without a title
        # is compared as an empty string and is therefore discarded.
        culled_results = [
            result for result in europepmc_study_search_results['resultList']
            ['result'] if fuzz.ratio(result.get('title', ''), title) > 80
        ]
        if culled_results and publications:
            # The citation lookup only depends on the study title, so one request serves every publication.
            citation_reference = self.get_citation_reference(title)
            for pub in publications:
                logger.info(pub)
                result = self.has_mapping(pub, culled_results)
                if result:
                    logger.info('hit ' + str(result))
                    # Not every result carries journal information, a print publication date or a pmid, so
                    # fall back to 'N/A' instead of failing the whole report on a KeyError.
                    journal_info = result.get('journalInfo', {})
                    temp_dict = base_return_dict.cascade({
                        'Title': title,
                        'PubmedID': result.get('pmid', 'N/A'),
                        'DOI': pub.doi,
                        'Author List': pub.author_list,
                        'Publication Date':
                        journal_info.get('printPublicationDate', 'N/A'),
                        'Citation Reference': citation_reference,
                        'Publication in MTBLS': pub.title,
                        'Journal in EuropePMC':
                        journal_info.get('journal', {}).get('title', 'N/A'),
                        'Released before curated?':
                        self.assess_if_trangressed(study_status, journal_info)
                    })
                else:
                    temp_dict = base_return_dict.cascade({
                        'Title': title,
                        'PubmedID': pub.pubmed_id,
                        'DOI': pub.doi,
                        'Author List': pub.author_list,
                        'Publication Date': 'N/A',
                        'Citation Reference': citation_reference,
                        'Publication in MTBLS': pub.title,
                        'Journal in EuropePMC': 'N/A',
                        # Not one of REPORT_COLUMNS, so it does not appear in the built report; kept for
                        # any other consumer of the row dicts.
                        'Publication the same?': False,
                        'Released before curated?': 'N/A'
                    })
                row_dicts.append(temp_dict)

        if len(publications) == 0:
            row_dicts.append(base_return_dict)
        elif not row_dicts:
            # There are publications but no EuropePMC result resembled the study title: still report the study
            # with the details we have rather than dropping it from the report.
            row_dicts.append(base_return_dict.cascade({'Title': title}))

        return row_dicts

    @staticmethod
    def has_mapping(publication, resultset):
        """Check whether a given publication has a match in the europePMC resultset.

        :param publication: Publication (with a ``title`` attribute) taken from the study.
        :param resultset: Iterable of europePMC result dicts.
        :return: The first non-preprint result whose title has a fuzzy ratio above 80 against the publication
            title, or None if there is no such result.
        """
        for result in resultset:
            logger.info(result['source'] + str(len(result['source'])))
            if result['source'] == 'PPR':
                # preprint so doesnt have an actual title.
                continue
            result_title = result.get('title', '')
            score = fuzz.ratio(result_title, publication.title)
            logger.info('HASMAPPING: ' + str(score) + 'MTB: ' +
                        publication.title + '/PMC: ' + result_title)
            if score > 80:
                return result
        return None

    @staticmethod
    def assess_if_trangressed(status,
                              europe_pmc_publication) -> Union[bool, str]:
        """Check whether the journal has been published despite study not being public.

        :param status: Study status string; compared case-insensitively against 'PUBLIC'.
        :param europe_pmc_publication: The ``journalInfo`` dict of a europePMC result.
        :return: True if the study is not public and the print publication date lies in the past, False
            otherwise; a message string when the dict holds no print publication date.
        """
        logger.info('ASSESSIF' + str(europe_pmc_publication))
        if 'printPublicationDate' not in europe_pmc_publication:
            return 'No publication date given.'

        journal_publication_date = datetime.strptime(
            europe_pmc_publication['printPublicationDate'], '%Y-%m-%d')
        logger.info('ASSESSIF' + str(journal_publication_date))
        # Compare by value: an identity check (`is not`) against a string literal is true for any freshly
        # upper-cased string, which made the status irrelevant.
        return status.upper() != 'PUBLIC' and datetime.now() > journal_publication_date

    def get_citation_reference(self, title) -> str:
        """Cascade a new param dict to use in the request and update the session headers to XML as the search endpoint
        on the EuropePMC API only returns the bibliographicCitation information if you specify the DC format (which is
        a kind of XML). Turn the resulting XML string into a dict, and then return the citation from that dict.

        Note that this leaves the shared session with the XML ``Accept`` header set.

        :param title: Article title to get citation for
        :return: Bibliographic citation as string, or 'N/A' if the first record carries no citation."""
        fresh_params = self.base_params.cascade({
            'format': 'DC',
            'query': title
        })
        self.session.headers.update(self.headers_register['citation_ref'])
        response = self.session.get(self.europe_pmc_url, params=fresh_params)
        response_xmldict = xmltodict.parse(response.text)
        description = response_xmldict['responseWrapper']['rdf:RDF'][
            'rdf:Description']
        # type is infuriatingly not consistent in responses from europepmc so we have to handle it ourselves:
        # several records come back as a list, a single record as a plain mapping.
        if isinstance(description, list):
            description = description[0]
        return description.get('dcterms:bibliographicCitation', 'N/A')
예제 #5
0
class TestCascaDict(unittest.TestCase):
    """Unit tests for CascaDict: lookups through ancestors, cascading, deletion, pickling and nesting."""

    def setUp(self):
        """Create a root -> level1 -> level2 chain of CascaDicts and seed it with test keys."""
        self.cd_root = CascaDict([('name', 'Root'), ('color', 'Root color'), ('lvl', 0)])  # usual, args based init
        self.cd_level1 = CascaDict(name='Lvl1', color='Lvl1 color', lvl=1, ancestor=self.cd_root)  # kwargs based init
        self.cd_level2 = self.cd_level1.cascade()
        self.cd_level2.update({'name': 'Lvl2', 'color': 'Lvl2 color', 'lvl': 2, 'nest': {'name': 'nested_lvl_2', 'lvl': 22, 'color': 'nested_color lvl2'}})  # cascade & update style init

        # Insert something into level1 (the root gets a key of the same name below)
        self.cd_level1['test_insert'] = 'contents'

        # Insert something only into level1
        self.cd_level1['test_insert_level1'] = 'contents-lvl1'

        # Insert something into the root level
        self.cd_root['test_insert_root'] = 'contents-root'

        # Insert something into the root level which has the same name as the level1 key
        self.cd_root['test_insert'] = 'contents_root_only'

    def test_insert(self):
        """A key set on level1 reads back level1's own value, not the root's."""
        self.assertEqual(self.cd_level1['test_insert'], 'contents')

    def test_insert_level(self):
        """A key set only on level1 is not visible from the root."""
        self.assertRaises(KeyError, access_key, self.cd_root, 'test_insert_level1')

    def test_getitem(self):
        """A key set only on the root is readable through level1."""
        self.assertEqual(self.cd_level1['test_insert_root'], 'contents-root')

    def test_getitem_level(self):
        """The root keeps its own value for a key that level1 also defines."""
        self.assertEqual(self.cd_root['test_insert'], 'contents_root_only')

    def test_get(self):
        """get() finds a root key through level1."""
        self.assertEqual(self.cd_level1.get('test_insert_root'), 'contents-root')

    def test_get_default(self):
        """get() returns the supplied default for a missing key."""
        self.assertEqual(self.cd_level1.get('test_nonexistent', 'response'), 'response')

    def test_has_key(self):
        """has_key() reports a root key when asked on level1."""
        self.assertTrue(self.cd_level1.has_key('test_insert_root'))

    def test_contains(self):
        """The `in` operator reports a root key when asked on level1."""
        self.assertIn('test_insert_root', self.cd_level1)

    def test_final_dict(self):
        """Smoke test: final_dict can be read and printed."""
        print(self.cd_level1.final_dict)

    def test_flatten_dict_top(self):
        """Default flattening of level1 yields level1's own 'name'."""
        temp = self.cd_level1.__flatten__()
        print(temp)
        self.assertEqual(temp['name'], 'Lvl1')

    def test_flatten_dict_bottom(self):
        """Flattening level1 with level='bottom' yields the root's 'name'."""
        temp = self.cd_level1.__flatten__(level='bottom')
        print(temp)
        self.assertEqual(temp['name'], 'Root')

    def test_get_cascaded(self):
        """get_cascaded() lists the values of a key from level2 down to the root."""
        temp = self.cd_level2.get_cascaded('lvl')
        print(temp)
        self.assertEqual(temp, [2, 1, 0])

    def test_get_cascaded_default(self):
        """get_cascaded() returns the supplied default for a missing key."""
        temp = self.cd_level2.get_cascaded('lvl_nonexistent', 'nic')
        self.assertEqual(temp, 'nic')

    def test_items(self):
        """Smoke test: items() can be called and printed."""
        print(self.cd_level2.items())

    def test_inherit(self):
        """cascade() with initial values creates a child holding those values."""
        temp = self.cd_level2.cascade({'name': 'lvl3', 'lvl': 3})
        self.assertEqual(temp['name'], 'lvl3')

    def test_repr(self):
        """Smoke test: a CascaDict can be printed."""
        print(self.cd_level2)

    def test_delete_valid(self):
        """Deleting level2's own key exposes level1's value for it."""
        del self.cd_level2['color']
        self.assertEqual(self.cd_level2['color'], 'Lvl1 color')

    def test_delete_invalid(self):
        """Deleting the same key from level2 a second time raises CascaDictError."""
        def delsomething():
            del self.cd_level2['color']
            del self.cd_level2['color']
        self.assertRaises(CascaDictError, delsomething)

    def test_pickle(self):
        """A pickle round trip preserves the nested values, including a modified one."""
        self.cd_level2['nest']['lvl'] = 23
        ptemp = pickle.dumps(self.cd_level2)
        temp = pickle.loads(ptemp)
        # print() call form so the module also parses under Python 3.
        print(temp['nest'].get_cascaded('lvl'))
        self.assertEqual(temp['nest'].get_cascaded('lvl'), [23, 22])

    def test_nesting(self):
        """Override a value inside the nested dict and print its cascaded values."""
        self.cd_level2['nest']['color'] = 'nested overriden color'
        print(self.cd_level2['nest'].get_cascaded('color'))