コード例 #1
0
 async def _fetch_from_pubmed(self, doi=None, pmid=None, title=None):
     """Scrape pubmed.ncbi.nlm.nih.gov for one article's metadata.

     The article page is requested directly when ``pmid`` is given.
     Otherwise a quoted search is issued for ``doi`` or, failing that,
     for ``title``. If the response has no DOI element and ``title``
     was supplied, the search-result link whose text is more than 90%
     similar to ``title`` (best match wins) is followed.

     Args:
         doi: DOI to search for and to validate the scraped page against.
         pmid: PubMed identifier; converted with ``str()`` before use.
         title: Article title, used as a search term when ``doi`` is
             missing and to pick among multiple search results.

     Returns:
         A dict with the keys ``doi``, ``pmid``, ``title``, ``authors``,
         ``journal`` and ``date``. Every value is ``None`` when nothing
         was searched for, when a page could not be fetched after several
         attempts, or when no DOI link could be located.

     Raises:
         AssertionError: If the scraped DOI or PMID does not match the
             one that was requested.
     """
     # Local import so this method does not rely on the module header.
     import asyncio

     result = {'doi': None, 'pmid': None, 'title': None, 'authors': None,
               'journal': None, 'date': None}
     max_attempts = 5

     async def fetch_soup(target):
         """Return the parsed page at ``target`` or None on failure."""
         # Bounded retries with exponential back-off: an endless
         # immediate retry would never return for a permanently failing
         # request (e.g. an unknown PMID) and would flood the server.
         for attempt in range(max_attempts):
             async with self.session.get(target,
                                         ssl=sslcontext) as response:
                 if response.status == 200:
                     return BeautifulSoup(await response.read(), 'lxml')
                 print(response.status)
             if attempt + 1 < max_attempts:
                 await asyncio.sleep(2 ** attempt)
         return None

     wanted_pmid = None if pmid is None else str(pmid)
     if wanted_pmid is None:
         query = doi if doi is not None else title
         if query is None:
             return result
         encoded_query = 'term=' + urllib.parse.quote(f'"{query}"')
         # encoded=True keeps our percent-encoding untouched without
         # overwriting the URL object's private state.
         url = URL('https://pubmed.ncbi.nlm.nih.gov/?' + encoded_query,
                   encoded=True)
     else:
         url = URL('https://pubmed.ncbi.nlm.nih.gov/' + wanted_pmid)

     soup = await fetch_soup(url)
     if soup is None:
         return result

     d_soup = soup.find('span', {'class': 'doi'})
     if d_soup is None:
         if title is None:
             return result
         # No DOI element: treat the page as a list of search results,
         # compare each result's title with ours and keep the closest
         # one, provided it is more than 90% similar.
         best_score = 0
         best_link = ''
         try:
             for anchor in soup.find_all('a',
                                         {'class': 'labs-docsum-title'}):
                 score = SequenceMatcher(None, title,
                                         anchor.text.strip()).ratio()
                 if score > best_score:
                     best_score = score
                     best_link = anchor['href']
         except AttributeError:
             return result
         if best_score <= 0.9:
             return result
         soup = await fetch_soup('https://pubmed.ncbi.nlm.nih.gov' +
                                 best_link)
         if soup is None:
             return result
         d_soup = soup.find('span', {'class': 'doi'})

     try:
         result['doi'] = d_soup.find('a').text.strip()
     except AttributeError:
         # Either no DOI element or no link inside it.
         return result

     # Explicit raises (instead of ``assert``) so the checks survive
     # ``python -O``; the exception type is kept for existing callers.
     if doi is not None and doi.lower() != result['doi'].lower():
         raise AssertionError(
             f"DOI mismatch: requested {doi!r}, found {result['doi']!r}")
     result['pmid'] = soup.find('strong',
                                {'class': 'current-id'}).text.strip()
     if wanted_pmid is not None and wanted_pmid != result['pmid']:
         raise AssertionError(
             f"PMID mismatch: requested {wanted_pmid!r}, "
             f"found {result['pmid']!r}")

     cite = soup.find('span', {'class': 'cit'}).text
     # Keep the text before the first ';'. partition() returns the whole
     # string when there is no ';', whereas slicing with find()'s -1
     # would silently drop the last character.
     date = cite.partition(';')[0]
     result['date'] = self._parse_date(date).isoformat()

     a_list = soup.find('div', {'class': 'authors-list'})
     if a_list is not None:
         authors = a_list.find_all('a', {'class': 'full-name'})
     else:
         authors = []
     result['authors'] = [a.text.strip() for a in authors]

     result['title'] = soup.find('h1',
                                 {'class': 'heading-title'}).text.strip()
     result['journal'] = soup.find(
         'button', {'id': 'full-view-journal-trigger'})['title'].strip()
     return result