Example 1
    def get(self, *, article_info: ArticleInfo) -> Optional[str]:
        """Fetch the URL of the article's information page from Google Scholar.

        The first author and the main title are passed to a Google Scholar
        query, and a matching link is taken from the search results.
        Author and title information comes from the ArticleInfo class; query
        building is delegated to the TranslateGoogleScholarQuery class.

        1. Extract the first author and the main title from article_info.
        2. Convert them into a URL containing a Google Scholar query via
           TranslateGoogleScholarQuery.
        3. Scrape the Google Scholar URL and keep the anchor texts and links
           in a list.
        4. Filter the results and return the link whose text matches the
           main title.
        """

        first_author = article_info.first_author
        main_title = article_info.article_main_title
        translator = TranslateGoogleScholarQuery()
        scholar_url = translator.translate(first_author, main_title)
        soup = BeautifulSoupModelImpl()
        anchor_texts_and_links = soup.get_anchor_texts_and_links(scholar_url)

        # [['text', 'link'], ['text', 'link']]
        # print(anchor_texts_and_links)

        results_urls = self.__whitelisted_texts_and_links(
            anchor_texts_and_links, self.WHITELIST_KEYWORDS_LINK)
        results_without_site_funcs = self.__blacklisted_texts_and_links(
            results_urls, self.BLACKLIST_KEYWORDS_SITE_FUNCTIONS)
        results_without_pdfs = self.__blacklisted_texts_and_links(
            results_without_site_funcs, self.BLACKLIST_KEYWORDS_LINK_EXTENSION,
            True)
        cooked_results = results_without_pdfs

        return self.__decision_title_match_link(cooked_results, main_title)
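
For reference, a minimal sketch of the query-building step delegated to TranslateGoogleScholarQuery; the class is not shown in this example, so the URL template and the body of translate below are assumptions for illustration only.

# Sketch only: TranslateGoogleScholarQuery is not shown in the example above,
# so the URL template and method body are assumptions.
from urllib.parse import quote_plus


class TranslateGoogleScholarQuery:
    SCHOLAR_SEARCH_URL = 'https://scholar.google.com/scholar?q={query}'

    def translate(self, first_author: str, main_title: str) -> str:
        # Combine first author and main title into a single search query and
        # URL-encode it for use as the Google Scholar q= parameter.
        query = quote_plus('{0} {1}'.format(first_author, main_title))
        return self.SCHOLAR_SEARCH_URL.format(query=query)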
Example 2
    def get_url(self, *, url: str) -> Optional[str]:
        """Return a full DOI URL.

        Reads the DOI, which is stored in a <meta> tag of the page.
        """
        soup = BeautifulSoupModelImpl()
        raw_doi = soup.get_meta_content(url=url,
                                        key=self.META_KEY,
                                        id=self.META_ID)
        # print(raw_doi)
        if raw_doi is not None:
            doi_url = self.__translate_url(raw_doi=raw_doi)
            return doi_url
        else:
            print('No DOI found from {journal} ({link})'.format(
                journal=self.JOURNAL_STR, link=url))
            return None
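
Minimal sketches of the two helpers used above; neither get_meta_content nor __translate_url is shown here, so the meta-tag attribute handling and the doi.org resolver format are assumptions.

import requests
from bs4 import BeautifulSoup


def get_meta_content(*, url: str, key: str, id: str):
    # Hypothetical stand-in for BeautifulSoupModelImpl.get_meta_content:
    # find the <meta> tag whose attribute `key` has the value `id`
    # (e.g. key='name', id='citation_doi') and return its content attribute.
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    tag = soup.find('meta', attrs={key: id})
    return tag.get('content') if tag is not None else None


def translate_url(*, raw_doi: str) -> str:
    # Hypothetical stand-in for __translate_url: the doi.org resolver turns
    # a bare DOI into a resolvable link.
    return 'https://doi.org/{0}'.format(raw_doi.strip())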
Example 3
    def __find_doi_url_from_anchor_texts(self, *, url: str) -> Optional[str]:
        soup = BeautifulSoupModelImpl()
        anchor_texts = soup.get_anchor_texts(url)
        # print(anchor_texts)
        # Of the retrieved anchor_texts, search for the ones that match DOI_KEY.
        results = []
        for link in anchor_texts:
            if self.__decision_include_keyword(keyword=self.DOI_KEY,
                                               text=link):
                results.append(link)
        # print(results)
        # e.g. https://doi.org/10.1371/journal.pgen.0010066
        if len(results) > 0:
            print(results[0])
            return results[0]
        else:
            return None
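
A minimal sketch of the keyword check; __decision_include_keyword and the value of DOI_KEY are not shown, so the case-insensitive substring test and the example keyword are assumptions.

def decision_include_keyword(*, keyword: str, text: str) -> bool:
    # Hypothetical stand-in for __decision_include_keyword: True when the
    # keyword appears anywhere in the anchor text, ignoring case.
    return keyword.lower() in text.lower()


# With a keyword such as 'doi.org' (assumed value for DOI_KEY), an anchor
# text like 'https://doi.org/10.1371/journal.pgen.0010066' would match.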
Example 4
    def get_url(self, *, url: str) -> Optional[str]:
        """Return a full DOI URL.

        Reads the DOI from ScienceDirect, where it appears as the link of an
        <a href> tag.
        """
        # e.g. http://www.sciencedirect.com/science/article/pii/104084289390007Q
        soup = BeautifulSoupModelImpl()
        anchor_links = soup.get_anchor_links(url=url)
        # print(anchor_links)
        # Of all retrieved <a> tag links, search for the ones that contain the
        # DOI link prefix (self.DOI_URL).
        results = []
        for link in anchor_links:
            if self.DOI_URL in link:
                results.append(link)
        # print(results)
        if len(results) > 0:
            # print(results[0])
            return results[0]
        else:
            print('No DOI found from {journal} ({link})'.format(
                journal=self.JOURNAL_STR, link=url))
            return None
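
To make the filtering step concrete, a small illustration with hypothetical values; DOI_URL and the scraped links below are assumptions, not taken from the real class.

DOI_URL = 'https://doi.org/'          # assumed value of self.DOI_URL
anchor_links = [                      # hypothetical scraped links
    'https://www.sciencedirect.com/science/journal/example',
    'https://doi.org/10.1016/example.doi',
]
results = [link for link in anchor_links if DOI_URL in link]
# results == ['https://doi.org/10.1016/example.doi']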
Example 5
    def __find_doi_from_anchor_links(self, url: str) -> Optional[str]:
        """Reads the DOI from PsycNET, where it appears as an anchor link."""
        # e.g. http://psycnet.apa.org/journals/amp/18/8/503/
        # link: http://psycnet.apa.org/doi/10.1037/h0045185
        soup = BeautifulSoupModelImpl()
        anchor_links = soup.get_anchor_links(url)
        ssl_state = SSLStateInPsycNET()
        # print(anchor_links)
        # Of all retrieved anchor links, search for the ones that match
        # http://psycnet.apa.org/doi/.
        results = []
        if anchor_links is not None and len(anchor_links) > 0:
            for link in anchor_links:
                if ssl_state.get_psycnet_url(url) in link:
                    results.append(link)
            # print(results)
            if len(results) > 0:
                # print(results[0])
                return results[0]
        print('No DOI found in anchor links from PsycNET ({0})'.format(url))
        return None
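
A minimal sketch of what SSLStateInPsycNET.get_psycnet_url might return; the real class is not shown, so the scheme handling below is an assumption.

class SSLStateInPsycNET:
    def get_psycnet_url(self, url: str) -> str:
        # Assumed behaviour: return the PsycNET DOI-link prefix whose scheme
        # matches the page being scraped, e.g. 'http://psycnet.apa.org/doi/'
        # or 'https://psycnet.apa.org/doi/'.
        scheme = 'https' if url.startswith('https://') else 'http'
        return '{0}://psycnet.apa.org/doi/'.format(scheme)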
Example 6
    def get_url(self, *, url: str) -> Optional[str]:
        """Reads the DOI from PMC.

        The DOI is an anchor text, but since it cannot be found that way,
        the lookup is done through the anchor links instead.
        """
        # e.g. https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1333534/
        soup = BeautifulSoupModelImpl()
        anchor_links = soup.get_anchor_links(url)
        # print(anchor_links)
        """取得した全aタグリンクのうち,指定したジャーナルサイトURLに合致するものを検索"""
        # __find_https_format()
        doi = self.__find_https_format(anchor_links=anchor_links)
        if doi is not None:
            return self.__translate_decode(doi)

        # __find_network_path_reference_format()
        doi = self.__find_network_path_reference_format(
            anchor_links=anchor_links)
        if doi is not None:
            return self.__translate_decode(doi)

        print('No DOI found from PMC ({link})'.format(link=url))
        return None
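
Minimal sketches of the three private helpers used above; none of them are shown in this example, so the assumed link formats and the decoding step are for illustration only.

from urllib.parse import unquote


def find_https_format(anchor_links):
    # Assumed stand-in for __find_https_format: look for an absolute DOI
    # link such as 'https://doi.org/10.xxxx/yyyy'.
    for link in anchor_links or []:
        if link.startswith('https://doi.org/'):
            return link
    return None


def find_network_path_reference_format(anchor_links):
    # Assumed stand-in for __find_network_path_reference_format: look for a
    # scheme-relative ("network-path reference") link such as
    # '//doi.org/10.xxxx/yyyy'.
    for link in anchor_links or []:
        if link.startswith('//doi.org/'):
            return link
    return None


def translate_decode(doi_link: str) -> str:
    # Assumed stand-in for __translate_decode: percent-encoded characters in
    # the DOI (e.g. '%2F') are decoded back to plain text.
    return unquote(doi_link)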
import unittest


class TestReadEnteredTextImpl(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        print('class TestReadEnteredTextImpl(unittest.TestCase):')
        print('def setUpClass(cls):')

    @classmethod
    def tearDownClass(cls):
        print('def tearDownClass(cls):')

    def setUp(self):
        print('def setUp(self):')
        self.soup = BeautifulSoupModelImpl()

    def tearDown(self):
        print('def tearDown(self):')

    def test_html(self):
        print('def test_html(self):')
        url = 'https://www.yahoo.co.jp/'
        html = self.soup.get_html(url)
        self.assertIsNotNone(html)

    def test_get_anchors(self):
        print('def test_get_anchors(self):')
        url = 'https://www.yahoo.co.jp/'
        anchors = self.soup.get_anchors(url)
        self.assertIsNotNone(anchors)

    def test_get_anchor_links(self):
        print('def test_get_anchor_links(self):')
        url = 'https://www.yahoo.co.jp/'
        anchors = self.soup.get_anchor_links(url)
        self.assertIsNotNone(anchors)

    def test_get_anchor_texts(self):
        print('def test_get_anchor_texts(self):')
        url = 'https://www.yahoo.co.jp/'
        anchors = self.soup.get_anchor_texts(url)
        self.assertIsNotNone(anchors)
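
For context, a minimal BeautifulSoupModelImpl that would satisfy these tests; the real class is not shown, so the method bodies below, built on requests and BeautifulSoup, are assumptions.

import requests
from bs4 import BeautifulSoup


class BeautifulSoupModelImpl:
    def get_html(self, url: str) -> str:
        # Fetch the page and return its raw HTML.
        return requests.get(url).text

    def get_anchors(self, url: str) -> list:
        # Return every <a> tag on the page.
        soup = BeautifulSoup(self.get_html(url), 'html.parser')
        return soup.find_all('a')

    def get_anchor_links(self, url: str) -> list:
        # Return the href attribute of each anchor that has one.
        return [a['href'] for a in self.get_anchors(url) if a.has_attr('href')]

    def get_anchor_texts(self, url: str) -> list:
        # Return the visible text of each anchor.
        return [a.get_text(strip=True) for a in self.get_anchors(url)]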