コード例 #1
0
    class ScieloLocationsBase(mining.PageLocations):
        """Field locations declared for Scielo article pages.

        Each class attribute names a field and states where its value comes
        from: a page element, a page metadata entry, or a fixed string.
        """

        # Read from the <h3> that is a direct child of the ".content" node.
        date_publication = mining.Element("css_selector", ".content > h3")
        language = mining.MetaData("citation_language")
        organization_affiliated = mining.MetaData("citation_author_institution")
        source = mining.MetaData("citation_journal_title")
        # NOTE(review): presumably resolved by a custom getter on the engine
        # rather than by a direct lookup -- confirm against mining.Complex.
        title = mining.Complex()

        # Fixed value: the same license URL is declared for every page.
        license = "https://scielo.org/en/about-scielo/open-access-statement/"
コード例 #2
0
    class MedrxivLocations(mining.PageLocations):
        """Field locations declared for Medrxiv article pages.

        Attributes are either page elements (XPath), page metadata entries,
        or fixed strings.
        """

        category = mining.Element(
            "xpath", "//span[@class='highwire-article-collection-term']"
        )
        date_publication = mining.MetaData("article:published_time")
        language = "en"
        license = mining.Element("xpath", "//div[@class='field-item even']")
        organization_affiliated = mining.MetaData(
            "citation_author_institution"
        )
        source = mining.MetaData("citation_journal_title")

        # Fields declared as empty strings (no location given for them).
        body = ""
        citations = ""
        keywords = ""
        references = ""
        search_keyword = ""
        source_impact_factor = ""
コード例 #3
0
    class BiorxivLocations(mining.PageLocations):
        """Field locations used to find the schema in the journal.

        Attributes are either page elements (XPath), page metadata entries,
        or fixed strings.
        """

        category = mining.Element(
            "xpath", "//span[@class='highwire-article-collection-term']"
        )
        language = "en"
        license = mining.Element("xpath", "//div[@class='field-item even']")
        organization_affiliated = mining.MetaData(
            "citation_author_institution"
        )
        references = mining.MetaData("citation_reference")
        source = mining.MetaData("citation_journal_title")

        # Fields declared as empty strings (no location given for them).
        body = ""
        citations = ""
        keywords = ""
        search_keyword = ""
        source_impact_factor = ""
コード例 #4
0
 def get_date_publication(self, element):
     """Gather article date publication, in YYYY-MM-DD format.

     The date is first looked for in the text that follows ``'Epub '`` in
     the given page element. When the element has no text or no such
     marker, the ``citation_date`` metadata (format MM/YYYY) is used
     instead, with the day set to the first of the month.

     Args:
         element(:obj: `centaurminer.Element`): Page element to
             gather the publication date from.
     Return:
         String representing date publication, in format YYYY-MM-DD,
         or None when no date could be retrieved or parsed.
     """
     try:
         date_str = str(self.get(element).split('Epub ')[1])
     except (AttributeError, IndexError):
         # No element text (None) or no "Epub " marker: use metadata below.
         date_str = None
     except Exception:
         # Best effort: any other retrieval failure means "no date".
         return None

     if date_str is not None:
         # The month may be abbreviated ("Apr") or spelled out ("April").
         for date_format in ('%b %d, %Y', '%B %d, %Y'):
             try:
                 parsed = datetime.datetime.strptime(date_str, date_format)
             except ValueError:
                 continue
             return parsed.strftime('%Y-%m-%d')
         return None

     try:
         # Metadata date is in the format MM/YYYY; strptime defaults the
         # missing day to 1.
         raw_date = self.get(mining.MetaData("citation_date"))
         return datetime.datetime.strptime(
             raw_date, '%m/%Y').strftime('%Y-%m-%d')
     except Exception:
         # Missing or malformed metadata: report "no date" rather than fail.
         return None
コード例 #5
0
    class ScieloLocations(mining.PageLocations):
        """Locations on the page to be gathered by Selenium webdriver.

        The locations may be declared here as static variables of any type, to be retrieved
        as keys on the centaurminer.MiningEngine.results dictionary. Some examples of data that
        can be declared here:

        centaurminer.MetaData: Selenium retrieved elements from a page's metadata.
        centaurminer.Element: Selenium retrieved elements from a page body.
        string: Strings declared here won't change, independently of the page searched.
        """
        abstract = mining.Element(
            "css_selector",
            ".trans-abstract > p:not([class^=sec]), .trans-abstract > div.section",
        )
        # The selector needs a literal backslash before each comma inside the
        # class names (".index\,pt"). It is spelled "\\," so Python does not
        # treat "\," as an (invalid) escape sequence; the runtime string is
        # unchanged.
        body = mining.Element(
            "css_selector",
            "#article-body, .index\\,pt > p, .index\\,en > p, .index\\,es > p",
        )
        category = mining.Element("xpath", "//p[@class='categoria']")
        date_publication = mining.Element("xpath", "//div[@class='content']/h3")
        keywords = mining.Element("css_selector", ".trans-abstract > p:last-of-type")
        license = "https://scielo.org/en/about-scielo/open-access-statement/"
        # Requests the element's innerHTML attribute instead of its default value.
        organization_affiliated = mining.Element("css_selector", "p.aff").get_attribute('innerHTML')
        references = mining.Element("css_selector", "p.ref")
        source = mining.MetaData("citation_journal_title")

        # Null:
        citations = ''
        language = ''
        search_keyword = ''
        source_impact_factor = ''
コード例 #6
0
    class PreprintsLocations(mining.PageLocations):
        """Field locations declared for Preprints article pages.

        Every located field here comes from page metadata; the remaining
        fields are fixed strings.
        """

        abstract = mining.MetaData("og:description")
        keywords = mining.MetaData("citation_keywords")
        language = "en"
        license = "https://creativecommons.org/licenses/by/4.0/"
        organization_affiliated = mining.MetaData("citation_author_institution")
        references = mining.MetaData("citation_reference")
        source = mining.MetaData("citation_publisher")

        # Fields declared as empty strings (no location given for them).
        body = ""
        category = ""
        citations = ""
        search_keyword = ""
        source_impact_factor = ""
コード例 #7
0
 def get_date_publication(self, element):
     """Gather article date publication as a ``datetime.date``.

     The date is first looked for in the text that follows ``'Epub '`` in
     the given page element. When the element has no text or no such
     marker, the ``citation_date`` metadata (format MM/YYYY) is used
     instead, with the day set to the first of the month.

     Args:
         element(:obj: `centaurminer.Element`): Page element to
             gather the publication date from.
     Return:
         ``datetime.date`` with the publication date, or None when no
         date could be retrieved or parsed.
     """
     try:
         date_str = str(self.get(element).split('Epub ')[1])
     except (AttributeError, IndexError):
         # No element text (None) or no "Epub " marker: use metadata below.
         date_str = None
     except Exception:
         # Best effort: any other retrieval failure means "no date".
         return None

     if date_str is not None:
         # The month may be abbreviated ("Apr") or spelled out ("April").
         for date_format in ('%b %d, %Y', '%B %d, %Y'):
             try:
                 return datetime.datetime.strptime(
                     date_str, date_format).date()
             except ValueError:
                 continue
         return None

     try:
         # Metadata date is in the format MM/YYYY; strptime defaults the
         # missing day to 1.
         raw_date = self.get(mining.MetaData("citation_date"))
         return datetime.datetime.strptime(raw_date, "%m/%Y").date()
     except Exception:
         # Missing or malformed metadata: report "no date" rather than fail.
         return None
コード例 #8
0
        def get_date_publication(self, element):
            """
            Gather article date publication - look first for a specific element, then
            look at metadata if it's not found.

            (Some metadata is corrupted, hence needing to look at a page element)

            When the metadata fallback is used, ``self.results`` is flagged with
            ``publication_date_missing_day`` (metadata carries no day) and, for a
            ``00/YYYY`` value, also ``publication_date_missing_month``.

            Args:
                element: Page element whose text may contain ``'Epub <date>'``.
            Returns:
                ``datetime.date`` with the publication date, or None when neither
                the element nor the metadata yields a date.
            Raises:
                ValueError: if text follows ``'Epub '`` but matches neither
                    ``'%b %d, %Y'`` nor ``'%B %d, %Y'``.
            """
            page_text = self.get(element)
            # A missing element yields no text; treat it like "no Epub marker"
            # so the metadata fallback below is reached instead of crashing.
            date_elem = page_text.split('Epub ') if page_text is not None else []
            if len(date_elem) > 1:
                date_str = str(date_elem[1])
                try:
                    # Abbreviated month name first ("Apr") ...
                    return datetime.datetime.strptime(
                        date_str, '%b %d, %Y').date()
                except ValueError:
                    # ... then the full month name ("April").
                    return datetime.datetime.strptime(
                        date_str, '%B %d, %Y').date()

            date = self.get(mining.MetaData("citation_date"))
            if date is None:
                # No metadata either: nothing to parse.
                return None
            try:
                if date.startswith("00/"):
                    # Month is zeroed out in the metadata; only the year is usable.
                    self.results['publication_date_missing_month'] = True
                    self.results['publication_date_missing_day'] = True
                    return datetime.datetime.strptime(date, "00/%Y").date()
                self.results['publication_date_missing_day'] = True
                return datetime.datetime.strptime(date, "%m/%Y").date()
            except ValueError:
                return None
コード例 #9
0
        def get_title(self, element):
            """
            Gather the article title - look first at the ``citation_title``
            metadata, then at the ``p.title`` page element if it's not found.

            The ``element`` argument is not used; both locations are fixed here.
            """
            meta_title = self.get(mining.MetaData("citation_title"))
            if meta_title is None:
                # Metadata gave nothing: read the title from the page body.
                return self.get(mining.Element("css_selector", "p.title"))
            return meta_title
コード例 #10
0
    class IbmcLocations(mining.PageLocations):
        """
        IbmcLocations class sets instructions to find an element on
        `http://pbmc.ibmc.msk.ru/`

        Attributes are either page elements (XPath), page metadata entries,
        or fixed strings.
        """

        abstract = mining.Element(
            "xpath", "//td[@class='arti'][@style='text-align:justify;']"
        )
        category = mining.Element("xpath", "//tr[4]/td[@class='arti']")
        citations = mining.Element(
            "xpath", "//div[@class='__db_score __db_score_normal']"
        )
        keywords = mining.MetaData("citation_keywords")
        license = "http://pbmc.ibmc.msk.ru/ru/authors-rules-ru/"
        organization_affiliated = mining.MetaData(
            "citation_author_institution"
        )
        source = mining.MetaData("citation_journal_title")
        language = "ru"
        # Disabled link locations, kept for reference:
        # pubmed_link = mining.Element("xpath", "//td[@class='arti'][@style='align:justify;']//a[@target='_blank']").get_attribute('href')
        # translated_link = mining.Element("xpath", "//td[@class='arti']//a[@target='_blank']").get_attribute('href')

        # Fields declared as empty strings (no location given for them).
        body = ""
        references = ""
        source_impact_factor = ""
        search_keyword = ""
コード例 #11
0
ファイル: arxiv.py プロジェクト: yeshwanth16/data_pipeline
class Arxiv(mining.PageLocations):
    """
    Field locations used to find the schema in the journal.

    Attributes are page metadata entries, one page element (the abstract),
    or fixed strings.
    """

    # NOTE(review): several fields below use an empty metadata name;
    # presumably placeholders for data the page does not provide -- confirm.
    references = mining.MetaData("")
    title = mining.MetaData("citation_title")
    doi = mining.MetaData("citation_doi")
    authors = mining.MetaData("citation_author")
    abstract = mining.Element(
        "css_selector", "blockquote.abstract.mathjax"
    )
    body = mining.MetaData("")
    source_impact_factor = mining.MetaData("")
    category = mining.MetaData("")
    quantity_of_citations = mining.MetaData("")
    organization = mining.MetaData("")
    keywords = mining.MetaData("")
    extra_link = mining.MetaData("citation_pdf_url")

    # Fixed values, identical for every page.
    source = "Arxiv"
    language = "english"
    license = (
        "https://arxiv.org/licenses/nonexclusive-distrib/1.0/license.html"
    )
    search_keyword = "SARS"