class ScieloLocationsBase(mining.PageLocations):
    """Page locations declared for SciELO article pages.

    Each class attribute names one field to gather: ``mining.Element``
    entries point at elements of the page body, ``mining.MetaData`` entries
    point at page metadata tags, and plain strings are constant values.
    """

    # Located through the <h3> directly under the ``.content`` container.
    date_publication = mining.Element("css_selector", ".content > h3")

    # Fields located through page metadata tags.
    language = mining.MetaData("citation_language")
    organization_affiliated = mining.MetaData("citation_author_institution")
    source = mining.MetaData("citation_journal_title")

    # NOTE(review): ``mining.Complex()`` takes no locator here; presumably the
    # title is resolved by a dedicated getter elsewhere - confirm.
    title = mining.Complex()

    # Constant value.
    license = "https://scielo.org/en/about-scielo/open-access-statement/"
class MedrxivLocations(mining.PageLocations):
    """Page locations declared for the MedRxiv source.

    ``mining.Element`` attributes point at elements of the page body,
    ``mining.MetaData`` attributes point at page metadata tags, and plain
    strings are constant values.
    """

    category = mining.Element(
        "xpath", "//span[@class='highwire-article-collection-term']"
    )
    date_publication = mining.MetaData("article:published_time")
    language = "en"
    license = mining.Element("xpath", "//div[@class='field-item even']")
    organization_affiliated = mining.MetaData("citation_author_institution")
    source = mining.MetaData("citation_journal_title")

    # Null: fields declared as empty strings for this source.
    body = ""
    citations = ""
    keywords = ""
    references = ""
    search_keyword = ""
    source_impact_factor = ""
class BiorxivLocations(mining.PageLocations):
    """Page locations declared for the BioRxiv source.

    ``mining.Element`` attributes point at elements of the page body,
    ``mining.MetaData`` attributes point at page metadata tags, and plain
    strings are constant values.
    """

    category = mining.Element(
        "xpath", "//span[@class='highwire-article-collection-term']"
    )
    language = "en"
    license = mining.Element("xpath", "//div[@class='field-item even']")
    organization_affiliated = mining.MetaData("citation_author_institution")
    references = mining.MetaData("citation_reference")
    source = mining.MetaData("citation_journal_title")

    # Null: fields declared as empty strings for this source.
    body = ""
    citations = ""
    keywords = ""
    search_keyword = ""
    source_impact_factor = ""
def get_date_publication(self, element):
    """Gather article date publication, in YYYY-MM-DD format.

    The date is first read from the text of ``element``, taking whatever
    follows the ``'Epub '`` marker (e.g. ``'... Epub Mar 20, 2020'``). When
    that text cannot be split (no ``split`` attribute, or no ``'Epub '``
    marker), the ``citation_date`` metadata is used instead; it is expected
    in ``MM/YYYY`` form and the day is set to 1.

    Args:
        element(:obj: `centaurminer.Element`): Page element to gather the
            publication date from.

    Return:
        String representing date publication, in format YYYY-MM-DD, or
        None when no date could be parsed.
    """
    try:
        # strip(): whitespace around the date would make strptime() fail.
        date_str = str(self.get(element).split('Epub ')[1]).strip()
        try:
            # Abbreviated month name first ("Mar 20, 2020") ...
            date_obj = datetime.datetime.strptime(date_str, '%b %d, %Y')
        except ValueError:
            # ... then the full month name ("March 20, 2020").
            date_obj = datetime.datetime.strptime(date_str, '%B %d, %Y')
        return date_obj.strftime('%Y-%m-%d')
    except (AttributeError, IndexError):
        # No usable 'Epub ' date on the page element: fall back to metadata.
        element = mining.MetaData("citation_date")
        try:
            # Date in the format MM/YYYY; the parts are strings and must be
            # converted before building the date.
            month, year = self.get(element).split('/')
            return datetime.date(int(year), int(month), 1).strftime('%Y-%m-%d')
        except Exception:
            # Deliberately best-effort: any failure means "no date".
            return None
    except Exception:
        # Unparseable 'Epub ' date (or any other failure): no date.
        return None
class ScieloLocations(mining.PageLocations):
    """Locations on the page to be gathered by Selenium webdriver

    The locations may be declared here as static variables of any type,
    to be retrieved as keys on the centaurminer.MiningEngine.results
    dictionary. Some examples of data that can be declared here:

    centaurminer.Metadata: Selenium retrieved elements from a page metadata
    centaurminer.Element: Selenium retrieved elements from a page body.
    string: Strings declared here won't change, independently of the page
        searched.
    """

    abstract = mining.Element(
        "css_selector",
        ".trans-abstract > p:not([class^=sec]), .trans-abstract > div.section",
    )
    # Raw string: the selector contains a literal backslash before each comma
    # ("\,"), which is not a valid escape sequence in a normal string literal.
    body = mining.Element(
        "css_selector",
        r"#article-body, .index\,pt > p, .index\,en > p, .index\,es > p",
    )
    category = mining.Element("xpath", "//p[@class='categoria']")
    date_publication = mining.Element("xpath", "//div[@class='content']/h3")
    keywords = mining.Element("css_selector", ".trans-abstract > p:last-of-type")
    license = "https://scielo.org/en/about-scielo/open-access-statement/"
    organization_affiliated = mining.Element(
        "css_selector", "p.aff"
    ).get_attribute('innerHTML')
    references = mining.Element("css_selector", "p.ref")
    source = mining.MetaData("citation_journal_title")

    # Null: fields declared as empty strings for this source.
    citations = ''
    language = ''
    search_keyword = ''
    source_impact_factor = ''
class PreprintsLocations(mining.PageLocations):
    """Page locations declared for the Preprints source.

    Every gathered field here comes from page metadata tags
    (``mining.MetaData``); plain strings are constant values.
    """

    abstract = mining.MetaData("og:description")
    keywords = mining.MetaData("citation_keywords")
    language = "en"
    license = "https://creativecommons.org/licenses/by/4.0/"
    organization_affiliated = mining.MetaData("citation_author_institution")
    references = mining.MetaData("citation_reference")
    source = mining.MetaData("citation_publisher")

    # Null: fields declared as empty strings for this source.
    body = ""
    category = ""
    citations = ""
    search_keyword = ""
    source_impact_factor = ""
def get_date_publication(self, element):
    """Gather the article publication date as a ``datetime.date``.

    The date is first read from the text of ``element``, taking whatever
    follows the ``'Epub '`` marker (e.g. ``'... Epub Mar 20, 2020'``). When
    that text cannot be split (no ``split`` attribute, or no ``'Epub '``
    marker), the ``citation_date`` metadata is parsed as ``MM/YYYY``
    instead, which yields the first day of that month.

    Args:
        element(:obj: `centaurminer.Element`): Page element to gather the
            publication date from.

    Return:
        ``datetime.date`` with the publication date, or None when no date
        could be parsed.
    """
    try:
        # strip(): whitespace around the date would make strptime() fail.
        date_str = str(self.get(element).split('Epub ')[1]).strip()
        try:
            # Abbreviated month name first ("Mar 20, 2020") ...
            date_obj = datetime.datetime.strptime(
                date_str, '%b %d, %Y').date()
        except ValueError:
            # ... then the full month name ("March 20, 2020").
            date_obj = datetime.datetime.strptime(
                date_str, '%B %d, %Y').date()
        return date_obj
    except (AttributeError, IndexError):
        # No usable 'Epub ' date on the page element: fall back to metadata.
        element = mining.MetaData("citation_date")
        try:
            return datetime.datetime.strptime(self.get(element),
                                              "%m/%Y").date()
        except Exception:
            # Deliberately best-effort: any failure means "no date".
            return None
    except Exception:
        # Unparseable 'Epub ' date (or any other failure): no date.
        return None
def get_date_publication(self, element):
    """
    Gather article date publication - look first for a specific element,
    then look at metadata if it's not found.
    (Some metadata is corrupted, hence needing to look at a page element)

    The page element text is searched for an ``'Epub '`` marker and the
    text after it is parsed as ``'Mar 20, 2020'`` or ``'March 20, 2020'``.
    If the element is missing, has no marker, or the text after the marker
    is not a date, the ``citation_date`` metadata (``MM/YYYY``, or
    ``00/YYYY`` when the month is unknown) is used. In that case
    ``self.results`` is flagged with ``publication_date_missing_day`` (and
    ``publication_date_missing_month`` for ``00/YYYY``).

    Args:
        element(:obj: `centaurminer.Element`): Page element to gather the
            publication date from.

    Return:
        ``datetime.date`` with the publication date, or None when no date
        could be found or parsed.
    """
    page_text = self.get(element)
    # self.get() can return None when nothing is found; treat that the same
    # way as a text without the 'Epub ' marker.
    date_elem = page_text.split('Epub ') if page_text is not None else []
    if len(date_elem) > 1:
        # strip(): whitespace around the date would make strptime() fail.
        date_str = str(date_elem[1]).strip()
        # Abbreviated month name first, then the full month name.
        for date_format in ('%b %d, %Y', '%B %d, %Y'):
            try:
                return datetime.datetime.strptime(date_str, date_format).date()
            except ValueError:
                continue
        # Not a recognisable date: fall through to the metadata lookup.

    element = mining.MetaData("citation_date")
    date = self.get(element)
    if date is None:
        return None
    try:
        if date.startswith("00/"):
            # Month unknown: only the year is usable.
            self.results['publication_date_missing_month'] = True
            self.results['publication_date_missing_day'] = True
            return datetime.datetime.strptime(date, "00/%Y").date()
        self.results['publication_date_missing_day'] = True
        return datetime.datetime.strptime(date, "%m/%Y").date()
    except ValueError:
        return None
def get_title(self, element):
    """Return the article title, preferring metadata over the page body.

    The ``citation_title`` metadata is looked up first; only when that
    lookup yields None is the ``p.title`` page element used instead.

    Args:
        element: Ignored; the locations to read are fixed inside this
            method.

    Return:
        Whatever ``self.get`` returns for the first location that is not
        None, or the result of the ``p.title`` lookup otherwise.
    """
    from_metadata = self.get(mining.MetaData("citation_title"))
    if from_metadata is None:
        # Metadata lookup gave nothing: read the title from the page body.
        return self.get(mining.Element("css_selector", "p.title"))
    return from_metadata
class IbmcLocations(mining.PageLocations):
    """
    Locations used to find article elements on `http://pbmc.ibmc.msk.ru/`.

    ``mining.Element`` attributes point at elements of the page body,
    ``mining.MetaData`` attributes point at page metadata tags, and plain
    strings are constant values.
    """

    abstract = mining.Element(
        "xpath", "//td[@class='arti'][@style='text-align:justify;']"
    )
    category = mining.Element("xpath", "//tr[4]/td[@class='arti']")
    citations = mining.Element(
        "xpath", "//div[@class='__db_score __db_score_normal']"
    )
    keywords = mining.MetaData("citation_keywords")
    license = "http://pbmc.ibmc.msk.ru/ru/authors-rules-ru/"
    organization_affiliated = mining.MetaData("citation_author_institution")
    source = mining.MetaData("citation_journal_title")
    language = "ru"

    # Currently disabled locations, kept for reference:
    # pubmed_link = mining.Element("xpath", "//td[@class='arti'][@style='align:justify;']//a[@target='_blank']").get_attribute('href')
    # translated_link = mining.Element("xpath", "//td[@class='arti']//a[@target='_blank']").get_attribute('href')

    # Null: fields declared as empty strings for this source.
    body = ""
    references = ""
    source_impact_factor = ""
    search_keyword = ""
class Arxiv(mining.PageLocations):
    """
    Page locations declared for the Arxiv source.

    Most fields are read from page metadata tags (``mining.MetaData``);
    the abstract is read from a page body element. Fields declared with an
    empty metadata name (``mining.MetaData("")``) name no tag to read.
    """

    references = mining.MetaData("")
    title = mining.MetaData("citation_title")
    doi = mining.MetaData("citation_doi")
    authors = mining.MetaData("citation_author")
    abstract = mining.Element("css_selector", "blockquote.abstract.mathjax")
    body = mining.MetaData("")
    source_impact_factor = mining.MetaData("")
    category = mining.MetaData("")
    quantity_of_citations = mining.MetaData("")
    organization = mining.MetaData("")
    keywords = mining.MetaData("")
    extra_link = mining.MetaData("citation_pdf_url")

    # Constants: the same value for every page mined.
    source = "Arxiv"
    language = "english"
    license = (
        "https://arxiv.org/licenses/nonexclusive-distrib/1.0/license.html"
    )
    search_keyword = "SARS"