def pages(self):
    """Collect the pages matched to this object by title and by DOI.

    Title matches (``self.page_matches_by_title_filtered``) are included only
    when ``self.normalized_title`` is set and is neither too short nor too
    common.  DOI matches (``self.page_matches_by_doi``) are always appended,
    and each one has its ``scrape_evidence`` overwritten with the DOI-match
    reason.

    If ``max_pages_from_one_repo`` over the ``repo_id`` values of the title
    matches is 3 or more, every match (including DOI matches) is discarded.

    Returns:
        A new list of page objects; empty when nothing matched or when the
        per-repo limit was hit.
    """
    my_pages = []

    title = self.normalized_title
    # Titles that are too short or too common are deliberately not used for
    # matching; the checks short-circuit in the same order as before.
    if title and not title_is_too_short(title) and not title_is_too_common(title):
        # Shallow copy: the page objects stay the very same instances, but
        # appending DOI matches below must not grow the list handed back by
        # page_matches_by_title_filtered (it may be cached or shared -- not
        # visible from here), which would also skew the per-repo check.
        my_pages = list(self.page_matches_by_title_filtered)

    # do dois last, because the objects are actually the same, not copies,
    # and then they get the doi reason
    for my_page in self.page_matches_by_doi:
        my_page.scrape_evidence = u"oa repository (via OAI-PMH doi match)"
        my_pages.append(my_page)

    # eventually only apply this filter to matches by title, once pages only
    # includes the doi when it comes straight from the pmh record
    title_match_repo_ids = [p.repo_id for p in self.page_matches_by_title_filtered]
    if max_pages_from_one_repo(title_match_repo_ids) >= 3:
        my_pages = []
        logger.info(
            u"matched too many pages in one repo, not allowing matches")

    return my_pages
def query_for_num_pub_matches(self):
    """Count the ``Pub`` rows whose normalized title equals this object's.

    Returns:
        The number of matching rows, or ``-1`` (without querying) when the
        normalized title is too common or too short.
    """
    # NOTE(review): imported at call time rather than module level --
    # presumably to avoid a circular import; confirm before moving.
    from pmh_record import title_is_too_common, title_is_too_short
    from pub import Pub

    title = self.normalized_title

    # it takes too long to query for things like "tablecontents"
    skip_query = title_is_too_common(title) or title_is_too_short(title)
    if skip_query:
        logger.info(u"title is too common or too short, not scraping")
        return -1

    same_title_pub_ids = db.session.query(Pub.id).filter(
        Pub.normalized_title == title)
    return same_title_pub_ids.count()