def old_link(self, url):
    """
    If a link to (a session variant of) url is already known on this
    page (as stored in the database), returns the stored Link,
    otherwise returns None.
    """
    if not hasattr(self, '_links'):
        cur = db.dict_cursor()
        query = "SELECT * FROM links WHERE source_id = %s"
        cur.execute(query, (self.source_id,))
        debug(5, cur._last_executed)
        self._links = [Link(source=self, **li) for li in cur.fetchall()]
        #debug(2, 'xxx old links:\n%s', '\n'.join([li.url for li in self._links]))
    for li in self._links:
        if li.url == url:
            return li
    s_url = self.strip_session_variables(url)
    if s_url != url:
        for li in self._links:
            if s_url == self.strip_session_variables(li.url):
                return li
    return None
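# Hypothetical usage sketch (not from the original source): during a scrape
# run, old_link lets us reuse the stored Link record for a url that was seen
# on an earlier visit; 'source' and 'found_url' are stand-in names, and the
# Link(...) kwargs are assumed from the Link(source=self, **li) call above.
#
#   link = source.old_link(found_url)
#   if link is None:
#       link = Link(source=source, url=found_url)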
def test_query(caplog):
    # smoke test: passes as long as the query executes without raising
    cur = db.dict_cursor()
    query = "SELECT * FROM sources"
    cur.execute(query)
    sources = cur.fetchall()
    assert True
def load_from_db(self, url=''):
    url = url or self.url
    if not url:
        raise TypeError("need source url to load Source from db")
    cur = db.dict_cursor()
    query = "SELECT * FROM sources WHERE urlhash = MD5(%s)"
    cur.execute(query, (url,))
    debug(5, cur._last_executed)
    sources = cur.fetchall()
    if sources:
        for k, v in sources[0].items():
            setattr(self, k, v)
    else:
        debug(4, "%s not in sources table", url)
def load_from_db(self, url='', source_id=0):
    url = url or self.url
    source_id = source_id or self.source_id
    if not url or not source_id:
        raise TypeError("need url and source_id to load Link from db")
    cur = db.dict_cursor()
    query = "SELECT * FROM links WHERE urlhash = MD5(%s) AND source_id = %s LIMIT 1"
    cur.execute(query, (url, source_id))
    debug(5, cur._last_executed)
    links = cur.fetchall()
    if links:
        for k, v in links[0].items():
            setattr(self, k, v)
    else:
        debug(4, "link to %s not in database", url)
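# Note on the urlhash lookups above: the tables store urlhash = MD5(url), so
# the same hash can be computed client-side. A minimal sketch, assuming the
# connection charset is UTF-8 so that MySQL's MD5() hashes the same bytes as
# Python does:
#
#   import hashlib
#   def urlhash(url):
#       return hashlib.md5(url.encode('utf-8')).hexdigest()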
def next_source():
    """return the next source from db that's due to be checked"""
    min_age = datetime.now() - timedelta(hours=16)
    min_age = min_age.strftime('%Y-%m-%d %H:%M:%S')
    cur = db.dict_cursor()
    query = ("SELECT * FROM sources WHERE"
             " sourcetype != 'blog'"  # ignore rss feeds
             " AND (last_checked IS NULL OR last_checked < %s)"
             " ORDER BY last_checked LIMIT 1")
    cur.execute(query, (min_age,))
    debug(4, cur._last_executed)
    sources = cur.fetchall()
    if sources:
        return Source(**sources[0])
    else:
        debug(1, "all pages recently checked")
        return None
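# Hypothetical driver loop (illustration only): next_source() returns None
# once every non-blog source has been checked within the last 16 hours, so a
# scheduler can simply poll until the queue is empty; process() is a stand-in
# name for whatever scraping routine is applied to each source.
#
#   while True:
#       source = next_source()
#       if source is None:
#           break
#       process(source)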
def update():
    """
    re-train classifier; the training corpus is taken from the database.
    """
    debug(3, "re-training philosophy classifier")
    cur = db.dict_cursor()
    query = "SELECT cat_id FROM cats WHERE label=%s LIMIT 1"
    cur.execute(query, ('philosophy',))
    cat_id = cur.fetchall()[0]['cat_id']
    query = ("SELECT D.*, M.strength"
             " FROM docs D, docs2cats M"
             " WHERE M.doc_id = D.doc_id AND M.cat_id = %s AND M.is_training = 1")
    cur.execute(query, (cat_id,))
    debug(4, cur._last_executed)
    rows = cur.fetchall()
    if not rows:
        raise Exception('no training documents for philosophy classifier')
    docs = [Doc(**row) for row in rows]
    classes = [row['strength'] for row in rows]
    clf.train(docs, classes)
    clf.save()
def load_from_db(self, doc_id=None, url=None):
    doc_id = doc_id or self.doc_id
    url = url or self.url
    cur = db.dict_cursor()
    if doc_id:
        query = "SELECT * FROM docs WHERE doc_id = %s"
        cur.execute(query, (doc_id,))
    elif url:
        query = "SELECT * FROM docs WHERE urlhash = MD5(%s)"
        cur.execute(query, (url,))
    else:
        raise TypeError("need doc_id or url to load doc from db")
    debug(5, cur._last_executed)
    docs = cur.fetchall()
    if docs:
        for k, v in docs[0].items():
            setattr(self, k, v)
        return True
    else:
        debug(4, "no doc with id %s or url %s in database", doc_id, url)
        return False
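# Hypothetical usage (illustration only), assuming Doc accepts column values
# as keyword arguments, as in the Doc(**row) calls elsewhere in this code:
#
#   doc = Doc(url='http://example.com/paper.pdf')
#   if not doc.load_from_db():
#       debug(3, "%s has not been processed before", doc.url)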
def get_duplicate(doc):
    """
    returns a document from db that closely resembles doc, or None
    """
    # This is non-trivial because duplicates can have slightly
    # different titles (e.g. with and without <i>), different
    # filesize and wordcount (manuscript vs published version),
    # different authors and abstracts (due to parser mistakes,
    # author name variants, etc.).
    debug(5, "checking for duplicates")
    where = ['doc_id != %s']
    values = [doc.doc_id]
    m = re.search(r'\w+', doc.title)  # first title word
    if m:
        where.append('title LIKE %s')
        values.append('%'+m.group()+'%')
    m = re.search(r'(\w+)(?:,|$)', doc.authors)  # first author surname
    if m:
        where.append('authors LIKE %s')
        values.append('%'+m.group(1)+'%')
    cur = db.dict_cursor()
    query = "SELECT * FROM docs WHERE " + ' AND '.join(where)
    cur.execute(query, values)
    debug(5, cur._last_executed)
    dupes = cur.fetchall()
    for dupe in dupes:
        debug(5, "candidate: %s, '%s'", dupe['authors'], dupe['title'])
        if abs(doc.numwords - dupe['numwords']) / doc.numwords > 0.2:
            debug(5, "length not close enough")
            continue
        sm = SequenceMatcher(None, doc.content, dupe['content'])
        match_ratio = sm.ratio()
        if match_ratio < 0.1:  # sic
            debug(5, "content too different, ratio %s", match_ratio)
            continue
        debug(4, "duplicate: %s, '%s'", dupe['authors'], dupe['title'])
        return Doc(**dupe)
    return None
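# For reference: difflib's SequenceMatcher.ratio() returns a similarity score
# in [0, 1], so the 0.1 cut-off above is deliberately permissive (hence the
# '# sic'). A quick standalone illustration:
#
#   from difflib import SequenceMatcher
#   sm = SequenceMatcher(None, "philosophy of mind", "philosophy of language")
#   sm.ratio()  # ~0.75: the long shared prefix dominates the score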
logger = logging.getLogger('opp')
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(logging.DEBUG)
logger.addHandler(ch)

ap = argparse.ArgumentParser()
ap.add_argument('url', help='(part of) url of source page to scrape')
ap.add_argument('-d', '--debug_level', default=1, type=int)
ap.add_argument('-k', '--keep', action='store_true', help='keep temporary files')
ap.add_argument('-l', '--link', type=str, help='only process this link')
args = ap.parse_args()
debug.debuglevel(args.debug_level)

cur = db.dict_cursor()
query = "SELECT * FROM sources WHERE url LIKE %s LIMIT 1"
cur.execute(query, ('%'+args.url+'%',))
sources = cur.fetchall()
if not sources:
    raise Exception(args.url+' not in sources table')
source = scraper.Source(**sources[0])

if args.link:
    browser = scraper.Browser(use_virtual_display=True)
    browser.goto(source.url)
    source.set_html(browser.page_source)
    try:
        el = browser.find_element_by_xpath("//a[contains(@href, '{}')]".format(args.link))
    except Exception as e:
        sys.exit('no link containing '+args.link+' on '+source.url)