def page_parser(self, url, page_content):
    """Print the id and path of every paper link found in a page.

    Every match of ``self.PAPER_PATH_RE`` in ``page_content`` is searched
    with ``self.PAPER_ID_RE``; the first capture group (the paper id) and
    the path are printed as ``"<id> <path>"``. Paths that contain no id
    are reported as ``"not <path>"`` and skipped.

    Args:
        url: Address of the page. Not used here; kept so the signature
            matches ``content_parser``.
        page_content: Text of the page to scan.
    """
    con = helpers.mssqlconn()
    try:
        for path in self.PAPER_PATH_RE.findall(page_content):
            match = self.PAPER_ID_RE.search(path)
            if match is None:
                # Same failure marker content_parser uses; %r keeps the
                # report printable whatever the path contains.
                print('not %r' % (path,))
                continue
            paper_id = match.group(1)
            # The database write is currently disabled; only the parsed
            # values are printed.
            # con.execute_non_query(self.SUB_SQL, (int(paper_id), path))
            # Single-argument form prints the same text on Python 2 and 3.
            print('%s %s' % (paper_id, path))
    finally:
        # Release the connection even when scanning the page fails.
        con.close()
def content_parser(self, url, page_content):
    """Extract one paper's metadata from a page and insert it as a row.

    The paper id is the first capture group of ``self.PAPER_ID_RE`` in
    ``url``, converted to ``int``. Title, author, abstract, keywords and
    class are read from ``page_content`` through ``pq`` selectors; any
    empty field is stored as ``'-----'``. The download URL is always
    stored as ``'-----'``. The row is written with ``self.INSERT_SQL``.
    On success the paper id is printed; if the insert raises, ``"not <id>"``
    is printed instead and the error is not propagated.

    Args:
        url: Address of the page; must contain the paper id.
        page_content: Markup of the page to parse.

    Raises:
        AttributeError: If ``self.PAPER_ID_RE`` does not match ``url``.
        ValueError: If the captured id is not a valid integer.
    """
    placeholder = '-----'
    con = helpers.mssqlconn()
    try:
        p = pq(page_content)
        paper_id = int(self.PAPER_ID_RE.search(url).group(1))
        title = p('h1').text() or placeholder
        author = p('.author a').text() or placeholder
        # NOTE(review): '.abstrack' looks like a misspelling of 'abstract',
        # but it may match the site's actual markup -- confirm before changing.
        abstract = p('.abstrack').text() or placeholder
        keywords = p('.keywords a').text() or placeholder
        # Reads the "value" attribute of the #wxClass element
        # (assumes pq is pyquery's PyQuery -- TODO confirm).
        paper_class = p('#wxClass').attr.value or placeholder
        download_url = placeholder
        row = (paper_id, title, author, abstract, keywords, paper_class,
               download_url)
        try:
            con.execute_non_query(self.INSERT_SQL, row)
        except Exception:
            # Best effort: report the failed id and carry on.
            print('not %s' % paper_id)
        else:
            print(paper_id)
    finally:
        # The connection was previously never closed in this method.
        con.close()