def add_page_pair_to_database(from_page, to_page, limit):
    """Record a link from one page URL to another, creating missing Page rows.

    The source page is always created when absent. A missing destination
    page is only created while the Page table stays under ``limit`` rows
    (a limit below 1 disables the cap); otherwise the pair is skipped.
    """
    with db_lock:
        source_id = session.query(Page.id).filter(Page.url == from_page).scalar()
        dest_id = session.query(Page.id).filter(Page.url == to_page).scalar()

        if source_id is None:
            source = Page(url=from_page, text="", rank=0)
            session.add(source)
            session.flush()
            source_id = source.id

        if dest_id is None:
            # Creating a brand-new destination is subject to the page cap.
            if not (limit < 1 or limit > session.query(Page).count()):
                return
            dest = Page(url=to_page, text="", rank=0)
            session.add(dest)
            session.flush()
            dest_id = dest.id

        session.add(Relation(page_id=source_id, destination_id=dest_id))
        session.commit()
def load_pages():
    """Load pages from seed data into database.

    Each line of seed_data/pages.txt is pipe-delimited as
    ``page_id|user_id|page|hidden``. Empty fields are dropped from the
    constructor kwargs so the model's column defaults apply.
    """
    with open("seed_data/pages.txt") as pages:
        for row in pages:
            fields = row.rstrip().split("|")
            kwargs = dict(
                page_id=fields[0],
                user_id=fields[1],
                page=fields[2],
                # Direct comparison yields the bool; no conditional needed.
                hidden=(fields[3] == "True"),
            )
            # Drop empty-string fields in one pass (replaces the former
            # collect-then-delete double loop).
            kwargs = {key: value for key, value in kwargs.items() if value != ""}
            db.session.add(Page(**kwargs))
            db.session.commit()
def process_list(self, page):
    """Extract URLs from a parsed list page and enqueue new sub-pages.

    For each URL on a PARSED page that is not yet in the database, a new
    Page row is created (relative URLs are prefixed with the configured
    scrape root) and queued for processing. The page is then marked
    PROCESSED. Pages in any other state are skipped.
    """
    queue = self.application.queue
    logging.debug('Processing list %s' % page.url)
    processed = 0
    if page.state == Page.State.PARSED:
        with session_scope(self.application.Session) as session:
            for url in page.get_contents():
                # EXISTS query returns a single one-column row; unpack it.
                (page_exists, ), = session.query(
                    exists().where(Page.url == url))
                if not page_exists:
                    # Relative URL: anchor it at the scrape root.
                    if 'http' not in url:
                        url = self.application.config.SCRAPE_ROOT_PATH + url
                    subpage = Page(url=url)
                    session.add(subpage)
                    session.commit()
                    queue.add_page(subpage.page_id)
                    processed += 1
        page.state = Page.State.PROCESSED
        logging.debug('Processed list %s' % page.url)
        print(
            'Processed %s urls! Fetching movies & getting more movie urls...'
            % processed)
    else:
        # BUGFIX: typo in log message ('Aleady' -> 'Already').
        logging.debug('Already processed list %s' % page.url)
def create_book_page(page_text, page_image, email):
    """Persist a new book page for the user identified by *email*."""
    new_page = Page(
        text=page_text,
        image=page_image,
        book_id=get_book_id(email),
    )
    db.session.add(new_page)
    db.session.commit()
    return new_page
def add_page(self):
    """Create a child Page from the request parameters.

    Responds with HTTP 400 when the 'type' parameter is missing; 'order'
    defaults to the current child count of the master key.
    """
    params = self.request.params
    position = params.get('order', Page.get_children_count(self.master_key))
    parent_key = self._get_parent_key()
    kind = params.get('type', None)
    if not kind:
        self.error(400)
        return
    created = Page(parent=parent_key, page_type=kind, order=int(position))
    self._get_page(created)
def create_cover_page(page_text, cover_image, email):
    """Create a cover of book.

    Attaches the cover to the last Book id returned by the query, or 0
    when no books exist. The *email* parameter is currently unused.
    """
    rows = db.session.query(Book.id).all()
    # BUGFIX: the old loop bound book_id to the whole Row 1-tuple rather
    # than the id inside it. Unpack the last row's scalar instead.
    # NOTE(review): without an ORDER BY the "last" row is
    # database-dependent — consider ordering by Book.id explicitly.
    book_id = rows[-1][0] if rows else 0
    cover_page = Page(text=page_text, cover_image=cover_image, book_id=book_id)
    db.session.add(cover_page)
    db.session.commit()
    return cover_page
def getPage(self, article, oldid=None):
    """Look up *article* in the in-memory wiki database.

    Returns NoPage for an unknown article, OldPage for a historic
    revision when *oldid* is given, otherwise the current Page.
    """
    global wikiDatabase
    if article not in wikiDatabase:
        return NoPage(article=article, controller=self)
    entry = wikiDatabase[article]
    if oldid:
        # entry[1] holds the revision history; each revision is a tuple
        # whose first element is the wikitext.
        old_text = entry[1][int(oldid)][0]
        return OldPage(article=article, wikitext=old_text, controller=self)
    return Page(article=article, wikitext=entry[0], controller=self)
def add_page():
    """Create a Page from the JSON request body and return its JSON form."""
    payload = json.loads(request.data.decode())
    new_page = Page(payload.get('title'), payload.get('content'))
    db.session.add(new_page)
    db.session.commit()
    return get_page_json(new_page.id)
def update_pages():
    """Replace the current user's displayed pages with the submitted set."""
    user_id = session["current_user"]
    user = User.query.get(user_id)
    selected = request.form.getlist("pages")
    print(selected)
    # Wipe the previous selection, then insert the new one.
    Page.query.filter_by(user_id=user_id).delete()
    for name in selected:
        db.session.add(Page(user_id=user_id, page=name))
        db.session.commit()
    print(user.pages)
    return redirect("users/{}/my_homepage".format(user_id))
def post(self):
    """Create a static page from the submitted form fields.

    On success the page is saved, a confirmation is flashed and the
    request is redirected to the admin page list. On validation/save
    failure the error text is flashed instead (no redirect is issued).
    """
    name = self.get_argument("name", None)
    slug = self.get_argument("slug", None)
    content = self.get_argument("content", "")
    template = self.get_argument("template", "staticpage.html")
    page = Page()
    page.name = name
    page.slug = slug
    page.content = content
    page.template = template
    try:
        page.validate()
        page.save()
        self.flash(u"栏目%s添加成功" % name)
        self.redirect("/admin/pages")
        return
    # BUGFIX: "except Exception, ex:" is Python-2-only syntax; the
    # "as" form is valid on Python 2.6+ and Python 3.
    except Exception as ex:
        self.flash(str(ex))
def request_page(self, url):
    """Fetch *url*, caching the response body on disk.

    A previously cached copy (most recent Page row by mtime, when its
    file still exists) is returned without hitting the network. Live
    requests are throttled to at most one per 0.5 seconds. Returns the
    raw response bytes, or '<html></html>' when the request fails.
    """
    global last_requst_time
    import hashlib
    from sqlalchemy import desc

    cached = self.__session_crawler.query(Page).filter_by(
        url=url).order_by(desc(Page.mtime)).first()
    if cached:
        cached_path = os.path.join(self.__data_dir, cached.file_path)
        if os.path.isfile(cached_path):
            # BUGFIX: close the cache file after reading (the old code
            # leaked the handle via open(...).read()).
            with open(cached_path, 'rb') as cache_file:
                return cache_file.read()

    now = datetime.datetime.now()
    print('[{0}] Request {1}'.format(now.strftime("%Y-%m-%d %H:%M:%S.%f"),
                                     url))
    # BUGFIX: timedelta.microseconds holds only the sub-second component,
    # so the old throttle misjudged gaps of a second or more. Use
    # total_seconds() for the true elapsed time.
    elapsed = (now - last_requst_time).total_seconds()
    if elapsed < 0.5:
        time.sleep(0.5 - elapsed)
    request_args = {'cookies': self.get_cookies(url)}
    r = requests.get(url, **request_args)
    last_requst_time = now
    if r is None or r.status_code != 200 or r.content is None:
        return '<html></html>'

    content_hash = hashlib.md5(r.content).hexdigest()
    file_path = os.path.abspath(
        os.path.join(self.__data_dir,
                     self.url_to_file_path(url, content_hash)))
    try:
        os.makedirs(os.path.dirname(file_path))
    except OSError:
        # Directory already exists; creation is best-effort.
        pass
    page = Page(url=url, size=len(r.content), file_path=file_path,
                content_hash=content_hash, mtime=now)
    self.__session_crawler.add(page)
    self.__session_crawler.commit()
    # BUGFIX: close the data file after writing (handle was leaked).
    with open(os.path.join(self.__data_dir, file_path), 'wb') as out_file:
        out_file.write(r.content)
    return r.content
def start_crawler(self):
    """Run the crawl: seed the queue and database, fan out worker
    threads, wait for completion, and report timing statistics."""
    started_at = time.time()

    # Fetch robots.txt for the target host before crawling.
    self.robot_parser.set_url("http://" + self.base + "/robots.txt")
    self.robot_parser.read()

    # Seed the frontier with the root URL and record it in the database.
    self.q.put((0, self.website))
    root_page = Page(url=self.website, text="", rank=0)
    session.add(root_page)
    session.commit()

    workers = []
    for _ in range(self.threads_number):
        worker = threading.Thread(target=self.worker)
        worker.daemon = True
        worker.start()
        workers.append(worker)

    # Block until every queued URL has been processed.
    self.q.join()

    # One sentinel per worker signals shutdown; then reap the threads.
    for _ in range(self.threads_number):
        self.q.put(None)
    for worker in workers:
        worker.join()
    session.commit()

    # Discard any leftover sentinels/items.
    self.q.queue.clear()

    end = time.time()
    print("With", self.threads_number, "threads elapsed : ", end - started_at)
    print("Total number of pages processed :", self.current_pages_processed)
def init(self):
    """Seed the processing queue: from ROOT_NODES when the database is
    empty, otherwise from pages not yet PROCESSED; then, if anything
    was queued, run the pipeline and build the search index."""
    queued = 0
    with session_scope(self.Session) as session:
        if session.query(Page).count() == 0:
            # Fresh database: create one Page row per configured root URL.
            for root_url in self.config.ROOT_NODES:
                root = Page(url=root_url)
                session.add(root)
                session.commit()
                self.queue.add_page(root.page_id)
                queued += 1
        else:
            # Resume: re-queue every page that never finished processing.
            unfinished = session.query(Page).filter(
                Page.state != Page.State.PROCESSED).all()
            for pending in unfinished:
                self.queue.add_page(pending.page_id)
                queued += 1
        session.expunge_all()
    if queued != 0:
        print('No movie data in our system. We need to scrape IMDB for data...')
        print('Started pipeline! Added %s root pages to processing queue' % queued)
        self.queue.join()
        print("Finished processing!")
        self.search_module.build_index()
# Bootstrap/seed script: creates the SQLAlchemy handle and inserts the
# default homepage record when it is not already present.
# NOTE(review): the script appears to continue past this point (Image
# seeding begins at the final statement).
from flask_sqlalchemy import SQLAlchemy
from model import Page, User, Role, Image
from setting import app

db = SQLAlchemy(app)
# db.drop_all()
# db.create_all()

# Insert the 'homepage' Page only when no row with that title exists.
page = Page()
x=page.query.filter_by(title='homepage').first()
if x is None:
    page.title='homepage'
    page.contents='<h1> Selamat datang di dunia python </h1>'
    page.is_homepage=True
    db.session.add(page)
    db.session.commit()

# x1=page.query.filter_by(title='Hallo Dunia').first()
# if x1 is None:
# page.title='Hallo Dunia'
# page.contents='<h1> Hallo Dunia ? apa kabar... </h1>'
# page.is_homepage=False
# page.url='page/hallo-dunia'
# page.image_id=1
# db.session.add(page)
# db.session.commit()

# Start of image seeding (continues beyond this chunk).
gbr=Image()