def page_create():
    """Create a Page from the JSON request body, persist it, and return it serialized."""
    fields = page_schema.load(request.json)
    new_page = Page()
    # Copy the validated fields onto the model one by one.
    for attr in ("page_number", "page_content", "book_id"):
        setattr(new_page, attr, fields[attr])
    db.session.add(new_page)
    db.session.commit()
    return jsonify(page_schema.dump(new_page))
def __load_root(self) -> None:
    """Loads all files in the root folder, each becoming a separate page."""
    for directory, _, names in os.walk(self.__root):
        for name in names:
            new_page = Page(name)
            # Attach known links for this file before reading its content,
            # when a link manager is configured.
            if self.__linkMgr is not None:
                new_page.add_links(self.__linkMgr.get_links_for(name))
            full_path = normalize_path(os.path.join(directory, name))
            self.__read_file(full_path, new_page)
            self.__pages.add(new_page)
def setUp(self):
    """Build three mutually-linked fixture pages shared at class level."""
    cls = self.__class__
    specs = (
        ("Page 1", "www.index1.com", "This is page number 1",
         ["www.index2.com", "www.index3.com"]),
        ("Page 2", "www.index2.com", "This is page number 2",
         ["www.index1.com", "www.index3.com"]),
        ("Page 3", "www.index3.com", "This is page number 3",
         ["www.index1.com", "www.index2.com"]),
    )
    # Each page links to the other two; store them as page1..page3 on the class.
    for idx, (title, url, text, links) in enumerate(specs, start=1):
        setattr(cls, f"page{idx}", Page(title, url, text, OccurrenceList(links)))
def get_word_frequency_score(self, page: Page, query: List[int]) -> int:
    """Return the total occurrence count in *page* of every word id in *query*."""
    return sum(page.get_word_count(word) for word in query)
def get_results(self):
    """Fetch every tracked URL and return a Page object per response."""
    collected = []
    for link in self._pages:
        response = requests.get(link)
        collected.append(
            Page(
                website=self._website,
                link=link,
                headers=self._get_headers(response),
                content=self._get_content(response),
            )
        )
    return collected
def get_document_location_score(self, page: Page, query: List[int]) -> int:
    """Score *page* by how early each query word appears.

    A word found at index i contributes i + 1; a missing word contributes a
    large penalty (100000) so absent words dominate the score.
    """
    total = 0
    for word in query:
        position = page.get_word_index(word)
        total += position + 1 if position >= 0 else 100000
    return total
def get_results(self):
    """Fetch every tracked URL (newlines stripped) and return one Page per response."""
    collected = []
    for link in self._pages:
        # URLs may carry embedded newlines from the source list; remove them.
        link = link.replace('\n', '')
        # NOTE(review): verify=False disables TLS certificate checking —
        # confirm this is intentional for these targets.
        response = requests.get(link, verify=False, timeout=60)
        collected.append(
            Page(
                website=self._website,
                link=link,
                headers=self._get_headers(response),
                content=self._get_content(response),
            )
        )
    return collected
def seed_db():
    """Populate the database with 5 users, 20 books and 100 pages of fake data."""
    from models.Book import Book
    from models.User import User
    from models.Page import Page
    from main import bcrypt
    from faker import Faker
    import random

    faker = Faker()

    # Users: predictable emails, all sharing one hashed password.
    users = []
    for i in range(5):
        u = User()
        u.email = f"test{i}@test.com"
        u.password = bcrypt.generate_password_hash("123456").decode("utf-8")
        db.session.add(u)
        users.append(u)
    db.session.commit()

    # Books: each owned by a random seeded user.
    for _ in range(20):
        b = Book()
        b.title = faker.catch_phrase()
        b.user_id = random.choice(users).id
        b.author = faker.name()
        db.session.add(b)
    db.session.commit()

    # Pages: attached to a random book id (assumes book ids 1..20 exist).
    for _ in range(100):
        p = Page()
        p.page_content = faker.text(1500)
        p.page_number = random.randint(1, 300)
        p.book_id = random.randint(1, 20)
        db.session.add(p)
    db.session.commit()

    print("Tables seeded")
def scrape(site, prefix="https://en.wikipedia.org"):
    """
    Scrape a webpage by url or html

    :param site: Site object which specifies the url or the html document of the website
    :param prefix: prefix url to prefix to links found in the webpage, default is "https://en.wikipedia.org"
    :return: A Page object containing title, url, text and links_to from the web page scraped
    """
    # Parse either the local HTML fixture or the live URL.
    if site.html_doc:
        with open(HTML_DIR.child(site.html_doc)) as html:
            page = BeautifulSoup(html, 'html.parser')
    else:
        page = BeautifulSoup(urlopen(site.url), 'html.parser')

    # Collect outgoing links, absolutizing relative hrefs with the prefix.
    links_to = OccurrenceList()
    for link in page.find_all('a'):
        url_link = link.get('href')
        if url_link:
            if not url_link.startswith("http"):
                url_link = prefix + url_link
            links_to = links_to.union(OccurrenceList([url_link]))

    # Strip non-content nodes so page.text contains only visible text.
    # Fix: the original called page.script.extract() / page.style.extract()
    # inside these loops, which only worked by accident because each extract()
    # removes the *first* matching tag; extract the iterated tag directly.
    for script in page("script"):
        script.extract()
    for style in page("style"):
        style.extract()
    comments = page.findAll(text=lambda text: isinstance(text, Comment))
    for comment in comments:
        comment.extract()

    return Page(page.title.string, site.url, page.text, links_to)
def parse_chapter(self, response):
    """Parse a chapter page: insert the Chapter row (unless it already exists)
    and one Page row per chapter image."""
    # The info panel holds "<strong>label</strong> value" lines; extract each
    # value once instead of re-splitting the same expression everywhere.
    texts = response.css("#tab-2").css('#leftis').get().split("\n")
    title_en = texts[3].split("</strong> ")[1]
    edition = texts[6].split("</strong> ")[1]
    views = texts[8].split("</strong> ")[1]

    # Skip chapters already stored for this manga title + edition number.
    existing = Chapter.select().where(
        (Chapter.title_en == title_en) &
        (Chapter.edition_number == int(edition)))
    if existing:
        print("Chapter " + str(edition) + " of " + title_en +
              " is already exist!")
        return

    mangaRes = Manga.get(Manga.title_en == title_en)
    if not mangaRes:
        print(title_en + " Manga Dosen't exist")
        return

    chapter = Chapter()
    chapter.edition_number = edition
    chapter.views = views
    chapter.title_en = mangaRes.title_en
    chapter.title = mangaRes.title
    chapter.cover = mangaRes.cover
    chapter.about = mangaRes.about
    chapter.manga_id = mangaRes.id
    chapter.save()

    # Add pages, skipping images already recorded for this chapter.
    for image in response.css('#sqmang > img'):
        imageSRC = image.xpath("@src").get()
        duplicate = Page.select().where(
            (Page.image == imageSRC) & (Page.chapter_id == chapter.id))
        if duplicate:
            continue
        page = Page()
        page.chapter_id = chapter.id
        page.image = imageSRC
        page.save()

    print("Chapter " + str(edition) + " of " + title_en +
          " has been added")
def __read_file(self, path: str, page: Page) -> None:
    """Reads the given file and fills the provided page with the words contained.

    Each whitespace-separated token is mapped to a word id and added to *page*.

    Fix: the original used `open(path).read()` which leaks the file handle;
    a context manager now guarantees it is closed. The redundant `.strip()`
    was dropped — `str.split()` already ignores leading/trailing whitespace.
    """
    with open(path) as source:
        contents = source.read()
    for word in contents.split():
        page.add_word(self.__get_word_id(word))