Example #1
def page_create():
    # deserialize and validate the request body against the page schema
    page_fields = page_schema.load(request.json)

    new_page = Page()
    new_page.page_number = page_fields["page_number"]
    new_page.page_content = page_fields["page_content"]
    new_page.book_id = page_fields["book_id"]

    # persist the new page and return its serialized representation
    db.session.add(new_page)
    db.session.commit()

    return jsonify(page_schema.dump(new_page))
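For context, the handler assumes a Marshmallow page_schema exposing the three fields it reads. A minimal sketch of such a schema (the field names come from the snippet above; the types and required flags are assumptions):

from marshmallow import Schema, fields

class PageSchema(Schema):
    # field names taken from page_create(); the types are assumed
    page_number = fields.Integer(required=True)
    page_content = fields.String(required=True)
    book_id = fields.Integer(required=True)

page_schema = PageSchema()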
Example #2
  def __load_root(self) -> None:
    """Loads every file under the root folder (os.walk also descends into subdirectories)."""
    # walk the tree and load each file as a separate page
    for root, _, files in os.walk(self.__root):
      for fileName in files:
        filePath = normalize_path(os.path.join(root, fileName))
        newPage = Page(fileName)

        if self.__linkMgr is not None:
          pageLinks = self.__linkMgr.get_links_for(fileName)
          newPage.add_links(pageLinks)

        self.__read_file(filePath, newPage)
        self.__pages.add(newPage)
Example #3
 def setUp(self):
     self.__class__.page1 = Page("Page 1",
                                 "www.index1.com",
                                 "This is page number 1",
                                 OccurrenceList(["www.index2.com",
                                                 "www.index3.com"]))
     self.__class__.page2 = Page("Page 2",
                                 "www.index2.com",
                                 "This is page number 2",
                                 OccurrenceList(["www.index1.com",
                                                 "www.index3.com"]))
     self.__class__.page3 = Page("Page 3",
                                 "www.index3.com",
                                 "This is page number 3",
                                 OccurrenceList(["www.index1.com",
                                                 "www.index2.com"]))
Example #4
  def get_word_frequency_score(self, page: Page, query: List[int]) -> int:
    """Sums the occurrence count on the page of every word id in the query."""
    score = 0

    for word in query:
      score += page.get_word_count(word)

    return score
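Assuming page.get_word_count returns 0 for words that never occur, as the accumulation implies, the same score can be written as a one-liner:

  def get_word_frequency_score(self, page: Page, query: List[int]) -> int:
    """Equivalent formulation using a generator expression."""
    return sum(page.get_word_count(word) for word in query)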
Example #5
 def get_results(self):
     results = []
     for url in self._pages:
         page_infos = requests.get(url)
         page = Page(website=self._website,
                     link=url,
                     headers=self._get_headers(page_infos),
                     content=self._get_content(page_infos))
         results.append(page)
     return results
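Note that Example #7 below is a hardened revision of this method: it strips stray newline characters from each URL and passes an explicit timeout to requests.get, without which a single unresponsive server can hang the whole crawl.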
Example #6
  def get_document_location_score(self, page: Page, query: List[int]) -> int:
    """Scores a page by how early the query words appear; lower is better."""
    score = 0

    for word in query:
      wordIndex = page.get_word_index(word)
      if wordIndex >= 0:
        score += wordIndex + 1
      else:
        # heavy penalty for query words that do not appear on the page
        score += 100000

    return score
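The two scores point in opposite directions: word frequency grows with relevance while document location shrinks, so rankers typically map both onto a common 0-to-1 scale before combining them. A minimal sketch of that normalization, an assumption about how these scores are consumed rather than code from the original project:

  def normalize_scores(self, scores: List[int], small_is_better: bool) -> List[float]:
    """Rescales raw scores to [0, 1] so that 1.0 always marks the best page."""
    if small_is_better:
      # location-style scores: the smallest raw score gets 1.0
      min_score = min(scores)
      return [min_score / max(score, 1) for score in scores]
    # frequency-style scores: the largest raw score gets 1.0
    max_score = max(max(scores), 1)
    return [score / max_score for score in scores]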
Example #7
 def get_results(self):
     results = []
     for url in self._pages:
         url = url.replace('\n', '')  # scraped URLs may carry stray newlines
         # verify=False skips TLS certificate checks; the timeout keeps one
         # unresponsive server from stalling the whole crawl
         page_infos = requests.get(url, verify=False, timeout=60)
         page = Page(website=self._website,
                     link=url,
                     headers=self._get_headers(page_infos),
                     content=self._get_content(page_infos))
         results.append(page)
     return results
Example #8
def seed_db():
    from models.Book import Book
    from models.User import User
    from models.Page import Page
    from main import bcrypt
    from faker import Faker
    import random

    faker = Faker()
    users = []

    # five test users sharing a known password
    for i in range(5):
        user = User()
        user.email = f"test{i}@test.com"
        user.password = bcrypt.generate_password_hash("123456").decode("utf-8")
        db.session.add(user)
        users.append(user)

    db.session.commit()

    # twenty books with fake titles and authors, each owned by a random user
    for _ in range(20):
        book = Book()
        book.title = faker.catch_phrase()
        book.user_id = random.choice(users).id
        book.author = faker.name()
        db.session.add(book)

    db.session.commit()

    # one hundred pages spread across the books; assumes the books above
    # received the sequential ids 1..20 on a fresh database
    for _ in range(100):
        page = Page()
        page.page_content = faker.text(1500)
        page.page_number = random.randint(1, 300)
        page.book_id = random.randint(1, 20)
        db.session.add(page)

    db.session.commit()

    print("Tables seeded")
Example #9
def scrape(site, prefix="https://en.wikipedia.org"):
    """
    Scrape a webpage by url or html
    :param site: Site object which specifies the url
    or the html document of the website
    :param prefix: prefix url to prepend to relative links
    found in the webpage, default is "https://en.wikipedia.org"
    :return: A Page object containing title, url, text and
    links_to from the web page scraped
    """
    if site.html_doc:
        with open(HTML_DIR.child(site.html_doc)) as html:
            page = BeautifulSoup(html, 'html.parser')
    else:
        page = BeautifulSoup(urlopen(site.url), 'html.parser')

    # collect every outgoing link, prefixing relative URLs
    links_to = OccurrenceList()
    for link in page.find_all('a'):
        url_link = link.get('href')
        if url_link:
            if not url_link.startswith("http"):
                url_link = prefix + url_link
            links_to = links_to.union(OccurrenceList([url_link]))

    # remove script and style tags so only visible text remains
    for script in page("script"):
        script.extract()

    for style in page("style"):
        style.extract()

    # remove HTML comments
    for comment in page.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()

    return Page(page.title.string, site.url, page.text, links_to)
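A hypothetical call, assuming Site carries the url and html_doc attributes the docstring describes (the constructor signature is an assumption):

# scrape a live page; pass html_doc=None so the url branch is taken
site = Site(url="https://en.wikipedia.org/wiki/Web_crawler", html_doc=None)
page = scrape(site)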
Example #10
    def parse_chapter(self, response):
        # chapter info lives in the "#tab-2 #leftis" block; each field arrives
        # as a "<strong>Label</strong> value" line, hence the split below
        tab2 = response.css("#tab-2")
        texts = tab2.css('#leftis').get().split("\n")
        title_en = texts[3].split("</strong> ")[1]
        edition = texts[6].split("</strong> ")[1]

        chapterRes = Chapter.select().where(
            (Chapter.title_en == title_en)
            & (Chapter.edition_number == int(edition)))
        if chapterRes:
            print("Chapter " + edition + " of " + title_en +
                  " already exists!")
            return

        mangaRes = Manga.get(Manga.title_en == title_en)
        if mangaRes:
            chapter = Chapter()
            chapter.edition_number = edition
            chapter.views = texts[8].split("</strong> ")[1]
            chapter.title_en = mangaRes.title_en
            chapter.title = mangaRes.title
            chapter.cover = mangaRes.cover
            chapter.about = mangaRes.about
            chapter.manga_id = mangaRes.id
            chapter.save()

            # add pages, skipping any image already stored for this chapter
            images = response.css('#sqmang > img')
            for image in images:
                imageSRC = image.xpath("@src").get()
                if Page.select().where((Page.image == imageSRC)
                                       & (Page.chapter_id == chapter.id)):
                    continue
                page = Page()
                page.chapter_id = chapter.id
                page.image = imageSRC
                page.save()
            print("Chapter " + edition + " of " + title_en +
                  " has been added")
        else:
            print(title_en + " manga doesn't exist")
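One caveat worth noting: peewee's Model.get raises a DoesNotExist exception when no row matches, so the else branch above is only reachable if the call is wrapped elsewhere. Manga.get_or_none(Manga.title_en == title_en) returns None instead and would make the truthiness check meaningful.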
Example #11
 def __read_file(self, path: str, page: Page) -> None:
   """Reads the given file and fills the provided page with the words it contains."""
   # todo: better naming for this method
   # use a context manager so the file handle is closed after reading
   with open(path) as file:
     fileContents = file.read().strip()
   for word in fileContents.split():
     page.add_word(self.__get_word_id(word))
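Taken together, Examples #2, #4, #6 and #11 imply a Page with roughly the following surface. This is a hypothetical reconstruction from the calls above, not the project's actual class:

from typing import List

class Page:
  """Sketch of the Page interface implied by the snippets above."""

  def __init__(self, name: str) -> None:
    self.name = name
    self.links: List[str] = []
    self.words: List[int] = []  # word ids in document order

  def add_links(self, links: List[str]) -> None:
    self.links.extend(links)

  def add_word(self, word_id: int) -> None:
    self.words.append(word_id)

  def get_word_count(self, word_id: int) -> int:
    return self.words.count(word_id)

  def get_word_index(self, word_id: int) -> int:
    # first position of the word, or -1 when absent, as Example #6 expects
    return self.words.index(word_id) if word_id in self.words else -1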