Example #1
 def __init__(self):
     self.Inicio = Page()
     self.Inicio2 = Page()
     self.NBinsert = NodoB()
     self.Enlace = Page()
     self.Pivote = False
     self.Bandera = False
     self.Bandera2 = False
     """"""
Example #2
def pull():
    url = str(input("Enter the url for the first page of the reader: "))
    firstPage = Page(url)
    page = firstPage
    # Follow the chapter chain until a page has no successor
    while page.get_next():
        page = Page(page.get_next())
    print("End of chapters")

    path = os.path.join(os.getcwd(), 'novels', firstPage.title, "lastRead.txt")
    with open(path, 'w', encoding='utf-8') as info:
        info.write("{}\n".format(firstPage.title))
        info.write("{}\n".format(firstPage.name))
        info.write("0")  # file.write() expects a string, not an int
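
The pull() routine assumes a Page class that exposes get_next(), title and name. That class is not part of this example, so the following is only a minimal sketch of the interface pull() appears to rely on; the attribute semantics are guesses.

class Page:
    """Hypothetical stand-in for the Page class used by pull()."""

    def __init__(self, url):
        self.url = url
        self.title = "untitled"  # the real class presumably parses this from the page
        self.name = url          # chapter name, likewise parsed in the real class

    def get_next(self):
        # Return the URL of the next chapter, or None when there is no next page.
        return None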
Example #3
    def DivPage(self, clave, root, position):
        posi = 0
        PosPiv = 0
        if (position <= 2):
            PosPiv = 2
        else:
            PosPiv = 3
        Medio = Page(Ramas=[None, None, None, None, None],
                     Claves=[None, None, None, None],
                     Cuentas=0)
        posi = PosPiv + 1  # start copying one position past the pivot
        while (posi != 5):
            i = ((posi - PosPiv) - 1)
            j = posi - 1
            Medio.Claves[i] = root.Claves[j]
            Medio.Ramas[posi - PosPiv] = root.Ramas[posi]
            posi += 1
        Medio.setCuentas(4 - PosPiv)
        root.setCuentas(PosPiv)

        if (posi < 2):
            self.ClaveInsert(clave, root, position)
        else:
            self.ClaveInsert(clave, Medio, (position - PosPiv))

        self.setNBInsert(root.Claves[root.getCuentas() - 1])
        Medio.Ramas[0] = root.Ramas[root.getCuentas()]
        valor = root.getCuentas() - 1
        root.setCuentas(valor)
        self.setEnlace(Medio)
        """"""
def scrape(site, prefix="https://en.wikipedia.org"):
    page = BeautifulSoup(urlopen(site.url), 'html.parser')
    links_to = OccurrenceList()
    for link in page.find_all('a'):
        if link.get('href'):
            url_link = link.get('href')
            if not url_link.startswith("http"):
                url_link = prefix + url_link
            links_to = links_to.union(OccurrenceList([url_link]))

    """
    Remove script tags
    """
    for script in page("script"):
        page.script.extract()

    """
    Remove style tags
    """
    for style in page("style"):
        page.style.extract()

    """
    Remove comments
    """
    comments = page.findAll(text=lambda text: isinstance(text, Comment))
    for comment in comments:
        comment.extract()

    return Page(page.title.string, site.url, page.text, links_to)
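
scrape() only needs its site argument to expose a url attribute; Page, OccurrenceList, BeautifulSoup, urlopen and Comment must already be imported in the real module. A usage sketch with a throwaway Site type (the type and the target URL below are assumptions, not part of the original code):

from collections import namedtuple

Site = namedtuple("Site", "url")  # stand-in; the real crawler presumably has its own type
page = scrape(Site("https://en.wikipedia.org/wiki/Web_crawler"))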
Example #5
 def get_page(self, page_id):
     self.cursor.execute(
         "SELECT title, in_page_title, summary, content FROM pages WHERE page_id = %s;",
         (page_id,))
     title, in_page_title, summary, content = self.cursor.fetchall()[0]
     return Page(title, in_page_title, summary, content)
Example #6
def add_page_to_memory(new_process, page_number):
    global global_time
    # Check available frames
    if memory_available(M) <= 0:
        # Swap
        swap(new_process, page_number)
    else:
        # Insert
        new_frame = -1
        for index, memory in enumerate(M):
            if memory == [-1, -1]:
                M[index] = [new_process, page_number]
                new_frame = index
                break
        if page_number not in processes[new_process].table:
            # Create page object
            new_page_obj = Page(page_number, new_frame, 1)
            # Insert into process table
            processes[new_process].insert_page(new_page_obj)
        else:
            #Change S memory
            S[processes[new_process].table[page_number].frame] = [-1, -1]
            #Changes page characteristics in processes
            processes[new_process].table[page_number].frame = new_frame
            processes[new_process].table[page_number].bit_memory = 1
        algorithm[PAGE_REPLACEMENT_ALGORITHM].insert(new_process, page_number)
    #adds 1 sec to global time for adding page to M
    global_time += 10
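
Both this function and swap() in Example #20 manipulate Page objects through a page number, a frame index, an in-memory bit and an ID field. The class definition is not shown in these examples, so the sketch below only illustrates the record shape those snippets assume.

class Page:
    """Hypothetical page-table entry assumed by add_page_to_memory() and swap()."""

    def __init__(self, page_number, frame, bit_memory):
        self.ID = page_number         # logical page number (swap() reads .ID)
        self.frame = frame            # frame index in M or S, -1 when unassigned
        self.bit_memory = bit_memory  # 1 = resident in real memory, 0 = swapped out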
Example #7
def SIMULATE(Frames, Pages, PRL, ZLO):
    """
    PRL - Page Reference Lenght
    :param int Frames:
    :param int Pages:
    :param int PRL:
    :param boolean ZLO:
    :return:
    """

    pages = []
    for i in range(Pages):
        pages.append(Page(i))

    pageReferences = []

    if not ZLO:

        for index in range(PRL):
            pageReferences.append(pages[random.randint(0, len(pages) - 1)])
    else:
        for index in range(PRL):
            odw = pages[index - 1].pageN - 5 + random.randint(0, 1 + 5 * 2)
            pageReferences.append(pages[max(0, min(Pages - 1, odw))])  # clamp to a valid index

    executeFIFO(Frames, pageReferences)
    executeLRU(Frames, pageReferences)
    executeALRU(Frames, pageReferences)
    executeOPT(Frames, pageReferences)
    executeRAND(Frames, pageReferences)
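
A possible invocation, assuming random has been imported and the Page class and execute* helpers referenced above live in the same module (none of them are shown here):

if __name__ == "__main__":
    # 3 frames, 10 distinct pages, 100 references, locality-based generation
    SIMULATE(Frames=3, Pages=10, PRL=100, ZLO=True)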
Example #8
 def readPage(self, page_name):
     if self.page_window is None:
         self.page_window = Page(page_name, 1)
         self.page_window.show()
     else:
         self.page_window.close()
         self.page_window = None
Example #9
 def ouv(self, path):
     if path[-7:] == ".projpy":
         self.sommets = []
         fichier = open(path, "r")
         i = 0
         somm = {}
         for ligne in fichier.readlines():
             ligne = ligne.replace("\n", "")
             if ligne == "---":
                 i = i + 1
             else:
                 if i == 0:
                     parts = ligne.split(":")
                     if parts[1] == "U":
                         carac = parts[2].split(",")
                         somm[parts[0]] = Utilisateur(carac[0], carac[1], int(carac[2]))
                         self.add_sommet(somm[parts[0]])
                     if parts[1] == "P":
                         somm[parts[0]] = Page(parts[2])
                         self.add_sommet(somm[parts[0]])
                 if i == 1:
                     firstUtil = ligne.split(":")[0]
                     others = ligne.split(":")[1].split(",")
                     for followed in others:
                         somm[firstUtil].connect(somm[followed])
                 if i == 2:
                     thePage = ligne.split(":")[0]
                     theAdmins = ligne.split(":")[1].split(",")
                     for admin in theAdmins:
                         somm[thePage].add_admin(somm[admin])
         fichier.close()
     else:
         print("mauvais format.\nextention .projpy")
Example #10
 def __init__(self):
     self.max_memory = int(max_pages)
     self.main_memory = []  #list of Pages
     for i in range(self.max_memory):
         page = Page()
         self.main_memory.append(page)
     self.disk_memory = "vm.txt"
     self.command_list = command_list
Example #11
def get_all_friends(total_page_number, original_url):
    all_friends = []
    for page_num in range(0, total_page_number):
        url = original_url + '&curpage=' + str(page_num)
        page = Page(url)
        parser = PageParser(page.fetch())
        all_friends.extend(parser.get_friends())
    return all_friends
Example #12
    def __add_existing_page(self, configs=None):
        """ adds a page and fills with the given widgets """
        page = Page(self.stack, self)

        if configs is not None:
            for config in configs:
                self.__add_existing_widget(page, config)

        self.stack.addWidget(page)
Example #13
def view(page):
    """View multiple Pages."""

    path = str(page)
    print(path)
    x = Page(path)
    x.disp()
    input()
Example #14
def init():
    global driver, soup, page_type, page, website
    driver = webdriver.Chrome()
    soup = ""
    page_type = ""
    page = Page()
    page = Article()
    page = YoutubePage()
    page = YoutubeVideo()
    website = ""
Example #15
 def InsertarNodo(self, usuario, nombre, clave, root):
     self.Add(usuario, nombre, clave, root)
     if self.getPivote():
         self.Inicio = Page(Ramas=[None, None, None, None, None],
                            Claves=[None, None, None, None],
                            Cuentas=0)
         self.Inicio.setCuentas(1)
         self.Inicio.Claves[0] = self.getInsert()
         self.Inicio.Ramas[0] = root
         self.Inicio.Ramas[1] = self.getEnlace()
     """"""
Example #16
 def find_apt_urls(self, given_url):
     apt_links = self.gen_site(given_url).findAll(
         'a', {'class': 'placardTitle js-placardTitle '})
     apt_urls = []
     for link in apt_links:
         gen_url = link.get('href')
         if not Page(gen_url).check_invalid():
             apt_urls.append(gen_url)
     # get unique urls by converting it to a set, then convert it back to a list
     apt_urls = list(set(apt_urls))
     return apt_urls
Example #17
def recursiveScrape(url, visitedLinks, domain):
    if url in visitedLinks : return True
    try:
        if domain not in url : return True
    except TypeError:
        return True
    print(url)
    # page object for the current page
    curPage = Page()
    # open url
    source = requests.get(url).text
    soup = BeautifulSoup(source, 'lxml')

    # set curPage elements
    try:
        curPage.title = soup.find('title').text
    except AttributeError:
        curPage.title = None
    curPage.url = url
    curPage.childLinks = []
    hyperLinks = soup.find_all('a')
    hyperLinks = list(filter((None).__ne__, hyperLinks))

    # get all links on a page
    for link in hyperLinks:
        # set hyperLink to href of link
        hyperLink = link.get('href')
        # remove '.' or '..' from the link
        if hyperLink is None:
            pass
        elif len(hyperLink) >= 2 and hyperLink[0:2] == '..':
            hyperLink = hyperLink.replace('..', '')
        elif len(hyperLink) == 1 and hyperLink[0] == '.':
            hyperLink = hyperLink.replace('.', '')
        
        # if not an external url add domain to hyperLink
        if not hyperLink:  # skip links that are missing or were reduced to an empty string
            pass
        elif hyperLink[0:4] != "http" and hyperLink[0] != '/':
            hyperLink = 'http://' + domain + '/' + hyperLink
        elif hyperLink[0:4] != "http":
            hyperLink = 'http://' + domain + hyperLink
        curPage.childLinks.append(hyperLink)

    # write curPage object to file
    curPage.appendToFile(domain)

    # add current link to visitedLinks
    visitedLinks.append(url)

    # for all child links in the page
    for link in curPage.childLinks:
        # call this function on that link
        recursiveScrape(link, visitedLinks, domain)
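
Because the function calls itself for every child link, crawling a moderately deep site can exceed Python's default recursion limit, so a driver might raise it before starting. The start URL and domain below are placeholders, not values from the original code.

import sys

sys.setrecursionlimit(10000)  # recursion depth grows with the number of pages visited
visitedLinks = []
recursiveScrape("http://example.com/", visitedLinks, "example.com")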
Example #18
 def insertarCapeta():
         usuario = str(request.form['user'])
         nombre = str(request.form['nombre'])
         if(nombre == "nulo"):
                 return "No inserta nada"
         else:
                 LaRaiz = ListaUsuarios.ObtenerCarpetaRaiz(usuario)
                 LaRaiz.CrearNodo(usuario, nombre, Page(Ramas=[None,None, None,None,None], Claves=[None,None,None,None],Cuentas=0))
                 LaRaiz.CrearArchivo()
                 elestring =  LaRaiz.CrearCarpetaHTML()
         return str(elestring)
Example #19
def get_wikipage(data):
    room_code = data['roomCode']
    page_name = data['wikiPage']
    room = rooms[room_code]

    page = Page(page_name, rooms[room_code].target_page).export()
    emit('updatePage', page)
    winner = room.update_game(request.sid, page_name)

    if winner:
        emit('endRound', winner.export(), broadcast=True, room=room_code)
Example #20
def swap(process_to_insert_ID, process_to_insert_page_number):
    global global_time
    global swaps
    #gets process to switch with
    process_to_switch_ID, process_to_switch_page_number = algorithm[
        PAGE_REPLACEMENT_ALGORITHM].pop()
    S_frame = -1
    #If process needs to switch from S to M
    if process_to_insert_page_number in processes[process_to_insert_ID].table:
        S_frame = processes[process_to_insert_ID].table[
            process_to_insert_page_number].frame
        swaps += 1
        processes[process_to_insert_ID].page_faults += 1
    #If it is a new page/process
    else:
        S_frame = -1
        for index, memory in enumerate(S):
            if memory == [-1, -1]:
                S_frame = index
                break
        # Create default page object
        new_page_obj = Page(process_to_insert_page_number, -1, -1)
        # Insert into process table
        processes[process_to_insert_ID].insert_page(new_page_obj)
    page_to_S = processes[process_to_switch_ID].table[
        process_to_switch_page_number]
    ##inserts new/needed process into main memory and algorithm
    processes[process_to_insert_ID].table[
        process_to_insert_page_number].frame = page_to_S.frame
    processes[process_to_insert_ID].table[
        process_to_insert_page_number].bit_memory = 1
    algorithm[PAGE_REPLACEMENT_ALGORITHM].insert(
        process_to_insert_ID, process_to_insert_page_number)
    #updates process table of switched page
    processes[process_to_switch_ID].table[
        process_to_switch_page_number].frame = S_frame
    processes[process_to_switch_ID].table[
        process_to_switch_page_number].bit_memory = 0
    #updates S memory
    S[S_frame] = [process_to_switch_ID, page_to_S.ID]
    #updates M memory
    M[processes[process_to_insert_ID].table[process_to_insert_page_number].
      frame] = [process_to_insert_ID, process_to_insert_page_number]
    print(
        "Página", process_to_insert_page_number, "del proceso",
        process_to_insert_ID, "swappeada al marco",
        processes[process_to_insert_ID].table[process_to_insert_page_number].
        frame, "del área real")
    print("Página", process_to_switch_page_number, "del proceso",
          process_to_switch_ID, "swappeada al marco", S_frame,
          "del área de swapping")
    #adds 1 sec to global timer because of the extra operation of swapping out "process_to_switch"
    global_time += 10
Example #21
    def __init__(self, filename):

        # Open the PDF file
        docfile = open(filename, 'rb')

        # Read the file object
        document = PyPDF2.PdfFileReader(docfile)

        # Make a list of Page objects
        pageList = []
        for pg in range(document.getNumPages()):
            pageList.append(Page(document, pg))
        self.pages = pageList
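
PdfFileReader and getNumPages() belong to the PyPDF2 1.x/2.x API; in PyPDF2 3.x and its successor pypdf they were replaced. If this snippet had to run against a current install, the equivalent would read roughly as follows (a sketch only, keeping the same Page wrapper):

from pypdf import PdfReader  # pip install pypdf

reader = PdfReader(filename)
pageList = [Page(reader, pg) for pg in range(len(reader.pages))]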
Example #22
    def post(self, path):
        if not self.user:
            self.error(400)
            return

        content = self.request.get("content")
        old_page = Page.by_path(path).get()
        if  not (old_page or content):
            return
        elif not old_page or old_page.content != content:
            p = Page(parent = Page.parent_key(path), content = content)
            p.put()
        else:
            path = re.sub(r'_edit$', '', path)
            self.redirect(path)
Example #23
def update():
    update_soup()
    update_website_base()
    update_page_type()
    if ("nytimes" in globals.website):
        if (globals.page_type == "Article"):
            globals.page = Article(globals.soup)
        else:
            globals.page = Page(globals.soup, globals.page_type)
    elif ("youtube" in globals.website):
        if (globals.page_type == "YoutubeVideo"):
            globals.page = YoutubeVideo(globals.soup)
        else:
            globals.page = YoutubePage(globals.soup, globals.page_type)
    playsound.playsound("confirmation_beep.mp3")
Example #24
 def prep_pagefiles(self):
     """
     TODO
     """
     self.page_files = sorted(self.page_files,
                              key=lambda s: [
                                  int(t) if t.isdigit() else t.lower()
                                  for t in re.split(r'(\d+)', s)
                              ])
     self.repeated = list()
     self.found_pagenumbers = list()
     self.page_list = []
     for i, page_filepath in enumerate(self.page_files):
         self.repeated.append(set())
         self.found_pagenumbers.append(None)
         self.page_list.append(Page(page_filepath, self, i))
Example #25
 def __init__(self, pid, size, arrival_time, page_exec_order):
     super(Process, self).__init__()
     self.pid = pid
     self.size = size
     self.pages = []
     self.page_table = []
     self.next_to_be_executed = 0
     self.arrival_time = arrival_time
     self.page_exec_order = page_exec_order
     if self.size % OSParams.page_size != 0:
         page_count = (self.size / OSParams.page_size) + 1
     else:
         page_count = (self.size / OSParams.page_size)
     for i in xrange(0, page_count):
         self.pages.append(Page(self.pid, i))
         self.page_table.append(-1)
Example #26
def createPage(livre: Livre, numero: int, fin: bool = False):
    """Permet de creer les dossier et sous dossiers d'un page"""
    titre = livre.titre
    if projetExiste(titre):
        if fin:
            path = "livres/" + titre + "/pages/pages_fin"
            with open(path + "/" + str(numero) + ".txt", "w"):
                pass  # just create an empty marker file; the with block closes it
        if not pageExiste(titre, numero):
            os.mkdir("livres/" + titre + "/pages/" + str(numero))
            Page(numero, "", livre, fin)

        else:
            print("cette page existe deja")
    else:
        print("inexistant")
Example #27
    def crawler(self, search_type, node):
        while len(self.to_visit) > 0 and not self.stop_crawl and self.count < MAX_URLS:
            print('crawling')
            # check if link has already been crawled
            crawled = node.url in self.visited_set
            if not crawled:
                self.graph.add_node(node, self.id)
                self.count += 1
                self.id += 1
                self.visited_set.add(node.url)
                if node.parents_list:
                    # get the node's parent node
                    source_node = node.parents_list.pop()
                    # update the node depth
                    node.node_depth = source_node.node_depth + 1
                    if node.id is not None and source_node.id is not None:
                        # create an edge between the current node and its parent node
                        self.graph.add_edge(source_node.id, node.id)
                        # set node's parent node
                        node.parent_node = source_node.id

                # create new Page object
                pg = Page(node.url)

                # if node limit has not been reached
                if node.node_depth < self.depth_limit:
                    links = pg.get_links(node.url)
                    links = validate_url(links, self.count, MAX_URLS)
                    # remove any duplicate links present
                    links = remove_duplicates(links)
                    self.crawl_links(node, links)

            # check if the stop keyword is found on the newly crawled page
            if not crawled and self.keyword and pg.find_keyword(self.keyword):
                node.found = True
                self.stop_crawl = True
            #self.end_crawl()
            if self.stop_crawl:
                break

            # get next node to crawl
            if self.count < MAX_URLS:
                node = self.get_next()

        self.end_crawl()
        return self.visited_set
Example #28
    def iterateOverAllDocs(self):
        print "started"
        docsnameslist = self.all_documents_name("data/Words")
        linksList = self.all_documents_name("data/Links")
        nos_of_documents = len(docsnameslist)
        #print nos_of_documents

        # Check all the documents
        for i in range(nos_of_documents):
            page = Page()
            filename = docsnameslist[i].split('/')[2]
            print filename
            page.url = "/wiki/" + filename
            stop_words = set(stopwords.words('english'))

            # Then opens the docsnameslist and read it as file
            with open(docsnameslist[i], "r") as f:
                # We use delimiter a one-character string used to separate fields
                # in our case is white space
                reader = csv.reader(f, delimiter=" ", quoting=csv.QUOTE_NONE)
                for row in reader:
                    for word in row:
                        if word not in stop_words:
                            if word != "":
                                word_exist = self.keyExists(word)
                                if word_exist:
                                    word_id = self.getIdForWord(word)
                                else:
                                    word_id = len(self.wordToId)
                                    self.wordToId[word] = word_id

                                page.words.append(word_id)

            with open(linksList[i], "r") as f:
                reader = csv.reader(f, delimiter="\n", quoting=csv.QUOTE_NONE)
                for row in reader:
                    for link in row:
                        page.links.append(link)

            self.pages.append(page)
            print "indexed" + str(i)
        return None
Example #29
def scan_page(url, base_url):

    if url in scanned_urls:
        return

    r = requests.get(url)
    html = r.text
    soup = BeautifulSoup(html, "html.parser")  # name the parser explicitly to avoid bs4's warning
    for link in soup.find_all("a"):
        new_link = prepare_link(url, link.get("href"))
        if not is_outer_url(new_link, base_url):
            if soup.title:
                crr_page_title = soup.title.text
            crr_page_url = url
            session.add(Page(url=crr_page_url))
            #crr_page_desc = soup.find("meta", {"name": "description"})['content']
            print(new_link)
            scanned_urls.append(new_link)
            scan_page(new_link, base_url)
    return scanned_urls
Example #30
    def __iter__(self):
        first = True
        # Now iterate over the individual records of the WARC archive
        for record in self._warc:
            # The first record is skipped; it only carries metadata about the WARC archive.
            if (first):
                first = False
                continue

            # read the record content
            # careful: the content comes back as bytes
            content = record.payload.read()
            response = bytes()
            # split off the HTTP response headers from the body
            index = content.find(b'\r\n\r\n')
            if (index > 0):
                response = content[:index]
                content = content[index:]

            # determine the URI
            uri = ''
            if ('WARC-Target-URI' in record):
                uri = record['WARC-Target-URI']

            # the record ID would be useful as well
            warc_id = ''
            if ('WARC-Record-ID' in record):
                warc_id = record['WARC-Record-ID'][10:-1]
            # but if the ID is missing, we have to generate one ourselves
            else:
                warc_id = str(uuid.uuid1())

            # try to create the page; if decoding the content fails, the page is useless to us
            try:
                yield Page(page=content,
                           url=uri,
                           http_response=response.decode(),
                           page_id=warc_id,
                           old_style=self._old_style)
            except ValueError:
                pass