def __init__(self):
    self.Inicio = Page()
    self.Inicio2 = Page()
    self.NBinsert = NodoB()
    self.Enlace = Page()
    self.Pivote = False
    self.Bandera = False
    self.Bandera2 = False
import os

def pull():
    url = str(input("Enter the url for the first page of the reader: "))
    firstPage = Page(url)
    page = firstPage
    # Walk the chapter chain; the loop ends when there is no next page.
    while page.get_next():
        page = Page(page.get_next())
    print("End of chapters")
    path = os.path.join(os.getcwd(), 'novels', firstPage.title, "lastRead.txt")
    with open(path, 'w', encoding='utf-8') as info:
        info.write("{}\n".format(firstPage.title))
        info.write(firstPage.name)
        info.write("0")  # write() needs a string; the original passed an int
def DivPage(self, clave, root, position):
    # Split a full page of an order-5 B-tree: keys above the pivot move to
    # a new page (Medio) and the median key is promoted.
    if position <= 2:
        PosPiv = 2
    else:
        PosPiv = 3
    Medio = Page(Ramas=[None, None, None, None, None],
                 Claves=[None, None, None, None], Cuentas=0)
    posi = PosPiv + 1  # was "Posi": the capital-P typo left posi at 0
    while posi != 5:
        Medio.Claves[posi - PosPiv - 1] = root.Claves[posi - 1]
        Medio.Ramas[posi - PosPiv] = root.Ramas[posi]
        posi += 1
    Medio.setCuentas(4 - PosPiv)
    root.setCuentas(PosPiv)
    # Insert the new key on whichever side of the pivot it belongs
    # (the original tested "posi < 2", which is never true after the loop).
    if position <= 2:
        self.ClaveInsert(clave, root, position)
    else:
        self.ClaveInsert(clave, Medio, position - PosPiv)
    # The median key moves up; its right subtree becomes Medio's leftmost branch.
    self.setNBInsert(root.Claves[root.getCuentas() - 1])
    Medio.Ramas[0] = root.Ramas[root.getCuentas()]
    root.setCuentas(root.getCuentas() - 1)
    self.setEnlace(Medio)
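# Illustrative sketch only (not part of the tree code): DivPage's index math
# partitions a full page's four keys around the pivot so each half keeps two.
def _split_example():
    keys = [10, 20, 30, 40]           # a full order-5 page
    for position in range(5):         # candidate slot for the incoming key
        PosPiv = 2 if position <= 2 else 3
        left = keys[:PosPiv]          # stays in root (root.setCuentas(PosPiv))
        right = keys[PosPiv:]         # copied to Medio (4 - PosPiv keys)
        print(position, left, right)  # the new key then lands on one side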
from urllib.request import urlopen

from bs4 import BeautifulSoup, Comment

def scrape(site, prefix="https://en.wikipedia.org"):
    page = BeautifulSoup(urlopen(site.url), 'html.parser')
    links_to = OccurrenceList()
    for link in page.find_all('a'):
        if link.get('href'):
            url_link = link.get('href')
            # Resolve relative links against the wiki prefix.
            if not url_link.startswith("http"):
                url_link = prefix + url_link
            links_to = links_to.union(OccurrenceList([url_link]))
    # Remove script tags
    for script in page("script"):
        script.extract()
    # Remove style tags
    for style in page("style"):
        style.extract()
    # Remove comments
    comments = page.findAll(text=lambda text: isinstance(text, Comment))
    for comment in comments:
        comment.extract()
    return Page(page.title.string, site.url, page.text, links_to)
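# Usage sketch: scrape() only reads a `.url` attribute off its argument, so a
# tiny stand-in class (hypothetical, named _Site here) is enough to call it.
class _Site:
    def __init__(self, url):
        self.url = url

wiki_page = scrape(_Site("https://en.wikipedia.org/wiki/Web_crawler"))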
def get_page(self, page_id):
    self.cursor.execute(
        "SELECT title, in_page_title, summary, content FROM pages WHERE page_id = %s;",
        (page_id,))
    # fetchone() is equivalent to fetchall()[0] when exactly one row matches.
    title, in_page_title, summary, content = self.cursor.fetchone()
    return Page(title, in_page_title, summary, content)
def add_page_to_memory(new_process, page_number):
    global global_time
    # No free frame in real memory: swap a page out first.
    if memory_available(M) <= 0:
        swap(new_process, page_number)
    else:
        # Find the first free frame in M and claim it.
        new_frame = -1
        for index, memory in enumerate(M):
            if memory == [-1, -1]:
                M[index] = [new_process, page_number]
                new_frame = index
                break
        if page_number not in processes[new_process].table:
            # First reference to this page: create it and add it to the
            # process's page table.
            new_page_obj = Page(page_number, new_frame, 1)
            processes[new_process].insert_page(new_page_obj)
        else:
            # The page was in swap space S: free its S frame and point it at M.
            S[processes[new_process].table[page_number].frame] = [-1, -1]
            processes[new_process].table[page_number].frame = new_frame
            processes[new_process].table[page_number].bit_memory = 1
        # swap() registers the page with the algorithm itself, so the insert
        # only happens on the non-swap path.
        algorithm[PAGE_REPLACEMENT_ALGORITHM].insert(new_process, page_number)
    # Advance the global clock for the cost of loading a page into M.
    global_time += 10
def SIMULATE(Frames, Pages, PRL, ZLO):
    """
    PRL - Page Reference Length
    :param int Frames:
    :param int Pages:
    :param int PRL:
    :param boolean ZLO: locality of reference on/off
    :return:
    """
    pages = []
    for i in range(Pages):
        pages.append(Page(i))
    pageReferences = []
    if not ZLO:
        # Uniformly random references.
        for index in range(PRL):
            pageReferences.append(pages[random.randint(0, len(pages) - 1)])
    else:
        # References jittered around the current index (locality of reference).
        for index in range(PRL):
            odw = pages[index - 1].pageN - 5 + random.randint(0, 1 + 5 * 2)
            # clamp to a valid index (the original min(Pages, odw) could
            # produce an out-of-range index equal to Pages)
            pageReferences.append(pages[max(0, min(Pages - 1, odw))])
    executeFIFO(Frames, pageReferences)
    executeLRU(Frames, pageReferences)
    executeALRU(Frames, pageReferences)
    executeOPT(Frames, pageReferences)
    executeRAND(Frames, pageReferences)
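# Usage sketch (parameter values are illustrative): compare all five
# replacement algorithms on one uniformly random reference string and one
# locality-biased string.
import random

random.seed(0)  # make the generated reference strings reproducible
SIMULATE(Frames=4, Pages=20, PRL=1000, ZLO=False)
SIMULATE(Frames=4, Pages=20, PRL=1000, ZLO=True)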
def readPage(self, page_name):
    # Toggle the page window: open it if closed, close it if open.
    if self.page_window is None:
        self.page_window = Page(page_name, 1)
        self.page_window.show()
    else:
        self.page_window.close()
        self.page_window = None
def ouv(self, path):
    if path[-7:] == ".projpy":
        self.sommets = []
        fichier = open(path, "r")
        i = 0
        somm = {}
        # "---" lines separate the three file sections:
        # 0 = vertices, 1 = follow links, 2 = page admins.
        for ligne in fichier.readlines():
            ligne = ligne.replace("\n", "")
            if ligne == "---":
                i = i + 1
            else:
                if i == 0:
                    champs = ligne.split(":")  # renamed from "str", which shadowed the builtin
                    if champs[1] == "U":
                        carac = champs[2].split(",")
                        somm[champs[0]] = Utilisateur(carac[0], carac[1], int(carac[2]))
                        self.add_sommet(somm[champs[0]])
                    if champs[1] == "P":
                        somm[champs[0]] = Page(champs[2])
                        self.add_sommet(somm[champs[0]])
                if i == 1:
                    firstUtil = ligne.split(":")[0]
                    others = ligne.split(":")[1].split(",")
                    for followed in others:
                        somm[firstUtil].connect(somm[followed])
                if i == 2:
                    thePage = ligne.split(":")[0]
                    theAdmins = ligne.split(":")[1].split(",")
                    for admin in theAdmins:
                        somm[thePage].add_admin(somm[admin])
        fichier.close()
    else:
        print("wrong format.\nexpected extension: .projpy")
def __init__(self):
    self.max_memory = int(max_pages)
    self.main_memory = []  # list of Pages
    for i in range(self.max_memory):
        page = Page()
        self.main_memory.append(page)
    self.disk_memory = "vm.txt"
    self.command_list = command_list
def get_all_friends(total_page_number, original_url):
    all_friends = []
    for page_num in range(total_page_number):
        url = original_url + '&curpage=' + str(page_num)
        page = Page(url)
        parser = PageParser(page.fetch())
        all_friends.extend(parser.get_friends())
    return all_friends
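# Usage sketch (URL and page count are hypothetical): the listing is paged
# via a `curpage` query parameter appended to the base url.
friends = get_all_friends(5, 'http://example.com/friends?uid=42')
print(len(friends), 'friends collected')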
def __add_existing_page(self, configs=None):
    """Adds a page and fills it with the given widgets."""
    page = Page(self.stack, self)
    if configs is not None:
        for config in configs:
            self.__add_existing_widget(page, config)
    self.stack.addWidget(page)
def view(page):
    """View the Page at the given path."""
    path = str(page)
    print(path)
    x = Page(path)
    x.disp()
    input()  # wait for the user before returning
def init():
    global driver, soup, page_type, page, website
    driver = webdriver.Chrome()
    soup = ""
    page_type = ""
    # A single placeholder suffices: the original re-assigned `page` to
    # Page(), Article(), YoutubePage() and YoutubeVideo() in a row, so only
    # the last assignment took effect. update() picks the real type later.
    page = Page()
    website = ""
def InsertarNodo(self, usuario, nombre, clave, root):
    self.Add(usuario, nombre, clave, root)
    if self.getPivote():
        # The root split: create a new root holding the promoted key.
        self.Inicio = Page(Ramas=[None, None, None, None, None],
                           Claves=[None, None, None, None], Cuentas=0)
        self.Inicio.setCuentas(1)
        # getNBInsert matches the setNBInsert used during the split
        # (the original called an undefined getInsert).
        self.Inicio.Claves[0] = self.getNBInsert()
        self.Inicio.Ramas[0] = root
        self.Inicio.Ramas[1] = self.getEnlace()
def find_apt_urls(self, given_url):
    apt_links = self.gen_site(given_url).findAll(
        'a', {'class': 'placardTitle js-placardTitle '})
    apt_urls = []
    for link in apt_links:
        gen_url = link.get('href')
        if not Page(gen_url).check_invalid():
            apt_urls.append(gen_url)
    # get unique urls by converting to a set, then back to a list
    apt_urls = list(set(apt_urls))
    return apt_urls
def recursiveScrape(url, visitedLinks, domain):
    if url in visitedLinks:
        return True
    try:
        if domain not in url:
            return True
    except TypeError:
        return True
    print(url)
    # page object for the current page
    curPage = Page()
    # open url
    source = requests.get(url).text
    soup = BeautifulSoup(source, 'lxml')
    # set curPage elements
    try:
        curPage.title = soup.find('title').text
    except AttributeError:
        curPage.title = None
    curPage.url = url
    curPage.childLinks = []
    hyperLinks = soup.find_all('a')
    hyperLinks = list(filter((None).__ne__, hyperLinks))
    # get all links on the page
    for link in hyperLinks:
        # set hyperLink to href of link
        hyperLink = link.get('href')
        if hyperLink is None:
            continue  # skip anchors without an href
        # remove '.' or '..' from the link
        if len(hyperLink) >= 2 and hyperLink[0:2] == '..':
            hyperLink = hyperLink.replace('..', '')
        elif len(hyperLink) == 1 and hyperLink[0] == '.':
            hyperLink = hyperLink.replace('.', '')
        if not hyperLink:
            continue  # the href was only dots; nothing left to follow
        # if not an external url, add the domain to hyperLink
        if hyperLink[0:4] != "http" and hyperLink[0] != '/':
            hyperLink = 'http://' + domain + '/' + hyperLink
        elif hyperLink[0:4] != "http":
            hyperLink = 'http://' + domain + hyperLink
        curPage.childLinks.append(hyperLink)
    # write curPage object to file
    curPage.appendToFile(domain)
    # add current link to visitedLinks
    visitedLinks.append(url)
    # recurse into every child link found on the page
    for link in curPage.childLinks:
        recursiveScrape(link, visitedLinks, domain)
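# Usage sketch (domain and start url are illustrative): the caller owns the
# visitedLinks list, so the same list can be shared across start points.
visited = []
recursiveScrape('http://example.com/', visited, 'example.com')
print(len(visited), 'pages scraped')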
def insertarCapeta():
    usuario = str(request.form['user'])
    nombre = str(request.form['nombre'])
    if nombre == "nulo":
        return "Nothing was inserted"
    else:
        LaRaiz = ListaUsuarios.ObtenerCarpetaRaiz(usuario)
        LaRaiz.CrearNodo(usuario, nombre,
                         Page(Ramas=[None, None, None, None, None],
                              Claves=[None, None, None, None], Cuentas=0))
        LaRaiz.CrearArchivo()
        elestring = LaRaiz.CrearCarpetaHTML()
        return str(elestring)
def get_wikipage(data):
    room_code = data['roomCode']
    page_name = data['wikiPage']
    room = rooms[room_code]
    page = Page(page_name, room.target_page).export()
    emit('updatePage', page)
    # Check whether this navigation reached the target page.
    winner = room.update_game(request.sid, page_name)
    if winner:
        emit('endRound', winner.export(), broadcast=True, room=room_code)
def swap(process_to_insert_ID, process_to_insert_page_number):
    global global_time
    global swaps
    # Ask the replacement algorithm which page to evict from M.
    process_to_switch_ID, process_to_switch_page_number = algorithm[
        PAGE_REPLACEMENT_ALGORITHM].pop()
    S_frame = -1
    # The incoming page already exists: it moves from S back to M.
    if process_to_insert_page_number in processes[process_to_insert_ID].table:
        S_frame = processes[process_to_insert_ID].table[
            process_to_insert_page_number].frame
        swaps += 1
        processes[process_to_insert_ID].page_faults += 1
    # Otherwise it is a brand-new page/process.
    else:
        S_frame = -1
        for index, memory in enumerate(S):
            if memory == [-1, -1]:
                S_frame = index
                break
        # Create a default page object and insert it into the process table.
        new_page_obj = Page(process_to_insert_page_number, -1, -1)
        processes[process_to_insert_ID].insert_page(new_page_obj)
    page_to_S = processes[process_to_switch_ID].table[
        process_to_switch_page_number]
    # Insert the new/needed page into main memory and the algorithm's queue.
    processes[process_to_insert_ID].table[
        process_to_insert_page_number].frame = page_to_S.frame
    processes[process_to_insert_ID].table[
        process_to_insert_page_number].bit_memory = 1
    algorithm[PAGE_REPLACEMENT_ALGORITHM].insert(
        process_to_insert_ID, process_to_insert_page_number)
    # Update the process table of the evicted page.
    processes[process_to_switch_ID].table[
        process_to_switch_page_number].frame = S_frame
    processes[process_to_switch_ID].table[
        process_to_switch_page_number].bit_memory = 0
    # Update S memory.
    S[S_frame] = [process_to_switch_ID, page_to_S.ID]
    # Update M memory.
    M[processes[process_to_insert_ID].table[process_to_insert_page_number].
      frame] = [process_to_insert_ID, process_to_insert_page_number]
    print("Page", process_to_insert_page_number, "of process",
          process_to_insert_ID, "swapped into frame",
          processes[process_to_insert_ID].table[
              process_to_insert_page_number].frame, "of the real area")
    print("Page", process_to_switch_page_number, "of process",
          process_to_switch_ID, "swapped into frame", S_frame,
          "of the swapping area")
    # The extra operation of swapping out "process_to_switch" advances
    # the global clock as well.
    global_time += 10
def __init__(self, filename):
    # Open the PDF file (left open on purpose: PyPDF2 reads page data lazily)
    docfile = open(filename, 'rb')
    # Read the file object (PdfFileReader is the legacy PyPDF2 API)
    document = PyPDF2.PdfFileReader(docfile)
    # Make a list of Page objects, one per PDF page
    pageList = []
    for pg in range(document.getNumPages()):
        pageList.append(Page(document, pg))
    self.pages = pageList
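# Usage sketch: assuming this __init__ belongs to a document wrapper class
# (called Document here purely for illustration):
#
#   doc = Document('report.pdf')
#   print(len(doc.pages), 'pages loaded')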
def post(self, path):
    if not self.user:
        self.error(400)
        return
    content = self.request.get("content")
    old_page = Page.by_path(path).get()
    if not (old_page or content):
        return
    elif not old_page or old_page.content != content:
        # Save a new revision only when the content actually changed.
        p = Page(parent=Page.parent_key(path), content=content)
        p.put()
    else:
        # '_' needs no escaping in a regex; the original '\_edit$' relied on
        # an invalid escape sequence.
        path = re.sub('_edit$', '', path)
        self.redirect(path)
def update():
    update_soup()
    update_website_base()
    update_page_type()
    # Dispatch to the right page wrapper for the current site.
    if "nytimes" in globals.website:
        if globals.page_type == "Article":
            globals.page = Article(globals.soup)
        else:
            globals.page = Page(globals.soup, globals.page_type)
    elif "youtube" in globals.website:
        if globals.page_type == "YoutubeVideo":
            globals.page = YoutubeVideo(globals.soup)
        else:
            globals.page = YoutubePage(globals.soup, globals.page_type)
    playsound.playsound("confirmation_beep.mp3")
def prep_pagefiles(self):
    """TODO"""
    # Natural sort: split out digit runs so "page2" sorts before "page10".
    self.page_files = sorted(self.page_files, key=lambda s: [
        int(t) if t.isdigit() else t.lower() for t in re.split(r'(\d+)', s)
    ])
    self.repeated = []
    self.found_pagenumbers = []
    self.page_list = []
    for i, page_filepath in enumerate(self.page_files):
        self.repeated.append(set())
        self.found_pagenumbers.append(None)
        self.page_list.append(Page(page_filepath, self, i))
def __init__(self, pid, size, arrival_time, page_exec_order):
    super(Process, self).__init__()
    self.pid = pid
    self.size = size
    self.pages = []
    self.page_table = []
    self.next_to_be_executed = 0
    self.arrival_time = arrival_time
    self.page_exec_order = page_exec_order
    # Round the page count up when size is not a multiple of the page size.
    if self.size % OSParams.page_size != 0:
        page_count = (self.size / OSParams.page_size) + 1
    else:
        page_count = (self.size / OSParams.page_size)
    for i in xrange(0, page_count):
        self.pages.append(Page(self.pid, i))
        self.page_table.append(-1)  # -1 means the page has no frame yet
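# The page-count branch above is ceiling division; a one-line equivalent,
# checked here with an illustrative page_size (`//` keeps it Py2/Py3-neutral):
page_size = 4096
for size in (4096, 5000, 12288):
    branchy = size // page_size + (1 if size % page_size else 0)
    assert branchy == (size + page_size - 1) // page_size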
def createPage(livre: Livre, numero: int, fin: bool = False):
    """Creates the directories and sub-directories for a page."""
    titre = livre.titre
    if projetExiste(titre):
        if fin:
            # An ending page is just an empty text file under pages_fin.
            path = "livres/" + titre + "/pages/pages_fin"
            with open(path + "/" + str(numero) + ".txt", "w"):
                pass  # the with-block already closes the file
        if not pageExiste(titre, numero):
            os.mkdir("livres/" + titre + "/pages/" + str(numero))
            Page(numero, "", livre, fin)
        else:
            print("this page already exists")
    else:
        print("nonexistent project")
def crawler(self, search_type, node):
    while len(self.to_visit) > 0 and not self.stop_crawl and self.count < MAX_URLS:
        print('crawling')
        # check if link has already been crawled
        crawled = node.url in self.visited_set
        if not crawled:
            self.graph.add_node(node, self.id)
            self.count += 1
            self.id += 1
            self.visited_set.add(node.url)
            if node.parents_list:
                # get the node's parent node
                source_node = node.parents_list.pop()
                # update the node depth
                node.node_depth = source_node.node_depth + 1
                if node.id is not None and source_node.id is not None:
                    # create an edge between the current node and its parent node
                    self.graph.add_edge(source_node.id, node.id)
                # set node's parent node
                node.parent_node = source_node.id
            # create new Page object
            pg = Page(node.url)
            # if the depth limit has not been reached, expand this node
            if node.node_depth < self.depth_limit:
                links = pg.get_links(node.url)
                links = validate_url(links, self.count, MAX_URLS)
                # remove any duplicate links present
                links = remove_duplicates(links)
                self.crawl_links(node, links)
            # check if stop keyword is found
            if self.keyword and pg.find_keyword(self.keyword):
                node.found = True
                self.stop_crawl = True
        if self.stop_crawl:
            break
        # get next node to crawl
        if self.count < MAX_URLS:
            node = self.get_next()
    self.end_crawl()
    return self.visited_set
def iterateOverAllDocs(self):
    print "started"
    docsnameslist = self.all_documents_name("data/Words")
    linksList = self.all_documents_name("data/Links")
    nos_of_documents = len(docsnameslist)
    # Check all the documents (the original range(n - 1) skipped the last one)
    for i in range(nos_of_documents):
        page = Page()
        filename = docsnameslist[i].split('/')[2]
        print filename
        page.url = "/wiki/" + filename
        stop_words = set(stopwords.words('english'))
        # Open the word file; fields are separated by a single space
        with open(docsnameslist[i], "r") as f:
            reader = csv.reader(f, delimiter=" ", quoting=csv.QUOTE_NONE)
            for row in reader:
                for word in row:
                    if word not in stop_words and word != "":
                        # map every distinct word to a stable integer id
                        if self.keyExists(word):
                            word_id = self.getIdForWord(word)
                        else:
                            word_id = len(self.wordToId)
                            self.wordToId[word] = word_id
                        page.words.append(word_id)
        with open(linksList[i], "r") as f:
            reader = csv.reader(f, delimiter="\n", quoting=csv.QUOTE_NONE)
            for row in reader:
                for link in row:
                    page.links.append(link)
        self.pages.append(page)
        print "indexed " + str(i)
    return None
def scan_page(url, base_url):
    if url in scanned_urls:
        return
    # Mark the current url as scanned up front so the recursion terminates
    # (the original marked child links instead, which cut every recursive
    # call short at the guard above).
    scanned_urls.append(url)
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")  # explicit parser avoids the bs4 warning
    # Record the current page once, rather than once per internal link.
    if soup.title:
        session.add(Page(url=url))
        #crr_page_desc = soup.find("meta", {"name": "description"})['content']
    for link in soup.find_all("a"):
        new_link = prepare_link(url, link.get("href"))
        if not is_outer_url(new_link, base_url):
            print(new_link)
            scan_page(new_link, base_url)
    return scanned_urls
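# Usage sketch (the site is illustrative; scanned_urls and session are the
# module-level globals this function relies on):
all_pages = scan_page('http://example.com/', 'http://example.com')
print(len(all_pages), 'pages scanned')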
def __iter__(self):
    first = True
    # Now iterate over the individual records of the warc archive.
    for record in self._warc:
        # The first record only describes the warc archive itself, so skip it.
        if first:
            first = False
            continue
        # Read the payload; careful, the content comes back as bytes.
        content = record.payload.read()
        response = bytes()
        # Split the http response header from the body.
        index = content.find(b'\r\n\r\n')
        if index > 0:
            response = content[:index]
            content = content[index:]
        # Figure out the uri.
        uri = ''
        if 'WARC-Target-URI' in record:
            uri = record['WARC-Target-URI']
        # The record id would be handy too...
        warc_id = ''
        if 'WARC-Record-ID' in record:
            warc_id = record['WARC-Record-ID'][10:-1]
        # ...but if it is missing, we have to generate one ourselves.
        else:
            warc_id = str(uuid.uuid1())
        # Try to build the page. If decoding the content fails,
        # the page is of no use to us.
        try:
            yield Page(page=content, url=uri, http_response=response.decode(),
                       page_id=warc_id, old_style=self._old_style)
        except ValueError:
            pass