def addLinks(self, links, sel=True, startFrom=0):
    """Download the pages behind ``links``, save them to disk and parse them.

    ``self.links`` is replaced by ``links``.  The source of every page is
    appended to ``self.htmls`` (which must already be a list), each entry of
    ``self.htmls`` is written to
    ``self.path + self.name + str(index + startFrom) + ".html"``, and
    ``self.trees`` is rebuilt by calling ``makeTree`` on the
    whitespace-normalised HTML paired with its URL.

    Args:
        links: Iterable of URLs to fetch.
        sel: When true, pages are rendered through a Selenium Firefox driver
            and ``page_source`` is stored; otherwise ``requests.get`` is used
            and the response ``text`` is stored.
        startFrom: Offset added to the position of each entry of
            ``self.htmls`` when building its file name.

    NOTE(review): ``self.htmls`` is extended while ``self.links`` is
    replaced, so on a second call the two lists no longer line up in
    ``zip`` and earlier pages are written again under shifted numbers --
    confirm this is intended.
    NOTE(review): a second ``addLinks`` with the same body appears later in
    this file; if both live in the same class, the later one wins.
    """
    self.links = links

    if sel:
        driver = webdriver.Firefox()
        try:
            for url in links:
                driver.get(url)
                self.htmls.append(driver.page_source)
        finally:
            # Always end the browser session, even when a page fails to
            # load; otherwise the Firefox process is left running.
            driver.quit()
    else:
        for url in links:
            response = requests.get(url)
            self.htmls.append(response.text)

    # Save the HTML pages.
    target = self.path + self.name
    if not os.path.exists(target):
        os.makedirs(target)
    for num, html in enumerate(self.htmls):
        with open(target + str(num + startFrom) + ".html", "w") as f:
            f.write(html)

    # Collapse every run of whitespace to a single space before parsing.
    self.trees = [
        makeTree(" ".join(html.split()), url)
        for html, url in zip(self.htmls, self.links)
    ]
def load(self):
    """Return a new ``Training`` object populated from previously saved files.

    Only ``self.name`` and ``self.path`` are read; ``self`` itself is not
    modified.  All files are looked up under ``obj.path + obj.name``:

    * ``sky.training.links``   -- split on newlines into ``obj.links``.
    * ``sky.training.targets`` -- split on the ``"sky\\nsky"`` separator, each
      piece whitespace-normalised, into ``obj.targets``.
    * ``<n>.html``             -- one file per entry of ``obj.links``, read
      into ``obj.htmls`` in order.

    ``obj.trees`` is then built by calling ``makeTree`` on each
    whitespace-normalised page together with its link.
    """
    obj = Training(self.name, self.path)
    prefix = obj.path + obj.name

    # Links: one per line.
    with open(prefix + "sky.training.links") as links_file:
        obj.links = links_file.read().split('\n')

    # Targets: records are delimited by the "sky\nsky" marker.
    with open(prefix + "sky.training.targets") as targets_file:
        raw_targets = targets_file.read()
    obj.targets = [" ".join(chunk.split()) for chunk in raw_targets.split("sky\nsky")]

    # HTML pages: files are numbered by the position of their link.
    pages = []
    for index, _ in enumerate(obj.links):
        with open(prefix + str(index) + ".html") as page_file:
            pages.append(page_file.read())
    obj.htmls = pages

    obj.trees = [
        makeTree(" ".join(page.split()), link)
        for page, link in zip(obj.htmls, obj.links)
    ]
    return obj
def addLinks(self, links, sel=True, startFrom=0):
    """Fetch the given URLs, persist their HTML and build the parse trees.

    Steps, in order:

    1. ``self.links`` is set to ``links``.
    2. Every URL is fetched and its page source appended to ``self.htmls``
       (expected to be an existing list): through a Selenium Firefox driver
       when ``sel`` is true, through ``requests.get`` otherwise.
    3. The directory ``self.path + self.name`` is created if it is missing.
    4. Every entry of ``self.htmls`` is written to
       ``self.path + self.name + str(position + startFrom) + ".html"``.
    5. ``self.trees`` is rebuilt with ``makeTree`` from each
       whitespace-normalised page and its URL.

    Args:
        links: Iterable of URLs to download.
        sel: Choose Selenium (true) or ``requests`` (false) for fetching.
        startFrom: Number added to each page's position in ``self.htmls``
            to form the saved file name.

    NOTE(review): an ``addLinks`` with the same body is also defined
    earlier in this file; if both are in the same class, this later
    definition is the one in effect.
    NOTE(review): ``self.htmls`` accumulates across calls whereas
    ``self.links`` is overwritten, so ``zip`` pairs them by position only --
    verify that repeated calls behave as intended.
    """
    self.links = links

    if sel:
        driver = webdriver.Firefox()
        try:
            for url in links:
                driver.get(url)
                self.htmls.append(driver.page_source)
        finally:
            # Shut the browser down on success and on failure alike so a
            # page that fails to load does not leak a Firefox process.
            driver.quit()
    else:
        for url in links:
            self.htmls.append(requests.get(url).text)

    # Save the HTML pages.
    out_prefix = self.path + self.name
    if not os.path.exists(out_prefix):
        os.makedirs(out_prefix)
    for num, html in enumerate(self.htmls):
        with open(out_prefix + str(num + startFrom) + ".html", "w") as f:
            f.write(html)

    # Runs of whitespace are collapsed to single spaces before parsing.
    self.trees = [
        makeTree(" ".join(html.split()), url)
        for html, url in zip(self.htmls, self.links)
    ]