def __init__(self, unvisited, visited, shares):
    self.unvisited = unvisited
    self.visited = visited
    self.shares = shares
    self.session = requests.session()
    self.parser = Parser()
    # The headers must be a dict (key: value), not a set, so the user agent is actually sent.
    agent = {'user-agent': r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}
    self.session.headers = agent
    self.stop = False
def getitems(self, html):
    """Analyze the original web page and extract the valuable information.

    Only the page title and the page content are extracted here.
    """
    p = Parser()
    try:
        p.feed(html)
    except Exception:
        ferrmsg('Error: feed error!', 'Index')
    items = {}
    items['title'] = p.get_title()
    items['content'] = p.get_content()
    return items
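# A hedged usage sketch, not part of the source: getitems() only needs raw HTML,
# so it can be fed the body of any fetched page. `indexer` stands for an instance
# of the class that owns getitems(); only Parser, get_title() and get_content()
# are taken from the snippet above, everything else here is illustrative.
#
#   import requests
#
#   html = requests.get('https://example.com', timeout=10).text
#   items = indexer.getitems(html)
#   print(items['title'], len(items['content']))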
def main():
    config = configparser.ConfigParser()
    config.read('config.ini')
    config = config['DEFAULT']
    n = config.getint('n')
    _n = math.floor(math.sqrt(n))
    h = config.getfloat('h')
    iteration = config.getint('Iteration')
    strad = config.getboolean('Straddling function')
    if 'File name' in config:
        file_name = config.get('File name')
    else:
        file_name = f'sch{n}'
    problems = load_file(file_name)
    up = Parser().parse()
    results = defaultdict(list)
    times = [0 for _ in range(iteration + 1)]
    for _ in tqdm(range(iteration)):
        for i, p in enumerate(problems):
            start_time = time.time()
            s = RHRM(p, h, 5, 10, strad)
            solution = s.solve()
            se = SelfEvolution(p, solution, 5, 10 * _n, 3 * _n)
            solution = se.start()
            results[i + 1].append(
                Result(solution, time.time() - start_time, n, h, i + 1))
            times[i + 1] += time.time() - start_time
    save_as_latex_table(up, n, results, times)
    save_to_validate(file_name, results, h)
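# A sketch of the config.ini that main() above reads. The section and key names
# come directly from the config.get* calls; the values are placeholders only.
#
#   [DEFAULT]
#   n = 100
#   h = 0.2
#   Iteration = 10
#   Straddling function = yes
#   # Optional; defaults to sch<n> when omitted.
#   File name = sch100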
def test():
    if len(sys.argv) < 3:
        raise Exception(
            "Script must be called with two arguments, the path to "
            "chromedriver and the path to firebase config")
    chromedriver = sys.argv[1]
    elapsed = Elapsed()
    scraper = Scraper(chromedriver, headless=True)
    test_url = "https://96hpr.csb.app"
    try:
        scraper.open_page(test_url)
        html = scraper.get_outerhtml(
            By.XPATH, "/html/body/div/div/table/tbody")
        parsed = Parser(html, log_each_n=10)
        template = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K",
                    "L", "M", "N", "O"]
        parsed.extract_data_from_table(template, [0], True)
        print_time("Extracted data")
    finally:
        scraper.close()
        elapsed.end()
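# Hypothetical invocation (script name and paths are placeholders, not from the
# source); the two positional arguments correspond to sys.argv[1] (chromedriver)
# and sys.argv[2] (firebase config) checked above:
#
#   python test_scraper.py /path/to/chromedriver /path/to/firebase-config.json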
class FileFinder:
    # Number of publishers processed since the visited/unvisited lists were last written back.
    count = 0

    def __init__(self, unvisited, visited, shares):
        self.unvisited = unvisited
        self.visited = visited
        self.shares = shares
        self.session = requests.session()
        self.parser = Parser()
        # The headers must be a dict (key: value), not a set, so the user agent is actually sent.
        agent = {'user-agent': r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}
        self.session.headers = agent
        self.stop = False

    def findshare(self):
        while not self.stop and self.unvisited.size() != 0:
            publisher = self.unvisited.pop()
            if publisher is None:
                time.sleep(2)
                continue
            # Construct a new session for each publisher.
            self.session = requests.session()
            agent = {'user-agent': r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}
            self.session.headers = agent
            self.findShareFromPublisher(publisher)
            self.visited.push(publisher)
            FileFinder.count += 1
            if FileFinder.count >= 5:
                self.writepublishback()
                FileFinder.count = 0
            time.sleep(3)
        self.writepublishback()

    def stopWork(self):
        self.stop = True

    def findShareFromPublisher(self, p):
        url = 'https://vdisk.weibo.com/u/' + p
        self.findShareFromUrl(url)

    def findShareFromUrl(self, url, traverseSibling=True):
        try:
            r = self.session.get(url)
            if r.status_code == 200:
                self.listFile(r.text, url)
                # Traverse the publisher's other pages.
                if traverseSibling:
                    soup = BeautifulSoup(r.text, "html.parser")
                    nextPages = self.pageList(soup)
                    for p in nextPages:
                        logger.info('To find share in page %s' % (url + p))
                        # No need to traverse siblings of a sibling page.
                        self.findShareFromUrl(url + p, traverseSibling=False)
        except Exception as e:
            logger.warning("Unexpected error: %s" % str(e))

    def writepublishback(self):
        v = self.visited.clone()
        u = self.unvisited.clone()
        with open('visited.txt', 'w') as f:
            for s in v:
                f.write(s + '\n')
        with open('unvisited.txt', 'w') as f:
            for s in u:
                f.write(s + '\n')

    def listFile(self, text, url):
        items = self.parser.getSharedItems(text)
        for item in items:
            if not item['is_dir']:
                size_bytes = item.get('bytes', 'unknown')
                sina_uid = 'unknown'
                if 'sina_uid' in item:
                    sina_uid = item['sina_uid']
                elif 'uid' in item:
                    sina_uid = item['uid']
                sf = ShareFile(item['filename'], size_bytes, item['url'], sina_uid)
                self.shares.push(sf)
            else:
                self.searchInDirectory(item['url'])

    def pageList(self, soup):
        vd_page = soup.find(name='div', attrs={'class': 'vd_page'})
        pages = []
        if vd_page is not None:
            page_links = vd_page.find_all(name='a')
            for p in page_links:
                # Ignore the duplicate link behind the "next page" button.
                if pages.count(p.attrs['href']) == 0:
                    pages.append(p.attrs['href'])
            # The page links are not contiguous; rebuild them as a contiguous range.
            if len(pages) != 0:
                min_page = int(pages[0].split(sep='=')[1])
                max_page = int(pages[len(pages) - 1].split(sep='=')[1])
                pages.clear()
                for i in range(min_page, max_page + 1):
                    pages.append('?page=%d' % i)
            logger.info('pages is %s' % pages)
        return pages

    def searchInDirectory(self, dir_url, traverseSibling=True):
        try:
            r = self.session.get(dir_url)
            if r.status_code == 200:
                self.listFile(r.text, dir_url)
                if traverseSibling:
                    pages = self.pageListInDirectory(r.text, dir_url)
                    for p in pages:
                        self.searchInDirectory(dir_url + p, traverseSibling=False)
        except Exception as e:
            logger.warning("Unexpected error: %s" % str(e))

    def pageListInDirectory(self, text, dir_url):
        soup = BeautifulSoup(text, "html.parser")
        vd_page = soup.find(name='div', attrs={'class': 'vd_page'})
        pages = []
        if vd_page is not None:
            page_links = vd_page.find_all(name='a')
            for p in page_links:
                # Ignore the duplicate link behind the "next page" button.
                if pages.count(p.attrs['href']) == 0:
                    pages.append(p.attrs['href'])
            # The page links are not contiguous; rebuild them as a contiguous range.
            if len(pages) != 0:
                minlink = pages[0]
                maxlink = pages[len(pages) - 1]
                min_page = int(minlink.split(sep='&')[2].split(sep='=')[1])
                max_page = int(maxlink.split(sep='&')[2].split(sep='=')[1])
                pages.clear()
                for i in range(min_page, max_page + 1):
                    p = minlink[0:len(minlink) - 1] + str(i)
                    pages.append(p)
            logger.info("page in directory: %s" % pages)
        return pages

    def convertJsonItem(self, tag):
        downloadtag = tag.find(name='a', attrs={'class': 'vd_pic_v2 vd_dload'})
        share_item = json.loads(downloadtag.attrs['data-info'])
        share_item['url'] = self.stripSlash(share_item['url'])
        return share_item

    def stripSlash(self, text):
        # Drop the escaping backslashes so URLs like http:\/\/... become plain URLs.
        s = ''
        for c in text:
            if c != '\\':
                s += c
        return s
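# A minimal driver sketch, not from the source: FileFinder (and Publisher below)
# only call push/pop/size/exist/clone on the queue objects they are given, so a
# tiny thread-safe container like the hypothetical SimpleQueue here is enough to
# exercise them. The class name, file layout and seed id are assumptions.

import threading


class SimpleQueue:
    """Stand-in for the unvisited/visited/shares containers the crawler classes expect."""

    def __init__(self):
        self._items = []
        self._lock = threading.Lock()

    def push(self, item):
        with self._lock:
            self._items.append(item)

    def pop(self):
        # FileFinder treats a None result as "nothing to do yet".
        with self._lock:
            return self._items.pop(0) if self._items else None

    def size(self):
        with self._lock:
            return len(self._items)

    def exist(self, item):
        with self._lock:
            return item in self._items

    def clone(self):
        # writepublishback() iterates over a snapshot, so return a copy.
        with self._lock:
            return list(self._items)


# unvisited, visited, shares = SimpleQueue(), SimpleQueue(), SimpleQueue()
# unvisited.push('1234567890')                # seed publisher id (placeholder)
# finder = FileFinder(unvisited, visited, shares)
# finder.findshare()                          # runs until the queue drains or stopWork() is called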
from googlecalendar import Calendar
from albertheijn import AlbertHeijn
from htmlparser import Parser

# Create scraper objects.
ah = AlbertHeijn()
parser = Parser()

# Convert all blocks to JSON format, dropping blocks that could not be parsed.
events = filter(None, [
    parser.block_to_json(element, ah.get_month(), ah.get_year())
    for element in ah.get_blocks()
])

calendar = Calendar()

print('Updating calendar...')
for event in events:
    calendar.insert_event(event)
print('Done')

ah.dispose()
def __init__(self, unvisited, visited):
    self.unvisited = unvisited
    self.visited = visited
    self.lastSearchPage = 0
    self.parser = Parser()
    self.stopped = False
class Publisher:
    def __init__(self, unvisited, visited):
        self.unvisited = unvisited
        self.visited = visited
        self.lastSearchPage = 0
        self.parser = Parser()
        self.stopped = False

    def loadconf(self):
        if not os.path.exists('config.txt'):
            return
        with open('config.txt', 'r') as f:
            for line in f.readlines():
                parts = line.split()
                if parts[0] == 'lastsearch':
                    self.lastSearchPage = int(parts[1])
                    logger.info('lastSearchPage is %d' % self.lastSearchPage)

    def loadunvisited(self):
        if not os.path.exists('unvisited.txt'):
            return
        with open('unvisited.txt', 'r') as f:
            for line in f.readlines():
                line = line.strip('\n')
                if line != "":
                    self.unvisited.push(line)

    def loadvisited(self):
        if not os.path.exists('visited.txt'):
            return
        with open('visited.txt', 'r') as f:
            for line in f.readlines():
                line = line.strip('\n')
                if line != "":
                    self.visited.push(line)

    def load(self):
        self.loadconf()
        self.loadvisited()
        self.loadunvisited()

    def writeconf(self):
        with open('config.txt', 'w') as f:
            f.write("%s %s" % ('lastsearch', self.lastSearchPage))

    def getpublisherbyfirstpage(self, firstpage):
        """Get publishers from the first page."""
        pass

    def detectpublisher(self):
        """Get publishers by testing whether a page exists."""
        pass

    def work(self):
        startPage = self.lastSearchPage
        for i in range(startPage, 1000):
            url = "https://vdisk.weibo.com/?cid={:d}".format(i)
            self.listPublisher(url)
            self.lastSearchPage = i
            self.writeconf()
            time.sleep(3)
        logger.info("Worker for publisher search is done")
        self.stopped = True

    def listPublisher(self, url, visitedSibling=True):
        session = requests.session()
        # The headers must be a dict (key: value), not a set, so the user agent is actually sent.
        agent = {
            'user-agent': r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
        }
        session.headers = agent
        try:
            r = session.get(url)
            if r.status_code == 200:
                shares = self.parser.getSharedItems(r.text)
                for s in shares:
                    if not self.unvisited.exist(s['uid']) and not self.visited.exist(s['uid']):
                        logger.info('find a uid %s at page %s' % (s['uid'], url))
                        if s['uid'] is not None:
                            self.unvisited.push(s['uid'])
                        else:
                            logger.warning('find a user with uid is none:%s' % (s))
                if visitedSibling:
                    siblings = self.pageList(r.text)
                    for s in siblings:
                        sibling_url = 'https://vdisk.weibo.com/' + s
                        self.listPublisher(sibling_url, visitedSibling=False)
        except Exception as e:
            logger.info(str(e))

    def pageList(self, text):
        pages = []
        soup = BeautifulSoup(text, "html.parser")
        pagetag = soup.find(name='div', attrs={'class': 'vd_page'})
        if pagetag is not None:
            hrefs = pagetag.find_all(name='a')
            for href in hrefs:
                # Ignore the duplicate link behind the "next page" button.
                if pages.count(href.attrs['href']) == 0:
                    pages.append(href.attrs['href'])
            # The page links are not contiguous; rebuild them as a contiguous range.
            if len(pages) != 0:
                minlink = pages[0]
                maxlink = pages[len(pages) - 1]
                min_page = int(minlink.split(sep='&')[2].split(sep='=')[1])
                max_page = int(maxlink.split(sep='&')[2].split(sep='=')[1])
                pages.clear()
                for i in range(min_page, max_page + 1):
                    p = minlink[0:len(minlink) - 1] + str(i)
                    pages.append(p)
        return pages

    def isStopWork(self):
        return self.stopped
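# A hedged wiring sketch, not from the source: Publisher.work() discovers
# publisher ids into the shared unvisited queue while FileFinder.findshare()
# consumes them. The SimpleQueue stand-in sketched after FileFinder above is
# reused; the use of threads and the daemon flags are assumptions.
#
#   import threading
#
#   unvisited, visited, shares = SimpleQueue(), SimpleQueue(), SimpleQueue()
#   publisher = Publisher(unvisited, visited)
#   publisher.load()                    # restore config.txt / visited.txt / unvisited.txt
#   finder = FileFinder(unvisited, visited, shares)
#
#   threading.Thread(target=publisher.work, daemon=True).start()
#   threading.Thread(target=finder.findshare, daemon=True).start()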