Example #1
 def __init__(self, unvisited, visited, shares):
     self.unvisited = unvisited
     self.visited = visited
     self.shares = shares
     self.session = requests.session()
     self.parser = Parser()
     agent = {'user-agent':
              r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}
     self.session.headers.update(agent)
     self.stop = False
Example #2
    def getitems(self, html):
        """Analyze the original webpage and extract the valuable info.
        Here only the page title and the page contents are extracted."""
        p = Parser()
        try:
            p.feed(html)
        except Exception:
            ferrmsg('Error: feed error!', 'Index')
        items = {}
        title = p.get_title()
        items['title'] = title
        content = p.get_content()
        items['content'] = content
        return items
Example #3
def main():
    config = configparser.ConfigParser()
    config.read('config.ini')
    config = config['DEFAULT']

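    # experiment parameters read from the DEFAULT section of config.ini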
    n = config.getint('n')
    _n = math.floor(math.sqrt(n))
    h = config.getfloat('h')
    iteration = config.getint('Iteration')
    strad = config.getboolean('Straddling function')
    if 'File name' in config:
        file_name = config.get('File name')
    else:
        file_name = f'sch{n}'

    problems = load_file(file_name)
    up = Parser().parse()
    results = defaultdict(list)
    times = [0 for _ in range(iteration + 1)]

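    # run the experiment 'iteration' times: solve each instance with RHRM, then refine the solution with SelfEvolution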
    for _ in tqdm(range(iteration)):
        for i, p in enumerate(problems):
            start_time = time.time()
            s = RHRM(p, h, 5, 10, strad)
            solution = s.solve()
            se = SelfEvolution(p, solution, 5, 10 * _n, 3 * _n)
            solution = se.start()
            results[i + 1].append(
                Result(solution,
                       time.time() - start_time, n, h, i + 1))
            times[i + 1] += time.time() - start_time

    save_as_latex_table(up, n, results, times)
    save_to_validate(file_name, results, h)
Example #4
def test():
    if len(sys.argv) < 3:
        raise Exception("Script must be called with two arguments, the path to chromedriver and the path to firebase config")

    chromedriver = sys.argv[1]

    elapsed = Elapsed()

    scraper = Scraper(chromedriver, headless=True)
    test_url = "https://96hpr.csb.app"

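    # open the demo page, grab the table body, and extract its rows with Parser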
    try:
        scraper.open_page(test_url)
        html = scraper.get_outerhtml(
            By.XPATH, "/html/body/div/div/table/tbody")
        parsed = Parser(html, log_each_n=10)
        template = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O"]
        parsed.extract_data_from_table(template, [0], True)
        print_time("Extracted data")
    finally:
        scraper.close()
        elapsed.end()
Example #5
class FileFinder:
    count = 0
    def __init__(self, unvisited, visited, shares):
        self.unvisited = unvisited
        self.visited = visited
        self.shares = shares
        self.session = requests.session()
        self.parser = Parser()
        agent = {'user-agent':
                 r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}
        self.session.headers.update(agent)
        self.stop = False

    def findshare(self):
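        """Drain the unvisited queue, collecting shared files from each publisher."""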
        while not self.stop and self.unvisited.size() != 0:
            publisher = self.unvisited.pop()
            if publisher is None:
                time.sleep(2)
                continue

            # construct a fresh session for each publisher
            self.session = requests.session()
            agent = {'user-agent':
                     r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}
            self.session.headers.update(agent)

            self.findShareFromPublisher(publisher)
            self.visited.push(publisher)
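            # write the visited/unvisited lists back to disk every five publishers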
            FileFinder.count += 1
            if FileFinder.count >= 5:
                self.writepublishback()
                FileFinder.count = 0

            time.sleep(3)

        self.writepublishback()

    def stopWork(self):
        self.stop = True

    def findShareFromPublisher(self, p):
        url = r'https://vdisk.weibo.com/u/' + p
        self.findShareFromUrl(url)

    def findShareFromUrl(self, url, traverseSibling=True):
        try:
            r = self.session.get(url)
            if r.status_code == 200:
                self.listFile(r.text, url)
                # traverse the other pages of this publisher
                if traverseSibling:
                    soup = BeautifulSoup(r.text, "html.parser")
                    nextPages = self.pageList(soup)
                    if len(nextPages) > 0:
                        for p in nextPages:
                            logger.info('To find share in page %s' % (url + p))
                            self.findShareFromUrl(url + p, traverseSibling=False)  # no need to traverse siblings again
        except Exception as e:
            logger.warning("Unexpected error: %s"%str(e))

    def writepublishback(self):
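        """Persist the visited and unvisited queues to visited.txt and unvisited.txt."""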
        v = self.visited.clone()
        u = self.unvisited.clone()
        with open('visited.txt', 'w') as f:
            for s in v:
                f.write(s+'\n')

        with open('unvisited.txt', 'w') as f:
            for s in u:
                f.write(s+'\n')

    def listFile(self, text, url):
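        """Queue every shared file on the page and recurse into shared directories."""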
        items = self.parser.getSharedItems(text)
        for item in items:
            if not item['is_dir']:
                size = 'unknown'
                if 'bytes' in item:
                    size = item['bytes']
                sina_uid = 'unknown'
                if 'sina_uid' in item:
                    sina_uid = item['sina_uid']
                elif 'uid' in item:
                    sina_uid = item['uid']
                sf = ShareFile(item['filename'], size, item['url'], sina_uid)
                self.shares.push(sf)
            else:
                self.searchInDirectory(item['url'])

    def pageList(self, soup):
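        """Collect the pagination links ('?page=N') of a publisher page as a contiguous range."""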
        vd_page = soup.find(name='div', attrs={'class': 'vd_page'})
        pages = []
        if vd_page is not None:
            page_links = vd_page.find_all(name='a')
            for p in page_links:
                if pages.count(p.attrs['href']) == 0:  # skip duplicate hrefs such as the next-page button
                    pages.append(p.attrs['href'])

        # page links are not contiguous, so rebuild them as a contiguous range
        if len(pages) != 0:
            first = int(pages[0].split(sep='=')[1])
            last = int(pages[len(pages) - 1].split(sep='=')[1])
            pages.clear()
            for i in range(first, last + 1):
                pages.append('?page=%d' % i)

        logger.info('pages is %s' % pages)
        return pages

    def searchInDirectory(self, dir_url, traverseSibling=True):
        try:
            r = self.session.get(dir_url)
            if r.status_code == 200:
                self.listFile(r.text, dir_url)
                if traverseSibling:
                    pages = self.pageListInDirectory(r.text, dir_url)
                    for p in pages:
                        self.searchInDirectory(dir_url + p, traverseSibling=False)
        except Exception as e:
            logger.warning("Unexpected error: %s"%str(e))

    def pageListInDirectory(self, text, dir_url):
        soup = BeautifulSoup(text, "html.parser")
        vd_page = soup.find(name='div', attrs={'class': 'vd_page'})
        pages = []
        if vd_page is not None:
            page_links = vd_page.find_all(name='a')
            for p in page_links:
                if pages.count(p.attrs['href']) == 0:  # skip duplicate hrefs such as the next-page button
                    pages.append(p.attrs['href'])

        if len(pages) != 0:
            minlink = pages[0]
            maxlink = pages[len(pages) - 1]
            first = int(minlink.split(sep='&')[2].split(sep='=')[1])
            last = int(maxlink.split(sep='&')[2].split(sep='=')[1])
            pages.clear()
            for i in range(first, last + 1):
                p = minlink[0:len(minlink) - 1] + str(i)
                pages.append(p)
            logger.info("page in directory: %s" % pages)
        return pages

    def convertJsonItem(self, tag):
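        """Parse the share metadata json embedded in the download link's data-info attribute."""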
        downloadtag = tag.find(name='a', attrs={'class':'vd_pic_v2 vd_dload'})
        share_item = json.loads(downloadtag.attrs['data-info'])
        share_item['url'] = self.stripSlash(share_item['url'])
        return share_item

    def stripSlash(self, url):
        # drop any backslash escape characters from the url
        return url.replace('\\', '')
Example #6
from googlecalendar import Calendar
from albertheijn import AlbertHeijn
from htmlparser import Parser

# Create scraper objects.
ah = AlbertHeijn()
parser = Parser()

# Convert all blocks to json format, dropping empty results.
events = filter(None, [
    parser.block_to_json(element, ah.get_month(), ah.get_year())
    for element in ah.get_blocks()
])

calendar = Calendar()
print('Updating calendar...')
for event in events:
    calendar.insert_event(event)

print('Done')

ah.dispose()
Example #7
 def __init__(self, unvisited, visited):
     self.unvisited = unvisited
     self.visited = visited
     self.lastSearchPage = 0
     self.parser = Parser()
     self.stopped = False
Example #8
class Publisher:
    def __init__(self, unvisited, visited):
        self.unvisited = unvisited
        self.visited = visited
        self.lastSearchPage = 0
        self.parser = Parser()
        self.stopped = False

    def loadconf(self):
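        """Restore the last searched page number from config.txt, if it exists."""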
        if not os.path.exists('config.txt'):
            return

        with open('config.txt', 'r') as f:
            for line in f.readlines():
                parts = line.split()
                if parts and parts[0] == 'lastsearch':
                    self.lastSearchPage = int(parts[1])
                    logger.info('lastSearchPage is %d' % self.lastSearchPage)

    def loadunvisited(self):
        if not os.path.exists('unvisited.txt'):
            return

        with open('unvisited.txt', 'r') as f:
            for line in f.readlines():
                line = line.strip('\n')
                if line != "":
                    self.unvisited.push(line)

    def loadvisited(self):
        if not os.path.exists('visited.txt'):
            return
        with open('visited.txt', 'r') as f:
            for line in f.readlines():
                line = line.strip('\n')
                if line != "":
                    self.visited.push(line)

    def load(self):
        self.loadconf()
        self.loadvisited()
        self.loadunvisited()

    def writeconf(self):
        with open('config.txt', 'w') as f:
            f.write("%s %s" % ('lastsearch', self.lastSearchPage))

    """
    From first page to get publisher
    """

    def getpublisherbyfirstpage(self, firstpage):
        None

    """
    get publisher by test a page exists
    """

    def detectpublisher(self):
        None

    def work(self):
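        """Scan the category pages for publishers, checkpointing progress after each page."""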
        startPage = self.lastSearchPage
        for i in range(startPage, 1000):
            url = "https://vdisk.weibo.com/?cid={:d}".format(i)
            self.listPublisher(url)
            self.lastSearchPage = i
            self.writeconf()
            time.sleep(3)
        logger.info("Worker for publisher search is done")
        self.stopped = True

    def listPublisher(self, url, visitedSibling=True):
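        """Collect unseen publisher uids from a category page and, optionally, its sibling pages."""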
        session = requests.session()
        agent = {
            'user-agent': r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
        }
        session.headers.update(agent)
        publishers = []
        try:
            r = session.get(url)
            if r.status_code == 200:
                shares = self.parser.getSharedItems(r.text)
                for s in shares:
                    if not self.unvisited.exist(
                            s['uid']) and not self.visited.exist(s['uid']):
                        logger.info('find a uid %s at page %s' %
                                    (s['uid'], url))
                        if s['uid'] is not None:
                            self.unvisited.push(s['uid'])
                        else:
                            logger.warning('find a user with uid is none:%s' %
                                           (s))
                if visitedSibling:
                    siblings = self.pageList(r.text)
                    for s in siblings:
                        url = 'https://vdisk.weibo.com/' + s
                        self.listPublisher(url, visitedSibling=False)
        except Exception as e:
            logger.info(str(e))

    def pageList(self, text):
        pages = []
        soup = BeautifulSoup(text, "html.parser")
        pagetag = soup.find(name='div', attrs={'class': 'vd_page'})
        if pagetag is not None:
            hrefs = pagetag.find_all(name='a')
            for href in hrefs:
                if pages.count(href.attrs['href']) == 0:
                    pages.append(href.attrs['href'])

        if len(pages) != 0:
            minlink = pages[0]
            maxlink = pages[len(pages) - 1]
            first = int(minlink.split(sep='&')[2].split(sep='=')[1])
            last = int(maxlink.split(sep='&')[2].split(sep='=')[1])
            pages.clear()
            for i in range(first, last + 1):
                p = minlink[0:len(minlink) - 1] + str(i)
                pages.append(p)
        return pages

    def isStopWork(self):
        return self.stopped