Example #1
def update(request, gameid):
    game = get_object_or_404(Game, id=gameid)
    if game.is_locked():
        return HttpResponse(simplejson.dumps({
            'success': False,
            'message': 'Someone else is updating that game right now.  Please wait.'
        }), mimetype='application/json')
    else:
        game.lock()
    try:
        p = PageParser()
        newGame = p.Update(game)
        if newGame:
            return HttpResponse(simplejson.dumps({
                'success': True,
                'curPage': newGame.currentPage,
                'maxPages': newGame.maxPages
            }), mimetype='application/json')
        else:
            game.save()
            return HttpResponse(simplejson.dumps({
                'success': False,
                'message': 'There was a problem either downloading or parsing the forum page.  Please try again later.'
            }), mimetype='application/json')
    except:
        game.save()
        raise
Example #2
class FetchWiki(object):

	def __init__(self, dbserver):
		"""
		Initialize parser, connects to mongodb server and creates indexes
		if they don't exist.

		- dbserver: mongodb server's address.
		"""
		# Wikipedia page parser
		self.parser = PageParser(self.save)
		# Database client
		self.client = MongoClient(dbserver, 27017)
		# Database object
		self.db_wiki = self.client['db_wiki']
		# Events collection
		self.col_events = self.db_wiki['events']
		# Create text index on title (for title search) (if not exist)
		self.col_events.ensure_index([("title", pymongo.TEXT)])
		# Create indexes for year, day and category
		self.col_events.ensure_index([
			("year", pymongo.ASCENDING), 
			("category", pymongo.ASCENDING), 
			("day", pymongo.ASCENDING)])
		
	def save(self, **kwargs):
		""" Inserts into database a new entry """
		self.col_events.insert_one(kwargs)

	def start(self):
		""" Starts the parsing process """
		# Iterate over all days of a leap year (2016 is one)
		start_date = date(2016, 1, 1)
		end_date = date(2017, 1, 1)
		start = timer()
		
		for curr_date in self.__date_range(start_date, end_date):
			page_title = '{d:%B}_{d.day}'.format(d=curr_date)
			self.parse(page_title)

		end = timer()
		print "Time elapsed: ", end - start

	def parse(self, page_title):
		""" 
		Calls the parser for a given page title, 
		the parser calls save method
		"""

		try:
			print "Parsing page '%s'" % page_title,
			self.parser.parse_page(page_title)
			print " .... OK"
		except Exception:
			print " ... failed"

	def __date_range(self, start_date, end_date):
		""" Generates dates between start_date and end_date """
		for n in range(int((end_date - start_date).days)):
			yield start_date + timedelta(n)
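A minimal driver sketch for the class above. The module name fetch_wiki and the MongoDB address are assumptions made for illustration; FetchWiki and its start() method are taken from the example itself.

# Hypothetical usage of the FetchWiki class shown above.
# Assumes it is importable as fetch_wiki.FetchWiki and that a MongoDB
# server is reachable at the given address on port 27017.
from fetch_wiki import FetchWiki

fetcher = FetchWiki('localhost')  # connects and ensures the indexes
fetcher.start()                   # parses one "Month_day" page per day of 2016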
Example #3
    def replace(self, url):
        from Window import Window
        from PageParser import PageParser

        #TODO: Add referrer
        window = Window(self.__dict__['__document'].contentWindow.__dict__['__root'], self.fix_url(url))
        parser = PageParser(window, window.document, window.__dict__['__html'])
        parser.close()
        return url
Example #4
    def start(self, ):

        urls = self.FreshUrls
        self.FreshUrls = []
        for url in urls:
            req = self._get(url)
            pp = PageParser()
            pp.parse(req.text, self.PageInfo, self.BaseHost)
        self.PageInfo.output()
Example #5
 def __init__(self, urls, maxLinksToCrawl):
     self.linksToVisit = utils.elementToList(urls)
     if maxLinksToCrawl == 0:
         self.maxLinksToCrawl = sys.maxint
     else:
         self.maxLinksToCrawl = maxLinksToCrawl
     self.linksVisited = []     # The links we have visited.
     self.websitesVisited = []  # The websites we have visited.
     self.pathsNotToCrawl = []  # As defined in robots.txt.
     self.parser = PageParser() # Used to parse the websites we crawl.
Example #6
    def process(self):
        print ""
        pdfbuilder = PdfBuilder(self.fname, self.coords, self.W, self.H, self.rsrcmgr, self.laparams)
        for (pageno, page) in self.pagesEnumerator:
            if pageno > self.endPage:
                break
            if not self.pageRanges.isInRange(pageno+1):
                continue
            self.interpreter.process_page(page)
            # receive the LTPage object for the page.
            layout = self.device.get_result()
            print "processing page %d                                         \n" %(pageno+1),
            trimboxes = [self.trimbox]
            if self.trimboxes != None and pageno in self.trimboxes:
                trimboxes = self.trimboxes[pageno]
             
            pageParser= PageParser(layout, self.maxSplit, self.W, self.H, trimboxes, self.exclude, pagebox=self.getpagebox(page))
            pageCoords = pageParser.process()
            maxScale = self.GetMaxScale(pageCoords['combinedLines'])
            pageCoords['pageno']= pageno
            crops = None
            if pageCoords != None:
                crops = pageCoords['crops']
            
            if crops == None or len(crops)==0:
                continue
            self.coords.append((pageno, crops))
            self.pagesCoords.append(pageCoords)
            pdfbuilder.processTrimmed(page, pageno, crops, maxScale = maxScale) #self.outputJson)

        self.scales = pdfbuilder.endProcess()
        if self.DEBUG > 0:
            with open(self.picklefile, 'wb') as f:
                procResult = {'args': self.args, 'pagesCoords':self.pagesCoords, 'scales':self.scales}
                pickle.dump(procResult, f)

        if self.coords == None or len(self.coords) ==0:
            print "No objects found\r"
            return
        
        with open(self.fname + '.json', 'wb') as f:
            f.write(json.dumps({'scales':self.scales}))
        try:
            params = [r"java.exe", "-cp", r"pdf2ereader.jar", "jpdftoepub.TrimPdf", "crop", self.fname, self.outfile,self.fname + '.json']
            print ' '.join(params)
            p = subprocess.Popen(params)
            p.wait()
            #sself.DEBUG = 1
            if p.returncode == 0 and not (self.DEBUG >0):
                os.remove(self.fname + '.json')
                os.remove(self.fname + ".cropped-streams")
            print "\nDone"
        except Exception, e:
            print e
Example #7
def main():
    # Variables for testing in IDE
    document_id = "Ms-114"
    page_path = "./Ms-114/page"
    json_path = "./Ms-114/test_output.json"

    # Actual arguments when calling from command line
    #page_path, document_id, json_path = handle_args()

    parser = PageParser(page_path, document_id)
    parser.read_files()
    save_json(parser, json_path)
Example #8
 def __init__(self,
              sale_terms,
              status,
              page_number=1,
              max_retry=3,
              max_wait_timeout=10,
              tasks=None):
     threading.Thread.__init__(self)
     PageParser.__init__(self, base_url, max_retry, max_wait_timeout)
     self.daemon = True
     self.sale_terms = sale_terms
     self.status = status
     self.page_number = page_number
     self.tasks = tasks
Example #9
class pytrends():
    
    def __init__(self):
        pass

    def __clean_date__(self,date):
        """Google Trends uses int-based date"""
        tmp = []
        for x in  date.split('-'):
            tmp.append(str((int(x))))
        return '-'.join(tmp)
    
    def trends_by_date(self,date):
        """Returns a list of Google Trends Keywords by Date. Returns False on Error"""
        args = {'sa':'X',
                'date':self.__clean_date__(date)
                }
        q = urllib.urlencode(args)
        html = ''
        try:
            html = urllib.urlopen(URL_TRENDS+q).read()
        except Exception,e:
            return False
        x = PageParser()
        x.feed(html)
        keywords = []
        for href in  x.hrefs:
            if '/trends/hottrends?q=' in href:
                url =  urlparse( 'http://www.google.com/'+href )
                params = dict([part.split('=') for part in url[4].split('&')])
                for k,v in params.items():
                    params[k] = v.replace('+',' ')
                keywords.append(params['q'])
        return keywords
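A short, hedged sketch of how the class above could be called; URL_TRENDS and the PageParser it uses internally are assumed to be defined in the same module the example comes from.

# Hypothetical usage of the pytrends class shown above.
trends = pytrends()
keywords = trends.trends_by_date('2013-05-07')  # date string is cleaned by __clean_date__
if keywords is False:
    print "Failed to fetch or parse the trends page."
else:
    for keyword in keywords:
        print keyword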
Example #10
def check_update_game(game):
    if game.is_locked():
        return game
    else:
        game.lock()

    try:
        p = PageParser()
        newGame = p.Update(game)
        if newGame:
            return newGame
        else:
            game.save()
            return game
    except:
        return game
Example #11
def add_game(request, threadid):
    data = {'success': True, 'message': 'Success!', 'url': ''}

    try:
        game = Game.objects.get(threadId=threadid)
        data['url'] = game.get_absolute_url()
    except Game.DoesNotExist:
        p = PageParser()
        p.user = request.user
        game = p.Add(threadid)
        if game:
            data['url'] = game.get_absolute_url()
            game.status_update("A new game was created by %s!" %
                               game.moderator)
        else:
            data['success'] = False
            data['message'] = "Couldn't download or parse the forum thread.  Sorry!"

    return HttpResponse(simplejson.dumps(data), mimetype='application/json')
Example #12
 def render_string(self, textstate, seq):
     curItem = self.cur_item 
     l1 = len(curItem)
     PDFTextDevice.render_string(self, textstate, seq)
     l2 = len(curItem)
     
     b = reduce(lambda x,y: PageParser.mergeBoxes(x, y.bbox), curItem._objs[l1:l2], None )
     if self.inFigure:
         self.showfigure = [x[0] or x[1] for x in zip(self.showfigure, self.intersect(b))]
     else:
         self.showtext = self.intersect(b)
Example #13
 def paint_path(self, gstate, stroke, fill, evenodd, path):
     curItem = self.cur_item 
     l1 = len(curItem)
     PDFPageAggregator.paint_path(self, gstate, stroke, fill, evenodd, path)
     l2 = len(curItem)
     
     b = reduce(lambda x,y: PageParser.mergeBoxes(x, y.bbox), curItem._objs[l1:l2], None )
     if self.inFigure:
         self.showfigure = [x[0] or x[1] for x in zip(self.showfigure, self.intersect(b))]
     else:
         self.showpath = self.intersect(b)
Example #14
    def handle_src(self, name, val):
        url = self.__dict__['__window'].document.location.fix_url(val)

        if config.retrieval_all:
            hc.get(url, self.__dict__['__window'].document.location.href)
        
        scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
        if scheme not in ('http','file','https','ftp'):
            config.VERBOSE(config.VERBOSE_WARNING, "[WARNING] Got unknown scheme: %s in %s.%s ."%(url,self.tagName, name));
            if 'onerror' in self.__dict__:
                config.VERBOSE(config.VERBOSE_DEBUG, "[DEBUG] Calling onerror of %s."%(self.tagName));
                self.onerror()

        if self.tagName == "iframe":
            from Window import Window
            from PageParser import PageParser
            window = Window(self.__dict__['__window'].__dict__['__root'],
                            self.__dict__['__window'].document.location.fix_url(val),
                            self.__dict__['__window'].document.location.href)
            parser = PageParser(window, window.document, window.__dict__['__html'])
            parser.close()
Example #15
 def handle_innerHTML(self, name, val):
     val = str(val)
     if self.__parser:
         self.__parser.html = self.__parser.html[:self.begin] + val + self.__parser.html[self.end:]
         dev = self.end - self.begin - len(val)
         for i in self.__dict__['__window'].document.all:
             if i.begin:
                 if i.begin > self.end:
                     i.begin -= dev
             if i.end:
                 if i.end >= self.end:
                     i.end -= dev
         
         self.__parser.current -= dev
         return
     
     from PageParser import PageParser
     self.__parser = PageParser(self.__dict__['__window'], self.__dict__['__window'].document, val)
Example #16
 def parse(self, **args):
     page = args['page']
     source = args['source']
     hashurl = args['hashurl']
     ret = PageParser.parse(page, source)
     if ret.has_key('error'):
         Logger.info(hashurl+' '+ret['error'])
         return
     record = '\t'.join([
         hashurl,
         ret['title2'] if ret['title2']\
             else ret['title'],
         json.dumps(ret['author']),
         json.dumps(ret['images']),
         json.dumps(ret['links']),
         ret['text'],
         ret['pub_time'],
     ]).encode('utf-8')
     self._db.insert(record)
Example #17
    def write(self, text):
        """
        Writes a string of text to a document stream.
        Syntax

        document.write(text) 

        Parameters

        text is a string containing the text to be written to the current
        document.
        """
        config.VERBOSE(config.VERBOSE_DEBUG,
                       '[DEBUG] in Document.py Document.write(ln)...')
        config.VERBOSE(config.VERBOSE_DETAIL, str(text))

        self.__dict__['__dynamic'].append(text)
        content = ''.join(self.__dict__['__dynamic'])
        p = PageParser(self.contentWindow,
                       self.contentWindow.__dict__['__sl'][-1], content, True)
Example #18
	def __init__(self, dbserver):
		"""
		Initialize parser, connects to mongodb server and creates indexes
		if they don't exist.

		- dbserver: mongodb server's address.
		"""
		# Wikipedia page parser
		self.parser = PageParser(self.save)
		# Database client
		self.client = MongoClient(dbserver, 27017)
		# Database object
		self.db_wiki = self.client['db_wiki']
		# Events collection
		self.col_events = self.db_wiki['events']
		# Create text index on title (for title search) (if not exist)
		self.col_events.ensure_index([("title", pymongo.TEXT)])
		# Create indexes for year, day and category
		self.col_events.ensure_index([
			("year", pymongo.ASCENDING), 
			("category", pymongo.ASCENDING), 
			("day", pymongo.ASCENDING)])
Example #19
 def crawl(self, **args):
     source = args['source']
     ext = args['ext']
     reply_time = args['reply_time']
     br = Browser()
     page = br.open(self.baseurl)
     new_reply_time = reply_time
     while True:
         links = PageParser.parse(page, source)
         for i, link in enumerate(links):
             if reply_time < link.reply_time:
                 if i is 0:
                     new_reply_time = link.reply_time
                 self.db.insert('\t'.join([str(link), source, json.dumps(ext)]))
             else:
                 return new_reply_time
         try:
             page = br.follow_link(text='后页>')
         except:
             Logger.info('finished!')
             break
     return new_reply_time
Example #20
 def parse(self, page=-1):
     if page == 1:
         return 'User Instruction Page'
     elif page == 0:
         return 'Page Number Error'
     elif page > 1:
         current_ws = self.load_page(page - 1)
         page_parser = PageParser(current_ws, self.table_type)
         return {
             1: page_parser.parse_attached_table(),
             2: page_parser.parse_main_table()
         }
     else:
         num_of_page = len(sheet_type[self.table_type]['page'])
         data = {}
         for i in range(1, num_of_page):
             current_ws = self.load_page(i - 1)
             page_parser = PageParser(current_ws, self.table_type)
             page_name = sheet_type[self.table_type]['page'][i]
             data[page_name] = {
                 1: page_parser.parse_attached_table(),
                 2: page_parser.parse_main_table()
             }
         return data
Example #21
 def crawl(self, **args):
     source = args['source']
     ext = args['ext']
     reply_time = args['reply_time']
     br = Browser()
     page = br.open(self.baseurl)
     new_reply_time = reply_time
     while True:
         links = PageParser.parse(page, source)
         for i, link in enumerate(links):
             if reply_time < link.reply_time:
                 if i is 0:
                     new_reply_time = link.reply_time
                 self.db.insert('\t'.join(
                     [str(link), source, json.dumps(ext)]))
             else:
                 return new_reply_time
         try:
             page = br.follow_link(text='后页>')
         except:
             Logger.info('finished!')
             break
     return new_reply_time
Example #22
    def process(self):
        print ""
        pdfbuilder = PdfBuilder(self.fname, self.coords, self.W, self.H,
                                self.rsrcmgr, self.laparams)
        for (pageno, page) in self.pagesEnumerator:
            if pageno > self.endPage:
                break
            if not self.pageRanges.isInRange(pageno + 1):
                continue
            self.interpreter.process_page(page)
            # receive the LTPage object for the page.
            layout = self.device.get_result()
            print "processing page %d                                         \n" % (
                pageno + 1),
            trimboxes = [self.trimbox]
            if self.trimboxes != None and pageno in self.trimboxes:
                trimboxes = self.trimboxes[pageno]

            pageParser = PageParser(layout,
                                    self.maxSplit,
                                    self.W,
                                    self.H,
                                    trimboxes,
                                    self.exclude,
                                    pagebox=self.getpagebox(page))
            pageCoords = pageParser.process()
            maxScale = self.GetMaxScale(pageCoords['combinedLines'])
            pageCoords['pageno'] = pageno
            crops = None
            if pageCoords != None:
                crops = pageCoords['crops']

            if crops == None or len(crops) == 0:
                continue
            self.coords.append((pageno, crops))
            self.pagesCoords.append(pageCoords)
            pdfbuilder.processTrimmed(page, pageno, crops,
                                      maxScale=maxScale)  #self.outputJson)

        self.scales = pdfbuilder.endProcess()
        if self.DEBUG > 0:
            with open(self.picklefile, 'wb') as f:
                procResult = {
                    'args': self.args,
                    'pagesCoords': self.pagesCoords,
                    'scales': self.scales
                }
                pickle.dump(procResult, f)

        if self.coords == None or len(self.coords) == 0:
            print "No objects found\r"
            return

        with open(self.fname + '.json', 'wb') as f:
            f.write(json.dumps({'scales': self.scales}))
        try:
            params = [
                r"java.exe", "-cp", r"pdf2ereader.jar", "jpdftoepub.TrimPdf",
                "crop", self.fname, self.outfile, self.fname + '.json'
            ]
            print ' '.join(params)
            p = subprocess.Popen(params)
            p.wait()
            #sself.DEBUG = 1
            if p.returncode == 0 and not (self.DEBUG > 0):
                os.remove(self.fname + '.json')
                os.remove(self.fname + ".cropped-streams")
            print "\nDone"
        except Exception, e:
            print e
Example #23
class Crawler():
    # pre:  urls is either an absolute url in string format
    #       or a list of absolute urls in string format.
    #       maxLinksToCrawl is an integer.
    # post: An instance of Crawler is initiated. When future
    #       crawl will be started, the crawler starts by
    #       crawling the given urls. In total, it will
    #       not crawl any more websites than maxLinksToCrawl.
    def __init__(self, urls, maxLinksToCrawl):
        self.linksToVisit = utils.elementToList(urls)
        if maxLinksToCrawl == 0:
            self.maxLinksToCrawl = sys.maxint
        else:
            self.maxLinksToCrawl = maxLinksToCrawl
        self.linksVisited = []     # The links we have visited.
        self.websitesVisited = []  # The websites we have visited.
        self.pathsNotToCrawl = []  # As defined in robots.txt.
        self.parser = PageParser() # Used to parse the websites we crawl.

    # pre:  urls is a string or a list of strings of websites to crawl.
    #       maxLinksToCrawl is the maximum amount of pages to crawl.
    #       if equal to zero, there's "no" limit for pages to crawl.
    # post: Crawls the web beginning with the given urls. Writes the 
    #       visible text on the webpages to disk in the folder Mapped.
    def crawl(self):
        sys.stdout.write("\n")
        while len(self.linksVisited) < self.maxLinksToCrawl and len(self.linksToVisit) > 0:
            url = self.linksToVisit[0] # Fetch the url to parse
            self.linksToVisit = self.linksToVisit[1:] # Delete it from linksToVisit

            # If we've reached a new website, obey defined robots exclusion rules.
            if urlparse(url).netloc not in self.websitesVisited:
                self.obeyRobotsExclusion(url)
                self.websitesVisited.append(urlparse(url).netloc)
            
            self.manageLinksAndData(url)

            # Crawler's politeness. Wait 2 seconds before crawling next link.
            time.sleep(2)

    # pre:  We have reached a new website during the crawl.
    # post: The url's website' robots.txt file is consulted if it exists.
    #       The crawler will take note of which paths are not to be crawled,
    #       as defined in the .txt file. In other words, the .txt file's
    #       content is used for following the robot exclusion standard.
    def obeyRobotsExclusion(self, url):
        sys.stdout.write(" -> Entering a new website. ")
        disallowedPaths = self.parser.getRobotsTXTDisallowedPathsFrom(url)
        if not disallowedPaths:
            sys.stdout.write("The website allows all paths to be crawled.\n")
        else:
            sys.stdout.write("Disallowed paths will be respected.\n")
            self.pathsNotToCrawl += disallowedPaths

    # post: Lets the PageParser retrieve the url's webpage links and data.
    #       Furthermore, filters out links that are not to be crawled and
    #       stores the remaining links for future crawling.
    def manageLinksAndData(self, url):
        sys.stdout.write(" [" + str(len(self.linksVisited)) + "] Crawling: " + url + "\n")
        try:
            self.linksVisited = self.linksVisited + [url]
            self.parser.parse(url)
            self.linksToVisit = self.linksToVisit \
                                + utils.filterOutLinks(self.parser.links, \
                                                       self.pathsNotToCrawl \
                                                       + self.linksVisited \
                                                       + self.linksToVisit)
            sys.stdout.write(" -> Success.\n")
        except URLError:
            sys.stdout.write(" -> Failed.\n")
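A brief driver sketch for the Crawler class above, following the constructor contract described in its comments; the module name crawler and the seed URL are assumptions.

# Hypothetical usage of the Crawler class shown above.
from crawler import Crawler

seed = "http://example.com/"
spider = Crawler(seed, 50)  # crawl at most 50 pages; 0 would mean "no" limit (sys.maxint)
spider.crawl()              # obeys robots.txt and sleeps 2 seconds between pages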
Example #24
 def reduceCrops(self, crops):
     scales = []
     if False:
         zeroHeight = [idx for idx in range(len(crops)) if crops[idx][3]-crops[idx][1]==0]
         reduced = False
         for idx in zeroHeight:
             prev = idx
             next = idx
             while True:
                 prev-=1
                 if prev < 0:
                     prev = None
                     break
                 if crops[prev]!= None:
                     break
             while True:
                 next+=1
                 if not (next < len(crops)):
                     next = None
                     break
                 if crops[next]!=None:
                     break
             if prev == None and next == None:
                 continue
             reduced = True
             if prev != None and next != None:
                 pass
                 #crops[p]
             elif prev != None:
                 crops[prev] = PageParser.mergeBoxes(crops[idx], crops[prev])
                 crops[idx] = None
             else:
                 crops[next] = PageParser.mergeBoxes(crops[idx], crops[next])
                 crops[idx] = None
 
         if reduced:
             try:
                 while True:
                     crops.remove(None)
             except ValueError:
                 pass
             
     for idx in range(len(crops)):
         _, _, _,  (scale, _, _) = self.getTransformation(crops[idx], idx==0)
         scales.append(scale)
     scales = [[idx,idx+1] for idx in range(len(crops)-1) if scales[idx]==self.maxScale and scales[idx+1]==self.maxScale]
     cnt = len(scales)
     idx = 0
     reduced = False
     while idx < cnt:
         box = PageParser.mergeBoxes(crops[scales[idx][0]], crops[scales[idx][1]])
         _, _, _,  (scale, _, _) = self.getTransformation(box, idx==0)
         if scale == self.maxScale:
             reduced = True
             crops[scales[idx][0]] = box
             crops[scales[idx][1]] = None
             if [scales[idx][1],scales[idx][1]+1] in scales:
                 cnt-=1
                 scales[idx][1]=scales[idx+1][1]
                 del(scales[idx+1])
             else:
                 idx+=1
         else:
             idx+=1
     if reduced:
         try:
             while True:
                 crops.remove(None)
         except ValueError:
             pass
Example #25
from PageParser import PageParser
from CourseraSessionClass import CourseraSession
import os

session = CourseraSession().initSession()

response = session.get("https://class.coursera.org/hetero-004/lecture") #interactivepython2-009

tet = PageParser()
result = tet.feed( response.text.encode("utf-8") )

if os.path.isdir("Video"):
    print "File Exists!!"
else:
    os.makedirs("Video")

index = 1
for i in result:
    f = session.get(i)
    filename = "Video\\Video%02d.mp4" % index
    with open(filename, 'wb') as code:
        code.write(f.content)
    print "Download Complete. [%02d/%02d]" % (index, result.__len__())
    index += 1
Example #26
    def reduceCrops(self, crops):
        scales = []
        if False:
            zeroHeight = [
                idx for idx in range(len(crops))
                if crops[idx][3] - crops[idx][1] == 0
            ]
            reduced = False
            for idx in zeroHeight:
                prev = idx
                next = idx
                while True:
                    prev -= 1
                    if prev < 0:
                        prev = None
                        break
                    if crops[prev] != None:
                        break
                while True:
                    next += 1
                    if not (next < len(crops)):
                        next = None
                        break
                    if crops[next] != None:
                        break
                if prev == None and next == None:
                    continue
                reduced = True
                if prev != None and next != None:
                    pass
                    #crops[p]
                elif prev != None:
                    crops[prev] = PageParser.mergeBoxes(
                        crops[idx], crops[prev])
                    crops[idx] = None
                else:
                    crops[next] = PageParser.mergeBoxes(
                        crops[idx], crops[next])
                    crops[idx] = None

            if reduced:
                try:
                    while True:
                        crops.remove(None)
                except ValueError:
                    pass

        for idx in range(len(crops)):
            _, _, _, (scale, _,
                      _) = self.getTransformation(crops[idx], idx == 0)
            scales.append(scale)
        scales = [
            [idx, idx + 1] for idx in range(len(crops) - 1)
            if scales[idx] == self.maxScale and scales[idx +
                                                       1] == self.maxScale
        ]
        cnt = len(scales)
        idx = 0
        reduced = False
        while idx < cnt:
            box = PageParser.mergeBoxes(crops[scales[idx][0]],
                                        crops[scales[idx][1]])
            _, _, _, (scale, _, _) = self.getTransformation(box, idx == 0)
            if scale == self.maxScale:
                reduced = True
                crops[scales[idx][0]] = box
                crops[scales[idx][1]] = None
                if [scales[idx][1], scales[idx][1] + 1] in scales:
                    cnt -= 1
                    scales[idx][1] = scales[idx + 1][1]
                    del (scales[idx + 1])
                else:
                    idx += 1
            else:
                idx += 1
        if reduced:
            try:
                while True:
                    crops.remove(None)
            except ValueError:
                pass
Example #27
 def __init__(self, product_id):
     PageParser.__init__(self, url=url_ficha + product_id)
     self.product_id = product_id
Example #28
import sys
from PageParser import PageParser
from PageTopicAnalyzer import PageTopicAnalyzer

if __name__ == '__main__':
    reload(sys)
    sys.setdefaultencoding('UTF8')
    if len(sys.argv) < 2:
        print("URL missing! Please try again.")
    elif len(sys.argv) > 2:
        print(
            "The program takes exactly one argument. Two received. Please try again."
        )
    else:
        url = sys.argv[1]
        parser = PageParser(url)
        allText = parser.getAllText()
        # print(allText)
        titleText = parser.getTitle()
        # print(titleText)
        headingText = parser.getHeadings()
        # print(headingText)

        allAnalyzer = PageTopicAnalyzer(allText)
        # print(allAnalyzer.bagOfWords)
        titleAnalyzer = PageTopicAnalyzer(titleText)
        # print(titleAnalyzer.bagOfWords)

        ## Unigram ##
        allAnalyzer.unigram()
        titleAnalyzer.bigram()