def update(request, gameid):
    game = get_object_or_404(Game, id=gameid)
    if game.is_locked():
        return HttpResponse(simplejson.dumps({
            'success': False,
            'message': 'Someone else is updating that game right now. Please wait.'
        }), mimetype='application/json')
    else:
        game.lock()
        try:
            p = PageParser()
            newGame = p.Update(game)
            if newGame:
                return HttpResponse(simplejson.dumps({
                    'success': True,
                    'curPage': newGame.currentPage,
                    'maxPages': newGame.maxPages
                }), mimetype='application/json')
            else:
                game.save()
                return HttpResponse(simplejson.dumps({
                    'success': False,
                    'message': 'There was a problem either downloading or parsing the forum page. Please try again later.'
                }), mimetype='application/json')
        except:
            game.save()
            raise
class FetchWiki(object):
    def __init__(self, dbserver):
        """
        Initialize the parser, connect to the mongodb server and create
        indexes if they don't exist.
        - dbserver: mongodb server's address.
        """
        # Wikipedia page parser
        self.parser = PageParser(self.save)
        # Database client
        self.client = MongoClient(dbserver, 27017)
        # Database object
        self.db_wiki = self.client['db_wiki']
        # Events collection
        self.col_events = self.db_wiki['events']
        # Create text index on title (for title search) (if not exist)
        self.col_events.ensure_index([("title", pymongo.TEXT)])
        # Create indexes for year, day and category
        self.col_events.ensure_index([
            ("year", pymongo.ASCENDING),
            ("category", pymongo.ASCENDING),
            ("day", pymongo.ASCENDING)])

    def save(self, **kwargs):
        """ Inserts a new entry into the database """
        self.col_events.insert_one(kwargs)

    def start(self):
        """ Starts the parsing process """
        # Iterate over all days of a leap year (2016 is one)
        start_date = date(2016, 1, 1)
        end_date = date(2017, 1, 1)
        start = timer()
        for curr_date in self.__date_range(start_date, end_date):
            page_title = '{d:%B}_{d.day}'.format(d=curr_date)
            self.parse(page_title)
        end = timer()
        print "Time elapsed: ", end - start

    def parse(self, page_title):
        """ Calls the parser for a given page title; the parser calls the save method """
        try:
            print "Parsing page '%s'" % page_title,
            self.parser.parse_page(page_title)
            print " .... OK"
        except Exception:
            print " ... failed"

    def __date_range(self, start_date, end_date):
        """ Generates dates between start_date and end_date """
        for n in range(int((end_date - start_date).days)):
            yield start_date + timedelta(n)
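# A minimal usage sketch for the class above (not part of the original source):
# the module path 'fetch_wiki' and the MongoDB address are assumptions made
# only for illustration.
from fetch_wiki import FetchWiki

if __name__ == '__main__':
    fetcher = FetchWiki('localhost')  # connects and ensures the indexes exist
    fetcher.start()                   # parses one Wikipedia day page per date of 2016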
def replace(self, url):
    from Window import Window
    from PageParser import PageParser
    # TODO: Add referrer
    window = Window(self.__dict__['__document'].contentWindow.__dict__['__root'],
                    self.fix_url(url))
    parser = PageParser(window, window.document, window.__dict__['__html'])
    parser.close()
    return url
def start(self):
    urls = self.FreshUrls
    self.FreshUrls = []
    for url in urls:
        req = self._get(url)
        pp = PageParser()
        pp.parse(req.text, self.PageInfo, self.BaseHost)
        self.PageInfo.output()
def process(self): print "" pdfbuilder = PdfBuilder(self.fname, self.coords, self.W, self.H, self.rsrcmgr, self.laparams) for (pageno, page) in self.pagesEnumerator: if pageno > self.endPage: break if not self.pageRanges.isInRange(pageno+1): continue self.interpreter.process_page(page) # receive the LTPage object for the page. layout = self.device.get_result() print "processing page %d \n" %(pageno+1), trimboxes = [self.trimbox] if self.trimboxes != None and pageno in self.trimboxes: trimboxes = self.trimboxes[pageno] pageParser= PageParser(layout, self.maxSplit, self.W, self.H, trimboxes, self.exclude, pagebox=self.getpagebox(page)) pageCoords = pageParser.process() maxScale = self.GetMaxScale(pageCoords['combinedLines']) pageCoords['pageno']= pageno crops = None if pageCoords != None: crops = pageCoords['crops'] if crops == None or len(crops)==0: continue self.coords.append((pageno, crops)) self.pagesCoords.append(pageCoords) pdfbuilder.processTrimmed(page, pageno, crops, maxScale = maxScale) #self.outputJson) self.scales = pdfbuilder.endProcess() if self.DEBUG > 0: with open(self.picklefile, 'wb') as f: procResult = {'args': self.args, 'pagesCoords':self.pagesCoords, 'scales':self.scales} pickle.dump(procResult, f) if self.coords == None or len(self.coords) ==0: print "No objects found\r" return with open(self.fname + '.json', 'wb') as f: f.write(json.dumps({'scales':self.scales})) try: params = [r"java.exe", "-cp", r"pdf2ereader.jar", "jpdftoepub.TrimPdf", "crop", self.fname, self.outfile,self.fname + '.json'] print ' '.join(params) p = subprocess.Popen(params) p.wait() #sself.DEBUG = 1 if p.returncode == 0 and not (self.DEBUG >0): os.remove(self.fname + '.json') os.remove(self.fname + ".cropped-streams") print "\nDone" except Exception, e: print e
def main():
    # Variables for testing in IDE
    document_id = "Ms-114"
    page_path = "./Ms-114/page"
    json_path = "./Ms-114/test_output.json"

    # Actual arguments when calling from command line
    # page_path, document_id, json_path = handle_args()

    parser = PageParser(page_path, document_id)
    parser.read_files()
    save_json(parser, json_path)
def __init__(self, sale_terms, status, page_number=1, max_retry=3,
             max_wait_timeout=10, tasks=None):
    threading.Thread.__init__(self)
    PageParser.__init__(self, base_url, max_retry, max_wait_timeout)
    self.daemon = True
    self.sale_terms = sale_terms
    self.status = status
    self.page_number = page_number
    self.tasks = tasks
class pytrends():
    def __init__(self):
        pass

    def __clean_date__(self, date):
        """Google Trends uses int-based dates"""
        tmp = []
        for x in date.split('-'):
            tmp.append(str(int(x)))
        return '-'.join(tmp)

    def trends_by_date(self, date):
        """Returns a list of Google Trends keywords for a date. Returns False on error."""
        args = {'sa': 'X', 'date': self.__clean_date__(date)}
        q = urllib.urlencode(args)
        html = ''
        try:
            html = urllib.urlopen(URL_TRENDS + q).read()
        except Exception, e:
            return False
        x = PageParser()
        x.feed(html)
        keywords = []
        for href in x.hrefs:
            if '/trends/hottrends?q=' in href:
                url = urlparse('http://www.google.com/' + href)
                params = dict([part.split('=') for part in url[4].split('&')])
                for k, v in params.items():
                    params[k] = v.replace('+', ' ')
                keywords.append(params['q'])
        return keywords
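# A minimal usage sketch for the class above (not part of the original source):
# the module name 'pytrends' used in the import is an assumption for
# illustration only; only the method behaviour follows the code above.
from pytrends import pytrends

if __name__ == '__main__':
    trends = pytrends()
    keywords = trends.trends_by_date('2015-03-01')  # date parts are normalized to ints
    if keywords is False:
        print "Could not fetch or parse the trends page."
    else:
        for kw in keywords:
            print kw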
def check_update_game(game):
    if game.is_locked():
        return game
    else:
        game.lock()
        try:
            p = PageParser()
            newGame = p.Update(game)
            if newGame:
                return newGame
            else:
                game.save()
                return game
        except:
            return game
def add_game(request, threadid):
    data = {'success': True, 'message': 'Success!', 'url': ''}
    try:
        game = Game.objects.get(threadId=threadid)
        data['url'] = game.get_absolute_url()
    except Game.DoesNotExist:
        p = PageParser()
        p.user = request.user
        game = p.Add(threadid)
        if game:
            data['url'] = game.get_absolute_url()
            game.status_update("A new game was created by %s!" % game.moderator)
        else:
            data['success'] = False
            data['message'] = "Couldn't download or parse the forum thread. Sorry!"
    return HttpResponse(simplejson.dumps(data), mimetype='application/json')
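# A hypothetical urls.py wiring for the two views above (not from the original
# source): the app module path 'wolfgame.views' and the URL patterns are
# assumptions; old-style patterns() is used because the views rely on the
# pre-1.7 Django 'mimetype' argument.
from django.conf.urls.defaults import patterns, url

urlpatterns = patterns('',
    url(r'^game/(?P<gameid>\d+)/update/$', 'wolfgame.views.update'),
    url(r'^game/add/(?P<threadid>\d+)/$', 'wolfgame.views.add_game'),
)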
def render_string(self, textstate, seq):
    curItem = self.cur_item
    l1 = len(curItem)
    PDFTextDevice.render_string(self, textstate, seq)
    l2 = len(curItem)
    b = reduce(lambda x, y: PageParser.mergeBoxes(x, y.bbox),
               curItem._objs[l1:l2], None)
    if self.inFigure:
        self.showfigure = [x[0] or x[1]
                           for x in zip(self.showfigure, self.intersect(b))]
    else:
        self.showtext = self.intersect(b)
def paint_path(self, gstate, stroke, fill, evenodd, path):
    curItem = self.cur_item
    l1 = len(curItem)
    PDFPageAggregator.paint_path(self, gstate, stroke, fill, evenodd, path)
    l2 = len(curItem)
    b = reduce(lambda x, y: PageParser.mergeBoxes(x, y.bbox),
               curItem._objs[l1:l2], None)
    if self.inFigure:
        self.showfigure = [x[0] or x[1]
                           for x in zip(self.showfigure, self.intersect(b))]
    else:
        self.showpath = self.intersect(b)
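# The two overrides above fold newly added layout objects into a single
# bounding box via PageParser.mergeBoxes. That helper's implementation is not
# shown in these snippets; the function below is only a plausible sketch of
# its behaviour, inferred from how it is called (a reduce() whose initial
# accumulator is None).
def merge_boxes_sketch(a, b):
    """Return the union of two (x0, y0, x1, y1) boxes; either argument may be None."""
    if a is None:
        return b
    if b is None:
        return a
    return (min(a[0], b[0]), min(a[1], b[1]),
            max(a[2], b[2]), max(a[3], b[3]))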
def handle_src(self, name, val):
    url = self.__dict__['__window'].document.location.fix_url(val)
    if config.retrieval_all:
        hc.get(url, self.__dict__['__window'].document.location.href)
    scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
    if scheme not in ('http', 'file', 'https', 'ftp'):
        config.VERBOSE(config.VERBOSE_WARNING,
                       "[WARNING] Got unknown scheme: %s in %s.%s ." % (url, self.tagName, name))
    if 'onerror' in self.__dict__:
        config.VERBOSE(config.VERBOSE_DEBUG,
                       "[DEBUG] Calling onerror of %s." % (self.tagName))
        self.onerror()
    if self.tagName == "iframe":
        from Window import Window
        from PageParser import PageParser
        window = Window(self.__dict__['__window'].__dict__['__root'],
                        self.__dict__['__window'].document.location.fix_url(val),
                        self.__dict__['__window'].document.location.href)
        parser = PageParser(window, window.document, window.__dict__['__html'])
        parser.close()
def handle_innerHTML(self, name, val):
    val = str(val)
    if self.__parser:
        self.__parser.html = (self.__parser.html[:self.begin] + val
                              + self.__parser.html[self.end:])
        dev = self.end - self.begin - len(val)
        for i in self.__dict__['__window'].document.all:
            if i.begin:
                if i.begin > self.end:
                    i.begin -= dev
            if i.end:
                if i.end >= self.end:
                    i.end -= dev
        self.__parser.current -= dev
        return
    from PageParser import PageParser
    self.__parser = PageParser(self.__dict__['__window'],
                               self.__dict__['__window'].document, val)
def parse(self, **args):
    page = args['page']
    source = args['source']
    hashurl = args['hashurl']
    ret = PageParser.parse(page, source)
    if 'error' in ret:
        Logger.info(hashurl + ' ' + ret['error'])
        return
    record = '\t'.join([
        hashurl,
        ret['title2'] if ret['title2'] else ret['title'],
        json.dumps(ret['author']),
        json.dumps(ret['images']),
        json.dumps(ret['links']),
        ret['text'],
        ret['pub_time'],
    ]).encode('utf-8')
    self._db.insert(record)
def write(self, text):
    """
    Writes a string of text to a document stream.

    Syntax
        document.write(text)

    Parameters
        text is a string containing the text to be written to the current document.
    """
    config.VERBOSE(config.VERBOSE_DEBUG,
                   '[DEBUG] in Document.py Document.write(ln)...')
    config.VERBOSE(config.VERBOSE_DETAIL, str(text))
    self.__dict__['__dynamic'].append(text)
    content = ''.join(self.__dict__['__dynamic'])
    p = PageParser(self.contentWindow, self.contentWindow.__dict__['__sl'][-1],
                   content, True)
def crawl(self, **args):
    source = args['source']
    ext = args['ext']
    reply_time = args['reply_time']
    br = Browser()
    page = br.open(self.baseurl)
    new_reply_time = reply_time
    while True:
        links = PageParser.parse(page, source)
        for i, link in enumerate(links):
            if reply_time < link.reply_time:
                if i == 0:
                    new_reply_time = link.reply_time
                self.db.insert('\t'.join([str(link), source, json.dumps(ext)]))
            else:
                return new_reply_time
        try:
            # Follow the "next page" (后页) link of the forum listing.
            page = br.follow_link(text='后页>')
        except:
            Logger.info('finished!')
            break
    return new_reply_time
def parse(self, page=-1):
    if page == 1:
        return 'User Instruction Page'
    elif page == 0:
        return 'Page Number Error'
    elif page > 1:
        current_ws = self.load_page(page - 1)
        page_parser = PageParser(current_ws, self.table_type)
        return {
            1: page_parser.parse_attached_table(),
            2: page_parser.parse_main_table()
        }
    else:
        num_of_page = len(sheet_type[self.table_type]['page'])
        data = {}
        for i in range(1, num_of_page):
            current_ws = self.load_page(i - 1)
            page_parser = PageParser(current_ws, self.table_type)
            page_name = sheet_type[self.table_type]['page'][i]
            data[page_name] = {
                1: page_parser.parse_attached_table(),
                2: page_parser.parse_main_table()
            }
        return data
def process(self): print "" pdfbuilder = PdfBuilder(self.fname, self.coords, self.W, self.H, self.rsrcmgr, self.laparams) for (pageno, page) in self.pagesEnumerator: if pageno > self.endPage: break if not self.pageRanges.isInRange(pageno + 1): continue self.interpreter.process_page(page) # receive the LTPage object for the page. layout = self.device.get_result() print "processing page %d \n" % ( pageno + 1), trimboxes = [self.trimbox] if self.trimboxes != None and pageno in self.trimboxes: trimboxes = self.trimboxes[pageno] pageParser = PageParser(layout, self.maxSplit, self.W, self.H, trimboxes, self.exclude, pagebox=self.getpagebox(page)) pageCoords = pageParser.process() maxScale = self.GetMaxScale(pageCoords['combinedLines']) pageCoords['pageno'] = pageno crops = None if pageCoords != None: crops = pageCoords['crops'] if crops == None or len(crops) == 0: continue self.coords.append((pageno, crops)) self.pagesCoords.append(pageCoords) pdfbuilder.processTrimmed(page, pageno, crops, maxScale=maxScale) #self.outputJson) self.scales = pdfbuilder.endProcess() if self.DEBUG > 0: with open(self.picklefile, 'wb') as f: procResult = { 'args': self.args, 'pagesCoords': self.pagesCoords, 'scales': self.scales } pickle.dump(procResult, f) if self.coords == None or len(self.coords) == 0: print "No objects found\r" return with open(self.fname + '.json', 'wb') as f: f.write(json.dumps({'scales': self.scales})) try: params = [ r"java.exe", "-cp", r"pdf2ereader.jar", "jpdftoepub.TrimPdf", "crop", self.fname, self.outfile, self.fname + '.json' ] print ' '.join(params) p = subprocess.Popen(params) p.wait() #sself.DEBUG = 1 if p.returncode == 0 and not (self.DEBUG > 0): os.remove(self.fname + '.json') os.remove(self.fname + ".cropped-streams") print "\nDone" except Exception, e: print e
class Crawler():
    # pre:  urls is either an absolute url in string format
    #       or a list of absolute urls in string format.
    #       maxLinksToCrawl is an integer.
    # post: An instance of Crawler is initiated. When a future crawl is
    #       started, the crawler starts by crawling the given urls.
    #       In total, it will not crawl any more websites than maxLinksToCrawl.
    def __init__(self, urls, maxLinksToCrawl):
        self.linksToVisit = utils.elementToList(urls)
        if maxLinksToCrawl == 0:
            self.maxLinksToCrawl = sys.maxint
        else:
            self.maxLinksToCrawl = maxLinksToCrawl
        self.linksVisited = []      # The links we have visited.
        self.websitesVisited = []   # The websites we have visited.
        self.pathsNotToCrawl = []   # As defined in robots.txt.
        self.parser = PageParser()  # Used to parse the websites we crawl.

    # pre:  The crawler was initialized with the urls to start from and with
    #       maxLinksToCrawl, the maximum number of pages to crawl
    #       (zero meaning "no" limit).
    # post: Crawls the web beginning with the given urls. Writes the
    #       visible text on the webpages to disk in the folder Mapped.
    def crawl(self):
        sys.stdout.write("\n")
        while len(self.linksVisited) < self.maxLinksToCrawl and len(self.linksToVisit) > 0:
            url = self.linksToVisit[0]                 # Fetch the url to parse
            self.linksToVisit = self.linksToVisit[1:]  # Delete it from linksToVisit
            # If we've reached a new website, obey defined robots exclusion rules.
            if urlparse(url).netloc not in self.websitesVisited:
                self.obeyRobotsExclusion(url)
                self.websitesVisited.append(urlparse(url).netloc)
            self.manageLinksAndData(url)
            # Crawler's politeness. Wait 2 seconds before crawling the next link.
            time.sleep(2)

    # pre:  We have reached a new website during the crawl.
    # post: The url's website's robots.txt file is consulted if it exists.
    #       The crawler will take note of which paths are not to be crawled,
    #       as defined in the .txt file. In other words, the .txt file's
    #       content is used for following the robots exclusion standard.
    def obeyRobotsExclusion(self, url):
        sys.stdout.write(" -> Entering a new website. ")
        disallowedPaths = self.parser.getRobotsTXTDisallowedPathsFrom(url)
        if not disallowedPaths:
            sys.stdout.write("The website allows all paths to be crawled.\n")
        else:
            sys.stdout.write("Disallowed paths will be respected.\n")
            self.pathsNotToCrawl += disallowedPaths

    # post: Lets the PageParser retrieve the url's webpage links and data.
    #       Furthermore, filters out links that are not to be crawled and
    #       stores the remaining links for future crawling.
    def manageLinksAndData(self, url):
        sys.stdout.write(" [" + str(len(self.linksVisited)) + "] Crawling: " + url + "\n")
        try:
            self.linksVisited = self.linksVisited + [url]
            self.parser.parse(url)
            self.linksToVisit = self.linksToVisit \
                + utils.filterOutLinks(self.parser.links,
                                       self.pathsNotToCrawl
                                       + self.linksVisited
                                       + self.linksToVisit)
            sys.stdout.write(" -> Success.\n")
        except URLError:
            sys.stdout.write(" -> Failed.\n")
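# A minimal usage sketch for the class above (not part of the original source):
# the module name 'Crawler' used in the import and the seed URL are assumptions
# for illustration only.
from Crawler import Crawler

if __name__ == '__main__':
    # Start from one seed URL and visit at most 10 pages; visible page text
    # ends up in the Mapped folder, as described in the crawl() post-condition.
    crawler = Crawler("http://example.com/", 10)
    crawler.crawl()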
def reduceCrops(self, crops):
    scales = []
    if False:
        zeroHeight = [idx for idx in range(len(crops))
                      if crops[idx][3] - crops[idx][1] == 0]
        reduced = False
        for idx in zeroHeight:
            prev = idx
            next = idx
            while True:
                prev -= 1
                if prev < 0:
                    prev = None
                    break
                if crops[prev] != None:
                    break
            while True:
                next += 1
                if not (next < len(crops)):
                    next = None
                    break
                if crops[next] != None:
                    break
            if prev == None and next == None:
                continue
            reduced = True
            if prev != None and next != None:
                pass  # crops[p]
            elif prev != None:
                crops[prev] = PageParser.mergeBoxes(crops[idx], crops[prev])
                crops[idx] = None
            else:
                crops[next] = PageParser.mergeBoxes(crops[idx], crops[next])
                crops[idx] = None
        if reduced:
            try:
                while True:
                    crops.remove(None)
            except ValueError:
                pass
    for idx in range(len(crops)):
        _, _, _, (scale, _, _) = self.getTransformation(crops[idx], idx == 0)
        scales.append(scale)
    scales = [[idx, idx + 1] for idx in range(len(crops) - 1)
              if scales[idx] == self.maxScale and scales[idx + 1] == self.maxScale]
    cnt = len(scales)
    idx = 0
    reduced = False
    while idx < cnt:
        box = PageParser.mergeBoxes(crops[scales[idx][0]], crops[scales[idx][1]])
        _, _, _, (scale, _, _) = self.getTransformation(box, idx == 0)
        if scale == self.maxScale:
            reduced = True
            crops[scales[idx][0]] = box
            crops[scales[idx][1]] = None
            if [scales[idx][1], scales[idx][1] + 1] in scales:
                cnt -= 1
                scales[idx][1] = scales[idx + 1][1]
                del scales[idx + 1]
            else:
                idx += 1
        else:
            idx += 1
    if reduced:
        try:
            while True:
                crops.remove(None)
        except ValueError:
            pass
from PageParser import PageParser
from CourseraSessionClass import CourseraSession
import os

session = CourseraSession().initSession()
response = session.get("https://class.coursera.org/hetero-004/lecture")  # interactivepython2-009

tet = PageParser()
result = tet.feed(response.text.encode("utf-8"))

if os.path.isdir("Video"):
    print "File Exists!!"
else:
    os.makedirs("Video")

index = 1
for i in result:
    f = session.get(i)
    filename = "Video\\Video%02d.mp4" % index
    with open(filename, 'wb') as code:
        code.write(f.content)
    print "Download Complete. [%02d/%02d]" % (index, len(result))
    index += 1
def __init__(self, product_id):
    PageParser.__init__(self, url=url_ficha + product_id)
    self.product_id = product_id
import sys

from PageParser import PageParser
from PageTopicAnalyzer import PageTopicAnalyzer

if __name__ == '__main__':
    reload(sys)
    sys.setdefaultencoding('UTF8')

    if len(sys.argv) < 2:
        print("URL missing! Please try again.")
    elif len(sys.argv) > 2:
        print("The program takes exactly one argument, but more were received. Please try again.")
    else:
        url = sys.argv[1]

        parser = PageParser(url)
        allText = parser.getAllText()
        # print(allText)
        titleText = parser.getTitle()
        # print(titleText)
        headingText = parser.getHeadings()
        # print(headingText)

        allAnalyzer = PageTopicAnalyzer(allText)
        # print(allAnalyzer.bagOfWords)
        titleAnalyzer = PageTopicAnalyzer(titleText)
        # print(titleAnalyzer.bagOfWords)

        ## Unigram ##
        allAnalyzer.unigram()
        titleAnalyzer.bigram()