Example #1
class check_the_mangas():
    def __init__(self,manga_name, db_conn):
        self.db_conn = db_conn
        self.manga_name = manga_name
        self.manga_oldnumber = sqlite_manager.get_manga_chapter(
            db_conn,
            manga_name)
        self.manga_nownumber = self.manga_oldnumber
        self.manga_olddate = sqlite_manager.get_manga_date(
            db_conn,
            manga_name)
        self.nowdate = self.today_date()
        self.br = URLopener()

    def today_date(self):
        return subprocess.check_output(["date","+%a-%b-%e"]).replace("\n","")

    #return 1 if the connection is working
    def test_connection(self):
        try:
            response = self.br.open(configuration.WEBSITE_TO_CHECK_CONNECTION).read()
            if configuration.KEYWORD in response:
                return 1
            else:
                return 0
        except:
            print "manga connection"
            return 0

    def exec_cmd(self):
        # Fork so the notification command runs in a detached child;
        # without the pid check the command would run in both processes.
        pid = os.fork()
        if pid == 0:
            os.umask(0)
            os.system(configuration.MANGA_NEW_CMD.replace("MANGA",self.manga_name))
            os._exit(0)

    def run(self):
        if self.test_connection():
            last_chapter = False
            response = ""  # stays defined for the except handler below
            try:
                while not last_chapter:
                    to_open = "http://www.mangareader.net/" + self.manga_name + "/" + str( int(self.manga_nownumber)+1 )
                    response = self.br.open( to_open).read()
                    if "is not released yet" in response or "not published yet" in response or response == "":
                        last_chapter = True
                        if self.manga_nownumber != sqlite_manager.get_manga_chapter(self.db_conn, self.manga_name):
                            print self.manga_name+":"+self.manga_nownumber+":"+self.nowdate
                            sqlite_manager.update_manga(self.db_conn,
                                self.manga_name,
                                self.manga_nownumber,
                                self.nowdate)
                    else:
                        self.manga_nownumber = str( int(self.manga_nownumber)+1 )
            except Exception, e:
                # A failed page fetch (e.g. a 404) also marks the end of the
                # list; persist any progress made before the error.
                if "is not released yet. If you liked" in response:
                    if self.manga_nownumber != sqlite_manager.get_manga_chapter(self.db_conn,self.manga_name):
                        print self.manga_name+":"+self.manga_nownumber+":"+self.nowdate
                        sqlite_manager.update_manga(self.db_conn,
                            self.manga_name,
                            self.manga_nownumber,
                            self.nowdate)
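A minimal driver sketch for the class above (the database path and manga name are hypothetical):

import sqlite3
db_conn = sqlite3.connect("manga.db")
checker = check_the_mangas("one-piece", db_conn)
checker.run()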
Example #2
class check_the_mangas():
	def __init__(self,manga_name):
		self.manga_name	     = manga_name
		self.myfile		     = open(configuration.DATA_FILE,'r').read()
		self.manga_oldnumber = self.get_number()
		self.manga_nownumber = self.manga_oldnumber
		self.manga_olddate   = self.get_date  ()
		self.nowdate		 = self.today_date()
		self.br			     = URLopener()

	def get_number(self):
		return re.findall(self.manga_name+':([0-9]+):',self.myfile)[0]

	def get_date(self):
		return re.findall(self.manga_name+":"+str(self.manga_oldnumber)+':(.*)\n',self.myfile)[0]

	def today_date(self):
		return subprocess.check_output(["date","+%a-%b-%e"]).replace("\n","")

	#return 1 if the connection is working
	def test_connection(self):
		try:
			response = self.br.open(configuration.WEBSITE_TO_CHECK_CONNECTION).read()
			if configuration.KEYWORD in response:
				return 1
			else:
				return 0
		except:
			print "manga connection"
			return 0

	def exec_cmd(self):
		# Fork so the notification command runs in a detached child;
		# without the pid check the command would run in both processes.
		pid = os.fork()
		if pid == 0:
			os.umask(0)
			os.system(configuration.MANGA_NEW_CMD.replace("MANGA",self.manga_name))
			os._exit(0)

	def run(self):
		if self.test_connection():
			last_chapter = False
			response = ""  # stays defined for the except handler below
			try:
				while not last_chapter:
					to_open = "http://www.mangareader.net/" + self.manga_name + "/" + str( int(self.manga_nownumber)+1 )
					response = self.br.open( to_open).read()
					if "is not released yet" in response or "not published yet" in response or response == "":
						last_chapter = True
						if self.manga_name + ":" + str(self.manga_nownumber) not in open(configuration.DATA_FILE, "r").read():
							Thread(target=self.exec_cmd).start()
							configuration.backup()
							old_entry = self.manga_name+":"+str(self.manga_oldnumber)+":"+self.manga_olddate
							new_entry = self.manga_name+":"+str(self.manga_nownumber)+":"+self.nowdate
							open(configuration.DATA_FILE,'w').write(
								open(configuration.DATA_FILE+".bak", "r").read().replace(old_entry, new_entry))
					else:
						print "not last chapter"
						self.manga_nownumber = str( int(self.manga_nownumber)+1 )
			except Exception, e:
				print e
				print "manga run"
				# A failed page fetch (e.g. a 404) also marks the end of the
				# list; persist any progress made before the error.
				if "is not released yet. If you liked" in response:
					if self.manga_name + ":" + str(self.manga_nownumber) not in open(configuration.DATA_FILE, "r").read():
						configuration.backup()
						old_entry = self.manga_name+":"+str(self.manga_oldnumber)+":"+self.manga_olddate
						new_entry = self.manga_name+":"+str(self.manga_nownumber)+":"+self.nowdate
						open(configuration.DATA_FILE,'w').write(
							open(configuration.DATA_FILE+".bak", "r").read().replace(old_entry, new_entry))
Example #3
def unshortenurl(short):
    from urllib import URLopener
    opener = URLopener()  # plain URLopener does not follow redirects
    try:
        opener.open(short)
    except IOError, e:
        # On a redirect, URLopener raises IOError('http error', code, msg,
        # headers); the Location header carries the expanded URL.
        if len(e.args) >= 4 and e.args[1] in (301, 302, 303, 307):
            return e.args[3]['Location']
    return short
Example #4
def unshortenurl(short):
    from urllib import URLopener
    opener = URLopener()  # plain URLopener does not follow redirects
    try:
        opener.open(short)
    except IOError, e:
        # On a redirect, URLopener raises IOError('http error', code, msg,
        # headers); the Location header carries the expanded URL.
        if len(e.args) >= 4 and e.args[1] in (301, 302, 303, 307):
            return e.args[3]['Location']
    return short
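URLopener only exists in Python 2's urllib. A rough Python 3 sketch of the same trick, using a urllib.request handler that refuses to follow redirects (the class and function names here are assumptions):

import urllib.request, urllib.error

class _NoRedirect(urllib.request.HTTPRedirectHandler):
    def redirect_request(self, req, fp, code, msg, headers, newurl):
        return None  # returning None makes the opener raise HTTPError instead

def unshortenurl3(short):
    opener = urllib.request.build_opener(_NoRedirect())
    try:
        opener.open(short)
    except urllib.error.HTTPError as e:
        if e.code in (301, 302, 303, 307) and "Location" in e.headers:
            return e.headers["Location"]
    return short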
Example #5
class Updater:
    def __init__(self, server, infoFile):
        """
        takes a server location and an info file as parameters in the constructor
        it will use this server to fetch the new information
        there should be a json/version and json/info.json dir on this server
        """
        self._infoFile = infoFile
        self._serverJSON = server + self._infoFile
        self._serverDate = server + "json/version"
        if sys.version < '3':
            self.br = URLopener()
        else:
            self.br = request

    def hasNewInfo(self):
        """
        hasNewInfo :: Boolean
        compare the local version tag with the one found on the server
        and returns true if the server version is newer
        """
        jsonDate = open(location_manager.VERSION, 'r').read().strip()
        if sys.version < '3':
            servDate = self.br.open(self._serverDate).read().strip()
        else:
            servDate = self.br.urlopen(self._serverDate).read().strip()
        return (int(jsonDate) < int(servDate))

    def generateTimeStamp(self):
        """
        generateTimeStamp :: String
        returns a string that is used to timestamp old config backup files
        """
        return open(location_manager.VERSION, 'r').read().strip()

    def fetchNewInfo(self):
        """
        fetchNewInfo :: Void
        it will download the info file from the server
        use the timestamp to back it up
        and overwrite it
        """
        # Fetching server's info.json
        if sys.version < '3':
            response = self.br.open(self._serverJSON).read()
        else:
            response = self.br.urlopen(self._serverJSON).read().decode("utf-8")
        oldInfo = open(self._infoFile, 'r').read()
        open(self._infoFile + "." + self.generateTimeStamp(),
             'w').write(oldInfo)
        open(self._infoFile, 'w').write(response)
        # Fetching server's version
        if sys.version < '3':
            servDate = int(self.br.open(self._serverDate).read().strip())
        else:
            servDate = int(self.br.urlopen(self._serverDate).read().strip())
        open(location_manager.VERSION, 'w').write(str(servDate))
Example #6
class Updater:
    def __init__(self, server, infoFile):
        """
        takes a server location and an info file as parameters in the constructor
        it will use this server to fetch the new information
        there should be a json/version and json/info.json dir on this server
        """
        self._infoFile = infoFile
        self._serverJSON = server + self._infoFile
        self._serverDate = server + "json/version"
        if sys.version < '3':
            self.br = URLopener()
        else:
            self.br = request

    def hasNewInfo(self):
        """
        hasNewInfo :: Boolean
        compare the local version tag with the one found on the server
        and returns true if the server version is newer
        """
        jsonDate = open(location_manager.VERSION , 'r').read().strip()
        if sys.version < '3':
            servDate = self.br.open(self._serverDate).read().strip()
        else:
            servDate = self.br.urlopen(self._serverDate).read().strip()
        return (int(jsonDate) < int(servDate))

    def generateTimeStamp(self):
        """
        generateTimeStamp :: String
        returns a string that is used to timestamp old config backup files
        """
        return open(location_manager.VERSION, 'r').read().strip()

    def fetchNewInfo(self):
        """
        fetchNewInfo :: Void
        it will download the info file from the server
        use the timestamp to back it up
        and overwrite it
        """
        # Fetching server's info.json
        if sys.version < '3':
            response = self.br.open(self._serverJSON).read()
        else:
            response = self.br.urlopen(self._serverJSON).read().decode("utf-8")
        oldInfo = open(self._infoFile, 'r').read()
        open(self._infoFile + "." + self.generateTimeStamp(), 'w').write(oldInfo)
        open(self._infoFile, 'w').write(response)
        # Fetching server's version
        if sys.version < '3':
            servDate = int(self.br.open(self._serverDate).read().strip())
        else:
            servDate = int(self.br.urlopen(self._serverDate).read().strip())
        open(location_manager.VERSION, 'w').write(str(servDate))
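A minimal usage sketch for the class above; the server root is hypothetical, and location_manager.VERSION must point at a readable local version file:

updater = Updater("http://updates.example.com/", "json/info.json")
if updater.hasNewInfo():
    updater.fetchNewInfo()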
Example #7
def connection():
	try:
		br = URLopener()
		response = br.open(configuration.WEBSITE_TO_CHECK_CONNECTION).read()
		if configuration.KEYWORD in response:
			return 1
		else:
			return 0
	except:
		return 0
Example #8
def utGrabFromUrl(p_url):
    """ Takes a file from a remote server """
    from urllib import URLopener
    try:
        l_opener = URLopener()
        l_file = l_opener.open(p_url)
        ctype = l_file.headers['Content-Type']
        l_data = l_file.read()  # read before closing the opener
        l_opener.close()
        return (l_data, ctype)
    except:
        return (None, 'text/x-unknown-content-type')
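A short usage sketch (URL and filename are hypothetical); on any failure the function returns (None, 'text/x-unknown-content-type'):

data, ctype = utGrabFromUrl("http://example.com/logo.png")
if data is not None:
    open("logo.png", "wb").write(data)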
Example #9
class Updater:
    """
    takes a server location and an info file as parameters in the constructor
    it will use this server to fetch the new information
    there should be a /hash and /info.json dir on this server
    """
    def __init__(self,server,infoFile):
        self._server = server
        self._infoFile = infoFile
        self.br = URLopener()

    """
    hasNewInfo :: Boolean
    compare the local info file hash with the one found on the server
    and returns true if they are different
    """
    def hasNewInfo(self):
        f = open(self._infoFile,'r').read()
        m = md5.new(f).hexdigest()
        response = self.br.open(self._server+'/hash').read()
        response = response.replace("\n","")
        return (m!=response)

    """
    generateTimeStamp :: String
    returns a string that is used to timestamp old config  backup files
    """
    def generateTimeStamp(self):
        return str(time.gmtime().tm_year)+"_"+str(time.gmtime().tm_mday)+"_"+str(time.gmtime().tm_hour)+"_"+str(time.gmtime().tm_min)

    """
    fetchNewInfo :: Void
    it will download the info file from the server
    use the timestamp to back it up
    and overwrite it
    """
    def fetchNewInfo(self):
        response = self.br.open(self._server+'/info.json').read()
        oldInfo = open(self._infoFile,'r').read()
        open(self._infoFile+"."+self.generateTimeStamp(),'w').write(oldInfo)
        open(self._infoFile,'w').write(response)
Example #10
class Updater:
    def __init__(self,server,infoFile):
        self._server = server
        self._infoFile = infoFile
        self.br = URLopener()

    def hasNewInfo(self):
        # True when the md5 of the local info file differs from the
        # server's published /hash value.
        f = open(self._infoFile,'r').read()
        m = md5.new(f).hexdigest()
        response = self.br.open(self._server+'/hash').read()
        response = response.replace("\n","")
        return (m!=response)

    def generateTimeStamp(self):
        # Timestamp backups as year_month_day_hour_min (month included so
        # backups from different months cannot collide).
        t = time.gmtime()
        return "%d_%d_%d_%d_%d" % (t.tm_year, t.tm_mon, t.tm_mday, t.tm_hour, t.tm_min)

    def fetchNewInfo(self):
        # Back up the current info file under a timestamped name,
        # then overwrite it with the server's copy.
        response = self.br.open(self._server+'/info.json').read()
        oldInfo = open(self._infoFile,'r').read()
        open(self._infoFile+"."+self.generateTimeStamp(),'w').write(oldInfo)
        open(self._infoFile,'w').write(response)
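For hasNewInfo() to work, the server's /hash resource must hold the md5 hex digest of the current info.json. A hedged sketch of that server-side step (filenames assumed):

import md5  # Python 2 module, as in the examples above

info = open("info.json", "r").read()
open("hash", "w").write(md5.new(info).hexdigest() + "\n")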
Example #11
    def call_remote(self, category, params):
        '''
        The meetup api is set up such that the root url does not
        change much other than the 'name' of the thing you call into.

        In other words, I can just use category to sprintf my way to a
        valid url, then tack on the rest of the query string specified
        in params.
        '''
        url = self.root_url
        url = url % (category)
        # Every call has to include key
        url = url + "?" + params + "&key=" + self.key
        client = URLopener()
        request = client.open(url)
        raw_str = request.read()
        results = json.loads(raw_str)
        # Let the caller interpret the results of the call. Both the
        # meta info and the results are passed back
        return results
Example #12
    def call_remote(self,category,params):
        '''
        The meetup api is set up such that the root url does not
        change much other than the 'name' of the thing you call into.

        In other words, I can just use category to sprintf my way to a
        valid url, then tack on the rest of the query string specified
        in params.
        '''
        url = self.root_url
        url = url % (category)
        # Every call has to include key
        url = url + "?" + params + "&key=" + self.key
        client = URLopener()
        request = client.open(url)
        raw_str = request.read()
        results = json.loads(raw_str)
        # Let the caller interpret the results of the call. Both the
        # meta info and the results are passed back
        return results
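A hypothetical call, assuming the object was built with root_url = "https://api.meetup.com/%s", a valid key, and an already URL-encoded params string; per the comment above, the returned dict carries both the meta info and the results:

results = api.call_remote("events", "group_urlname=some-group&page=20")
print results["meta"], len(results["results"])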
Example #13
memoHeadings = {}

posCount = 0
for elem in courtElems:
    if not elem.text: continue
    country = normalStr(elem.text)
    if country == "Congo RDC": continue
    strIO = cStringIO.StringIO()
    urlStub = elem.attrib['href']
    if urlStub == '/wlg/courts/nofr/usstates/lxctusa.htm': continue
    if urlStub == '/wlg/courts/nofr/oeur/lxctjap.htm': continue

    print country

    countryHtml = urlh.open(siteRoot + urlStub).read()
    options = dict(output_xhtml=1, add_xml_decl=1, indent=1, tidy_mark=0)
    countryHtml = tidy.parseString(countryHtml,**options)
    countryHtml.write(strIO)
    strIO.seek(0)
    countryHtml = strIO.read()
    strIO.close()

    countryHtml = re.sub('xmlns="[^"]+"',"",countryHtml)
    countryDoc = etree.fromstring(countryHtml)
    courtHeadingElems = countryDoc.xpath("//font[@color='#009944']")
    for e in courtHeadingElems:
        heading = normalStr(e.text)
        #if not allHeadings.has_key(heading): 
        #    posCount += 1
        #    continue
Example #14
class mangareader_downloader(object):
	def __init__(self,manga_name,chapter,end_chapter,manga_location,dl_manager):
		self.manga_location = manga_location
		self.manga_name	    = manga_name
		self.chapter		= chapter
		self.end_chapter	= end_chapter
		self.flag		    = False
		self.current_image  = "000"
		self.img			= ""
		self.next_link	    = ""
		self.current_page   = "http://www.mangareader.net/"+self.manga_name+"/"+self.chapter+"/"
		self.next_regex	    = "<span class=\"next\"><a href=\"([^\"]*)\">Next</a></span>"

		self.nb_of_pages    = 0
		self.page_counter   = 2

		self.br             = URLopener()
		self.response       = ""
		self.response_lines = ""
		self.dl_manager	    = dl_manager
		# Image hosts iN.mangareader.net pre-resolved to IPs to skip DNS.
		self.resolved       = {
			'http://i0':'188.132.173.122',
			'http://i1':'188.132.173.3',
			'http://i2':'188.132.173.6',
			'http://i3':'188.132.173.9',
			'http://i4':'188.132.173.12',
			'http://i5':'188.132.173.15',
			'http://i6':'188.132.173.18',
			'http://i7':'188.132.173.21',
			'http://i8':'188.132.173.24',
			'http://i9':'188.132.173.27',
			'http://i10':'188.132.173.30',
			'http://i11':'188.132.173.33',
			'http://i12':'188.132.173.36',
			'http://i13':'188.132.173.39',
			'http://i14':'188.132.173.42',
			'http://i15':'188.132.173.45',
			'http://i16':'188.132.173.48',
			'http://i17':'188.132.173.51',
			'http://i18':'188.132.173.54',
			'http://i19':'188.132.173.57',
			'http://i20':'188.132.173.60',
			'http://i21':'188.132.173.63',
			'http://i22':'188.132.173.66',
			'http://i23':'188.132.173.69',
			'http://i24':'188.132.173.72',
			'http://i25':'188.132.173.75',
			'http://i26':'188.132.173.78',
			'http://i27':'188.132.173.81',
			'http://i28':'188.132.173.84',
			'http://i29':'188.132.173.87',
			'http://i30':'188.132.173.90',
			'http://i31':'188.132.173.93',
			'http://i32':'188.132.173.96',
			'http://i33':'188.132.173.99',
			'http://i34':'188.132.173.126',
			'http://i35':'188.132.173.129',
			'http://i36':'188.132.173.132',
			'http://i37':'188.132.173.135',
			'http://i38':'188.132.173.138',
			'http://i39':'188.132.173.141',
			'http://i40':'188.132.173.144',
			'http://i41':'188.132.173.200',
			'http://i1000':'188.132.173.200',
			'http://i999':'188.132.173.12',
			'http://i998':'188.132.173.48',
			'http://i997':'188.132.173.72',
			'http://i996':'188.132.173.96',
			'http://i995':'188.132.173.144',
			'http://i994':'188.132.173.200'
		}

	def increase_current(self):
		self.current_image = str(int(self.current_image)+1)
		if len(self.current_image) == 1:
			self.current_image = "00"+self.current_image
		elif len(self.current_image) == 2:
			self.current_image = "0"+self.current_image
		self.page_counter+=1

	def increase_chapter(self):
		self.nb_of_pages   = 0
		self.page_counter  = 1
		self.chapter       = str(int(self.chapter)+1)
		self.current_image = "000"
		self.next_link     = "http://www.mangareader.net/"+self.manga_name+"/"+self.chapter+"/"+str(self.page_counter)
		self.page_counter +=1

	def check_chapter_end(self):
		if self.page_counter-1 == self.nb_of_pages:
			return True
		else :
			return False

	def not_published(self):
		if "is not published yet. Once" in self.response or self.chapter == str(int(self.end_chapter)+1):
			return True
		return False

	def go_to_next_page(self):
		if not self.check_chapter_end():
			self.increase_current()
		else:
			self.increase_chapter()
		self.current_page = self.next_link

	def scrap_page(self):
		if self.nb_of_pages == 0:
			for a in self.response_lines:
				if "</select> of " in a:
					self.nb_of_pages = int(re.findall("</select> of (\d+)",a)[0])
					break
		for a in self.response_lines:
			if '"><img id=\"img\"' in a:
				self.img	   = re.findall("src=\"([^\"]*)\" alt",a)[0]
				break
		self.next_link = "http://www.mangareader.net/"+self.manga_name+"/"+self.chapter+"/"+str(self.page_counter)

	def manage_chapters(self):
		if not os.path.exists(self.manga_location):
			os.mkdir(self.manga_location)
		os.chdir(self.manga_location)
		if not os.path.exists(self.manga_name):
			os.mkdir(self.manga_name)
		os.chdir(self.manga_name)
		if not os.path.exists(self.manga_name+"-"+self.chapter):
			os.mkdir(self.manga_name+"-"+self.chapter)
		os.chdir(self.manga_name+"-"+self.chapter)

	def download_image(self):
		self.manage_chapters()
		caching = self.img.split('.')[0]
		if caching in self.resolved:
			self.img = self.img.replace(
				caching+".mangareader.net",
				"http://"+self.resolved[caching])
		if self.dl_manager == 'default':
			urlretrieve(self.img, self.current_image+'.jpg' )
		else:
			status = 1
			while int(status) != 0:
				status = os.system(self.dl_manager +" "+self.img+ " -o "+self.current_image+".jpg")
		print "[*] Image saved to "+ os.getcwd() + "/"+self.current_image+".jpg"

	def start_downloading(self):
		try:
			self.response       = self.br.open(self.current_page).read()
			self.response_lines = self.response.split("\n")
			if not self.not_published():
				self.scrap_page()
				self.manage_chapters()
				self.download_image()
				self.go_to_next_page()
			else :
				self.flag = True
		except Exception,e:
			print e
			# Transient network error: wait, then retry the same page.
			time.sleep(2)
			self.start_downloading()
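A hypothetical driver for the downloader above (all arguments are made up); start_downloading() sets self.flag once not_published() reports the end of the requested range:

dl = mangareader_downloader("one-piece", "1", "3", "/tmp/manga", "default")
while not dl.flag:
	dl.start_downloading()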
Example #15
  We need to liaise with Shima-san about contributing to the
  data set.
'''

import os,sys,re
from urllib import URLopener
from lxml import etree
import json



obj = {}

urlh = URLopener()

html = urlh.open("http://en.wikipedia.org/wiki/List_of_supreme_courts_by_country").read()

doc = etree.fromstring(html)

entries = doc.xpath("//table[@class='wikitable']//tr")

stops = ["the","of","a"]

def makeID(court):
    courtID = court.lower()
    courtID = re.split("\s+",courtID)
    for i in range(len(courtID)-1,-1,-1):
        word = courtID[i]
        if word in stops:
            courtID = courtID[0:i] + courtID[i+1:]
    return ".".join(courtID)
Example #16
  word.

  We need to liaise with Shima-san about contributing to the
  data set.
'''

import os, sys, re
from urllib import URLopener
from lxml import etree
import json

obj = {}

urlh = URLopener()

html = urlh.open(
    "http://en.wikipedia.org/wiki/List_of_supreme_courts_by_country").read()

doc = etree.fromstring(html)

entries = doc.xpath("//table[@class='wikitable']//tr")

stops = ["the", "of", "a"]


def makeID(court):
    courtID = court.lower()
    courtID = re.split("\s+", courtID)
    for i in range(len(courtID) - 1, -1, -1):
        word = courtID[i]
        if word in stops:
            courtID = courtID[0:i] + courtID[i + 1:]
    return ".".join(courtID)
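For instance, with the stop words above, makeID would behave like this sketch:

print makeID("Supreme Court of the United States")
# -> supreme.court.united.states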
Example #17
class mangareader_downloader(object):
    def __init__(self, manga_name, chapter, end_chapter, manga_location,
                 dl_manager):
        self.manga_location = manga_location
        self.manga_name = manga_name
        self.chapter = chapter
        self.end_chapter = end_chapter
        self.flag = False
        self.current_image = "000"
        self.img = ""
        self.next_link = ""
        self.current_page = "http://www.mangareader.net/" + self.manga_name + "/" + self.chapter + "/"
        self.next_regex = "<span class=\"next\"><a href=\"([^\"]*)\">Next</a></span>"

        self.nb_of_pages = 0
        self.page_counter = 2

        self.br = URLopener()
        self.response = ""
        self.response_lines = ""
        self.dl_manager = dl_manager
        # Image hosts iN.mangareader.net pre-resolved to IPs to skip DNS.
        self.resolved = {
            'http://i0': '188.132.173.122',
            'http://i1': '188.132.173.3',
            'http://i2': '188.132.173.6',
            'http://i3': '188.132.173.9',
            'http://i4': '188.132.173.12',
            'http://i5': '188.132.173.15',
            'http://i6': '188.132.173.18',
            'http://i7': '188.132.173.21',
            'http://i8': '188.132.173.24',
            'http://i9': '188.132.173.27',
            'http://i10': '188.132.173.30',
            'http://i11': '188.132.173.33',
            'http://i12': '188.132.173.36',
            'http://i13': '188.132.173.39',
            'http://i14': '188.132.173.42',
            'http://i15': '188.132.173.45',
            'http://i16': '188.132.173.48',
            'http://i17': '188.132.173.51',
            'http://i18': '188.132.173.54',
            'http://i19': '188.132.173.57',
            'http://i20': '188.132.173.60',
            'http://i21': '188.132.173.63',
            'http://i22': '188.132.173.66',
            'http://i23': '188.132.173.69',
            'http://i24': '188.132.173.72',
            'http://i25': '188.132.173.75',
            'http://i26': '188.132.173.78',
            'http://i27': '188.132.173.81',
            'http://i28': '188.132.173.84',
            'http://i29': '188.132.173.87',
            'http://i30': '188.132.173.90',
            'http://i31': '188.132.173.93',
            'http://i32': '188.132.173.96',
            'http://i33': '188.132.173.99',
            'http://i34': '188.132.173.126',
            'http://i35': '188.132.173.129',
            'http://i36': '188.132.173.132',
            'http://i37': '188.132.173.135',
            'http://i38': '188.132.173.138',
            'http://i39': '188.132.173.141',
            'http://i40': '188.132.173.144',
            'http://i41': '188.132.173.200',
            'http://i1000': '188.132.173.200',
            'http://i999': '188.132.173.12',
            'http://i998': '188.132.173.48',
            'http://i997': '188.132.173.72',
            'http://i996': '188.132.173.96',
            'http://i995': '188.132.173.144',
            'http://i994': '188.132.173.200'
        }

    def increase_current(self):
        self.current_image = str(int(self.current_image) + 1)
        if len(self.current_image) == 1:
            self.current_image = "00" + self.current_image
        elif len(self.current_image) == 2:
            self.current_image = "0" + self.current_image
        self.page_counter += 1

    def increase_chapter(self):
        self.nb_of_pages = 0
        self.page_counter = 1
        self.chapter = str(int(self.chapter) + 1)
        self.current_image = "000"
        self.next_link = "http://www.mangareader.net/" + self.manga_name + "/" + self.chapter + "/" + str(
            self.page_counter)
        self.page_counter += 1

    def check_chapter_end(self):
        if self.page_counter - 1 == self.nb_of_pages:
            return True
        else:
            return False

    def not_published(self):
        if "is not published yet. Once" in self.response or self.chapter == str(
                int(self.end_chapter) + 1):
            return True
        return False

    def go_to_next_page(self):
        if not self.check_chapter_end():
            self.increase_current()
        else:
            self.increase_chapter()
        self.current_page = self.next_link

    def scrap_page(self):
        if self.nb_of_pages == 0:
            for a in self.response_lines:
                if "</select> of " in a:
                    self.nb_of_pages = int(
                        re.findall("</select> of (\d+)", a)[0])
                    break
        for a in self.response_lines:
            if '"><img id=\"img\"' in a:
                self.img = re.findall("src=\"([^\"]*)\" alt", a)[0]
                break
        self.next_link = "http://www.mangareader.net/" + self.manga_name + "/" + self.chapter + "/" + str(
            self.page_counter)

    def manage_chapters(self):
        if not os.path.exists(self.manga_location):
            os.mkdir(self.manga_location)
        os.chdir(self.manga_location)
        if not os.path.exists(self.manga_name):
            os.mkdir(self.manga_name)
        os.chdir(self.manga_name)
        if not os.path.exists(self.manga_name + "-" + self.chapter):
            os.mkdir(self.manga_name + "-" + self.chapter)
        os.chdir(self.manga_name + "-" + self.chapter)

    def download_image(self):
        self.manage_chapters()
        caching = self.img.split('.')[0]
        if caching in self.resolved:
            self.img = self.img.replace(caching + ".mangareader.net",
                                        "http://" + self.resolved[caching])
        if self.dl_manager == 'default':
            urlretrieve(self.img, self.current_image + '.jpg')
        else:
            status = 1
            while int(status) != 0:
                status = os.system(self.dl_manager + " " + self.img + " -o " +
                                   self.current_image + ".jpg")
        print "[*] Image saved to " + os.getcwd(
        ) + "/" + self.current_image + ".jpg"

    def start_downloading(self):
        try:
            self.response = self.br.open(self.current_page).read()
            self.response_lines = self.response.split("\n")
            if not self.not_published():
                self.scrap_page()
                self.manage_chapters()
                self.download_image()
                self.go_to_next_page()
            else:
                self.flag = True
        except Exception, e:
            print e
            # Transient network error: wait, then retry the same page.
            time.sleep(2)
            self.start_downloading()
Example #18
#!/usr/bin/env python

from re import sub
from BeautifulSoup import BeautifulSoup
from urllib import URLopener

opener = URLopener()
html = opener.open('http://www.dailyzen.com/').read()

# Slice out the block that holds today's quote.
html = html[html.index('<!--Add Quote for correct day-->'):]
html1 = html[:html.index('<br>')]

# Slice out the author line that follows the quote.
html2 = html[html.index('<A class="artist">'):]
html2 = html2[:html2.index('</a></i>')]
html2 = sub('<A class="artist">','',html2).strip()

zen = BeautifulSoup(html1)
zen = zen.prettify().strip()

for x in ['<!--Add Quote for correct day-->','<br />','<p>','</p>','^\n','\n$']:
    zen = sub(x,'',zen).strip()

zen = sub('\n \n \n','\n \n',zen).strip()

print
print zen
print
print '\t\t',html2
Example #19
memoHeadings = {}

posCount = 0
for elem in courtElems:
    if not elem.text: continue
    country = normalStr(elem.text)
    if country == "Congo RDC": continue
    strIO = cStringIO.StringIO()
    urlStub = elem.attrib['href']
    if urlStub == '/wlg/courts/nofr/usstates/lxctusa.htm': continue
    if urlStub == '/wlg/courts/nofr/oeur/lxctjap.htm': continue

    print country

    countryHtml = urlh.open(siteRoot + urlStub).read()
    options = dict(output_xhtml=1, add_xml_decl=1, indent=1, tidy_mark=0)
    countryHtml = tidy.parseString(countryHtml, **options)
    countryHtml.write(strIO)
    strIO.seek(0)
    countryHtml = strIO.read()
    strIO.close()

    countryHtml = re.sub('xmlns="[^"]+"', "", countryHtml)
    countryDoc = etree.fromstring(countryHtml)
    courtHeadingElems = countryDoc.xpath("//font[@color='#009944']")
    for e in courtHeadingElems:
        heading = normalStr(e.text)
        #if not allHeadings.has_key(heading):
        #    posCount += 1
        #    continue
Example #20
def test_ping_play1():
    from urllib import URLopener
    u = URLopener()
    text = "<title>pypy.js various demos</title>"
    assert text in u.open("http://play1.pypy.org/").read()
Example #21
def test_ping_play1():
    from urllib import URLopener
    u = URLopener()
    text = "<title>pypy.js various demos</title>"
    assert text in u.open("http://play1.pypy.org/").read()
Example #22
    def open(self, *args):
        f = URLopener.open(self, *args)
        return XML(f)
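For context, a minimal sketch of an enclosing subclass; the class name is an assumption, and XML() stands in for whatever parser the original project used (ElementTree's XML(), used here, needs the response text rather than the file object):

from urllib import URLopener
from xml.etree.ElementTree import XML

class XMLOpener(URLopener):
    """URLopener whose open() returns a parsed XML tree instead of a file."""
    def open(self, *args):
        f = URLopener.open(self, *args)
        return XML(f.read())  # ElementTree's XML() parses a string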