def read_file_content(self, file_url=None):
        """Return name of temp file in which remote file is saved."""
        if not file_url:
            file_url = self.url
            pywikibot.warning("file_url is not given. "
                              "Set to self.url by default.")
        pywikibot.output(u'Reading file %s' % file_url)
        resume = False
        rlen = 0
        _contents = None
        dt = 15
        uo = URLopener()
        retrieved = False

        while not retrieved:
            if resume:
                pywikibot.output(u"Resume download...")
                uo.addheader('Range', 'bytes=%s-' % rlen)

            infile = uo.open(file_url)

            if 'text/html' in infile.info().getheader('Content-Type'):
                pywikibot.output(u"Couldn't download the image: "
                                 "the requested URL was not found on server.")
                return

            content_len = infile.info().getheader('Content-Length')
            accept_ranges = infile.info().getheader('Accept-Ranges') == 'bytes'

            if resume:
                _contents += infile.read()
            else:
                _contents = infile.read()

            infile.close()
            retrieved = True

            if content_len:
                rlen = len(_contents)
                content_len = int(content_len)
                if rlen < content_len:
                    retrieved = False
                    pywikibot.output(
                        u"Connection closed at byte %s (%s left)" %
                        (rlen, content_len))
                    if accept_ranges and rlen > 0:
                        resume = True
                    pywikibot.output(u"Sleeping for %d seconds..." % dt)
                    time.sleep(dt)
                    if dt <= 60:
                        dt += 15
                    elif dt < 360:
                        dt += 60
            else:
                pywikibot.log(
                    u"WARNING: length check of retrieved data not possible.")
        handle, tempname = tempfile.mkstemp()
        with os.fdopen(handle, "wb") as t:
            t.write(_contents)
        return tempname
Example #2
from sys import stderr
from urllib.request import URLopener


class RemoteFile(object):
    def __init__(self, url):
        self.opener = URLopener()
        self.url = url
        self.filename = url.rstrip('/').rsplit('/', 1)[-1]
        self.offset = 0

    def seek(self, offset, whence=0):
        assert whence == 0
        self.offset = offset

    def read(self, size):
        start = self.offset
        end = start + size - 1
        assert end > start
        h = 'Range', 'bytes={}-{}'.format(start, end)
        stderr.write('Fetching {} {}\n'.format(self.filename, h[1]))
        # Drop any Range header left over from a previous read, so that
        # only one Range header is sent with this request.
        self.opener.addheaders = [x for x in self.opener.addheaders
                                  if x[0] != 'Range']
        self.opener.addheaders.append(h)
        data = self.opener.open(self.url).read()
        return data

    def close(self):
        # __exit__ relied on a close() method that was missing here;
        # URLopener.close() tears down any cached connections.
        self.opener.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
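
# A hypothetical usage sketch (not part of the original snippet): fetch an
# arbitrary byte range of a remote file, assuming the server honors Range
# requests; the URL is a placeholder.
with RemoteFile('http://example.com/data.bin') as f:
    f.seek(1024)
    chunk = f.read(4096)  # sends 'Range: bytes=1024-5119'
    print(len(chunk))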
Example #3
def get_imagelinks(url):
    """Given a URL, get all images linked to by the page at that URL."""
    # At module level the original script binds BeautifulSoup to the
    # ImportError instance when the import fails; re-raise it in that case.
    if isinstance(BeautifulSoup, ImportError):
        raise BeautifulSoup

    links = []
    uo = URLopener()
    with uo.open(url) as f:
        soup = BeautifulSoup(f.read())

    # `shown` and `fileformats` are module-level globals in the original
    # script: `shown` selects which tags to follow, `fileformats` is the
    # set of image extensions to keep.
    if not shown:
        tagname = "a"
    elif shown == "just":
        tagname = "img"
    else:
        tagname = ["a", "img"]

    for tag in soup.findAll(tagname):
        link = tag.get("src", tag.get("href", None))
        if link:
            ext = os.path.splitext(link)[1].lower().strip('.')
            if ext in fileformats:
                links.append(urllib.basejoin(url, link))
    return links
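
# A hypothetical usage sketch (not part of the original snippet): the
# module-level globals the function relies on are given example values here.
shown = False                                # follow <a> links only
fileformats = ('jpg', 'jpeg', 'png', 'gif')
for image_url in get_imagelinks('http://example.com/gallery.html'):
    print(image_url)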
Example #6
import json
import time
from urllib.request import URLopener


def call_api(url, wait=1):
    # TOKEN is a module-level API token defined elsewhere in the script.
    time.sleep(wait)
    req = URLopener()
    req.addheader('Authorization', 'token ' + TOKEN)
    fp = req.open(url)
    data = json.load(fp)
    fp.close()
    return data
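
# A hypothetical usage sketch (not part of the original snippet): the URL
# assumes a GitHub-style JSON API, since the snippet sends a 'token'
# Authorization header; TOKEN must hold a real API token.
TOKEN = 'ghp_...'  # placeholder, not a real token
data = call_api('https://api.github.com/repos/python/cpython', wait=1)
print(data['full_name'])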
Example #7
class HttpFetcherBasic(HttpFetcher):
    def __init__(self, url):
        super().__init__(url)
        self.urlop = URLopener()
        # The URL fragment encodes extra request headers as '&'-separated
        # 'name=value' pairs; a bare name becomes a header with an empty
        # value.
        for hdr, val in (tuple(x.split("=", 1)) if "=" in x else (x, "")
                         for x in url.fragment.split("&") if x):
            self.urlop.addheader(hdr, val)

    def open(self, url):
        return self.urlop.open(url)
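
# A standalone sketch (not part of the original snippet) of the
# fragment-parsing idiom above; the fragment string is a made-up example.
frag = 'User-Agent=my-bot&X-Debug'
headers = [tuple(x.split('=', 1)) if '=' in x else (x, '')
           for x in frag.split('&') if x]
print(headers)  # [('User-Agent', 'my-bot'), ('X-Debug', '')]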
Example #10
def get_imagelinks(url):
    """Given a URL, get all images linked to by the page at that URL."""
    links = []
    uo = URLopener()
    file = uo.open(url)
    soup = BeautifulSoup.BeautifulSoup(file.read())
    file.close()
    if not shown:
        tagname = "a"
    elif shown == "just":
        tagname = "img"
    else:
        tagname = ["a", "img"]

    for tag in soup.findAll(tagname):
        link = tag.get("src", tag.get("href", None))
        if link:
            ext = os.path.splitext(link)[1].lower().strip('.')
            if ext in fileformats:
                links.append(urllib.basejoin(url, link))
    return links
def test6(url = "http://example.com"):
    od = URLopener()
    # ruleid: insecure-urlopener-open
    od.open(url)
Example #12
    def read_file_content(self, file_url=None):
        """Return name of temp file in which remote file is saved."""
        if not file_url:
            file_url = self.url
            pywikibot.warning('file_url is not given. '
                              'Set to self.url by default.')
        pywikibot.output('Reading file {}'.format(file_url))
        resume = False
        rlen = 0
        _contents = None
        dt = 15
        uo = URLopener()
        retrieved = False

        while not retrieved:
            if resume:
                pywikibot.output('Resume download...')
                uo.addheader('Range', 'bytes={}-'.format(rlen))

            with closing(uo.open(file_url)) as infile:
                info = infile.info()

                info_get = info.get
                content_type = info_get('Content-Type')
                content_len = info_get('Content-Length')
                accept_ranges = info_get('Accept-Ranges')

                if 'text/html' in content_type:
                    pywikibot.output(
                        "Couldn't download the image: "
                        'the requested URL was not found on server.')
                    return

                valid_ranges = accept_ranges == 'bytes'

                if resume:
                    _contents += infile.read()
                else:
                    _contents = infile.read()

            retrieved = True
            if content_len:
                rlen = len(_contents)
                content_len = int(content_len)
                if rlen < content_len:
                    retrieved = False
                    pywikibot.output(
                        'Connection closed at byte {} ({} left)'.format(
                            rlen, content_len))
                    if valid_ranges and rlen > 0:
                        resume = True
                    pywikibot.output('Sleeping for {} seconds...'.format(dt))
                    pywikibot.sleep(dt)
                    if dt <= 60:
                        dt += 15
                    elif dt < 360:
                        dt += 60
            else:
                pywikibot.log(
                    'WARNING: length check of retrieved data not possible.')
        handle, tempname = tempfile.mkstemp()
        with os.fdopen(handle, 'wb') as t:
            t.write(_contents)
        return tempname
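
# A standalone sketch (not part of the original snippet) of the retry
# backoff schedule used above: the delay grows by 15 s up to 60 s, then
# by 60 s while it is still below 360 s, after which it stays flat.
dt = 15
delays = []
for _ in range(10):
    delays.append(dt)
    if dt <= 60:
        dt += 15
    elif dt < 360:
        dt += 60
print(delays)  # [15, 30, 45, 60, 75, 135, 195, 255, 315, 375]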
Example #13
        os.mkdir(vol_dir)

    url_1 = "ftp://s220ftp.tipo.gov.tw/PatentPubXML_" + str(num) + "/"
    files = get_ftp_ls(url_1)
    files = [f.decode() for f in files]
    dirs = [d for d in files if "." not in d]
    print("num of dirs in " + str(num) + " folder: " + str(len(dirs)))

    for d in dirs[:]:
        url_2 = url_1 + d + "/"
        files = get_ftp_ls(url_2)
        filename = [f.decode() for f in files if f.lower().endswith(b'.xml')]
        if len(filename) < 1:
            print("fail url:", url_2)
        else:
            filename = filename[0]

            if filename not in os.listdir(vol_dir):
                url_3 = url_2 + filename
                store_path = os.path.join(vol_dir, filename)
                opener = URLopener()
                with opener.open(url_3) as remote_file, open(
                        store_path, 'wb') as local_file:
                    shutil.copyfileobj(remote_file, local_file)

            document_count += 1
            if document_count % 1000 == 0:
                print("num of docs downloaded:", document_count)

    start_num += 1
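
# `get_ftp_ls` is not shown in this fragment. A minimal sketch of what it
# might do, using ftplib; this is an assumption, not the original helper.
# It returns bytes entries, since the caller decodes each one.
from ftplib import FTP
from urllib.parse import urlsplit

def get_ftp_ls(url):
    parts = urlsplit(url)
    ftp = FTP(parts.hostname)
    ftp.login()  # anonymous login
    try:
        return [name.encode() for name in ftp.nlst(parts.path)]
    finally:
        ftp.quit()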
def test2():
    od = URLopener()
    # ruleid: insecure-urlopener-open-ftp
    url = "ftp://example.com"
    od.open(url)
def test6_ok(url = "https://example.com"):
    od = URLopener()
    # ok: insecure-urlopener-open
    od.open(url)
Example #16
# Parse the page content with BeautifulSoup for easy manipulation;
# `result` is the HTTP response object fetched earlier in the script.
soup = BeautifulSoup(result.content, "html.parser")
print('Page loaded successfully.')

# Find all the images with a width of '160'; this selects exactly the
# needed images, because they are the only ones with this size.
imgs = soup.findAll('img', width='160')
names = []  # Used to store the path of images on the page.

# Loop over the iterable result of the search to get the path from the 'src'
# attribute.
for x in imgs:
    names.append(x['src'])

# Loop over the paths of the needed images: read each one, turn it into a
# PIL image object, and save it under a proper name. `opener`, MAIN_PATH
# and SAVE_FOLDER are defined earlier in the script.
for x in names:
    img = opener.open(MAIN_PATH + x)
    img = img.read()
    img = Image.open(BytesIO(img))
    IMG_NAME = '_'.join((x[15:25], x[78:80], '00')) + '.gif'
    img.save(os.path.join(SAVE_FOLDER, IMG_NAME))
    img.close()  # close each image once it has been saved
print('Images saved correctly in ' + SAVE_FOLDER)
new_imgs = os.listdir(SAVE_FOLDER)
for img in new_imgs:
    print("---> " + img)

# Just to not directly close the prompt.
input('Press ENTER to exit')
def test1():
    od = URLopener()
    # ruleid: insecure-urlopener-open-ftp
    od.open("ftp://example.com")
def test2_ok():
    od = URLopener()
    # ok: insecure-urlopener-open-ftp
    url = "ftps://example.com"
    od.open(url)
def test1_ok():
    od = URLopener()
    # ok: insecure-urlopener-open
    od.open("https://example.com")
def getImage(url):
    MAX_TITLE_DESC = 100
    MAX_TITLE = 255

    uo = URLopener()
    file = uo.open(url)
    soup = BeautifulSoup.BeautifulSoup(file.read())
    file.close()
    outImage = Image()

    imgTag = soup.find("img", { "class" : "imageWithCaption" })
    link = imgTag.get("src", imgTag.get("href", None))
    if link:
        outImage.url = urllib.basejoin(url, link)
        caption = soup.find("div", { "id" : "caption" })
        captionTxt = caption.string
        # Example caption (Estonian): 'Kuressaare linnus, vaade põhjast
        # (SM F 3761:473 F); Saaremaa Muuseum; Faili nimi:smf_3761_473.jpg',
        # i.e. 'Kuressaare castle, view from the north (...); Saaremaa
        # Museum; File name: smf_3761_473.jpg'.
        (capPart1, museumName, capPart3) = captionTxt.split(';')
        museumName = museumName.strip()
        matchItemRef = re.search(r"^(.+)\((.+?)\)$", capPart1)
        if matchItemRef and matchItemRef.group(2):
            outImage.source = u'[%s %s, %s]' % (url, museumName, matchItemRef.group(2))
            outImage.source = outImage.source.strip()

        mainTable = soup.find("table", {"class" : "data highlighted"})
        outDesc = u"<table>\n"
        outDesc += getWikiTable(mainTable, outImage)
        
        mainTable = soup.find("table", {"class" : "data"})
        outDesc += getWikiTable(mainTable, outImage)

        mainTable = soup.find("table", {"class" : "data full_length"})
        outDesc += getWikiTable(mainTable, outImage)
        outDesc += u"</table>\n"

        if not matchItemRef:
            # Caption did not match the expected pattern.
            return None
        titleStart = matchItemRef.group(1).strip()
        if ( len(titleStart) > MAX_TITLE_DESC ):
            #shorten title beginning
            titleStart = titleStart[:MAX_TITLE_DESC]
        outImage.name = titleStart + u', ' + outImage.accession_number + u'.jpg'
        outImage.name = cleanUpTitle( outImage.name )
        if ( len(outImage.name) > MAX_TITLE ):
            #shorten title
            outImage.name = outImage.name[:MAX_TITLE]
        
        outImage.description = '{{et|1=' + outDesc + '}}'
        outImage.license = '{{PD-old}}'
        
        ##add categories
        museumName = museumName.encode('utf_8')
        if museumData.get(museumName) and museumData.get(museumName).get('enName'):
            museumEnName = museumData.get(museumName).get('enName')
            outImage.institution = u'{{Institution:' + museumEnName + u'}}'
            museumCat = u'Images from the ' + museumEnName
            outImage.categories.append( museumCat )
        else:
            print "Museum enName not found for %s ! \n" % url
            return None

            
    return outImage