Example no. 1
0
    def fetch_url(self, url):
        filename = url.split('/')[-1]
        utilities.download_file(url, self.processingDir)

        xmldoc = minidom.parse(os.path.join(self.processingDir, filename))

        MainPubDate = xmldoc.getElementsByTagName('pubDate')[0].firstChild.data
        epochPubDate = datetime.datetime.strptime(MainPubDate, "%a, %d %b %Y %H:%M:%S +0200").strftime('%s')
        print "main date " + MainPubDate

        # if (epochPubDate <= self.lastFetchDate):
        #     return 0

        itemlist = xmldoc.getElementsByTagName('item')

        for elt in itemlist :
            # TODO : Test object first
            title = elt.getElementsByTagName('title')[0].firstChild.data
            link = elt.getElementsByTagName('link')[0].firstChild.data
            pubDate = elt.getElementsByTagName('pubDate')[0].firstChild.data
#            print "link " + link

            if (epochPubDate <= self.lastFetchDate):
                break

            if (not os.path.isfile(os.path.join(self.processingDir, link.split('/')[-1]))):
                print "Downloading %s" % (link)
                utilities.download_file(link, self.processingDir)
Example no. 2
0
    def fetch_url(self, url):
        filename = url.split('/')[-1]
        utilities.download_file(url, self.processingDir)

        xmldoc = minidom.parse(os.path.join(self.processingDir, filename))

        MainPubDate = xmldoc.getElementsByTagName('pubDate')[0].firstChild.data
        epochPubDate = datetime.datetime.strptime(
            MainPubDate, "%a, %d %b %Y %H:%M:%S +0200").strftime('%s')
        print "main date " + MainPubDate

        # if (epochPubDate <= self.lastFetchDate):
        #     return 0

        itemlist = xmldoc.getElementsByTagName('item')

        for elt in itemlist:
            # TODO : Test object first
            title = elt.getElementsByTagName('title')[0].firstChild.data
            link = elt.getElementsByTagName('link')[0].firstChild.data
            pubDate = elt.getElementsByTagName('pubDate')[0].firstChild.data

            if (epochPubDate <= self.lastFetchDate):
                break

            if (not os.path.isfile(
                    os.path.join(self.processingDir,
                                 link.split('/')[-1]))):
                print "Downloading %s" % (link)
                utilities.download_file(link, self.processingDir)

            self.processOffer(link)
Example no. 3
0
    def fetch_offer(self, url):
        if (not os.path.isfile(os.path.join(self.processingDir, url.split('/')[-1]))):
            print "Downloading %s" % (url)
            utilities.download_file(url, self.processingDir)
        else:
            print "Download failed. File already there."

        return os.path.join("", url.split('/')[-1])
Example no. 4
0
    def fetch_url(self, url):
        filename = url.split('/')[-1]
        utilities.download_file(url, self.processingDir)

        xmlfile = os.path.join(self.processingDir, filename)
        fileObj = codecs.open( xmlfile, "r", "utf-8" )
        content = fileObj.read()
        xmldoc = minidom.parseString( content )
        fileObj.close()
        #xmldoc = minidom.parse(xmlfile)

        MainPubDate = xmldoc.getElementsByTagName('lastBuildDate')[0].firstChild.data
        MainPubDate = MainPubDate[:MainPubDate.rindex(' ')]
        epochPubDate = datetime.datetime.strptime(MainPubDate, "%a, %d %b %Y %H:%M:%S").strftime('%s')

        if (epochPubDate <= self.lastFetchDate):
            return 0

        itemlist = xmldoc.getElementsByTagName('item')

        for elt in itemlist :
            # TODO : Test object first
            title = elt.getElementsByTagName('title')[0].firstChild.data
            link = elt.getElementsByTagName('link')[0].firstChild.data.split("?")[0] + "index.html"
            pubDate = elt.getElementsByTagName('pubDate')[0].firstChild.data
            pubDate = pubDate[:pubDate.rindex(' ')]

            if (epochPubDate <= self.lastFetchDate):
                break

#            if (not os.path.isfile(os.path.join(self.processingDir, link.split('/')[-1]))):
            offer = ProgressiveOffer()
            guid = elt.getElementsByTagName('guid')[0].firstChild.data
            offer.ref = guid.split('/')[-2]
            print "Processing %s" % (offer.ref)
            offer.date_add = int(time.time())
            loc = Location()
            offer.lat = loc.lat
            offer.lon = loc.lon
            offer.title = title.encode( 'iso-8859-1' )
            offer.url = link
            offer.date_pub = datetime.datetime.strptime(pubDate, "%a, %d %b %Y %H:%M:%S").strftime('%s')
            offer.content = elt.getElementsByTagName('description')[0].firstChild.data
            offer.content = offer.content.encode( 'iso-8859-1' )

            offer.company = 'Progressive Recruitment'

            offer.location = 'NA'
            offer.cleanLocation()

            offer.contract = 'NA'
            offer.cleanContract()

            offer.salary = 'NA'
            offer.cleanSalary()

            offer.experience = 'NA'
            offer.add_db() 
Example no. 5
0
    def fetch_offer(self, url):
        if (not os.path.isfile(
                os.path.join(self.processingDir,
                             url.split('/')[-1]))):
            print "Downloading %s" % (url)
            utilities.download_file(url, self.processingDir)
        else:
            print "Download failed. File already there."

        return os.path.join("", url.split('/')[-1])
Example no. 6
0
    def fetch_url(self, url):
        filename = url.split("/")[-1]
        utilities.download_file(url, self.processingDir)

        xmldoc = minidom.parse(os.path.join(self.processingDir, filename))

        MainPubDate = xmldoc.getElementsByTagName("pubDate")[0].firstChild.data

        itemlist = xmldoc.getElementsByTagName("item")

        for elt in itemlist:
            # TODO : Test object first
            title = elt.getElementsByTagName("title")[0].firstChild.data
            link = elt.getElementsByTagName("link")[0].firstChild.data.split("?")[0]
            pubDate = elt.getElementsByTagName("pubDate")[0].firstChild.data

        if not os.path.isfile(os.path.join(self.processingDir, link.split("/")[-1])):
            print "Downloading %s" % (link)
            utilities.download_file(link, self.processingDir)
def download_item(module, item, session):
    """Download one Canvas module item to DOWNLOAD_DIR/<module>/<item>.

    'Attachment' items are fetched through the files/download endpoint;
    anything else is treated as a web page and saved with a .html suffix.
    """
    item_file = clean_file_name(item.name)
    module_dir = clean_file_name(module.name)

    target_path = "{0}{1}/{2}".format(DOWNLOAD_DIR, module_dir, item_file)

    print("  - Downloading ({0}-{1}) {2}...".format(item.itemType, item.itemId,
                                                    item.name))

    # Build the source URL first, then download once at the end.
    if item.itemType == 'Attachment':
        source_url = "{0}/courses/{1}/files/{2}/download".format(
            CANVAS_URL, COURSE_ID, item.itemId)
    else:
        source_url = "{0}/courses/{1}/modules/items/{2}".format(
            CANVAS_URL, COURSE_ID, item.modId)
        # Add .html extension for webpage downloads
        target_path += ".html"

    download_file(source_url, session, target_path)
Example no. 8
0
def download_files(job_name, output_id, output_folder=None):
    """ Downloads the files from the output of the job locally

    Parameters
    ----------
    job_name:      [str] The name of the job  e.g run_cntk, run_pytorch
    output_id:     [str] The id of the output you want to download the files from e.g stdOuterr, notebooks
    output_folder: [str, optional] Local directory to place the files in;
                   when omitted, files are written to the current directory.
    """
    if output_folder:
        logger.info('Downloading files to {}'.format(output_folder))

    files = client.jobs.list_output_files(
        config.group_name, job_name,
        models.JobsListOutputFilesOptions(output_id))
    # 'output_file' instead of 'file' so the builtin is not shadowed.
    for output_file in files:
        logger.info('Downloading {}'.format(output_file.name))
        file_name = path.join(output_folder,
                              output_file.name) if output_folder else output_file.name
        ut.download_file(output_file.download_url, file_name)
    print("All files Downloaded")
Example no. 9
0
    def fetch_url(self, url):
        filename = url.split('/')[-1]
        utilities.download_file(url, self.processingDir)

        xmldoc = minidom.parse(os.path.join(self.processingDir, filename))

        MainPubDate = xmldoc.getElementsByTagName('pubDate')[0].firstChild.data

        itemlist = xmldoc.getElementsByTagName('item')

        for elt in itemlist:
            # TODO : Test object first
            title = elt.getElementsByTagName('title')[0].firstChild.data
            link = elt.getElementsByTagName('link')[0].firstChild.data.split(
                "?")[0]
            pubDate = elt.getElementsByTagName('pubDate')[0].firstChild.data

        if (not os.path.isfile(
                os.path.join(self.processingDir,
                             link.split('/')[-1]))):
            print "Downloading %s" % (link)
            utilities.download_file(link, self.processingDir)
    def epoch_init(self):
        """Begin a new epoch: download and unpack the next Google Landmarks
        training shard, then index its images.

        Side effects: resets self.all_images/self.all_landmarks, writes
        'images.tar' and the 'imagesfolder' tree in the working directory,
        sets self.total from self.pickfiles, and advances self.tar_idx.
        """
        self.all_images = []
        self.all_landmarks = []

        # Zero-pad the shard index to three digits (e.g. 7 -> "007");
        # replaces the original if/elif padding chain.
        tarfilestr = str(self.tar_idx).zfill(3)

        download_file(
            "https://s3.amazonaws.com/google-landmark/train/images_{}.tar".
            format(tarfilestr),
            "images.tar",
            bar=False)
        # 'with' guarantees the archive is closed even if extraction fails.
        # NOTE(review): extractall on a downloaded archive trusts its member
        # paths; the S3 source is assumed trustworthy here.
        with tarfile.open('images.tar') as tar:
            tar.extractall("imagesfolder")

        self.total = self.pickfiles("imagesfolder")
        self.tar_idx += 1
        print("tar", self.tar_idx - 1, "total:", self.total)
Example no. 11
0
    def fetch_url(self, url):
        filename = url.split('/')[-1]
        utilities.download_file(url, self.processingDir)

        xmlfile = os.path.join(self.processingDir, filename)
        fileObj = codecs.open(xmlfile, "r", "utf-8")
        content = fileObj.read()
        xmldoc = minidom.parseString(content)
        fileObj.close()
        #xmldoc = minidom.parse(xmlfile)

        MainPubDate = xmldoc.getElementsByTagName(
            'lastBuildDate')[0].firstChild.data
        MainPubDate = MainPubDate[:MainPubDate.rindex(' ')]
        epochPubDate = datetime.datetime.strptime(
            MainPubDate, "%a, %d %b %Y %H:%M:%S").strftime('%s')

        if (epochPubDate <= self.lastFetchDate):
            return 0

        itemlist = xmldoc.getElementsByTagName('item')

        for elt in itemlist:
            # TODO : Test object first
            title = elt.getElementsByTagName('title')[0].firstChild.data
            link = elt.getElementsByTagName('link')[0].firstChild.data.split(
                "?")[0] + "index.html"
            pubDate = elt.getElementsByTagName('pubDate')[0].firstChild.data
            pubDate = pubDate[:pubDate.rindex(' ')]

            if (epochPubDate <= self.lastFetchDate):
                break


#            if (not os.path.isfile(os.path.join(self.processingDir, link.split('/')[-1]))):
            offer = ProgressiveOffer()
            guid = elt.getElementsByTagName('guid')[0].firstChild.data
            offer.ref = guid.split('/')[-2]
            print "Processing %s" % (offer.ref)
            offer.date_add = int(time.time())
            loc = Location()
            offer.lat = loc.lat
            offer.lon = loc.lon
            offer.title = title.encode('iso-8859-1')
            offer.url = link
            offer.date_pub = datetime.datetime.strptime(
                pubDate, "%a, %d %b %Y %H:%M:%S").strftime('%s')
            offer.content = elt.getElementsByTagName(
                'description')[0].firstChild.data
            offer.content = offer.content.encode('iso-8859-1')

            offer.company = 'Progressive Recruitment'

            offer.location = 'NA'
            offer.cleanLocation()

            offer.contract = 'NA'
            offer.cleanContract()

            offer.salary = 'NA'
            offer.cleanSalary()

            offer.experience = 'NA'
            offer.add_db()