Example #1
0
    def run(self):
        """
        Fetch the duplicates of the article from the sameAs web service,
        group them into datasources and update every datasource.

        Steps:
          1. Download the JSON document found at
             config['directoryURL'] + config['dbPediaURL'] + self.article
             and read the "duplicates" list of its first entry.
          2. Create one Resource per duplicate URL and append it to the
             Datasource registered in self.datasources under the resource's
             domain. A Datasource is created (with self.lastdate) the first
             time a domain is seen.
          3. Call update() on every datasource while the stop event is not
             set. According to the original author's note this downloads the
             content and saves it to disk if it is new or was updated.
          4. Set self.completed to 1 and hand self.datasources to
             self.callback.

        Progress is exposed through self.total and self.done.
        """

        self.done = 0
        directoryURL = "%s%s%s" % (
            self.config['directoryURL'],
            self.config['dbPediaURL'],
            self.article,
        )

        # Close the HTTP response explicitly so the socket is released even
        # when the payload is not valid JSON.
        response = urllib2.urlopen(directoryURL)
        try:
            page = json.load(response)
        finally:
            response.close()

        duplicates = page[0]["duplicates"]
        # NOTE(review): total counts duplicate URLs while done (below) counts
        # updated datasources -- confirm that progress consumers expect this.
        self.total = len(duplicates)

        # create resources and append resources to datasources
        for url in duplicates:
            resource = Resource(url)
            domain = resource.domain
            if domain not in self.datasources:
                self.datasources[domain] = Datasource(domain, self.lastdate)
            # Always look the datasource up by domain: a resource whose domain
            # is already registered must go to that datasource, not to
            # whichever one happened to be created last.
            self.datasources[domain].resources.append(resource)

        # update datasources, duplicate detection, creation of json
        for domain, datasource in self.datasources.iteritems():
            if not self._stop.is_set():  # do not proceed if stop is set
                datasource.update()
                self.done += 1

        self.completed = 1
        self.callback(self.datasources)
Example #2
0
                raise Exception('类型' + str(type(entity.media)) + '无法识别')

            cur_file = Path(filename)
            if cur_file.exists():
                log.info('文件已存在' + str(filename))
            else:
                log.info('开始下载[' + str(filename) + '], 当前进度' + str(index) + "/" + str(total))
                client.download_media(entity, filename)
                end_time = datetime.datetime.now()
                log.info('下载完成[' + str(filename) + '], 耗时' + str(end_time - start_time))

        except BaseException as e:
            try:
                log.info('下载失败,尝试删除文件[' + filename + ']')
                cur_file.unlink()
            except IsADirectoryError as ie:
                log.error('删除文件[' + filename + ']失败' + ie)
            log.error('Exception:' + str(index) + ':' + str(e))
    log.info(_type + '类型文件下载结束')


# Run the download for every media category whose job flag equals 1,
# videos first, then photos.
for job_flag, media_filter in (
        ('type_video', InputMessagesFilterVideo),
        ('type_photo', InputMessagesFilterPhotos),
):
    if config.job[job_flag] == 1:
        download(media_filter)

# All requested downloads have been attempted; close the client connection.
client.disconnect()

# When args.d was supplied, stamp the end time on the matching job_exec row.
if args.d is not None:
    finish_sql = 'update job_exec set end_time = sysdate() where id = ' + str(exec_id)
    exec_id = my_source.update(finish_sql)