예제 #1
0
def readURL(url, destination):
    """Download the resource at *url* to the local path *destination*.

    Streams the response in 1 KB blocks with a tqdm progress bar, creating
    the destination's parent directory when needed.  All failure modes
    (missing destination, pre-existing file, bad URL, short download) are
    reported through ``log.error``.

    Returns True on completion.
    """
    # Validate destination before touching the filesystem: the old code
    # split the path before the None check, so a None destination crashed
    # with AttributeError instead of the intended error message.
    if destination is None:
        log.error("Destination is not specified when downloading data",
                  error=True)
    # os.path.dirname is cross-OS and handles bare filenames (resolves the
    # old TODO; rsplit('/', 1)[-2] raised IndexError when there was no '/').
    parent_dir = os.path.dirname(destination)
    if parent_dir and not os.path.isdir(parent_dir):
        os.makedirs(parent_dir)
    if os.path.isfile(destination):
        log.error("%s file already exists ..." % destination, error=True)
    r = requests.get(url, stream=True)
    if r.status_code != 200:
        log.error('URL: %s does not exist' % url, error=True)
    # Total size in bytes; 0 when the server omits Content-Length.
    total_size = int(r.headers.get('content-length', 0))
    block_size = 1024
    wrote = 0
    with open(destination, 'wb') as f:
        log.status("Downloading from %s to %s..." % (url, destination))
        # true division: ceil(total // block) was a no-op on the already
        # floored value, under-counting the final partial block
        for data in tqdm(r.iter_content(block_size),
                         total=math.ceil(total_size / block_size),
                         unit='KB',
                         unit_scale=True,
                         leave=False):
            wrote = wrote + len(data)
            f.write(data)
    # (the 'with' block closes the file; the old explicit close was redundant)
    if total_size != 0 and wrote != total_size:
        log.error("Error downloading the data ...")
    log.success("Download complete!")
    return True
예제 #2
0
    def __unify_dataset(self, active=True):
        """Remove entry ids that are not shared by every computational
        sequence, so the dataset becomes alignable.

        Parameters
        ----------
        active: bool
            When True (default) offending ids are removed via
            ``__remove_id``; when False they are only reported, and a fatal
            error is raised if any remain.
        """
        log.status("Unify was called ...")

        # Collect every base video id (the part before any '[segment]').
        all_vidids = {}
        for seq_key in list(self.computational_sequences.keys()):
            for vidid in list(
                    self.computational_sequences[seq_key].data.keys()):
                all_vidids[vidid.split('[')[0]] = True

        # A vidid is a violator if some sequence has no entry carrying it as
        # a prefix.  Record each violator once: the old code appended it
        # once per missing sequence, causing repeated __remove_id calls and
        # duplicated log lines for the same id.
        violators = []
        for vidid in list(all_vidids.keys()):
            for seq_key in list(self.computational_sequences.keys()):
                entries = self.computational_sequences[seq_key].data.keys()
                if not any(entry.startswith(vidid) for entry in entries):
                    violators.append(vidid)
                    break

        for violator in violators:
            log.error(
                "%s entry is not shared among all sequences, removing it ..."
                % violator,
                error=False)
            if active:
                self.__remove_id(violator)
        if not active and len(violators) > 0:
            log.error(
                "%d violators remain, alignment will fail if called ..." %
                len(violators),
                error=True)

        log.success("Unify finished, dataset is compatible for alignment ...")
예제 #3
0
    def __init__(self, recipe, destination=None):
        """Build a dataset from a *recipe*.

        Parameters
        ----------
        recipe: str or dict
            Either a path to a folder of ``.csd`` files, or a mapping of
            sequence name -> address passed to ``computational_sequence``.
        destination: optional
            Forwarded to ``computational_sequence`` for dict recipes
            (presumably a download/cache location -- confirm with that
            class).

        Raises (via ``log.error(..., error=True)``) when the folder does
        not exist or no sequence could be loaded.
        """
        self.computational_sequences = {}

        # isinstance instead of 'type(...) is str' so str subclasses work;
        # mid-function 'from os import ...' imports were dropped in favor
        # of the module-level 'os' already in scope.
        if isinstance(recipe, str):
            if not os.path.isdir(recipe):
                log.error("Dataset folder does not exist ...", error=True)

            # Pick up every *.csd file in the folder.
            csd_fnames = [
                fname for fname in os.listdir(recipe)
                if os.path.isfile(os.path.join(recipe, fname))
                and fname.endswith('.csd')
            ]
            for fname in csd_fnames:
                this_sequence = computational_sequence(
                    os.path.join(recipe, fname))
                self.computational_sequences[
                    this_sequence.metadata["root name"]] = this_sequence

        if isinstance(recipe, dict):
            for entry, address in recipe.items():
                self.computational_sequences[entry] = computational_sequence(
                    address, destination)

        # Covers both an empty recipe and an unsupported recipe type.
        if len(self.computational_sequences.keys()) == 0:
            log.error("Dataset failed to initialize ...", error=True)

        log.success("Dataset initialized successfully ... ")
예제 #4
0
def validateMetadataIntegrity(metadata, rootName, which=True):
    """Check that *metadata* contains every key required by
    ``featuresetMetadataTemplate``.

    Parameters
    ----------
    metadata: dict
        Metadata of the computational sequence.
    rootName: str
        Name of the sequence, used in log messages.
    which: bool
        When True, log exactly which keys are missing.

    Returns True when the metadata is complete; otherwise raises through
    ``log.error(..., error=True)``.
    """
    log.status(
        "Checking the integrity of the metadata in <%s> computational sequence ..."
        % rootName)
    failure = False
    if not isinstance(metadata, dict):
        # Bug fix: the original passed a '%s' format string with no
        # argument, so the sequence name never appeared in the message.
        log.error(
            "<%s> computational sequence metadata is not key-value pairs!"
            % rootName,
            error=True)
    presenceFlag = [
        mtd in metadata for mtd in featuresetMetadataTemplate
    ]
    #check if all the metadata is set
    if not all(presenceFlag):
        #report which ones are not set
        if which:
            missings = [
                x for (x, y) in zip(featuresetMetadataTemplate, presenceFlag)
                if y is False
            ]
            log.error("Missing metadata in <%s> computational sequence: %s" %
                      (rootName, str(missings)),
                      error=False)
        failure = True
    if failure:
        log.error(
            msgstring=
            "<%s> computational sequence does not have all the required metadata ..."
            % rootName,
            error=True)
    else:
        log.success("<%s> computational sequence metadata in correct format" %
                    rootName)
    return True
예제 #5
0
	def _checkIntegrity(self, error=True):
		"""Validate both the data and the metadata of this sequence.

		Logs an error when either attribute is missing entirely, then runs
		the data and metadata validators and reports success when both pass.
		"""
		blank = not (hasattr(self, 'metadata') and hasattr(self, 'data'))
		if blank:
			log.error("computational sequence is blank (data or metadata is missing)")
		root = self.metadata["root name"]
		log.status("Checking the integrity of the <%s> computational sequence ..."%root)
		#TODO: hash check not implemented yet
		checks = [
			validateDataIntegrity(self.data, root, which=False),
			validateMetadataIntegrity(self.metadata, root, which=False),
		]
		if all(checks):
			log.success("<%s> computational sequence is valid!"%root)
예제 #6
0
def readCSD(resource, destination=None):
    """Open the hdf5 computational sequence stored at *resource*.

    Returns a tuple ``(h5handle, data_dict, metadata_dict)`` for the first
    (and only) root group in the file.  The handle is returned open so the
    caller controls its lifetime.
    """
    if resource is None:
        # Consistency/bug fix: everywhere else in this file
        # log.error(..., error=True) is called directly and raises on its
        # own; the old 'raise log.error(...)' tried to raise its return
        # value (None), which produces a TypeError instead of the message.
        log.error("No resource specified for computational sequence!",
                  error=True)
    if not os.path.isfile(resource):
        log.error("%s file not found, please check the path ..." % resource,
                  error=True)
    try:
        h5handle = h5py.File(resource, 'r')
    except Exception:
        # narrowed from a bare 'except:' so KeyboardInterrupt / SystemExit
        # still propagate
        log.error(
            "%s resource is not a valid hdf5 computational sequence  ..." %
            resource,
            error=True)
    log.success("Computational sequence read from file %s ..." % resource)
    # Hoist the repeated list(h5handle.keys())[0] lookup.
    rootName = list(h5handle.keys())[0]
    return h5handle, dict(h5handle[rootName]["data"]), metadataToDict(
        h5handle[rootName]["metadata"])
예제 #7
0
def writeCSD(data, metadata, rootName, destination):
    """Write a computational sequence (data + metadata) to an hdf5 file.

    Parameters
    ----------
    data: dict
        Entry id -> {"features": array, "intervals": array}.
    metadata: dict
        Metadata key -> value; values are cast to str before writing.
    rootName: str
        Name of the root group created inside the file.
    destination: str
        Path of the .csd file to create (overwritten if present).
    """
    #check the data to make sure it is in correct format
    validateDataIntegrity(data, rootName)
    validateMetadataIntegrity(metadata, rootName)

    log.status("Writing the <%s> computational sequence data to %s" %
               (rootName, destination))
    #opening the file
    writeh5Handle = h5py.File(destination, 'w')
    #creating the root handle
    rootHandle = writeh5Handle.create_group(rootName)

    #writing the data
    dataHandle = rootHandle.create_group("data")
    pbar = tqdm(total=len(data.keys()),
                unit=" Computational Sequence Entries",
                leave=False)
    for vid in data:
        vidHandle = dataHandle.create_group(vid)
        vidHandle.create_dataset("features", data=data[vid]["features"])
        vidHandle.create_dataset("intervals", data=data[vid]["intervals"])
        pbar.update(1)
    pbar.close()
    log.success("<%s> computational sequence data successfully wrote to %s" %
                (rootName, destination))
    log.status("Writing the <%s> computational sequence metadata to %s" %
               (rootName, destination))
    #writing the metadata
    metadataHandle = rootHandle.create_group("metadata")
    # Bug fix: 'sys.version_info.major is 2' identity-compared an int
    # literal (only works by CPython small-int interning); use '=='.  The
    # py2 'unicode' / py3 'str' choice and the vlen dtype are loop-invariant,
    # so they are computed once instead of per metadata key.
    cast_operator = unicode if sys.version_info.major == 2 else str
    string_dtype = h5py.special_dtype(vlen=cast_operator)
    for metadataKey in metadata.keys():
        metadataHandle.create_dataset(metadataKey, (1, ), dtype=string_dtype)
        metadataHandle[metadataKey][0] = cast_operator(metadata[metadataKey])

    writeh5Handle.close()
    log.success(
        "<%s> computational sequence metadata successfully wrote to %s" %
        (rootName, destination))
    log.success("<%s> computational sequence successfully wrote to %s ..." %
                (rootName, destination))
예제 #8
0
    def align(self, reference, collapse_functions=None, replace=True):
        """Align every computational sequence to the *reference* sequence.

        For each interval of each entry in the reference sequence, the
        overlapping segments of every other sequence are extracted via
        ``__intersect_and_copy``; the reference's own segment is copied
        as-is.  Aligned entries are keyed ``"<entry>[<i>]"``.

        Parameters
        ----------
        reference: str
            Name of the sequence to align against; must exist in
            ``self.computational_sequences``.
        collapse_functions: list, optional
            When a list, passed to ``__collapse`` to reduce each group of
            intersecting segments (presumably to one row -- confirm with
            ``__collapse``).
        replace: bool
            When True, this dataset's content is replaced in place and
            None is returned; otherwise a new ``mmdataset`` holding the
            aligned output is returned.
        """
        aligned_output = {}

        for sequence_name in self.computational_sequences.keys():
            aligned_output[sequence_name] = {}
        if reference not in self.computational_sequences.keys():
            log.error("Computational sequence <%s> does not exist in dataset" %
                      reference,
                      error=True)
        refseq = self.computational_sequences[reference].data
        # unifying the dataset, removing any entries that are not in the reference computational sequence
        self.__unify_dataset()

        # building the relevant entries to the reference - what we do in this section is simply removing all the [] from the entry ids and populating them into a new dictionary
        log.status(
            "Alignment based on <%s> computational sequence will start shortly ..."
            % reference)
        relevant_entries = self.__get_relevant_entries(reference)

        pbar = tqdm(total=len(refseq.keys()),
                    unit=" Computational Sequence Entries",
                    leave=False)
        pbar.set_description("Overall Progress")
        for entry_key in list(refseq.keys()):
            pbar_small = tqdm(total=refseq[entry_key]['intervals'].shape[0],
                              unit=" Segments",
                              leave=False)
            pbar_small.set_description("Aligning %s" % entry_key)
            for i in range(refseq[entry_key]['intervals'].shape[0]):
                #interval for the reference sequence
                ref_time = refseq[entry_key]['intervals'][i, :]
                #we drop zero or very small sequence lengths - no align for those
                if (abs(ref_time[0] - ref_time[1]) < epsilon):
                    pbar_small.update(1)
                    continue

                #aligning all sequences (including ref sequence) to ref sequence
                for otherseq_key in list(self.computational_sequences.keys()):
                    if otherseq_key != reference:
                        intersects, intersects_features = self.__intersect_and_copy(
                            ref_time,
                            relevant_entries[otherseq_key][entry_key], epsilon)
                    else:
                        # the reference aligns to itself: take row i of its
                        # own intervals/features, keeping a leading axis
                        intersects, intersects_features = refseq[entry_key][
                            'intervals'][i, :][None, :], refseq[entry_key][
                                'features'][i, :][None, :]
                    #there were no intersections between reference and subject computational sequences for the entry
                    if intersects.shape[0] == 0:
                        continue
                    #collapsing according to the provided functions
                    if type(collapse_functions) is list:

                        intersects, intersects_features = self.__collapse(
                            intersects, intersects_features,
                            collapse_functions)
                    #two following lines were commented due to incompatibility with our task
                    #if(intersects.shape[0]!=intersects_features.shape[0]):
                    #	log.error("Dimension mismatch between intervals and features when aligning <%s> computational sequences to <%s> computational sequence"%(otherseq_key,reference))
                    aligned_output[otherseq_key][entry_key + "[%d]" % i] = {}
                    aligned_output[otherseq_key][entry_key + "[%d]" %
                                                 i]["intervals"] = intersects
                    aligned_output[otherseq_key][
                        entry_key +
                        "[%d]" % i]["features"] = intersects_features
            # NOTE(review): this per-segment bar advances only here (once
            # per entry) and on the epsilon skip above -- it looks like the
            # update was meant to sit inside the segment loop; confirm
            # before changing, since it is display-only.
            pbar_small.update(1)
            pbar_small.close()
            pbar.update(1)
        pbar.close()
        log.success("Alignment to <%s> complete." % reference)
        if replace is True:
            log.status(
                "Replacing dataset content with aligned computational sequences"
            )
            self.__set_computational_sequences(aligned_output)
            return None
        else:
            log.status(
                "Creating new dataset with aligned computational sequences")
            newdataset = mmdataset({})
            newdataset.__set_computational_sequences(aligned_output,
                                                     metadata_copy=False)
            return newdataset
예제 #9
0
def validateDataIntegrity(data, rootName, which=True):
    """Check that every entry in *data* has well-formed intervals/features.

    Each entry must map to 2-D "intervals" and "features" arrays whose
    first dimensions match.

    Parameters
    ----------
    data: dict
        Entry id -> {"intervals": array, "features": array}.
    rootName: str
        Sequence name used in log messages.
    which: bool
        When True, log each offending entry individually.

    Returns True when the data is valid; otherwise raises through
    ``log.error(..., error=True)``.
    """
    log.status(
        "Checking the integrity of the data in <%s> computational sequence ..."
        % rootName)

    # Type-check BEFORE touching data.keys(): the old code built the
    # progress bar (len(data.keys())) first, so non-dict input crashed with
    # an unhelpful AttributeError before this guard could fire.
    if not isinstance(data, dict):
        #this will cause the rest of the pipeline to crash - RuntimeError
        # (bug fix: the '%s' placeholder previously had no argument)
        log.error(
            "<%s> computational sequence data is not in hierarchy format ..."
            % rootName,
            error=True)

    pbar = tqdm(total=len(data.keys()),
                unit=" Computational Sequence Entries",
                leave=False)
    failure = False
    try:
        #for each video check the shapes of the intervals and features
        for vid in data.keys():
            #check the intervals first - if failure simply show a warning - no exit since we want to identify all the cases
            if len(data[vid]["intervals"].shape) != 2:
                if which:
                    log.error(
                        "Video <%s> in  <%s> computational sequence has wrong intervals array shape. "
                        % (vid, rootName),
                        error=False)
                failure = True
            #check the features next
            if len(data[vid]["features"].shape) != 2:
                if which:
                    log.error(
                        "Video <%s> in  <%s> computational sequence has wrong features array shape. "
                        % (vid, rootName),
                        error=False)
                failure = True
            #if the first dimension of intervals and features doesn't match
            if data[vid]["features"].shape[0] != data[vid]["intervals"].shape[0]:
                if which:
                    log.error(
                        "Video <%s> in <%s> computational sequence - features and intervals have different first dimensions. "
                        % (vid, rootName),
                        error=False)
                failure = True
            pbar.update(1)
    #some other thing has happened! - RuntimeError
    except Exception:
        # narrowed from a bare 'except:'; 'finally' replaces the old code's
        # duplicated pbar.close(), which was also skipped entirely when the
        # log.error(error=True) above raised out of the except block
        if which:
            log.error(
                "<%s> computational sequence data integrity could not be checked. "
                % rootName,
                error=True)
    finally:
        pbar.close()

    #failure during intervals and features check
    if failure:
        log.error(
            "<%s> computational sequence data integrity check failed due to inconsistency in intervals and features. "
            % rootName,
            error=True)
    else:
        log.success("<%s> computational sequence data in correct format." %
                    rootName)
        return True