def validateMetadataIntegrity(metadata, rootName, which=True):
    log.status(
        "Checking the integrity of the metadata in <%s> computational sequence ..."
        % rootName)
    failure = False
    if type(metadata) is not dict:
        log.error(
            "<%s> computational sequence metadata is not key-value pairs!" %
            rootName,
            error=True)
    presenceFlag = [
        mtd in metadata.keys() for mtd in featuresetMetadataTemplate
    ]
    #check if all the metadata is set
    if not all(presenceFlag):
        #which one is not set
        if which:
            missings = [
                x for (x, y) in zip(featuresetMetadataTemplate, presenceFlag)
                if y is False
            ]
            log.error("Missing metadata in <%s> computational sequence: %s" %
                      (rootName, str(missings)),
                      error=False)
        failure = True
    #if the metadata check failed, report and raise
    if failure:
        log.error(
            msgstring=
            "<%s> computational sequence does not have all the required metadata ..."
            % rootName,
            error=True)
    else:
        log.success("<%s> computational sequence metadata in correct format" %
                    rootName)
    return True
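A minimal usage sketch for the validator above. The import path for featuresetMetadataTemplate is an assumption about how the SDK configurations are laid out; adapt it to your install.

#usage sketch - the import path and placeholder values are assumptions
from mmsdk.mmdatasdk.configurations.metadataconfigs import featuresetMetadataTemplate

sample_metadata = {key: "placeholder" for key in featuresetMetadataTemplate}
sample_metadata["root name"] = "myfeatures"
#logs success and returns True; with a key missing it lists the gaps and raises
validateMetadataIntegrity(sample_metadata, "myfeatures", which=True)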
	def hard_unify(self,active=True):
		log.status("Hard unify was called ...")

		
		all_keys={}
		for seq_key in list(self.computational_sequences.keys()):
			all_keys[seq_key]=[vidid for vidid in self.computational_sequences[seq_key].data.keys()]
		
		valids=set.intersection(*[set(all_keys[x]) for x in all_keys])
		for seq_key in list(self.computational_sequences.keys()):
			hard_unify_compatible=all(["[" in vidid for vidid in self.computational_sequences[seq_key].data.keys()])
			if not hard_unify_compatible:
				log.error("Hard unify can only be done on aligned computational sequences, %s violated this ... Exiting ..."%seq_key)
			violators=set([vidid for vidid in self.computational_sequences[seq_key].data.keys()])-valids
			for violator in violators:
				if active:
					log.error("%s entry is not shared among all sequences, removing it ..."%violator,error=False)
					self[seq_key]._remove_id(violator,purge=False)

			if not active and len(violators)>0:
				log.error("%d violators remain, alignment will fail if called ..."%len(violators),error=True)
		
		log.success("Hard unify completed ...")
def readURL(url, destination):
    if destination is None:
        log.error("Destination is not specified when downloading data",
                  error=True)
    #TODO: replace the split of destination with a cross-OS compatible operation
    if not os.path.isdir(destination.rsplit('/', 1)[0]):
        os.mkdir(destination.rsplit('/', 1)[0])

    if os.path.isfile(destination):
        log.success("File already downloaded, use the old file")
    else:
        r = requests.get(url, stream=True)
        if r.status_code != 200:
            log.error('URL: %s does not exist' % url, error=True)
        # Total size in bytes.
        total_size = int(r.headers.get('content-length', 0))
        block_size = 1024
        wrote = 0
        with open(destination, 'wb') as f:
            log.status("Downloading from %s to %s..." % (url, destination))
            for data in tqdm(r.iter_content(block_size),
                             total=math.ceil(total_size / block_size),
                             unit='KB',
                             unit_scale=True,
                             leave=False):
                wrote = wrote + len(data)
                f.write(data)
        if total_size != 0 and wrote != total_size:
            log.error("Error downloading the data ...")
        log.success("Download complete!")

    return True
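A short usage sketch for readURL; the URL and destination are placeholders. The destination must contain a directory component, since the function derives the target folder from the path.

#usage sketch - URL and paths are placeholders
readURL("http://example.com/features/sample.csd", "downloads/sample.csd")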
def validate_metadata_format(metadata, root_name, verbose=True):
    log.status(
        "Checking the format of the metadata in <%s> computational sequence ..."
        % root_name)
    failure = False
    if type(metadata) is not dict:
        log.error(
            "<%s> computational sequence metadata is not key-value pairs!" %
            root_name,
            error=True)
    presenceFlag = [
        mtd in metadata.keys() for mtd in featuresetMetadataTemplate
    ]
    #check if all the metadata is set
    if not all(presenceFlag):
        #report which ones are not set
        if verbose:
            missings = [
                x for (x, y) in zip(featuresetMetadataTemplate, presenceFlag)
                if y is False
            ]
            log.error("Missing metadata in <%s> computational sequence: %s" %
                      (root_name, str(missings)),
                      error=False)
        failure = True
    if failure:
        log.error(
            msgstring=
            "<%s> computational sequence does not have all the required metadata ... continuing "
            % root_name,
            error=False)
        return False
    else:
        log.success("<%s> computational sequence metadata in correct format." %
                    root_name)
        return True
Example #5
    def impute(self, ref_key, imputation_fn=numpy.zeros):
        log.status("Imputation called ...")
        other_keys = list(self.keys())
        other_keys.remove(ref_key)
        other_keys_dims = {
            x: list(self[x][list(self[x].keys())[0]]["features"].shape[1:])
            for x in other_keys
        }
        pbar = tqdm(total=len(self[ref_key].keys()),
                    unit=" Reference Computational Sequence Entries",
                    leave=False)
        pbar.set_description("Imputation Progress")
        for seg_key in self[ref_key].keys():
            for other_key in other_keys:
                try:
                    self[other_key][seg_key]
                except KeyError:
                    #entry missing from this sequence; fill with one imputed
                    #frame matching the sequence's feature dimensions
                    self[other_key][seg_key] = {
                        "intervals": self[ref_key][seg_key]["intervals"],
                        "features":
                        imputation_fn([1] + other_keys_dims[other_key])
                    }
            pbar.update(1)
        pbar.close()
        log.success("Imputation completed ...")
Example #6
def writeCSD(data,metadata,rootName,destination):
	#check the data to make sure it is in correct format
	validateDataIntegrity(data,rootName)
	validateMetadataIntegrity(metadata,rootName)

	log.status("Writing the <%s> computational sequence data to %s"%(rootName,destination))	
	#opening the file
	writeh5Handle=h5py.File(destination,'w')
	#creating the root handle
	rootHandle=writeh5Handle.create_group(rootName)

	#writing the data
	dataHandle=rootHandle.create_group("data")
	pbar = tqdm(total=len(data.keys()),unit=" Computational Sequence Entries",leave=False)
	for vid in data:
		vidHandle=dataHandle.create_group(vid)
		vidHandle.create_dataset("features",data=data[vid]["features"])
		vidHandle.create_dataset("intervals",data=data[vid]["intervals"])
		pbar.update(1)
	pbar.close()
	log.success("<%s> computational sequence data successfully wrote to %s"%(rootName,destination))
	log.status("Writing the <%s> computational sequence metadata to %s"%(rootName,destination))
	#writing the metadata
	metadataHandle=rootHandle.create_group("metadata")
	for metadataKey in metadata.keys():
		metadataHandle.create_dataset(metadataKey,(1,),dtype=h5py.special_dtype(vlen=unicode) if sys.version_info.major == 2 else h5py.special_dtype(vlen=str))
		cast_operator=unicode if sys.version_info.major == 2 else str
		metadataHandle[metadataKey][0]=cast_operator(metadata[metadataKey])

	writeh5Handle.close()
	log.success("<%s> computational sequence metadata successfully wrote to %s"%(rootName,destination))
	log.success("<%s> computational sequence successfully wrote to %s ..."%(rootName,destination))
Example #7
    def __unify_dataset(self, active=True):
        log.status("Unify was called ...")
        all_vidids = {}
        violators = []
        for seq_key in list(self.computational_sequences.keys()):
            for vidid in list(
                    self.computational_sequences[seq_key].data.keys()):
                vidid = vidid.split('[')[0]
                all_vidids[vidid] = True

        for vidid in list(all_vidids.keys()):
            for seq_key in list(self.computational_sequences.keys()):
                if not any([
                        vidid_in_seq for vidid_in_seq in
                        self.computational_sequences[seq_key].data.keys()
                        if vidid_in_seq[:len(vidid)] == vidid
                ]):
                    violators.append(vidid)
        if len(violators) > 0:
            for violator in violators:
                log.error(
                    "%s entry is not shared among all sequences, removing it ..."
                    % violator,
                    error=False)
                if active:
                    self.__remove_id(violator)
        if not active and len(violators) > 0:
            log.error(
                "%d violators remain, alignment will fail if called ..." %
                len(violators),
                error=True)

        log.success("Unify finished, dataset is compatible for alignment ...")
	def _checkIntegrity(self,error=True):
		if not hasattr(self,'metadata') or not hasattr(self,'data'):
			log.error("computational sequence is blank (data or metadata is missing)")
		log.status("Checking the integrity of the <%s> computational sequence ..."%self.metadata["root name"])
		#TODO: hash check not implemented yet
		datavalid=validateDataIntegrity(self.data,self.metadata["root name"],which=False)
		metadatavalid=validateMetadataIntegrity(self.metadata,self.metadata["root name"],which=False)
		if datavalid and metadatavalid:
			log.success("<%s> computational sequence is valid!"%self.metadata["root name"])
Example #9
def validateDataIntegrity(data, rootName, which=True):
    log.status(
        "Checking the integrity of the data in <%s> computational sequence ..."
        % rootName)
    failure = False
    if type(data) is not dict:
        #this will cause the rest of the pipeline to crash - RuntimeError
        log.error(
            "<%s> computational sequence data is not in hierarchy format ..." %
            rootName,
            error=True)
    try:
        #for each video check the shapes of the intervals and features
        for vid in data.keys():
            #check the intervals first - if failure simply show a warning - no exit since we want to identify all the cases
            if len(data[vid]["intervals"].shape) != 2:
                if which:
                    log.error(
                        "Video <%s> in <%s> computational sequence has wrong intervals array shape. "
                        % (vid, rootName),
                        error=False)
                failure = True
            #check the features next
            if len(data[vid]["features"].shape) != 2:
                if which:
                    log.error(
                        "Video <%s> in <%s> computational sequence has wrong features array shape. "
                        % (vid, rootName),
                        error=False)
                failure = True
            #if the first dimension of intervals and features doesn't match
            if data[vid]["features"].shape[0] != data[vid]["intervals"].shape[
                    0]:
                if which:
                    log.error(
                        "Video <%s> in <%s> computational sequence - features and intervals have different first dimensions. "
                        % (vid, rootName),
                        error=False)
                failure = True
    #some other unexpected failure - RuntimeError
    except Exception:
        if which:
            log.error(
                "<%s> computational sequence data integrity could not be checked. "
                % rootName,
                error=True)

    #failure during intervals and features check
    if failure:
        log.error(
            "<%s> computational sequence data integrity check failed due to inconsistency in intervals and features. "
            % rootName,
            error=True)
    else:
        log.success("<%s> computational sequence data in correct format." %
                    rootName)
        return True
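The hierarchy the validator checks, as a minimal sketch: each entry id maps to a 2-D intervals array (n x 2) and a 2-D features array (n x d) with equal first dimensions.

#minimal well-formed and malformed inputs (illustrative shapes)
import numpy

good = {"video1": {"intervals": numpy.zeros((3, 2)), "features": numpy.zeros((3, 300))}}
bad = {"video1": {"intervals": numpy.zeros((3, 2)), "features": numpy.zeros((5, 300))}}
validateDataIntegrity(good, "demo")   #logs success and returns True
#validateDataIntegrity(bad, "demo")   #reports the mismatch, then raises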
Example #10
def process_data(folders=["cmumosei_highlevel", "cmumosei_labels"]):
    log.status(
        "You can also download all the outputs of this code from here: http://immortal.multicomp.cs.cmu.edu/ACL20Challenge/"
    )

    cmumosei_dataset = {}
    for folder in folders:
        cmumosei_dataset[folder.split("_")[1]] = mmdatasdk.mmdataset(folder)

    #performs word alignment. Labels are not part of the word alignment process.
    # cmumosei_dataset["highlevel"].align("glove_vectors")

    #replacing missing modality information for words - some words may experience failed COVAREP, etc.
    # cmumosei_dataset["highlevel"].impute('glove_vectors')

    #this writes the word aligned computational sequences to the disk
    deploy(cmumosei_dataset["highlevel"], "word_aligned_highlevel")

    #loading the word aligned computational sequences from the disk - to realign from scratch, uncomment the align and impute lines above and comment out the load below
    cmumosei_dataset["highlevel"] = mmdatasdk.mmdataset(
        "word_aligned_highlevel")

    #now aligning to the labels - first adding labels to the dataset
    cmumosei_dataset["highlevel"].computational_sequences[
        "All Labels"] = cmumosei_dataset["labels"]["All Labels"]
    #the actual alignment without collapse function this time
    cmumosei_dataset["highlevel"].align("All Labels")
    #removing sentences which have missing modality information
    cmumosei_dataset["highlevel"].hard_unify()

    #writing the final aligned to disk
    deploy(cmumosei_dataset["highlevel"], "final_aligned")

    #reading from the disk - if the above process is done.
    #cmumosei_dataset["highlevel"]=mmdatasdk.mmdataset("final_aligned")

    #getting the final tensors for machine learning - pass the folds to this function to get data based on tr,va,te folds.
    tensors = cmumosei_dataset["highlevel"].get_tensors(
        seq_len=50,
        non_sequences=["All Labels"],
        direction=False,
        folds=[
            mmdatasdk.cmu_mosei.standard_folds.standard_train_fold,
            mmdatasdk.cmu_mosei.standard_folds.standard_valid_fold,
            mmdatasdk.cmu_mosei.standard_folds.standard_test_fold
        ])

    fold_names = ["train", "valid", "test"]

    for i in range(3):
        #output the shape of the tensors
        for csd in list(cmumosei_dataset["highlevel"].keys()):
            print("Shape of the %s computational sequence for %s fold is %s" %
                  (csd, fold_names[i], tensors[i][csd].shape))
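A short sketch of how the get_tensors output above is typically consumed; tensors is a list with one dict per fold, keyed by computational sequence name. The shapes are assumptions following the seq_len=50 call.

#sketch - continues from the tensors variable inside process_data
train, valid, test = tensors
X_train = train["glove_vectors"]   #roughly (num_train_segments, 50, feature_dim)
y_train = train["All Labels"]      #non-sequence: one label block per segment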
	def deploy(self,destination):
		self.completeAllMissingMetadata()
		self._checkIntegrity()
		log.status("Deploying the <%s> computational sequence to %s"%(destination,self.metadata['root name']))
		#generating the unique identifiers
		self.metadata['uuid']=uuid.uuid4()
		#TODO: add SHA256 check + modification should not be possible without private key
		self.metadata['md5']=None
		log.status("Your unique identifier for <%s> computational sequence is %s"%(self.metadata["root name"],self.metadata['uuid']))
		writeCSD(self.data,self.metadata,self.metadata["root name"],destination)
		self.mainFile=destination
Example #12
	def deploy(self,destination,compression="gzip",compression_opts=9,full_chunk_shape=True):
		self.complete_all_missing_metadata()
		self.__check_format()
		log.status("Deploying the <%s> computational sequence to %s"%(destination,self.metadata['root name']))
		#generating the unique identifiers
		self.metadata['uuid']=str(uuid.uuid4())
		#TODO: add SHA256 check + modification should not be possible without private key
		self.metadata['md5']=None
		log.status("Your unique identifier for <%s> computational sequence is %s"%(self.metadata["root name"],self.metadata['uuid']))
		write_CSD(self.data,self.metadata,self.metadata["root name"],destination,compression=compression,compression_opts=compression_opts,full_chunk_shape=full_chunk_shape)
		self.main_file=destination
    def get_relevant_entries(self, reference):
        '''
        Load every non-reference entry in the dataset into a dictionary keyed
        by stripped (video-level) id. On the full dataset this took under two
        minutes to run.
        '''

        relevant_entries = {}
        relevant_entries_np = {}

        #otherseq_key: OpenFace, wordvec, etc
        for otherseq_key in set(list(
                self.computational_sequences.keys())) - set([reference]):
            relevant_entries[otherseq_key] = {}
            relevant_entries_np[otherseq_key] = {}
            sub_compseq = self.computational_sequences[otherseq_key]
            # for some_id in all video ids
            for key in list(sub_compseq.data.keys()):
                keystripped = key.split('[')[0]
                if keystripped not in relevant_entries[otherseq_key]:
                    relevant_entries[otherseq_key][keystripped] = {}
                    relevant_entries[otherseq_key][keystripped][
                        "intervals"] = []
                    relevant_entries[otherseq_key][keystripped][
                        "features"] = []

                relev_intervals = self.computational_sequences[
                    otherseq_key].data[key]["intervals"]
                relev_features = self.computational_sequences[
                    otherseq_key].data[key]["features"]
                if len(relev_intervals.shape) < 2:
                    relev_intervals = relev_intervals[None, :]
                    relev_features = relev_features[None, :]

                relevant_entries[otherseq_key][keystripped][
                    "intervals"].append(relev_intervals)
                relevant_entries[otherseq_key][keystripped]["features"].append(
                    relev_features)

            for key in list(relevant_entries[otherseq_key].keys()):
                relev_intervals_np = np.concatenate(
                    relevant_entries[otherseq_key][key]["intervals"], axis=0)
                relev_features_np = np.concatenate(
                    relevant_entries[otherseq_key][key]["features"], axis=0)
                sorted_indices = sorted(range(relev_intervals_np.shape[0]),
                                        key=lambda x: relev_intervals_np[x, 0])
                relev_intervals_np = relev_intervals_np[sorted_indices, :]
                relev_features_np = relev_features_np[sorted_indices, :]

                relevant_entries_np[otherseq_key][key] = {}
                relevant_entries_np[otherseq_key][key][
                    "intervals"] = relev_intervals_np
                relevant_entries_np[otherseq_key][key][
                    "features"] = relev_features_np
            log.status("Pre-alignment done for <%s> ..." % otherseq_key)
        return relevant_entries_np
    def get_relevant_entries(self, reference):
        relevant_entries = {}
        relevant_entries_np = {}


        #otherseq_key: OpenFace, wordvec, etc
        for otherseq_key in set(list(
                self.computational_sequences.keys())) - set([reference]):
            relevant_entries[otherseq_key] = {}
            relevant_entries_np[otherseq_key] = {}
            sub_compseq = self.computational_sequences[otherseq_key]
            # for some_id in all video ids
            for key in list(sub_compseq.data.keys()):
                keystripped = key.split('[')[0]
                if keystripped not in relevant_entries[otherseq_key]:
                    relevant_entries[otherseq_key][keystripped] = {}
                    relevant_entries[otherseq_key][keystripped][
                        "intervals"] = []
                    relevant_entries[otherseq_key][keystripped][
                        "features"] = []

                relev_intervals = self.computational_sequences[
                    otherseq_key].data[key]["intervals"]
                relev_features = self.computational_sequences[
                    otherseq_key].data[key]["features"]
                if len(relev_intervals.shape) < 2:
                    relev_intervals = relev_intervals[None, :]
                    relev_features = relev_features[None, :]

                relevant_entries[otherseq_key][keystripped][
                    "intervals"].append(relev_intervals)
                relevant_entries[otherseq_key][keystripped]["features"].append(
                    relev_features)

            for key in list(relevant_entries[otherseq_key].keys()):
                relev_intervals_np = numpy.concatenate(
                    relevant_entries[otherseq_key][key]["intervals"], axis=0)
                relev_features_np = numpy.concatenate(
                    relevant_entries[otherseq_key][key]["features"], axis=0)
                sorted_indices = sorted(range(relev_intervals_np.shape[0]),
                                        key=lambda x: relev_intervals_np[x, 0])
                relev_intervals_np = relev_intervals_np[sorted_indices, :]
                relev_features_np = relev_features_np[sorted_indices, :]

                relevant_entries_np[otherseq_key][key] = {}
                relevant_entries_np[otherseq_key][key][
                    "intervals"] = relev_intervals_np
                relevant_entries_np[otherseq_key][key][
                    "features"] = relev_features_np
            log.status("Pre-alignment done for <%s> ..." % otherseq_key)
        return relevant_entries_np
	def align(self,reference,collapse_functions=None,replace=True):
		aligned_output={}

		for sequence_name in self.computational_sequences.keys():
			aligned_output[sequence_name]={}
		if reference not in self.computational_sequences.keys():
			log.error("Computational sequence <%s> does not exist in dataset"%reference,error=True)
		refseq=self.computational_sequences[reference].data
		#unifying the dataset, removing any entries that are not in the reference computational sequence
		self.unify()

		#building the relevant entries to the reference - what we do in this section is simply removing all the [] from the entry ids and populating them into a new dictionary
		log.status("Pre-alignment based on <%s> computational sequence started ..."%reference)
		relevant_entries=self.__get_relevant_entries(reference)
		log.status("Alignment starting ...")

		pbar = tqdm(total=len(refseq.keys()),unit=" Computational Sequence Entries",leave=False)
		pbar.set_description("Overall Progress")
		for entry_key in list(refseq.keys()):
			pbar_small=tqdm(total=refseq[entry_key]['intervals'].shape[0],unit=" Segments",leave=False)
			pbar_small.set_description("Aligning %s"%entry_key)
			for i in range(refseq[entry_key]['intervals'].shape[0]):
				#interval for the reference sequence
				ref_time=refseq[entry_key]['intervals'][i,:]
				#we drop zero or very small sequence lengths - no align for those
				if (abs(ref_time[0]-ref_time[1])<epsilon):
					pbar_small.update(1)
					continue

				#aligning all sequences (including ref sequence) to ref sequence
				for otherseq_key in list(self.computational_sequences.keys()):
					if otherseq_key != reference:
						intersects,intersects_features=self.__intersect_and_copy(ref_time,relevant_entries[otherseq_key][entry_key],epsilon)
					else:
						intersects,intersects_features=refseq[entry_key]['intervals'][i,:][None,:],refseq[entry_key]['features'][i,:][None,:]
					#there were no intersections between reference and subject computational sequences for the entry
					if intersects.shape[0] == 0:
						continue
					#collapsing according to the provided functions
					if type(collapse_functions) is list:
						intersects,intersects_features=self.__collapse(intersects,intersects_features,collapse_functions)
					if(intersects.shape[0]!=intersects_features.shape[0]):
						log.error("Dimension mismatch between intervals and features when aligning <%s> computational sequences to <%s> computational sequence"%(otherseq_key,reference),error=True)
					aligned_output[otherseq_key][entry_key+"[%d]"%i]={}
					aligned_output[otherseq_key][entry_key+"[%d]"%i]["intervals"]=intersects
					aligned_output[otherseq_key][entry_key+"[%d]"%i]["features"]=intersects_features
				pbar_small.update(1)
			pbar_small.close()
			pbar.update(1)
		pbar.close()
		log.success("Alignment to <%s> complete."%reference)
		if replace:
			log.status("Replacing dataset content with aligned computational sequences")
			self.__set_computational_sequences(aligned_output)
			return None
		else:
			log.status("Creating new dataset with aligned computational sequences")
			newdataset=mmdataset({})
			newdataset.__set_computational_sequences(aligned_output,metadata_copy=False)
			return newdataset
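A usage sketch for the collapse hook above: each collapse function receives the intersected intervals and features for one reference interval and returns collapsed features. The averaging callable below follows the (intervals, features) convention the SDK examples use; the dataset variable is assumed from the earlier sketches.

#usage sketch - myavg follows the (intervals, features) -> features convention
import numpy

def myavg(intervals, features):
    #collapse a variable-length chunk into a single averaged row
    return numpy.average(features, axis=0)

dataset.align("All Labels", collapse_functions=[myavg])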
Example #16
	def complete_all_missing_metadata(self):

		missings=[mtd for mtd in featuresetMetadataTemplate if mtd not in self.metadata.keys()]
		#python2 vs python 3
		#TODO: Add read from file
		root_name_ext=''
		if hasattr(self,"root_name"):
			root_name_ext=" for <%s> computational sequence"%self.root_name
		for missing in missings:
			self.metadata[missing]=log.status("Please input %s%s: "%(missing,root_name_ext),require_input=True)
Example #17
    def revert(self, replace=True):
        reverted_dataset = {x: {} for x in self.keys()}
        log.status("Revert was called ...")
        if len(self.keys()) == 0:
            log.error(
                "The dataset contains no computational sequences ... Exiting!",
                error=True)
        self.unify()
        all_keys = self[list(self.keys())[0]].keys()
        if len(all_keys) == 0:
            log.error(
                "No entries in computational sequences or unify found no shared entries ... Exiting!",
                error=True)

        unique_unnumbered_entries = {}

        for key in all_keys:
            if key.split('[')[0] not in unique_unnumbered_entries:
                unique_unnumbered_entries[key.split('[')[0]] = []
            unique_unnumbered_entries[key.split('[')[0]].append(
                int(key.split('[')[1][:-1]))

        pbar = tqdm(total=len(unique_unnumbered_entries.keys()),
                    unit=" Unique Sequence Entries",
                    leave=False)
        pbar.set_description("Reversion Progress")
        for key in unique_unnumbered_entries.keys():
            unique_unnumbered_entries[key].sort()
            for cs_key in reverted_dataset.keys():
                intervals = numpy.concatenate([
                    self[cs_key][str('%s[%d]' % (key, i))]["intervals"]
                    for i in unique_unnumbered_entries[key]
                ],
                                              axis=0)
                features = numpy.concatenate([
                    self[cs_key][str('%s[%d]' % (key, i))]["features"]
                    for i in unique_unnumbered_entries[key]
                ],
                                             axis=0)
                reverted_dataset[cs_key][key] = {
                    "intervals": intervals,
                    "features": features
                }
            pbar.update(1)
        pbar.close()
        log.success("Reversion completed ...")
        if replace is True:
            log.status(
                "Replacing dataset content with reverted computational sequences"
            )
            self.__set_computational_sequences(reverted_dataset)
            return None
        else:
            log.status(
                "Creating new dataset with reverted computational sequences")
            newdataset = mmdataset({})
            newdataset.__set_computational_sequences(reverted_dataset,
                                                     metadata_copy=False)
            return newdataset
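A usage sketch for revert, assuming the dataset was previously aligned so entry ids carry [n] suffixes; segments are concatenated back into whole-video entries in interval order.

#usage sketch - requires segment ids like "video1[3]" from a prior align
unaligned = dataset.revert(replace=False)  #build a new, unsegmented dataset
#dataset.revert()                          #or replace the contents in place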
def readURL(url,destination):
	if destination is None:
		log.error("Destination is not specified when downloading data",error=True)
	if(os.path.isfile(destination)):
		log.error("%s file already exists ..."%destination,error=True)
	r = requests.get(url, stream=True)
	if r.status_code != 200:
		log.error('URL: %s does not exist'%url,error=True) 
	# Total size in bytes.
	total_size = int(r.headers.get('content-length', 0))
	block_size = 1024
	wrote = 0
	with open(destination, 'wb') as f:
		log.status("Downloading from %s to %s..."%(url,destination))
		for data in tqdm(r.iter_content(block_size), total=math.ceil(total_size/block_size), unit='KB', unit_scale=True):
			wrote = wrote + len(data)
			f.write(data)
	if total_size != 0 and wrote != total_size:
		log.error("Error downloading the data ...")
	log.success("Download complete!")
	return True
Example #19
	def align(self,reference,collapse_functions=None,replace=True):
		aligned_output={}

		for sequence_name in self.computational_sequences.keys():
			aligned_output[sequence_name]={}
		if reference not in self.computational_sequences.keys():
			log.error("Computational sequence <%s> does not exist in dataset"%reference,error=True)
		refseq=self.computational_sequences[reference].data
		#this for loop is for entry_key - for example video id or the identifier of the data entries
		log.status("Alignment based on <%s> computational sequence started ..."%reference)
		self.__unify_dataset()

		pbar = tqdm(total=len(refseq.keys()),unit=" Computational Sequence Entries",leave=False)
		pbar.set_description("Overall Progress")
		for entry_key in list(refseq.keys()):
			pbar_small=tqdm(total=refseq[entry_key]['intervals'].shape[0],unit=" Segments",leave=False)
			pbar_small.set_description("Aligning %s"%entry_key)
			for i in range(refseq[entry_key]['intervals'].shape[0]):
				#interval for the reference sequence
				ref_time=refseq[entry_key]['intervals'][i,:]
				#we drop zero or very small sequence lengths - no align for those
				if (abs(ref_time[0]-ref_time[1])<epsilon):
					pbar_small.update(1)
					continue

				#aligning all sequences (including ref sequence) to ref sequence
				for otherseq_key in list(self.computational_sequences.keys()):
					if entry_key.split('[')[0] not in self.computational_sequences[otherseq_key]._get_entries_stripped():
						log.error("The dataset does not have unified entry ids across computational sequences. Please call intersect first ...")
					if otherseq_key != reference:
						intersects,intersects_features=self.__intersect_and_copy(entry_key,ref_time,self.computational_sequences[otherseq_key],epsilon)
					else:
						intersects,intersects_features=refseq[entry_key]['intervals'][i,:][None,:],refseq[entry_key]['features'][i,:][None,:]
					#there were no intersections between reference and subject computational sequences for the entry
					if intersects.shape[0] == 0:
						continue
					#collapsing according to the provided functions
					if type(collapse_functions) is list:
						intersects,intersects_features=self.__collapse(intersects,intersects_features,collapse_functions)
					if(intersects.shape[0]!=intersects_features.shape[0]):
						log.error("Dimension mismatch between intervals and features when aligning <%s> computational sequences to <%s> computational sequence"%(otherseq_key,reference))
					aligned_output[otherseq_key][entry_key+"[%d]"%i]={}
					aligned_output[otherseq_key][entry_key+"[%d]"%i]["intervals"]=intersects
					aligned_output[otherseq_key][entry_key+"[%d]"%i]["features"]=intersects_features
				pbar_small.update(1)
			pbar_small.close()
			pbar.update(1)
		pbar.close()
		log.success("Alignment to <%s> complete."%reference)
		if replace:
			log.status("Replacing dataset content with aligned computational sequences")
			self.__set_computational_sequences(aligned_output)
			return None
		else:
			log.status("Creating new dataset with aligned computational sequences")
			newdataset=mmdataset({})
			newdataset.__set_computational_sequences(aligned_output)
			return newdataset
def process_data(folders=["cmumosei_highlevel", "cmumosei_labels"]):
    log.status(
        "You only need to run this script once. CMU-MOSEI processing requires a combination of 300GB in RAM and swap combined. As optimized as the process is, it may take up to a day to finish."
    )
    log.status(
        "Alternatively, you can send us your computational sequences to align for you if you don't have enough computational power for alignment."
    )
    log.status(
        "The standard aligned features are available on the challenge github.")

    cmumosei_challenge_acl20 = {}
    for folder in folders:
        cmumosei_challenge_acl20[folder.split("_")[1]] = mmdatasdk.mmdataset(
            folder)

    #performs word alignment. Labels are not part of the word alignment process.
    cmumosei_challenge_acl20["highlevel"].align("glove_vectors")
    #replacing missing modality information for words - some words may experience failed COVAREP, etc.
    cmumosei_challenge_acl20["highlevel"].impute('glove_vectors')
    #this writes the word aligned computational sequences to the disk
    deploy(cmumosei_challenge_acl20["highlevel"], "word_aligned_highlevel")
    #if you want to load the word aligned from the disk, comment out the lines for align and impute, and uncomment the line below.
    #cmumosei_challenge_acl20["highlevel"]=mmdatasdk.mmdataset("word_aligned_highlevel")

    #now aligning to the labels - first adding labels to the dataset
    cmumosei_challenge_acl20["highlevel"].computational_sequences[
        "Emotion Labels"] = cmumosei_challenge_acl20["labels"][
            "Emotion Labels"]
    #the actual alignment without collapse function this time
    cmumosei_challenge_acl20["highlevel"].align("Emotion Labels")
    #removing sentences which have missing modality information
    cmumosei_challenge_acl20["highlevel"].hard_unify()

    #writing the final aligned to disk
    deploy(cmumosei_challenge_acl20["highlevel"], "final_aligned")

    #reading from the disk - if the above process is done.
    #cmumosei_challenge_acl20["highlevel"]=mmdatasdk.mmdataset("final_aligned")

    #getting the final tensors for machine learning - pass the folds to this function to get data based on tr,va,te folds.
    tensors = cmumosei_challenge_acl20["highlevel"].get_tensors(
        seq_len=50,
        non_sequences=["Emotion Labels"],
        direction=False,
        folds=[
            mmdatasdk.cmu_mosei.standard_folds.standard_train_fold,
            mmdatasdk.cmu_mosei.standard_folds.standard_valid_fold,
            mmdatasdk.cmu_mosei.standard_folds.standard_test_fold
        ])

    fold_names = ["train", "valid", "test"]

    for i in range(3):
        #output the shape of the tensors
        for csd in list(cmumosei_challenge_acl20["highlevel"].keys()):
            print("Shape of the %s computational sequence for %s fold is %s" %
                  (csd, fold_names[i], tensors[i][csd].shape))
Example #21
    def unify(self, active=True):
        log.status("Unify was called ...")


        all_keys = {}
        for seq_key in list(self.computational_sequences.keys()):
            all_keys[seq_key] = [
                vidid.split("[")[0]
                for vidid in self.computational_sequences[seq_key].data.keys()
            ]

        valids = set.intersection(*[set(all_keys[x]) for x in all_keys])
        violators = set()
        for seq_key in list(self.computational_sequences.keys()):
            violators = violators.union(
                set([
                    vidid.split("[")[0] for vidid in
                    self.computational_sequences[seq_key].data.keys()
                ]) - valids)

        if len(violators) > 0:
            for violator in violators:
                log.error(
                    "%s entry is not shared among all sequences, removing it ..."
                    % violator,
                    error=False)
                if active:
                    self.remove_id(violator, purge=True)
        if not active and len(violators) > 0:
            log.error(
                "%d violators remain, alignment will fail if called ..." %
                len(violators),
                error=True)

        log.success("Unify completed ...")
def read_URL(url, destination):
    if destination is None:
        log.error("Destination is not specified when downloading data",
                  error=True)

    dest_dir = os.path.dirname(destination)
    if dest_dir and not os.path.isdir(dest_dir):
        os.mkdir(dest_dir)

    if os.path.isfile(destination):
        log.error("%s file already exists ..." % destination, error=True)

    r = requests.get(url, stream=True)
    if r.status_code != 200:
        log.error('URL: %s does not exist' % url, error=True)
    # Total size in bytes.
    total_size = int(r.headers.get('content-length', 0))
    block_size = 1024
    wrote = 0
    with open(destination, 'wb') as f:
        log.status("Downloading from %s to %s..." % (url, destination))
        pbar = log.progress_bar(total=math.ceil(total_size / block_size),
                                data=r.iter_content(block_size),
                                postfix="Total in kBs",
                                unit='kB',
                                leave=False)
        for data in pbar:
            wrote = wrote + len(data)
            f.write(data)
    pbar.close()

    if total_size != 0 and wrote != total_size:
        log.error("Error downloading the data to %s ..." % destination,
                  error=True)

    log.success("Download complete!")
    return True
def write_CSD(data,metadata,rootName,destination,compression,compression_opts,full_chunk_shape):

	log.status("Writing the <%s> computational sequence data to %s"%(rootName,destination))
	if compression is not None:
		log.advise("Compression with %s and opts %d"%(compression,compression_opts))
	#opening the file
	writeh5Handle=h5py.File(destination,'w')
	#creating the root handle
	rootHandle=writeh5Handle.create_group(rootName)

	#writing the data
	dataHandle=rootHandle.create_group("data")
	pbar = log.progress_bar(total=len(data.keys()),unit=" Computational Sequence Entries",leave=False)
	for vid in data:
		vidHandle=dataHandle.create_group(vid)
		if compression is not None:
			vidHandle.create_dataset("features",data=data[vid]["features"],compression=compression,compression_opts=compression_opts)
			vidHandle.create_dataset("intervals",data=data[vid]["intervals"],compression=compression,compression_opts=compression_opts)
		else:
			vidHandle.create_dataset("features",data=data[vid]["features"])
			vidHandle.create_dataset("intervals",data=data[vid]["intervals"])
			
		pbar.update(1)
	pbar.close()
	log.success("<%s> computational sequence data successfully wrote to %s"%(rootName,destination))
	log.status("Writing the <%s> computational sequence metadata to %s"%(rootName,destination))
	#writing the metadata
	metadataHandle=rootHandle.create_group("metadata")
	for metadataKey in metadata.keys():
		metadataHandle.create_dataset(metadataKey,(1,),dtype=h5py.special_dtype(vlen=unicode) if sys.version_info.major == 2 else h5py.special_dtype(vlen=str))
		cast_operator=unicode if sys.version_info.major == 2 else str
		metadataHandle[metadataKey][0]=cast_operator(json.dumps(metadata[metadataKey]))
	writeh5Handle.close()

	log.success("<%s> computational sequence metadata successfully wrote to %s"%(rootName,destination))
	log.success("<%s> computational sequence successfully wrote to %s ..."%(rootName,destination))
    def upsampling_and_save(self,
                            reference,
                            id_idx,
                            collapse_function=None,
                            epsilon=10e-6):
        folder = '/data/mifs_scratch/yw454/cmumosei_aligned'
        #not_enough_label_file = './mosei_notenough_lable_videos.txt'

        ##self.computational_sequences.keys are COVAREP, OpenFace, WordVec, etc
        #for sequence_name in self.computational_sequences.keys():
        #    #init a dictionary to store different features separately
        #    aligned_output[sequence_name]={}

        if reference not in self.computational_sequences.keys():
            log.error("Computational sequence <%s> does not exist in dataset" %
                      reference,
                      error=True)

        modality = list(self.computational_sequences.keys())
        support = ['COVAREP', 'WordVec']
        for m in modality:
            if m not in support:
                raise ValueError('feature type not supported {}'.format(m))

        #get data of reference feature
        refseq = self.computational_sequences[reference].data
        #unifying the dataset, removing any entries that are not in the reference computational sequence
        self.unify()

        #building the relevant entries to the reference - what we do in this section is simply removing all the [] from the entry ids and populating them into a new dictionary
        log.status(
            "Pre-alignment based on <%s> computational sequence started ..." %
            reference)

        relevant_entries = self.get_relevant_entries(reference)
        log.status("Alignment starting ...")

        pbar = log.progress_bar(total=len(refseq.keys()),
                                unit=" Computational Sequence Entries",
                                leave=False)
        pbar.set_description("Overall Progress")
        # for some_id in all video ids
        for entry_key in list(refseq.keys()):
            not_enough_label = False

            if entry_key not in ALL_VIDEO:
                continue

            if entry_key in id_idx:
                stored_idx = id_idx.index(entry_key)
                if stored_idx <= 2132:
                    #if stored_idx != 1781:
                    continue

            video_code = id_idx.index(entry_key)
            video_code = str(video_code).zfill(6)
            for otherseq_key in list(self.computational_sequences.keys()):
                if otherseq_key == reference:
                    # save reference (COVAREP) data
                    processed_feature = refseq[entry_key]['features'][:, :]
                else:
                    #save upsampled (wordvec) data
                    processed_feature = self.upsampling(
                        relevant_entries[otherseq_key][entry_key], entry_key)

                save_htk_format(processed_feature, otherseq_key, folder,
                                video_code)

                print('alignment saved for video {} feature {}.'.format(
                    video_code, otherseq_key))

            pbar.update(1)
        pbar.close()
    def align_upsampling_and_save(self,
                                  reference,
                                  id_idx,
                                  collapse_function=None,
                                  epsilon=10e-6):
        folder = '/data/mifs_scratch/yw454/cmumosei_aligned'
        log_file = './mosei_alignment_log.txt'
        #aligned_output = {}
        count = 0

        ##self.computational_sequences.keys are COVAREP, OpenFace, WordVec, etc
        #for sequence_name in self.computational_sequences.keys():
        #    #init a dictionary to store different features separately
        #    aligned_output[sequence_name]={}

        if reference not in self.computational_sequences.keys():
            log.error("Computational sequence <%s> does not exist in dataset" %
                      reference,
                      error=True)

        #get data of reference feature
        refseq = self.computational_sequences[reference].data
        #unifying the dataset, removing any entries that are not in the reference computational sequence
        self.unify()

        #building the relevant entries to the reference - what we do in this section is simply removing all the [] from the entry ids and populating them into a new dictionary
        log.status(
            "Pre-alignment based on <%s> computational sequence started ..." %
            reference)

        relevant_entries = self.get_relevant_entries(reference)
        log.status("Alignment starting ...")

        pbar = log.progress_bar(total=len(refseq.keys()),
                                unit=" Computational Sequence Entries",
                                leave=False)
        pbar.set_description("Overall Progress")
        # for some_id in all video ids
        for entry_key in list(refseq.keys()):

            if entry_key in id_idx:
                stored_idx = id_idx.index(entry_key)
                #if stored_idx < 104 or (stored_idx > 104 and stored_idx < 1781):
                if stored_idx < 1781 or stored_idx == 1815:
                    continue

            all_intersects = {}
            all_intersect_features = {}

            #for sequence_name in self.computational_sequences.keys():
            #    all_intersects[sequence_name] = []
            #    all_intersect_features[sequence_name] = []

            ref_all = refseq[entry_key]['intervals']

            #aligning all sequences to ref sequence (previous: align refer to refer as well, now: not include refer)
            #otherseq_key: other features; entry_key: some video id

            for otherseq_key in list(self.computational_sequences.keys()):
                if otherseq_key != reference:
                    feature_info = 'reference: {}, other feature {}, video id: {}'.format(
                        reference, otherseq_key, entry_key)
                    intersects, intersects_features = self.intersect_and_copy_upsampling(
                        ref_all, relevant_entries[otherseq_key][entry_key],
                        epsilon, log_file, feature_info)
                else:
                    intersects, intersects_features = refseq[entry_key][
                        'intervals'][:, :], refseq[entry_key]['features'][:, :]

                #print(type(intersects[0]))
                #print(type(intersects_features[0]))
                #print(len(intersects[0]))
                #print(len(intersects_features[0]))
                all_intersects[otherseq_key] = intersects
                all_intersect_features[otherseq_key] = intersects_features

            #save features per video
            for sequence_name in self.computational_sequences.keys():
                video_code = id_idx.index(entry_key)
                video_code = str(video_code).zfill(6)

                save_htk_format(all_intersect_features[sequence_name],
                                sequence_name, folder, video_code)
                save_intervals(all_intersects[sequence_name], sequence_name,
                               folder, video_code)
                print('alignment saved for video {} feature {}.'.format(
                    video_code, sequence_name))

            pbar.update(1)
        pbar.close()
Example #26
	def align(self,reference,replace=True):
		aligned_output={}
		for sequence_name in self.computational_sequences.keys():
			aligned_output[sequence_name]={}
		if reference not in self.computational_sequences.keys():
			log.error("Computational sequence %s does not exist in dataset"%reference,error=True)
		refseq=self.computational_sequences[reference].data
		#this for loop is for entry_key - for example video id or the identifier of the data entries
		log.status("Alignment based on %s computational sequence started ..."%reference)
		pbar = tqdm(total=len(refseq.keys()),unit=" Computational Sequence Entries")
		pbar.set_description("Overall Progress")
		for entry_key in list(refseq.keys()):
			pbar_small=tqdm(total=refseq[entry_key]['intervals'].shape[0])
			pbar_small.set_description("Aligning %s"%entry_key)
			#intervals for the reference sequence
			for i in range(refseq[entry_key]['intervals'].shape[0]):
				ref_time=refseq[entry_key]['intervals'][i,:]
				if (abs(ref_time[0]-ref_time[1])<epsilon):
					pbar_small.update(1)
					continue
				#aligning all sequences (including ref sequence) to ref sequence
				for otherseq_key in list(self.computational_sequences.keys()):
					otherseq=self.computational_sequences[otherseq_key].data[entry_key]
					#list to contain intersection for (otherseq_key,i)
					list_intervals=[]
					list_features=[]
					#checking all intervals of the otherseq for intersection
					for j in range(otherseq["intervals"].shape[0]):
						sub_time=otherseq["intervals"][j]
						this_features=otherseq["features"][j,:]
						intersect,intersect_start,intersect_end=self.__intersect(ref_time,sub_time)
						if intersect:
							list_intervals.append([intersect_start,intersect_end])
							list_features.append(this_features)
					
					aligned_output[otherseq_key][entry_key+"[%d]"%i]={}
					aligned_output[otherseq_key][entry_key+"[%d]"%i]["intervals"]=numpy.array(list_intervals,dtype='float32')
					aligned_output[otherseq_key][entry_key+"[%d]"%i]["features"]=numpy.array(list_features,dtype='float32')
					if len(aligned_output[otherseq_key][entry_key+"[%d]"%i]["intervals"].shape)!=2:
						log.error("Unexpected intervals shape while aligning <%s> ..."%entry_key,error=False)
						print (aligned_output[otherseq_key][entry_key+"[%d]"%i]["intervals"].shape)
						print (aligned_output[otherseq_key][entry_key+"[%d]"%i]["features"].shape)
						print (ref_time,i)
						print (refseq[entry_key]['features'][i,:].shape)
						time.sleep(10)
				pbar_small.update(1)
			pbar_small.visible=False
			pbar_small.close()
			pbar.update(1)
		pbar.visible=False
		pbar.close()
		log.success("Alignment to %s done."%reference)
		if replace is True:
			log.status("Replacing dataset content with aligned computational sequences")
			self.__set_computational_sequences(aligned_output)
			return None
		else:
			log.status("Creating new dataset with aligned computational sequences")
			newdataset=mmdataset({})
			newdataset.__set_computational_sequences(aligned_output)
			return newdataset