def validateMetadataIntegrity(metadata, rootName, which=True):
    log.status("Checking the integrity of the metadata in <%s> computational sequence ..." % rootName)
    failure = False
    if type(metadata) is not dict:
        log.error("<%s> computational sequence metadata is not key-value pairs!" % rootName, error=True)
    presenceFlag = [mtd in metadata.keys() for mtd in featuresetMetadataTemplate]
    # check if all the metadata is set
    if all(presenceFlag) is False:
        # report which ones are not set
        if which:
            missings = [x for (x, y) in zip(featuresetMetadataTemplate, presenceFlag) if y is False]
            log.error("Missing metadata in <%s> computational sequence: %s" % (rootName, str(missings)), error=False)
        failure = True
    # if failed before
    if failure:
        log.error(msgstring="<%s> computational sequence does not have all the required metadata ..." % rootName, error=True)
    else:
        log.success("<%s> computational sequence metadata in correct format" % rootName)
    return True
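# A minimal usage sketch for validateMetadataIntegrity, assuming this module's
# featuresetMetadataTemplate list and log utilities are in scope; the root name
# "myfeatures" and the placeholder values are hypothetical.
def _example_validate_metadata():
    example_metadata = {key: "placeholder" for key in featuresetMetadataTemplate}
    # passes: every required key from the template is present
    validateMetadataIntegrity(example_metadata, "myfeatures")
    # with which=True (the default), missing keys are listed before the hard error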
def hard_unify(self, active=True):
    log.status("Hard unify was called ...")
    all_keys = {}
    for seq_key in list(self.computational_sequences.keys()):
        all_keys[seq_key] = [vidid for vidid in self.computational_sequences[seq_key].data.keys()]
    # segment entries shared by every computational sequence
    valids = set.intersection(*[set(all_keys[x]) for x in all_keys])
    violators = []
    for seq_key in list(self.computational_sequences.keys()):
        # hard unify only works on aligned sequences, whose entry ids carry a [segment] suffix
        hard_unify_compatible = all(["[" in vidid for vidid in self.computational_sequences[seq_key].data.keys()])
        if hard_unify_compatible is False:
            log.error("Hard unify can only be done on aligned computational sequences, %s violated this ... Exiting ..." % seq_key)
        violators = set([vidid for vidid in self.computational_sequences[seq_key].data.keys()]) - valids
        for violator in violators:
            if active == True:
                log.error("%s entry is not shared among all sequences, removing it ..." % violator, error=False)
                self[seq_key]._remove_id(violator, purge=False)
    if active == False and len(violators) > 0:
        log.error("%d violators remain, alignment will fail if called ..." % len(violators), error=True)
    log.success("Hard unify completed ...")
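# A hedged sketch of when hard_unify is typically called: after align(), whose
# output keys look like "videoid[3]". The recipe file names are hypothetical.
def _example_hard_unify():
    recipe = {"glove_vectors": "glove_vectors.csd", "COVAREP": "COVAREP.csd"}
    dataset = mmdataset(recipe)
    dataset.align("glove_vectors")       # produces per-segment keys like "videoid[3]"
    dataset.hard_unify()                 # active=True removes non-shared segments in place
    dataset.hard_unify(active=False)     # only reports violators, removes nothing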
def readURL(url, destination):
    if destination is None:
        log.error("Destination is not specified when downloading data", error=True)
    # TODO: replace the split of destination with a cross-OS compatible operation
    if os.path.isdir(destination.rsplit('/', 1)[-2]) is False:
        os.mkdir(destination.rsplit('/', 1)[-2])
    if os.path.isfile(destination):
        log.success("File already downloaded, use the old file")
    else:
        r = requests.get(url, stream=True)
        if r.status_code != 200:
            log.error('URL: %s does not exist' % url, error=True)
        # total size in bytes
        total_size = int(r.headers.get('content-length', 0))
        block_size = 1024
        wrote = 0
        with open(destination, 'wb') as f:
            log.status("Downloading from %s to %s..." % (url, destination))
            for data in tqdm(r.iter_content(block_size),
                             total=math.ceil(total_size / block_size),
                             unit='KB', unit_scale=True, leave=False):
                wrote = wrote + len(data)
                f.write(data)
        if total_size != 0 and wrote != total_size:
            log.error("Error downloading the data ...")
        log.success("Download complete!")
    return True
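# A minimal sketch of fetching a .csd file with readURL; the URL and the local
# path are hypothetical placeholders, not real hosted resources.
def _example_read_url():
    readURL("http://example.com/features/glove_vectors.csd",
            "downloads/glove_vectors.csd")  # skips the download if the file already exists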
def validate_metadata_format(metadata, root_name, verbose=True):
    log.status("Checking the format of the metadata in <%s> computational sequence ..." % root_name)
    failure = False
    if type(metadata) is not dict:
        log.error("<%s> computational sequence metadata is not key-value pairs!" % root_name, error=True)
    presenceFlag = [mtd in metadata.keys() for mtd in featuresetMetadataTemplate]
    # check if all the metadata is set
    if all(presenceFlag) is False:
        # if verbose, report which ones are not set
        if verbose:
            missings = [x for (x, y) in zip(featuresetMetadataTemplate, presenceFlag) if y is False]
            log.error("Missing metadata in <%s> computational sequence: %s" % (root_name, str(missings)), error=False)
        failure = True
    if failure:
        log.error(msgstring="<%s> computational sequence does not have all the required metadata ... continuing " % root_name, error=False)
        return False
    else:
        log.success("<%s> computational sequence metadata in correct format." % root_name)
        return True
def impute(self, ref_key, imputation_fn=numpy.zeros):
    log.status("Imputation called ...")
    other_keys = list(self.keys())
    other_keys.remove(ref_key)
    # feature dimensionality of every non-reference sequence, read off its first entry
    other_keys_dims = {
        x: list(self[x][list(self[x].keys())[0]]["features"].shape[1:])
        for x in other_keys
    }
    pbar = tqdm(total=len(self[ref_key].keys()), unit=" Reference Computational Sequence Entries", leave=False)
    pbar.set_description("Imputation Progress")
    for seg_key in self[ref_key].keys():
        for other_key in other_keys:
            try:
                self[other_key][seg_key]
            except KeyError:
                # missing entry: borrow the reference intervals, fill features via imputation_fn
                self[other_key][seg_key] = {
                    "intervals": self[ref_key][seg_key]["intervals"],
                    "features": imputation_fn([1] + other_keys_dims[other_key])
                }
        pbar.update(1)
    pbar.close()
    log.success("Imputation completed ...")
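# A hedged sketch of impute: after word-level alignment, some segments may be
# missing in e.g. an acoustic sequence, and zero rows (the default) are dropped in.
# The sequence name "glove_vectors" is the usual reference in the MOSEI scripts.
def _example_impute(dataset):
    dataset.align("glove_vectors")
    dataset.impute("glove_vectors")              # fill gaps with numpy.zeros
    dataset.impute("glove_vectors", numpy.ones)  # or any fn taking a shape list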
def writeCSD(data, metadata, rootName, destination):
    # check the data to make sure it is in the correct format
    validateDataIntegrity(data, rootName)
    validateMetadataIntegrity(metadata, rootName)
    log.status("Writing the <%s> computational sequence data to %s" % (rootName, destination))
    # opening the file
    writeh5Handle = h5py.File(destination, 'w')
    # creating the root handle
    rootHandle = writeh5Handle.create_group(rootName)
    # writing the data
    dataHandle = rootHandle.create_group("data")
    pbar = tqdm(total=len(data.keys()), unit=" Computational Sequence Entries", leave=False)
    for vid in data:
        vidHandle = dataHandle.create_group(vid)
        vidHandle.create_dataset("features", data=data[vid]["features"])
        vidHandle.create_dataset("intervals", data=data[vid]["intervals"])
        pbar.update(1)
    pbar.close()
    log.success("<%s> computational sequence data successfully written to %s" % (rootName, destination))
    log.status("Writing the <%s> computational sequence metadata to %s" % (rootName, destination))
    # writing the metadata as variable-length strings (unicode on Python 2, str on Python 3)
    metadataHandle = rootHandle.create_group("metadata")
    for metadataKey in metadata.keys():
        metadataHandle.create_dataset(
            metadataKey, (1,),
            dtype=h5py.special_dtype(vlen=unicode) if sys.version_info.major == 2 else h5py.special_dtype(vlen=str))
        cast_operator = unicode if sys.version_info.major == 2 else str
        metadataHandle[metadataKey][0] = cast_operator(metadata[metadataKey])
    writeh5Handle.close()
    log.success("<%s> computational sequence metadata successfully written to %s" % (rootName, destination))
    log.success("<%s> computational sequence successfully written to %s ..." % (rootName, destination))
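# A minimal sketch of the dictionary shape writeCSD expects; the entry id,
# array shapes, and metadata values are hypothetical (the required metadata
# keys come from featuresetMetadataTemplate).
def _example_write_csd():
    data = {
        "video_0": {
            "intervals": numpy.array([[0.0, 0.5], [0.5, 1.0]]),  # (n, 2) start/end times
            "features": numpy.zeros((2, 300), dtype="float32"),  # (n, d) feature rows
        }
    }
    metadata = {key: "placeholder" for key in featuresetMetadataTemplate}
    metadata["root name"] = "myfeatures"
    writeCSD(data, metadata, "myfeatures", "myfeatures.csd")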
def __unify_dataset(self, active=True):
    log.status("Unify was called ...")
    all_vidids = {}
    violators = []
    for seq_key in list(self.computational_sequences.keys()):
        for vidid in list(self.computational_sequences[seq_key].data.keys()):
            vidid = vidid.split('[')[0]
            all_vidids[vidid] = True
    for vidid in list(all_vidids.keys()):
        for seq_key in list(self.computational_sequences.keys()):
            if not any([vidid_in_seq for vidid_in_seq in self.computational_sequences[seq_key].data.keys()
                        if vidid_in_seq[:len(vidid)] == vidid]):
                violators.append(vidid)
    if len(violators) > 0:
        for violator in violators:
            log.error("%s entry is not shared among all sequences, removing it ..." % violator, error=False)
            if active == True:
                self.__remove_id(violator)
    if active == False and len(violators) > 0:
        log.error("%d violators remain, alignment will fail if called ..." % len(violators), error=True)
    log.success("Unify finished, dataset is compatible for alignment ...")
def _checkIntegrity(self, error=True):
    if not hasattr(self, 'metadata') or not hasattr(self, 'data'):
        log.error("computational sequence is blank (data or metadata is missing)")
    log.status("Checking the integrity of the <%s> computational sequence ..." % self.metadata["root name"])
    # TODO: hash check not implemented yet
    datavalid = validateDataIntegrity(self.data, self.metadata["root name"], which=False)
    metadatavalid = validateMetadataIntegrity(self.metadata, self.metadata["root name"], which=False)
    if datavalid and metadatavalid:
        log.success("<%s> computational sequence is valid!" % self.metadata["root name"])
def validateDataIntegrity(data, rootName, which=True):
    log.status("Checking the integrity of the data in <%s> computational sequence ..." % rootName)
    failure = False
    if type(data) is not dict:
        # this will cause the rest of the pipeline to crash - RuntimeError
        log.error("<%s> computational sequence data is not in hierarchy format ..." % rootName, error=True)
    try:
        # for each video check the shapes of the intervals and features
        for vid in data.keys():
            # check the intervals first - on failure simply show a warning - no exit since we want to identify all the cases
            if len(data[vid]["intervals"].shape) != 2:
                if which:
                    log.error("Video <%s> in <%s> computational sequence has wrong intervals array shape. " % (vid, rootName), error=False)
                failure = True
            # check the features next
            if len(data[vid]["features"].shape) != 2:
                if which:
                    log.error("Video <%s> in <%s> computational sequence has wrong features array shape. " % (vid, rootName), error=False)
                failure = True
            # if the first dimension of intervals and features doesn't match
            if data[vid]["features"].shape[0] != data[vid]["intervals"].shape[0]:
                if which:
                    log.error("Video <%s> in <%s> computational sequence - features and intervals have different first dimensions. " % (vid, rootName), error=False)
                failure = True
    # some other thing has happened! - RuntimeError
    except Exception:
        if which:
            log.error("<%s> computational sequence data integrity could not be checked. " % rootName, error=True)
    # failure during intervals and features check
    if failure:
        log.error("<%s> computational sequence data integrity check failed due to inconsistency in intervals and features. " % rootName, error=True)
    else:
        log.success("<%s> computational sequence data in correct format." % rootName)
    return True
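# A small sketch of the shape rules validateDataIntegrity enforces; the entry
# id and array shapes are hypothetical.
def _example_data_integrity():
    good = {"vid": {"intervals": numpy.zeros((4, 2)), "features": numpy.zeros((4, 74))}}
    validateDataIntegrity(good, "myfeatures")  # passes: both arrays 2-D, first dims match
    bad = {"vid": {"intervals": numpy.zeros((4, 2)), "features": numpy.zeros((3, 74))}}
    validateDataIntegrity(bad, "myfeatures")   # warns per video, then hard-errors: 4 vs 3 rows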
def process_data(folders=["cmumosei_highlevel", "cmumosei_labels"]): log.status( "You can also download all the outputs of this code from here: http://immortal.multicomp.cs.cmu.edu/ACL20Challenge/" ) cmumosei_dataset = {} for folder in folders: cmumosei_dataset[folder.split("_")[1]] = mmdatasdk.mmdataset(folder) #performs word alignment. Labels are not part of the word alignment process. # cmumosei_dataset["highlevel"].align("glove_vectors") #replacing missing modality information for words - some words may experience failed COVAREP, etc. # cmumosei_dataset["highlevel"].impute('glove_vectors') #this writes the word aligned computational sequences to the disk deploy(cmumosei_dataset["highlevel"], "word_aligned_highlevel") #if you want to load the word aligned from the disk, comment out the lines for align and impute, and uncomment the line below. #----I am here ------- cmumosei_dataset["highlevel"] = mmdatasdk.mmdataset( "word_aligned_highlevel") #now aligning to the labels - first adding labels to the dataset cmumosei_dataset["highlevel"].computational_sequences[ "All Labels"] = cmumosei_dataset["labels"]["All Labels"] #the actual alignment without collapse function this time cmumosei_dataset["highlevel"].align("All Labels") #removing sentences which have missing modality information cmumosei_dataset["highlevel"].hard_unify() #writing the final aligned to disk deploy(cmumosei_dataset["highlevel"], "final_aligned") #reading from the disk - if the above process is done. #cmumosei_dataset["highlevel"]=mmdatasdk.mmdataset("final_aligned") #getting the final tensors for machine learning - pass the folds to this function to get data based on tr,va,te folds. tensors = cmumosei_dataset["highlevel"].get_tensors( seq_len=50, non_sequences=["All Labels"], direction=False, folds=[ mmdatasdk.cmu_mosei.standard_folds.standard_train_fold, mmdatasdk.cmu_mosei.standard_folds.standard_valid_fold, mmdatasdk.cmu_mosei.standard_folds.standard_test_fold ]) fold_names = ["train", "valid", "test"] for i in range(3): #output the shape of the tensors for csd in list(cmumosei_dataset["highlevel"].keys()): print("Shape of the %s computational sequence for %s fold is %s" % (csd, fold_names[i], tensors[i][csd].shape))
def deploy(self, destination):
    self.completeAllMissingMetadata()
    self._checkIntegrity()
    log.status("Deploying the <%s> computational sequence to %s" % (self.metadata['root name'], destination))
    # generating the unique identifiers
    self.metadata['uuid'] = uuid.uuid4()
    # TODO: add SHA256 check + modification should not be possible without private key
    self.metadata['md5'] = None
    log.status("Your unique identifier for <%s> computational sequence is %s" % (self.metadata["root name"], self.metadata['uuid']))
    writeCSD(self.data, self.metadata, self.metadata["root name"], destination)
    self.mainFile = destination
def deploy(self, destination, compression="gzip", compression_opts=9, full_chunk_shape=True):
    self.complete_all_missing_metadata()
    self.__check_format()
    log.status("Deploying the <%s> computational sequence to %s" % (self.metadata['root name'], destination))
    # generating the unique identifiers
    self.metadata['uuid'] = str(uuid.uuid4())
    # TODO: add SHA256 check + modification should not be possible without private key
    self.metadata['md5'] = None
    log.status("Your unique identifier for <%s> computational sequence is %s" % (self.metadata["root name"], self.metadata['uuid']))
    write_CSD(self.data, self.metadata, self.metadata["root name"], destination,
              compression=compression, compression_opts=compression_opts, full_chunk_shape=full_chunk_shape)
    self.main_file = destination
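# A hedged sketch of deploying a computational_sequence to disk; the output
# paths are hypothetical. Missing metadata fields are prompted for by
# complete_all_missing_metadata before anything is written.
def _example_deploy(compseq):
    compseq.deploy("myfeatures.csd")                         # gzip level 9 by default
    compseq.deploy("myfeatures_raw.csd", compression=None)   # uncompressed variant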
def get_relevant_entries(self, reference):
    '''
    Load everything except the reference sequence into per-video dictionaries,
    with entry ids stripped of their [segment] suffix. In practice this took
    under two minutes on the full dataset.
    '''
    relevant_entries = {}
    relevant_entries_np = {}
    # otherseq_key: OpenFace, wordvec, etc
    for otherseq_key in set(list(self.computational_sequences.keys())) - set([reference]):
        relevant_entries[otherseq_key] = {}
        relevant_entries_np[otherseq_key] = {}
        sub_compseq = self.computational_sequences[otherseq_key]
        # for some_id in all video ids
        for key in list(sub_compseq.data.keys()):
            keystripped = key.split('[')[0]
            if keystripped not in relevant_entries[otherseq_key]:
                relevant_entries[otherseq_key][keystripped] = {}
                relevant_entries[otherseq_key][keystripped]["intervals"] = []
                relevant_entries[otherseq_key][keystripped]["features"] = []
            relev_intervals = self.computational_sequences[otherseq_key].data[key]["intervals"]
            relev_features = self.computational_sequences[otherseq_key].data[key]["features"]
            if len(relev_intervals.shape) < 2:
                relev_intervals = relev_intervals[None, :]
                relev_features = relev_features[None, :]
            relevant_entries[otherseq_key][keystripped]["intervals"].append(relev_intervals)
            relevant_entries[otherseq_key][keystripped]["features"].append(relev_features)
        # concatenate per video and sort rows by interval start time
        for key in list(relevant_entries[otherseq_key].keys()):
            relev_intervals_np = np.concatenate(relevant_entries[otherseq_key][key]["intervals"], axis=0)
            relev_features_np = np.concatenate(relevant_entries[otherseq_key][key]["features"], axis=0)
            sorted_indices = sorted(range(relev_intervals_np.shape[0]), key=lambda x: relev_intervals_np[x, 0])
            relev_intervals_np = relev_intervals_np[sorted_indices, :]
            relev_features_np = relev_features_np[sorted_indices, :]
            relevant_entries_np[otherseq_key][key] = {}
            relevant_entries_np[otherseq_key][key]["intervals"] = relev_intervals_np
            relevant_entries_np[otherseq_key][key]["features"] = relev_features_np
        log.status("Pre-alignment done for <%s> ..." % otherseq_key)
    return relevant_entries_np
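# A hedged sketch of the structure get_relevant_entries returns; the sequence
# name "COVAREP" and the video id are hypothetical.
def _example_relevant_entries(dataset):
    relevant = dataset.get_relevant_entries("glove_vectors")
    intervals = relevant["COVAREP"]["some_video_id"]["intervals"]  # (n, 2), sorted by start time
    features = relevant["COVAREP"]["some_video_id"]["features"]    # (n, d), same row order
    return intervals, features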
def get_relevant_entries(self, reference):
    relevant_entries = {}
    relevant_entries_np = {}
    #pbar = tqdm(total=count,unit=" Computational Sequence Entries",leave=False)
    # otherseq_key: OpenFace, wordvec, etc
    for otherseq_key in set(list(self.computational_sequences.keys())) - set([reference]):
        relevant_entries[otherseq_key] = {}
        relevant_entries_np[otherseq_key] = {}
        sub_compseq = self.computational_sequences[otherseq_key]
        # for some_id in all video ids
        for key in list(sub_compseq.data.keys()):
            keystripped = key.split('[')[0]
            if keystripped not in relevant_entries[otherseq_key]:
                relevant_entries[otherseq_key][keystripped] = {}
                relevant_entries[otherseq_key][keystripped]["intervals"] = []
                relevant_entries[otherseq_key][keystripped]["features"] = []
            relev_intervals = self.computational_sequences[otherseq_key].data[key]["intervals"]
            relev_features = self.computational_sequences[otherseq_key].data[key]["features"]
            if len(relev_intervals.shape) < 2:
                relev_intervals = relev_intervals[None, :]
                relev_features = relev_features[None, :]
            relevant_entries[otherseq_key][keystripped]["intervals"].append(relev_intervals)
            relevant_entries[otherseq_key][keystripped]["features"].append(relev_features)
        # concatenate per video and sort rows by interval start time
        for key in list(relevant_entries[otherseq_key].keys()):
            relev_intervals_np = numpy.concatenate(relevant_entries[otherseq_key][key]["intervals"], axis=0)
            relev_features_np = numpy.concatenate(relevant_entries[otherseq_key][key]["features"], axis=0)
            sorted_indices = sorted(range(relev_intervals_np.shape[0]), key=lambda x: relev_intervals_np[x, 0])
            relev_intervals_np = relev_intervals_np[sorted_indices, :]
            relev_features_np = relev_features_np[sorted_indices, :]
            relevant_entries_np[otherseq_key][key] = {}
            relevant_entries_np[otherseq_key][key]["intervals"] = relev_intervals_np
            relevant_entries_np[otherseq_key][key]["features"] = relev_features_np
        log.status("Pre-alignment done for <%s> ..." % otherseq_key)
    return relevant_entries_np
def align(self,reference,collapse_functions=None,replace=True): aligned_output={} for sequence_name in self.computational_sequences.keys(): aligned_output[sequence_name]={} if reference not in self.computational_sequences.keys(): log.error("Computational sequence <%s> does not exist in dataset"%reference,error=True) refseq=self.computational_sequences[reference].data #unifying the dataset, removing any entries that are not in the reference computational sequence self.unify() #building the relevant entries to the reference - what we do in this section is simply removing all the [] from the entry ids and populating them into a new dictionary log.status("Pre-alignment based on <%s> computational sequence started ..."%reference) relevant_entries=self.__get_relevant_entries(reference) log.status("Alignment starting ...") pbar = tqdm(total=len(refseq.keys()),unit=" Computational Sequence Entries",leave=False) pbar.set_description("Overall Progress") for entry_key in list(refseq.keys()): pbar_small=tqdm(total=refseq[entry_key]['intervals'].shape[0],unit=" Segments",leave=False) pbar_small.set_description("Aligning %s"%entry_key) for i in range(refseq[entry_key]['intervals'].shape[0]): #interval for the reference sequence ref_time=refseq[entry_key]['intervals'][i,:] #we drop zero or very small sequence lengths - no align for those if (abs(ref_time[0]-ref_time[1])<epsilon): pbar_small.update(1) continue #aligning all sequences (including ref sequence) to ref sequence for otherseq_key in list(self.computational_sequences.keys()): if otherseq_key != reference: intersects,intersects_features=self.__intersect_and_copy(ref_time,relevant_entries[otherseq_key][entry_key],epsilon) else: intersects,intersects_features=refseq[entry_key]['intervals'][i,:][None,:],refseq[entry_key]['features'][i,:][None,:] #there were no intersections between reference and subject computational sequences for the entry if intersects.shape[0] == 0: continue #collapsing according to the provided functions if type(collapse_functions) is list: intersects,intersects_features=self.__collapse(intersects,intersects_features,collapse_functions) if(intersects.shape[0]!=intersects_features.shape[0]): log.error("Dimension mismatch between intervals and features when aligning <%s> computational sequences to <%s> computational sequence"%(otherseq_key,reference),error=True) aligned_output[otherseq_key][entry_key+"[%d]"%i]={} aligned_output[otherseq_key][entry_key+"[%d]"%i]["intervals"]=intersects aligned_output[otherseq_key][entry_key+"[%d]"%i]["features"]=intersects_features pbar_small.update(1) pbar_small.close() pbar.update(1) pbar.close() log.success("Alignment to <%s> complete."%reference) if replace is True: log.status("Replacing dataset content with aligned computational sequences") self.__set_computational_sequences(aligned_output) return None else: log.status("Creating new dataset with aligned computational sequences") newdataset=mmdataset({}) newdataset.__set_computational_sequences(aligned_output,metadata_copy=False) return newdataset
def complete_all_missing_metadata(self):
    missings = [x for (x, y) in zip(featuresetMetadataTemplate,
                                    [metadata in self.metadata.keys() for metadata in featuresetMetadataTemplate])
                if y is False]
    # TODO: add read from file
    root_name_ext = ''
    if hasattr(self, "root_name"):
        root_name_ext = " for <%s> computational sequence" % self.root_name
    for missing in missings:
        self.metadata[missing] = log.status("Please input %s%s: " % (missing, root_name_ext), require_input=True)
def revert(self, replace=True):
    reverted_dataset = {x: {} for x in self.keys()}
    log.status("Revert was called ...")
    if len(self.keys()) == 0:
        log.error("The dataset contains no computational sequences ... Exiting!", error=True)
    self.unify()
    all_keys = self[list(self.keys())[0]].keys()
    if len(all_keys) == 0:
        log.error("No entries in computational sequences or unify found no shared entries ... Exiting!")
    # group segment keys like "vid[3]" by video id and segment index
    unique_unnumbered_entries = {}
    for key in all_keys:
        if key.split('[')[0] not in unique_unnumbered_entries:
            unique_unnumbered_entries[key.split('[')[0]] = []
        unique_unnumbered_entries[key.split('[')[0]].append(int(key.split('[')[1][:-1]))
    pbar = tqdm(total=len(unique_unnumbered_entries.keys()), unit=" Unique Sequence Entries", leave=False)
    pbar.set_description("Reversion Progress")
    for key in unique_unnumbered_entries.keys():
        unique_unnumbered_entries[key].sort()
        for cs_key in reverted_dataset.keys():
            intervals = numpy.concatenate(
                [self[cs_key][str('%s[%d]' % (key, i))]["intervals"] for i in unique_unnumbered_entries[key]],
                axis=0)
            features = numpy.concatenate(
                [self[cs_key][str('%s[%d]' % (key, i))]["features"] for i in unique_unnumbered_entries[key]],
                axis=0)
            reverted_dataset[cs_key][key] = {"intervals": intervals, "features": features}
        pbar.update(1)
    pbar.close()
    log.success("Reversion completed ...")
    if replace is True:
        log.status("Replacing dataset content with reverted computational sequences")
        self.__set_computational_sequences(reverted_dataset)
        return None
    else:
        log.status("Creating new dataset with reverted computational sequences")
        newdataset = mmdataset({})
        newdataset.__set_computational_sequences(reverted_dataset, metadata_copy=False)
        return newdataset
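# A hedged round-trip sketch: revert undoes align by stitching the segment
# entries "vid[0]", "vid[1]", ... back into one "vid" entry per sequence.
def _example_revert(dataset):
    dataset.align("glove_vectors")  # per-segment entries
    dataset.revert()                # back to per-video entries, in place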
def readURL(url, destination):
    if destination is None:
        log.error("Destination is not specified when downloading data", error=True)
    if os.path.isfile(destination):
        log.error("%s file already exists ..." % destination, error=True)
    r = requests.get(url, stream=True)
    if r.status_code != 200:
        log.error('URL: %s does not exist' % url, error=True)
    # total size in bytes
    total_size = int(r.headers.get('content-length', 0))
    block_size = 1024
    wrote = 0
    with open(destination, 'wb') as f:
        log.status("Downloading from %s to %s..." % (url, destination))
        for data in tqdm(r.iter_content(block_size),
                         total=math.ceil(total_size / block_size),
                         unit='KB', unit_scale=True):
            wrote = wrote + len(data)
            f.write(data)
    if total_size != 0 and wrote != total_size:
        log.error("Error downloading the data ...")
    log.success("Download complete!")
    return True
def align(self,reference,collapse_functions=None,replace=True): aligned_output={} for sequence_name in self.computational_sequences.keys(): aligned_output[sequence_name]={} if reference not in self.computational_sequences.keys(): log.error("Computational sequence <%s> does not exist in dataset"%reference,error=True) refseq=self.computational_sequences[reference].data #this for loop is for entry_key - for example video id or the identifier of the data entries log.status("Alignment based on <%s> computational sequence started ..."%reference) self.__unify_dataset() pbar = tqdm(total=len(refseq.keys()),unit=" Computational Sequence Entries",leave=False) pbar.set_description("Overall Progress") for entry_key in list(refseq.keys()): pbar_small=tqdm(total=refseq[entry_key]['intervals'].shape[0],unit=" Segments",leave=False) pbar_small.set_description("Aligning %s"%entry_key) for i in range(refseq[entry_key]['intervals'].shape[0]): #interval for the reference sequence ref_time=refseq[entry_key]['intervals'][i,:] #we drop zero or very small sequence lengths - no align for those if (abs(ref_time[0]-ref_time[1])<epsilon): pbar_small.update(1) continue #aligning all sequences (including ref sequence) to ref sequence for otherseq_key in list(self.computational_sequences.keys()): if entry_key.split('[')[0] not in self.computational_sequences[otherseq_key]._get_entries_stripped(): log.error("The dataset does not have unified entry ids across computational sequences. Please call intersect first ...") if otherseq_key != reference: intersects,intersects_features=self.__intersect_and_copy(entry_key,ref_time,self.computational_sequences[otherseq_key],epsilon) else: intersects,intersects_features=refseq[entry_key]['intervals'][i,:][None,:],refseq[entry_key]['features'][i,:][None,:] #there were no intersections between reference and subject computational sequences for the entry if intersects.shape[0] == 0: continue #collapsing according to the provided functions if type(collapse_functions) is list: intersects,intersects_features=self.__collapse(intersects,intersects_features,collapse_functions) if(intersects.shape[0]!=intersects_features.shape[0]): log.error("Dimension mismatch between intervals and features when aligning <%s> computational sequences to <%s> computational sequence"%(otherseq_key,reference)) aligned_output[otherseq_key][entry_key+"[%d]"%i]={} aligned_output[otherseq_key][entry_key+"[%d]"%i]["intervals"]=intersects aligned_output[otherseq_key][entry_key+"[%d]"%i]["features"]=intersects_features pbar_small.update(1) pbar_small.close() pbar.update(1) pbar.close() log.success("Alignment to <%s> complete."%reference) if replace is True: log.status("Replacing dataset content with aligned computational sequences") self.__set_computational_sequences(aligned_output) return None else: log.status("Creating new dataset with aligned computational sequences") newdataset=mmdataset({}) newdataset.__set_computational_sequences(aligned_output) return newdataset
def process_data(folders=["cmumosei_highlevel", "cmumosei_labels"]): log.status( "You only need to run this script once. CMU-MOSEI processing requires a combination of 300GB in RAM and swap combined. As optimized as the process is, it may take up to a day to finish." ) log.status( "Alternatively, you can send us your computational sequences to align for you if you don't have enough computational power for alignment." ) log.status( "The standard aligned features are availabel on the challenge github.") cmumosei_challenge_acl20 = {} for folder in folders: cmumosei_challenge_acl20[folder.split("_")[1]] = mmdatasdk.mmdataset( folder) #performs word alignment. Labels are not part of the word alignment process. cmumosei_challenge_acl20["highlevel"].align("glove_vectors") #replacing missing modality information for words - some words may experience failed COVAREP, etc. cmumosei_challenge_acl20["highlevel"].impute('glove_vectors') #this writes the word aligned computational sequences to the disk deploy(cmumosei_challenge_acl20["highlevel"], "word_aligned_highlevel") #if you want to load the word aligned from the disk, comment out the lines for align and impute, and uncomment the line below. #cmumosei_challenge_acl20["highlevel"]=mmdatasdk.mmdataset("word_aligned_highlevel") #now aligning to the labels - first adding labels to the dataset cmumosei_challenge_acl20["highlevel"].computational_sequences[ "Emotion Labels"] = cmumosei_challenge_acl20["labels"][ "Emotion Labels"] #the actual alignment without collapse function this time cmumosei_challenge_acl20["highlevel"].align("Emotion Labels") #removing sentences which have missing modality information cmumosei_challenge_acl20["highlevel"].hard_unify() #writing the final aligned to disk deploy(cmumosei_challenge_acl20["highlevel"], "final_aligned") #reading from the disk - if the above process is done. #cmumosei_challenge_acl20["highlevel"]=mmdatasdk.mmdataset("final_aligned") #getting the final tensors for machine learning - pass the folds to this function to get data based on tr,va,te folds. tensors = cmumosei_challenge_acl20["highlevel"].get_tensors( seq_len=50, non_sequences=["Emotion Labels"], direction=False, folds=[ mmdatasdk.cmu_mosei.standard_folds.standard_train_fold, mmdatasdk.cmu_mosei.standard_folds.standard_valid_fold, mmdatasdk.cmu_mosei.standard_folds.standard_test_fold ]) fold_names = ["train", "valid", "test"] for i in range(3): #output the shape of the tensors for csd in list(cmumosei_challenge_acl20["highlevel"].keys()): print("Shape of the %s computational sequence for %s fold is %s" % (csd, fold_names[i], tensors[i][csd].shape))
def unify(self, active=True):
    log.status("Unify was called ...")
    all_keys = {}
    for seq_key in list(self.computational_sequences.keys()):
        all_keys[seq_key] = [vidid.split("[")[0] for vidid in self.computational_sequences[seq_key].data.keys()]
    # video ids shared by every computational sequence
    valids = set.intersection(*[set(all_keys[x]) for x in all_keys])
    violators = set()
    for seq_key in list(self.computational_sequences.keys()):
        violators = violators.union(
            set([vidid.split("[")[0] for vidid in self.computational_sequences[seq_key].data.keys()]) - valids)
    if len(violators) > 0:
        for violator in violators:
            log.error("%s entry is not shared among all sequences, removing it ..." % violator, error=False)
            if active == True:
                self.remove_id(violator, purge=True)
    if active == False and len(violators) > 0:
        log.error("%d violators remain, alignment will fail if called ..." % len(violators), error=True)
    log.success("Unify completed ...")
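# A hedged sketch contrasting the two unify flavors: unify compares video-level
# ids (segment suffixes stripped) and works pre-alignment, while hard_unify
# compares full "vid[i]" segment ids and requires an aligned dataset.
def _example_unify(dataset):
    dataset.unify()                 # pre-alignment: drop videos missing from any sequence
    dataset.align("glove_vectors")
    dataset.hard_unify()            # post-alignment: drop segments missing from any sequence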
def read_URL(url, destination):
    if destination is None:
        log.error("Destination is not specified when downloading data", error=True)
    if os.path.isdir(destination.rsplit(os.sep, 1)[-2]) is False:
        os.mkdir(destination.rsplit(os.sep, 1)[-2])
    if os.path.isfile(destination):
        log.error("%s file already exists ..." % destination, error=True)
    r = requests.get(url, stream=True)
    if r.status_code != 200:
        log.error('URL: %s does not exist' % url, error=True)
    # total size in bytes
    total_size = int(r.headers.get('content-length', 0))
    block_size = 1024
    wrote = 0
    with open(destination, 'wb') as f:
        log.status("Downloading from %s to %s..." % (url, destination))
        pbar = log.progress_bar(total=math.ceil(total_size / block_size),
                                data=r.iter_content(block_size),
                                postfix="Total in kBs",
                                unit='kB', leave=False)
        for data in pbar:
            wrote = wrote + len(data)
            f.write(data)
        pbar.close()
    if total_size != 0 and wrote != total_size:
        log.error("Error downloading the data to %s ..." % destination, error=True)
    log.success("Download complete!")
    return True
def write_CSD(data, metadata, rootName, destination, compression, compression_opts, full_chunk_shape):
    log.status("Writing the <%s> computational sequence data to %s" % (rootName, destination))
    if compression is not None:
        log.advise("Compression with %s and opts -%d" % (compression, compression_opts))
    # opening the file
    writeh5Handle = h5py.File(destination, 'w')
    # creating the root handle
    rootHandle = writeh5Handle.create_group(rootName)
    # writing the data
    dataHandle = rootHandle.create_group("data")
    pbar = log.progress_bar(total=len(data.keys()), unit=" Computational Sequence Entries", leave=False)
    for vid in data:
        vidHandle = dataHandle.create_group(vid)
        if compression is not None:
            vidHandle.create_dataset("features", data=data[vid]["features"], compression=compression, compression_opts=compression_opts)
            vidHandle.create_dataset("intervals", data=data[vid]["intervals"], compression=compression, compression_opts=compression_opts)
        else:
            vidHandle.create_dataset("features", data=data[vid]["features"])
            vidHandle.create_dataset("intervals", data=data[vid]["intervals"])
        pbar.update(1)
    pbar.close()
    log.success("<%s> computational sequence data successfully written to %s" % (rootName, destination))
    log.status("Writing the <%s> computational sequence metadata to %s" % (rootName, destination))
    # writing the metadata as JSON-encoded variable-length strings (unicode on Python 2, str on Python 3)
    metadataHandle = rootHandle.create_group("metadata")
    for metadataKey in metadata.keys():
        metadataHandle.create_dataset(
            metadataKey, (1,),
            dtype=h5py.special_dtype(vlen=unicode) if sys.version_info.major == 2 else h5py.special_dtype(vlen=str))
        cast_operator = unicode if sys.version_info.major == 2 else str
        metadataHandle[metadataKey][0] = cast_operator(json.dumps(metadata[metadataKey]))
    writeh5Handle.close()
    log.success("<%s> computational sequence metadata successfully written to %s" % (rootName, destination))
    log.success("<%s> computational sequence successfully written to %s ..." % (rootName, destination))
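# A hedged sketch of write_CSD's knobs; the root name and paths are hypothetical.
# Note that this variant JSON-encodes metadata values, unlike the plain casts in writeCSD.
def _example_write_csd_compressed(data, metadata):
    write_CSD(data, metadata, "myfeatures", "myfeatures.csd",
              compression="gzip", compression_opts=9, full_chunk_shape=True)
    write_CSD(data, metadata, "myfeatures", "myfeatures_raw.csd",
              compression=None, compression_opts=None, full_chunk_shape=True)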
def upsampling_and_save(self, reference, id_idx, collapse_function=None, epsilon=10e-6):
    folder = '/data/mifs_scratch/yw454/cmumosei_aligned'
    #not_enough_label_file = './mosei_notenough_lable_videos.txt'
    ## self.computational_sequences keys are COVAREP, OpenFace, WordVec, etc
    #for sequence_name in self.computational_sequences.keys():
    #    # init a dictionary to store different features separately
    #    aligned_output[sequence_name]={}
    if reference not in self.computational_sequences.keys():
        log.error("Computational sequence <%s> does not exist in dataset" % reference, error=True)
    modality = list(self.computational_sequences.keys())
    support = ['COVAREP', 'WordVec']
    for m in modality:
        if m not in support:
            raise ValueError('feature type not supported {}'.format(m))
    # get data of reference feature
    refseq = self.computational_sequences[reference].data
    # unifying the dataset, removing any entries that are not in the reference computational sequence
    self.unify()
    # building the relevant entries to the reference - here we simply strip the [] from the entry ids and populate them into a new dictionary
    log.status("Pre-alignment based on <%s> computational sequence started ..." % reference)
    relevant_entries = self.get_relevant_entries(reference)
    log.status("Alignment starting ...")
    pbar = log.progress_bar(total=len(refseq.keys()), unit=" Computational Sequence Entries", leave=False)
    pbar.set_description("Overall Progress")
    # for some_id in all video ids
    for entry_key in list(refseq.keys()):
        not_enough_label = False
        if entry_key not in ALL_VIDEO:
            continue
        if entry_key in id_idx:
            stored_idx = id_idx.index(entry_key)
            if stored_idx <= 2132:
                #if stored_idx != 1781:
                continue
        video_code = id_idx.index(entry_key)
        video_code = str(video_code).zfill(6)
        for otherseq_key in list(self.computational_sequences.keys()):
            if otherseq_key == reference:
                # save reference (COVAREP) data
                processed_feature = refseq[entry_key]['features'][:, :]
            else:
                # save upsampled (WordVec) data
                processed_feature = self.upsampling(relevant_entries[otherseq_key][entry_key], entry_key)
            save_htk_format(processed_feature, otherseq_key, folder, video_code)
            print('alignment saved for video {} feature {}.'.format(video_code, otherseq_key))
        pbar.update(1)
    pbar.close()
def align_upsampling_and_save(self, reference, id_idx, collapse_function=None, epsilon=10e-6):
    folder = '/data/mifs_scratch/yw454/cmumosei_aligned'
    log_file = './mosei_alignment_log.txt'
    #aligned_output = {}
    count = 0
    ## self.computational_sequences keys are COVAREP, OpenFace, WordVec, etc
    #for sequence_name in self.computational_sequences.keys():
    #    # init a dictionary to store different features separately
    #    aligned_output[sequence_name]={}
    if reference not in self.computational_sequences.keys():
        log.error("Computational sequence <%s> does not exist in dataset" % reference, error=True)
    # get data of reference feature
    refseq = self.computational_sequences[reference].data
    # unifying the dataset, removing any entries that are not in the reference computational sequence
    self.unify()
    # building the relevant entries to the reference - here we simply strip the [] from the entry ids and populate them into a new dictionary
    log.status("Pre-alignment based on <%s> computational sequence started ..." % reference)
    relevant_entries = self.get_relevant_entries(reference)
    log.status("Alignment starting ...")
    pbar = log.progress_bar(total=len(refseq.keys()), unit=" Computational Sequence Entries", leave=False)
    pbar.set_description("Overall Progress")
    # for some_id in all video ids
    for entry_key in list(refseq.keys()):
        if entry_key in id_idx:
            stored_idx = id_idx.index(entry_key)
            #if stored_idx < 104 or (stored_idx > 104 and stored_idx < 1781):
            if stored_idx < 1781 or stored_idx == 1815:
                continue
        all_intersects = {}
        all_intersect_features = {}
        ref_all = refseq[entry_key]['intervals']
        # aligning all sequences to the ref sequence (previously the reference was aligned against itself as well; now it is copied directly)
        # otherseq_key: other features; entry_key: some video id
        for otherseq_key in list(self.computational_sequences.keys()):
            if otherseq_key != reference:
                feature_info = 'reference: {}, other feature {}, video id: {}'.format(reference, otherseq_key, entry_key)
                intersects, intersects_features = self.intersect_and_copy_upsampling(
                    ref_all, relevant_entries[otherseq_key][entry_key], epsilon, log_file, feature_info)
            else:
                intersects, intersects_features = refseq[entry_key]['intervals'][:, :], refseq[entry_key]['features'][:, :]
            all_intersects[otherseq_key] = intersects
            all_intersect_features[otherseq_key] = intersects_features
        # save features per video
        for sequence_name in self.computational_sequences.keys():
            video_code = id_idx.index(entry_key)
            video_code = str(video_code).zfill(6)
            save_htk_format(all_intersect_features[sequence_name], sequence_name, folder, video_code)
            save_intervals(all_intersects[sequence_name], sequence_name, folder, video_code)
            print('alignment saved for video {} feature {}.'.format(video_code, sequence_name))
        pbar.update(1)
    pbar.close()
def align(self, reference, replace=True):
    aligned_output = {}
    for sequence_name in self.computational_sequences.keys():
        aligned_output[sequence_name] = {}
    if reference not in self.computational_sequences.keys():
        log.error("Computational sequence %s does not exist in dataset" % reference, error=True)
    refseq = self.computational_sequences[reference].data
    # this for loop is for entry_key - for example video id or the identifier of the data entries
    log.status("Alignment based on %s computational sequence started ..." % reference)
    pbar = tqdm(total=len(refseq.keys()), unit=" Computational Sequence Entries")
    pbar.set_description("Overall Progress")
    for entry_key in list(refseq.keys()):
        pbar_small = tqdm(total=refseq[entry_key]['intervals'].shape[0])
        pbar_small.set_description("Aligning %s" % entry_key)
        # intervals for the reference sequence
        for i in range(refseq[entry_key]['intervals'].shape[0]):
            ref_time = refseq[entry_key]['intervals'][i, :]
            if abs(ref_time[0] - ref_time[1]) < epsilon:
                pbar_small.update(1)
                continue
            # aligning all sequences (including ref sequence) to ref sequence
            for otherseq_key in list(self.computational_sequences.keys()):
                otherseq = self.computational_sequences[otherseq_key].data[entry_key]
                # lists to contain the intersection for (otherseq_key, i)
                list_intervals = []
                list_features = []
                # checking all intervals of the otherseq for intersection
                for j in range(otherseq["intervals"].shape[0]):
                    sub_time = otherseq["intervals"][j]
                    this_features = otherseq["features"][j, :]
                    intersect, intersect_start, intersect_end = self.__intersect(ref_time, sub_time)
                    if intersect == True:
                        list_intervals.append([intersect_start, intersect_end])
                        list_features.append(this_features)
                aligned_output[otherseq_key][entry_key + "[%d]" % i] = {}
                aligned_output[otherseq_key][entry_key + "[%d]" % i]["intervals"] = numpy.array(list_intervals, dtype='float32')
                aligned_output[otherseq_key][entry_key + "[%d]" % i]["features"] = numpy.array(list_features, dtype='float32')
                # debug dump for malformed (non 2-D) intervals arrays
                if len(aligned_output[otherseq_key][entry_key + "[%d]" % i]["intervals"].shape) != 2:
                    print("Malformed intervals array after alignment:")
                    print(aligned_output[otherseq_key][entry_key + "[%d]" % i]["intervals"].shape)
                    print(aligned_output[otherseq_key][entry_key + "[%d]" % i]["features"].shape)
                    print(ref_time, i)
                    print(refseq[entry_key]['features'][i, :].shape)
                    time.sleep(10)
            pbar_small.update(1)
        pbar_small.close()
        pbar.update(1)
    pbar.close()
    log.success("Alignment to %s done." % reference)
    if replace is True:
        log.status("Replacing dataset content with aligned computational sequences")
        self.__set_computational_sequences(aligned_output)
        return None
    else:
        log.status("Creating new dataset with aligned computational sequences")
        newdataset = mmdataset({})
        newdataset.__set_computational_sequences(aligned_output)
        return newdataset