Exemple #1
0
	def characterize(self):
		"""Accumulate per-run totals and derive a label density for each run.

		Every molecule parsed from ``self.input_file`` adds its length and
		label count to ``self.dataset_stats[run_id]`` (keys ``"length"`` and
		``"labels"``). Afterwards each run in ``self.dataset_stats`` gets a
		``"density"`` entry: labels per 100,000 length units.

		Totals are added on top of whatever ``self.dataset_stats`` already
		holds for a run. A run whose accumulated length is zero gets a
		density of 0.0 instead of raising ZeroDivisionError.
		"""
		# NOTE(review): presumably "labels per 100 kb", assuming molecule
		# lengths are in base pairs - confirm against BnxFile.
		density_window = 1000 * 100

		for molecule in BnxFile(self.input_file).parse():
			run_stats = self.dataset_stats.setdefault(
				molecule.run_id, {"length": 0.0, "labels": 0.0})
			run_stats["length"] += float(molecule.length)
			run_stats["labels"] += float(molecule.num_labels)

		for run_stats in self.dataset_stats.values():
			total_length = run_stats["length"]
			if total_length:
				run_stats["density"] = run_stats["labels"] / (total_length / density_window)
			else:
				# No measured length for this run: density is undefined, report 0.0
				run_stats["density"] = 0.0
Exemple #2
0
	def subset(self, criteria, output_file=None):
		"""Write the molecules accepted by ``criteria`` to ``output_file``.

		The headers returned by ``BnxFile.getHeaders()`` are written first,
		followed by every parsed molecule for which ``criteria(molecule)`` is
		truthy.

		Args:
			criteria: callable taking a molecule; a truthy result keeps it.
			output_file: path to write. When ``None``, ``self.output_file``
				is used.
		"""
		# Identity check: "== None" would go through a custom __eq__ if the
		# argument defines one, and is inconsistent with the other methods here.
		if output_file is None:
			output_file = self.output_file

		with open(output_file, "w") as o_file:
			bnx_file = BnxFile(self.input_file)
			for header in bnx_file.getHeaders():
				o_file.write(header)
			for molecule in bnx_file.parse():
				if criteria(molecule):
					bnx_file.write(molecule, o_file)
Exemple #3
0
	def generateReducedDataset(self, proportion, output_file=None):
		"""Write a copy of the dataset reduced to about ``proportion`` of its total length.

		Molecules are drawn at random (without replacement) and dropped until
		the summed length of the drawn molecules reaches
		``total_length * (1 - proportion)``. The molecule whose removal would
		overshoot that target is kept but passed to ``molecule.shrink`` with
		the overshoot amount.

		The output starts with the original headers, followed by two comment
		lines recording the reduction percentage and the random seed used.

		Args:
			proportion: fraction of the total length to keep, in [0, 1].
			output_file: path to write. When ``None``, the path is
				``self.input_file`` plus ``"_"`` and the integer percentage.

		Raises:
			ValueError: if ``proportion`` is outside [0, 1].
		"""
		if proportion > 1 or proportion < 0:
			# ValueError is still caught by callers handling Exception
			raise ValueError("proportion must be between 1 and 0")

		if output_file is None:
			output_file = self.input_file + "_" + str(int(proportion * 100))

		with open(output_file, "w") as o_file:
			bnx_file = BnxFile(self.input_file)
			for header in bnx_file.getHeaders():
				o_file.write(header)
			o_file.write("# Reduced to " + str(proportion * 100) + "% of the original\n")

			# First pass: record every molecule's length and the overall total
			total_length = 0.0
			molecule_lengths = {}
			molecule_ids = []
			for molecule in bnx_file.parse():
				total_length += molecule.length
				molecule_lengths[molecule.id] = molecule.length
				molecule_ids.append(molecule.id)

			# Re-seed with a known value and record it in the output
			seed = random.random()
			random.seed(seed)
			o_file.write("# Random seed for reduction: " + str(seed) + "\n")

			target_removed_length = total_length * (1.0 - proportion)
			removed_length = 0
			removed = set()
			abridged = {}
			# Stop as well once every molecule has been drawn: the two sums are
			# accumulated in different orders, so float rounding can leave
			# removed_length marginally below the target (e.g. proportion=0),
			# and drawing from an empty list would raise IndexError.
			while removed_length < target_removed_length and molecule_ids:
				list_index = int(random.random() * len(molecule_ids))
				candidate = molecule_ids.pop(list_index)
				removed_length += molecule_lengths[candidate]

				if removed_length <= target_removed_length:
					removed.add(candidate)
				else:
					# Overshoot: keep this molecule, but hand the excess to shrink()
					# NOTE(review): whether shrink() treats the value as the amount
					# to cut or the length to keep is not visible here - confirm.
					abridged[candidate] = removed_length - target_removed_length

			# Second pass: write everything that was not dropped
			for molecule in bnx_file.parse():
				if molecule.id in removed:
					continue
				if molecule.id in abridged:
					molecule.shrink(abridged[molecule.id])
				bnx_file.write(molecule, o_file)
Exemple #4
0
	def getOutputFileExtension(self):
		"""Return the extension reported by ``BnxFile.getExtension()``."""
		extension = BnxFile.getExtension()
		return extension