Example #1
0
 def extract_metadata_to_pipe(self):
     """Extract metadata from inputfile into a pipe for further processing."""
     local = project_utilities.path_to_local(self.inputfile)
     # Prefer a local copy of the input file if one is available.
     if len(local) > 0:
         proc = Popen(["sam_metadata_dumper", local], stdout=PIPE,
                      stderr=PIPE)
     else:
         # Otherwise read the input file through its URL.
         url = project_utilities.path_to_url(self.inputfile)
         proc = Popen(["sam_metadata_dumper", url], stdout=PIPE,
                      stderr=PIPE)
     # Delete the local copy if it was a temporary copy rather than the
     # original input file.
     if len(local) > 0 and local != self.inputfile:
         os.remove(local)
     return proc
 def extract_metadata_to_pipe(self):
     """Extract metadata from inputfile into a pipe for further processing."""
     local = project_utilities.path_to_local(self.inputfile)
     if len(local) > 0:
         proc = Popen(["sam_metadata_dumper", local], stdout=PIPE,
                     stderr=PIPE)
     else:
         url = project_utilities.path_to_url(inputfile)
         proc = Popen(["sam_metadata_dumper", url], stdout=PIPE,
                     stderr=PIPE)
     if len(local) > 0 and local != self.inputfile:
         os.remove(local)
     return proc
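# Consumption sketch (assumption, not part of the original pair): the snippets
# above return a Popen handle without reading it, so the caller is expected to
# drain the pipe. read_metadata_pipe is a hypothetical helper showing one way
# to do that with communicate(), which reads stdout and stderr and waits for
# sam_metadata_dumper to exit.
def read_metadata_pipe(proc):
    """Drain a Popen handle returned by extract_metadata_to_pipe."""
    out, err = proc.communicate()
    if proc.returncode != 0:
        raise RuntimeError('sam_metadata_dumper returned nonzero exit status %d:\n%s'
                           % (proc.returncode, err))
    return out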
Example #3
0
import os
import subprocess
import time

import project_utilities
import root_metadata
import samweb_cli

def getmetadata(inputfile):
    # Set up the samweb Python API client for this experiment.
    samweb = samweb_cli.SAMWebClient(
        experiment=project_utilities.get_experiment())

    # Extract metadata into a pipe.
    local = project_utilities.path_to_local(inputfile)
    if local != '':
        proc = subprocess.Popen(["sam_metadata_dumper", "-H", local],
                                stdout=subprocess.PIPE)
    else:
        url = project_utilities.path_to_url(inputfile)
        proc = subprocess.Popen(["sam_metadata_dumper", "-H", url],
                                stdout=subprocess.PIPE)
    lines = proc.stdout.readlines()
    if local != '' and local != inputfile:
        os.remove(local)

    # Count the number of lines in the file (for later use!)
    num_lines = len(lines)

    # Define an empty dictionary to hold the SAM metadata.
    md = {}

    # Read the columns from each line and fill the dictionary.
    c = 0
    parents = []
    PName = False
    gen = False
    # These fields are only filled if the corresponding keys appear in the
    # dump; give them defaults so the references below cannot fail.
    run = ''
    run_type = ''
    app_family = ''
    app_name = ''
    app_version = ''
    for line in lines:
        c = c + 1
        columns = line.split(" ")
        columns = [col.strip() for col in columns]
        # Only lines 4 through (num_lines - 2) are treated as key/value pairs;
        # the rest are header and trailer lines of the dump.
        if c >= 4 and c <= num_lines - 2:
            if columns[1] == 'dataTier':
                md['data_tier'] = columns[-1]
                if columns[-1] == 'generated':
                    gen = True
            elif columns[1] == 'endTime':
                E = time.localtime(int(columns[-1]))
                md['end_time'] = str(E[0]) + '-' + str(E[1]) + '-' + str(
                    E[2]) + 'T' + str(E[3]) + ':' + str(E[4]) + ':' + str(E[5])
            elif columns[1] == 'startTime':
                S = time.localtime(int(columns[-1]))
                md['start_time'] = str(S[0]) + '-' + str(S[1]) + '-' + str(
                    S[2]) + 'T' + str(S[3]) + ':' + str(S[4]) + ':' + str(S[5])
            elif columns[1] == 'group':
                md['group'] = columns[-1]
            elif columns[1] == 'eventCount':
                md['event_count'] = columns[-1]
            elif columns[1] == 'fclName':
                md['fcl.name'] = columns[-1]
            elif columns[1] == 'fclVersion':
                md['fcl.version'] = columns[-1]
            elif columns[1] == 'fileFormat':
                md['file_format'] = columns[-1]
            elif columns[1] == 'ubProjectStage':
                md['ub_project.stage'] = columns[-1]
            elif columns[1] == 'ubProjectVersion':
                md['ub_project.version'] = columns[-1]
            elif columns[1] == 'lastEvent':
                md['last_event'] = columns[-1]
            elif columns[1] == 'firstEvent':
                md['first_event'] = columns[-1]
            elif columns[1] == 'fileType':
                md['file_type'] = columns[-1]
            elif columns[1] == 'run':
                run = columns[-1]
            elif columns[1] == 'runType':
                run_type = columns[-1]
            elif columns[1] == 'applicationFamily':
                app_family = columns[-1]
            elif columns[1] == 'applicationVersion':
                app_version = columns[-1]
            elif columns[1] == 'process_name':
                app_name = columns[-1]
            elif columns[1] == 'ubProjectName':
                PName = True
                md['ub_project.name'] = columns[-1]
            elif columns[1] == 'parent':
                parents.append({'file_name': columns[-1]})

    # Get the other metadata fields.
    md['file_name'] = inputfile.split("/")[-1]
    md['file_size'] = os.path.getsize(inputfile)
    # For now, skip the checksum for dCache files.
    md['crc'] = root_metadata.fileEnstoreChecksum(inputfile)
    md['runs'] = [[run, run_type]]
    md['application'] = {
        'family': app_family,
        'name': app_name,
        'version': app_version
    }
    md['parents'] = parents

    # If ub_project.name is not in the internal metadata:
    # - for generator files, take ub_project.name from the fcl name
    #   (with the '.fcl' suffix stripped);
    # - for all other stages, take it from the first parent's metadata.
    if gen:
        md['parents'] = []
        if not PName:
            md['ub_project.name'] = md['fcl.name'].split(".fcl")[0]
    else:
        if not PName and len(md['parents']) > 0:
            parent = md['parents'][0]['file_name']
            mdparent = samweb.getMetadata(parent)
            if 'ub_project.name' in mdparent:
                md['ub_project.name'] = mdparent['ub_project.name']

    return md
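# Usage sketch (assumption, not part of the original example): getmetadata
# takes the path of an artroot file that sam_metadata_dumper can open and
# returns a plain dictionary of SAM metadata. dump_metadata is a hypothetical
# helper that pretty-prints that dictionary for inspection.
def dump_metadata(inputfile):
    import json
    print(json.dumps(getmetadata(inputfile), indent=2, sort_keys=True))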
import json
import os
import subprocess
import sys
import threading
import Queue

import project_utilities
import root_metadata

# wait_for_subprocess, expMetaData, and ubMetaData are assumed to be provided
# elsewhere in the original module.

def getmetadata(inputfile, md0={}):
	# Experiment tag.
	exp = 'ub'

	# Extract metadata into a pipe.
	local = project_utilities.path_to_local(inputfile)
	if local != '':
		proc = subprocess.Popen(["sam_metadata_dumper", local], stdout=subprocess.PIPE,
					stderr=subprocess.PIPE)
	else:
		url = project_utilities.path_to_url(inputfile)
		proc = subprocess.Popen(["sam_metadata_dumper", url], stdout=subprocess.PIPE,
					stderr=subprocess.PIPE)
	if local != '' and local != inputfile:
		os.remove(local)

	# Wait for sam_metadata_dumper in a watchdog thread, so that a hung
	# dumper is terminated after 60 seconds.
	q = Queue.Queue()
	thread = threading.Thread(target=wait_for_subprocess, args=[proc, q])
	thread.start()
	thread.join(timeout=60)
	if thread.is_alive():
		print 'Terminating subprocess because of timeout.'
		proc.terminate()
		thread.join()

	# wait_for_subprocess queues the exit status, stdout, and stderr in turn.
	rc = q.get()
	jobout = q.get()
	joberr = q.get()
	if rc != 0:
		raise RuntimeError, 'sam_metadata_dumper returned nonzero exit status %d.' % rc
	
	# Reassemble the dumper output into parseable JSON: skip lines ending
	# in ' ,' and collapse ', ,' sequences into ','.
	mdtext = ''
	for line in jobout.split('\n'):
		if line[-3:-1] != ' ,':
			mdtext = mdtext + line.replace(", ,", ",")
	mdtop = json.JSONDecoder().decode(mdtext)
	if len(mdtop.keys()) == 0:
		print 'No top-level key in extracted metadata.'
		sys.exit(1)

	# The top-level JSON object is keyed by file name; its value holds the
	# embedded art metadata.
	file_name = mdtop.keys()[0]
	mdart = mdtop[file_name]
	# Pick up the experiment-specific metadata definitions.
	expSpecificMetadata = expMetaData()
	if os.environ['SAM_EXPERIMENT'] == 'uboone':
		expSpecificMetadata = ubMetaData()
		expSpecificMetadata.defineMetaData('ub')
	else:
		expSpecificMetadata.defineMetaData('')

	# define an empty python dictionary which will hold sam metadata.
	# Some fields can be copied directly from art metadata to sam metadata.
	# Other fields require conversion.
	md = {}

	# Loop over art metadata.
	for mdkey in mdart.keys():
		mdval = mdart[mdkey]

		# Skip some art-specific fields.

		if mdkey == 'file_format_version':
			pass
		elif mdkey == 'file_format_era':
			pass

		# Ignore primary run_type field (if any).
		# Instead, get run_type from runs field.

		elif mdkey == 'run_type':
			pass

		# Ignore data_stream if it begins with "out".
		# These kinds of stream names are probably junk module labels.

		elif mdkey == 'data_stream' and mdval[:3] == 'out' and \
				mdval[3:4].isdigit():
			pass

		# Application family/name/version.

		elif mdkey == 'applicationFamily':
			md.setdefault('application', {})['family'] = mdval
		elif mdkey == 'process_name':
			md.setdefault('application', {})['name'] = mdval
		elif mdkey == 'applicationVersion':
			md.setdefault('application', {})['version'] = mdval

		# Parents.

		elif mdkey == 'parents':
			mdparents = []
			for parent in mdval:
				parent_dict = {'file_name': parent}
				mdparents.append(parent_dict)
			md['parents'] = mdparents

		# Other fields where the key or value requires minor conversion.
		# For first_event and last_event, the event number is taken from
		# the third element of the art value.

		elif mdkey == 'first_event':
			md[mdkey] = mdval[2]
		elif mdkey == 'last_event':
			md[mdkey] = mdval[2]

		# Experiment-specific keys are renamed via the translation table
		# (this replaces the older explicit handling of keys such as
		# '%sProjectName' % exp -> '%s_project.name' % exp).

		elif mdkey in expSpecificMetadata.metadataList:
			md[expSpecificMetadata.translateKey(mdkey)] = mdval
		elif mdkey == 'fclName':
			md['fcl.name'] = mdval
		elif mdkey == 'fclVersion':
			md['fcl.version'] = mdval

		# For all other keys, copy art metadata directly to sam metadata.
		# This works for run-tuple (run, subrun, runtype) and time stamps.

		else:
			md[mdkey] = mdart[mdkey]

	# Get the other metadata fields. The file size and checksum may be
	# supplied by the caller through md0 to avoid recomputing them.
	md['file_name'] = inputfile.split("/")[-1]
	if 'file_size' in md0:
		md['file_size'] = md0['file_size']
	else:
		md['file_size'] = os.path.getsize(inputfile)
	if 'crc' in md0:
		md['crc'] = md0['crc']
	else:
		md['crc'] = root_metadata.fileEnstoreChecksum(inputfile)
	return md
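# Usage sketch (assumption, not part of the original example): md0 lets the
# caller supply a pre-computed file size and/or checksum so getmetadata does
# not stat or re-checksum the file. declare_with_known_size is a hypothetical
# helper illustrating that pattern.
def declare_with_known_size(inputfile, known_size):
	"""Extract metadata, overriding file_size with a value already known."""
	return getmetadata(inputfile, md0={'file_size': known_size})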