def mount(name, path, user, address, hostname):
    """Interactive routine for mounting a storage node located at ROOT."""
    import socket

    # We need to write to the database.
    di.connect_database(read_write=True)

    try:
        node = di.StorageNode.get(name=name)
    except pw.DoesNotExist:
        print('Storage node "%s" does not exist. I quit.' % name)
        return

    if node.mounted:
        print('Node "%s" is already mounted.' % name)
        return

    # Set the default hostname if required
    if hostname is None:
        hostname = socket.gethostname()
        print('I will set the host to "%s".' % hostname)

    # Set the parameters of this node
    node.username = user
    node.address = address
    node.mounted = True
    node.host = hostname

    if path is not None:
        node.root = path

    node.save()

    print('Successfully mounted "%s".' % name)
def unmount(root_or_name):
    """Unmount the storage node identified by its root path or name ROOT_OR_NAME."""
    import os
    import socket

    # We need to write to the database.
    di.connect_database(read_write=True)

    try:
        node = di.StorageNode.get(name=root_or_name)
    except pw.DoesNotExist:
        if root_or_name[-1] == "/":
            root_or_name = root_or_name[: len(root_or_name) - 1]

        if not os.path.exists(root_or_name):
            print("That is neither a node name, nor a path on this host. "
                  "I quit.")
            exit()
        try:
            node = di.StorageNode.get(root=root_or_name,
                                      host=socket.gethostname())
        except pw.DoesNotExist:
            print("That is neither a node name nor a root name that is "
                  "known. I quit.")
            exit()

    if not node.mounted:
        print("There is no node mounted there any more.")
    else:
        node.mounted = False
        node.save()
        print("Node successfully unmounted.")
def import_file(node, root, acq_name, file_name):
    done = False
    while not done:
        try:
            _import_file(node, root, acq_name, file_name)
            done = True
        except pw.OperationalError:
            log.error("MySQL connection dropped. Will attempt to reconnect "
                      "in five seconds.")
            time.sleep(5)
            di.connect_database(True)
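
# The reconnect-and-retry loop above is repeated in several places in this
# module. The helper below is a minimal sketch of how that pattern could be
# factored out; the name `retry_on_operational_error` and the fixed five-second
# delay are illustrative assumptions, not part of the existing code base.
def retry_on_operational_error(func, *args, **kwargs):
    """Call `func`, reconnecting and retrying if the MySQL connection drops."""
    while True:
        try:
            return func(*args, **kwargs)
        except pw.OperationalError:
            log.error("MySQL connection dropped. Will attempt to reconnect "
                      "in five seconds.")
            time.sleep(5)
            di.connect_database(True)


# Hypothetical usage:
#   retry_on_operational_error(_import_file, node, root, acq_name, file_name)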
def import_files(node_name, verbose, acq, dry):
    """Scan the current directory for known acquisition files and add them into
    the database for NODE.

    This command is useful for manually maintaining an archive where we can run
    alpenhornd in the usual manner.
    """
    import glob

    from ch_util import data_index as di

    di.connect_database(read_write=True)

    import peewee as pw

    # Construct list of acqs to scan
    if acq is None:
        acqs = glob.glob("*")
    else:
        acqs = acq

    # Keep track of state as we process the files
    added_files = []  # Files we have added to the database
    corrupt_files = []  # Known files which are corrupt
    registered_files = []  # Files already registered in the database
    unknown_files = []  # Files not known in the database
    not_acqs = []  # Directories which were not known acquisitions

    # Fetch a reference to the node
    try:
        node = di.StorageNode.select().where(
            di.StorageNode.name == node_name).get()
    except pw.DoesNotExist:
        print("Unknown node.")
        return

    with click.progressbar(acqs, label="Scanning acquisitions") as acq_iter:
        for acq_name in acq_iter:
            try:
                di.parse_acq_name(acq_name)
            except di.Validation:
                not_acqs.append(acq_name)
                continue

            try:
                acq = di.ArchiveAcq.select().where(
                    di.ArchiveAcq.name == acq_name).get()
            except pw.DoesNotExist:
                not_acqs.append(acq_name)
                continue

            files = glob.glob(acq_name + "/*")

            # Fetch lists of all files in this acquisition, and all
            # files in this acq with local copies
            file_names = [f.name for f in acq.files]
            local_file_names = [
                f.name for f in acq.files.join(di.ArchiveFileCopy).where(
                    di.ArchiveFileCopy.node == node)
            ]

            for fn in files:
                f_name = os.path.split(fn)[1]

                # Check if file exists in database
                if f_name not in file_names:
                    unknown_files.append(fn)
                    continue

                # Check if file is already registered on this node
                if f_name in local_file_names:
                    registered_files.append(fn)
                else:
                    archive_file = (di.ArchiveFile.select().where(
                        di.ArchiveFile.name == f_name,
                        di.ArchiveFile.acq == acq).get())

                    if os.path.getsize(fn) != archive_file.size_b:
                        corrupt_files.append(fn)
                        continue

                    added_files.append(fn)

                    if not dry:
                        di.ArchiveFileCopy.create(file=archive_file,
                                                  node=node,
                                                  has_file="Y",
                                                  wants_file="Y")

    print("\n==== Summary ====")
    print()
    print("Added %i files" % len(added_files))
    print()
    print("%i corrupt files." % len(corrupt_files))
    print("%i files already registered." % len(registered_files))
    print("%i files not known." % len(unknown_files))
    print("%i directories were not acquisitions." % len(not_acqs))

    if verbose > 0:
        print()
        print("Added files:")
        print()
        for fn in added_files:
            print(fn)

    if verbose > 1:
        print("Corrupt:")
        for fn in corrupt_files:
            print(fn)
        print()

        print("Unknown files:")
        for fn in unknown_files:
            print(fn)
        print()

        print("Unknown acquisitions:")
        for fn in not_acqs:
            print(fn)
        print()
def format_transport(serial_num):
    """Interactive routine for formatting a transport disc as a storage node;
    formats and labels the disc as necessary, then adds it to the database.

    The disk is specified using the manufacturer's SERIAL_NUM, which is printed
    on the disk.
    """
    import glob
    import os

    if os.getuid() != 0:
        print("You must be root to run mount on a transport disc. I quit.")
        return

    # Find the disc.
    dev = glob.glob("/dev/disk/by-id/*%s" % serial_num)
    if len(dev) == 0:
        print("No disc with that serial number is attached.")
        return
    elif len(dev) > 1:
        print("Confused: found more than one device matching that serial "
              "number:")
        for d in dev:
            print(" %s" % d)
        print("Aborting.")
        return
    dev = dev[0]
    dev_part = "%s-part1" % dev

    # Figure out if it is formatted.
    print("Checking to see if disc is formatted. Please wait.")
    fp = os.popen("parted -s %s print" % dev)
    formatted = False
    part_start = False
    while True:
        l = fp.readline()
        if not l:
            break
        if (l.find("Number") == 0 and l.find("Start") > 0
                and l.find("File system") > 0):
            part_start = True
        elif l.strip() != "" and part_start:
            formatted = True
    fp.close()

    if not formatted:
        if not click.confirm("Disc is not formatted. Should I format it?"):
            return
        print("Creating partition. Please wait.")
        os.system("parted -s -a optimal %s mklabel gpt -- mkpart primary 0%% 100%%"
                  % dev)
        print("Formatting disc. Please wait.")
        os.system("mkfs.ext4 %s -m 0 -L CH-%s" % (dev_part, serial_num))
    else:
        print("Disc is already formatted.")

    e2label = get_e2label(dev_part)
    name = "CH-%s" % serial_num
    if e2label and e2label != name:
        print('Disc label "%s" does not conform to labelling standard, '
              "which is CH-<serialnum>." % e2label)
        exit()
    elif not e2label:
        print('Labelling the disc as "%s" (using e2label) ...' % (name))
        assert dev_part is not None
        assert len(name) <= MAX_E2LABEL_LEN
        stat = os.system("/sbin/e2label %s %s" % (dev_part, name))
        if stat:
            print("Failed to e2label! Stat = %s. I quit." % (stat))
            exit()

    # Ensure the mount path exists.
    root = "/mnt/%s" % name
    if not os.path.isdir(root):
        print("Creating mount point %s." % root)
        os.mkdir(root)

    # Check to see if the disc is mounted.
    fp = os.popen("df")
    mounted = False
    dev_part_abs = os.path.realpath(dev_part)
    while True:
        l = fp.readline()
        if not l:
            break
        if l.find(root) > 0:
            if (l[: len(dev_part)] == dev_part
                    or l[: len(dev_part_abs)] == dev_part_abs):
                mounted = True
            else:
                print("%s is a mount point, but %s is already mounted there."
                      % (root, l.split()[0]))
    fp.close()

    try:
        node = di.StorageNode.get(name=name)
    except pw.DoesNotExist:
        print("This disc has not been registered yet as a storage node. "
              "Registering now.")
        try:
            group = di.StorageGroup.get(name="transport")
        except pw.DoesNotExist:
            print('Hmmm. Storage group "transport" does not exist. I quit.')
            exit()

        # We need to write to the database.
        di.connect_database(read_write=True)

        node = di.StorageNode.create(name=name,
                                     root=root,
                                     group=group,
                                     storage_type="T",
                                     min_avail_gb=1)

        print("Successfully created storage node.")

    print("Node created but not mounted. Run alpenhorn mount_transport for "
          "that.")
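
# `get_e2label` and `MAX_E2LABEL_LEN` are used above but defined elsewhere in
# the module. The sketch below shows one plausible implementation, assuming
# only the standard `e2label` utility; it is illustrative and not necessarily
# how the module actually defines them.
MAX_E2LABEL_LEN = 16  # ext2/3/4 volume labels are at most 16 bytes.


def get_e2label(dev):
    """Return the ext filesystem label of `dev`, or None if it cannot be read."""
    import subprocess

    try:
        out = subprocess.check_output(["/sbin/e2label", dev],
                                      stderr=subprocess.DEVNULL)
    except (OSError, subprocess.CalledProcessError):
        return None
    label = out.decode().strip()
    return label if label else None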
def sync(node_name, group_name, acq, force, nice, target, transport, show_acq,
         show_files):
    """Copy all files from NODE to GROUP that are not already present.

    We can also use the --target option to only transfer files that are not
    available in both the destination group and the TARGET_GROUP. This is
    useful for transferring data to a staging location before going to a final
    archive (e.g. HPSS, transport disks).
    """

    # Make sure we connect RW
    di.connect_database(read_write=True)

    try:
        from_node = di.StorageNode.get(name=node_name)
    except pw.DoesNotExist:
        raise Exception('Node "%s" does not exist in the DB.' % node_name)
    try:
        to_group = di.StorageGroup.get(name=group_name)
    except pw.DoesNotExist:
        raise Exception('Group "%s" does not exist in the DB.' % group_name)

    # Construct list of file copies that are available on the source node, and
    # not available on any nodes at the destination. This query is quite
    # complex so I've broken it up...

    # First get the nodes at the destination...
    nodes_at_dest = di.StorageNode.select().where(
        di.StorageNode.group == to_group)

    # Then use this to get a list of all files at the destination...
    files_at_dest = (di.ArchiveFile.select().join(di.ArchiveFileCopy).where(
        di.ArchiveFileCopy.node << nodes_at_dest,
        di.ArchiveFileCopy.has_file == "Y"))

    # Then combine to get all file(copies) that are available at the source but
    # not at the destination...
    copy = di.ArchiveFileCopy.select().where(
        di.ArchiveFileCopy.node == from_node,
        di.ArchiveFileCopy.has_file == "Y",
        ~(di.ArchiveFileCopy.file << files_at_dest),
    )

    # If the target option has been specified, only copy files also not
    # available there...
    if target is not None:

        # Fetch a reference to the target group
        try:
            target_group = di.StorageGroup.get(name=target)
        except pw.DoesNotExist:
            raise RuntimeError('Target group "%s" does not exist in the DB.' %
                               target)

        # First get the nodes at the destination...
        nodes_at_target = di.StorageNode.select().where(
            di.StorageNode.group == target_group)

        # Then use this to get a list of all files at the destination...
        files_at_target = (di.ArchiveFile.select().join(
            di.ArchiveFileCopy).where(
                di.ArchiveFileCopy.node << nodes_at_target,
                di.ArchiveFileCopy.has_file == "Y",
            ))

        # Only match files that are also not available at the target
        copy = copy.where(~(di.ArchiveFileCopy.file << files_at_target))

    # In transport mode (DEPRECATED) we only move files that don't have an
    # archive copy elsewhere...
    if transport:
        import warnings

        warnings.warn(
            "Transport mode is deprecated. Try to use --target instead.")

        # Get list of other archive nodes
        other_archive_nodes = di.StorageNode.select().where(
            di.StorageNode.storage_type == "A",
            di.StorageNode.id != from_node)

        files_in_archive = (di.ArchiveFile.select().join(
            di.ArchiveFileCopy).where(
                di.ArchiveFileCopy.node << other_archive_nodes,
                di.ArchiveFileCopy.has_file == "Y",
            ))

        copy = copy.where(~(di.ArchiveFileCopy.file << files_in_archive))

    # Join onto ArchiveFile for later query parts
    copy = copy.join(di.ArchiveFile)

    # If requested, limit query to a specific acquisition...
    if acq is not None:

        # Fetch acq if specified
        try:
            acq = di.ArchiveAcq.get(name=acq)
        except pw.DoesNotExist:
            raise Exception('Acquisition "%s" does not exist in the DB.' % acq)

        # Restrict files to be in the acquisition
        copy = copy.where(di.ArchiveFile.acq == acq)

    if not copy.count():
        print("No files to copy from node %s." % (node_name))
        return

    # Show an acquisition-based summary of files to be copied
    if show_acq:
        acqs = [c.file.acq.name for c in copy]

        import collections

        for acq, count in collections.Counter(acqs).items():
            print("%s [%i files]" % (acq, count))

    # Show all files to be copied
    if show_files:
        for c in copy:
            print("%s/%s" % (c.file.acq.name, c.file.name))

    size_bytes = copy.aggregate(pw.fn.Sum(di.ArchiveFile.size_b))
    size_gb = int(size_bytes) / 1073741824.0

    print("Will request that %d files (%.1f GB) be copied from node %s to "
          "group %s." % (copy.count(), size_gb, node_name, group_name))

    if not (force or click.confirm("Do you want to proceed?")):
        print("Aborted.")
        return

    dtnow = datetime.datetime.now()

    # Perform update in a transaction to avoid any clobbering from concurrent
    # updates
    with di.ArchiveFileCopyRequest._meta.database.atomic():

        # Get a list of all the file ids for the copies we should perform
        files_ids = [c.file_id for c in copy]

        # Get a list of all the file ids for existing requests
        requests = di.ArchiveFileCopyRequest.select().where(
            di.ArchiveFileCopyRequest.group_to == to_group,
            di.ArchiveFileCopyRequest.node_from == from_node,
        )
        req_file_ids = [req.file_id for req in requests]

        # Separate the files into ones that already have requests and ones
        # that don't. Build lists (rather than lazy `filter` objects) so that
        # `len()` works below.
        files_in = [x for x in files_ids if x in req_file_ids]
        files_out = [x for x in files_ids if x not in req_file_ids]

        sys.stdout.write(
            "Updating %i existing requests and inserting %i new ones.\n" %
            (len(files_in), len(files_out)))

        # Perform an update of all the existing copy requests
        if len(files_in) > 0:
            update = di.ArchiveFileCopyRequest.update(
                nice=nice,
                completed=False,
                cancelled=False,
                timestamp=dtnow,
                n_requests=di.ArchiveFileCopyRequest.n_requests + 1,
            )

            update = update.where(
                di.ArchiveFileCopyRequest.file << files_in,
                di.ArchiveFileCopyRequest.group_to == to_group,
                di.ArchiveFileCopyRequest.node_from == from_node,
            )
            update.execute()

        # Insert any new requests
        if len(files_out) > 0:

            # Construct a list of all the rows to insert
            insert = [{
                "file": fid,
                "node_from": from_node,
                "nice": 0,
                "group_to": to_group,
                "completed": False,
                "n_requests": 1,
                "timestamp": dtnow,
            } for fid in files_out]

            # Do a bulk insert of these new rows
            di.ArchiveFileCopyRequest.insert_many(insert).execute()
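
# The source-minus-destination query in `sync` is built from peewee's `<<`
# (SQL IN) operator combined with `~` (NOT). The helper below is an
# illustrative sketch of that core pattern in isolation; it is not used by the
# existing code, and the name `_files_missing_from_group` is an assumption.
def _files_missing_from_group(src_node, dest_group):
    """Return ArchiveFileCopy rows on `src_node` with no good copy in `dest_group`."""
    dest_nodes = di.StorageNode.select().where(
        di.StorageNode.group == dest_group)
    dest_files = (di.ArchiveFile.select().join(di.ArchiveFileCopy).where(
        di.ArchiveFileCopy.node << dest_nodes,
        di.ArchiveFileCopy.has_file == "Y"))
    return di.ArchiveFileCopy.select().where(
        di.ArchiveFileCopy.node == src_node,
        di.ArchiveFileCopy.has_file == "Y",
        ~(di.ArchiveFileCopy.file << dest_files),
    )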
def clean(node_name, days, force, now, target, acq):
    """Clean up NODE by marking older files as potentially removable.

    If --target is specified we will only remove files already available in the
    TARGET_GROUP. This is useful for cleaning out intermediate locations such
    as transport disks.

    Using the --days flag will only clean correlator and housekeeping files
    which have a timestamp associated with them. It will not touch other types.
    If no --days flag is given, all files will be considered for removal.
    """
    import peewee as pw

    di.connect_database(read_write=True)

    try:
        this_node = di.StorageNode.get(di.StorageNode.name == node_name)
    except pw.DoesNotExist:
        print("Specified node does not exist.")
        return

    # Check to see if we are on an archive node
    if this_node.storage_type == "A":
        if force or click.confirm("DANGER: run clean on archive node?"):
            print("%s is an archive node. Forcing clean." % node_name)
        else:
            print("Cannot clean archive node %s without forcing." % node_name)
            return

    # Select FileCopys on this node.
    files = di.ArchiveFileCopy.select(di.ArchiveFileCopy.id).where(
        di.ArchiveFileCopy.node == this_node,
        di.ArchiveFileCopy.wants_file == "Y")

    # Limit to acquisition
    if acq is not None:
        try:
            acq = di.ArchiveAcq.get(name=acq)
        except pw.DoesNotExist:
            raise RuntimeError("Specified acquisition %s does not exist" % acq)

        files_in_acq = di.ArchiveFile.select().where(di.ArchiveFile.acq == acq)

        files = files.where(di.ArchiveFileCopy.file << files_in_acq)

    # If the target option has been specified, only clean files also available
    # there...
    if target is not None:

        # Fetch a reference to the target group
        try:
            target_group = di.StorageGroup.get(name=target)
        except pw.DoesNotExist:
            raise RuntimeError('Target group "%s" does not exist in the DB.' %
                               target)

        # First get the nodes at the destination...
        nodes_at_target = di.StorageNode.select().where(
            di.StorageNode.group == target_group)

        # Then use this to get a list of all files at the destination...
        files_at_target = (di.ArchiveFile.select().join(
            di.ArchiveFileCopy).where(
                di.ArchiveFileCopy.node << nodes_at_target,
                di.ArchiveFileCopy.has_file == "Y",
            ))

        # Only match files that are also available at the target
        files = files.where(di.ArchiveFileCopy.file << files_at_target)

    # If --days has been set we need to restrict to files older than the given
    # time. This only works for a few particular file types
    if days is not None and days > 0:

        # Get the time for the oldest files to keep
        oldest = datetime.datetime.now() - datetime.timedelta(days)
        oldest_unix = ephemeris.ensure_unix(oldest)

        # List of filetypes we want to update, needs a human readable name and
        # a FileInfo table.
        filetypes = [["correlation", di.CorrFileInfo],
                     ["housekeeping", di.HKFileInfo]]

        file_ids = []

        # Iterate over file types for cleaning
        for name, infotable in filetypes:

            # Filter to fetch only ones with a start time older than `oldest`
            oldfiles = (files.join(di.ArchiveFile).join(infotable).where(
                infotable.start_time < oldest_unix))

            local_file_ids = list(oldfiles)

            # Get the number of old files of this type
            count = oldfiles.count()

            if count > 0:
                size_bytes = (di.ArchiveFileCopy.select().where(
                    di.ArchiveFileCopy.id << local_file_ids).join(
                        di.ArchiveFile).aggregate(
                            pw.fn.Sum(di.ArchiveFile.size_b)))

                size_gb = int(size_bytes) / 2**30.0

                print("Cleaning up %i %s files (%.1f GB) from %s " %
                      (count, name, size_gb, node_name))

                file_ids += local_file_ids

    # If days is not set, then just select all files that meet the
    # requirements so far
    else:

        file_ids = list(files)
        count = files.count()

        if count > 0:
            size_bytes = (di.ArchiveFileCopy.select().where(
                di.ArchiveFileCopy.id << file_ids).join(
                    di.ArchiveFile).aggregate(pw.fn.Sum(
                        di.ArchiveFile.size_b)))

            size_gb = int(size_bytes) / 1073741824.0

            print("Cleaning up %i files (%.1f GB) from %s " %
                  (count, size_gb, node_name))

    # If there are any files to clean, ask for confirmation and then mark them
    # in the database for removal
    if len(file_ids) > 0:

        if force or click.confirm(" Are you sure?"):
            print(" Marking files for cleaning.")

            state = "N" if now else "M"

            update = di.ArchiveFileCopy.update(
                wants_file=state).where(di.ArchiveFileCopy.id << file_ids)

            n = update.execute()

            print("Marked %i files for cleaning" % n)

        else:
            print(" Cancelled")
    else:
        print("No files selected for cleaning on %s." % node_name)
def verify(node_name, md5, fixdb, acq):
    """Verify the archive on NODE against the database."""

    import os

    try:
        this_node = di.StorageNode.get(di.StorageNode.name == node_name)
    except pw.DoesNotExist:
        print("Specified node does not exist.")
        return

    ## Use a complicated query with a tuples construct to fetch everything we
    ## need in a single query. This massively speeds up the whole process versus
    ## fetching all the FileCopy's then querying for Files and Acqs.
    lfiles = (di.ArchiveFile.select(
        di.ArchiveFile.name,
        di.ArchiveAcq.name,
        di.ArchiveFile.size_b,
        di.ArchiveFile.md5sum,
        di.ArchiveFileCopy.id,
    ).join(di.ArchiveAcq).switch(di.ArchiveFile).join(
        di.ArchiveFileCopy).where(di.ArchiveFileCopy.node == this_node,
                                  di.ArchiveFileCopy.has_file == "Y").tuples())

    missing_files = []
    corrupt_files = []

    missing_ids = []
    corrupt_ids = []

    nfiles = 0

    with click.progressbar(lfiles, label="Scanning files") as lfiles_iter:
        for filename, acqname, filesize, md5sum, fc_id in lfiles_iter:

            # Skip if not in specified acquisitions
            if len(acq) > 0 and acqname not in acq:
                continue

            nfiles += 1

            filepath = this_node.root + "/" + acqname + "/" + filename

            # Check if file is plain missing
            if not os.path.exists(filepath):
                missing_files.append(filepath)
                missing_ids.append(fc_id)
                continue

            if md5:
                file_md5 = di.md5sum_file(filepath)
                corrupt = file_md5 != md5sum
            else:
                corrupt = os.path.getsize(filepath) != filesize

            if corrupt:
                corrupt_files.append(filepath)
                corrupt_ids.append(fc_id)
                continue

    if len(missing_files) > 0:
        print()
        print("=== Missing files ===")
        for fname in missing_files:
            print(fname)

    if len(corrupt_files) > 0:
        print()
        print("=== Corrupt files ===")
        for fname in corrupt_files:
            print(fname)

    print()
    print("=== Summary ===")
    print(" %i total files" % nfiles)
    print(" %i missing files" % len(missing_files))
    print(" %i corrupt files" % len(corrupt_files))
    print()

    # Fix up the database by marking files as missing, and marking
    # corrupt files for verification by alpenhornd.
    if fixdb:

        # Make sure we connect RW
        di.connect_database(read_write=True)

        if (len(missing_files) > 0) and click.confirm("Fix missing files"):
            missing_count = (di.ArchiveFileCopy.update(has_file="N").where(
                di.ArchiveFileCopy.id << missing_ids).execute())
            print(" %i marked as missing" % missing_count)

        if (len(corrupt_files) > 0) and click.confirm("Fix corrupt files"):
            corrupt_count = (di.ArchiveFileCopy.update(has_file="M").where(
                di.ArchiveFileCopy.id << corrupt_ids).execute())
            print(" %i corrupt files marked for verification" % corrupt_count)
def update_node_requests(node):
    """Process file copy requests onto this node."""

    global done_transport_this_cycle

    # Ensure we are not on an HPSS node
    if is_hpss_node(node):
        log.error("Cannot process HPSS node here.")
        return

    avail_gb = node.avail_gb

    # Skip if node is too full
    if avail_gb < (node.min_avail_gb + 10):
        log.info("Node %s is nearly full. Skip transfers." % node.name)
        return

    # Calculate the total archive size from the database
    size_query = (di.ArchiveFile.select(fn.Sum(di.ArchiveFile.size_b)).join(
        di.ArchiveFileCopy).where(di.ArchiveFileCopy.node == node,
                                  di.ArchiveFileCopy.has_file == "Y"))
    size = size_query.scalar(as_tuple=True)[0]
    current_size_gb = float(0.0 if size is None else size) / 2**30.0

    # Stop if the current archive size is bigger than the maximum (if set,
    # i.e. > 0)
    if current_size_gb > node.max_total_gb and node.max_total_gb > 0.0:
        log.info(
            "Node %s has reached maximum size (current: %.1f GB, limit: %.1f GB)"
            % (node.name, current_size_gb, node.max_total_gb))
        return

    # ... OR if this is a transport node quit if the transport cycle is done.
    if node.storage_type == "T" and done_transport_this_cycle:
        log.info("Ignoring transport node %s" % node.name)
        return

    start_time = time.time()

    # Fetch requests to process from the database
    requests = di.ArchiveFileCopyRequest.select().where(
        ~di.ArchiveFileCopyRequest.completed,
        ~di.ArchiveFileCopyRequest.cancelled,
        di.ArchiveFileCopyRequest.group_to == node.group,
    )

    # Add in constraint that node_from cannot be an HPSS node
    requests = requests.join(
        di.StorageNode).where(di.StorageNode.address != "HPSS")

    for req in requests:

        # Only continue if the node is actually mounted
        if not req.node_from.mounted:
            continue

        # For transport disks we should only copy onto the transport
        # node if the from_node is local, this should prevent pointlessly
        # rsyncing across the network
        if node.storage_type == "T" and node.host != req.node_from.host:
            log.debug("Skipping request for %s/%s from remote node [%s] onto "
                      "local transport disks" %
                      (req.file.acq.name, req.file.name, req.node_from.name))
            continue

        # Only proceed if the source file actually exists (and is not
        # corrupted).
        try:
            di.ArchiveFileCopy.get(
                di.ArchiveFileCopy.file == req.file,
                di.ArchiveFileCopy.node == req.node_from,
                di.ArchiveFileCopy.has_file == "Y",
            )
        except pw.DoesNotExist:
            log.error("Skipping request for %s/%s since it is not available "
                      'on node "%s". [file_id=%i]' %
                      (req.file.acq.name, req.file.name, req.node_from.name,
                       req.file.id))
            continue

        # Only proceed if the destination file does not already exist.
        try:
            di.ArchiveFileCopy.get(
                di.ArchiveFileCopy.file == req.file,
                di.ArchiveFileCopy.node == node,
                di.ArchiveFileCopy.has_file == "Y",
            )
            log.info("Skipping request for %s/%s since it already exists on "
                     'this node ("%s"), and updating DB to reflect this.' %
                     (req.file.acq.name, req.file.name, node.name))
            di.ArchiveFileCopyRequest.update(completed=True).where(
                di.ArchiveFileCopyRequest.file == req.file).where(
                    di.ArchiveFileCopyRequest.group_to == node.group).execute()
            continue
        except pw.DoesNotExist:
            pass

        # Check that there is enough space available.
        if node.avail_gb * 2**30.0 < 2.0 * req.file.size_b:
            log.warning('Node "%s" is full: not adding datafile "%s/%s".' %
                        (node.name, req.file.acq.name, req.file.name))
            continue

        # Construct the origin and destination paths.
        from_path = "%s/%s/%s" % (req.node_from.root, req.file.acq.name,
                                  req.file.name)
        if req.node_from.host != node.host:
            from_path = "%s@%s:%s" % (
                req.node_from.username,
                req.node_from.address,
                from_path,
            )

        to_path = "%s/%s/" % (node.root, req.file.acq.name)
        if not os.path.isdir(to_path):
            log.info('Creating directory "%s".' % to_path)
            os.mkdir(to_path)

        # Giddy up!
        log.info('Transferring file "%s/%s".' %
                 (req.file.acq.name, req.file.name))
        st = time.time()

        # Attempt to transfer the file. Each of the methods below needs to set
        # a return code `ret` and give an `md5sum` of the transferred file.

        # First we need to check if we are copying over the network
        if req.node_from.host != node.host:

            # First try bbcp which is a fast multistream transfer tool. bbcp
            # can calculate the md5 hash as it goes, so we'll do that to save
            # doing it at the end.
            if command_available("bbcp"):
                cmd = "bbcp -f -z --port 4200 -W 4M -s 16 -o -E md5= %s %s" % (
                    from_path,
                    to_path,
                )
                ret, stdout, stderr = run_command(cmd.split())

                # Attempt to parse STDERR for the md5 hash
                if ret == 0:
                    mo = re.search("md5 ([a-f0-9]{32})", stderr)
                    if mo is None:
                        log.error(
                            "BBCP transfer has gone awry. STDOUT: %s\n STDERR: %s"
                            % (stdout, stderr))
                        ret = -1
                        md5sum = None
                    else:
                        md5sum = mo.group(1)
                else:
                    md5sum = None

            # Next try rsync over ssh. We need to explicitly calculate the md5
            # hash after the fact
            elif command_available("rsync"):
                cmd = ('rsync -z%s --rsync-path="ionice -c4 -n4 rsync" '
                       '-e "ssh -q" %s %s' % (RSYNC_FLAG, from_path, to_path))
                ret, stdout, stderr = run_command(cmd.split())

                md5sum = (di.md5sum_file("%s/%s" % (to_path, req.file.name))
                          if ret == 0 else None)

            # If we get here then we have no idea how to transfer the file...
            else:
                log.warning("No commands available to complete this transfer.")
                ret = -1

        # Okay, great we're just doing a local transfer.
        else:

            # First try to just hard link the file. This will only work if we
            # are on the same filesystem. As there's no actual copying it's
            # probably unnecessary to calculate the md5 check sum, so we'll
            # just fake it.
            try:
                link_path = "%s/%s/%s" % (node.root, req.file.acq.name,
                                          req.file.name)

                # Check explicitly if link already exists as this and
                # being unable to link will both raise OSError and get
                # confused.
                if os.path.exists(link_path):
                    log.error("File %s already exists. Clean up manually." %
                              link_path)
                    ret = -1
                else:
                    os.link(from_path, link_path)
                    ret = 0
                    # As we're linking the md5sum can't change. Skip the check
                    # here...
                    md5sum = req.file.md5sum

            # If we couldn't just link the file, try copying it with rsync.
            except OSError:
                if command_available("rsync"):
                    cmd = "rsync -%s %s %s" % (RSYNC_FLAG, from_path, to_path)
                    ret, stdout, stderr = run_command(cmd.split())

                    md5sum = (di.md5sum_file("%s/%s" %
                                             (to_path, req.file.name))
                              if ret == 0 else None)
                else:
                    log.warning(
                        "No commands available to complete this transfer.")
                    ret = -1

        # Check the return code...
        if ret:
            # If the copy didn't work, then the remote file may be corrupted.
            log.error("Copy failed. Marking source file suspect.")
            di.ArchiveFileCopy.update(has_file="M").where(
                di.ArchiveFileCopy.file == req.file,
                di.ArchiveFileCopy.node == req.node_from,
            ).execute()
            continue
        et = time.time()

        # Check integrity.
        if md5sum == req.file.md5sum:
            size_mb = req.file.size_b / 2**20.0
            trans_time = et - st
            rate = size_mb / trans_time
            log.info("Pull complete (md5sum correct). Transferred %.1f MB in "
                     "%i seconds [%.1f MB/s]" %
                     (size_mb, int(trans_time), rate))

            # Update the FileCopy (if exists), or insert a new FileCopy
            try:
                done = False
                while not done:
                    try:
                        fcopy = (di.ArchiveFileCopy.select().where(
                            di.ArchiveFileCopy.file == req.file,
                            di.ArchiveFileCopy.node == node,
                        ).get())
                        fcopy.has_file = "Y"
                        fcopy.wants_file = "Y"
                        fcopy.save()
                        done = True
                    except pw.OperationalError:
                        log.error("MySQL connection dropped. Will attempt to "
                                  "reconnect in five seconds.")
                        time.sleep(5)
                        di.connect_database(True)
            except pw.DoesNotExist:
                di.ArchiveFileCopy.insert(file=req.file,
                                          node=node,
                                          has_file="Y",
                                          wants_file="Y").execute()

            # Mark any FileCopyRequest for this file as completed
            di.ArchiveFileCopyRequest.update(completed=True).where(
                di.ArchiveFileCopyRequest.file == req.file).where(
                    di.ArchiveFileCopyRequest.group_to == node.group).execute()

            if node.storage_type == "T":
                # This node is getting the transport king.
                done_transport_this_cycle = True

            # Update local estimate of available space
            avail_gb = avail_gb - req.file.size_b / 2**30.0

        else:
            log.error('Error with md5sum check: %s on node "%s", but %s on '
                      'this node, "%s".' %
                      (req.file.md5sum, req.node_from.name, md5sum, node.name))
            log.error('Removing file "%s/%s".' % (to_path, req.file.name))
            try:
                os.remove("%s/%s" % (to_path, req.file.name))
            except OSError:
                log.error("Could not remove file.")

            # Since the md5sum failed, the remote file may be corrupted.
            log.error("Marking source file suspect.")
            di.ArchiveFileCopy.update(has_file="M").where(
                di.ArchiveFileCopy.file == req.file,
                di.ArchiveFileCopy.node == req.node_from,
            ).execute()

        if time.time() - start_time > max_time_per_node_operation:
            break  # Don't hog all the time.
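
# `command_available` and `run_command` are used above but defined elsewhere
# in the module. The definitions below are a minimal sketch of what such
# helpers could look like, using only the standard library; they are
# illustrative assumptions, not the module's actual implementations.
def command_available(cmd):
    """Return True if `cmd` can be found on the PATH (sketch)."""
    import shutil

    return shutil.which(cmd) is not None


def run_command(cmd):
    """Run `cmd` (a list of arguments); return (retcode, stdout, stderr) (sketch)."""
    import subprocess

    proc = subprocess.Popen(cmd,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    stdout, stderr = proc.communicate()
    return proc.returncode, stdout.decode(), stderr.decode()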
def _import_file(node, root, acq_name, file_name):
    """Import a file into the DB.

    This routine adds the following to the database, if they do not already
    exist (or might be corrupted).

    - The acquisition that the file is a part of.
    - Information on the acquisition, if it is of type "corr".
    - The file.
    - Information on the file, if it is of type "corr".
    - Indicates that the file exists on this node.
    """
    global import_done

    curr_done = True
    fullpath = "%s/%s/%s" % (root, acq_name, file_name)
    log.debug("Considering %s for import." % fullpath)

    # Skip the file if ch_master.py still has a lock on it.
    if os.path.isfile("%s/%s/.%s.lock" % (root, acq_name, file_name)):
        log.debug('Skipping "%s", which is locked by ch_master.py.' % fullpath)
        return

    # Parse the path
    try:
        ts, inst, atype = di.parse_acq_name(acq_name)
    except di.Validation:
        log.info("Skipping non-acquisition path %s." % acq_name)
        return

    if import_done is not None:
        i = bisect.bisect_left(import_done, fullpath)
        if i != len(import_done) and import_done[i] == fullpath:
            log.debug("Skipping already-registered file %s." % fullpath)
            return

    # Figure out which acquisition this is; add if necessary.
    try:
        acq = di.ArchiveAcq.get(di.ArchiveAcq.name == acq_name)
        log.debug('Acquisition "%s" already in DB. Skipping.' % acq_name)
    except pw.DoesNotExist:
        acq = add_acq(acq_name)
        if acq is None:
            return
        log.info('Acquisition "%s" added to DB.' % acq_name)

    # What kind of file do we have?
    ftype = di.detect_file_type(file_name)
    if ftype is None:
        log.info('Skipping unrecognised file "%s/%s".' % (acq_name, file_name))
        return

    # Make sure information about the acquisition exists in the DB.
    if atype == "corr" and ftype.name == "corr":
        if not acq.corrinfos.count():
            try:
                di.CorrAcqInfo.create(
                    acq=acq, **get_acqcorrinfo_keywords_from_h5(fullpath))
                log.info('Added information for correlator acquisition "%s" '
                         "to DB." % acq_name)
            except Exception:
                log.warning('Missing info for acquisition "%s": HDF5 datasets '
                            "empty. Leaving fields NULL." % (acq_name))
                di.CorrAcqInfo.create(acq=acq)
    elif atype == "hk" and ftype.name == "hk":
        try:
            keywords = get_acqhkinfo_keywords_from_h5("%s/%s" %
                                                      (root, acq_name))
        except Exception:
            log.warning("Could not open atmel_id.dat file. Skipping.")
            keywords = []
        for kw in keywords:
            if not sum(1 for _ in di.HKAcqInfo.select().where(
                    di.HKAcqInfo.acq == acq).where(
                        di.HKAcqInfo.atmel_name == kw["atmel_name"])):
                try:
                    di.HKAcqInfo.create(acq=acq, **kw)
                    log.info('Added information for housekeeping acquisition '
                             '"%s", board %s to DB.' %
                             (acq_name, kw["atmel_name"]))
                except Exception:
                    log.warning('Missing info for acquisition "%s": '
                                "atmel_id.dat file missing or corrupt. "
                                "Skipping this acquisition." % acq_name)
                    return
    elif atype == "rawadc":
        if not acq.rawadcinfos.count():
            di.RawadcAcqInfo.create(
                acq=acq, **get_acqrawadcinfo_keywords_from_h5(acq_name))
            log.info('Added information for raw ADC acquisition "%s" to DB.' %
                     acq_name)
    elif atype == "raw":
        try:
            raw_info = acq.rawinfos.get()
        except pw.DoesNotExist:
            try:
                raw_info = di.RawAcqInfo.create(
                    acq=acq,
                    **get_rawinfo_keywords("%s/%s" % (root, acq_name)))
                log.info('Added information for raw acquisition "%s" to DB.' %
                         acq_name)
            except Exception:
                log.info('Missing info in settings.dat for acquisition "%s". '
                         "Skipping this acquisition." % acq_name)
                return

    # Add the file, if necessary.
    try:
        file = di.ArchiveFile.get(di.ArchiveFile.name == file_name,
                                  di.ArchiveFile.acq == acq)
        size_b = file.size_b
        log.debug('File "%s/%s" already in DB. Skipping.' %
                  (acq_name, file_name))
    except pw.DoesNotExist:
        log.debug("Computing md5sum.")
        md5sum = di.md5sum_file(fullpath, cmd_line=True)
        size_b = os.path.getsize(fullpath)
        done = False
        while not done:
            try:
                file = di.ArchiveFile.create(acq=acq,
                                             type=ftype,
                                             name=file_name,
                                             size_b=size_b,
                                             md5sum=md5sum)
                done = True
            except pw.OperationalError:
                log.error("MySQL connection dropped. Will attempt to "
                          "reconnect in five seconds.")
                time.sleep(5)
                di.connect_database(True)
        log.info('File "%s/%s" added to DB.' % (acq_name, file_name))

    # Register the copy of the file here on the collection server, if (1) it
    # does not exist, or (2) it does exist but has been labelled as corrupt.
    # If (2), check again.
    if not file.copies.where(di.ArchiveFileCopy.node == node).count():
        copy = di.ArchiveFileCopy.create(file=file,
                                         node=node,
                                         has_file="Y",
                                         wants_file="Y")
        log.info('Registered file copy "%s/%s" to DB.' %
                 (acq_name, file_name))

    # Make sure information about the file exists in the DB.
    if ftype.name == "corr":

        # Add if (1) there is no corrinfo or (2) the corrinfo is missing.
        if not file.corrinfos.count():
            try:
                di.CorrFileInfo.create(
                    file=file, **get_filecorrinfo_keywords_from_h5(fullpath))
                log.info('Added information for file "%s/%s" to DB.' %
                         (acq_name, file_name))
            except Exception:
                if not file.corrinfos.count():
                    di.CorrFileInfo.create(file=file)
                log.warning('Missing info for file "%s/%s": HDF5 datasets '
                            "empty or unreadable. Leaving fields NULL." %
                            (acq_name, file_name))
        elif not file.corrinfos[0].start_time:
            try:
                i = file.corrinfos[0]
                k = get_filecorrinfo_keywords_from_h5(fullpath)
            except Exception:
                log.debug('Still missing info for file "%s/%s".' %
                          (acq_name, file_name))
            else:
                i.start_time = k["start_time"]
                i.finish_time = k["finish_time"]
                i.chunk_number = k["chunk_number"]
                i.freq_number = k["freq_number"]
                i.save()
                log.info('Added information for file "%s/%s" to DB.' %
                         (acq_name, file_name))

    if ftype.name == "hk":

        # Add if (1) there is no hkinfo or (2) the hkinfo is missing.
        if not file.hkinfos.count():
            try:
                di.HKFileInfo.create(
                    file=file, **get_filehkinfo_keywords_from_h5(fullpath))
                log.info('Added information for file "%s/%s" to DB.' %
                         (acq_name, file_name))
            except Exception:
                if not file.hkinfos.count():
                    di.HKFileInfo.create(file=file)
                log.warning('Missing info for file "%s/%s": HDF5 datasets '
                            "empty or unreadable. Leaving fields NULL." %
                            (acq_name, file_name))
        elif not file.hkinfos[0].start_time:
            try:
                i = file.hkinfos[0]
                k = get_filehkinfo_keywords_from_h5(fullpath)
            except Exception:
                log.debug('Still missing info for file "%s/%s".' %
                          (acq_name, file_name))
            else:
                i.start_time = k["start_time"]
                i.finish_time = k["finish_time"]
                i.atmel_name = k["atmel_name"]
                i.chunk_number = k["chunk_number"]
                i.save()
                log.info('Added information for file "%s/%s" to DB.' %
                         (acq_name, file_name))

    if ftype.name == "weather":

        # Add if (1) there is no weatherinfo or (2) the weatherinfo is missing.
        if not file.weatherinfos.count():
            # try:
            di.WeatherFileInfo.create(
                file=file, **get_fileweatherinfo_keywords_from_h5(fullpath))
            log.info('Added information for file "%s/%s" to DB.' %
                     (acq_name, file_name))
            # except:
            #     if not file.weatherinfos.count():
            #         di.WeatherFileInfo.create(file=file)
            #     log.warning('Missing info for file "%s/%s": HDF5 datasets '
            #                 "empty or unreadable. Leaving fields NULL." %
            #                 (acq_name, file_name))
        elif not file.weatherinfos[0].start_time:
            try:
                i = file.weatherinfos[0]
                k = get_fileweatherinfo_keywords_from_h5(fullpath)
            except Exception:
                log.debug('Still missing info for file "%s/%s".' %
                          (acq_name, file_name))
            else:
                i.start_time = k["start_time"]
                i.finish_time = k["finish_time"]
                i.date = k["date"]
                i.save()
                log.info('Added information for file "%s/%s" to DB.' %
                         (acq_name, file_name))

    if ftype.name == "raw":

        # Add if (1) there is no rawinfo or (2) the rawinfo is missing.
        if not file.rawinfos.count():
            try:
                di.RawFileInfo.create(file=file,
                                      **get_filerawinfo_keywords(
                                          raw_info, size_b, file_name))
                log.info('Added information for file "%s/%s" to DB.' %
                         (acq_name, file_name))
            except Exception:
                if not file.rawinfos.count():
                    di.RawFileInfo.create(file=file)
                log.warning('Missing info for file "%s/%s". Leaving fields '
                            "NULL." % (acq_name, file_name))
        elif not file.rawinfos[0].start_time:
            try:
                i = file.rawinfos[0]
                k = get_filerawinfo_keywords(raw_info, size_b, file_name)
            except Exception:
                log.debug('Still missing info for file "%s/%s".' %
                          (acq_name, file_name))
            else:
                i.start_time = k["start_time"]
                i.chunk_number = k["chunk_number"]
                i.save()
                log.info('Added information for file "%s/%s" to DB.' %
                         (acq_name, file_name))

    if import_done is not None:
        bisect.insort_left(import_done, fullpath)
        with open(LOCAL_IMPORT_RECORD, "w") as fp:
            fp.write("\n".join(import_done))
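
# `import_done` above is maintained as a sorted list so that the membership
# test and the insertion in `_import_file` stay O(log n) via `bisect`. The two
# helpers below are an illustrative sketch of that pattern only (they are not
# used by the existing code); `record` stands in for the sorted list.
def _record_contains(record, path):
    """Return True if `path` is already present in the sorted list `record`."""
    i = bisect.bisect_left(record, path)
    return i != len(record) and record[i] == path


def _record_add(record, path):
    """Insert `path` into the sorted list `record`, keeping it sorted."""
    if not _record_contains(record, path):
        bisect.insort_left(record, path)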
def setUp(self):
    try:
        di.connect_database(read_write=False)
    except:
        raise unittest.SkipTest("Skipping test as couldn't connect to db.")
"""Call backs for the HPSS interface. """ import click import peewee as pw from ch_util import data_index as di from . import logger # Import logger here to avoid connection # messages for transfer # Get a reference to the log log = logger.get_log() # Reconnect to the database read/write di.connect_database(read_write=True) @click.group() def cli(): """Call back commands for updating the database from a shell script after an HPSS transfer.""" @cli.command() @click.argument("file_id", type=int) @click.argument("node_id", type=int) def push_failed(file_id, node_id): """Update the database to reflect that the HPSS transfer failed. INTERNAL COMMAND. NOT FOR HUMAN USE!