Exemple #1
0
def load_lcms_files(mzml_files):
    """Parse mzML files and load them into LcmsRun objects.

    Note: This should be done automatically for runs in
    /project/projectdirs/metatlas/raw_data/<username>

    Parameters
    ----------
    mzml_files: list of str
       List of paths to mzml_files.

    Returns
    -------
    runs: list
       List of LcmsRun objects.
    """
    runs = []
    for fname in mzml_files:
        hdf5_file = fname.replace('.mzML', '.h5')
        if os.path.exists(hdf5_file):
            print('File already exists: %s' % hdf5_file)
            continue
        try:
            from metatlas import LcmsRun, mzml_to_hdf
            hdf_file = mzml_to_hdf(fname)
            user = getpwuid(os.stat(fname).st_uid).pw_name
            filename = os.path.splitext(os.path.basename(fname))[0]
            dirname = os.path.dirname(fname)
            experiment = os.path.basename(dirname)
            description = experiment + ' ' + filename
            ctime = os.stat(fname).st_ctime
            run = LcmsRun(name=filename,
                          description=description,
                          created_by=user,
                          modified_by=user,
                          created=ctime,
                          last_modified=ctime,
                          mzml_file=fname,
                          hdf5_file=hdf_file)
            runs.append(run)
        except Exception as e:
            print(e)
    store(runs)
    return runs
Exemple #2
0
def update_metatlas(directory):
    readonly_files = defaultdict(set)
    other_errors = defaultdict(list)
    directory = os.path.abspath(directory)

    # Sleep a random amount of time to avoid running at the same time as
    # other processes.
    time.sleep(random.random() * 2)
    mzml_files = check_output('find %s -name "*.mzML"' % directory, shell=True)
    mzml_files = mzml_files.decode('utf-8').splitlines()

    # Find valid h5 files newer than the format version timestamp.
    delta = int((time.time() - VERSION_TIMESTAMP) / 60)
    check = 'find %s -name "*.h5" -mmin -%s -size +2k' % (directory, delta)
    valid_files = check_output(check, shell=True).decode('utf-8').splitlines()
    valid_files = set(valid_files)

    new_files = []
    for mzml_file in mzml_files:
        if mzml_file.replace('.mzML', '.h5') not in valid_files:
            new_files.append(mzml_file)

    patt = re.compile(
        r".+\/raw_data\/(?P<username>[^/]+)\/(?P<experiment>[^/]+)\/(?P<path>.+)"
    )

    sys.stdout.write('Found %s files\n' % len(new_files))
    sys.stdout.flush()

    for (ind, fname) in enumerate(new_files):
        sys.stdout.write('(%s of %s): %s\n' % (ind + 1, len(new_files), fname))
        sys.stdout.flush()

        # Get relevant information about the file.
        info = patt.match(os.path.abspath(fname))
        if info:
            info = info.groupdict()
        else:
            sys.stdout.write("Invalid path name: %s\n" % fname)
            sys.stdout.flush()
            continue
        dirname = os.path.dirname(fname)
        try:
            username = pwd.getpwuid(os.stat(fname).st_uid).pw_name
        except OSError:
            try:
                username = pwd.getpwuid(os.stat(dirname).st_uid).pw_name
            except Exception:
                username = info['username']

        # Change to read only.
        try:
            os.chmod(fname, 0o660)
        except Exception as e:
            sys.stderr.write(str(e) + '\n')
            sys.stderr.flush()

        # Copy the original file to a pasteur backup.
        if os.environ['USER'] == 'pasteur':
            pasteur_path = fname.replace('raw_data', 'pasteur_backup')
            dname = os.path.dirname(pasteur_path)
            if not os.path.exists(dname):
                os.makedirs(dname)
            try:
                shutil.copy(fname, pasteur_path)
            except IOError as e:
                readonly_files[username].add(dirname)
                continue

        # Get a lock on the mzml file to prevent interference.
        try:
            fid = open(fname, 'r')
            fcntl.flock(fid, fcntl.LOCK_EX | fcntl.LOCK_NB)
        except IOError:
            fid.close()
            msg = '%s already converting in another process\n' % fname
            sys.stderr.write(msg)
            sys.stderr.flush()
            continue

        # Convert to HDF and store the entry in the database.
        try:
            hdf5_file = fname.replace('mzML', 'h5')
            mzml_to_hdf(fname, hdf5_file, True)
            os.chmod(hdf5_file, 0o660)
            description = info['experiment'] + ' ' + info['path']
            ctime = os.stat(fname).st_ctime
            # Add this to the database unless it is already there
            try:
                runs = retrieve('lcmsrun', username='******', mzml_file=fname)
            except Exception:
                runs = list()
            if not len(runs):
                run = LcmsRun(name=info['path'],
                              description=description,
                              username=info['username'],
                              experiment=info['experiment'],
                              creation_time=ctime,
                              last_modified=ctime,
                              mzml_file=fname,
                              hdf5_file=hdf5_file)
                store(run)
        except Exception as e:
            if 'exists but it can not be written' in str(e):
                readonly_files[username].add(dirname)
            else:
                msg = traceback.format_exception(*sys.exc_info())
                msg.insert(0, 'Cannot convert %s' % fname)
                other_errors[info['username']].append('\n'.join(msg))
            sys.stderr.write(str(e) + '\n')
            sys.stderr.flush()
            try:
                os.remove(hdf5_file)
            except:
                pass
        finally:
            fid.close()

    # Handle errors.
    from metatlas.metatlas_objects import find_invalid_runs
    invalid_runs = find_invalid_runs(_override=True)

    if readonly_files:
        for (username, dirnames) in readonly_files.items():
            body = ("Please log in to NERSC and run 'chmod 777' on the "
                    "following directories:\n%s" % ('\n'.join(dirnames)))
            send_mail('Metatlas Files are Inaccessible', username, body)
    if invalid_runs:
        grouped = defaultdict(list)
        for run in invalid_runs:
            grouped[run.username].append(run.mzml_file)
        for (username, filenames) in grouped.items():
            body = 'You have runs that are not longer accessible\n'
            body += 'To remove them from the database, run the following on ipython.nersc.gov:\n\n'
            body += 'from metatlas.metatlas_objects import find_invalid_runs, remove_objects\n'
            body += 'remove_objects(find_invalid_runs())\n\n'
            body += 'The invalid runs are:\n%s' % ('\n'.join(filenames))
            send_mail('Metatlas Runs are Invalid', username, body)
    if other_errors:
        for (username, errors) in other_errors.items():
            body = 'Errored files found while loading in Metatlas files:\n\n%s' % '\n********************************\n'.join(
                errors)
            send_mail('Errors loading Metatlas files', username, body)
    sys.stdout.write('Done!\n')
    sys.stdout.flush()
def update_metatlas(directory):
    readonly_files = defaultdict(set)
    other_errors = defaultdict(list)
    directory = os.path.abspath(directory)

    # Sleep a random amount of time to avoid running at the same time as
    # other processes.
    time.sleep(random.random() * 2)
    mzml_files = check_output('find %s -name "*.mzML"' % directory, shell=True)
    mzml_files = mzml_files.decode('utf-8').splitlines()

    # Find valid h5 files newer than the format version timestamp.
    delta = int((time.time() - VERSION_TIMESTAMP) / 60)
    check = 'find %s -name "*.h5" -mmin -%s -size +2k' % (directory, delta)
    valid_files = check_output(check, shell=True).decode('utf-8').splitlines()
    valid_files = set(valid_files)

    new_files = []
    for mzml_file in mzml_files:
        if mzml_file.replace('.mzML', '.h5') not in valid_files:
            new_files.append(mzml_file)



    patt = re.compile(r".+\/raw_data\/(?P<username>[^/]+)\/(?P<experiment>[^/]+)\/(?P<path>.+)")

    sys.stdout.write('Found %s files\n' % len(new_files))
    sys.stdout.flush()


    for (ind, fname) in enumerate(new_files):
        sys.stdout.write('(%s of %s): %s\n' % (ind + 1, len(new_files), fname))
        sys.stdout.flush()

        # Get relevant information about the file.
        info = patt.match(os.path.abspath(fname))
        if info:
            info = info.groupdict()
        else:
            sys.stdout.write("Invalid path name: %s\n" % fname)
            sys.stdout.flush()
            continue
        dirname = os.path.dirname(fname)
        try:
            username = pwd.getpwuid(os.stat(fname).st_uid).pw_name
        except OSError:
            try:
                username = pwd.getpwuid(os.stat(dirname).st_uid).pw_name
            except Exception:
                username = info['username']

        # Change to read only.
        try:
            os.chmod(fname, 0o660)
        except Exception as e:
            sys.stderr.write(str(e) + '\n')
            sys.stderr.flush()

        # Copy the original file to a pasteur backup.
        if os.environ['USER'] == 'pasteur':
            pasteur_path = fname.replace('raw_data', 'pasteur_backup')
            dname = os.path.dirname(pasteur_path)
            if not os.path.exists(dname):
                os.makedirs(dname)
            try:
                shutil.copy(fname, pasteur_path)
            except IOError as e:
                readonly_files[username].add(dirname)
                continue

        # Get a lock on the mzml file to prevent interference.
        try:
            fid = open(fname, 'r')
            fcntl.flock(fid, fcntl.LOCK_EX | fcntl.LOCK_NB)
        except IOError:
            fid.close()
            msg = '%s already converting in another process\n' % fname
            sys.stderr.write(msg)
            sys.stderr.flush()
            continue

        # Convert to HDF and store the entry in the database.
        try:
            hdf5_file = fname.replace('mzML', 'h5')
            mzml_to_hdf(fname, hdf5_file, True)
            os.chmod(hdf5_file, 0o660)
            description = info['experiment'] + ' ' + info['path']
            ctime = os.stat(fname).st_ctime
            # Add this to the database unless it is already there
            try:
                runs = retrieve('lcmsrun', username='******', mzml_file=fname)
            except Exception:
                runs = list()
            if not len(runs):
                run = LcmsRun(name=info['path'], description=description,
                              username=info['username'],
                              experiment=info['experiment'],
                              creation_time=ctime, last_modified=ctime,
                              mzml_file=fname, hdf5_file=hdf5_file)
                store(run)
        except Exception as e:
            if 'exists but it can not be written' in str(e):
                readonly_files[username].add(dirname)
            else:
                msg = traceback.format_exception(*sys.exc_info())
                msg.insert(0, 'Cannot convert %s' % fname)
                other_errors[info['username']].append('\n'.join(msg))
            sys.stderr.write(str(e) + '\n')
            sys.stderr.flush()
            try:
                os.remove(hdf5_file)
            except:
                pass
        finally:
            fid.close()

    # Handle errors.
    from metatlas.metatlas_objects import find_invalid_runs
    invalid_runs = find_invalid_runs(_override=True)

    if readonly_files:
        for (username, dirnames) in readonly_files.items():
            body = ("Please log in to NERSC and run 'chmod 777' on the "
                   "following directories:\n%s" % ('\n'.join(dirnames)))
            send_mail('Metatlas Files are Inaccessible', username, body)
    if invalid_runs:
        grouped = defaultdict(list)
        for run in invalid_runs:
            grouped[run.username].append(run.mzml_file)
        for (username, filenames) in grouped.items():
            body = 'You have runs that are not longer accessible\n'
            body += 'To remove them from the database, run the following on ipython.nersc.gov:\n\n'
            body += 'from metatlas.metatlas_objects import find_invalid_runs, remove_objects\n'
            body += 'remove_objects(find_invalid_runs())\n\n'
            body += 'The invalid runs are:\n%s' % ('\n'.join(filenames))
            send_mail('Metatlas Runs are Invalid', username, body)
    if other_errors:
        for (username, errors) in other_errors.items():
            body = 'Errored files found while loading in Metatlas files:\n\n%s' % '\n********************************\n'.join(errors)
            send_mail('Errors loading Metatlas files', username, body)
    sys.stdout.write('Done!\n')
    sys.stdout.flush()