Example #1
0
def get_files(conn):
    """Find all the files used by the experiment by reading the trace.

    :param conn: Connection to the trace database; the ``processes``,
        ``executed_files`` and ``opened_files`` tables are queried.
    :return: A tuple ``(files, inputs, outputs)``. ``files`` is the set of
        `TracedFile` objects that are neither ``WRITTEN`` nor located under
        one of ``magic_dirs``. ``inputs`` and ``outputs`` are lists with one
        entry per run, each entry being the list of paths selected for that
        run.
    """
    files = {}
    access_files = [set()]

    def add_read_file(path):
        # Registers `path` as a read file, unless it is already known
        if path not in files:
            traced = TracedFile(path)
            traced.read()
            files[traced.path] = traced

    def under_any(path, directories):
        # Whether `path` lies under at least one of `directories`
        return any(path.lies_under(d) for d in directories)

    # Finds run timestamps, so we can sort input/output files by run
    runs_query = '''
        SELECT timestamp
        FROM processes
        WHERE parent ISNULL
        ORDER BY id;
        '''
    proc_cursor = conn.cursor()
    try:
        executions = proc_cursor.execute(runs_query)
        # The first timestamp is dropped: access_files already starts with
        # one set, and every remaining timestamp opens a new one
        run_timestamps = [r_timestamp for r_timestamp, in executions][1:]
    finally:
        # Close the cursor even if the query or the iteration fails
        proc_cursor.close()

    # Adds dynamic linkers
    for libdir in (Path('/lib'), Path('/lib64')):
        if libdir.exists():
            for linker in libdir.listdir('*ld-linux*'):
                for filename in find_all_links(linker, True):
                    add_read_file(filename)

    # Loops on executed files, and opened files, at the same time
    events_query = '''
        SELECT 'exec' AS event_type, name, NULL AS mode, timestamp
        FROM executed_files
        UNION ALL
        SELECT 'open' AS event_type, name, mode, timestamp
        FROM opened_files
        ORDER BY timestamp;
        '''
    executed = set()
    cur = conn.cursor()
    try:
        rows = cur.execute(events_query)
        for event_type, r_name, r_mode, r_timestamp in rows:
            is_exec = event_type == 'exec'
            if is_exec:
                # Executed files have no mode in the trace (NULL)
                r_mode = FILE_READ
            r_name = Path(r_name)

            if is_exec:
                executed.add(r_name)

            # Stays on the current run
            while run_timestamps and r_timestamp > run_timestamps[0]:
                del run_timestamps[0]
                access_files.append(set())

            # Adds symbolic links as read files
            links_from = r_name.parent if r_mode & FILE_LINK else r_name
            for filename in find_all_links(links_from, False):
                add_read_file(filename)

            # Go to final target
            if not r_mode & FILE_LINK:
                r_name = r_name.resolve()
            if is_exec:
                # The checks against `executed` below use resolved paths, so
                # the resolved target has to be recorded too; otherwise a
                # program run through a symlink would be reported as an
                # input file
                executed.add(r_name)

            if r_name not in files:
                f = TracedFile(r_name)
                files[f.path] = f
            else:
                f = files[r_name]
            if r_mode & FILE_WRITE:
                f.write()
                # Mark the parent directory as read
                add_read_file(r_name.parent)
            elif r_mode & FILE_READ:
                f.read()

            # Identifies input files
            if r_name.is_file() and r_name not in executed:
                access_files[-1].add(f)
    finally:
        # Close the cursor even if processing a row fails
        cur.close()

    # Built once instead of once per candidate file
    excluded_dirs = magic_dirs + system_dirs

    # Further filters input files
    inputs = [
        [
            fi.path for fi in lst
            # Input files are regular files,
            if fi.path.is_file() and
            # ONLY_READ,
            fi.what == TracedFile.ONLY_READ and
            # not executed,
            # FIXME : filtering on the executable permission bits is
            # currently disabled; only executed files are removed
            fi.path not in executed and
            # not in a system directory
            not under_any(fi.path, excluded_dirs)
        ] for lst in access_files
    ]

    # Identify output files
    outputs = [
        [
            fi.path for fi in lst
            # Output files are regular files,
            if fi.path.is_file() and
            # WRITTEN
            fi.what == TracedFile.WRITTEN and
            # not in a system directory
            not under_any(fi.path, excluded_dirs)
        ] for lst in access_files
    ]

    # Displays a warning for READ_THEN_WRITTEN files
    read_then_written_files = [
        fi for fi in itervalues(files)
        if fi.what == TracedFile.READ_THEN_WRITTEN and
        not under_any(fi.path, magic_dirs)
    ]
    if read_then_written_files:
        logging.warning(
            "Some files were read and then written. We will only pack the "
            "final version of the file; reproducible experiments shouldn't "
            "change their input files:\n%s",
            ", ".join(unicode_(fi.path) for fi in read_then_written_files))

    # Files that were only written are not returned in the set of files
    used_files = set(fi for fi in itervalues(files)
                     if fi.what != TracedFile.WRITTEN and
                     not under_any(fi.path, magic_dirs))
    return used_files, inputs, outputs
Example #2
0
def get_files(conn):
    """Find all the files used by the experiment by reading the trace.

    The ``processes`` table gives the timestamps used to split accesses by
    run; ``executed_files`` and ``opened_files`` are then walked together in
    timestamp order to classify every file that was touched.

    :param conn: Connection to the trace database.
    :return: ``(files, inputs, outputs)`` where ``files`` is a set of
        `TracedFile` objects (everything that is not ``WRITTEN`` and not
        under ``magic_dirs``), and ``inputs`` / ``outputs`` each hold one
        list of paths per run.
    """
    # sqlite3-style cursors are not context managers, hence closing()
    from contextlib import closing

    files = {}
    access_files = [set()]

    # Finds run timestamps, so we can sort input/output files by run
    select_runs = '''
        SELECT timestamp
        FROM processes
        WHERE parent ISNULL
        ORDER BY id;
        '''
    with closing(conn.cursor()) as proc_cursor:
        executions = proc_cursor.execute(select_runs)
        # access_files already has a set for the first run, so only the
        # following timestamps are kept as run boundaries
        run_timestamps = [r_timestamp for r_timestamp, in executions][1:]

    # Adds dynamic linkers
    for libdir in (Path('/lib'), Path('/lib64')):
        if not libdir.exists():
            continue
        for linker in libdir.listdir('*ld-linux*'):
            for filename in find_all_links(linker, True):
                if filename not in files:
                    f = TracedFile(filename)
                    f.read()
                    files[f.path] = f

    # Loops on executed files, and opened files, at the same time
    select_events = '''
        SELECT 'exec' AS event_type, name, NULL AS mode, timestamp
        FROM executed_files
        UNION ALL
        SELECT 'open' AS event_type, name, mode, timestamp
        FROM opened_files
        ORDER BY timestamp;
        '''
    executed = set()
    # closing() guarantees the cursor is released even if a row fails
    with closing(conn.cursor()) as cur:
        for event_type, r_name, r_mode, r_timestamp in cur.execute(
                select_events):
            if event_type == 'exec':
                # Executed files carry no mode in the trace (NULL)
                r_mode = FILE_READ
            traced_name = Path(r_name)

            # Stays on the current run
            while run_timestamps and r_timestamp > run_timestamps[0]:
                del run_timestamps[0]
                access_files.append(set())

            # Adds symbolic links as read files
            for filename in find_all_links(traced_name.parent
                                           if r_mode & FILE_LINK
                                           else traced_name, False):
                if filename not in files:
                    f = TracedFile(filename)
                    f.read()
                    files[f.path] = f

            # Go to final target
            if r_mode & FILE_LINK:
                r_name = traced_name
            else:
                r_name = traced_name.resolve()

            if event_type == 'exec':
                # Later membership tests use the resolved path, so the
                # resolved target is recorded next to the traced name;
                # recording only the traced name let programs executed
                # through a symlink show up as input files
                executed.add(traced_name)
                executed.add(r_name)

            if r_name not in files:
                f = TracedFile(r_name)
                files[f.path] = f
            else:
                f = files[r_name]
            if r_mode & FILE_WRITE:
                f.write()
                # Mark the parent directory as read
                if r_name.parent not in files:
                    fp = TracedFile(r_name.parent)
                    fp.read()
                    files[fp.path] = fp
            elif r_mode & FILE_READ:
                f.read()

            # Identifies input files
            if r_name.is_file() and r_name not in executed:
                access_files[-1].add(f)

    # Concatenated once rather than for every candidate file
    non_user_dirs = magic_dirs + system_dirs

    # Further filters input files
    inputs = [[fi.path
               for fi in lst
               # Input files are regular files,
               if fi.path.is_file() and
               # ONLY_READ,
               fi.what == TracedFile.ONLY_READ and
               # not executed,
               # FIXME : the executable-bit check is currently disabled;
               # only executed files are removed
               fi.path not in executed and
               # not in a system directory
               not any(fi.path.lies_under(m) for m in non_user_dirs)]
              for lst in access_files]

    # Identify output files
    outputs = [[fi.path
                for fi in lst
                # Output files are regular files,
                if fi.path.is_file() and
                # WRITTEN
                fi.what == TracedFile.WRITTEN and
                # not in a system directory
                not any(fi.path.lies_under(m) for m in non_user_dirs)]
               for lst in access_files]

    # Displays a warning for READ_THEN_WRITTEN files
    read_then_written_files = [
        fi
        for fi in itervalues(files)
        if fi.what == TracedFile.READ_THEN_WRITTEN and
        not any(fi.path.lies_under(m) for m in magic_dirs)]
    if read_then_written_files:
        logging.warning(
            "Some files were read and then written. We will only pack the "
            "final version of the file; reproducible experiments shouldn't "
            "change their input files:\n%s",
            ", ".join(unicode_(fi.path) for fi in read_then_written_files))

    # Files that were only written are left out of the returned set
    files = set(
        fi
        for fi in itervalues(files)
        if fi.what != TracedFile.WRITTEN and
        not any(fi.path.lies_under(m) for m in magic_dirs))
    return files, inputs, outputs