Exemple #1
0
def find_all_links_recursive(filename, files):
    """Record in `files` every symbolic link met while walking `filename`.

    The path is rebuilt one component at a time. Each time the path built so
    far turns out to be a link, the link itself is added to `files`, its
    target is walked recursively (unless the target is already in `files`),
    and the walk continues from the resolved location.

    :param filename: path to walk; its first component is skipped
        (presumably the root -- confirm against the caller).
    :param files: set of paths, updated in place with the links found.
    :returns: the path rebuilt from the components, with links resolved.
    """
    # Invariant: `resolved` is canonical, all links in it have been resolved
    resolved = Path('/')
    for component in filename.components[1:]:
        candidate = resolved / component

        # Plain component: the invariant still holds, keep going
        if not candidate.is_link():
            resolved = candidate
            continue

        # The link itself is part of the result
        files.add(candidate)

        # The target might itself go through a number of symlinks
        link_target = candidate.read_link(absolute=True)
        if link_target not in files:
            find_all_links_recursive(link_target, files)

        # Restores the invariant; this might resolve several links at once
        resolved = candidate.resolve()
    return resolved
Exemple #2
0
def find_all_links_recursive(filename, files):
    """Record in `files` every symbolic link met while walking `filename`.

    The path is rebuilt one component at a time. Each time the path built so
    far turns out to be a link, the link itself is added to `files`, its
    target is walked recursively (unless the target is already in `files`),
    and the walk continues from the resolved location.

    :param filename: path to walk; its first component is skipped
        (presumably the root -- confirm against the caller).
    :param files: set of paths, updated in place with the links found.
    :returns: the path rebuilt from the components, with links resolved.
    """
    # Invariant: `resolved` is canonical, all links in it have been resolved
    resolved = Path('/')
    for component in filename.components[1:]:
        candidate = resolved / component

        # Plain component: the invariant still holds, keep going
        if not candidate.is_link():
            resolved = candidate
            continue

        # The link itself is part of the result
        files.add(candidate)

        # The target might itself go through a number of symlinks
        link_target = candidate.read_link(absolute=True)
        if link_target not in files:
            find_all_links_recursive(link_target, files)

        # Restores the invariant; this might resolve several links at once
        resolved = candidate.resolve()
    return resolved
Exemple #3
0
def get_files(conn):
    """Find all the files used by the experiment by reading the trace.

    The dynamic linkers found under ``/lib`` and ``/lib64`` are always
    included. The executed and opened files recorded in the trace are then
    replayed in timestamp order: the symbolic links leading to each file are
    marked as read, and the file itself is marked as written or read
    depending on its access mode.

    :param conn: database connection to the trace; the ``processes``,
        ``executed_files`` and ``opened_files`` tables are queried.
    :returns: a ``(files, inputs, outputs)`` tuple. ``files`` is the set of
        `TracedFile` objects that are not write-only and do not lie under
        one of `magic_dirs`. ``inputs`` and ``outputs`` are lists with one
        list of paths per run.
    """
    files = {}
    # One set of accessed files per run; the first run is open from the start
    access_files = [set()]

    # Finds run timestamps, so we can sort input/output files by run
    proc_cursor = conn.cursor()
    executions = proc_cursor.execute(
        '''
        SELECT timestamp
        FROM processes
        WHERE parent ISNULL
        ORDER BY id;
        ''')
    # The first root process needs no boundary: only later ones start a new
    # run
    run_timestamps = [r_timestamp for r_timestamp, in executions][1:]
    proc_cursor.close()

    # Adds dynamic linkers
    for libdir in (Path('/lib'), Path('/lib64')):
        if libdir.exists():
            for linker in libdir.listdir('*ld-linux*'):
                for filename in find_all_links(linker, True):
                    if filename not in files:
                        f = TracedFile(filename)
                        f.read()
                        files[f.path] = f

    # Loops on executed files, and opened files, at the same time
    cur = conn.cursor()
    rows = cur.execute(
        '''
        SELECT 'exec' AS event_type, name, NULL AS mode, timestamp
        FROM executed_files
        UNION ALL
        SELECT 'open' AS event_type, name, mode, timestamp
        FROM opened_files
        ORDER BY timestamp;
        ''')
    executed = set()
    for event_type, r_name, r_mode, r_timestamp in rows:
        # Executed files have no recorded mode; they count as read
        if event_type == 'exec':
            r_mode = FILE_READ
        r_name = Path(r_name)

        # Stays on the current run
        while run_timestamps and r_timestamp > run_timestamps[0]:
            del run_timestamps[0]
            access_files.append(set())

        # Adds symbolic links as read files
        for filename in find_all_links(r_name.parent if r_mode & FILE_LINK
                                       else r_name, False):
            if filename not in files:
                f = TracedFile(filename)
                f.read()
                files[f.path] = f
        # Go to final target
        if not r_mode & FILE_LINK:
            r_name = r_name.resolve()
        # Executed files are recorded under their resolved path, because the
        # membership tests below compare against resolved paths; recording
        # the name as traced would miss executables run through a symlink
        if event_type == 'exec':
            executed.add(r_name)
        if r_name not in files:
            f = TracedFile(r_name)
            files[f.path] = f
        else:
            f = files[r_name]
        if r_mode & FILE_WRITE:
            f.write()
            # Mark the parent directory as read
            if r_name.parent not in files:
                fp = TracedFile(r_name.parent)
                fp.read()
                files[fp.path] = fp
        elif r_mode & FILE_READ:
            f.read()

        # Identifies input files
        if r_name.is_file() and r_name not in executed:
            access_files[-1].add(f)
    cur.close()

    # Further filters input files
    inputs = [[fi.path
               for fi in lst
               # Input files are regular files,
               if fi.path.is_file() and
               # ONLY_READ,
               fi.what == TracedFile.ONLY_READ and
               # not executable,
               # FIXME : currently disabled; only remove executed files
               # not fi.path.stat().st_mode & 0b111 and
               fi.path not in executed and
               # not in a system directory
               not any(fi.path.lies_under(m)
                       for m in magic_dirs + system_dirs)]
              for lst in access_files]

    # Identify output files
    outputs = [[fi.path
                for fi in lst
                # Output files are regular files,
                if fi.path.is_file() and
                # WRITTEN
                fi.what == TracedFile.WRITTEN and
                # not in a system directory
                not any(fi.path.lies_under(m)
                        for m in magic_dirs + system_dirs)]
               for lst in access_files]

    # Displays a warning for READ_THEN_WRITTEN files
    read_then_written_files = [
        fi
        for fi in itervalues(files)
        if fi.what == TracedFile.READ_THEN_WRITTEN and
        not any(fi.path.lies_under(m) for m in magic_dirs)]
    if read_then_written_files:
        logging.warning(
            "Some files were read and then written. We will only pack the "
            "final version of the file; reproducible experiments shouldn't "
            "change their input files:\n%s",
            ", ".join(unicode_(fi.path) for fi in read_then_written_files))

    # Write-only files and files under magic_dirs are left out of the result
    files = set(
        fi
        for fi in itervalues(files)
        if fi.what != TracedFile.WRITTEN and not any(fi.path.lies_under(m)
                                                     for m in magic_dirs))
    return files, inputs, outputs
Exemple #4
0
def get_files(conn):
    """Find all the files used by the experiment by reading the trace.

    The dynamic linkers found under ``/lib`` and ``/lib64`` are always
    included. The executed and opened files recorded in the trace are then
    replayed in timestamp order, and every access is recorded on the
    matching `TracedFile` together with the index of the run it belongs to.

    :param conn: database connection to the trace; the ``processes``,
        ``executed_files`` and ``opened_files`` tables are queried.
    :returns: a ``(files, inputs, outputs)`` tuple. ``files`` is the set of
        `TracedFile` objects that are not write-only and do not lie under
        one of `magic_dirs`. ``inputs`` and ``outputs`` are lists with one
        list of paths per run.
    """
    def lies_under_any(path, directories):
        # True if `path` is located under at least one of `directories`
        return any(path.lies_under(d) for d in directories)

    files = {}
    # One set of accessed files per run; the first run is open from the start
    access_files = [set()]

    # Root processes, in order: each one after the first starts a new run
    proc_cursor = conn.cursor()
    root_rows = proc_cursor.execute('''
        SELECT timestamp
        FROM processes
        WHERE parent ISNULL
        ORDER BY id;
        ''')
    run_timestamps = [row_ts for row_ts, in root_rows][1:]
    proc_cursor.close()

    # Dynamic linkers, along with the links leading to them
    for libdir in (Path('/lib'), Path('/lib64')):
        if not libdir.exists():
            continue
        for linker in libdir.listdir('*ld-linux*'):
            for linker_path in find_all_links(linker, True):
                if linker_path in files:
                    continue
                linker_file = TracedFile(linker_path)
                linker_file.read(None)
                files[linker_file.path] = linker_file

    # Executed and opened files, merged into a single timeline
    cur = conn.cursor()
    events = cur.execute('''
        SELECT 'exec' AS event_type, name, NULL AS mode, timestamp
        FROM executed_files
        UNION ALL
        SELECT 'open' AS event_type, name, mode, timestamp
        FROM opened_files
        ORDER BY timestamp;
        ''')
    executed = set()
    run = 0
    for kind, raw_name, mode, event_ts in events:
        was_executed = kind == 'exec'
        # Executed files have no recorded mode; they count as read
        if was_executed:
            mode = FILE_READ
        target = Path(normalize_path(raw_name))

        # Move on to the next run(s) once their start time has passed
        while run_timestamps and event_ts > run_timestamps[0]:
            run_timestamps.pop(0)
            access_files.append(set())
            run += 1

        # Symbolic links on the way are recorded as read files
        link_mode = mode & FILE_LINK
        walk_from = target.parent if link_mode else target
        for link in find_all_links(walk_from, False):
            if link in files:
                continue
            link_file = TracedFile(link)
            link_file.read(run)
            files[link_file.path] = link_file

        # Unless the access is on the link itself, follow it to the end
        if not link_mode:
            target = target.resolve()
        if was_executed:
            executed.add(target)

        if target in files:
            traced = files[target]
        else:
            traced = TracedFile(target)
            files[traced.path] = traced

        if mode & FILE_READ:
            traced.read(run)
        if mode & FILE_WRITE:
            traced.write(run)
            # Writing a file also counts as reading its parent directory
            parent = target.parent
            if parent not in files:
                parent_file = TracedFile(parent)
                parent_file.read(run)
                files[parent_file.path] = parent_file

        # Candidate input/output file for the current run
        if target.is_file() and target not in executed:
            access_files[-1].add(traced)
    cur.close()

    # Input files: regular files, only read during that run, never executed
    # (FIXME: the executable-bit check is currently disabled; only executed
    # files are removed) and not in a magic or system directory
    inputs = []
    for run_index, accessed in enumerate(access_files):
        run_inputs = []
        for fi in accessed:
            if not fi.path.is_file():
                continue
            if fi.runs[run_index] != TracedFile.ONLY_READ:
                continue
            if fi.path in executed:
                continue
            if lies_under_any(fi.path, magic_dirs + system_dirs):
                continue
            run_inputs.append(fi.path)
        inputs.append(run_inputs)

    # Output files: regular files, written during that run, and not in a
    # magic or system directory
    outputs = []
    for run_index, accessed in enumerate(access_files):
        run_outputs = []
        for fi in accessed:
            if not fi.path.is_file():
                continue
            if fi.runs[run_index] != TracedFile.WRITTEN:
                continue
            if lies_under_any(fi.path, magic_dirs + system_dirs):
                continue
            run_outputs.append(fi.path)
        outputs.append(run_outputs)

    # Run the list of files through the filter plugins
    run_filter_plugins(files, inputs)

    # Anything the plugins dropped from files is dropped from inputs as well
    inputs = [[kept for kept in run_inputs if kept in files]
              for run_inputs in inputs]

    # Warn about files that were read and then written
    read_then_written_files = []
    for fi in files.values():
        if (fi.what == TracedFile.READ_THEN_WRITTEN and
                not lies_under_any(fi.path, magic_dirs)):
            read_then_written_files.append(fi)
    if read_then_written_files:
        logger.warning(
            "Some files were read and then written. We will only pack the "
            "final version of the file; reproducible experiments shouldn't "
            "change their input files")
        logger.info("Paths:\n%s",
                    ", ".join(str(fi.path) for fi in read_then_written_files))

    # Write-only files and files under magic_dirs are left out of the result
    selected = {fi for fi in files.values()
                if fi.what != TracedFile.WRITTEN and
                not lies_under_any(fi.path, magic_dirs)}
    return selected, inputs, outputs
Exemple #5
0
def get_files(conn):
    """Find all the files used by the experiment by reading the trace.

    The dynamic linkers found under ``/lib`` and ``/lib64`` are always
    included. The executed and opened files recorded in the trace are then
    replayed in timestamp order: the symbolic links leading to each file are
    marked as read, and the file itself is marked as written or read
    depending on its access mode.

    :param conn: database connection to the trace; the ``processes``,
        ``executed_files`` and ``opened_files`` tables are queried.
    :returns: a ``(files, inputs, outputs)`` tuple. ``files`` is the set of
        `TracedFile` objects that are not write-only and do not lie under
        one of `magic_dirs`. ``inputs`` and ``outputs`` are lists with one
        list of paths per run.
    """
    files = {}
    # One set of accessed files per run; the first run is open from the start
    access_files = [set()]

    # Finds run timestamps, so we can sort input/output files by run
    proc_cursor = conn.cursor()
    executions = proc_cursor.execute(
            '''
            SELECT timestamp
            FROM processes
            WHERE parent ISNULL
            ORDER BY id;
            ''')
    # The first root process needs no boundary: only later ones start a new
    # run
    run_timestamps = [r_timestamp for r_timestamp, in executions][1:]
    proc_cursor.close()

    # Adds dynamic linkers
    for libdir in (Path('/lib'), Path('/lib64')):
        if libdir.exists():
            for linker in libdir.listdir('*ld-linux*'):
                for filename in find_all_links(linker, True):
                    if filename not in files:
                        f = TracedFile(filename)
                        f.read()
                        files[f.path] = f

    # Adds executed files
    exec_cursor = conn.cursor()
    executed_files = exec_cursor.execute(
            '''
            SELECT name, timestamp
            FROM executed_files
            ORDER BY timestamp;
            ''')
    executed = set()
    # ... and opened files
    open_cursor = conn.cursor()
    opened_files = open_cursor.execute(
            '''
            SELECT name, mode, timestamp
            FROM opened_files
            ORDER BY timestamp;
            ''')
    # Loop on both lists at once; each query is already sorted by timestamp,
    # which is what heapq.merge() requires, and the timestamp comes first in
    # each tuple so that it drives the merge order
    rows = heapq.merge(((r[1], 'exec', r) for r in executed_files),
                       ((r[2], 'open', r) for r in opened_files))
    # The leading element is only the merge key; the row has the timestamp
    for _ts, event_type, data in rows:
        if event_type == 'exec':
            r_name, r_timestamp = data
            # Executed files have no recorded mode; they count as read
            r_mode = FILE_READ
        else:  # event_type == 'open'
            r_name, r_mode, r_timestamp = data
        r_name = Path(r_name)

        # Stays on the current run
        while run_timestamps and r_timestamp > run_timestamps[0]:
            del run_timestamps[0]
            access_files.append(set())

        # Adds symbolic links as read files
        for filename in find_all_links(r_name, False):
            if filename not in files:
                f = TracedFile(filename)
                f.read()
                files[f.path] = f
        # Adds final target
        r_name = r_name.resolve()
        # Executed files are recorded under their resolved path, because the
        # membership test below compares against the resolved path; recording
        # the name as traced would miss executables run through a symlink
        if event_type == 'exec':
            executed.add(r_name)
        if r_name not in files:
            f = TracedFile(r_name)
            files[f.path] = f
        else:
            f = files[r_name]
        if r_mode & FILE_WRITE:
            f.write()
        elif r_mode & FILE_READ:
            f.read()

        # Identifies input files
        if r_name.is_file() and r_name not in executed:
            access_files[-1].add(f)
    exec_cursor.close()
    open_cursor.close()

    # Further filters input files
    inputs = [[fi.path
               for fi in lst
               # Input files are regular files,
               if fi.path.is_file() and
               # ONLY_READ,
               fi.what == TracedFile.ONLY_READ and
               # not executable,
               # FIXME : currently disabled. Maybe only remove executed files?
               # not fi.path.stat().st_mode & 0b111 and
               # not in a system directory
               not any(fi.path.lies_under(m)
                       for m in magic_dirs + system_dirs)]
              for lst in access_files]

    # Identify output files
    outputs = [[fi.path
                for fi in lst
                # Output files are regular files,
                if fi.path.is_file() and
                # WRITTEN
                fi.what == TracedFile.WRITTEN and
                # not in a system directory
                not any(fi.path.lies_under(m)
                        for m in magic_dirs + system_dirs)]
               for lst in access_files]

    # Displays a warning for READ_THEN_WRITTEN files
    read_then_written_files = [
            fi
            for fi in itervalues(files)
            if fi.what == TracedFile.READ_THEN_WRITTEN and
            not any(fi.path.lies_under(m) for m in magic_dirs)]
    if read_then_written_files:
        logging.warning(
                "Some files were read and then written. We will only pack the "
                "final version of the file; reproducible experiments "
                "shouldn't change their input files:\n%s",
                ", ".join(unicode_(fi.path) for fi in read_then_written_files))

    # Write-only files and files under magic_dirs are left out of the result
    files = set(
            fi
            for fi in itervalues(files)
            if fi.what != TracedFile.WRITTEN and not any(fi.path.lies_under(m)
                                                         for m in magic_dirs))
    return files, inputs, outputs