Example #1
def test_empty_analysis(
    proto_db: unlabelled_graph_database.Database,
    graph_db: graph_tuple_database.Database,
    order_by: str,
    n: int,
):
    """Test that 'empty' graphs are produced when analysis returns no results."""
    FLAGS.n = n
    progress.Run(
        make_data_flow_analysis_dataset.DatasetGenerator(
            input_db=proto_db,
            analysis="test_empty",
            output_db=graph_db,
            order_by=order_by,
        ))
    with graph_db.Session() as session, proto_db.Session() as proto_session:
        output_graph_count = session.query(
            sql.func.count(graph_tuple_database.GraphTuple.id)).scalar()

        input_graph_count = proto_session.query(
            sql.func.count(
                unlabelled_graph_database.ProgramGraph.ir_id)).scalar()

        assert output_graph_count == input_graph_count

        # All graphs are empty.
        assert (session.query(
            sql.func.sum(
                graph_tuple_database.GraphTuple.node_count)).scalar() == 0)
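
The assertions rely on SQLAlchemy aggregates collapsing to a single value via .scalar(). A minimal, self-contained sketch of the same idiom against an in-memory SQLite database (the Row model is hypothetical, not part of the project; assumes SQLAlchemy 1.4+):

import sqlalchemy as sql
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()

class Row(Base):
    __tablename__ = "rows"
    id = sql.Column(sql.Integer, primary_key=True)
    node_count = sql.Column(sql.Integer, nullable=False)

engine = sql.create_engine("sqlite://")
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)

with Session() as session:
    session.add_all([Row(node_count=0), Row(node_count=0)])
    session.commit()
    # count() and sum() aggregates reduce to one scalar value each.
    assert session.query(sql.func.count(Row.id)).scalar() == 2
    assert session.query(sql.func.sum(Row.node_count)).scalar() == 0
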
Example #2
def test_timeout_analysis(
    proto_db_10: unlabelled_graph_database.Database,
    graph_db: graph_tuple_database.Database,
    order_by: str,
    n: int,
):
    """Test that timeout annotator produces one 'empty' graph for each input."""
    FLAGS.n = n
    FLAGS.annotator_timeout = 1
    progress.Run(
        make_data_flow_analysis_dataset.DatasetGenerator(
            input_db=proto_db_10,
            analysis="test_timeout",
            output_db=graph_db,
            order_by=order_by,
        ))
    with graph_db.Session() as session, proto_db_10.Session() as proto_session:
        assert (session.query(
            sql.func.count(graph_tuple_database.GraphTuple.id)).scalar(
            ) == proto_session.query(
                sql.func.count(
                    unlabelled_graph_database.ProgramGraph.ir_id)).scalar())

        # All graphs are empty.
        assert (session.query(
            sql.func.sum(
                graph_tuple_database.GraphTuple.node_count)).scalar() == 0)
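
The mechanism behind FLAGS.annotator_timeout is not shown in this example. Below is a self-contained sketch of one plausible approach, bounding a worker process with a result deadline; it illustrates the timeout-to-empty-graph fallback and is not the project's actual implementation:

import concurrent.futures
import time

def slow_annotate(graph):
    """Stand-in for an analysis that exceeds its time budget."""
    time.sleep(2)
    return graph

def annotate_with_timeout(graph, timeout_seconds=1):
    with concurrent.futures.ProcessPoolExecutor(max_workers=1) as executor:
        future = executor.submit(slow_annotate, graph)
        try:
            return future.result(timeout=timeout_seconds)
        except concurrent.futures.TimeoutError:
            return None  # The caller substitutes an "empty" graph.

if __name__ == "__main__":
    assert annotate_with_timeout({"nodes": []}) is None
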
Example #3
def test_flaky_analysis(
    proto_db: unlabelled_graph_database.Database,
    graph_db: graph_tuple_database.Database,
    order_by: str,
    n: int,
):
    """Test that flaky annotator produces "some" graphs."""
    FLAGS.n = n
    progress.Run(
        make_data_flow_analysis_dataset.DatasetGenerator(
            input_db=proto_db,
            analysis="test_flaky",
            output_db=graph_db,
            order_by=order_by,
        ))
    with graph_db.Session() as session, proto_db.Session() as proto_session:
        assert (
            session.query(sql.func.count(
                graph_tuple_database.GraphTuple.id)).scalar() >=
            proto_session.query(
                sql.func.count(
                    unlabelled_graph_database.ProgramGraph.ir_id)).scalar())

        # Not all graphs are empty.
        assert session.query(
            sql.func.sum(graph_tuple_database.GraphTuple.node_count)).scalar()
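
A self-contained sketch of the behavior under test: a nondeterministically failing annotator whose failures fall back to empty results, so that over many inputs some outputs are non-empty. Purely illustrative, not the project's annotator:

import random

def flaky_annotate(node_count, failure_rate=0.5):
    """Stand-in for an analysis that fails nondeterministically."""
    if random.random() < failure_rate:
        raise RuntimeError("flaky analysis failure")
    return node_count

outputs = []
for _ in range(100):
    try:
        outputs.append(flaky_annotate(10))
    except RuntimeError:
        outputs.append(0)  # Fall back to an "empty" graph.

# With overwhelming probability, at least one annotation succeeded,
# so the sum of node counts is non-zero.
assert sum(outputs)
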
Example #4
def two_graph_db_session(
  db: unlabelled_graph_database.Database,
) -> unlabelled_graph_database.Database.SessionType:
  a = unlabelled_graph_database.ProgramGraph.Create(
    proto=random_programl_generator.CreateRandomProto(), ir_id=1
  )
  b = unlabelled_graph_database.ProgramGraph.Create(
    proto=random_programl_generator.CreateRandomProto(), ir_id=2
  )

  with db.Session() as session:
    session.add_all([a, b])
    session.commit()

    # Sanity check that the graphs have been added to the database.
    assert (
      session.query(
        sql.func.count(unlabelled_graph_database.ProgramGraph.ir_id)
      ).scalar()
      == 2
    )
    assert (
      session.query(
        sql.func.count(unlabelled_graph_database.ProgramGraphData.ir_id)
      ).scalar()
      == 2
    )

    yield session
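
two_graph_db_session is a generator fixture: the test body runs at the yield, and everything after the yield is teardown. The generic pytest pattern in miniature (the project may wrap pytest.fixture in its own decorator):

import pytest

@pytest.fixture
def two_item_list():
    items = [1, 2]
    yield items    # The test body executes here.
    items.clear()  # Teardown runs after the test completes.

def test_two_item_list(two_item_list):
    assert len(two_item_list) == 2
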
Example #5
def AnnotateGraphMetas(
    ir_db: ir_database.Database,
    proto_db: unlabelled_graph_database.Database,
    df: pd.DataFrame,
    ctx: progress.ProgressContext = progress.NullContext,
) -> Iterable[graph_tuple_database.GraphTuple]:
    """Add features and labels to graph metas in database."""
    with ir_db.Session() as ir_session, proto_db.Session() as proto_session:
        for _, row in df.iterrows():
            relpath = row["relpath"]
            with ctx.Profile(
                    2,
                    f"Processed graph {row['relpath']}:{row['data:dataset_name']}"
            ):
                # Select the corresponding IR.
                ir_id = (ir_session.query(
                    ir_database.IntermediateRepresentation.id).filter(
                        ir_database.IntermediateRepresentation.source ==
                        "pact17_opencl_devmap",
                        ir_database.IntermediateRepresentation.relpath ==
                        relpath,
                    ).scalar())
                # Check that we have an exact 1:1 mapping from the opencl devmap dataset
                # to IR.
                if ir_id is None:
                    raise ValueError(f"Expected one IR with relpath {relpath}")

                # Load the program graph.
                proto_row = (proto_session.query(
                    unlabelled_graph_database.ProgramGraph).filter(
                        unlabelled_graph_database.ProgramGraph.ir_id ==
                        ir_id).options(
                            sql.orm.joinedload(unlabelled_graph_database.
                                               ProgramGraph.data)).scalar())
                if proto_row is None:
                    raise ValueError(
                        f"Expected one proto for relpath {relpath} with ID {ir_id}"
                    )
                proto: programl_pb2.ProgramGraph = proto_row.proto

                # Add the null "selector vector" value.
                for node in proto.node:
                    node.x.append(0)

                # Add the graph-level features.
                proto.x[:] = [row["wgsize"], row["transfer"]]
                # Add 'y' graph feature as target.
                proto.y[:] = row["y"].tolist()

                # Create the graph tuple. Note the jumping through hoops with converting
                # proto -> nx -> graph_tuple, because there is currently no direct
                # proto -> graph_tuple conversion.
                graph_tuple = graph_tuple_database.GraphTuple.CreateFromGraphTuple(
                    graph_tuple=graph_tuples.GraphTuple.CreateFromNetworkX(
                        programl.ProgramGraphToNetworkX(proto)),
                    ir_id=ir_id,
                )
            yield graph_tuple
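
Because AnnotateGraphMetas() yields graph tuples lazily, a caller can commit in batches rather than holding everything in memory. A hedged usage sketch; WriteAnnotatedGraphs and batch_size are illustrative names, while Session(commit=True) and add_all() follow the pattern used throughout these examples:

def WriteAnnotatedGraphs(ir_db, proto_db, df, output_db, batch_size=512):
    """Drain the generator, committing in fixed-size batches."""
    batch = []
    for graph_tuple in AnnotateGraphMetas(ir_db, proto_db, df):
        batch.append(graph_tuple)
        if len(batch) >= batch_size:
            with output_db.Session(commit=True) as session:
                session.add_all(batch)
            batch = []
    if batch:
        with output_db.Session(commit=True) as session:
            session.add_all(batch)
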
Example #6
def BatchedProtoReader(
    proto_db: unlabelled_graph_database.Database,
    ids_and_sizes_to_do: List[Tuple[int, int]],
    batch_size_in_bytes: int,
    order_by: str,
    ctx: progress.ProgressBarContext,
) -> Iterable[List[ProgramGraphProto]]:
    """Read from the given list of IDs in batches."""
    ids_and_sizes_to_do = sorted(ids_and_sizes_to_do, key=lambda x: x[0])
    i = 0
    while i < len(ids_and_sizes_to_do):
        end_i = i
        batch_size = 0
        while batch_size < batch_size_in_bytes:
            batch_size += ids_and_sizes_to_do[end_i][1]
            end_i += 1
            if end_i >= len(ids_and_sizes_to_do):
                # We have run out of graphs to read.
                break

        with proto_db.Session() as session:
            with ctx.Profile(
                    2,
                    f"[reader] Read {humanize.BinaryPrefix(batch_size, 'B')} "
                    f"batch of {end_i - i} unlabelled graphs",
            ):
                graphs = session.query(
                    unlabelled_graph_database.ProgramGraph).options(
                        sql.orm.joinedload(
                            unlabelled_graph_database.ProgramGraph.data))
                if order_by == "in_order":
                    # For in-order reading, we can do fast range checks on the IR id.
                    start_id = ids_and_sizes_to_do[i][0]
                    end_id = ids_and_sizes_to_do[end_i - 1][0]
                    graphs = graphs.filter(
                        unlabelled_graph_database.ProgramGraph.ir_id >=
                        start_id,
                        unlabelled_graph_database.ProgramGraph.ir_id <= end_id,
                    )
                elif order_by == "random":
                    # For random order, have to do set lookups on each ID in the batch.
                    batch_ids_and_sizes = ids_and_sizes_to_do[i:end_i]
                    batch_ids = [x[0] for x in batch_ids_and_sizes]
                    graphs = graphs.filter(
                        unlabelled_graph_database.ProgramGraph.ir_id.in_(
                            batch_ids), )
                else:
                    raise app.UsageError(f"Unknown order: {order_by}")

                graphs = graphs.all()
            yield [
                ProgramGraphProto(ir_id=graph.ir_id,
                                  serialized_proto=graph.data.serialized_proto)
                for graph in graphs
            ]

        i = end_i
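
The inner loop implements greedy batching by cumulative byte size. The same boundary computation, isolated as a pure function so it can be checked without a database (BatchBoundaries is an illustrative name, not a project API):

from typing import List, Tuple

def BatchBoundaries(ids_and_sizes: List[Tuple[int, int]],
                    batch_size_in_bytes: int) -> List[Tuple[int, int]]:
    """Return [start, end) index pairs of size-bounded batches."""
    boundaries = []
    i = 0
    while i < len(ids_and_sizes):
        end_i, batch_size = i, 0
        while batch_size < batch_size_in_bytes:
            batch_size += ids_and_sizes[end_i][1]
            end_i += 1
            if end_i >= len(ids_and_sizes):
                break
        boundaries.append((i, end_i))
        i = end_i
    return boundaries

# A batch may exceed the budget by at most one graph; the last may be smaller.
assert BatchBoundaries([(1, 10), (2, 10), (3, 10)], 15) == [(0, 2), (2, 3)]
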
Example #7
def test_fuzz_ProgramGraph_Create(db: unlabelled_graph_database.Database):
    """Fuzz ProgramGraph.Create() with randomly generated protos."""
    global ir_id
    ir_id += 1
    with db.Session(commit=True) as session:
        session.add(
            unlabelled_graph_database.ProgramGraph.Create(
                proto=random_programl_generator.CreateRandomProto(),
                ir_id=ir_id,
                split=random.randint(0, 10) if random.random() < 0.5 else None,
            ))
Example #8
def PopulateDatabaseWithTestSet(db: unlabelled_graph_database.Database,
                                graph_count: Optional[int] = None):
    """Populate a database with "real" programs."""
    if graph_count is None:
        # Without a count, enumerate the test set exactly once.
        inputs = random_programl_generator.EnumerateTestSet()
    else:
        # Cycle the test set so that graph_count may exceed its size.
        inputs = itertools.islice(
            itertools.cycle(
                random_programl_generator.EnumerateTestSet(n=graph_count)),
            graph_count,
        )

    with db.Session(commit=True) as session:
        session.add_all([
            unlabelled_graph_database.ProgramGraph.Create(proto, ir_id=i + 1)
            for i, proto in enumerate(inputs)
        ])
    return db
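
A hedged usage sketch, assuming the Database constructor accepts a database URL (as sqlutil-style wrappers commonly do):

db = PopulateDatabaseWithTestSet(
    unlabelled_graph_database.Database("sqlite://"), graph_count=100)
with db.Session() as session:
    assert session.query(
        sql.func.count(
            unlabelled_graph_database.ProgramGraph.ir_id)).scalar() == 100
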
Example #9
def test_pass_thru_analysis(
  proto_db: unlabelled_graph_database.Database,
  graph_db: graph_tuple_database.Database,
  order_by: str,
  n: int,
):
  """Test that pass-thru annotator produces n * protos graphs."""
  FLAGS.n = n
  progress.Run(
    make_data_flow_analysis_dataset.DatasetGenerator(
      input_db=proto_db,
      analysis="test_pass_thru",
      output_db=graph_db,
      order_by=order_by,
    )
  )
  with graph_db.Session() as session, proto_db.Session() as proto_session:
    # Check that n * proto_count graphs were generated.
    assert (
      session.query(sql.func.count(graph_tuple_database.GraphTuple.id)).scalar()
      == n
      * proto_session.query(
        sql.func.count(unlabelled_graph_database.ProgramGraph.ir_id)
      ).scalar()
    )

    # Check that every unique proto appears in the graph database.
    assert set(
      row.ir_id
      for row in session.query(graph_tuple_database.GraphTuple.ir_id).all()
    ) == set(
      row.ir_id
      for row in proto_session.query(
        unlabelled_graph_database.ProgramGraph.ir_id
      )
    )

    # Check the node counts of the generated graphs.
    assert (
      session.query(
        sql.func.sum(graph_tuple_database.GraphTuple.node_count)
      ).scalar()
      == n
      * proto_session.query(
        sql.func.sum(unlabelled_graph_database.ProgramGraph.node_count)
      ).scalar()
    )
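
The three assertions encode one multiplicative invariant. A pure-Python restatement with hypothetical dict stand-ins for protos:

n = 3
protos = [{"ir_id": 1, "node_count": 4}, {"ir_id": 2, "node_count": 7}]
outputs = [dict(p) for p in protos for _ in range(n)]

# Output count, the set of IR ids, and the node-count sum all scale by n.
assert len(outputs) == n * len(protos)
assert {o["ir_id"] for o in outputs} == {p["ir_id"] for p in protos}
assert (sum(o["node_count"] for o in outputs) ==
        n * sum(p["node_count"] for p in protos))
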
Example #10
def test_PopulateDatabaseWithRandomProgramGraphs(
    db: unlabelled_graph_database.Database,
    proto_count: int,
    node_x_dimensionality: int,
    node_y_dimensionality: int,
    graph_x_dimensionality: int,
    graph_y_dimensionality: int,
    split_count: int,
):
    """Test populating databases."""
    random_unlabelled_graph_database_generator.PopulateDatabaseWithRandomProgramGraphs(
        db=db,
        proto_count=proto_count,
        node_x_dimensionality=node_x_dimensionality,
        node_y_dimensionality=node_y_dimensionality,
        graph_x_dimensionality=graph_x_dimensionality,
        graph_y_dimensionality=graph_y_dimensionality,
        split_count=split_count,
    )
    with db.Session() as session:
        assert (session.query(
            sql.func.count(
                unlabelled_graph_database.ProgramGraph.ir_id)).scalar() ==
                proto_count)

        assert (session.query(
            sql.func.min(
                unlabelled_graph_database.ProgramGraph.node_x_dimensionality)).
                scalar() == node_x_dimensionality)

        assert (session.query(
            sql.func.min(
                unlabelled_graph_database.ProgramGraph.node_y_dimensionality)).
                scalar() == node_y_dimensionality)

        assert (session.query(
            sql.func.min(
                unlabelled_graph_database.ProgramGraph.graph_x_dimensionality)
        ).scalar() == graph_x_dimensionality)

        assert (session.query(
            sql.func.min(
                unlabelled_graph_database.ProgramGraph.graph_y_dimensionality)
        ).scalar() == graph_y_dimensionality)
Example #11
def PopulateDatabaseWithRandomProgramGraphs(
    db: unlabelled_graph_database.Database,
    proto_count: int,
    node_x_dimensionality: int = 1,
    node_y_dimensionality: int = 0,
    graph_x_dimensionality: int = 0,
    graph_y_dimensionality: int = 0,
    split_count: int = 0,
    random_proto_pool_size: int = 0,
) -> DatabaseAndRows:
    """Populate a database of random graph tuples."""
    random_proto_pool_size = random_proto_pool_size or min(
        FLAGS.random_proto_pool_size, 128)

    graph_pool = [
        CreateRandomProgramGraph(
            node_x_dimensionality=node_x_dimensionality,
            node_y_dimensionality=node_y_dimensionality,
            graph_x_dimensionality=graph_x_dimensionality,
            graph_y_dimensionality=graph_y_dimensionality,
            split_count=split_count,
        ) for _ in range(random_proto_pool_size)
    ]

    # Generate a full list of rows by randomly selecting from the graph pool.
    rows = [
        copy.deepcopy(random.choice(graph_pool)) for _ in range(proto_count)
    ]

    # Assign unique keys and checksums.
    for i, row in enumerate(rows, start=1):
        row.ir_id = i
        row.data.sha1 = str(i) * 40

    with db.Session(commit=True) as session:
        session.add_all([copy.deepcopy(t) for t in rows])

    return DatabaseAndRows(db, rows)
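
A usage sketch; it assumes DatabaseAndRows is a two-field named tuple of (database, rows), as the return statement suggests, and that Database accepts a URL:

db_and_rows = PopulateDatabaseWithRandomProgramGraphs(
    db=unlabelled_graph_database.Database("sqlite://"), proto_count=10)
assert len(db_and_rows.rows) == 10
# Keys are assigned sequentially from 1 by the population loop.
assert sorted(row.ir_id for row in db_and_rows.rows) == list(range(1, 11))
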
Example #12
    def __init__(
        self,
        input_db: unlabelled_graph_database.Database,
        analysis: str,
        output_db: graph_tuple_database.Database,
        order_by: str = "in_order",
        max_instances: int = 0,
    ):
        self.analysis = analysis
        self.output_db = output_db

        # Check that the requested analysis exists.
        if analysis not in annotate.ANALYSES:
            raise app.UsageError(
                f"Unknown analysis: {analysis}. "
                f"Available analyses: {annotate.AVAILABLE_ANALYSES}", )

        with input_db.Session() as in_session, output_db.Session(
        ) as out_session:
            # Get the graphs that have already been processed.
            already_done_max, already_done_count = out_session.query(
                sql.func.max(graph_tuple_database.GraphTuple.ir_id),
                sql.func.count(
                    sql.func.distinct(graph_tuple_database.GraphTuple.ir_id)),
            ).one()
            already_done_max = already_done_max or -1

            # Get the total number of graphs, including those that have already been
            # processed.
            total_graph_count = in_session.query(
                sql.func.count(
                    unlabelled_graph_database.ProgramGraph.ir_id)).scalar()

            # Get the total number of graphs to process, and the IDs of the graphs to
            # process.
            ids_and_sizes_to_do = in_session.query(
                unlabelled_graph_database.ProgramGraph.ir_id,
                unlabelled_graph_database.ProgramGraph.serialized_proto_size,
            )
            if order_by == "in_order":
                ids_and_sizes_to_do = ids_and_sizes_to_do.filter(
                    unlabelled_graph_database.ProgramGraph.ir_id >
                    already_done_max).order_by(
                        unlabelled_graph_database.ProgramGraph.ir_id)
            elif order_by == "random":
                # Filter out the graphs that have already been processed.
                if already_done_count:
                    already_done_ids = {
                        row.ir_id
                        for row in out_session.query(
                            graph_tuple_database.GraphTuple.ir_id)
                    }
                    assert len(already_done_ids) == already_done_count
                    ids_and_sizes_to_do = ids_and_sizes_to_do.filter(
                        ~unlabelled_graph_database.ProgramGraph.ir_id.in_(
                            already_done_ids))
                # Order the graphs to do randomly.
                ids_and_sizes_to_do = ids_and_sizes_to_do.order_by(
                    input_db.Random())
            else:
                raise app.UsageError(f"Unknown order: {order_by}")

            # Optionally limit the number of IDs to process.
            if max_instances:
                ids_and_sizes_to_do = ids_and_sizes_to_do.limit(max_instances)
            ids_and_sizes_to_do = [(row.ir_id, row.serialized_proto_size)
                                   for row in ids_and_sizes_to_do]

        # Sanity check.
        if not max_instances:
            if (len(ids_and_sizes_to_do) + already_done_count !=
                    total_graph_count):
                raise OSError(
                    f"ids_to_do({len(ids_and_sizes_to_do)}) + "
                    f"already_done({already_done_count}) != "
                    f"total_rows({total_graph_count})")

        with output_db.Session(commit=True) as out_session:
            out_session.add(
                unlabelled_graph_database.Meta.Create(
                    key="Graph counts",
                    value=(already_done_count, total_graph_count)))
        app.Log(
            1,
            "Selected %s of %s to process",
            humanize.Commas(len(ids_and_sizes_to_do)),
            humanize.Plural(total_graph_count, "unlabelled graph"),
        )

        super(DatasetGenerator, self).__init__(name=analysis,
                                               i=already_done_count,
                                               n=total_graph_count,
                                               unit="protos")

        self.graph_reader = ppar.ThreadedIterator(
            BatchedProtoReader(
                input_db,
                ids_and_sizes_to_do,
                FLAGS.proto_batch_mb * 1024 * 1024,
                order_by,
                self.ctx.ToProgressContext(),
            ),
            max_queue_size=FLAGS.max_reader_queue_size,
        )
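
The in_order/random branching above is the crux of resumability. A pure-Python restatement of that selection logic, independent of the database (SelectIdsToDo is an illustrative name, not a project function):

import random

def SelectIdsToDo(all_ids, done_ids, order_by):
    """Return the IDs still to be processed, in the requested order."""
    if order_by == "in_order":
        # In-order mode resumes from the highest already-processed ID,
        # which maps to a cheap range filter in the database query.
        resume_from = max(done_ids, default=-1)
        return sorted(i for i in all_ids if i > resume_from)
    elif order_by == "random":
        # Random mode must exclude every processed ID individually.
        done = set(done_ids)
        todo = [i for i in all_ids if i not in done]
        random.shuffle(todo)
        return todo
    raise ValueError(f"Unknown order: {order_by}")

assert SelectIdsToDo([1, 2, 3, 4], [1, 2], "in_order") == [3, 4]
assert set(SelectIdsToDo([1, 2, 3, 4], [1, 3], "random")) == {2, 4}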