Example #1
0
def proto_db(
  request, ir_db: ir_database.Database
) -> unlabelled_graph_database.Database:
  """A test fixture which yields a graph database with random graph tuples."""
  # Collect the IDs of every IR in the source database.
  with ir_db.Session() as session:
    ir_ids = [
      row.id for row in session.query(ir_database.IntermediateRepresentation.id)
    ]

  with testing_databases.DatabaseContext(
    unlabelled_graph_database.Database, request.param
  ) as db:
    # Build one random program graph per IR, then insert them in a single
    # committed session.
    rows = []
    for ir_id in ir_ids:
      proto = random_programl_generator.CreateRandomProto(
        graph_y_dimensionality=2
      )
      rows.append(
        unlabelled_graph_database.ProgramGraph.Create(proto=proto, ir_id=ir_id)
      )
    with db.Session(commit=True) as session:
      session.add_all(rows)
    yield db
def AnnotateGraphMetas(
    ir_db: ir_database.Database,
    proto_db: unlabelled_graph_database.Database,
    df: pd.DataFrame,
    ctx: progress.ProgressContext = progress.NullContext,
) -> Iterable[graph_tuple_database.GraphTuple]:
    """Add features and labels to graph metas in database.

    For each row of the dataframe, look up the matching IR by relpath, load its
    program graph proto, write the row's features/labels into the proto, and
    yield the resulting graph tuple.

    Args:
        ir_db: Database of intermediate representations, queried by relpath.
        proto_db: Database of unlabelled program graphs, queried by IR id.
        df: A dataframe with 'relpath', 'data:dataset_name', 'wgsize',
            'transfer', and 'y' columns — presumably the OpenCL device-mapping
            dataset; verify against caller.
        ctx: A progress context used to log per-row timing.

    Returns:
        An iterator of annotated GraphTuple instances, one per dataframe row.

    Raises:
        ValueError: If a row has no matching IR, or an IR has no program graph.
    """
    with ir_db.Session() as ir_session, proto_db.Session() as proto_session:
        for _, row in df.iterrows():
            relpath = row["relpath"]
            with ctx.Profile(
                    2,
                    f"Processed graph {row['relpath']}:{row['data:dataset_name']}"
            ):
                # Select the corresponding IR.
                ir_id = (ir_session.query(
                    ir_database.IntermediateRepresentation.id).filter(
                        ir_database.IntermediateRepresentation.source ==
                        "pact17_opencl_devmap",
                        ir_database.IntermediateRepresentation.relpath ==
                        relpath,
                    ).scalar())
                # Check that we have an exact 1:1 mapping from the opencl devmap dataset
                # to IR.
                if ir_id is None:
                    raise ValueError(f"Expected one IR with relpath {relpath}")

                # Load the program graph. joinedload() eagerly fetches the
                # proto data in the same query rather than on attribute access.
                proto_row = (proto_session.query(
                    unlabelled_graph_database.ProgramGraph).filter(
                        unlabelled_graph_database.ProgramGraph.ir_id ==
                        ir_id).options(
                            sql.orm.joinedload(unlabelled_graph_database.
                                               ProgramGraph.data)).scalar())
                if proto_row is None:
                    raise ValueError(
                        f"Expected one proto for relpath {relpath} with ID {ir_id}"
                    )
                proto: programl_pb2.ProgramGraph = proto_row.proto

                # Add the null "selector vector" value.
                for node in proto.node:
                    node.x.append(0)

                # Add the graph-level features.
                proto.x[:] = [row["wgsize"], row["transfer"]]
                # Add 'y' graph feature as target.
                proto.y[:] = row["y"].tolist()

                # Create the graph tuple. Note the jumping through hoops with converting
                # proto -> nx -> graph_tuple, because there is currently no direct
                # proto -> graph_tuple conversion.
                graph_tuple = graph_tuple_database.GraphTuple.CreateFromGraphTuple(
                    graph_tuple=graph_tuples.GraphTuple.CreateFromNetworkX(
                        programl.ProgramGraphToNetworkX(proto)),
                    ir_id=ir_id,
                )
            # Yield outside the profiling context so consumer time is not
            # counted against this row's processing time.
            yield graph_tuple
Example #3
0
  def Split(self, db: ir_database.Database) -> List[np.array]:
    """Split the database.

    Extends the parent class's POJ-104 split with all remaining (non-POJ-104)
    successfully-compiled IRs, distributed across train/val/test according to
    self.ratios.

    Args:
      db: The IR database to split.

    Returns:
      A list of three arrays of IR ids: [train, val, test].

    Raises:
      ValueError: If there are no non-POJ-104 IRs to split.
    """
    # Start from the parent's POJ-104 train/val/test ID lists.
    poj104 = super(TrainValTestSplitter, self).Split(db)

    # Get the IDs of non-POJ-104 IRs.
    with db.Session() as session:
      # NOTE: '== True' is the SQLAlchemy idiom for a column comparison; do
      # not replace with 'is True'.
      total_count = (
        session.query(sql.func.count(ir_database.IntermediateRepresentation.id))
        .filter(
          ir_database.IntermediateRepresentation.compilation_succeeded == True,
          ~ir_database.IntermediateRepresentation.source.like("poj-104:%"),
        )
        .scalar()
      )

      # Scale the train/val/test ratio to the total IR count.
      train_val_test_counts = np.floor(self.ratios * total_count).astype(
        np.int32
      )
      # Round up if there were missing values. Flooring can leave up to two
      # IRs unassigned; give each to a randomly chosen split.
      while train_val_test_counts.sum() < total_count:
        train_val_test_counts[random.randint(0, 2)] += 1

      assert total_count == train_val_test_counts.sum()
      app.Log(
        1,
        "Splitting %s IRs into splits: %s train, %s val, %s test",
        humanize.Commas(total_count + sum(len(s) for s in poj104)),
        humanize.Commas(train_val_test_counts[0] + len(poj104[0])),
        humanize.Commas(train_val_test_counts[1] + len(poj104[1])),
        humanize.Commas(train_val_test_counts[2] + len(poj104[2])),
      )

      # Fetch the IDs in random order so that slicing below produces a random
      # assignment of IRs to splits.
      ir_ids = [
        row.id
        for row in session.query(ir_database.IntermediateRepresentation.id)
        .filter(
          ir_database.IntermediateRepresentation.compilation_succeeded == True,
          ~ir_database.IntermediateRepresentation.source.like("poj-104:%"),
        )
        .order_by(db.Random())
      ]

    if not ir_ids:
      raise ValueError("No results")

    # Concatenate each POJ-104 split with its slice of the shuffled IDs.
    return [
      np.concatenate((poj104[0], ir_ids[: train_val_test_counts[0]])),
      np.concatenate(
        (
          poj104[1],
          ir_ids[train_val_test_counts[0] : sum(train_val_test_counts[:2])],
        )
      ),
      np.concatenate((poj104[2], ir_ids[sum(train_val_test_counts[:2]) :])),
    ]
Example #4
0
def db_with_empty_ir(request,
                     db: ir_database.Database) -> ir_database.Database:
    """Fixture returning a database populated with empty IR rows.

    The number of rows is taken from the fixture's parameter.
    """
    empty_ir_count = request.param
    # Build the rows up front, then add them all in one committed session.
    rows = [
        ir_database.IntermediateRepresentation.CreateEmpty(
            source="foo",
            relpath=str(i),
            source_language=ir_database.SourceLanguage.C,
            type=ir_database.IrType.LLVM_6_0,
            cflags="",
        )
        for i in range(empty_ir_count)
    ]
    with db.Session(commit=True) as session:
        session.add_all(rows)
    return db
Example #5
0
  def Split(self, db: ir_database.Database) -> List[np.array]:
    """Get the bytecode IDs for the POJ-104 app classification experiment."""

    def _IdsForSource(session, source) -> np.array:
      """Return the IDs of successfully-compiled IRs from the given source."""
      # NOTE: '== True' is the SQLAlchemy column-comparison idiom.
      query = session.query(ir_database.IntermediateRepresentation.id).filter(
        ir_database.IntermediateRepresentation.compilation_succeeded == True,
        ir_database.IntermediateRepresentation.source == source,
      )
      ids = np.array([row.id for row in query], dtype=np.int32)
      if not ids.size:
        raise ValueError("No results")
      return ids

    # One ID array per split, in train/val/test order.
    with db.Session() as session:
      return [
        _IdsForSource(session, source)
        for source in (
          "poj-104:train",
          "poj-104:val",
          "poj-104:test",
        )
      ]
Example #6
0
    def PopulateBytecodeTable(self,
                              db: ir_database.Database,
                              commit_every: int = 1000):
        """Populate the bytecode table from this instance's programs dataframe.

        Each row of self.programs_df is processed in a worker-process pool and
        the resulting protos are inserted as LlvmBytecode rows.

        Args:
            db: The database to insert bytecode rows into.
            commit_every: Commit the session after this many insertions, so a
                failure partway through does not lose all completed work.
        """
        programs_df = self.programs_df.reset_index()
        bar = progressbar.ProgressBar()
        bar.max_value = len(programs_df)

        # Process each row of the table in parallel. Use the pool as a context
        # manager so worker processes are terminated and reaped even if an
        # error occurs mid-iteration (the original leaked the pool).
        with multiprocessing.Pool() as pool:
            with db.Session(commit=True) as s:
                for i, proto in enumerate(
                        pool.imap_unordered(ProcessOpenClProgramDfBytecode,
                                            [d
                                             for _, d in programs_df.iterrows()])):
                    bar.update(i)
                    s.add(
                        bytecode_database.LlvmBytecode(
                            **bytecode_database.LlvmBytecode.FromProto(proto)))
                    # Periodic commit (also fires at i == 0, preserving the
                    # original behavior).
                    if not (i % commit_every):
                        s.commit()
Example #7
0
def populated_ir_db(ir_db: ir_database.Database,
                    opencl_relpaths: Set[str]) -> ir_database.Database:
    """A test fixture which yields an IR database with 256 OpenCL entries."""
    # Create one random-text IR row per OpenCL relpath, assigning sequential
    # IDs starting at 1.
    rows = []
    next_id = 1
    for relpath in opencl_relpaths:
        ir = ir_database.IntermediateRepresentation.CreateFromText(
            source="pact17_opencl_devmap",
            relpath=relpath,
            source_language=ir_database.SourceLanguage.OPENCL,
            type=ir_database.IrType.LLVM_6_0,
            cflags="",
            text=CreateRandomString(),
        )
        ir.id = next_id
        next_id += 1
        rows.append(ir)

    with ir_db.Session(commit=True) as session:
        session.add_all(rows)

    return ir_db
Example #8
0
  def Split(self, db: ir_database.Database) -> List[np.array]:
    """Split the database."""
    # Fetch the IDs of every successfully-compiled IR in the OpenCL
    # device-mapping dataset. ('== True' is the SQLAlchemy column idiom.)
    with db.Session() as session:
      query = session.query(ir_database.IntermediateRepresentation.id).filter(
        ir_database.IntermediateRepresentation.compilation_succeeded == True,
        ir_database.IntermediateRepresentation.source
        == "pact17_opencl_devmap",
      )
      all_ids = np.array([row.id for row in query], dtype=np.int32)

    if not all_ids.size:
      raise ValueError("No results")
    # Partition the IDs into k folds and return only the test portion of
    # each fold.
    folds = sklearn.model_selection.KFold(self.k).split(all_ids)
    return [all_ids[test_indices] for (_, test_indices) in folds]