Ejemplo n.º 1
0
def test_difference():
    """Manually diff one hard-coded Python statement against one R statement.

    Loads both statements from the Mongo store, wraps each in a ``Statement``,
    compares the returns stored under the first output key of each statement
    with ``pooled_compare_returns``, prints the first difference after a
    ``to_dict``/``from_dict`` round trip together with the elapsed times, and
    finally terminates the interpreter via ``exit()``.
    """
    py_id = "5cafffc1e850c97e6a1f9a70"
    r_id = "5cb567d2e850c93c2bec1361"
    store = mongo_driver.MongoStore(props.DATASET)

    def load_statement(mongo_id):
        # Fetch the raw record and wrap it in a Statement.
        raw = store.load_stmt(mongo_id=mongo_id)
        return Statement(mongo_id=raw["_id"],
                         snippet=raw["snippet"],
                         variables=raw["variables"],
                         language=raw["language"],
                         outputs=format_outputs(raw["outputs"]))

    start = time.time()
    py_stmt = load_statement(py_id)
    py_time = time.time()
    print("Processing Py = %0.2f" % (py_time - start))
    r_stmt = load_statement(r_id)
    r_time = time.time()
    # This timing belongs to the R statement (the label used to say "Py").
    print("Processing R = %0.2f" % (r_time - py_time))
    # next(iter(...)) picks the first key whether keys() is a list or a view.
    r_key = next(iter(r_stmt.outputs.keys()))
    py_key = next(iter(py_stmt.outputs.keys()))
    diffs = pooled_compare_returns(r_stmt.outputs[r_key].returns,
                                   py_stmt.outputs[py_key].returns)
    print(DiffMeta.from_dict(diffs[0].to_dict()))
    end = time.time()
    print("Time Taken = %0.2f" % (end - r_time))
    exit()
Ejemplo n.º 2
0
def fetch_statements(language,
                     force=False,
                     do_save=False,
                     limit=None,
                     as_list=False):
    """Return the executed statements of ``language`` keyed by mongo id.

    When ``force`` is false and the pickle for the language exists, the
    pickled mapping is returned as-is. Otherwise the valid statements with
    outputs are loaded from Mongo (at most ``limit`` when given), wrapped in
    ``Statement`` objects and, if ``do_save`` is set, pickled.

    With ``as_list`` set, the mapping's ``values()`` are returned instead of
    the mapping itself.
    """
    pkl_file = get_executed_stmts_pkl(language)
    use_cache = (not force) and cache.file_exists(pkl_file)
    if use_cache:
        LOGGER.info("Retrieving existing '%s' statements!" % language)
        cached = cache.load_pickle(pkl_file)
        return cached.values() if as_list else cached

    LOGGER.info("Reprocessing '%s' statements!" % language)
    store = mongo_driver.MongoStore(props.DATASET)
    fetched = store.load_stmts(language=language,
                               is_valid=True,
                               has_output=True,
                               limit=limit).items()
    total = len(fetched)
    stmts = {}
    for position, (_, record) in enumerate(fetched, 1):
        LOGGER.info("Processing %d / %d .... " % (position, total))
        wrapped = Statement(mongo_id=record["_id"],
                            snippet=record["snippet"],
                            variables=record["variables"],
                            language=language,
                            outputs=format_outputs(record["outputs"]))
        stmts[wrapped.mongo_id] = wrapped

    if do_save:
        LOGGER.info("Saving statements .... ")
        cache.save_pickle(pkl_file, stmts)
    return stmts.values() if as_list else stmts
Ejemplo n.º 3
0
def execute_stmts(stmts, limit=props.PY_STMT_LIMIT, do_log=False):
    """Execute the not-yet-executed statements and store their outputs.

    A statement is skipped when its snippet is not among the statement counts
    returned by ``crawler.get_stmt_counts(limit)``, when it already has an
    ``'outputs'`` entry, or when its snippet contains ``".hist("``. Every
    other statement is run through ``execute_stmt`` and the (asserted
    non-empty) outputs are written back to the store.

    ``do_log`` only controls logging of the "not in top statements" skips.
    """
    store = mongo_driver.MongoStore(props.DATASET)
    n_stmts = len(stmts)
    top_py_stmts = crawler.get_stmt_counts(limit)
    valids = 0
    generated_args = arg_generator.generate_args()
    for position, (_, stmt) in enumerate(stmts.items(), 1):
        progress = (position, n_stmts)
        snippet = stmt["snippet"]
        if snippet not in top_py_stmts:
            if do_log:
                LOGGER.info("Not processing %d / %d. Moving on !" % progress)
        elif 'outputs' in stmt:
            LOGGER.info("Processed %d / %d. Moving on !" % progress)
        elif ".hist(" in snippet:
            # Plotting calls are not executed.
            LOGGER.info(
                "Snippet %d / %d contains a plot operation. Moving on !" %
                progress)
        else:
            LOGGER.info("Processing %d / %d .... " % progress)
            outputs = execute_stmt(stmt, generated_args)
            assert len(outputs) > 0
            store.update_stmt_outputs(stmt['_id'], outputs)
            valids += 1
    LOGGER.info("Valid outputs = %d / %d" % (valids, n_stmts))
Ejemplo n.º 4
0
def create_normalized_stmt_file_map():
  """Record, for every normalized valid snippet, the files it occurs in.

  Walks all file-statement documents, keeps only snippets that are present
  (together with the file's language) in the valid statements with outputs,
  normalizes them, and stores one map entry per normalized snippet holding
  the language of its first occurrence and the list of file names.
  """
  store = mongo_driver.MongoStore(props.DATASET)
  LOGGER.info("Fetching Stmts .... ")
  valid_stmts = store.load_stmts(is_valid=True, has_output=True)
  LOGGER.info("Fetching File Stmts .... ")
  file_stmts = store.load_file_stmts()
  back_pointers = {}
  n_file_stmts = file_stmts.count()
  for position, file_stmt in enumerate(file_stmts, 1):
    LOGGER.info("Processing file %d / %d ... " % (position, n_file_stmts))
    # Keep what follows PROJECTS_SRC and drop its first two "/" segments.
    below_src = file_stmt['file_name'].split(props.PROJECTS_SRC)[-1]
    file_name = below_src.split("/", 2)[-1]
    language = file_stmt['language']
    for snippet in file_stmt['snippets']:
      lookup = (snippet, language)
      if lookup in valid_stmts:
        normalized_snippet = syntactic.normalize(valid_stmts[lookup])
        pointer = back_pointers.get(normalized_snippet)
        if pointer is None:
          pointer = {"language": language, "file_names": set()}
          back_pointers[normalized_snippet] = pointer
        pointer["file_names"].add(file_name)
  LOGGER.info("Saving stmt to file pointers ... ")
  for snippet, pointer in back_pointers.items():
    # The set is turned into a list before the entry is handed to the store.
    pointer["file_names"] = list(pointer["file_names"])
    store.create_stmt_file_map(snippet, pointer)
Ejemplo n.º 5
0
def get_single_output_stmts(language, limit=None, log_interval=100):
    """Store, per statement, which of its outputs are flagged ``is_all_same``.

    Loads the normalized valid statements with outputs for ``language``,
    builds a ``{return key: is_all_same}`` mapping for each one, writes it to
    the statement's ``"meta"`` field and logs how many outputs were flagged.
    Progress is logged every ``log_interval`` statements.
    """
    store = mongo_driver.MongoStore(props.DATASET)
    mongo_stmts = store.load_stmts(language=language,
                                   is_valid=True,
                                   has_output=True,
                                   limit=limit,
                                   use_normalized=True).items()
    LOGGER.info("Fetched statements from mongo ... ")
    n_stmts = len(mongo_stmts)
    singular_total = 0
    for position, (_, record) in enumerate(mongo_stmts, 1):
        if position % log_interval == 0:
            LOGGER.info("Processing %d / %d .... " % (position, n_stmts))
        formatted = compare.format_outputs(record["outputs"])
        stmt = compare.Statement(mongo_id=record["_id"],
                                 snippet=record["snippet"],
                                 variables=record["variables"],
                                 language=language,
                                 outputs=formatted)
        flags = {}
        for ret_key in stmt.outputs.keys():
            same = stmt.outputs[ret_key].is_all_same
            flags[ret_key] = same
            if same:
                singular_total += 1
        store.update_stmt(stmt.mongo_id, {"meta": flags})
    LOGGER.info("Completed. Singular Output Statements = %d / %d" %
                (singular_total, n_stmts))
Ejemplo n.º 6
0
def update_semantic_scores(start=0, end=None, log_interval=100, limit=0):
    """Summarize the stored diffs of each difference record into scores.

    Only records that do not yet have an ``n_mismatched`` field are loaded.
    Records whose position is below ``start`` or, when ``end`` is truthy, at
    or beyond ``end`` are skipped. For the rest, every entry of the record's
    ``"diff"`` list is folded into a ``SemanticSummary`` whose summary is
    written back to the record.
    """
    store = mongo_driver.MongoStore(props.DATASET)
    pending_filter = {"n_mismatched": {"$exists": False}}
    diff_records = store.load_differences(additional_queries=pending_filter,
                                          limit=limit)
    n_records = diff_records.count()
    LOGGER.info("Retrieved %d records ..." % n_records)
    semantic_columns = [
        "semantic_score", "n_mismatched", "row_diff", "col_diff", "size_diff",
        "n_both_empty", "n_val1_empty", "n_val2_empty"
    ]
    store.create_semantic_indices(semantic_columns)
    for i, diff_record in enumerate(diff_records):
        in_window = i >= start and not (end and i >= end)
        if not in_window:
            continue
        if (i + 1) % log_interval == 0:
            LOGGER.info("Processing semantic difference for %d / %d" %
                        (i + 1, n_records))
        summary = SemanticSummary()
        for raw_diff in diff_record["diff"]:
            summary.update_summary(differences.DiffMeta.from_dict(raw_diff))
        store.update_difference({"_id": diff_record["_id"]},
                                summary.summarize())
Ejemplo n.º 7
0
def extract_variable_names():
    """Detect variables of not-yet-stored Python snippets and store them.

    Counts every snippet from the Python file-statement documents that is not
    already among the stored statements, runs ``VariableDetector`` on each
    distinct snippet and stores it. A snippet is stored with its variables
    only when exactly one variable was detected; otherwise (including when
    detection raises) it is stored with ``None``.
    """
    store = mongo_driver.MongoStore(props.DATASET)
    docs = store.load_file_stmts(props.TYPE_PYTHON)
    stored_stmts = store.load_stmts(props.TYPE_PYTHON, is_valid=False)
    stmt_counter = Counter()
    for doc in docs:
        for stmt in doc['snippets']:
            if not stored_stmts or (stmt,
                                    props.TYPE_PYTHON) not in stored_stmts:
                stmt_counter[stmt] += 1
    n_stmts = len(stmt_counter)
    valid_stmts = 0
    for i, stmt in enumerate(stmt_counter):
        if i % 1000 == 0:
            LOGGER.info("Processing %d / %d .... " % (i + 1, n_stmts))
        variables = None
        try:
            detected = list(VariableDetector(stmt).parse())
        except Exception:
            # Best effort: a snippet the detector cannot handle is still
            # stored, just without variables.
            if DEBUG:
                LOGGER.info("EXCEPTION::::  %s\n" % stmt)
        else:
            if len(detected) == 1:
                variables = detected
                valid_stmts += 1
            elif DEBUG:
                # Log what was detected *before* it is discarded; the old
                # code logged list(None) here, which raised TypeError.
                LOGGER.info(detected)
                LOGGER.info("STMT:::: %s\n" % stmt)
        store.store_stmt(stmt, props.TYPE_PYTHON, variables)
    LOGGER.info("VALID: %d / %d" % (valid_stmts, n_stmts))
Ejemplo n.º 8
0
def get_stmt_counts(limit=None):
  """Return snippet occurrence counts, most frequent first.

  Counts every snippet across the Python file-statement documents and
  returns an ``OrderedDict`` of the ``limit`` most common snippets (all of
  them when ``limit`` is ``None``) mapped to their counts.
  """
  store = mongo_driver.MongoStore(props.DATASET)
  docs = store.load_file_stmts(props.TYPE_PYTHON)
  occurrences = Counter()
  for doc in docs:
    for snippet in doc['snippets']:
      occurrences[snippet] += 1
  return OrderedDict(occurrences.most_common(limit))
Ejemplo n.º 9
0
def execute_stmts():
  """Execute every stored R statement that has no ``'outputs'`` entry yet.

  Statements that already carry outputs are only logged; the others are run
  through ``execute_stmt`` with the generated arguments and their outputs
  are written back to the store.
  """
  store = mongo_driver.MongoStore(props.DATASET)
  generated_args = arg_generator.generate_args()
  stmts = store.load_stmts(language='R')
  total = len(stmts)
  for position, (_, stmt) in enumerate(stmts.items(), 1):
    if 'outputs' not in stmt:
      LOGGER.info("StmtID : '%s'. Processing %d / %d .... " % (stmt['_id'], position, total))
      outputs = execute_stmt(stmt, generated_args)
      store.update_stmt_outputs(stmt['_id'], outputs)
    else:
      LOGGER.info("Processing %d / %d. Moving on ... " % (position, total))
Ejemplo n.º 10
0
def runner(force=False):
  """Parse the converted Python files and store their statements.

  With ``force`` set, the stored Python file statements are deleted first.
  A file for which the store already returns statements is skipped; every
  other file is parsed with ``PARSER`` and its lines are stored.
  """
  store = mongo_driver.MongoStore(props.DATASET)
  if force:
    store.delete_file_stmts(props.TYPE_PYTHON)
  py_files = get_converted_files()
  for position, py_file in enumerate(py_files, 1):
    file_name = cache.get_file_name(py_file)
    already_stored = store.load_stmts_for_file_name(py_file)
    if already_stored:
      LOGGER.info("Processed %s. Moving on ... " % file_name)
    else:
      LOGGER.info("Processing %d / %d ... " % (position, len(py_files)))
      LOGGER.info("Processing %s ... " % file_name)
      parsed_lines = PARSER.parse_file(py_file)
      store.store_file_stmts(py_file, parsed_lines, props.TYPE_PYTHON)
Ejemplo n.º 11
0
def test_execute_stmts():
    """Run ``execute_stmts`` on ten hard-coded statements from the store."""
    store = mongo_driver.MongoStore(props.DATASET)
    id_prefix = "5cafff91e850c97e6a1f5b"
    id_suffixes = ("9c", "9e", "9f", "a0", "a5", "b0", "b7", "c6", "c8", "f0")
    loaded = [store.load_stmt(id_prefix + suffix) for suffix in id_suffixes]
    stmts = {}
    for stmt in loaded:
        stmts[stmt["_id"]] = stmt
    execute_stmts(stmts)
Ejemplo n.º 12
0
def syntactic_differences(start=0):
    """Compute pairwise syntactic distances between normalized R statements.

    For every statement at index ``i >= start`` (except the last one) that has
    a normalized form, builds one record per later statement ``j`` that also
    has a normalized form. Each record holds both mongo ids, the language and
    the Levenshtein, Jaro, Jaro-Winkler, AST and n-gram distances. The records
    of one ``i`` are saved together. Parsed trees are cached by index.
    """
    store = mongo_driver.MongoStore(props.DATASET)
    stmts = syntactic.get_normalized_R_statements(debug=True)
    n_stmts = len(stmts)
    trees = {}

    def tree_for(index, normalized):
        # Parse on first use (or when the cached entry is falsy) and cache.
        tree = trees.get(index, None)
        if not tree:
            tree = ast_distances.r_parse(normalized)
            trees[index] = tree
        return tree

    for i in xrange(n_stmts - 1):
        if i < start:
            continue
        LOGGER.info("Processing %d / %d R snippet ... " % (i + 1, n_stmts))
        left = stmts[i]
        if not left.normalized:
            continue
        left_tree = tree_for(i, left.normalized)
        records = []
        for j in xrange(i + 1, n_stmts):
            right = stmts[j]
            if not right.normalized:
                continue
            right_tree = tree_for(j, right.normalized)
            record = {
                "id_1": left.mongo_id,
                "id_2": right.mongo_id,
                "language": props.TYPE_R,
            }
            record["d_levenshtein"] = syntactic.levenshtein(
                left.normalized, right.normalized)
            record["d_jaro"] = syntactic.jaro(left.normalized,
                                              right.normalized)
            record["d_jaro_winkler"] = syntactic.jaro_winkler(
                left.normalized, right.normalized)
            record["d_ast"] = ast_distances.edit_distance(
                left_tree, right_tree)
            try:
                n_gram = syntactic.n_gram_distance(left.normalized,
                                                   right.normalized)[0]
            except Exception:
                # Any failure of the n-gram distance is recorded as 1.0.
                n_gram = 1.0
            record["d_n_gram"] = n_gram
            records.append(record)
        store.save_self_syntactic_differences(records, do_log=True)
Ejemplo n.º 13
0
def delete_single_output_stmts(limit=0, log_interval=100):
    """Delete differences that involve a return flagged in a statement's meta.

    First collects, per statement id, the de-escaped ``"meta"`` keys whose
    value is truthy. Then scans the stored differences and deletes each one
    whose R return is flagged for its R statement or whose Python return is
    flagged for its Python statement.
    """
    store = mongo_driver.MongoStore(props.DATASET)
    stmts = store.load_stmts(language=None,
                             is_valid=True,
                             has_output=True,
                             limit=limit).values()
    valids = {}
    for i, stmt in enumerate(stmts):
        if i % log_interval == 0:
            LOGGER.info("Processing statement %d/%d ... " % (i, len(stmts)))
        meta = stmt.get("meta", {})
        flagged = set(
            mongo_driver.mongo_de_escape(key)
            for key, validity in meta.items() if validity)
        if flagged:
            valids[stmt["_id"]] = flagged
    LOGGER.info("Non singular output stmts: %d/%d" % (len(valids), len(stmts)))

    wanted_fields = dict.fromkeys(
        ["_id", "r_id", "py_id", "r_return", "py_return"], True)
    diff_cursor = store.load_differences(projection=wanted_fields,
                                         limit=limit)
    to_delete = []
    n_differences = diff_cursor.count()
    for seen, diff in enumerate(diff_cursor, 1):
        if seen % log_interval == 0:
            LOGGER.info("Processing differences: %d / %d" %
                        (seen, n_differences))
        r_id, py_id = diff["r_id"], diff["py_id"]
        r_flagged = (r_id in valids) and (diff["r_return"] in valids[r_id])
        if r_flagged or ((py_id in valids) and
                         (diff["py_return"] in valids[py_id])):
            to_delete.append(diff["_id"])
    LOGGER.info("Differences to delete: %d / %d" %
                (len(to_delete), diff_cursor.count()))
    n_to_delete = len(to_delete)
    for i, diff_id in enumerate(to_delete):
        if i % log_interval == 0:
            LOGGER.info("Deleting difference %d/%d ... " % (i, n_to_delete))
        store.delete_difference(diff_id)
Ejemplo n.º 14
0
def normalize(log_interval=100):
  """Store a normalized copy of every raw statement with variables and outputs.

  Raw statements lacking a truthy ``'variables'`` or ``'outputs'`` entry are
  ignored. For the others the snippet is replaced by its normalized form,
  the ``"_id"`` is removed and the record is stored as a normalized
  statement.
  """
  store = mongo_driver.MongoStore(props.DATASET)
  LOGGER.info("Fetching statements ... ")
  cursor = store.load_raw_stmts()
  stmts = [
      raw for raw in cursor
      if raw.get('variables', None) and raw.get('outputs', None)
  ]
  del cursor
  total = len(stmts)
  LOGGER.info("Valid Statements: %d!" % total)
  for position, stmt in enumerate(stmts, 1):
    if position % log_interval == 0:
      LOGGER.info("Processing statement: %d/%d" % (position, total))
    stmt["snippet"] = syntactic.normalize(stmt)
    del stmt["_id"]
    store.store_normalized_stmt(stmt)
Ejemplo n.º 15
0
def dump_extracted_to_csv():
  """Write snippet counts and per-snippet file counts to ``IPYTHON_CSV``.

  Counts every snippet across the Python file-statement documents, tracks
  the distinct file names each snippet occurs in, and writes one
  tab-separated row per snippet (most common first) with the statement, its
  count and its number of distinct files.

  NOTE: the binary file mode and ``unicode(...).encode`` are the Python 2
  way of writing UTF-8 CSV and are kept as-is.
  """
  store = mongo_driver.MongoStore(props.DATASET)
  docs = store.load_file_stmts(props.TYPE_PYTHON)
  # Ask the cursor for its size once instead of on every iteration.
  n_docs = docs.count()
  stmt_counter = Counter()
  stmt_file_map = {}
  for i, doc in enumerate(docs):
    file_name = doc['file_name']
    LOGGER.info("Processing %d / %d ... " % (i + 1, n_docs))
    for stmt in doc['snippets']:
      stmt_counter[stmt] += 1
      stmt_file_map.setdefault(stmt, set()).add(file_name)
  with open(IPYTHON_CSV, "wb") as csv_file:
    writer = csv.writer(csv_file, delimiter='\t')
    writer.writerow(["Statement", "Count", "# unique files"])
    for stmt, count in stmt_counter.most_common():
      writer.writerow([unicode(stmt).encode("utf-8"), count, len(stmt_file_map[stmt])])
Ejemplo n.º 16
0
def export_normalized_stmt_file_map(xl_path):
    """Export the stored snippet-to-files map to an Excel workbook.

    Each map entry produces one row per file name; the snippet (with
    ``"slacc"`` replaced by ``"df"``) and the language appear only on the
    entry's first row and are blank on the following ones. The rows are
    written to the sheet ``"Snippet File Map"`` of the workbook at
    ``xl_path``.

    Args:
        xl_path: Destination path of the ``.xlsx`` file.
    """
    store = mongo_driver.MongoStore(props.DATASET)
    snippets = []
    languages = []
    file_names = []
    for file_stmt in store.load_stmt_file_map():
        snippet = file_stmt["snippet"].replace("slacc", "df")
        # An entry without files still gets one row (blank file name), so the
        # three columns always end up with the same length.
        entry_files = list(file_stmt["file_names"]) or ['']
        empties = [''] * (len(entry_files) - 1)
        snippets += [snippet] + empties
        languages += [file_stmt["language"]] + empties
        file_names += entry_files
    d = OrderedDict()
    d["snippet"] = snippets
    d["languages"] = languages
    d["file_names"] = file_names
    df = pd.DataFrame(d, columns=d.keys())
    writer = pd.ExcelWriter(xl_path, engine='xlsxwriter')
    try:
        df.to_excel(writer, sheet_name="Snippet File Map", index=False)
    finally:
        # Without closing the writer the workbook is never written to disk.
        writer.close()
Ejemplo n.º 17
0
def difference(r_stmt, py_stmt, store=None, do_log=False):
    """Compare every usable R output with every usable Python output.

    An output is skipped when it is flagged ``is_all_same`` or when
    ``is_all_none`` holds for its returns. For each remaining (R, Python)
    pair the returns are compared with ``pooled_compare_returns`` and the
    result is saved to the store under the de-escaped return keys.

    Args:
        r_stmt: Statement of one language.
        py_stmt: Statement of another language (asserted to differ).
        store: Store to save into; a new ``MongoStore`` is opened when None.
        do_log: Log each save and forward the flag to the store.
    """
    assert r_stmt.language != py_stmt.language
    store = mongo_driver.MongoStore(props.DATASET) if store is None else store

    def is_comparable(result):
        return not (result.is_all_same or is_all_none(result.returns))

    for r_return, r_result in r_stmt.outputs.items():
        if not is_comparable(r_result):
            continue
        for py_return, py_result in py_stmt.outputs.items():
            if not is_comparable(py_result):
                continue
            diffs = pooled_compare_returns(r_result.returns,
                                           py_result.returns)
            if do_log:
                LOGGER.info("Saving R: %s, PY: %s ... " %
                            (r_stmt.mongo_id, py_stmt.mongo_id))
            store.save_difference(
                r_id=r_stmt.mongo_id,
                py_id=py_stmt.mongo_id,
                r_return=mongo_driver.mongo_de_escape(r_return),
                py_return=mongo_driver.mongo_de_escape(py_return),
                diff=diffs,
                do_log=do_log)
Ejemplo n.º 18
0
def check_correlation():
  """Print pairwise Pearson correlations of the stored syntactic distances.

  Reads the AST, n-gram and Levenshtein distances of every difference record
  that has all three, printing the record index every 1000 records, and then
  prints the Pearson correlation for each pair of distance measures.
  """
  store = mongo_driver.MongoStore(props.DATASET)
  diffs = store.load_differences(projection={"diff": False})
  asts, ngrams, levenshteins = [], [], []
  for i, diff in enumerate(diffs):
    if i % 1000 == 0:
      print(i)
    distances = (diff.get('d_ast', None),
                 diff.get('d_n_gram', None),
                 diff.get('d_levenshtein', None))
    if any(value is None for value in distances):
      continue
    ast, ngram, lev = distances
    asts.append(ast)
    ngrams.append(ngram)
    levenshteins.append(lev)
  print("## Pearson Correlation")
  print("AST-Ngram", pearsonr(asts, ngrams))
  print("AST-Levenshtein", pearsonr(asts, levenshteins))
  print("Ngram-Levenshtein", pearsonr(ngrams, levenshteins))
Ejemplo n.º 19
0
def extract_variable_names():
  """Detect variables of not-yet-stored R snippets and store them.

  Counts every snippet from the R file-statement documents that is not
  already among the stored statements, runs ``extract_variable`` on each
  distinct snippet and stores it. A snippet is stored with its variables
  only when exactly one variable was found; otherwise it is stored with
  ``None``.
  """
  store = mongo_driver.MongoStore(props.DATASET)
  docs = store.load_file_stmts(props.TYPE_R)
  stored_stmts = store.load_stmts(props.TYPE_R, is_valid=False)
  stmt_counter = Counter()
  for doc in docs:
    for stmt in doc['snippets']:
      if not stored_stmts or (stmt, props.TYPE_R) not in stored_stmts:
        stmt_counter[stmt] += 1
  n_stmts = len(stmt_counter)
  valid_stmts = 0
  for i, stmt in enumerate(stmt_counter):
    if i % 10 == 0:
      LOGGER.info("Processing %d / %d .... " % (i + 1, n_stmts))
    variables = extract_variable(stmt)
    if variables and len(variables) == 1:
      valid_stmts += 1
    else:
      if DEBUG:
        # Log what was extracted *before* discarding it; the old code logged
        # list(None) here, which raised TypeError.
        LOGGER.info(list(variables) if variables else variables)
        LOGGER.info("STMT:::: %s\n" % stmt)
      variables = None
    store.store_stmt(stmt, props.TYPE_R, variables)
  LOGGER.info("VALID: %d / %d" % (valid_stmts, n_stmts))
Ejemplo n.º 20
0
def export_similar_differences(sim, syn, xl_writer, sheet_name, syn_key):
    """Write the differences matching the given thresholds to an Excel sheet.

    Only differences with ``n_both_empty == 0`` are considered. A positive
    ``sim`` keeps records whose ``semantic_score`` is at least ``sim``; a
    negative one keeps records at most ``abs(sim)``; zero adds no constraint.
    ``syn`` constrains the field named by ``syn_key`` in the same way.

    For every match the normalized R and Python statements are loaded, the
    renamed variable is replaced by ``"df"``, and one row with both snippets
    and the stored distances is written to ``sheet_name`` of ``xl_writer``.
    """
    LOGGER.info("Running for %s ... " % sheet_name)

    def threshold_clause(threshold):
        # Positive -> lower bound, negative -> upper bound, zero -> no filter.
        if threshold > 0:
            return {"$gte": abs(threshold)}
        if threshold < 0:
            return {"$lte": abs(threshold)}
        return None

    query = {"n_both_empty": {"$eq": 0}}
    for field, threshold in (("semantic_score", sim), (syn_key, syn)):
        clause = threshold_clause(threshold)
        if clause is not None:
            query[field] = clause

    projections = dict.fromkeys([
        "py_id", "r_id", "r_snippet", "r_return", "py_return", "py_snippet",
        "row_diff", "col_diff", "d_n_gram", "d_ast", "d_levenshtein",
        "size_diff", "semantic_score"
    ], 1)
    projections["_id"] = 0

    store = mongo_driver.MongoStore(props.DATASET)
    diffs = list(
        store.load_differences(additional_queries=query,
                               projection=projections))

    # Sheet column -> field of the difference record it is filled from.
    metric_fields = (("AST Distance", "d_ast"),
                     ("N-Gram Distance", "d_n_gram"),
                     ("Semantic Score", "semantic_score"),
                     ("row_diff", "row_diff"),
                     ("col_diff", "col_diff"),
                     ("size_diff", "size_diff"))
    d = OrderedDict()
    d["R"] = []
    d["Pandas"] = []
    for column, _ in metric_fields:
        d[column] = []

    for diff in diffs:
        r_doc = store.load_stmt(diff["r_id"], use_normalized=True)
        r_text = syntactic.r_normalize(r_doc).replace(
            syntactic.RENAMED_VARIABLE, "df")
        py_doc = store.load_stmt(diff["py_id"], use_normalized=True)
        py_text = syntactic.py_normalize(py_doc).replace(
            syntactic.RENAMED_VARIABLE, "df")
        d["R"].append(r_text)
        d["Pandas"].append(py_text)
        for column, field in metric_fields:
            d[column].append(diff.get(field, None))

    df = pd.DataFrame(d, columns=d.keys())
    df.to_excel(xl_writer, sheet_name=sheet_name, index=False)
Ejemplo n.º 21
0
def _collect_syntactic_distances(diffs, label, log_interval=10000):
  """Gather AST, n-gram and Levenshtein distances from difference records.

  Records missing any of the three distances are skipped so the three
  returned lists stay aligned. Progress is logged under ``label``.
  """
  asts, ngrams, levenshteins = [], [], []
  for i, diff in enumerate(diffs):
    if i % log_interval == 0:
      LOGGER.info("Processed %s: %d ..." % (label, i))
    ast = diff.get('d_ast', None)
    ngram = diff.get('d_n_gram', None)
    lev = diff.get('d_levenshtein', None)
    if ast is None or ngram is None or lev is None:
      continue
    asts.append(ast)
    ngrams.append(ngram)
    levenshteins.append(lev)
  return asts, ngrams, levenshteins


def _print_anova_report(title, rpy, py, r):
  """Print the one-way ANOVA results and group statistics for one distance."""
  print("\n### %s" % title)
  f_measure, p_value = f_oneway(rpy, py, r)
  print("F-Measure: %f, p-value: %f" % (f_measure, p_value))
  for name, values in (("R-Py", rpy), ("Py", py), ("R", r)):
    # np.var is the variance, so it is labelled "Var" (was mislabelled "Std").
    print("%s => Mean: %f, Var: %f" % (name, float(np.mean(values)), float(np.var(values))))
  for name, first, second in (("Rpy-Py", rpy, py), ("Rpy-R", rpy, r), ("Py-R", py, r)):
    f_measure, p_value = f_oneway(first, second)
    print("%s => F-Measure: %f, p-value: %f" % (name, f_measure, p_value))


def check_anova():
  """Run one-way ANOVA over the syntactic distances of three record groups.

  The groups are the cross-language (R-Py) differences, the Python
  self-differences and the R self-differences. For each of the AST, n-gram
  and Levenshtein distances this prints the three-group F-test, the mean and
  variance of every group, and the three pairwise F-tests.
  """
  store = mongo_driver.MongoStore(props.DATASET)
  LOGGER.info("Processing for R-Py")
  diffs = store.load_differences(projection={"diff": False})
  rpy_asts, rpy_ngrams, rpy_levenshteins = _collect_syntactic_distances(diffs, "R-Py")
  LOGGER.info("Processing for Python")
  py_diffs = store.load_self_syntactic_differences(language=props.TYPE_PYTHON)
  py_asts, py_ngrams, py_levenshteins = _collect_syntactic_distances(py_diffs, "Python")
  LOGGER.info("Processing for R")
  r_diffs = store.load_self_syntactic_differences(language=props.TYPE_R)
  r_asts, r_ngrams, r_levenshteins = _collect_syntactic_distances(r_diffs, "R")
  _print_anova_report("AST distance", rpy_asts, py_asts, r_asts)
  _print_anova_report("N-Gram distance", rpy_ngrams, py_ngrams, r_ngrams)
  _print_anova_report("Levenshtein distance", rpy_levenshteins, py_levenshteins, r_levenshteins)
Ejemplo n.º 22
0
def _test_execute_stmt():
    """Execute one hard-coded statement and print part of its output.

    Prints the first element of the first value of the mapping returned by
    ``execute_stmt``. Indexing ``values()`` relies on it being a list, as in
    Python 2.
    """
    store = mongo_driver.MongoStore(props.DATASET)
    generated_args = arg_generator.generate_args()
    stmt = store.load_stmt("5cafff91e850c97e6a1f5b9c")
    outputs = execute_stmt(stmt, generated_args)
    first_output = outputs.values()[0]
    print(first_output[0])
Ejemplo n.º 23
0
def run_execute_stmts():
    """Load the valid ``"py"`` statements and execute them without skip logs."""
    py_stmts = mongo_driver.MongoStore(props.DATASET).load_stmts(
        language="py", is_valid=True)
    execute_stmts(py_stmts, do_log=False)
Ejemplo n.º 24
0
def runner(skip_threshold=3500, start=0, end=None, use_normalized=False):
    """Compute and save differences between R and Python statements.

    Loads the raw R and Python statements that have both ``'variables'`` and
    ``'outputs'``, then calls ``difference`` for each selected R statement
    against every Python statement that has not already been diffed with it.

    Args:
        skip_threshold: An R statement with more existing differences than
            this is treated as done and skipped.
        start: Position of the first R statement to process.
        end: Position at which to stop (exclusive); a falsy value means no
            upper bound.
        use_normalized: Forwarded to ``store.load_raw_stmts``.
    """
    LOGGER.info(
        "Computing differences for R stmts b/w %d and %d on %d processes" %
        (start, end if end else -1, multiprocessing.cpu_count()))
    # log_interval = 100
    log_interval = 100
    store = mongo_driver.MongoStore(props.DATASET)
    r_cursor = store.load_raw_stmts(props.TYPE_R,
                                    use_normalized=use_normalized)
    # Keep only R statements that have both variables and outputs.
    r_stmts = []
    for r_stmt in r_cursor:
        if not r_stmt.get('variables', None) or not r_stmt.get(
                'outputs', None):
            continue
        r_stmts.append(r_stmt)
    del r_cursor

    # Top Py Statements
    py_cursor = store.load_raw_stmts(props.TYPE_PYTHON,
                                     use_normalized=use_normalized)
    # Same filter for the Python statements; they stay raw records here and
    # are converted to Statement objects lazily in the inner loop below.
    py_stmts = []
    for py_stmt in py_cursor:
        if (not py_stmt.get('variables', None)) or (not py_stmt.get(
                'outputs', None)):
            continue
        py_stmts.append(py_stmt)
    del py_cursor

    for i, r_stmt in enumerate(r_stmts):
        if i < start or (end and i >= end):
            LOGGER.info("Skipping R Stmt: %d / %d !" % (i + 1, len(r_stmts)))
            continue
        existing_diffs = store.load_differences(r_id=r_stmt["_id"])
        # Python ids that already have a saved difference with this R stmt.
        processed = set()
        if existing_diffs.count() > skip_threshold:
            LOGGER.info("Processed R Stmt: %d / %d !" % (i + 1, len(r_stmts)))
            continue
        elif existing_diffs.count() > 0:
            for diff in existing_diffs:
                processed.add(diff["py_id"])
        LOGGER.info("Processing R Stmt: %d / %d ... " % (i + 1, len(r_stmts)))
        r_stmt = Statement(mongo_id=r_stmt["_id"],
                           snippet=r_stmt["snippet"],
                           variables=r_stmt["variables"],
                           language=r_stmt["language"],
                           outputs=format_outputs(r_stmt["outputs"]))
        if r_stmt.is_all_same() or r_stmt.is_all_none():
            LOGGER.info("Empty or singular R stmt: %d. Skipping ..." % (i + 1))
            continue
        # `valid` is incremented for every Python statement visited (skipped
        # ones included); it only drives the log cadence and log messages.
        valid = 0
        # Never incremented by the active code below (only by the
        # commented-out timeout handling), so the final log reports 0.
        took_too_long = 0
        for j in xrange(len(py_stmts)):
            valid += 1
            do_log = valid % log_interval == 0
            py_stmt = py_stmts[j]
            # None marks a Python statement found empty/singular on an
            # earlier pass.
            if py_stmt is None:
                if do_log:
                    LOGGER.info("Empty or singular py stmt: %d. Skipping ..." %
                                valid)
                continue
            if not isinstance(py_stmt, Statement):
                # First visit: convert the raw record in place so later R
                # statements reuse the Statement (or the None marker).
                py_stmts[j] = Statement(mongo_id=py_stmt["_id"],
                                        snippet=py_stmt["snippet"],
                                        variables=py_stmt["variables"],
                                        language=py_stmt["language"],
                                        outputs=format_outputs(
                                            py_stmt["outputs"]))
                if py_stmts[j].is_all_same() or py_stmts[j].is_all_none():
                    py_stmts[j] = None
                    if do_log:
                        LOGGER.info(
                            "Empty or singular py stmt: %d. Skipping ..." %
                            valid)
                    continue
                py_stmt = py_stmts[j]
            if py_stmt.mongo_id in processed:
                if do_log:
                    LOGGER.info("Already processed py stmt: %d !" % valid)
                continue
            if do_log:
                LOGGER.info("Processing py stmt: %d" % valid)
            difference(r_stmt, py_stmt, store=store, do_log=do_log)
            # Disabled alternative: run `difference` under a SIGALRM timeout.
            # prev_signal = signal.getsignal(signal.SIGALRM)
            # signal.signal(signal.SIGALRM, execute.timeout_handler)
            # signal.alarm(max_wait_time)
            # try:
            #   difference(r_stmt, py_stmt, store=store, do_log=do_log)
            # except execute.TimeoutException:
            #   took_too_long += 1
            #   LOGGER.info("Timed out for py: %s" % py_stmt.mongo_id)
            # except Exception as e:
            #   LOGGER.info(e.message)
            # signal.alarm(0)
            # signal.signal(signal.SIGALRM, prev_signal)
        LOGGER.info("# Timed Out for %s = %d" %
                    (r_stmt.mongo_id, took_too_long))