def test_difference():
    """Time a cross-language comparison of two hard-coded statements.

    Loads one Python and one R statement from the Mongo store by id, wraps
    each in a ``Statement``, diffs the returns of the first output of each
    with ``pooled_compare_returns`` and prints the first diff after a
    ``to_dict`` / ``DiffMeta.from_dict`` round trip.  The duration of each
    phase is printed as well.

    NOTE: finishes by calling ``exit()``, which terminates the interpreter.
    """
    py_id = "5cafffc1e850c97e6a1f9a70"
    r_id = "5cb567d2e850c93c2bec1361"
    store = mongo_driver.MongoStore(props.DATASET)

    start = time.time()
    py_stmt = store.load_stmt(mongo_id=py_id)
    py_stmt = Statement(mongo_id=py_stmt["_id"],
                        snippet=py_stmt["snippet"],
                        variables=py_stmt["variables"],
                        language=py_stmt["language"],
                        outputs=format_outputs(py_stmt["outputs"]))
    py_time = time.time()
    print("Processing Py = %0.2f" % (py_time - start))

    r_stmt = store.load_stmt(mongo_id=r_id)
    r_stmt = Statement(mongo_id=r_stmt["_id"],
                       snippet=r_stmt["snippet"],
                       variables=r_stmt["variables"],
                       language=r_stmt["language"],
                       outputs=format_outputs(r_stmt["outputs"]))
    r_time = time.time()
    # This phase measures the R statement, so label it as such.
    print("Processing R = %0.2f" % (r_time - py_time))

    # list(...) keeps the first-key lookup valid even when keys() is a view.
    r_key = list(r_stmt.outputs.keys())[0]
    py_key = list(py_stmt.outputs.keys())[0]
    # Other comparators that were tried here with the same arguments:
    # compare_returns(...) and threaded_compare_returns(...[:4], ...[:4]).
    diffs = pooled_compare_returns(r_stmt.outputs[r_key].returns,
                                   py_stmt.outputs[py_key].returns)
    print(DiffMeta.from_dict(diffs[0].to_dict()))
    end = time.time()
    print("Time Taken = %0.2f" % (end - r_time))
    exit()
def fetch_statements(language, force=False, do_save=False, limit=None, as_list=False):
    """Return the executed statements of ``language`` as ``Statement`` objects.

    The pickle cache for the language is used when it exists and ``force`` is
    false.  Otherwise the valid statements with outputs are re-read from the
    Mongo store (at most ``limit``), converted, and pickled when ``do_save``
    is set.

    :param language: language whose statements are fetched.
    :param force: rebuild from Mongo even if a pickle exists.
    :param do_save: pickle the rebuilt statements.
    :param limit: forwarded to ``store.load_stmts``.
    :param as_list: return ``.values()`` instead of the id -> statement dict.
    """
    pkl_file = get_executed_stmts_pkl(language)
    if (not force) and cache.file_exists(pkl_file):
        LOGGER.info("Retrieving existing '%s' statements!" % language)
        cached = cache.load_pickle(pkl_file)
        return cached.values() if as_list else cached

    LOGGER.info("Reprocessing '%s' statements!" % language)
    store = mongo_driver.MongoStore(props.DATASET)
    by_id = {}
    raw_items = store.load_stmts(language=language,
                                 is_valid=True,
                                 has_output=True,
                                 limit=limit).items()
    total = len(raw_items)
    for position, (_, raw) in enumerate(raw_items, 1):
        LOGGER.info("Processing %d / %d .... " % (position, total))
        converted = Statement(mongo_id=raw["_id"],
                              snippet=raw["snippet"],
                              variables=raw["variables"],
                              language=language,
                              outputs=format_outputs(raw["outputs"]))
        by_id[converted.mongo_id] = converted

    if do_save:
        LOGGER.info("Saving statements .... ")
        cache.save_pickle(pkl_file, by_id)
    return by_id.values() if as_list else by_id
def execute_stmts(stmts, limit=props.PY_STMT_LIMIT, do_log=False):
    """Execute the given statements and store their outputs in Mongo.

    A statement is executed only if its snippet is among the ``limit`` most
    common ones reported by ``crawler.get_stmt_counts``, it has no
    ``outputs`` yet, and it does not contain ``.hist(``.

    :param stmts: mapping whose values are statement dicts.
    :param limit: forwarded to ``crawler.get_stmt_counts``.
    :param do_log: also log statements skipped for not being a top snippet.
    """
    store = mongo_driver.MongoStore(props.DATASET)
    n_stmts = len(stmts)
    top_py_stmts = crawler.get_stmt_counts(limit)
    valids = 0
    generated_args = arg_generator.generate_args()

    for position, (_, stmt) in enumerate(stmts.items(), 1):
        progress = (position, n_stmts)
        snippet = stmt["snippet"]
        if snippet not in top_py_stmts:
            if do_log:
                LOGGER.info("Not processing %d / %d. Moving on !" % progress)
        elif 'outputs' in stmt:
            LOGGER.info("Processed %d / %d. Moving on !" % progress)
        elif ".hist(" in snippet:
            # Plotting calls are skipped.
            LOGGER.info(
                "Snippet %d / %d contains a plot operation. Moving on !" % progress)
        else:
            LOGGER.info("Processing %d / %d .... " % progress)
            outputs = execute_stmt(stmt, generated_args)
            assert len(outputs) > 0
            store.update_stmt_outputs(stmt['_id'], outputs)
            valids += 1

    LOGGER.info("Valid outputs = %d / %d" % (valids, n_stmts))
def create_normalized_stmt_file_map():
    """Store, for each normalized snippet, the files it appears in.

    Walks every file-statement record; each snippet that is a valid statement
    with output is normalized and mapped to its language and the set of file
    names containing it.  The mapping is then written to the store one
    normalized snippet at a time.
    """
    store = mongo_driver.MongoStore(props.DATASET)
    LOGGER.info("Fetching Stmts .... ")
    valid_stmts = store.load_stmts(is_valid=True, has_output=True)
    LOGGER.info("Fetching File Stmts .... ")
    file_stmts = store.load_file_stmts()

    back_pointers = {}
    n_file_stmts = file_stmts.count()
    for file_no, file_stmt in enumerate(file_stmts, 1):
        LOGGER.info("Processing file %d / %d ... " % (file_no, n_file_stmts))
        # Keep what follows PROJECTS_SRC, then drop the first two
        # "/"-separated components of that remainder.
        after_src = file_stmt['file_name'].split(props.PROJECTS_SRC)[-1]
        file_name = after_src.split("/", 2)[-1]
        language = file_stmt['language']
        for snippet in file_stmt['snippets']:
            stmt_key = (snippet, language)
            if stmt_key not in valid_stmts:
                continue
            normalized_snippet = syntactic.normalize(valid_stmts[stmt_key])
            pointer = back_pointers.setdefault(normalized_snippet, {
                "language": language,
                "file_names": set()
            })
            pointer["file_names"].add(file_name)

    LOGGER.info("Saving stmt to file pointers ... ")
    for normalized_snippet, pointer in back_pointers.items():
        # Convert the set to a list before handing it to the store.
        pointer["file_names"] = list(pointer["file_names"])
        store.create_stmt_file_map(normalized_snippet, pointer)
def get_single_output_stmts(language, limit=None, log_interval=100):
    """Record, per statement, which outputs are identical for all inputs.

    For every valid normalized statement with output, the ``is_all_same``
    flag of each output is collected by return key and written to the
    statement's ``meta`` field.  The number of outputs flagged as all-same is
    logged at the end.

    :param language: language of the statements to inspect.
    :param limit: forwarded to ``store.load_stmts``.
    :param log_interval: log progress every this many statements.
    """
    store = mongo_driver.MongoStore(props.DATASET)
    raw_items = store.load_stmts(language=language,
                                 is_valid=True,
                                 has_output=True,
                                 limit=limit,
                                 use_normalized=True).items()
    LOGGER.info("Fetched statements from mongo ... ")
    n_stmts = len(raw_items)
    n_singular_output_stmts = 0

    for position, (_, raw) in enumerate(raw_items, 1):
        if position % log_interval == 0:
            LOGGER.info("Processing %d / %d .... " % (position, n_stmts))
        stmt = compare.Statement(mongo_id=raw["_id"],
                                 snippet=raw["snippet"],
                                 variables=raw["variables"],
                                 language=language,
                                 outputs=compare.format_outputs(raw["outputs"]))
        flags = {}
        for ret_key in stmt.outputs.keys():
            flag = stmt.outputs[ret_key].is_all_same
            flags[ret_key] = flag
            if flag:
                n_singular_output_stmts += 1
        store.update_stmt(stmt.mongo_id, {"meta": flags})

    LOGGER.info("Completed. Singular Output Statements = %d / %d" %
                (n_singular_output_stmts, n_stmts))
def update_semantic_scores(start=0, end=None, log_interval=100, limit=0):
    """Compute and store semantic summaries for difference records.

    Only records without an ``n_mismatched`` field are loaded.  For each
    record whose position lies in ``[start, end)`` every entry of its
    ``diff`` list is folded into a ``SemanticSummary`` and the summary is
    written back to the record.

    :param start: position of the first record to process.
    :param end: position to stop at; a falsy value means no upper bound.
    :param log_interval: log progress every this many records.
    :param limit: forwarded to ``store.load_differences``.
    """
    store = mongo_driver.MongoStore(props.DATASET)
    pending = store.load_differences(
        additional_queries={"n_mismatched": {"$exists": False}},
        limit=limit)
    n_records = pending.count()
    LOGGER.info("Retrieved %d records ..." % n_records)

    semantic_columns = [
        "semantic_score", "n_mismatched", "row_diff", "col_diff",
        "size_diff", "n_both_empty", "n_val1_empty", "n_val2_empty"
    ]
    store.create_semantic_indices(semantic_columns)

    for position, record in enumerate(pending):
        in_window = position >= start and not (end and position >= end)
        if not in_window:
            continue
        if (position + 1) % log_interval == 0:
            LOGGER.info("Processing semantic difference for %d / %d" %
                        (position + 1, n_records))
        summary = SemanticSummary()
        for raw_diff in record["diff"]:
            summary.update_summary(differences.DiffMeta.from_dict(raw_diff))
        store.update_difference({"_id": record["_id"]}, summary.summarize())
def extract_variable_names():
    """Detect variables for Python snippets not yet in the statement store.

    Every snippet of the Python file statements that is not among the
    statements returned by ``store.load_stmts`` is parsed with
    ``VariableDetector``.  A snippet counts as valid only when exactly one
    variable is detected; otherwise, or when parsing raises, it is stored
    with ``variables=None``.  With ``DEBUG`` set, rejected snippets are
    logged.
    """
    store = mongo_driver.MongoStore(props.DATASET)
    docs = store.load_file_stmts(props.TYPE_PYTHON)
    stored_stmts = store.load_stmts(props.TYPE_PYTHON, is_valid=False)

    stmt_counter = Counter()
    for doc in docs:
        for stmt in doc['snippets']:
            if not stored_stmts or (stmt, props.TYPE_PYTHON) not in stored_stmts:
                stmt_counter[stmt] += 1

    n_stmts = len(stmt_counter)
    valid_stmts = 0
    for i, stmt in enumerate(stmt_counter):
        if i % 1000 == 0:
            LOGGER.info("Processing %d / %d .... " % (i + 1, n_stmts))
        try:
            variables = list(VariableDetector(stmt).parse())
        except Exception:
            # Best effort: a snippet that cannot be parsed is still stored,
            # just without variables.
            variables = None
            if DEBUG:
                LOGGER.info("EXCEPTION:::: %s\n" % stmt)
        else:
            if len(variables) == 1:
                valid_stmts += 1
            else:
                if DEBUG:
                    # Log what was detected *before* discarding it; calling
                    # list() on the already-reset None raised TypeError.
                    LOGGER.info(variables)
                    LOGGER.info("STMT:::: %s\n" % stmt)
                variables = None
        store.store_stmt(stmt, props.TYPE_PYTHON, variables)

    LOGGER.info("VALID: %d / %d" % (valid_stmts, n_stmts))
def get_stmt_counts(limit=None):
    """Count how often each snippet occurs in the Python file statements.

    :param limit: number of most common snippets to keep; ``None`` keeps all.
    :return: ``OrderedDict`` of snippet -> count, most common first.
    """
    store = mongo_driver.MongoStore(props.DATASET)
    stmt_counter = Counter()
    for doc in store.load_file_stmts(props.TYPE_PYTHON):
        # iter() makes Counter count elements one by one, whatever the
        # container type of the snippets is.
        stmt_counter.update(iter(doc['snippets']))
    return OrderedDict(stmt_counter.most_common(limit))
def execute_stmts():
    """Execute every stored R statement that has no outputs yet.

    Statements already carrying an ``outputs`` field are skipped; for the
    rest the outputs produced by ``execute_stmt`` are written to the store.
    """
    store = mongo_driver.MongoStore(props.DATASET)
    generated_args = arg_generator.generate_args()
    stmts = store.load_stmts(language='R')
    n_stmts = len(stmts)
    for position, (_, stmt) in enumerate(stmts.items(), 1):
        if 'outputs' not in stmt:
            LOGGER.info("StmtID : '%s'. Processing %d / %d .... " %
                        (stmt['_id'], position, n_stmts))
            outputs = execute_stmt(stmt, generated_args)
            store.update_stmt_outputs(stmt['_id'], outputs)
        else:
            LOGGER.info("Processing %d / %d. Moving on ... " % (position, n_stmts))
def runner(force=False):
    """Parse the converted Python files and store their statements.

    Files that already have statements in the store are skipped.

    :param force: delete the stored Python file statements first.
    """
    store = mongo_driver.MongoStore(props.DATASET)
    if force:
        store.delete_file_stmts(props.TYPE_PYTHON)
    py_files = get_converted_files()
    for position, py_file in enumerate(py_files, 1):
        file_name = cache.get_file_name(py_file)
        already_stored = store.load_stmts_for_file_name(py_file)
        if already_stored:
            LOGGER.info("Processed %s. Moving on ... " % file_name)
            continue
        LOGGER.info("Processing %d / %d ... " % (position, len(py_files)))
        LOGGER.info("Processing %s ... " % file_name)
        store.store_file_stmts(py_file, PARSER.parse_file(py_file), props.TYPE_PYTHON)
def test_execute_stmts():
    """Run ``execute_stmts`` on a fixed sample of ten stored statements."""
    store = mongo_driver.MongoStore(props.DATASET)
    ids = [
        "5cafff91e850c97e6a1f5b9c", "5cafff91e850c97e6a1f5b9e",
        "5cafff91e850c97e6a1f5b9f", "5cafff91e850c97e6a1f5ba0",
        "5cafff91e850c97e6a1f5ba5", "5cafff91e850c97e6a1f5bb0",
        "5cafff91e850c97e6a1f5bb7", "5cafff91e850c97e6a1f5bc6",
        "5cafff91e850c97e6a1f5bc8", "5cafff91e850c97e6a1f5bf0"
    ]
    # Load everything first, then key the statements by their mongo id.
    loaded = [store.load_stmt(_id) for _id in ids]
    stmts = {}
    for stmt in loaded:
        stmts[stmt["_id"]] = stmt
    execute_stmts(stmts)
def syntactic_differences(start=0):
    """Store pairwise syntactic distances between normalized R statements.

    For every statement ``i >= start`` each later statement ``j > i`` is
    compared (Levenshtein, Jaro, Jaro-Winkler, AST edit distance, n-gram
    distance); statements without a normalized form are skipped.  The records
    of one ``i`` are saved together once its row is complete.

    :param start: index of the first statement to use as left-hand side.
    """
    store = mongo_driver.MongoStore(props.DATASET)
    stmts = syntactic.get_normalized_R_statements(debug=True)
    n_stmts = len(stmts)
    trees = {}

    def tree_for(index):
        # Parse lazily and remember the result so each statement is parsed
        # once across all the pairs it takes part in.
        tree = trees.get(index, None)
        if not tree:
            tree = ast_distances.r_parse(stmts[index].normalized)
            trees[index] = tree
        return tree

    for i in xrange(max(start, 0), n_stmts - 1):
        LOGGER.info("Processing %d / %d R snippet ... " % (i + 1, n_stmts))
        stmt_i = stmts[i]
        if not stmt_i.normalized:
            continue
        stmt_i_tree = tree_for(i)
        records = []
        # j always starts past i, so no "i >= j" guard is needed.
        for j in xrange(i + 1, n_stmts):
            stmt_j = stmts[j]
            if not stmt_j.normalized:
                continue
            stmt_j_tree = tree_for(j)
            record = {
                "id_1": stmt_i.mongo_id,
                "id_2": stmt_j.mongo_id,
                "language": props.TYPE_R,
                "d_levenshtein": syntactic.levenshtein(stmt_i.normalized,
                                                       stmt_j.normalized),
                "d_jaro": syntactic.jaro(stmt_i.normalized, stmt_j.normalized),
                "d_jaro_winkler": syntactic.jaro_winkler(stmt_i.normalized,
                                                         stmt_j.normalized),
                "d_ast": ast_distances.edit_distance(stmt_i_tree, stmt_j_tree)
            }
            try:
                record["d_n_gram"] = syntactic.n_gram_distance(
                    stmt_i.normalized, stmt_j.normalized)[0]
            except Exception:
                # Best effort: fall back to 1.0 when the n-gram distance
                # cannot be computed.
                record["d_n_gram"] = 1.0
            records.append(record)
        store.save_self_syntactic_differences(records, do_log=True)
        # Later rows only touch indices greater than i; release this tree.
        trees.pop(i, None)
def delete_single_output_stmts(limit=0, log_interval=100):
    """Delete differences that involve a flagged output of a statement.

    First collects, per statement, the de-escaped ``meta`` keys whose value
    is truthy.  A difference is then deleted when its ``r_return`` is flagged
    for its R statement or its ``py_return`` is flagged for its Python
    statement.

    :param limit: forwarded to the store queries.
    :param log_interval: log progress every this many items.
    """
    store = mongo_driver.MongoStore(props.DATASET)
    stmts = store.load_stmts(language=None,
                             is_valid=True,
                             has_output=True,
                             limit=limit).values()
    n_stmts = len(stmts)

    valids = {}
    for index, stmt in enumerate(stmts):
        if index % log_interval == 0:
            LOGGER.info("Processing statement %d/%d ... " % (index, n_stmts))
        flagged = {
            mongo_driver.mongo_de_escape(key)
            for key, validity in stmt.get("meta", {}).items() if validity
        }
        if flagged:
            valids[stmt["_id"]] = flagged
    # NOTE(review): this counts statements having at least one truthy meta
    # flag; confirm that the "Non singular" label matches what meta holds.
    LOGGER.info("Non singular output stmts: %d/%d" % (len(valids), n_stmts))

    diff_cursor = store.load_differences(projection={
        "_id": True,
        "r_id": True,
        "py_id": True,
        "r_return": True,
        "py_return": True
    }, limit=limit)
    to_delete = []
    n_differences = diff_cursor.count()
    for seen, diff in enumerate(diff_cursor, 1):
        if seen % log_interval == 0:
            LOGGER.info("Processing differences: %d / %d" % (seen, n_differences))
        r_id, py_id = diff["r_id"], diff["py_id"]
        r_flagged = (r_id in valids) and (diff["r_return"] in valids[r_id])
        if r_flagged or ((py_id in valids) and (diff["py_return"] in valids[py_id])):
            to_delete.append(diff["_id"])
    LOGGER.info("Differences to delete: %d / %d" %
                (len(to_delete), diff_cursor.count()))

    n_to_delete = len(to_delete)
    for index, diff_id in enumerate(to_delete):
        if index % log_interval == 0:
            LOGGER.info("Deleting difference %d/%d ... " % (index, n_to_delete))
        store.delete_difference(diff_id)
def normalize(log_interval=100):
    """Store a normalized copy of every raw statement with variables and outputs.

    Each kept statement gets its ``snippet`` replaced by the result of
    ``syntactic.normalize`` and its ``_id`` removed before it is handed to
    ``store.store_normalized_stmt``.

    :param log_interval: log progress every this many statements.
    """
    store = mongo_driver.MongoStore(props.DATASET)
    LOGGER.info("Fetching statements ... ")
    cursor = store.load_raw_stmts()
    stmts = [
        raw for raw in cursor
        if raw.get('variables', None) and raw.get('outputs', None)
    ]
    del cursor
    total = len(stmts)
    LOGGER.info("Valid Statements: %d!" % total)
    for position, stmt in enumerate(stmts, 1):
        if position % log_interval == 0:
            LOGGER.info("Processing statement: %d/%d" % (position, total))
        # Normalize while "_id" is still present, then drop it.
        stmt["snippet"] = syntactic.normalize(stmt)
        del stmt["_id"]
        store.store_normalized_stmt(stmt)
def dump_extracted_to_csv():
    """Write snippet frequencies of the Python file statements to a CSV.

    The tab-delimited file at ``IPYTHON_CSV`` gets one row per snippet, most
    common first, with its total count and the number of distinct files it
    occurs in.
    """
    store = mongo_driver.MongoStore(props.DATASET)
    docs = store.load_file_stmts(props.TYPE_PYTHON)
    # Count once up front instead of asking the cursor on every iteration.
    n_docs = docs.count()
    stmt_counter = Counter()
    stmt_file_map = {}
    for i, doc in enumerate(docs):
        file_name = doc['file_name']
        LOGGER.info("Processing %d / %d ... " % (i + 1, n_docs))
        for stmt in doc['snippets']:
            stmt_counter[stmt] += 1
            stmt_file_map.setdefault(stmt, set()).add(file_name)
    # Binary mode is what the Python 2 csv module expects.
    with open(IPYTHON_CSV, "wb") as csv_file:
        writer = csv.writer(csv_file, delimiter='\t')
        writer.writerow(["Statement", "Count", "# unique files"])
        for stmt, count in stmt_counter.most_common():
            writer.writerow([
                unicode(stmt).encode("utf-8"), count,
                len(stmt_file_map[stmt])
            ])
def export_normalized_stmt_file_map(xl_path):
    """Export the snippet -> files map to an Excel sheet.

    Each snippet occupies one row per file name; the snippet and its language
    are written on the first of those rows only and left blank on the rest.
    Occurrences of ``"slacc"`` in a snippet are shown as ``"df"``.

    :param xl_path: path of the workbook to write.
    """
    store = mongo_driver.MongoStore(props.DATASET)
    snippets = []
    languages = []
    file_names = []
    for file_stmt in store.load_stmt_file_map():
        snippet = file_stmt["snippet"].replace("slacc", "df")
        # An entry without files still takes one row; otherwise the three
        # columns would end up with different lengths.
        names = list(file_stmt["file_names"]) or ['']
        empties = [''] * (len(names) - 1)
        snippets += [snippet] + empties
        languages += [file_stmt["language"]] + empties
        file_names += names
    d = OrderedDict()
    d["snippet"] = snippets
    d["languages"] = languages
    d["file_names"] = file_names
    df = pd.DataFrame(d, columns=list(d))
    # The context manager saves and closes the workbook; without it the
    # writer was never saved and no file was produced.
    with pd.ExcelWriter(xl_path, engine='xlsxwriter') as writer:
        df.to_excel(writer, sheet_name="Snippet File Map", index=False)
def difference(r_stmt, py_stmt, store=None, do_log=False):
    """Compare every usable output pair of an R and a Python statement.

    Outputs that are all-same or all-``None`` are ignored on both sides.  For
    each remaining (R output, Python output) pair the returns are compared
    with ``pooled_compare_returns`` and the result is saved to the store.

    :param r_stmt: statement whose outputs form the left-hand side.
    :param py_stmt: statement of a different language than ``r_stmt``.
    :param store: store to save to; a ``MongoStore`` is created when ``None``.
    :param do_log: log each save and forward the flag to the store.
    """
    assert r_stmt.language != py_stmt.language
    if store is None:
        store = mongo_driver.MongoStore(props.DATASET)

    def worth_comparing(result):
        # is_all_none is only consulted when the result is not all-same.
        return not (result.is_all_same or is_all_none(result.returns))

    for r_return, r_result in r_stmt.outputs.items():
        if not worth_comparing(r_result):
            continue
        for py_return, py_result in py_stmt.outputs.items():
            if not worth_comparing(py_result):
                continue
            diffs = pooled_compare_returns(r_result.returns, py_result.returns)
            if do_log:
                LOGGER.info("Saving R: %s, PY: %s ... " %
                            (r_stmt.mongo_id, py_stmt.mongo_id))
            store.save_difference(
                r_id=r_stmt.mongo_id,
                py_id=py_stmt.mongo_id,
                r_return=mongo_driver.mongo_de_escape(r_return),
                py_return=mongo_driver.mongo_de_escape(py_return),
                diff=diffs,
                do_log=do_log)
def check_correlation():
    """Print pairwise Pearson correlations of the syntactic distances.

    Uses the AST, n-gram and Levenshtein distances of every stored
    difference that has all three values.
    """
    store = mongo_driver.MongoStore(props.DATASET)
    diffs = store.load_differences(projection={"diff": False})
    asts, ngrams, levenshteins = [], [], []
    for i, diff in enumerate(diffs):
        if i % 1000 == 0:
            print(i)
        measures = (diff.get('d_ast', None), diff.get('d_n_gram', None),
                    diff.get('d_levenshtein', None))
        if any(value is None for value in measures):
            continue
        d_ast, d_ngram, d_lev = measures
        asts.append(d_ast)
        ngrams.append(d_ngram)
        levenshteins.append(d_lev)
    print("## Pearson Correlation")
    print("AST-Ngram", pearsonr(asts, ngrams))
    print("AST-Levenshtein", pearsonr(asts, levenshteins))
    print("Ngram-Levenshtein", pearsonr(ngrams, levenshteins))
def extract_variable_names():
    """Detect variables for R snippets not yet in the statement store.

    Every snippet of the R file statements that is not among the statements
    returned by ``store.load_stmts`` is passed to ``extract_variable``.  A
    snippet counts as valid only when exactly one variable is found;
    otherwise it is stored with ``variables=None``.  With ``DEBUG`` set,
    rejected snippets are logged.
    """
    store = mongo_driver.MongoStore(props.DATASET)
    docs = store.load_file_stmts(props.TYPE_R)
    stored_stmts = store.load_stmts(props.TYPE_R, is_valid=False)

    stmt_counter = Counter()
    for doc in docs:
        for stmt in doc['snippets']:
            if not stored_stmts or (stmt, props.TYPE_R) not in stored_stmts:
                stmt_counter[stmt] += 1

    n_stmts = len(stmt_counter)
    valid_stmts = 0
    for i, stmt in enumerate(stmt_counter):
        if i % 10 == 0:
            LOGGER.info("Processing %d / %d .... " % (i + 1, n_stmts))
        variables = extract_variable(stmt)
        if variables and len(variables) == 1:
            valid_stmts += 1
        else:
            if DEBUG:
                # Log what was found *before* discarding it; calling list()
                # on the already-reset None raised TypeError.
                LOGGER.info(list(variables or []))
                LOGGER.info("STMT:::: %s\n" % stmt)
            variables = None
        store.store_stmt(stmt, props.TYPE_R, variables)

    LOGGER.info("VALID: %d / %d" % (valid_stmts, n_stmts))
def export_similar_differences(sim, syn, xl_writer, sheet_name, syn_key):
    """Export differences matching semantic/syntactic thresholds to a sheet.

    A positive threshold selects values ``>=`` it, a negative one selects
    values ``<=`` its absolute value, and zero adds no constraint.  Only
    differences with ``n_both_empty == 0`` are considered.

    :param sim: threshold on ``semantic_score``.
    :param syn: threshold on the field named by ``syn_key``.
    :param xl_writer: writer handed to ``DataFrame.to_excel``.
    :param sheet_name: name of the sheet to write.
    :param syn_key: name of the syntactic distance field to filter on.
    """
    LOGGER.info("Running for %s ... " % sheet_name)

    def bound(threshold):
        # The sign selects the comparison, the magnitude is the limit.
        if threshold > 0:
            return {"$gte": abs(threshold)}
        if threshold < 0:
            return {"$lte": abs(threshold)}
        return None

    query = {"n_both_empty": {"$eq": 0}}
    for field, threshold in (("semantic_score", sim), (syn_key, syn)):
        clause = bound(threshold)
        if clause is not None:
            query[field] = clause

    projections = {
        "py_id": 1,
        "r_id": 1,
        "r_snippet": 1,
        "r_return": 1,
        "py_return": 1,
        "py_snippet": 1,
        "_id": 0,
        "row_diff": 1,
        "col_diff": 1,
        "d_n_gram": 1,
        "d_ast": 1,
        "d_levenshtein": 1,
        "size_diff": 1,
        "semantic_score": 1
    }
    store = mongo_driver.MongoStore(props.DATASET)
    diffs = list(
        store.load_differences(additional_queries=query, projection=projections))

    # The displayed snippets come from the normalized statements, with the
    # renamed variable shown as "df".
    r_stmts = []
    py_stmts = []
    for diff in diffs:
        r_stmts.append(
            syntactic.r_normalize(
                store.load_stmt(diff["r_id"], use_normalized=True)).replace(
                    syntactic.RENAMED_VARIABLE, "df"))
        py_stmts.append(
            syntactic.py_normalize(
                store.load_stmt(diff["py_id"], use_normalized=True)).replace(
                    syntactic.RENAMED_VARIABLE, "df"))

    def column(field):
        return [diff.get(field, None) for diff in diffs]

    # NOTE(review): d_levenshtein is projected but not exported - confirm
    # whether a column for it is wanted.
    d = OrderedDict()
    d["R"] = r_stmts
    d["Pandas"] = py_stmts
    d["AST Distance"] = column("d_ast")
    d["N-Gram Distance"] = column("d_n_gram")
    d["Semantic Score"] = column("semantic_score")
    d["row_diff"] = column("row_diff")
    d["col_diff"] = column("col_diff")
    d["size_diff"] = column("size_diff")
    df = pd.DataFrame(d, columns=d.keys())
    df.to_excel(xl_writer, sheet_name=sheet_name, index=False)
def _collect_syntactic_distances(diffs, label, log_interval=10000):
    """Return (asts, ngrams, levenshteins) for records having all three values."""
    asts, ngrams, levenshteins = [], [], []
    for i, diff in enumerate(diffs):
        if i % log_interval == 0:
            LOGGER.info("Processed %s: %d ..." % (label, i))
        d_ast = diff.get('d_ast', None)
        d_ngram = diff.get('d_n_gram', None)
        d_lev = diff.get('d_levenshtein', None)
        if d_ast is None or d_ngram is None or d_lev is None:
            continue
        asts.append(d_ast)
        ngrams.append(d_ngram)
        levenshteins.append(d_lev)
    return asts, ngrams, levenshteins


def _print_distance_anova(title, rpy, py, r):
    """Print one-way ANOVA results and per-group statistics for one measure."""
    print("\n### %s" % title)
    f_measure, p_value = f_oneway(rpy, py, r)
    print("F-Measure: %f, p-value: %f" % (f_measure, p_value))
    for label, values in (("R-Py", rpy), ("Py", py), ("R", r)):
        # The column is labelled "Std", so report the standard deviation
        # (it used to be the variance); float() replaces np.asscalar.
        print("%s => Mean: %f, Std: %f" %
              (label, float(np.mean(values)), float(np.std(values))))
    for label, first, second in (("Rpy-Py", rpy, py), ("Rpy-R", rpy, r),
                                 ("Py-R", py, r)):
        f_measure, p_value = f_oneway(first, second)
        print("%s => F-Measure: %f, p-value: %f" % (label, f_measure, p_value))


def check_anova():
    """Print ANOVA comparisons of syntactic distances across three groups.

    The groups are the R-vs-Python differences and the self-differences of
    Python and of R.  For each of the AST, n-gram and Levenshtein distances
    the three-way F-test, the per-group mean and standard deviation, and the
    three pairwise F-tests are printed.
    """
    store = mongo_driver.MongoStore(props.DATASET)
    LOGGER.info("Processing for R-Py")
    rpy = _collect_syntactic_distances(
        store.load_differences(projection={"diff": False}), "R-Py")
    LOGGER.info("Processing for Python")
    py = _collect_syntactic_distances(
        store.load_self_syntactic_differences(language=props.TYPE_PYTHON),
        "Python")
    LOGGER.info("Processing for R")
    r = _collect_syntactic_distances(
        store.load_self_syntactic_differences(language=props.TYPE_R), "R")

    titles = ("AST distance", "N-Gram distance", "Levenshtein distance")
    for title, rpy_values, py_values, r_values in zip(titles, rpy, py, r):
        _print_distance_anova(title, rpy_values, py_values, r_values)
def _test_execute_stmt():
    """Execute one hard-coded stored statement and print part of its output.

    Prints the first element of the first value of the outputs returned by
    ``execute_stmt``.
    """
    store = mongo_driver.MongoStore(props.DATASET)
    generated_args = arg_generator.generate_args()
    stmt = store.load_stmt("5cafff91e850c97e6a1f5b9c")
    outputs = execute_stmt(stmt, generated_args)
    first_value = outputs.values()[0]
    print(first_value[0])
def run_execute_stmts():
    """Run ``execute_stmts`` over all valid ``"py"`` statements, without skip logging."""
    execute_stmts(
        mongo_driver.MongoStore(props.DATASET).load_stmts(language="py",
                                                          is_valid=True),
        do_log=False)
def runner(skip_threshold=3500, start=0, end=None, use_normalized=False):
    """Compute and store differences between R and Python statements.

    Every R statement at a position in ``[start, end)`` is compared with
    every Python statement via ``difference``.  An R statement is skipped
    when it already has more than ``skip_threshold`` stored differences or
    when it is all-same / all-``None``.  Python statements already compared
    with it are not compared again.

    :param skip_threshold: number of stored differences above which an R
        statement is treated as done.
    :param start: position of the first R statement to process.
    :param end: position to stop at; a falsy value means no upper bound.
    :param use_normalized: forwarded to ``store.load_raw_stmts``.
    """
    LOGGER.info(
        "Computing differences for R stmts b/w %d and %d on %d processes" %
        (start, end if end else -1, multiprocessing.cpu_count()))
    log_interval = 100
    store = mongo_driver.MongoStore(props.DATASET)

    def has_variables_and_outputs(raw):
        return bool(raw.get('variables', None)) and bool(raw.get('outputs', None))

    r_cursor = store.load_raw_stmts(props.TYPE_R, use_normalized=use_normalized)
    r_stmts = [raw for raw in r_cursor if has_variables_and_outputs(raw)]
    del r_cursor

    py_cursor = store.load_raw_stmts(props.TYPE_PYTHON,
                                     use_normalized=use_normalized)
    py_stmts = [raw for raw in py_cursor if has_variables_and_outputs(raw)]
    del py_cursor

    n_r_stmts = len(r_stmts)
    for r_index, raw_r in enumerate(r_stmts):
        r_position = r_index + 1
        if r_index < start or (end and r_index >= end):
            LOGGER.info("Skipping R Stmt: %d / %d !" % (r_position, n_r_stmts))
            continue

        existing_diffs = store.load_differences(r_id=raw_r["_id"])
        n_existing = existing_diffs.count()
        if n_existing > skip_threshold:
            LOGGER.info("Processed R Stmt: %d / %d !" % (r_position, n_r_stmts))
            continue
        processed = set()
        if n_existing > 0:
            processed.update(diff["py_id"] for diff in existing_diffs)

        LOGGER.info("Processing R Stmt: %d / %d ... " % (r_position, n_r_stmts))
        r_stmt = Statement(mongo_id=raw_r["_id"],
                           snippet=raw_r["snippet"],
                           variables=raw_r["variables"],
                           language=raw_r["language"],
                           outputs=format_outputs(raw_r["outputs"]))
        if r_stmt.is_all_same() or r_stmt.is_all_none():
            LOGGER.info("Empty or singular R stmt: %d. Skipping ..." % r_position)
            continue

        # Stays 0: nothing in the active code increments it (the timeout
        # handling that did is disabled).
        took_too_long = 0
        for py_position, py_stmt in enumerate(py_stmts, 1):
            slot = py_position - 1
            do_log = py_position % log_interval == 0

            # py_stmts doubles as a cache: raw dicts are replaced in place by
            # Statement objects, or by None when they turn out unusable.
            if py_stmt is not None and not isinstance(py_stmt, Statement):
                py_stmt = Statement(mongo_id=py_stmt["_id"],
                                    snippet=py_stmt["snippet"],
                                    variables=py_stmt["variables"],
                                    language=py_stmt["language"],
                                    outputs=format_outputs(py_stmt["outputs"]))
                if py_stmt.is_all_same() or py_stmt.is_all_none():
                    py_stmt = None
                py_stmts[slot] = py_stmt

            if py_stmt is None:
                if do_log:
                    LOGGER.info(
                        "Empty or singular py stmt: %d. Skipping ..." % py_position)
                continue
            if py_stmt.mongo_id in processed:
                if do_log:
                    LOGGER.info("Already processed py stmt: %d !" % py_position)
                continue
            if do_log:
                LOGGER.info("Processing py stmt: %d" % py_position)
            difference(r_stmt, py_stmt, store=store, do_log=do_log)

        LOGGER.info("# Timed Out for %s = %d" % (r_stmt.mongo_id, took_too_long))