def delete_differences(self, r_id=None, py_id=None):
    """Delete the stored differences that match the given ids.

    The filter is built from whichever of ``r_id`` / ``py_id`` is truthy and
    is handed to ``delete_many`` on the differences collection.

    :param r_id: Value matched against the ``r_id`` field; skipped when falsy.
    :param py_id: Value matched against the ``py_id`` field; skipped when falsy.
    :raises Exception: When neither id is given, so that an empty filter is
        never passed to ``delete_many``.
    """
    candidates = (("r_id", r_id), ("py_id", py_id))
    query = {field: value for field, value in candidates if value}
    if not query:
        raise Exception("Empty query. So use drop!")
    LOGGER.info("Deleting differences for query %s .... " % query)
    collection = mongo_driver.get_collection(self.dataset, DIFFERENCES_COLLECTIONS)
    collection.delete_many(query)
def load_valid_snippets(self, language=None, use_normalized=False):
    """Return the statements that have a truthy ``variables`` entry.

    :param language: When truthy, only statements whose ``language`` field
        equals this value are queried; otherwise all statements are queried.
    :param use_normalized: Read from the normalized statement collection
        instead of the regular one.
    :return: List of statement documents, fetched with the ``outputs`` field
        excluded by the projection.
    """
    if use_normalized:
        collection_name = STMT_NORMALIZED_COLLECTION
    else:
        collection_name = STMT_COLLECTION
    criteria = {"language": language} if language else {}
    collection = mongo_driver.get_collection(self.dataset, collection_name)
    cursor = collection.find(criteria, {"outputs": False})
    return [stmt for stmt in cursor if stmt.get('variables', None)]
def load_stmts(self, language=None, is_valid=True, has_output=False, limit=None, use_normalized=False):
    """Load statements keyed by ``(snippet, language)``.

    :param language: When truthy, restrict the query to this language.
    :param is_valid: When true, skip statements without a truthy ``variables``.
    :param has_output: When true, skip statements without a truthy ``outputs``.
    :param limit: When truthy, stop as soon as this many entries were collected.
    :param use_normalized: Read from the normalized statement collection
        instead of the regular one.
    :return: Dict mapping ``(snippet, language)`` to the statement document.
    """
    if use_normalized:
        collection_name = STMT_NORMALIZED_COLLECTION
    else:
        collection_name = STMT_COLLECTION
    collection = mongo_driver.get_collection(self.dataset, collection_name)
    cursor = collection.find({"language": language}) if language else collection.find()

    formatted = {}
    for stmt in cursor:
        if is_valid and not stmt.get('variables', None):
            continue
        if has_output and not stmt.get('outputs', None):
            continue
        formatted[(stmt['snippet'], stmt['language'])] = stmt
        if limit and len(formatted) == limit:
            break
    return formatted
def save_inputs(self, inps):
    """Persist every input as one document in the inputs collection.

    Each element of ``inps`` is iterated and every item in it is converted
    with ``to_dict(orient='records')``; the resulting list is stored under the
    ``"args"`` key of a new document.

    :param inps: Iterable of inputs. Each input is an iterable of objects
        exposing ``to_dict(orient='records')`` (presumably pandas DataFrames -
        confirm against callers).
    """
    collection = mongo_driver.get_collection(self.dataset, INPUTS_COLLECTIONS)
    for inp in inps:
        arg_set = [arg.to_dict(orient='records') for arg in inp]
        # insert_one instead of the legacy Collection.insert, which newer
        # PyMongo releases no longer provide.
        collection.insert_one({"args": arg_set})
def get_executed_functions(self, language):
    """Return the ``names`` stored for ``language``.

    :param language: Value matched against the ``language`` field of the
        ``language_executed_functions`` collection.
    :return: The ``names`` entry of the matching document, or ``None`` when
        no document matches.
    """
    executed = mongo_driver.get_collection(
        self.dataset, "language_executed_functions")
    record = executed.find_one({"language": language})
    return None if record is None else record['names']
def store_normalized_stmt(self, stmt_dict):
    """Store one normalized statement, silently skipping duplicates.

    When the normalized statement collection does not exist yet,
    ``mongo_driver.create_unique_index_for_collection`` is called for
    ``("snippet", "language")`` before inserting.

    :param stmt_dict: The statement document to insert.
        NOTE(review): assumed to be a single dict, as the name suggests; the
        former ``continue_on_error`` flag only influenced bulk inserts.
    """
    collection = mongo_driver.get_collection(self.dataset, STMT_NORMALIZED_COLLECTION)
    if not mongo_driver.is_collection_exists(collection):
        mongo_driver.create_unique_index_for_collection(collection, "snippet", "language")
    try:
        # insert_one instead of the legacy Collection.insert, which newer
        # PyMongo releases no longer provide.
        collection.insert_one(stmt_dict)
    except pymongo.errors.DuplicateKeyError:
        # Deliberate best-effort: a statement that is already stored is
        # simply left as it is.
        pass
def save_language_executed_function_names(self, language, names):
    """Store ``names`` for ``language``, replacing any previous entry.

    When the ``language_executed_functions`` collection does not exist yet,
    ``mongo_driver.create_index_for_collection`` is called for ``"language"``.
    A document already stored for ``language`` is removed through
    ``mongo_driver.delete_document`` before the new one is inserted.

    :param language: Value stored in (and matched against) ``language``.
    :param names: Value stored under the ``names`` key.
    """
    collection = mongo_driver.get_collection(
        self.dataset, "language_executed_functions")
    if not mongo_driver.is_collection_exists(collection):
        mongo_driver.create_index_for_collection(collection, "language")
    if mongo_driver.contains_document(collection, "language", language):
        mongo_driver.delete_document(collection, "language", language)
    # insert_one instead of the legacy Collection.insert, which newer PyMongo
    # releases no longer provide.
    collection.insert_one({"language": language, "names": names})
def load_meta(self, file_name):
    """Look up the ``py_file_meta`` document whose path matches ``file_name``.

    When ``file_name`` contains more than three path separators, only the
    part starting at the third separator is used for matching; otherwise the
    whole name is used. The match is a ``$regex`` search on ``file_path``.

    :param file_name: Path of the file whose metadata is wanted.
    :return: The first matching document, or ``None`` when nothing matches.
    """
    # os.sep is escaped because a bare backslash (Windows) is not a valid
    # regular expression.
    sep_positions = [m.start() for m in re.finditer(re.escape(os.sep), file_name)]
    if len(sep_positions) > 3:
        path_suffix = file_name[sep_positions[2]:]
    else:
        path_suffix = file_name
    # The path is matched literally: characters such as "." or "+" in a file
    # name must not be interpreted as regex operators.
    fp_regex = re.escape(path_suffix)
    collection = mongo_driver.get_collection(self.dataset, "py_file_meta")
    return collection.find_one({"file_path": {"$regex": fp_regex}})
def save_py_metadata(self, func_json):
    """Store function metadata, replacing any entry with the same name.

    When the ``py_functions_metadata`` collection does not exist yet,
    ``mongo_driver.create_index_for_collection`` is called for ``"name"``.
    A document already stored under ``func_json["name"]`` is removed through
    ``mongo_driver.delete_document`` before the new one is inserted.

    :param func_json: Document to store; must contain a ``"name"`` key.
    :raises KeyError: When ``func_json`` has no ``"name"`` key.
    """
    collection = mongo_driver.get_collection(self.dataset, "py_functions_metadata")
    if not mongo_driver.is_collection_exists(collection):
        mongo_driver.create_index_for_collection(collection, "name")
    name = func_json["name"]
    if mongo_driver.contains_document(collection, "name", name):
        mongo_driver.delete_document(collection, "name", name)
    # insert_one instead of the legacy Collection.insert, which newer PyMongo
    # releases no longer provide.
    collection.insert_one(func_json)
def store_file_stmts(self, file_name, snippets, language):
    """Store the snippets extracted from one file.

    When the file-statement collection does not exist yet,
    ``mongo_driver.create_unique_index_for_collection`` is called for
    ``"file_name"`` before inserting.

    :param file_name: Stored under ``file_name``.
    :param snippets: Stored under ``snippets``.
    :param language: Stored under ``language``.
    """
    collection = mongo_driver.get_collection(self.dataset, FILE_STMT_COLLECTION)
    if not mongo_driver.is_collection_exists(collection):
        mongo_driver.create_unique_index_for_collection(collection, "file_name")
    # insert_one instead of the legacy Collection.insert, which newer PyMongo
    # releases no longer provide.
    collection.insert_one({
        "file_name": file_name,
        "snippets": snippets,
        "language": language,
    })
def store_stmt(self, snippet, language, variables):
    """Store one statement together with its variables.

    When the statement collection does not exist yet,
    ``mongo_driver.create_unique_index_for_collection`` is called for
    ``("snippet", "language")`` before inserting.

    :param snippet: Stored under ``snippet``.
    :param language: Stored under ``language``.
    :param variables: Stored under ``variables``.
    """
    collection = mongo_driver.get_collection(self.dataset, STMT_COLLECTION)
    if not mongo_driver.is_collection_exists(collection):
        mongo_driver.create_unique_index_for_collection(collection, "snippet", "language")
    # insert_one instead of the legacy Collection.insert, which newer PyMongo
    # releases no longer provide.
    collection.insert_one({
        "snippet": snippet,
        "language": language,
        "variables": variables,
    })
def load_differences(self, r_id=None, py_id=None, additional_queries=None, projection=None, limit=0):
    """Query the differences collection and return the resulting cursor.

    :param r_id: Matched against ``r_id`` when truthy.
    :param py_id: Matched against ``py_id`` when truthy.
    :param additional_queries: Extra filter entries merged into the query
        (they overwrite ``r_id`` / ``py_id`` on key clashes).
    :param projection: Passed through to ``find``.
    :param limit: Passed to the cursor's ``limit``; any falsy value becomes 0.
    :return: The cursor produced by ``find(...).limit(...)``.
    """
    collection = mongo_driver.get_collection(self.dataset, DIFFERENCES_COLLECTIONS)
    query = {}
    for field, value in (("r_id", r_id), ("py_id", py_id)):
        if value:
            query[field] = value
    if additional_queries:
        query.update(additional_queries)
    cursor = collection.find(query, projection)
    return cursor.limit(limit or 0)
def load_function_arg_type(self, function_name):
    """Load the ``py_functions_arg_types`` document for ``function_name``.

    :param function_name: Value matched against the ``name`` field.
    :return: The matching document, or ``None`` when nothing matches or the
        lookup raised (the failure is logged at critical level).
    """
    try:
        collection = mongo_driver.get_collection(
            self.dataset, "py_functions_arg_types")
        return collection.find_one({"name": function_name})
    except Exception as e:
        # Format the exception itself: the ``message`` attribute is not
        # available on Python 3 exceptions, and reading it here would raise
        # from inside the handler instead of returning None.
        LOGGER.critical(
            "Failed to load args for function: '%s'. Returning None."
            "\nMessage: %s" % (function_name, e))
        return None
def save_cloned_function_names(self, name, clones):
    """Store the clones of a function, replacing any previous entry.

    When the ``cloned_functions`` collection does not exist yet,
    ``mongo_driver.create_index_for_collection`` is called for
    ``"_function_name_"``. A document already stored for ``name`` is removed
    through ``mongo_driver.delete_document`` before the new one is inserted.

    :param name: Stored under (and matched against) ``_function_name_``.
    :param clones: Dict to store. It is modified in place: the
        ``_function_name_`` key is added before insertion.
    """
    collection = mongo_driver.get_collection(self.dataset, "cloned_functions")
    if not mongo_driver.is_collection_exists(collection):
        mongo_driver.create_index_for_collection(collection, "_function_name_")
    if mongo_driver.contains_document(collection, "_function_name_", name):
        mongo_driver.delete_document(collection, "_function_name_", name)
    clones["_function_name_"] = name
    # insert_one instead of the legacy Collection.insert, which newer PyMongo
    # releases no longer provide.
    collection.insert_one(clones)
def load_args(self, args_key):
    """Load the fuzzed-arguments document stored under ``args_key``.

    Reads from ``test_fuzzed_args`` when ``self.is_test`` is truthy and from
    ``fuzzed_args`` otherwise.

    :param args_key: Value matched against the ``key`` field.
    :return: The matching document, or ``None`` when nothing matches or the
        lookup raised (the failure is logged with its traceback).
    """
    if self.is_test:
        collection_name = "test_fuzzed_args"
    else:
        collection_name = "fuzzed_args"
    collection = mongo_driver.get_collection(self.dataset, collection_name)
    try:
        document = collection.find_one({"key": args_key})
    except Exception:
        LOGGER.exception(
            "Failed to load args with key: '%s'. Returning None" % args_key)
        return None
    return document
def load_py_metadata(self, function_name):
    """Load the ``py_functions_metadata`` document for ``function_name``.

    :param function_name: Value matched against the ``name`` field.
    :return: The matching document, or ``None`` when nothing matches or the
        lookup raised (the failure is logged with its traceback).
    """
    criteria = {"name": function_name}
    try:
        metadata = mongo_driver.get_collection(
            self.dataset, "py_functions_metadata").find_one(criteria)
    except Exception:
        LOGGER.exception(
            "Failed to metadata for function: '%s'. Returning None" % function_name)
        metadata = None
    return metadata
def save_py_function(self, function_json):
    """Store an executed function, falling back to the failed-function store.

    Writes to ``test_py_functions_executed`` when ``self.is_test`` is truthy
    and to ``py_functions_executed`` otherwise. When the collection does not
    exist yet, ``mongo_driver.create_index_for_collection`` is called for
    ``"name"``.

    If the insert raises, the ``outputs`` entry is removed from
    ``function_json`` and the document is handed to
    ``self.save_failed_py_function`` instead.

    :param function_json: Document to store; modified in place on failure.
    """
    collection_name = "test_py_functions_executed" if self.is_test else "py_functions_executed"
    collection = mongo_driver.get_collection(self.dataset, collection_name)
    if not mongo_driver.is_collection_exists(collection):
        mongo_driver.create_index_for_collection(collection, "name")
    try:
        # insert_one instead of the legacy Collection.insert: on PyMongo
        # releases without it, the broad handler below would silently route
        # every function to the failed store.
        collection.insert_one(function_json)
    except Exception:
        # pop() rather than del: a document without 'outputs' must not raise
        # KeyError here and hide the original insert failure.
        function_json.pop('outputs', None)
        self.save_failed_py_function(function_json)
def load_inputs(self, column_names):
    """Load all stored inputs as lists of DataFrames.

    Every document of the inputs collection contributes one list; each entry
    of its ``args`` field is turned into a ``pd.DataFrame`` and reindexed to
    ``column_names`` along the column axis.

    :param column_names: Column labels passed to ``reindex(..., axis=1)``.
    :return: List (one item per document) of lists of DataFrames.
    """
    collection = mongo_driver.get_collection(self.dataset, INPUTS_COLLECTIONS)
    return [
        [pd.DataFrame(arg).reindex(column_names, axis=1) for arg in record["args"]]
        for record in collection.find()
    ]
def load_self_syntactic_differences(self, language=None, id_1=None, id_2=None, additional_queries=None, projection=None, limit=0):
    """Query the self-syntactic-differences collection and return the cursor.

    :param language: Matched against ``language`` when truthy.
    :param id_1: Matched against ``id_1`` when truthy.
    :param id_2: Matched against ``id_2`` when truthy.
    :param additional_queries: Extra filter entries merged into the query
        (they overwrite the fields above on key clashes).
    :param projection: Passed through to ``find``.
    :param limit: Passed to the cursor's ``limit``; any falsy value becomes 0.
    :return: The cursor produced by ``find(...).limit(...)``.
    """
    collection = mongo_driver.get_collection(self.dataset, SELF_SYNTACTIC_DIFFERENCES_COLLECTION)
    query = {}
    for field, value in (("id_1", id_1), ("id_2", id_2), ("language", language)):
        if value:
            query[field] = value
    if additional_queries:
        query.update(additional_queries)
    cursor = collection.find(query, projection)
    return cursor.limit(limit or 0)
def load_inputs(self, args_key):
    """Load the fuzzed arguments stored under ``args_key``.

    The ``args`` entry of the matching ``fuzzed_args`` document must contain
    exactly ``properties.FUZZ_ARGUMENT_SIZE`` items. When
    ``self.is_array(arguments)`` is truthy the arguments are returned as they
    are; otherwise a list with ``len(arguments[0])`` entries is returned, each
    entry being a fresh list of all arguments.

    :param args_key: Value matched against the ``key`` field.
    :return: The arguments, or the expanded list described above.
    :raises AssertionError: When the number of arguments is unexpected.
    """
    document = mongo_driver.get_collection(
        self.dataset, "fuzzed_args").find_one({"key": args_key})
    arguments = document["args"]
    assert len(arguments) == properties.FUZZ_ARGUMENT_SIZE
    if self.is_array(arguments):
        return arguments
    # NOTE(review): every slot receives the complete argument list rather than
    # the i-th component of each argument - this looks like an unfinished
    # transpose (``arg[i]``); confirm the intent with the callers.
    return [list(arguments) for _ in range(len(arguments[0]))]
def create_stmt_file_map(self, stmt, stmt_file_map, do_log=True):
    """Store the mapping between a statement and its files.

    When the statement-file collection does not exist yet,
    ``mongo_driver.create_index_for_collection`` is called for ``"snippet"``.
    The stored document is ``{"snippet": stmt}`` extended with the entries of
    ``stmt_file_map``. A duplicate-key failure is tolerated.

    :param stmt: Stored under ``snippet``.
    :param stmt_file_map: Mapping merged into the stored document.
    :param do_log: Log tolerated duplicate-key failures when true.
    """
    collection = mongo_driver.get_collection(self.dataset, STMT_FILE_COLLECTION)
    if not mongo_driver.is_collection_exists(collection):
        mongo_driver.create_index_for_collection(collection, "snippet")
    try:
        doc = {"snippet": stmt}
        doc.update(stmt_file_map)
        # insert_one instead of the legacy Collection.insert, which newer
        # PyMongo releases no longer provide.
        collection.insert_one(doc)
    except pymongo.errors.DuplicateKeyError as e:
        if do_log:
            # Log the exception itself; ``e.message`` is not available on
            # Python 3 and would raise from inside this handler.
            LOGGER.warning(e)
            LOGGER.info("We continue ... ")
def update_stmt_outputs(self, stmt_id, outputs):
    """Attach ``outputs`` to the statement identified by ``stmt_id``.

    The statement is fetched, its ``outputs`` entry is replaced and the whole
    document is written back with ``$set``. If that update raises, the update
    is retried once with ``outputs`` set to ``None``; a failure of the retry
    propagates to the caller.

    :param stmt_id: ``_id`` of the statement to update.
    :param outputs: Value to store under ``outputs``.
    """
    selector = {'_id': stmt_id}
    collection = mongo_driver.get_collection(self.dataset, STMT_COLLECTION)
    document = collection.find_one(selector)
    document['outputs'] = outputs
    try:
        collection.update_one(selector, {"$set": document}, upsert=False)
    except Exception:
        # Fall back to storing the statement without its outputs.
        document['outputs'] = None
        collection.update_one(selector, {"$set": document}, upsert=False)
def save_clusters(self, clusters, suffix):
    """Store every cluster in the ``clusters_<suffix>`` collection.

    When the collection does not exist yet,
    ``mongo_driver.create_unique_index_for_collection`` is called for
    ``"cluster_id"``. Each cluster becomes one document holding its id and
    its functions converted with ``lib.to_json``.

    :param clusters: Mapping from cluster id to the functions of the cluster.
        The id is logged with ``%d``, so it has to be numeric.
    :param suffix: Appended to ``"clusters_"`` to form the collection name.
    """
    collection_name = "clusters_%s" % suffix
    collection = mongo_driver.get_collection(self.dataset, collection_name)
    if not mongo_driver.is_collection_exists(collection):
        mongo_driver.create_unique_index_for_collection(
            collection, "cluster_id")
    for cluster_id, functions in clusters.items():
        LOGGER.info("Saving cluster: '%d', with %d functions" % (cluster_id, len(functions)))
        cluster = {
            "cluster_id": cluster_id,
            "functions": [lib.to_json(f) for f in functions],
        }
        # insert_one instead of the legacy Collection.insert, which newer
        # PyMongo releases no longer provide.
        collection.insert_one(cluster)
def save_self_syntactic_differences(self, records, do_log=True):
    """Bulk-insert self-syntactic difference records, tolerating duplicates.

    When the collection does not exist yet, ``mongo_driver`` is asked to
    create a unique index for ``("id_1", "id_2", "language")`` and one index
    per distance field.

    NOTE(review): ``insert_many`` is called with its default ordering, so
    records following the first rejected one are not inserted - confirm that
    this is acceptable for the callers.

    :param records: Documents passed to ``insert_many``.
    :param do_log: Log tolerated duplicate-key failures when true.
    :raises pymongo.errors.BulkWriteError: When the bulk insert failed for a
        reason other than duplicate keys.
    """
    collection = mongo_driver.get_collection(self.dataset, SELF_SYNTACTIC_DIFFERENCES_COLLECTION)
    if not mongo_driver.is_collection_exists(collection):
        mongo_driver.create_unique_index_for_collection(collection, "id_1", "id_2", "language")
        for field in ("d_levenshtein", "d_jaro", "d_jaro_winkler", "d_n_gram", "d_ast"):
            mongo_driver.create_index_for_collection(collection, field)
    try:
        collection.insert_many(records)
    except pymongo.errors.DuplicateKeyError as e:
        if do_log:
            # Log the exception itself; ``e.message`` is not available on
            # Python 3 and would raise from inside this handler.
            LOGGER.warning(e)
            LOGGER.info("We continue ... ")
    except pymongo.errors.BulkWriteError as e:
        # insert_many reports duplicate keys as a BulkWriteError whose write
        # errors carry code 11000; anything else is a real failure.
        write_errors = e.details.get("writeErrors", [])
        only_duplicates = bool(write_errors) and all(
            error.get("code") == 11000 for error in write_errors)
        if not only_duplicates or e.details.get("writeConcernErrors"):
            raise
        if do_log:
            LOGGER.warning(e)
            LOGGER.info("We continue ... ")
def load_difference(self, r_id, py_id, limit=0):
    """Load differences and rebuild their ``diff`` entries as objects.

    Every truthy item of a document's ``diff`` list is converted with
    ``differences.DiffMeta.from_dict``; falsy items become ``None``.

    :param r_id: Matched against ``r_id`` when truthy.
    :param py_id: Matched against ``py_id`` when truthy.
    :param limit: Maximum number of documents; any falsy value (including
        ``None``) means no limit, matching ``load_differences``.
    :return: List of the matching documents with converted ``diff`` lists.
    """
    collection = mongo_driver.get_collection(self.dataset, DIFFERENCES_COLLECTIONS)
    query = {}
    if r_id:
        query["r_id"] = r_id
    if py_id:
        query["py_id"] = py_id
    docs = []
    # ``limit or 0``: the cursor rejects None, while 0 means "no limit".
    for doc in collection.find(query).limit(limit or 0):
        doc['diff'] = [
            differences.DiffMeta.from_dict(d) if d else None
            for d in doc["diff"]
        ]
        docs.append(doc)
    return docs
def save_difference(self, r_id, py_id, r_return, py_return, diff, do_log=True):
    """Store the difference between one R and one Python result.

    When the differences collection does not exist yet, ``mongo_driver`` is
    asked to create a unique index for
    ``("r_id", "py_id", "r_return", "py_return")`` and one index per distance
    field. A duplicate-key failure on insert is tolerated.

    :param r_id: Stored under ``r_id``.
    :param py_id: Stored under ``py_id``.
    :param r_return: Stored under ``r_return``.
    :param py_return: Stored under ``py_return``.
    :param diff: Iterable whose truthy items are stored via ``to_dict()`` and
        whose falsy items are stored as ``None``.
    :param do_log: Log tolerated duplicate-key failures when true.
    """
    collection = mongo_driver.get_collection(self.dataset, DIFFERENCES_COLLECTIONS)
    if not mongo_driver.is_collection_exists(collection):
        mongo_driver.create_unique_index_for_collection(collection, "r_id", "py_id", "r_return", "py_return")
        for field in ("d_levenshtein", "d_jaro", "d_jaro_winkler", "d_n_gram", "d_ast"):
            mongo_driver.create_index_for_collection(collection, field)
    try:
        # insert_one instead of the legacy Collection.insert, which newer
        # PyMongo releases no longer provide.
        collection.insert_one({
            "r_id": r_id,
            "py_id": py_id,
            "r_return": r_return,
            "py_return": py_return,
            "diff": [d.to_dict() if d else None for d in diff],
        })
    except pymongo.errors.DuplicateKeyError as e:
        if do_log:
            # Log the exception itself; ``e.message`` is not available on
            # Python 3 and would raise from inside this handler.
            LOGGER.warning(e)
            LOGGER.info("We continue ... ")
def delete_file_stmts(self, language=None):
    """Remove stored file statements.

    :param language: When truthy, only documents whose ``language`` equals
        this value are deleted; otherwise the whole collection is dropped.
    """
    collection = mongo_driver.get_collection(self.dataset, FILE_STMT_COLLECTION)
    if not language:
        collection.drop()
        return
    collection.delete_many({"language": language})
def load_file_stmts(self, language=None):
    """Return a cursor over the stored file statements.

    :param language: When truthy, only documents whose ``language`` equals
        this value are queried; otherwise ``find()`` is called unfiltered.
    :return: The cursor returned by ``find``.
    """
    collection = mongo_driver.get_collection(self.dataset, FILE_STMT_COLLECTION)
    if not language:
        return collection.find()
    return collection.find({"language": language})
def load_stmts_for_file_name(self, file_name):
    """Load the file-statement document stored for ``file_name``.

    :param file_name: Value matched against the ``file_name`` field.
    :return: The matching document, or ``None`` when nothing matches or the
        lookup raised (the failure is logged at critical level).
    """
    criteria = {"file_name": file_name}
    try:
        collection = mongo_driver.get_collection(self.dataset, FILE_STMT_COLLECTION)
        document = collection.find_one(criteria)
    except Exception:
        LOGGER.critical("Failed to load file name : %s" % file_name)
        document = None
    return document
def _test():
    """Print the result of the existence check for one fixed collection.

    Fetches the ``differences`` collection of the ``Misconceptions`` dataset
    and prints what ``mongo_driver.is_collection_exists`` returns for it.
    """
    differences_collection = mongo_driver.get_collection("Misconceptions", "differences")
    exists = mongo_driver.is_collection_exists(differences_collection)
    print(exists)