def add_file():
    # tags = request.forms.get('name')
    upload = request.files.get('file')
    form_date = request.forms.get('file_date')
    try:  # validate
        process_date(form_date)
    except ValueError:
        # response.status = 422  # status can't be added because angular
        # will not show the message.
        return jsonize({'message': 'Invalid date format'})
    logging.debug("add_file(). date=" + str(form_date))
    if form_date is None:
        form_date = datetime.datetime.now()
    name = upload.filename
    data_bin = upload.file.read()
    file_id = hashlib.sha1(data_bin).hexdigest()
    logging.debug("add_file(): file_id=" + str(file_id))
    status = upload_file(data_bin)
    process_file(file_id)  # ToDo: add a redis job
    update_date(file_id, form_date)
    if status == "ok":
        return jsonize({'message': 'Added with ' + str(file_id)})
    elif status == "already exists":
        return jsonize({'message': 'Already exists ' + str(file_id)})
    elif status == "virustotal":
        return jsonize({'message': 'Already exists ' + str(file_id)})
    else:
        return jsonize({'message': 'Error'})
def add_file():
    # tags = request.forms.get('name')
    upload = request.files.get('file')
    name = upload.filename
    data_bin = upload.file.read()
    file_id = hashlib.sha1(data_bin).hexdigest()
    print "file_id=" + str(file_id)
    status = upload_file(data_bin)
    process_file(file_id)  # ToDo: add a redis job
    if status == "ok":
        return jsonize({'message': 'Added with ' + str(file_id)})
    elif status == "already exists":
        return jsonize({'message': 'Already exists ' + str(file_id)})
    elif status == "virustotal":
        return jsonize({'message': 'Already exists ' + str(file_id)})
    else:
        return jsonize({'message': 'Error'})
def save_file_from_vt(hash_id):
    downloaded_file = download_from_virus_total(hash_id)
    if downloaded_file is None:
        return {"status": "unknown", "hash": None}
    if downloaded_file.get('status') == "out_of_credits":
        return {"status": "out_of_credits", "hash": None}
    if downloaded_file.get('status') == "not_found":
        return {"status": "not_found", "hash": None}
    if downloaded_file.get('status') == 'ok':
        data_bin = downloaded_file.get('file')
        file_id = hashlib.sha1(data_bin).hexdigest()
        pc = PackageController()
        res = pc.searchFile(file_id)
        if res is None:  # File not found. Add it to the package.
            pc.append(file_id, data_bin, True)
            return {"status": "added", "hash": file_id}
        else:
            process_file(file_id)
            return {"status": "inconsistency_found", "hash": file_id}
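# A hedged usage sketch (not part of the original source): how a caller
# might branch on the status dict returned by save_file_from_vt() above.
# The statuses ("unknown", "out_of_credits", "not_found", "added",
# "inconsistency_found") come from that function; the helper name
# handle_vt_download is hypothetical.
def handle_vt_download(hash_id):
    result = save_file_from_vt(hash_id)
    status = result.get("status")
    if status == "added":
        # The binary is now in the package; index its metadata.
        return process_file(result.get("hash"))
    if status == "inconsistency_found":
        # Already in the package; save_file_from_vt() re-processed it.
        return result.get("hash")
    # "unknown", "out_of_credits" and "not_found" all mean no file.
    logging.debug("handle_vt_download(): status=" + str(status))
    return None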
def generic_process_hash(hash_str):
    hash_str = clean_hash(hash_str)
    if not valid_hash(hash_str):
        return None
    if len(hash_str) == 32:
        hash_str = get_file_id(hash_str)
    if hash_str is not None:
        return process_file(hash_str)
    else:
        return None
def add_file_from_vt(hash_id):
    downloaded_file = download_from_virus_total(hash_id)
    if downloaded_file is None:
        print "add_file_from_vt(): " + str(hash_id) + " not found in VT."
        return None
    print "add_file_from_vt(): downloaded_file is not None." + str(hash_id)
    data_bin = downloaded_file
    file_id = hashlib.sha1(data_bin).hexdigest()
    # print "file_id=" + str(file_id)
    pc = PackageController()
    res = pc.searchFile(file_id)
    if res is None:  # File not found. Add it to the package.
        pc.append(file_id, data_bin, True)
        print str(hash_id) + " added to DB from VT."
        # print("Added: %s" % (file_id,))
    else:
        print "add_file_from_vt(): " + str(hash_id) + " was found in the DB and asked in VT: BUG. Going to process right now."
        process_file(file_id)
    return file_id
def api_process_file():
    file_hash = clean_hash(request.query.file_hash)
    if len(file_hash) != 40:
        response.status = 400  # was response.code; Bottle uses response.status
        return jsonize({'message': 'Invalid hash format (use sha1)'})
    res = process_file(file_hash)
    if res is None:
        response.status = 404
        return jsonize("File not found in the database")
    return jsonize("File processed")
def main():
    collection_version = db["version_container"]
    collection_meta = db[envget('db_metadata_collection')]
    start = 0
    count = 0
    test = 0
    mis = 0
    print_flag = 1000000
    res = collection_meta.find(
        {}, {"file_id": 1}, no_cursor_timeout=True).skip(start)
    for r in res:
        count += 1
        test += 1
        file_id = r.get('file_id')
        if not check_if_has_version(file_id, collection_version):
            mis += 1
            process_file(file_id)
        if test >= print_flag:
            test = 0
            print "count-miss," + str(count) + "," + str(mis)
    print "count-miss," + str(count) + "," + str(mis)
def generic_process_hash(hash_str):
    if hash_str is None:
        return None
    hash_str = clean_hash(hash_str)
    if not valid_hash(hash_str):
        return None
    if len(hash_str) == 64:
        hash_str = get_file_id(hash_str)
    elif len(hash_str) == 32:
        pc = PackageController()
        hash_str = pc.md5_to_sha1(hash_str)
        logging.debug("generic_process_hash-->sha1: " + str(hash_str))
    if hash_str is not None:
        return process_file(hash_str)
    else:
        return None
def search_by_id(data, limit, columns=[], search_on_vt=False):
    # date - mime - packer are needed for stats
    if len(columns) == 0:
        retrieve = {"file_id": 1, "description": 1, "size": 1,
                    "mime_type": 1, "particular_header.packer_detection": 1,
                    "particular_header.headers.file_header.TimeDateStamp": 1}
    else:
        retrieve = {"file_id": 1, "description": 1,
                    "mime_type": 1, "particular_header.packer_detection": 1,
                    "particular_header.headers.file_header.TimeDateStamp": 1}
        for col in columns:
            dic = tree_menu.ids[int(col)]
            path = str(dic["path"])
            retrieve[path] = 1
    search_list = data.split('&')
    query_list = []
    av_collection_query_list = []
    hash_search = False
    hash_for_search = ""
    for search in search_list:
        if '=' not in search:
            logging.warning("= not in search. search=" + str(search))
            continue
        str_id, str_value = search.split('=')
        id = int(str_id.split('.')[0])
        if id <= 0:
            id = 0
        if str_value == "":
            continue
        p, v = translate_id(id, str_value)
        if id == 10 or id == 11 or id == 21:
            res = fuzz_search_fast(id, p, v)
            return res
        if id == 1 or id == 2 or id == 3:
            hash_search = True
            hash_for_search = v
        if id >= 10000:  # for adding AV searches
            av_collection_query_list.append(
                {p: {"$regex": v, "$options": 'i'}})
            continue
        query_list.append({p: v})
    if len(query_list) > 0 and len(av_collection_query_list) == 0:
        query = {"$and": query_list}
        res = searchFull(query, limit, retrieve)
        key_manager = KeyManager()
        # searching in VT.
        if (hash_search and len(res) == 0 and search_on_vt
                and key_manager.check_private_key()):
            logging.debug("search_by_id() -> save_file_from_vt()")
            add_response = save_file_from_vt(hash_for_search)
            sha1 = add_response.get('hash')
            if sha1 is None:
                return []
            process_file(sha1)
            query = {"file_id": sha1}
            res = searchFull(query, 1, retrieve)
        return res
    # If the user searches only for AV_signature and date:
    # Because VT antivirus analyses live in a separate collection, we used
    # to search the AV signature first, collect the hashes, and then search
    # hash by hash to see if the other restrictions in the query matched the
    # contents of meta_container (basically splitting the query in two). The
    # problem with this began when the av_analysis collection started to
    # grow. Possible solutions: query av_analysis with a count(), then
    # meta_container with a count(), and search first in the collection with
    # the lower number; this would improve performance a little. Another way
    # is to run both queries and intersect the hashes. Currently, VT
    # antivirus analysis sits in a separate collection because mongo limits
    # the number of indexes to 64; if that limit is removed, av_analysis and
    # meta_container should merge. Meanwhile, we can get decent performance
    # for a small query with only date and av_signature. (A sketch of the
    # count()-based idea follows this function.)
    if (len(query_list) == 1 and len(av_collection_query_list) > 0
            and query_list[0].get('date') is not None):
        query_list.extend(av_collection_query_list)
        query = {"$and": query_list}
        retrieve['sha1'] = 1
        retrieve.pop('description', None)
        retrieve.pop('mime_type', None)
        retrieve.pop('file_id', None)
        retrieve.pop(
            'particular_header.headers.file_header.TimeDateStamp', None)
        retrieve.pop('particular_header.packer_detection', None)
        return searchFull(query, limit, retrieve, "av_analysis")
    if len(av_collection_query_list) > 0:
        av_query = {"$and": av_collection_query_list}
    # res = ["2fa9672b7507f0e983897cfd18b24d3810bb2160", "hashfile2"]
    if len(av_collection_query_list) == 0:
        return []
    else:
        # do AV search
        db_collection = envget('db_metadata_collection')
        av_coll = db.av_analysis
        if limit == 0:
            av_res = av_coll.find(av_query, {"sha1": 1})
        else:
            av_res = av_coll.find(av_query, {"sha1": 1}).limit(limit * 2)
        lista_av = []
        for f in av_res:
            lista_av.append(f)
        res = []
        for l in lista_av:
            query_list_for_combinated = []
            sha1 = l.get("sha1")
            query_list_for_combinated.append({"hash.sha1": sha1})
            query_list_for_combinated = query_list_for_combinated + query_list
            query = {"$and": query_list_for_combinated}
            search = searchFull(query, 1, retrieve)
            res = res + search
        if limit > 0:
            return res[0:limit]
        else:
            return res
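# The comment block in search_by_id() above proposes counting both
# collections and querying the smaller one first. A minimal sketch of that
# idea, assuming pymongo-style collections (db.av_analysis and
# db[envget('db_metadata_collection')]); choose_first_collection is a
# hypothetical helper, not part of the original code.
def choose_first_collection(av_query, meta_query):
    av_coll = db.av_analysis
    meta_coll = db[envget('db_metadata_collection')]
    # count_documents() is the pymongo 3.7+ API; older drivers used
    # coll.find(query).count() instead.
    av_count = av_coll.count_documents(av_query)
    meta_count = meta_coll.count_documents(meta_query)
    # Start with the collection that narrows the candidate set the most,
    # then check the surviving hashes against the other collection.
    if av_count <= meta_count:
        return av_coll, av_query
    return meta_coll, meta_query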
def api_batch_process_debug_file():
    yield "<html><body><pre>"
    yield "Running Batch process\n"
    file_hashes = request.forms.get('file_hash')
    if file_hashes is None:
        response.status = 422
        logging.debug("api_batch_process_debug_file(): file_hash is missing")
        yield "file_hash parameter is missing"
        return
    # transform file_hashes into a list of hashes.
    not_found = []
    added_to_queue = 0
    downloaded_from_vt = 0
    for hash_id in file_hashes.split("\n"):
        hash_id = clean_hash(hash_id)
        if hash_id is None:
            continue
        data = "1=" + hash_id
        if len(hash_id) == 40 or len(hash_id) == 32:
            pc = PackageController()
            res = pc.getFile(hash_id)
            if res is not None and len(
                    SearchModule.search_by_id(data, 1, [], False)) == 0:
                logging.debug("Processing right now: " + str(hash_id))
                process_file(hash_id)
                if envget('auto_get_av_result'):
                    add_task_to_download_av_result(hash_id)
                continue
        res = SearchModule.search_by_id(data, 1, [], False)
        if len(res) == 0:
            logging.debug("process_debug(): metadata of " + str(hash_id) +
                          " was not found. We will look in Pc. hash length: " +
                          str(len(hash_id)))
            if len(hash_id) == 40 or len(hash_id) == 32:
                pc = PackageController()
                res = pc.getFile(hash_id)
                if res is not None:
                    logging.debug("process_debug(): hash was found (" +
                                  str(hash_id) + ")")
                else:
                    logging.debug("process_debug(): hash was not found(" +
                                  str(hash_id) + ")")
            logging.debug("process_debug():")
            logging.debug("process_debug(): going to search " + str(hash_id) +
                          " in vt")
            add_response = SearchModule.add_file_from_vt(hash_id)
            sha1 = add_response.get('hash')
            if sha1 is None:
                logging.debug("process_debug(): sha1 is None: " + str(hash_id))
                not_found.append(hash_id)
                continue
            else:
                downloaded_from_vt += 1
        else:
            sha1 = res[0]["sha1"]
        added_to_queue += 1
        add_hash_to_process_queue(sha1)
        if envget('auto_get_av_result'):
            add_task_to_download_av_result(sha1)
        yield str(sha1) + "\n"
    responsex = str(added_to_queue) + " files added to the process queue.\n"
    if downloaded_from_vt > 0:
        responsex += str(downloaded_from_vt) + " new hashes.\n"
    if len(not_found) != 0:
        responsex += str(len(not_found)) + " hashes not found.\n"
        responsex += "Not Found:\n"
        for aux in not_found:
            responsex = responsex + str(aux) + "\n"
    yield responsex
    yield "END"
def search_by_id(data, limit, columns=[], search_on_vt=False):
    # date - mime - packer are needed for stats
    if len(columns) == 0:
        retrieve = {"file_id": 1, "description": 1, "size": 1,
                    "mime_type": 1, "particular_header.packer_detection": 1,
                    "particular_header.headers.file_header.TimeDateStamp": 1}
    else:
        retrieve = {"file_id": 1, "description": 1,
                    "mime_type": 1, "particular_header.packer_detection": 1,
                    "particular_header.headers.file_header.TimeDateStamp": 1}
        for col in columns:
            dic = tree_menu.ids[int(col)]
            path = str(dic["path"])
            retrieve[path] = 1
    search_list = data.split('&')
    # print(len(search_list))
    query_list = []
    av_collection_query_list = []
    hash_search = False
    hash_for_search = ""
    for search in search_list:
        # print(search)
        str_id, str_value = search.split('=')
        id = int(str_id.split('.')[0])
        if id <= 0:
            id = 0
        if str_value == "":
            continue
        p, v = translate_id(id, str_value)
        if id == 10 or id == 11 or id == 21:
            res = fuzz_search_fast(id, p, v)
            return res
        if id == 1 or id == 2 or id == 3:
            hash_search = True
            hash_for_search = v
        if id >= 10000:  # for adding AV searches
            av_collection_query_list.append(
                {p: {"$regex": v, "$options": 'i'}})
            continue
        query_list.append({p: v})
    if len(query_list) > 0 and len(av_collection_query_list) == 0:
        query = {"$and": query_list}
        res = searchFull(query, limit, retrieve)
        if hash_search and len(res) == 0 and search_on_vt:  # searching in VT.
            print "search_by_id() -> add_file_from_vt()"
            sha1 = add_file_from_vt(hash_for_search)
            if sha1 is None:
                return []
            process_file(sha1)
            query = {"file_id": sha1}
            res = searchFull(query, 1, retrieve)
        return res
    if len(av_collection_query_list) > 0:
        av_query = {"$and": av_collection_query_list}
    # res = ["2fa9672b7507f0e983897cfd18b24d3810bb2160", "hashfile2"]
    if len(av_collection_query_list) == 0:
        return []
    else:
        # do AV search
        db_collection = env["db_metadata_collection"]
        av_coll = db.av_analysis
        if limit == 0:
            av_res = av_coll.find(av_query, {"sha1": 1})
        else:
            av_res = av_coll.find(av_query, {"sha1": 1}).limit(limit)
        lista_av = []
        for f in av_res:
            lista_av.append(f)
        # print(lista_av)  # results of AV searches
        res = []
        for l in lista_av:
            query_list_for_combinated = []
            sha1 = l.get("sha1")
            query_list_for_combinated.append({"hash.sha1": sha1})
            query_list_for_combinated = query_list_for_combinated + query_list
            query = {"$and": query_list_for_combinated}
            search = searchFull(query, 1, retrieve)
            res = res + search
        if limit > 0:
            return res[0:limit]
        else:
            return res