Example No. 1
def classify_dump(dump_id):
    print_arff(dump_id)
    subprocess.call(
            "java -Xmx2000m -cp ./weka.jar "
            "weka.classifiers.meta.FilteredClassifier "
            "-l %s -p 1,58,59 -distribution -T test.arff "
            "> test.result" % (model_file,), shell=True)

    conn = util.connect_to_db()
    cursor = conn.cursor()

    score = None
    with open('test.result', 'r') as f:
        for line in f:
            if ':' in line:
                for word in line.split():
                    if '*' in word:
                        score = word.split(',')[0]
                        if score.startswith('*'):
                            score = score[1:]

    print "AMICO Score:", score

    cursor.execute("""
            DELETE FROM amico_scores
            WHERE dump_id = %s""",
            (dump_id, ))
    cursor.execute("INSERT INTO amico_scores VALUES "
                   "(%s, %s)", (dump_id, score))

    subprocess.call("rm test.arff", shell=True)
    subprocess.call("rm test.result", shell=True)
Example No. 2
def db_syslog(dump_id):
    time.sleep(WAIT_TIME)
    conn = util.connect_to_db()
    cursor = conn.cursor()
    make_syslog_entry(cursor, dump_id)
    cursor.close()
    conn.close()
Example No. 3
def classify_dump(dump_id):
    print_arff(dump_id)
    subprocess.call("java -Xmx2000m -cp ./weka.jar "
                    "weka.classifiers.meta.FilteredClassifier "
                    "-l %s -p 1,58,59 -distribution -T test.arff "
                    "> test.result" % (model_file, ),
                    shell=True)

    conn = util.connect_to_db()
    cursor = conn.cursor()

    score = None
    with open('test.result', 'r') as f:
        for line in f:
            if ':' in line:
                for word in line.split():
                    if '*' in word:
                        score = word.split(',')[0]
                        if score.startswith('*'):
                            score = score[1:]

    print "AMICO Score:", score

    cursor.execute(
        """
            DELETE FROM amico_scores
            WHERE dump_id = %s""", (dump_id, ))
    cursor.execute("INSERT INTO amico_scores VALUES "
                   "(%s, %s)", (dump_id, score))

    subprocess.call("rm test.arff", shell=True)
    subprocess.call("rm test.result", shell=True)
Example No. 4
def db_syslog(dump_id):
    time.sleep(WAIT_TIME)
    conn = util.connect_to_db()
    cursor = conn.cursor()
    make_syslog_entry(cursor, dump_id)
    cursor.close()
    conn.close()
Example No. 5
def main():
    key_bytes = [random.randint(0, 255) for _ in range(3)]
    conn = util.connect_to_db()
    cursor = conn.cursor()
    cursor.execute("""
        DROP TABLE IF EXISTS pe_dumps_copy """)
    cursor.execute("""
        CREATE TABLE pe_dumps_copy AS TABLE pe_dumps """)
    cursor.execute("""
        SELECT DISTINCT client
        FROM pe_dumps_copy
        """)

    orig_clients = [row[0] for row in cursor.fetchall()]
    anony_clients = {}
    num_ips = len(orig_clients)
    for ip in orig_clients:
        anony_clients[ip] = anonymize_ip(ip, key_bytes)
    past_progress = 0
    for i, ip in enumerate(anony_clients):
        progress = round((float(i) / num_ips), 2)
        if progress > past_progress:
            drawProgressBar(progress)
        past_progress = progress
        cursor.execute("""
            UPDATE pe_dumps_copy
            SET client = %s
            WHERE client = %s
            """, (anony_clients[ip], ip))
    print "\n Made a copy of pe_dumps table with anonymized client IPs!!"
    cursor.close()
    conn.close()
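anonymize_ip() is not shown in this example. The sketch below is a hypothetical stand-in consistent with the three random key bytes drawn in main(): keep the first octet and XOR the remaining three with the key, so each client IP maps to the same anonymized value throughout one run.

# Hypothetical stand-in for anonymize_ip(); the project's real implementation
# is not part of the example above.
def anonymize_ip(ip, key_bytes):
    octets = [int(part) for part in ip.split('.')]
    masked = octets[:1] + [o ^ k for o, k in zip(octets[1:], key_bytes)]
    return '.'.join(str(o) for o in masked)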
Example No. 6
    def __init__(self, training_start_date=None, training_days=None):
        # Note: the two parameters are assumed; the body below uses
        # training_start_date and training_days, but the captured signature
        # had lost them.
        self.output_file = "train.arff"
        self.conn = util.connect_to_db()
        self.conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_READ_COMMITTED)
        self.clean_label_delta = timedelta(days=30)
        self.training_end_date = date.today()
        if training_start_date:
            self.training_start_date = datetime.strptime(training_start_date,
                    "%Y-%m-%d")
        else:
            cursor = self.conn.cursor()
            cursor.execute("""
                    SELECT MIN(timestamp)
                    FROM pe_dumps""")
            if cursor.rowcount > 0:
                self.training_start_date = cursor.fetchone()[0].date()
            else:
                print "No entries in the database to train!"
                sys.exit()
            cursor.close()
        if training_days:
            self.training_end_date = (self.training_start_date +
                    timedelta(days=training_days))
        print "Training start date:", self.training_start_date.strftime("%B %d, %Y")
        print "Training end date:", self.training_end_date.strftime("%B %d, %Y")
Example No. 7
def update_score(dump_id, score):
    conn = util.connect_to_db()
    cursor = conn.cursor()
    cursor.execute(
        """
            DELETE FROM amico_scores
            WHERE dump_id = %s""", (dump_id, ))
    cursor.execute("INSERT INTO amico_scores VALUES "
                   "(%s, %s)", (dump_id, score))
Example No. 8
def main():
    conn = util.connect_to_db()

    logging.basicConfig(level=logging.DEBUG, filename=LOG_FILE, filemode='w')
    raw_file_names = os.listdir(RAW_FILE_DIR)
    for fn in raw_file_names:
        file_path = os.path.join(RAW_FILE_DIR, fn)
        print "Analyzing file:", file_path
        update_url(file_path, conn)
Example No. 9
def print_arff(dump_id):
    conn = util.connect_to_db()
    cursor = conn.cursor(cursor_factory=psycopg2.extras.NamedTupleCursor)
    cursor.execute(
        """
        SELECT * FROM weka_features
        WHERE dump_id = %s""", (dump_id, ))
    if cursor.rowcount == 0:
        print "Feature vector not found. Exiting..."
        return
    res = cursor.fetchone()
    res = res._asdict()
    del res['raw_dump_num_av_labels']
    del res['raw_dump_trusted_av_labels']

    w = open(output_file, 'w')
    w.write('@RELATION test\n\n')
    values = []
    for feature in features:
        if feature in [
                'sha1', 'dump_id', 'host', 'corrupt', 'vt_month_shelf',
                'url_struct'
        ]:
            data_type = "STRING"
        elif feature == "extension_class":
            data_type = ("{common_ext,unknown_ext,common_fake,other_ext,"
                         "no_url,no_ext}")
        else:
            data_type = "NUMERIC"
        w.write('@ATTRIBUTE %s %s\n' % (feature, data_type))
        values.append(res[feature])
        #print "%s : %s" % (key, res[key])

    w.write('@ATTRIBUTE class {pos, neg}\n\n')
    w.write('@DATA\n\n')
    try:
        data_string = ','.join([
            '?' if (value is None or value == '') else str(value)
            for value in values
        ])
    except Exception as e:
        print "Error in writing feature vector to file!", e
    else:
        data_string += ",?"
        w.write(data_string + '\n')
    w.close()
    cursor.close()
    conn.close()
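print_arff() depends on module-level features and output_file values that the example omits. A plausible, assumed setup is sketched below: output_file matches the "-T test.arff" argument passed to Weka in classify_dump(), and features lists the weka_features columns in table order, minus the two AV-label columns deleted from the fetched row.

# Assumed module-level configuration for print_arff(); treat as a sketch, not
# the project's actual code.
output_file = "test.arff"   # consumed by the "-T test.arff" Weka invocation

features = [
    "dump_id", "vt_month_shelf", "corrupt",
    "host_malware_downloads", "host_total_downloads",
    # ... the remaining weka_features columns, in table order ...
    "extension_class", "sha1", "host", "url_struct",
]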
Example No. 10
def manual_download(captured_sha1):
    util.setup_socks()
    conn = util.connect_to_db()
    cursor = conn.cursor()

    # Database query to get the relevant recent record
    cursor.execute(
        """
        SELECT dump_id,host,url,referer,client,server FROM pe_dumps WHERE sha1 = %s
            ORDER BY timestamp DESC;""", (captured_sha1, ))
    row = cursor.fetchone()
    dump_id = row[0]
    host = row[1]
    url = row[2]
    referer = row[3]
    client = row[4]
    server = row[5]

    full_url = "http://"
    ordered_host = server  # if host is null, we use the server IP
    if host:
        ordered_host = util.reorder_domain(host)
    full_url += ordered_host
    if url:
        full_url += url
    print "Starting manual download from :", full_url

    # Prepare the urllib2 request
    req = urllib2.Request(full_url)
    req.add_header("User-Agent", USER_AGENT)

    download_time = time.time()
    sha1, md5, different, is_interesting_file = download_file(
        dump_id, req, captured_sha1)

    # Database statement
    cursor.execute(
        """
        INSERT INTO manual_download_checksums(dump_id, sha1,
        md5, different, referer_exists, timestamp, is_pe)
        VALUES (%s, %s, %s, %s, %s, TO_TIMESTAMP(%s), %s)""",
        (dump_id, sha1, md5, different, False, download_time,
         is_interesting_file))

    cursor.close()
    conn.close()
def build(window):
    conn = util.connect_to_db()

    cursor = conn.cursor()

    cursor.execute("""SELECT max(dump_id) from pe_dumps""")
    lastID = cursor.fetchone()[0]

    minID = lastID - window
    currID = lastID - window

    while currID < lastID:
        cursor.execute("""SELECT file_type FROM
                                        pe_dumps WHERE
                                        dump_id=%d""" % (currID, ))
        file_extension = cursor.fetchone()[0]
        get_feature_vector(currID, file_extension, minID)
        currID += 1
def print_arff(dump_id):
    conn = util.connect_to_db()
    cursor = conn.cursor(cursor_factory=psycopg2.extras.NamedTupleCursor)
    cursor.execute("""
        SELECT * FROM weka_features
        WHERE dump_id = %s""",
        (dump_id, ))
    if cursor.rowcount == 0:
        print "Feature vector not found. Exiting..."
        return
    res = cursor.fetchone()
    res = res._asdict()
    del res['raw_dump_num_av_labels']
    del res['raw_dump_trusted_av_labels']

    w = open(output_file, 'w')
    w.write('@RELATION test\n\n')
    values = []
    for feature in features:
        if feature in ['sha1', 'dump_id', 'host', 'corrupt', 'vt_month_shelf',
                'url_struct']:
            data_type = "STRING"
        elif feature == "extension_class":
            data_type = ("{common_ext,unknown_ext,common_fake,other_ext,"
                   "no_url,no_ext}")
        else:
            data_type = "NUMERIC"
        w.write('@ATTRIBUTE %s %s\n' % (feature, data_type))
        values.append(res[feature])
        #print "%s : %s" % (key, res[key])

    w.write('@ATTRIBUTE class {pos, neg}\n\n')
    w.write('@DATA\n\n')
    try:
        data_string = ','.join(['?' if (value is None or value == '') else
            str(value) for value in values])
    except Exception as e:
        print "Error in writing feature vector to file!", e
    else:
        data_string += ",?"
        w.write(data_string + '\n')
    w.close()
    cursor.close()
    conn.close()
def build(window):
    conn = util.connect_to_db()

    cursor = conn.cursor()

    cursor.execute("""SELECT max(dump_id) from pe_dumps""")
    lastID = cursor.fetchone()[0]

    minID = lastID - window
    currID = lastID - window


    while currID < lastID:
        cursor.execute("""SELECT file_type FROM
                                        pe_dumps WHERE
                                        dump_id=%d""" % (currID,))
        # file_extension = cursor.fetchone()[0]
        # get_feature_vector(currID,file_extension, minID)
        get_feature_vector(currID)
        currID += 1
Example No. 14
    def __init__(self):
        self.QUERY_RATE_LIMIT = 10
        self.ONE_MIN = 60

        logging.config.fileConfig(LOG_CONF_FILE)
        self.logger = logging.getLogger("amico_logger")
        #stdout_handler = logging.StreamHandler(sys.stdout)
        #stdout_handler.setLevel(logging.DEBUG)
        #formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s'
                               #'- %(message)s')
        #stdout_handler.setFormatter(formatter)
        #self.logger.addHandler(stdout_handler)

        util.setup_socks()
        self.conn = util.connect_to_db()
        self.cursor = self.conn.cursor()

        self.today = date.today().strftime("%Y-%m-%d")
        self.yesterday = (date.today() -
                timedelta(days=1)).strftime("%Y-%m-%d")
        self.last_month = (date.today() -
                timedelta(days=30)).strftime("%Y-%m-%d")
Example No. 15
    def __init__(self):
        self.QUERY_RATE_LIMIT = 10
        self.ONE_MIN = 60

        logging.config.fileConfig(LOG_CONF_FILE)
        self.logger = logging.getLogger("amico_logger")
        #stdout_handler = logging.StreamHandler(sys.stdout)
        #stdout_handler.setLevel(logging.DEBUG)
        #formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s'
        #'- %(message)s')
        #stdout_handler.setFormatter(formatter)
        #self.logger.addHandler(stdout_handler)

        util.setup_socks()
        self.conn = util.connect_to_db()
        self.cursor = self.conn.cursor()

        self.today = date.today().strftime("%Y-%m-%d")
        self.yesterday = (date.today() -
                          timedelta(days=1)).strftime("%Y-%m-%d")
        self.last_month = (date.today() -
                           timedelta(days=30)).strftime("%Y-%m-%d")
Example No. 16
def manual_download(captured_sha1):
    util.setup_socks()
    conn = util.connect_to_db()
    cursor = conn.cursor()

    # Database query to get the relevant recent record
    cursor.execute("""
        SELECT dump_id,host,url,referer,client,server FROM pe_dumps WHERE sha1 = %s
            ORDER BY timestamp DESC;""", (captured_sha1,))
    row = cursor.fetchone()
    dump_id = row[0]
    host = row[1]
    url = row[2]
    referer = row[3]
    client = row[4]
    server = row[5]

    if host is None:
        host = server
    ordered_host = util.reorder_domain(host)
    full_url = "http://" + ordered_host + url
    #print full_url

    # Prepare the urllib2 request
    req = urllib2.Request(full_url)
    req.add_header("User-Agent", USER_AGENT)

    download_time = time.time()
    sha1, md5, different, is_pe = download_file(dump_id, req, captured_sha1)

    # Database statement
    cursor.execute("""
        INSERT INTO manual_download_checksums(dump_id, sha1,
        md5, different, referer_exists, timestamp, is_pe)
        VALUES (%s, %s, %s, %s, %s, TO_TIMESTAMP(%s), %s)""",
        (dump_id, sha1, md5, different, False, download_time, is_pe))

    cursor.close()
    conn.close()
Example No. 17
def get_feature_vector(dump_id):
    #print "entered get_feature_vector"
    conn = util.connect_to_db()
    cursor = conn.cursor()
    insert_features(cursor, dump_id)
    print "Done inserting features for dump_id: ", dump_id
(%(document_id)s, %(paragraph_id)s, %(theorem_type)s, %(text)s)
"""

authorshipInsertStmt = """
INSERT INTO authorship(document, rank, display_name, zbmath_id)
VALUES
(%(document_id)s, %(rank)s, %(display_name)s, %(zbmath_id)s)
"""

mscAssignmentInsertStmt = """
INSERT INTO msc_assignment(document, msc, pos)
VALUES
(%(document_id)s, %(msc)s, %(pos)s)
"""

db = connect_to_db()
cursor = db.cursor()
warning_log = open("warning_log", "a")

p = DocumentParser()
# filepath = "raw_data/test_documents/07040005.xml"
# for filename in filesInDict("raw_data/test_documents", True):
for filename, filepath in zip(filenames, filepaths):
    sys.stdout.write("processing " + filename + "... ")

    # doc, tokenizedParagraphs, formulaDict = p.parseWithParagraphStructure(filename)
    doc, raw_paragraphs, formula_dict = p.parse_raw(filepath)

    # info for doc table:
    document_id = doc.arxiv_id()
    publication_date = doc.publication_date
Example No. 19
def generate_CSV_download_file():
    connection = util.connect_to_db()
    connection_cursor = connection.cursor()
    csv_writer = None

    header = "Second_ID,Mal_APK,Tot_APK,Mal_DMG,Tot_DMG,Mal_ELF,Tot_ELF,Mal_EXE,Tot_EXE,Mal_PDF,Tot_PDF,Mal_SWF," + \
             "Tot_SWF,Mal_JAR,Tot_JAR,Mal_RAR,Tot_RAR,Mal_ZIP,Tot_ZIP,Timestamp,Next_Download_Event_[s]"
    header_list = [
        "Second_ID", "Mal_APK", "Tot_APK", "Mal_DMG", "Tot_DMG", "Mal_ELF",
        "Tot_ELF", "Mal_EXE", "Tot_EXE", "Mal_PDF", "Tot_PDF", "Mal_SWF",
        "Tot_SWF", "Mal_JAR", "Tot_JAR", "Mal_RAR", "Tot_RAR", "Mal_ZIP",
        "Tot_ZIP", "Timestamp", "Next_Download_Event_[s]"
    ]
    created_csv_file = OUT_DIR + "/" + str(DOWNLOAD_GRAPH_ID) + "-downloads_" + \
                       datetime.fromtimestamp(time()).strftime('%Y-%m-%d_%H-%M-%S') + ".csv"

    with open(created_csv_file, "wb") as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(header_list)

    csv_map = defaultdict(list)
    malware_timestamp_set = set()
    ##################################################### EXECUTABLES #####################################################

    query = """SELECT timestamp, COUNT(pe.file_type) FROM pe_dumps AS pe, amico_scores AS ams WHERE """ + \
            """pe.dump_id = ams.dump_id AND pe.file_type = 'APK' AND ams.score > """ + str(AMICO_THRESHOLD) + \
            """GROUP BY  timestamp ORDER BY timestamp ASC"""
    connection_cursor.execute(query)

    for row in connection_cursor:
        if row is not None:
            timestamp = str(row[0])
            malware_apk_count_per_second = row[1]

            csv_map[timestamp].append(malware_apk_count_per_second)
            malware_timestamp_set.add(timestamp)

    query = """SELECT timestamp, COUNT(file_type) FROM pe_dumps WHERE file_type = 'APK' GROUP BY  timestamp """ + \
            """ORDER BY timestamp ASC"""
    connection_cursor.execute(query)

    for row in connection_cursor:
        if row is not None:
            timestamp = str(row[0])
            total_apk_count_per_second = row[1]

            if timestamp in malware_timestamp_set:
                csv_map[timestamp].append(total_apk_count_per_second)
            else:
                csv_map[timestamp].extend([0, total_apk_count_per_second])

    malware_timestamp_set = set()
    query = """SELECT timestamp, COUNT(pe.file_type) FROM pe_dumps AS pe, amico_scores AS ams WHERE """ + \
            """pe.dump_id = ams.dump_id AND pe.file_type = 'DMG' AND ams.score > """ + str(AMICO_THRESHOLD) + \
            """GROUP BY  timestamp ORDER BY timestamp ASC"""
    connection_cursor.execute(query)

    for row in connection_cursor:
        if row is not None:
            timestamp = str(row[0])
            malware_dmg_count_per_second = row[1]

            if timestamp not in csv_map:
                csv_map[timestamp].extend([0, 0])

            csv_map[timestamp].append(malware_dmg_count_per_second)
            malware_timestamp_set.add(timestamp)

    query = """SELECT timestamp, COUNT(file_type) FROM pe_dumps WHERE file_type = 'DMG' GROUP BY  timestamp """ + \
            """ORDER BY timestamp ASC"""
    connection_cursor.execute(query)

    for row in connection_cursor:
        if row is not None:
            timestamp = str(row[0])
            total_dmg_count_per_second = row[1]

            if timestamp not in csv_map:
                csv_map[timestamp].extend([0, 0])

            if timestamp in malware_timestamp_set:
                csv_map[timestamp].append(total_dmg_count_per_second)
            else:
                csv_map[timestamp].extend([0, total_dmg_count_per_second])

    malware_timestamp_set = set()
    query = """SELECT timestamp, COUNT(pe.file_type) FROM pe_dumps AS pe, amico_scores AS ams WHERE """ + \
            """pe.dump_id = ams.dump_id AND pe.file_type = 'ELF' AND ams.score > """ + str(AMICO_THRESHOLD) + \
            """GROUP BY  timestamp ORDER BY timestamp ASC"""
    connection_cursor.execute(query)

    for row in connection_cursor:
        if row is not None:
            timestamp = str(row[0])
            malware_elf_count_per_second = row[1]

            if timestamp not in csv_map:
                csv_map[timestamp].extend([0, 0, 0, 0])

            csv_map[timestamp].append(malware_elf_count_per_second)
            malware_timestamp_set.add(timestamp)

    query = """SELECT timestamp, COUNT(file_type) FROM pe_dumps WHERE file_type = 'ELF' GROUP BY  timestamp """ + \
            """ORDER BY timestamp ASC"""
    connection_cursor.execute(query)

    for row in connection_cursor:
        if row is not None:
            timestamp = str(row[0])
            total_elf_count_per_second = row[1]

            if timestamp not in csv_map:
                csv_map[timestamp].extend([0, 0, 0, 0])

            if timestamp in malware_timestamp_set:
                csv_map[timestamp].append(total_elf_count_per_second)
            else:
                csv_map[timestamp].extend([0, total_elf_count_per_second])

    malware_timestamp_set = set()
    query = """SELECT timestamp, COUNT(pe.file_type) FROM pe_dumps AS pe, amico_scores AS ams WHERE """ + \
            """pe.dump_id = ams.dump_id AND pe.file_type = 'EXE' AND ams.score > """ + str(AMICO_THRESHOLD) + \
            """GROUP BY  timestamp ORDER BY timestamp ASC"""
    connection_cursor.execute(query)

    for row in connection_cursor:
        if row is not None:
            timestamp = str(row[0])
            malware_exe_count_per_second = row[1]

            if timestamp not in csv_map:
                csv_map[timestamp].extend([0, 0, 0, 0, 0, 0])

            csv_map[timestamp].append(malware_exe_count_per_second)
            malware_timestamp_set.add(timestamp)

    query = """SELECT timestamp, COUNT(file_type) FROM pe_dumps WHERE file_type = 'EXE' GROUP BY  timestamp """ + \
            """ORDER BY timestamp ASC"""
    connection_cursor.execute(query)

    for row in connection_cursor:
        if row is not None:
            timestamp = str(row[0])
            total_exe_count_per_second = row[1]

            if timestamp not in csv_map:
                csv_map[timestamp].extend([0, 0, 0, 0, 0, 0])

            if timestamp in malware_timestamp_set:
                csv_map[timestamp].append(total_exe_count_per_second)
            else:
                csv_map[timestamp].extend([0, total_exe_count_per_second])

    ########################################################################################################################

    ######################################################### PDF #########################################################

    malware_timestamp_set = set()
    query = """SELECT timestamp, COUNT(pe.file_type) FROM pe_dumps AS pe, amico_scores AS ams WHERE """ + \
            """pe.dump_id = ams.dump_id AND pe.file_type = 'PDF' AND ams.score > """ + str(AMICO_THRESHOLD) + \
            """GROUP BY  timestamp ORDER BY timestamp ASC"""
    connection_cursor.execute(query)

    for row in connection_cursor:
        if row is not None:
            timestamp = str(row[0])
            malware_pdf_count_per_second = row[1]

            if timestamp not in csv_map:
                csv_map[timestamp].extend([0, 0, 0, 0, 0, 0, 0, 0])

            csv_map[timestamp].append(malware_pdf_count_per_second)
            malware_timestamp_set.add(timestamp)

    query = """SELECT timestamp, COUNT(file_type) FROM pe_dumps WHERE file_type = 'PDF' GROUP BY  timestamp """ + \
            """ORDER BY timestamp ASC"""
    connection_cursor.execute(query)

    for row in connection_cursor:
        if row is not None:
            timestamp = str(row[0])
            total_pdf_count_per_second = row[1]

            if timestamp not in csv_map:
                csv_map[timestamp].extend([0, 0, 0, 0, 0, 0, 0, 0])

            if timestamp in malware_timestamp_set:
                csv_map[timestamp].append(total_pdf_count_per_second)
            else:
                csv_map[timestamp].extend([0, total_pdf_count_per_second])

    ########################################################################################################################

    ######################################################## FLASH ########################################################

    malware_timestamp_set = set()
    query = """SELECT timestamp, COUNT(pe.file_type) FROM pe_dumps AS pe, amico_scores AS ams WHERE """ + \
            """pe.dump_id = ams.dump_id AND pe.file_type = 'SWF' AND ams.score > """ + str(AMICO_THRESHOLD) + \
            """GROUP BY  timestamp ORDER BY timestamp ASC"""
    connection_cursor.execute(query)

    for row in connection_cursor:
        if row is not None:
            timestamp = str(row[0])
            malware_swf_count_per_second = row[1]

            if timestamp not in csv_map:
                csv_map[timestamp].extend([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

            csv_map[timestamp].append(malware_swf_count_per_second)
            malware_timestamp_set.add(timestamp)

    query = """SELECT timestamp, COUNT(file_type) FROM pe_dumps WHERE file_type = 'SWF' GROUP BY  timestamp """ + \
            """ORDER BY timestamp ASC"""
    connection_cursor.execute(query)

    for row in connection_cursor:
        if row is not None:
            timestamp = str(row[0])
            total_swf_count_per_second = row[1]

            if timestamp not in csv_map:
                csv_map[timestamp].extend([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

            if timestamp in malware_timestamp_set:
                csv_map[timestamp].append(total_swf_count_per_second)
            else:
                csv_map[timestamp].extend([0, total_swf_count_per_second])

    ########################################################################################################################

    ###################################################### COMPRESSED ######################################################

    malware_timestamp_set = set()
    query = """SELECT timestamp, COUNT(pe.file_type) FROM pe_dumps AS pe, amico_scores AS ams WHERE """ + \
            """pe.dump_id = ams.dump_id AND pe.file_type = 'JAR' AND ams.score > """ + str(AMICO_THRESHOLD) + \
            """GROUP BY  timestamp ORDER BY timestamp ASC"""
    connection_cursor.execute(query)

    for row in connection_cursor:
        if row is not None:
            timestamp = str(row[0])
            malware_jar_count_per_second = row[1]

            if timestamp not in csv_map:
                csv_map[timestamp].extend([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

            csv_map[timestamp].append(malware_jar_count_per_second)
            malware_timestamp_set.add(timestamp)

    query = """SELECT timestamp, COUNT(file_type) FROM pe_dumps WHERE file_type = 'JAR' GROUP BY  timestamp """ + \
            """ORDER BY timestamp ASC"""
    connection_cursor.execute(query)

    for row in connection_cursor:
        if row is not None:
            timestamp = str(row[0])
            total_jar_count_per_second = row[1]

            if timestamp not in csv_map:
                csv_map[timestamp].extend([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

            if timestamp in malware_timestamp_set:
                csv_map[timestamp].append(total_jar_count_per_second)
            else:
                csv_map[timestamp].extend([0, total_jar_count_per_second])

    malware_timestamp_set = set()
    query = """SELECT timestamp, COUNT(pe.file_type) FROM pe_dumps AS pe, amico_scores AS ams WHERE """ + \
            """pe.dump_id = ams.dump_id AND pe.file_type = 'RAR' AND ams.score > """ + str(AMICO_THRESHOLD) + \
            """GROUP BY  timestamp ORDER BY timestamp ASC"""
    connection_cursor.execute(query)

    for row in connection_cursor:
        if row is not None:
            timestamp = str(row[0])
            malware_rar_count_per_second = row[1]

            if timestamp not in csv_map:
                csv_map[timestamp].extend(
                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

            csv_map[timestamp].append(malware_rar_count_per_second)
            malware_timestamp_set.add(timestamp)

    query = """SELECT timestamp, COUNT(file_type) FROM pe_dumps WHERE file_type = 'RAR' GROUP BY  timestamp """ + \
            """ORDER BY timestamp ASC"""
    connection_cursor.execute(query)

    for row in connection_cursor:
        if row is not None:
            timestamp = str(row[0])
            total_rar_count_per_second = row[1]

            if timestamp not in csv_map:
                csv_map[timestamp].extend(
                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

            if timestamp in malware_timestamp_set:
                csv_map[timestamp].append(total_rar_count_per_second)
            else:
                csv_map[timestamp].extend([0, total_rar_count_per_second])

    malware_timestamp_set = set()
    query = """SELECT timestamp, COUNT(pe.file_type) FROM pe_dumps AS pe, amico_scores AS ams WHERE """ + \
            """pe.dump_id = ams.dump_id AND pe.file_type = 'ZIP' AND ams.score > """ + str(AMICO_THRESHOLD) + \
            """GROUP BY  timestamp ORDER BY timestamp ASC"""
    connection_cursor.execute(query)

    for row in connection_cursor:
        if row is not None:
            timestamp = str(row[0])
            malware_zip_count_per_second = row[1]

            if timestamp not in csv_map:
                csv_map[timestamp].extend(
                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

            csv_map[timestamp].append(malware_zip_count_per_second)
            malware_timestamp_set.add(timestamp)

    query = """SELECT timestamp, COUNT(file_type) FROM pe_dumps WHERE file_type = 'ZIP' GROUP BY  timestamp """ + \
            """ORDER BY timestamp ASC"""
    connection_cursor.execute(query)

    for row in connection_cursor:
        if row is not None:
            timestamp = str(row[0])
            total_zip_count_per_second = row[1]

            if timestamp not in csv_map:
                csv_map[timestamp].extend(
                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

            if timestamp in malware_timestamp_set:
                csv_map[timestamp].append(total_zip_count_per_second)
            else:
                csv_map[timestamp].extend([0, total_zip_count_per_second])

    ########################################################################################################################
    sorted_csv_map = sorted(csv_map.items(), key=operator.itemgetter(0))

    csv_map_aux = defaultdict(list)
    first_useful_date = "2014-11-26 22:55:40"
    last_useful_date = "2015-10-01 00:00:00"
    # Loop for handling corrupted timestamp
    for timestamp, file_list in sorted_csv_map:
        if cmp(timestamp, first_useful_date) < 0:
            timestamp_split = timestamp.split()
            first_useful_date_split = first_useful_date.split()
            timestamp_hms = timestamp_split[1]
            first_useful_date_ymd = first_useful_date_split[0]

            corrected_timestamp = first_useful_date_ymd + " " + timestamp_hms
            csv_map_aux[corrected_timestamp] = csv_map.get(timestamp)
        else:
            break

    max_values = len(header.split(',')) - 2
    csv_rows = list()
    sorted_csv_map_aux = sorted(csv_map_aux.items(),
                                key=operator.itemgetter(0))
    UID = 0

    for timestamp, file_list in sorted_csv_map_aux:
        formatted_row = format_row(file_list, max_values)
        formatted_row.insert(0, UID)
        formatted_row.append(timestamp)
        csv_rows.append(formatted_row)
        UID += 1

    writable_csv_rows = list()
    while csv_rows:
        current_row = csv_rows.pop(0)
        if not csv_rows:
            writable_csv_rows.append(current_row)
            continue
        next_row = csv_rows[0]
        timestamp_index = len(current_row) - 1

        current_timestamp_string = current_row[timestamp_index]
        next_timestamp_string = next_row[timestamp_index]

        current_timestamp = datetime.strptime(current_timestamp_string,
                                              '%Y-%m-%d %H:%M:%S')
        next_timestamp = datetime.strptime(next_timestamp_string,
                                           '%Y-%m-%d %H:%M:%S')
        time_delta_in_secs = int(
            (next_timestamp - current_timestamp).total_seconds()) - 1
        current_row.append(time_delta_in_secs)

        writable_csv_rows.append(current_row)

    writable_sorted_csv_map = list()
    for timestamp, file_list in sorted_csv_map:
        if cmp(timestamp, first_useful_date) < 0 or cmp(
                timestamp, last_useful_date) > 0:
            continue
        else:
            writable_sorted_csv_map.append([timestamp, file_list])

    writable_csv_rows_aux = list()
    while writable_sorted_csv_map:
        timestamp_file_list_first_pair = writable_sorted_csv_map.pop(0)
        timestamp_str_first_pair = timestamp_file_list_first_pair[0]
        file_list_first_pair = timestamp_file_list_first_pair[1]

        if not writable_sorted_csv_map:
            formatted_row = format_row(file_list_first_pair, max_values)
            formatted_row.insert(0, UID)
            formatted_row.append(timestamp_str_first_pair)
            writable_csv_rows_aux.append(formatted_row)
            UID += 1
            continue
        timestamp_file_list_second_pair = writable_sorted_csv_map[0]
        timestamp_str_second_pair = timestamp_file_list_second_pair[0]

        formatted_row = format_row(file_list_first_pair, max_values)
        formatted_row.insert(0, UID)
        formatted_row.append(timestamp_str_first_pair)

        timestamp_first_pair = datetime.strptime(timestamp_str_first_pair,
                                                 '%Y-%m-%d %H:%M:%S')
        timestamp_second_pair = datetime.strptime(timestamp_str_second_pair,
                                                  '%Y-%m-%d %H:%M:%S')
        time_delta_in_secs = int(
            (timestamp_second_pair - timestamp_first_pair).total_seconds()) - 1
        formatted_row.append(time_delta_in_secs)

        writable_csv_rows_aux.append(formatted_row)
        UID += 1

    last_formatted_row_in_writable_csv_rows = writable_csv_rows.pop(
        len(writable_csv_rows) - 1)
    first_formatted_row_in_writable_csv_rows_aux = writable_csv_rows_aux[0]
    timestamp_index = len(last_formatted_row_in_writable_csv_rows) - 1

    current_timestamp_string = last_formatted_row_in_writable_csv_rows[
        timestamp_index]
    next_timestamp_string = first_formatted_row_in_writable_csv_rows_aux[
        timestamp_index]

    current_timestamp = datetime.strptime(current_timestamp_string,
                                          '%Y-%m-%d %H:%M:%S')
    next_timestamp = datetime.strptime(next_timestamp_string,
                                       '%Y-%m-%d %H:%M:%S')
    time_delta_in_secs = int(
        (next_timestamp - current_timestamp).total_seconds()) - 1
    last_formatted_row_in_writable_csv_rows.append(time_delta_in_secs)

    writable_csv_rows_aux.insert(0, last_formatted_row_in_writable_csv_rows)

    with open(created_csv_file, "a") as csv_file:
        csv_writer = csv.writer(csv_file, quoting=csv.QUOTE_NONNUMERIC)
        for row in writable_csv_rows:
            csv_writer.writerow(row)
        for row in writable_csv_rows_aux:
            csv_writer.writerow(row)
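format_row() is referenced throughout generate_CSV_download_file() but not defined in the example. A minimal sketch, assuming it only pads a per-second count list with zeros so every row carries one value per Mal_*/Tot_* column:

# Hypothetical helper assumed by generate_CSV_download_file() above.
def format_row(file_list, max_values):
    row = list(file_list) if file_list else []
    row.extend([0] * (max_values - len(row)))   # zero-fill missing file-type columns
    return row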
Example No. 20
def fe_db_setup():
    conn = util.connect_to_db()
    cursor = conn.cursor()

    cursor.execute(""" DROP table if exists features""")
    cursor.execute(""" DROP table if exists weka_features""")
    cursor.execute("""
        CREATE TABLE weka_features(
            dump_id INT,
            raw_dump_num_av_labels INT,
            raw_dump_trusted_av_labels INT,
            vt_month_shelf BOOLEAN,
            corrupt BOOLEAN,
            host_malware_downloads INT,
            host_suspicious_downloads INT,
            host_benign_downloads INT,
            host_total_downloads INT,
            host_malware_ratio REAL,
            host_suspicious_ratio REAL,
            host_benign_ratio REAL,
            host_avg_av_labels REAL,
            host_avg_trusted_labels REAL,
            host_unknown_hashes INT,
            host_total_hashes INT,
            host_unknown_hash_ratio REAL,
            twold_malware_downloads INT,
            twold_suspicious_downloads INT,
            twold_benign_downloads INT,
            twold_total_downloads INT,
            twold_malware_ratio REAL,
            twold_suspicious_ratio REAL,
            twold_benign_ratio REAL,
            twold_avg_av_labels REAL,
            twold_avg_trusted_labels REAL,
            twold_unknown_hashes INT,
            twold_total_hashes INT,
            twold_unknown_hash_ratio REAL,
            server_ip_malware_downloads INT,
            server_ip_suspicious_downloads INT,
            server_ip_benign_downloads INT,
            server_ip_total_downloads INT,
            server_ip_malware_ratio REAL,
            server_ip_suspicious_ratio REAL,
            server_ip_benign_ratio REAL,
            server_ip_avg_av_labels REAL,
            server_ip_avg_trusted_labels REAL,
            server_ip_unknown_hashes INT,
            server_ip_total_hashes INT,
            server_ip_unknown_hash_ratio REAL,
            bgp_malware_downloads INT,
            bgp_suspicious_downloads INT,
            bgp_benign_downloads INT,
            bgp_total_downloads INT,
            bgp_malware_ratio REAL,
            bgp_suspicious_ratio REAL,
            bgp_benign_ratio REAL,
            bgp_avg_av_labels REAL,
            bgp_avg_trusted_labels REAL,
            bgp_unknown_hashes INT,
            bgp_total_hashes INT,
            bgp_unknown_hash_ratio REAL,
            hash_life_time INT,
            num_dumps_with_same_hash INT,
            hash_daily_dump_rate_per_client REAL,
            estimated_clients_with_same_hash INT,
            referer_exists INT,
            host_name_exists INT,
            extension_class VARCHAR(20),
            url_length INT,
            directory_depth INT,
            sha1 VARCHAR(40),
            host VARCHAR(256),
            url_malware_downloads INT,
            url_total_downloads INT,
            url_distinct_sha1s INT,
            url_struct VARCHAR(512),
            url_struct_malware_downloads INT,
            url_struct_total_downloads INT,
            url_struct_distinct_sha1s INT)
            """)

    print "Created weka_features table!"

    conn.commit()
    cursor.close()
    conn.close()
__author__ = 'vincenzo'

import util
import db_extraction
import pickle
import dill
import re
import urlparse
import numpy as np


for i in range(11):
    sketch = pickle.load(open("./%d/url_struct_total_downloads-%d.p" % (i,i),'rb'))
    db = db_extraction.DBextraction()

    conn = util.connect_to_db()
    cursor = conn.cursor()
    inner_cursor = conn.cursor()

    start_id = db.maxID - 10000

    cursor.execute("""SELECT distinct url
            FROM pe_dumps AS pe
            WHERE
                pe.dump_id > %d and url is not null """ %
                       (start_id,))

    for row in cursor:
        if row is not None:
            url = row[0]
Example No. 22
def db_virus_total(dump_id):
    logging.config.fileConfig(LOG_CONF_FILE)
    logger = logging.getLogger("amico_logger")
    util.setup_socks()
    conn = util.connect_to_db()
    cursor = conn.cursor()

    # Exit if this sha1 has been queried in the past VT_QUERY_INTERVAL period
    prev_query_time = datetime(MINYEAR, 1, 1, 0, 0, 0, 0)
    time_now = datetime.now()
    cursor.execute(
        """
        SELECT sha1, md5
        FROM pe_dumps
        WHERE dump_id = %s""", (dump_id, ))
    (sha1, md5) = cursor.fetchone()

    try:
        cursor.execute(
            "SELECT query_time, vt_id FROM virus_total_scans "
            "WHERE sha1 = %s "
            "ORDER by query_time DESC", (sha1, ))
        res = cursor.fetchone()
        if res:
            prev_query_time = res[0]
            vt_id = res[1]
    except:
        print "sha1:%s no previous VT query" % (sha1, )
        pass

    vt_query_period = timedelta(days=VT_QUERY_INTERVAL)
    if (time_now - prev_query_time) < vt_query_period:
        print "sha1:%s has been queried recently. Skipping..." % (sha1, )
        cursor.execute(
            """
                INSERT INTO ped_vts_mapping (dump_id, vt_id)
                VALUES (%s, %s)""", (dump_id, vt_id))
        conn.close()
        return

    tries = 0
    success = False
    while tries < MAX_TRIES:
        try:
            tries += 1
            json = vt_api.get_vt_report(md5)
            if not json:
                continue
            report = simplejson.loads(json)
            if report["response_code"] == 1:
                insert_report(cursor, report, sha1, md5, json, dump_id)
                success = True
                break
            elif report["response_code"] == 0:
                cursor.execute(
                    """
                    INSERT INTO virus_total_scans(sha1, md5, query_time)
                    VALUES (%s, %s, CLOCK_TIMESTAMP())
                    RETURNING vt_id
                    """, (sha1, md5))
                vt_id = cursor.fetchone()[0]
                cursor.execute(
                    """
                        INSERT INTO ped_vts_mapping (dump_id, vt_id)
                        VALUES (%s, %s)""", (dump_id, vt_id))
                print "Virus Total: No scan report exists in the VT database"
                success = True
                break
            else:
                logger.exception("Unknown response code! %s" %
                                 (report["response_code"], ))
                time.sleep(1)

        except Exception as e:
            print e
            logger.exception(
                "Try %s. Error in fetching report for md5 %s: %s" %
                (tries, md5, e))
            time.sleep(5)
    if not success:
        cursor.execute(
            """
                INSERT INTO ped_vts_mapping (dump_id)
                VALUES (%s)""", (dump_id, ))
        logger.warning("Giving up on dump_id: %s's VT report" % (dump_id, ))
    cursor.close()
    conn.close()
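vt_api.get_vt_report() is an external helper that this example does not include. The sketch below is an assumption built around VirusTotal's legacy v2 file/report endpoint and a placeholder VT_API_KEY; the project's real module may differ.

# Assumed implementation of vt_api.get_vt_report(), for illustration only.
# The caller above expects a raw JSON string (or a falsy value on failure)
# that it passes to simplejson.loads().
import urllib
import urllib2

VT_API_KEY = "YOUR_API_KEY"  # hypothetical placeholder

def get_vt_report(resource_hash):
    params = urllib.urlencode({"resource": resource_hash, "apikey": VT_API_KEY})
    request = urllib2.Request("https://www.virustotal.com/vtapi/v2/file/report", params)
    try:
        return urllib2.urlopen(request).read()
    except urllib2.URLError:
        return None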
(%(document_id)s, %(paragraph_id)s, %(theorem_type)s, %(text)s)
"""

authorshipInsertStmt = """
INSERT INTO authorship(document, rank, display_name, zbmath_id)
VALUES
(%(document_id)s, %(rank)s, %(display_name)s, %(zbmath_id)s)
"""

mscAssignmentInsertStmt = """
INSERT INTO msc_assignment(document, msc, pos)
VALUES
(%(document_id)s, %(msc)s, %(pos)s)
"""

db = connect_to_db()
cursor = db.cursor()
warning_log = open("warning_log", "a")

p = DocumentParser()
# filepath = "raw_data/test_documents/07040005.xml"
# for filename in filesInDict("raw_data/test_documents", True):
for filename, filepath in zip(filenames, filepaths):
    sys.stdout.write("processing " + filename + "... ")

    # doc, tokenizedParagraphs, formulaDict = p.parseWithParagraphStructure(filename)
    doc, raw_paragraphs, formula_dict = p.parse_raw(filepath)

    # info for doc table:
    document_id = doc.arxiv_id()
    publication_date = doc.publication_date
Example No. 24
def db_file_dumps(file_path, sha1, md5, file_size, file_type):
    #print "Time b4 http parsing: %f" %(time.time(),)
    # Use Autocommit mode for database connection
    conn = util.connect_to_db()
    cursor = conn.cursor()

    fileHandle = open(file_path)

    # Timestamp
    r = re.compile('[0-9]+')
    timestamp = r.search(fileHandle.readline())
    if timestamp is not None:
        timestamp = timestamp.group()
        #print timestamp.group()

    # Source and Destination IPs
    r = re.compile('([0-9.]+):.*-([0-9.]+):([0-9]+)-.*')
    ip = r.search(fileHandle.readline())
    if ip is not None:
        srcip = ip.group(2)
        dstip = ip.group(1)
        dst_port = ip.group(3)
        #print ip.group(1)
        #print ip.group(2)
    else:
        srcip = None
        dstip = None
        dst_port = None

    # URL
    r = re.compile('(GET|POST|HEAD) (.*) ')
    url = r.search(fileHandle.readline())
    if url is not None:
        method = url.group(1)
        method = method[:10]
        url = url.group(2)
        #print url.group(1)
    else:
        method = None

    # Host
    r = re.compile('Host: (.*)')
    host = r.search(fileHandle.readline())
    if host is not None:
        host = host.group(1)
        host = util.reorder_domain(host.strip())
        #print host.group(1)

    # Referer
    r = re.compile('Referer: (.*)')
    referer = r.search(fileHandle.readline())
    if referer is not None:
        referer = referer.group(1)
        #print referrer.group(1)

    # CORRUPT_PE
    corrupt_pe = False
    r = re.compile('CORRUPT_FILE')
    corrupt_pe_str = r.search(fileHandle.readline())
    if corrupt_pe_str is not None:
        corrupt_pe = True

    # Now, parse data from the response
    # Server
    data = fileHandle.read()
    r = re.compile('Server: (.*)')
    server = r.search(data)
    if server is not None:
        server = server.group(1)
        server = server.rstrip('\r')
        server = server[:64]

    # Content-Type
    r = re.compile('Content-Type: (.*)')
    cont_type = r.search(data)
    if cont_type is not None:
        cont_type = cont_type.group(1)
        cont_type = cont_type.rstrip('\r')
        cont_type = cont_type[:128]

    #print "Time after http parsing: %f" %(time.time(),)
    # Database statement
    cursor.execute(
        """
        INSERT INTO pe_dumps(sha1,md5,timestamp,server,client,method,url,host,
        referer,server_application,content_type,dst_port,corrupt,file_size,file_type)
        VALUES
        (%s,%s,TO_TIMESTAMP(%s),%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)""",
        (sha1, md5, timestamp, srcip, dstip, method, url, host, referer,
         server, cont_type, dst_port, corrupt_pe, file_size, file_type))
    cursor.execute(
        """
        SELECT dump_id FROM pe_dumps where sha1 = %s ORDER BY dump_id DESC LIMIT 1
        """, (sha1, ))
    dump_id = cursor.fetchone()[0]
    print("A new entry on host:%s has been made in pe_dumps table with "
          "dump_id %s" % (host, dump_id))

    fileHandle.close()
    cursor.close()
    conn.close()
    return dump_id, corrupt_pe, host, dstip, srcip
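The readline()-based regexes in db_file_dumps() imply a short plain-text header in front of each dump file. The layout below is inferred from the parsing code and should be read as an assumption, not the project's documented format.

# Inferred dump-file header: one field per line, then the raw HTTP response.
sample_header = (
    "1417039054\n"                              # epoch timestamp
    "192.0.2.15:49152-203.0.113.7:80-1\n"       # ip:port pairs; the first IP lands in the 'client' column
    "GET /files/update.exe HTTP/1.1\n"          # request line -> method and url
    "Host: example.com\n"                       # reordered via util.reorder_domain()
    "Referer: http://example.com/downloads\n"   # may be absent
    "CORRUPT_FILE\n"                            # present only for truncated payloads
)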
Example No. 25
__author__ = 'vincenzo'
from build_weka_database import build
from classify import classify_dump

import util

build(10000)

conn = util.connect_to_db()

cursor = conn.cursor()

cursor.execute("""SELECT max(dump_id) from pe_dumps""")
lastID = cursor.fetchone()[0]

currID = lastID - 10000

while currID < lastID:
    classify_dump(currID)
    currID += 1
Example No. 26
def db_virus_total(dump_id):
    logging.config.fileConfig(LOG_CONF_FILE)
    logger = logging.getLogger("amico_logger")
    util.setup_socks()
    conn = util.connect_to_db()
    cursor = conn.cursor()

    # Exit if this sha1 has been queried in the past VT_QUERY_INTERVAL period
    prev_query_time = datetime(MINYEAR, 1, 1, 0, 0, 0, 0)
    time_now = datetime.now()
    cursor.execute("""
        SELECT sha1, md5
        FROM pe_dumps
        WHERE dump_id = %s""",
        (dump_id,))
    (sha1, md5) = cursor.fetchone()

    try:
        cursor.execute("SELECT query_time, vt_id FROM virus_total_scans "
                   "WHERE sha1 = %s "
                   "ORDER by query_time DESC", (sha1,))
        res = cursor.fetchone()
        if res:
            prev_query_time = res[0]
            vt_id = res[1]
    except:
        print "sha1:%s no previous VT query" % (sha1, )
        pass

    vt_query_period = timedelta(days=VT_QUERY_INTERVAL)
    if (time_now - prev_query_time) < vt_query_period:
        print "sha1:%s has been queried recently. Skipping..." % (sha1, )
        cursor.execute("""
                INSERT INTO ped_vts_mapping (dump_id, vt_id)
                VALUES (%s, %s)""",
                (dump_id, vt_id))
        conn.close()
        return

    tries = 0
    success = False
    while tries < MAX_TRIES:
        try:
            tries += 1
            json = vt_api.get_vt_report(md5)
            if not json:
                continue
            report = simplejson.loads(json)
            if report["response_code"] == 1:
                insert_report(cursor, report, sha1, md5, json, dump_id)
                success = True
                break
            elif report["response_code"] == 0:
                cursor.execute("""
                    INSERT INTO virus_total_scans(sha1, md5, query_time)
                    VALUES (%s, %s, CLOCK_TIMESTAMP())
                    RETURNING vt_id
                    """, (sha1, md5))
                vt_id = cursor.fetchone()[0]
                cursor.execute("""
                        INSERT INTO ped_vts_mapping (dump_id, vt_id)
                        VALUES (%s, %s)""",
                        (dump_id, vt_id))
                print "Virus Total: No scan report exists in the VT database"
                success = True
                break
            else:
                logger.exception("Unknown response code! %s" %
                        (report["response_code"],))
                time.sleep(1)

        except Exception as e:
            print e
            logger.exception("Try %s. Error in fetching report for md5 %s: %s"
                            % (tries, md5, e))
            time.sleep(5)
    if not success:
        cursor.execute("""
                INSERT INTO ped_vts_mapping (dump_id)
                VALUES (%s)""",
                (dump_id,))
        logger.warning("Giving up on dump_id: %s's VT report" % (dump_id,))
    cursor.close()
    conn.close()
Example No. 27
def db_pe_dumps(file_path, sha1, md5, file_size):
    # print "Time b4 http parsing: %f" %(time.time(),)
    # Use Autocommit mode for database connection
    conn = util.connect_to_db()
    cursor = conn.cursor()

    fileHandle = open(file_path)

    # Timestamp
    r = re.compile("[0-9]+")
    timestamp = r.search(fileHandle.readline())
    if timestamp is not None:
        timestamp = timestamp.group()
        # print timestamp.group()

    # Source and Destination IPs
    r = re.compile("([0-9.]+):.*-([0-9.]+):([0-9]+)-.*")
    ip = r.search(fileHandle.readline())
    if ip is not None:
        srcip = ip.group(2)
        dstip = ip.group(1)
        dst_port = ip.group(3)
        # print ip.group(1)
        # print ip.group(2)
    else:
        srcip = None
        dstip = None
        dst_port = None

    # URL
    r = re.compile("(GET|POST|HEAD) (.*) ")
    url = r.search(fileHandle.readline())
    if url is not None:
        method = url.group(1)
        method = method[:10]
        url = url.group(2)
        # print url.group(1)
    else:
        method = None

    # Host
    r = re.compile("Host: (.*)")
    host = r.search(fileHandle.readline())
    if host is not None:
        host = host.group(1)
        host = util.reorder_domain(host.strip())
        # print host.group(1)

    # Referer
    r = re.compile("Referer: (.*)")
    referer = r.search(fileHandle.readline())
    if referer is not None:
        referer = referer.group(1)
        # print referrer.group(1)

    # CORRUPT_PE
    corrupt_pe = False
    r = re.compile("CORRUPT_(PE|FILE)")
    corrupt_pe_str = r.search(fileHandle.readline())
    if corrupt_pe_str is not None:
        corrupt_pe = True

    # Now, parse data from the response
    # Server
    data = fileHandle.read()
    r = re.compile("Server: (.*)")
    server = r.search(data)
    if server is not None:
        server = server.group(1)
        server = server.rstrip("\r")
        server = server[:64]

    # Content-Type
    r = re.compile("Content-Type: (.*)")
    cont_type = r.search(data)
    if cont_type is not None:
        cont_type = cont_type.group(1)
        cont_type = cont_type.rstrip("\r")
        cont_type = cont_type[:128]

    # print "Time after http parsing: %f" %(time.time(),)
    # Database statement
    cursor.execute(
        """
        INSERT INTO pe_dumps(sha1,md5,timestamp,server,client,method,url,host,
        referer,server_application,content_type,dst_port,corrupt,file_size)
        VALUES
        (%s,%s,TO_TIMESTAMP(%s),%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)""",
        (
            sha1,
            md5,
            timestamp,
            srcip,
            dstip,
            method,
            url,
            host,
            referer,
            server,
            cont_type,
            dst_port,
            corrupt_pe,
            file_size,
        ),
    )
    cursor.execute(
        """
        SELECT dump_id FROM pe_dumps where sha1 = %s ORDER BY dump_id DESC
        """,
        (sha1,),
    )
    dump_id = cursor.fetchone()[0]
    print("A new entry on host:%s has been made in pe_dumps table with " "dump_id %s" % (host, dump_id))

    fileHandle.close()
    cursor.close()
    conn.close()
    return dump_id, corrupt_pe
Example No. 28
def generate_JSON_map_file():
    connection = util.connect_to_db()
    dictionary_index = 0
    monitoring_server_ip = "127.0.0.1"
    server_host_mapping = defaultdict(set)
    total_json_map = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
    malware_json_map = defaultdict(
        lambda: defaultdict(lambda: defaultdict(int)))

    created_json_file = OUT_DIR + "/" + str(MAP_GRAPH_ID) + "-downloads_" + \
                        datetime.fromtimestamp(time()).strftime('%Y-%m-%d_%H-%M-%S') + ".json"

    server_host_mapping, total_json_map, malware_json_map = perform_queries_on(
        connection, server_host_mapping, total_json_map, malware_json_map,
        "APK", dictionary_index)
    dictionary_index += 1

    server_host_mapping, total_json_map, malware_json_map = perform_queries_on(
        connection, server_host_mapping, total_json_map, malware_json_map,
        "DMG", dictionary_index)
    dictionary_index += 1

    server_host_mapping, total_json_map, malware_json_map = perform_queries_on(
        connection, server_host_mapping, total_json_map, malware_json_map,
        "ELF", dictionary_index)
    dictionary_index += 1

    server_host_mapping, total_json_map, malware_json_map = perform_queries_on(
        connection, server_host_mapping, total_json_map, malware_json_map,
        "EXE", dictionary_index)
    dictionary_index += 1

    server_host_mapping, total_json_map, malware_json_map = perform_queries_on(
        connection, server_host_mapping, total_json_map, malware_json_map,
        "PDF", dictionary_index)
    dictionary_index += 1

    server_host_mapping, total_json_map, malware_json_map = perform_queries_on(
        connection, server_host_mapping, total_json_map, malware_json_map,
        "SWF", dictionary_index)
    dictionary_index += 1

    server_host_mapping, total_json_map, malware_json_map = perform_queries_on(
        connection, server_host_mapping, total_json_map, malware_json_map,
        "JAR", dictionary_index)
    dictionary_index += 1

    server_host_mapping, total_json_map, malware_json_map = perform_queries_on(
        connection, server_host_mapping, total_json_map, malware_json_map,
        "RAR", dictionary_index)
    dictionary_index += 1

    server_host_mapping, total_json_map, malware_json_map = perform_queries_on(
        connection, server_host_mapping, total_json_map, malware_json_map,
        "ZIP", dictionary_index)
    dictionary_index += 1
    UID = 0

    JSON_object = encode_data_as_JSON(UID, monitoring_server_ip,
                                      total_json_map, malware_json_map)

    with open(created_json_file, "wb") as json_file:
        json.dump(JSON_object, json_file)
def generate_JSON_map_file():
    connection           = util.connect_to_db()
    dictionary_index     = 0
    monitoring_server_ip = "127.0.0.1"
    server_host_mapping  = defaultdict(set)
    total_json_map       = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
    malware_json_map     = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

    created_json_file = OUT_DIR + "/" + str(MAP_GRAPH_ID) + "-downloads_" + \
                        datetime.fromtimestamp(time()).strftime('%Y-%m-%d_%H-%M-%S') + ".json"

    server_host_mapping, total_json_map, malware_json_map = perform_queries_on(connection, server_host_mapping,
                                                                               total_json_map, malware_json_map, "APK",
                                                                               dictionary_index)
    dictionary_index += 1

    server_host_mapping, total_json_map, malware_json_map = perform_queries_on(connection, server_host_mapping,
                                                                               total_json_map, malware_json_map, "DMG",
                                                                               dictionary_index)
    dictionary_index += 1

    server_host_mapping, total_json_map, malware_json_map = perform_queries_on(connection, server_host_mapping,
                                                                               total_json_map, malware_json_map, "ELF",
                                                                               dictionary_index)
    dictionary_index += 1

    server_host_mapping, total_json_map, malware_json_map = perform_queries_on(connection, server_host_mapping,
                                                                               total_json_map, malware_json_map, "EXE",
                                                                               dictionary_index)
    dictionary_index += 1

    server_host_mapping, total_json_map, malware_json_map = perform_queries_on(connection, server_host_mapping,
                                                                               total_json_map, malware_json_map, "PDF",
                                                                               dictionary_index)
    dictionary_index += 1

    server_host_mapping, total_json_map, malware_json_map = perform_queries_on(connection, server_host_mapping,
                                                                               total_json_map, malware_json_map, "SWF",
                                                                               dictionary_index)
    dictionary_index += 1

    server_host_mapping, total_json_map, malware_json_map = perform_queries_on(connection, server_host_mapping,
                                                                               total_json_map, malware_json_map, "JAR",
                                                                               dictionary_index)
    dictionary_index += 1

    server_host_mapping, total_json_map, malware_json_map = perform_queries_on(connection, server_host_mapping,
                                                                               total_json_map, malware_json_map, "RAR",
                                                                               dictionary_index)
    dictionary_index += 1

    server_host_mapping, total_json_map, malware_json_map = perform_queries_on(connection, server_host_mapping,
                                                                               total_json_map, malware_json_map, "ZIP",
                                                                               dictionary_index)
    dictionary_index += 1
    UID = 0

    JSON_object = encode_data_as_JSON(UID, monitoring_server_ip, total_json_map, malware_json_map)

    with open(created_json_file, "wb") as json_file:
        json.dump(JSON_object, json_file)
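
The nine perform_queries_on calls above differ only in the file-type string and the running dictionary_index. A minimal equivalent sketch, assuming the same perform_queries_on signature and a FILE_TYPES list introduced here only for illustration, would be:

FILE_TYPES = ["APK", "DMG", "ELF", "EXE", "PDF", "SWF", "JAR", "RAR", "ZIP"]

for dictionary_index, file_type in enumerate(FILE_TYPES):
    # fold this file type's query results into the running maps
    server_host_mapping, total_json_map, malware_json_map = perform_queries_on(
        connection, server_host_mapping, total_json_map, malware_json_map,
        file_type, dictionary_index)
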
def get_feature_vector(dump_id):
    #print "entered get_feature_vector"
    conn = util.connect_to_db()
    cursor = conn.cursor()
    insert_features(cursor, dump_id)
    print "Done inserting features for dump_id: ", dump_id

def generate_CSV_download_file():
    connection = util.connect_to_db()
    connection_cursor = connection.cursor()
    csv_writer = None

    header = "Second_ID,Mal_APK,Tot_APK,Mal_DMG,Tot_DMG,Mal_ELF,Tot_ELF,Mal_EXE,Tot_EXE,Mal_PDF,Tot_PDF,Mal_SWF," + \
             "Tot_SWF,Mal_JAR,Tot_JAR,Mal_RAR,Tot_RAR,Mal_ZIP,Tot_ZIP,Timestamp,Next_Download_Event_[s]"
    header_list = ["Second_ID", "Mal_APK", "Tot_APK", "Mal_DMG", "Tot_DMG", "Mal_ELF", "Tot_ELF", "Mal_EXE", "Tot_EXE",
                   "Mal_PDF", "Tot_PDF", "Mal_SWF", "Tot_SWF", "Mal_JAR", "Tot_JAR", "Mal_RAR", "Tot_RAR", "Mal_ZIP",
                   "Tot_ZIP", "Timestamp", "Next_Download_Event_[s]"]
    created_csv_file = OUT_DIR + "/" + str(DOWNLOAD_GRAPH_ID) + "-downloads_" + \
                       datetime.fromtimestamp(time()).strftime('%Y-%m-%d_%H-%M-%S') + ".csv"

    with open(created_csv_file, "wb") as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(header_list)

    csv_map = defaultdict(list)
    malware_timestamp_set = set()
    ##################################################### EXECUTABLES #####################################################
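    # csv_map: timestamp -> flat list of (malware_count, total_count) pairs, one pair
    # per file type in header order (APK, DMG, ELF, EXE, PDF, SWF, JAR, RAR, ZIP).
    # For each type, the first query counts downloads whose AMICO score exceeds
    # AMICO_THRESHOLD and the second counts all downloads; when a timestamp appears
    # for the first time, the pairs of the file types already processed are zero-padded.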

    query = """SELECT timestamp, COUNT(pe.file_type) FROM pe_dumps AS pe, amico_scores AS ams WHERE """ + \
            """pe.dump_id = ams.dump_id AND pe.file_type = 'APK' AND ams.score > """ + str(AMICO_THRESHOLD) + \
            """GROUP BY  timestamp ORDER BY timestamp ASC"""
    connection_cursor.execute(query)

    for row in connection_cursor:
        if row is not None:
            timestamp = str(row[0])
            malware_apk_count_per_second = row[1]

            csv_map[timestamp].append(malware_apk_count_per_second)
            malware_timestamp_set.add(timestamp)

    query = """SELECT timestamp, COUNT(file_type) FROM pe_dumps WHERE file_type = 'APK' GROUP BY  timestamp """ + \
            """ORDER BY timestamp ASC"""
    connection_cursor.execute(query)

    for row in connection_cursor:
        if row is not None:
            timestamp = str(row[0])
            total_apk_count_per_second = row[1]

            if timestamp in malware_timestamp_set:
                csv_map[timestamp].append(total_apk_count_per_second)
            else:
                csv_map[timestamp].extend([0, total_apk_count_per_second])


    malware_timestamp_set = set()
    query = """SELECT timestamp, COUNT(pe.file_type) FROM pe_dumps AS pe, amico_scores AS ams WHERE """ + \
            """pe.dump_id = ams.dump_id AND pe.file_type = 'DMG' AND ams.score > """ + str(AMICO_THRESHOLD) + \
            """GROUP BY  timestamp ORDER BY timestamp ASC"""
    connection_cursor.execute(query)

    for row in connection_cursor:
        if row is not None:
            timestamp = str(row[0])
            malware_dmg_count_per_second = row[1]

            if timestamp not in csv_map:
                csv_map[timestamp].extend([0, 0])

            csv_map[timestamp].append(malware_dmg_count_per_second)
            malware_timestamp_set.add(timestamp)

    query = """SELECT timestamp, COUNT(file_type) FROM pe_dumps WHERE file_type = 'DMG' GROUP BY  timestamp """ + \
            """ORDER BY timestamp ASC"""
    connection_cursor.execute(query)

    for row in connection_cursor:
        if row is not None:
            timestamp = str(row[0])
            total_dmg_count_per_second = row[1]

            if timestamp not in csv_map:
                csv_map[timestamp].extend([0, 0])

            if timestamp in malware_timestamp_set:
                csv_map[timestamp].append(total_dmg_count_per_second)
            else:
                csv_map[timestamp].extend([0, total_dmg_count_per_second])


    malware_timestamp_set = set()
    query = """SELECT timestamp, COUNT(pe.file_type) FROM pe_dumps AS pe, amico_scores AS ams WHERE """ + \
            """pe.dump_id = ams.dump_id AND pe.file_type = 'ELF' AND ams.score > """ + str(AMICO_THRESHOLD) + \
            """GROUP BY  timestamp ORDER BY timestamp ASC"""
    connection_cursor.execute(query)

    for row in connection_cursor:
        if row is not None:
            timestamp = str(row[0])
            malware_elf_count_per_second = row[1]

            if timestamp not in csv_map:
                csv_map[timestamp].extend([0, 0, 0, 0])

            csv_map[timestamp].append(malware_elf_count_per_second)
            malware_timestamp_set.add(timestamp)

    query = """SELECT timestamp, COUNT(file_type) FROM pe_dumps WHERE file_type = 'ELF' GROUP BY  timestamp """ + \
            """ORDER BY timestamp ASC"""
    connection_cursor.execute(query)

    for row in connection_cursor:
        if row is not None:
            timestamp = str(row[0])
            total_elf_count_per_second = row[1]

            if timestamp not in csv_map:
                csv_map[timestamp].extend([0, 0, 0, 0])

            if timestamp in malware_timestamp_set:
                csv_map[timestamp].append(total_elf_count_per_second)
            else:
                csv_map[timestamp].extend([0, total_elf_count_per_second])


    malware_timestamp_set = set()
    query = """SELECT timestamp, COUNT(pe.file_type) FROM pe_dumps AS pe, amico_scores AS ams WHERE """ + \
            """pe.dump_id = ams.dump_id AND pe.file_type = 'EXE' AND ams.score > """ + str(AMICO_THRESHOLD) + \
            """GROUP BY  timestamp ORDER BY timestamp ASC"""
    connection_cursor.execute(query)

    for row in connection_cursor:
        if row is not None:
            timestamp = str(row[0])
            malware_exe_count_per_second = row[1]

            if timestamp not in csv_map:
                csv_map[timestamp].extend([0, 0, 0, 0, 0, 0])

            csv_map[timestamp].append(malware_exe_count_per_second)
            malware_timestamp_set.add(timestamp)

    query = """SELECT timestamp, COUNT(file_type) FROM pe_dumps WHERE file_type = 'EXE' GROUP BY  timestamp """ + \
            """ORDER BY timestamp ASC"""
    connection_cursor.execute(query)

    for row in connection_cursor:
        if row is not None:
            timestamp = str(row[0])
            total_exe_count_per_second = row[1]

            if timestamp not in csv_map:
                csv_map[timestamp].extend([0, 0, 0, 0, 0, 0])

            if timestamp in malware_timestamp_set:
                csv_map[timestamp].append(total_exe_count_per_second)
            else:
                csv_map[timestamp].extend([0, total_exe_count_per_second])

    ########################################################################################################################

    ######################################################### PDF #########################################################

    malware_timestamp_set = set()
    query = """SELECT timestamp, COUNT(pe.file_type) FROM pe_dumps AS pe, amico_scores AS ams WHERE """ + \
            """pe.dump_id = ams.dump_id AND pe.file_type = 'PDF' AND ams.score > """ + str(AMICO_THRESHOLD) + \
            """GROUP BY  timestamp ORDER BY timestamp ASC"""
    connection_cursor.execute(query)

    for row in connection_cursor:
        if row is not None:
            timestamp = str(row[0])
            malware_pdf_count_per_second = row[1]

            if timestamp not in csv_map:
                csv_map[timestamp].extend([0, 0, 0, 0, 0, 0, 0, 0])

            csv_map[timestamp].append(malware_pdf_count_per_second)
            malware_timestamp_set.add(timestamp)

    query = """SELECT timestamp, COUNT(file_type) FROM pe_dumps WHERE file_type = 'PDF' GROUP BY  timestamp """ + \
            """ORDER BY timestamp ASC"""
    connection_cursor.execute(query)

    for row in connection_cursor:
        if row is not None:
            timestamp = str(row[0])
            total_pdf_count_per_second = row[1]

            if timestamp not in csv_map:
                csv_map[timestamp].extend([0, 0, 0, 0, 0, 0, 0, 0])

            if timestamp in malware_timestamp_set:
                csv_map[timestamp].append(total_pdf_count_per_second)
            else:
                csv_map[timestamp].extend([0, total_pdf_count_per_second])

    ########################################################################################################################

    ######################################################## FLASH ########################################################

    malware_timestamp_set = set()
    query = """SELECT timestamp, COUNT(pe.file_type) FROM pe_dumps AS pe, amico_scores AS ams WHERE """ + \
            """pe.dump_id = ams.dump_id AND pe.file_type = 'SWF' AND ams.score > """ + str(AMICO_THRESHOLD) + \
            """GROUP BY  timestamp ORDER BY timestamp ASC"""
    connection_cursor.execute(query)

    for row in connection_cursor:
        if row is not None:
            timestamp = str(row[0])
            malware_swf_count_per_second = row[1]

            if timestamp not in csv_map:
                csv_map[timestamp].extend([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

            csv_map[timestamp].append(malware_swf_count_per_second)
            malware_timestamp_set.add(timestamp)

    query = """SELECT timestamp, COUNT(file_type) FROM pe_dumps WHERE file_type = 'SWF' GROUP BY  timestamp """ + \
            """ORDER BY timestamp ASC"""
    connection_cursor.execute(query)

    for row in connection_cursor:
        if row is not None:
            timestamp = str(row[0])
            total_swf_count_per_second = row[1]

            if timestamp not in csv_map:
                csv_map[timestamp].extend([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

            if timestamp in malware_timestamp_set:
                csv_map[timestamp].append(total_swf_count_per_second)
            else:
                csv_map[timestamp].extend([0, total_swf_count_per_second])

    ########################################################################################################################

    ###################################################### COMPRESSED ######################################################

    malware_timestamp_set = set()
    query = """SELECT timestamp, COUNT(pe.file_type) FROM pe_dumps AS pe, amico_scores AS ams WHERE """ + \
            """pe.dump_id = ams.dump_id AND pe.file_type = 'JAR' AND ams.score > """ + str(AMICO_THRESHOLD) + \
            """GROUP BY  timestamp ORDER BY timestamp ASC"""
    connection_cursor.execute(query)

    for row in connection_cursor:
        if row is not None:
            timestamp = str(row[0])
            malware_jar_count_per_second = row[1]

            if timestamp not in csv_map:
                csv_map[timestamp].extend([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

            csv_map[timestamp].append(malware_jar_count_per_second)
            malware_timestamp_set.add(timestamp)

    query = """SELECT timestamp, COUNT(file_type) FROM pe_dumps WHERE file_type = 'JAR' GROUP BY  timestamp """ + \
            """ORDER BY timestamp ASC"""
    connection_cursor.execute(query)

    for row in connection_cursor:
        if row is not None:
            timestamp = str(row[0])
            total_jar_count_per_second = row[1]

            if timestamp not in csv_map:
                csv_map[timestamp].extend([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

            if timestamp in malware_timestamp_set:
                csv_map[timestamp].append(total_jar_count_per_second)
            else:
                csv_map[timestamp].extend([0, total_jar_count_per_second])


    malware_timestamp_set = set()
    query = """SELECT timestamp, COUNT(pe.file_type) FROM pe_dumps AS pe, amico_scores AS ams WHERE """ + \
            """pe.dump_id = ams.dump_id AND pe.file_type = 'RAR' AND ams.score > """ + str(AMICO_THRESHOLD) + \
            """GROUP BY  timestamp ORDER BY timestamp ASC"""
    connection_cursor.execute(query)

    for row in connection_cursor:
        if row is not None:
            timestamp = str(row[0])
            malware_rar_count_per_second = row[1]

            if timestamp not in csv_map:
                csv_map[timestamp].extend([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

            csv_map[timestamp].append(malware_rar_count_per_second)
            malware_timestamp_set.add(timestamp)

    query = """SELECT timestamp, COUNT(file_type) FROM pe_dumps WHERE file_type = 'RAR' GROUP BY  timestamp """ + \
            """ORDER BY timestamp ASC"""
    connection_cursor.execute(query)

    for row in connection_cursor:
        if row is not None:
            timestamp = str(row[0])
            total_rar_count_per_second = row[1]

            if timestamp not in csv_map:
                csv_map[timestamp].extend([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

            if timestamp in malware_timestamp_set:
                csv_map[timestamp].append(total_rar_count_per_second)
            else:
                csv_map[timestamp].extend([0, total_rar_count_per_second])


    malware_timestamp_set = set()
    query = """SELECT timestamp, COUNT(pe.file_type) FROM pe_dumps AS pe, amico_scores AS ams WHERE """ + \
            """pe.dump_id = ams.dump_id AND pe.file_type = 'ZIP' AND ams.score > """ + str(AMICO_THRESHOLD) + \
            """GROUP BY  timestamp ORDER BY timestamp ASC"""
    connection_cursor.execute(query)

    for row in connection_cursor:
        if row is not None:
            timestamp = str(row[0])
            malware_zip_count_per_second = row[1]

            if timestamp not in csv_map:
                csv_map[timestamp].extend([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

            csv_map[timestamp].append(malware_zip_count_per_second)
            malware_timestamp_set.add(timestamp)

    query = """SELECT timestamp, COUNT(file_type) FROM pe_dumps WHERE file_type = 'ZIP' GROUP BY  timestamp """ + \
            """ORDER BY timestamp ASC"""
    connection_cursor.execute(query)

    for row in connection_cursor:
        if row is not None:
            timestamp = str(row[0])
            total_zip_count_per_second = row[1]

            if timestamp not in csv_map:
                csv_map[timestamp].extend([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

            if timestamp in malware_timestamp_set:
                csv_map[timestamp].append(total_zip_count_per_second)
            else:
                csv_map[timestamp].extend([0, total_zip_count_per_second])

    ########################################################################################################################
    sorted_csv_map = sorted(csv_map.items(), key=operator.itemgetter(0))

    csv_map_aux = defaultdict(list)
    first_useful_date = "2014-11-26 22:55:40"
    last_useful_date  = "2015-10-01 00:00:00"
    # Timestamps before first_useful_date are treated as corrupted: keep their time of
    # day but remap them onto the date of first_useful_date.
    for timestamp, file_list in sorted_csv_map:
        if cmp(timestamp, first_useful_date) < 0:
            timestamp_split         = timestamp.split()
            first_useful_date_split = first_useful_date.split()
            timestamp_hms           = timestamp_split[1]
            first_useful_date_ymd   = first_useful_date_split[0]

            corrected_timestamp = first_useful_date_ymd + " " + timestamp_hms
            csv_map_aux[corrected_timestamp] = csv_map.get(timestamp)
        else:
            break

    max_values = len(header.split(',')) - 2
    csv_rows = list()
    sorted_csv_map_aux = sorted(csv_map_aux.items(), key=operator.itemgetter(0))
    UID = 0

    for timestamp, file_list in sorted_csv_map_aux:
        formatted_row = format_row(file_list, max_values)
        formatted_row.insert(0, UID)
        formatted_row.append(timestamp)
        csv_rows.append(formatted_row)
        UID += 1

    writable_csv_rows = list()
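    # Append to every corrected row the gap (whole seconds, minus one) until the next
    # download event; the last corrected row gets its gap later, against the first row
    # of the uncorrected range.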
    while csv_rows:
        current_row     = csv_rows.pop(0)
        if not csv_rows:
            writable_csv_rows.append(current_row)
            continue
        next_row        = csv_rows[0]
        timestamp_index = len(current_row) - 1

        current_timestamp_string = current_row[timestamp_index]
        next_timestamp_string    = next_row[timestamp_index]

        current_timestamp  = datetime.strptime(current_timestamp_string, '%Y-%m-%d %H:%M:%S')
        next_timestamp     = datetime.strptime(next_timestamp_string, '%Y-%m-%d %H:%M:%S')
        time_delta_in_secs = int((next_timestamp - current_timestamp).total_seconds()) - 1
        current_row.append(time_delta_in_secs)

        writable_csv_rows.append(current_row)

    writable_sorted_csv_map = list()
    for timestamp, file_list in sorted_csv_map:
        if cmp(timestamp, first_useful_date) < 0 or cmp(timestamp, last_useful_date) > 0:
            continue
        else:
            writable_sorted_csv_map.append([timestamp, file_list])

    writable_csv_rows_aux = list()
    while writable_sorted_csv_map:
        timestamp_file_list_first_pair = writable_sorted_csv_map.pop(0)
        timestamp_str_first_pair       = timestamp_file_list_first_pair[0]
        file_list_first_pair           = timestamp_file_list_first_pair[1]

        if not writable_sorted_csv_map:
            formatted_row = format_row(file_list_first_pair, max_values)
            formatted_row.insert(0, UID)
            formatted_row.append(timestamp_str_first_pair)
            writable_csv_rows_aux.append(formatted_row)
            UID += 1
            continue
        timestamp_file_list_second_pair = writable_sorted_csv_map[0]
        timestamp_str_second_pair       = timestamp_file_list_second_pair[0]

        formatted_row = format_row(file_list_first_pair, max_values)
        formatted_row.insert(0, UID)
        formatted_row.append(timestamp_str_first_pair)

        timestamp_first_pair  = datetime.strptime(timestamp_str_first_pair, '%Y-%m-%d %H:%M:%S')
        timestamp_second_pair = datetime.strptime(timestamp_str_second_pair, '%Y-%m-%d %H:%M:%S')
        time_delta_in_secs = int((timestamp_second_pair - timestamp_first_pair).total_seconds()) - 1
        formatted_row.append(time_delta_in_secs)

        writable_csv_rows_aux.append(formatted_row)
        UID += 1
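
    # The last corrected row above was written without a gap; compute it here against
    # the first uncorrected row, then move that corrected row to the front of the
    # uncorrected list so it is still written ahead of those rows.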

    last_formatted_row_in_writable_csv_rows      = writable_csv_rows.pop(len(writable_csv_rows) - 1)
    first_formatted_row_in_writable_csv_rows_aux = writable_csv_rows_aux[0]
    timestamp_index = len(last_formatted_row_in_writable_csv_rows) - 1

    current_timestamp_string = last_formatted_row_in_writable_csv_rows[timestamp_index]
    next_timestamp_string    = first_formatted_row_in_writable_csv_rows_aux[timestamp_index]

    current_timestamp  = datetime.strptime(current_timestamp_string, '%Y-%m-%d %H:%M:%S')
    next_timestamp     = datetime.strptime(next_timestamp_string, '%Y-%m-%d %H:%M:%S')
    time_delta_in_secs = int((next_timestamp - current_timestamp).total_seconds()) - 1
    last_formatted_row_in_writable_csv_rows.append(time_delta_in_secs)

    writable_csv_rows_aux.insert(0, last_formatted_row_in_writable_csv_rows)

    with open(created_csv_file, "a") as csv_file:
        csv_writer = csv.writer(csv_file, quoting=csv.QUOTE_NONNUMERIC)
        for row in writable_csv_rows:
            csv_writer.writerow(row)
        for row in writable_csv_rows_aux:
            csv_writer.writerow(row)
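
The nine per-file-type blocks in generate_CSV_download_file all follow the same pattern: count per-second malware downloads (AMICO score above AMICO_THRESHOLD), count per-second total downloads, and zero-pad the pairs of the file types already processed when a timestamp first shows up. A minimal sketch of that pattern as a single loop (the parameterized queries and the 2 * offset padding are assumptions introduced here, not the original code) could look like:

FILE_TYPES = ["APK", "DMG", "ELF", "EXE", "PDF", "SWF", "JAR", "RAR", "ZIP"]

for offset, file_type in enumerate(FILE_TYPES):
    malware_timestamps = set()

    connection_cursor.execute(
        "SELECT timestamp, COUNT(pe.file_type) "
        "FROM pe_dumps AS pe, amico_scores AS ams "
        "WHERE pe.dump_id = ams.dump_id AND pe.file_type = %s AND ams.score > %s "
        "GROUP BY timestamp ORDER BY timestamp ASC",
        (file_type, AMICO_THRESHOLD))
    for timestamp, malware_count in connection_cursor.fetchall():
        timestamp = str(timestamp)
        if timestamp not in csv_map:
            # zero-pad the (malware, total) pairs of the file types already processed
            csv_map[timestamp].extend([0] * (2 * offset))
        csv_map[timestamp].append(malware_count)
        malware_timestamps.add(timestamp)

    connection_cursor.execute(
        "SELECT timestamp, COUNT(file_type) FROM pe_dumps "
        "WHERE file_type = %s GROUP BY timestamp ORDER BY timestamp ASC",
        (file_type,))
    for timestamp, total_count in connection_cursor.fetchall():
        timestamp = str(timestamp)
        if timestamp not in csv_map:
            csv_map[timestamp].extend([0] * (2 * offset))
        if timestamp in malware_timestamps:
            csv_map[timestamp].append(total_count)
        else:
            csv_map[timestamp].extend([0, total_count])
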
Example No. 32
0
def fe_db_setup():
    conn = util.connect_to_db()
    cursor = conn.cursor()

    cursor.execute(""" DROP table if exists features""")
    cursor.execute(""" DROP table if exists weka_features""")
    cursor.execute(
        """
        CREATE TABLE weka_features(
            dump_id INT,
            raw_dump_num_av_labels INT,
            raw_dump_trusted_av_labels INT,
            vt_month_shelf BOOLEAN,
            corrupt BOOLEAN,
            host_malware_downloads INT,
            host_suspicious_downloads INT,
            host_benign_downloads INT,
            host_total_downloads INT,
            host_malware_ratio REAL,
            host_suspicious_ratio REAL,
            host_benign_ratio REAL,
            host_avg_av_labels REAL,
            host_avg_trusted_labels REAL,
            host_unknown_hashes INT,
            host_total_hashes INT,
            host_unknown_hash_ratio REAL,
            twold_malware_downloads INT,
            twold_suspicious_downloads INT,
            twold_benign_downloads INT,
            twold_total_downloads INT,
            twold_malware_ratio REAL,
            twold_suspicious_ratio REAL,
            twold_benign_ratio REAL,
            twold_avg_av_labels REAL,
            twold_avg_trusted_labels REAL,
            twold_unknown_hashes INT,
            twold_total_hashes INT,
            twold_unknown_hash_ratio REAL,
            server_ip_malware_downloads INT,
            server_ip_suspicious_downloads INT,
            server_ip_benign_downloads INT,
            server_ip_total_downloads INT,
            server_ip_malware_ratio REAL,
            server_ip_suspicious_ratio REAL,
            server_ip_benign_ratio REAL,
            server_ip_avg_av_labels REAL,
            server_ip_avg_trusted_labels REAL,
            server_ip_unknown_hashes INT,
            server_ip_total_hashes INT,
            server_ip_unknown_hash_ratio REAL,
            bgp_malware_downloads INT,
            bgp_suspicious_downloads INT,
            bgp_benign_downloads INT,
            bgp_total_downloads INT,
            bgp_malware_ratio REAL,
            bgp_suspicious_ratio REAL,
            bgp_benign_ratio REAL,
            bgp_avg_av_labels REAL,
            bgp_avg_trusted_labels REAL,
            bgp_unknown_hashes INT,
            bgp_total_hashes INT,
            bgp_unknown_hash_ratio REAL,
            hash_life_time INT,
            num_dumps_with_same_hash INT,
            hash_daily_dump_rate_per_client REAL,
            estimated_clients_with_same_hash INT,
            referer_exists INT,
            host_name_exists INT,
            extension_class VARCHAR(20),
            url_length INT,
            directory_depth INT,
            sha1 VARCHAR(40),
            host VARCHAR(256),
            url_malware_downloads INT,
            url_total_downloads INT,
            url_distinct_sha1s INT,
            url_struct VARCHAR(512),
            url_struct_malware_downloads INT,
            url_struct_total_downloads INT,
            url_struct_distinct_sha1s INT)
            """
    )

    print "Created weka_features table!"

    conn.commit()
    cursor.close()
    conn.close()