def test_setting_fileh(self):
    DATA_FILE = "pyfsdb/tests/tests.fsdb"

    f = pyfsdb.Fsdb()
    self.assertFalse(f.file_handle, "file_handle should not be available")

    fh = open(DATA_FILE, "r")
    self.assertTrue(fh, "file opened manually")

    f.file_handle = fh
    self.assertTrue(f.file_handle == fh, "file_handle was set properly")

    row = next(f)
    self.assertTrue(f.__next__ == f._next_as_array, "read type was set")
    self.assertTrue(row, 'row one is returned')
    self.assertTrue(row[0] == 'rowone')

    # create a new object instead
    fh = open(DATA_FILE, "r")
    f = pyfsdb.Fsdb(file_handle=fh)
    row = next(f)
    self.assertTrue(row, 'row one is returned')
    self.assertTrue(row[0] == 'rowone')

    # check that it works as an iterator
    fh = open(DATA_FILE, "r")
    f = pyfsdb.Fsdb(file_handle=fh)
    count = 0
    for row in f:
        count += 1
    self.assertTrue(count > 0, "at least one row read")
def main():
    args = parse_args()

    # from the arguments, extract the column names to use
    key_column = args.key_column
    value_column = args.value_column
    other_columns = args.other_columns
    columns = args.columns

    # open the input and output file streams
    input = pyfsdb.Fsdb(file_handle=args.input_file,
                        return_type=pyfsdb.RETURN_AS_DICTIONARY)
    output = pyfsdb.Fsdb(out_file_handle=args.output_file)
    output.out_column_names = [key_column, value_column] + other_columns

    # for each row, emit one output row per requested column,
    # carrying along any other requested columns
    for row in input:
        for column in columns:
            out_row = [column, row[column]]
            for other in other_columns:
                out_row.append(row[other])
            output.append(out_row)

    output.close()
def main():
    args = parse_args()

    cmd2template = get_cmd2template(args.input_file[0])
    weightDic, cmdIPsDic, sourceDic, cmdToArray = get_info(
        args.input_file[0], cmd2template)
    G, weighted_edges, labels = draw_networkx(args, weightDic, cmdIPsDic,
                                              sourceDic, cmdToArray)
    clusters = get_clusters(G)

    if args.edge_list:
        outh = pyfsdb.Fsdb(out_file=args.edge_list)
        outh.out_column_names = [
            'cluster_id', 'node1_id', 'node2_id', 'node1', 'node2', 'weight'
        ]
        for cmd1, cmd2, weight in weighted_edges:
            cluster_id = clusters[cmd1]
            num1 = labels[cmd1]
            num2 = labels[cmd2]
            outh.append([cluster_id, num1, num2, cmd1, cmd2,
                         round(weight, 3)])
        outh.close()

    if args.cluster_list:
        outh = pyfsdb.Fsdb(out_file=args.cluster_list)
        outh.out_column_names = ['cluster_id', 'command']
        for cmd, cluster_id in clusters.items():
            outh.append([cluster_id, cmd])
        outh.close()
def main():
    args = parse_args()

    f = pyfsdb.Fsdb(args.input_file)
    key_column = f.get_column_number(args.column)
    key_counts = {}
    line_count = 0
    cached_count = 0

    # memorize all the keys and the number of rows for each
    for row in f.next_as_array():
        line_count += 1
        if row[key_column] not in key_counts:
            key_counts[row[key_column]] = 1
        else:
            key_counts[row[key_column]] += 1

    key_list = list(key_counts.keys())
    key_list.sort()

    # re-open the input file to re-read it
    f = pyfsdb.Fsdb(args.input_file, out_file_handle=args.output_file)
    stored_lines = {}

    # walk the rows again, writing them out grouped by key
    current_key = key_list.pop(0)
    for row in f.next_as_array():
        if row[key_column] != current_key:
            # the current lines are arriving too early; cache them
            cached_count += 1
            if row[key_column] not in stored_lines:
                stored_lines[row[key_column]] = [row]
            else:
                stored_lines[row[key_column]].append(row)
        else:
            f.append(row)
            key_counts[current_key] -= 1

            while key_counts[current_key] == 0:
                # we're done with this key
                if len(key_list) == 0:
                    break  # done!

                # grab a new key
                current_key = key_list.pop(0)

                # write out any cached lines
                if current_key in stored_lines:
                    for stored_row in stored_lines[current_key]:
                        f.append(stored_row)
                        key_counts[current_key] -= 1
                    del stored_lines[current_key]

    f.write_finish()

    if args.verbose:
        sys.stderr.write("cached %d/%d lines\n" % (cached_count, line_count))
def read_data(self):
    """Read data from the ip2asn file"""
    if isinstance(self._file, str):
        # assume a file name
        iptoasn = pyfsdb.Fsdb(self._file)
    else:
        # assume it's a file handle instead
        iptoasn = pyfsdb.Fsdb(file_handle=self._file)

    # set the column names for pyfsdb
    iptoasn.column_names = ['start', 'end', 'ASN', 'country', 'name']
    (self._start_col, self._end_col, self._asn_col, self._country_col,
     self._name_col) = iptoasn.get_column_numbers(iptoasn.column_names)

    # XXX: fsdb should do this for us
    self._data = []
    self._left_keys = []
    for row in iptoasn:
        try:
            row[self._start_col] = int(row[self._start_col])
            row[self._end_col] = int(row[self._end_col])
        except ValueError:
            # must be addresses, not ints
            row[self._start_col] = self.ip2int(row[self._start_col])
            row[self._end_col] = self.ip2int(row[self._end_col])
        self._data.append(row)
        self._left_keys.append(int(row[self._start_col]))
def process_fsdb(i2a, inh, outh, key, by_asn=False):
    inf = pyfsdb.Fsdb(file_handle=inh)
    outf = pyfsdb.Fsdb(out_file_handle=outh)

    if by_asn:
        outf.out_column_names = inf.column_names + ASN_COLUMN_NAMES[1:]
    else:
        outf.out_column_names = inf.column_names + COLUMN_NAMES[1:]

    key_col = inf.get_column_number(key)

    for row in inf:
        if by_asn:
            results = i2a.lookup_asn(row[key_col], limit=1)
            if len(results) == 0:
                row.extend(['-', '-', '-'])
            else:
                row.extend([
                    results[0]['owner'], results[0]['country'],
                    results[0]['ip_range']
                ])
        else:
            result = i2a.lookup_address(row[key_col])
            if result:
                row.extend([
                    result['ip_numeric'], result['ASN'], result['owner'],
                    result['country'], result['ip_range']
                ])
            else:
                row.extend(['-', '-', '-', '-', '-'])
        outf.append(row)
def main():
    args = parse_args()

    # open the input file
    inh = pyfsdb.Fsdb(file_handle=args.input_file)
    key_column = inh.get_column_number(args.column)

    out_handles = {}
    for row in inh:
        value = row[key_column]

        # see if we have an open file handle for this one yet
        if value not in out_handles:
            # new value, so open a new file handle to save data for it
            file_name = re.sub("[^-.0-9a-zA-Z_]", "_", str(value))
            outh = pyfsdb.Fsdb(out_file=(args.output_pattern % file_name))
            outh.column_names = inh.column_names
            out_handles[value] = outh

        # save the row to the file based on its value
        out_handles[value].append(row)

    # clean up
    for handle in out_handles:
        out_handles[handle].close()
def test_write_out_fsdb(self):
    DATA_FILE = "pyfsdb/tests/tests.fsdb"
    OUT_FILE = "pyfsdb/tests/testout.fsdb"

    f = pyfsdb.Fsdb(DATA_FILE, out_file=OUT_FILE)
    self.assertTrue(f, "opened ok")

    # read in all records
    records = []
    for record in f:
        records.append(record)

    self.assertTrue(records[0][0] == 'rowone',
                    'init record ' + records[0][0] + ' is correct')

    for record in records:
        f.write_row(record)
    f.write_finish()

    g = pyfsdb.Fsdb(OUT_FILE)
    rows = []
    for row in g:
        rows.append(row)
    self.check_data(rows)

    # write out new columns
    f = pyfsdb.Fsdb(out_file=OUT_FILE)
    count = 1
    f.out_column_names = ['a', 'b', 'c', 'new_count']
    self.assertTrue(
        len(f.out_column_names) == 4, "correct initial output count")
    for row in rows:
        row.append(str(count))
        f.write_row(row)
        count = count + 1
    f.write_finish()

    # check new columns
    g = pyfsdb.Fsdb(filename=OUT_FILE)
    rows = []
    for row in g:
        rows.append(row)
    self.check_data(rows)
    self.assertTrue(rows[0][3] == "1", "new rowone col is correct")
    self.assertTrue(rows[1][3] == "2", "new rowtwo col is correct")

    # check the output token switch
    f = pyfsdb.Fsdb(DATA_FILE, out_file=OUT_FILE)
    self.assertTrue(f, "opened ok")
    f.out_separator_token = "s"
    self.assertTrue(f.out_separator == ' ', "new separator is space")
    for row in f:
        f.write_row(row)
    f.write_finish()
def main():
    args = parse_args()

    # set up storage structures
    storage = {}
    columns = {}

    # open the input file stream
    input = pyfsdb.Fsdb(file_handle=args.input_file)

    # from the input, extract column numbers/names
    time_column = input.get_column_number(args.time_column)
    key_column = input.get_column_number(args.key_column)
    column_names = input.column_names

    # for each row, remember each value based on time and key
    for row in input:
        # if the time hasn't been seen before, allocate the sub-structure
        if row[time_column] not in storage:
            storage[row[time_column]] = {}

        for column_num in range(0, len(row)):
            # remember all values of non-time and non-key columns
            if column_num != time_column and column_num != key_column:
                storage[row[time_column]][row[key_column]] = row[column_num]

                # record that we've seen this column before
                columns[row[key_column]] = 1

    # open the output stream, and set its properties
    out = pyfsdb.Fsdb(out_file_handle=args.output_file)

    # the output columns will be a merge of the time column and
    # previously seen key-index values
    output_columns = ['time']
    output_columns.extend(columns.keys())
    out.out_column_names = output_columns

    # output all data, grouped by time_key
    for time_key in storage:
        # create a row containing a column for every seen key
        row = [time_key]
        for column in columns:
            if column not in storage[time_key] or storage[time_key][column] == "":
                row.append("0")
            else:
                row.append(storage[time_key][column])

        # write it out
        out.append(row)
def test_missing_header_support_file(self):
    DATA_FILE = "pyfsdb/tests/noheader.fsdb"

    f = pyfsdb.Fsdb(DATA_FILE)
    self.assertTrue(f, "opened ok")

    f.column_names = ['colone', 'coltwo', 'colthree']

    headers = f.headers
    self.assertTrue(headers, "headers access exists")

    self.assertTrue(f.get_column_name(0) == "colone")
    self.assertTrue(f.get_column_name(1) == "coltwo")
    self.assertTrue(f.get_column_name(2) == "colthree")

    self.assertTrue(f.get_column_number("colone") == 0)
    self.assertTrue(f.get_column_number("coltwo") == 1)
    self.assertTrue(f.get_column_number("colthree") == 2)

    self.assertTrue(f.header_line == "#fsdb -F t colone coltwo colthree\n")

    cols = f.column_names
    self.assertTrue(len(cols) == 3, "There are three columns")
    self.assertTrue(cols[0] == "colone", "column one ok")
    self.assertTrue(cols[1] == "coltwo", "column two ok")
    self.assertTrue(cols[2] == "colthree", "column three ok")
    self.assertTrue(f.column_names[2] == "colthree", "column three ok")
def test_setting_columns(self):
    f = pyfsdb.Fsdb()
    self.assertTrue(f, "opened ok")

    testcols = ['colone', 'coltwo', 'col3']
    f.column_names = testcols
    self.assertTrue(f.column_names == testcols)
def test_read_header(self):
    HEADER_FILE = "pyfsdb/tests/tests.fsdb"

    f = pyfsdb.Fsdb()
    fileh = open(HEADER_FILE, "r")
    line = next(fileh)
    headers = f.read_header(line)

    self.assertTrue(headers[0] == 0, "header parse is 0 for success")

    header_info = headers[1]
    for colname in ('names', 'numbers', 'header'):
        self.assertTrue(colname in header_info,
                        "header structure contains " + colname)

    names_info = header_info['names']
    numbers_info = header_info['numbers']
    counter = 0
    for column in ('colone', 'coltwo', 'colthree'):
        self.assertTrue(column in names_info,
                        "column info contains data on " + column)
        self.assertTrue(names_info[column] == counter,
                        "column " + column + " is number " + str(counter))
        self.assertTrue(
            numbers_info[counter] == column,
            "column number " + str(counter) + " is labeled " + column)
        counter += 1
def output_to_fsdb(chart_data, output_file_name, column_names):
    """Writes the chart as a FSDB file with start, end, and height values"""
    outh = pyfsdb.Fsdb(out_file=output_file_name)
    outh.out_column_names = column_names + ['height']

    for row in chart_data:
        outh.append(row)

    outh.close()
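# A minimal usage sketch for output_to_fsdb() above; the chart rows, output
# file name, and column names here are hypothetical examples, not values from
# the original source.
sample_chart = [
    [0, 10, 5],     # start, end, height
    [10, 20, 12],
]
output_to_fsdb(sample_chart, "histogram.fsdb", ["start", "end"])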
def test_dont_save_command(self):
    f = pyfsdb.Fsdb(out_file=self.OUT_FILE)
    f.out_command_line = None
    f.out_file_handle.write("# | test nowrite\n")
    del f
    self.check_last_line(self.OUT_FILE, "# | test nowrite\n")
def json_to_fsdb(input_file, output_file):
    """A function that converts an input file stream of json dictionaries to
    an output FSDB file, where the header column names are pulled from the
    keys of the first record."""
    first_line = next(input_file)
    try:
        rows = json.loads(first_line)
        if not isinstance(rows, list):
            rows = [rows]
    except Exception as exp:
        sys.stderr.write("failed to parse the first line as json:\n")
        sys.stderr.write(first_line)
        sys.stderr.write(str(exp))
        sys.exit(1)

    columns = sorted(list(rows[0].keys()))

    out_fsdb = pyfsdb.Fsdb(out_file_handle=output_file)
    out_fsdb.out_column_names = columns

    handle_rows(out_fsdb, rows, columns)

    for line in input_file:
        try:
            rows = json.loads(line)
            if not isinstance(rows, list):
                rows = [rows]
            handle_rows(out_fsdb, rows, columns)
        except Exception as exp:
            sys.stderr.write("failed to parse: " + line)
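# A minimal usage sketch for json_to_fsdb() above, assuming the surrounding
# module provides the handle_rows() helper it calls. The JSON lines below are
# hypothetical example data.
import io
import sys

json_lines = io.StringIO('{"name": "alpha", "count": 3}\n'
                         '{"name": "beta", "count": 7}\n')
json_to_fsdb(json_lines, sys.stdout)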
def get_info(input_file, cmd2template):
    """ Return four dictionaries: (1) weights between commands, (2) IPs that
    ran commands, (3) sources for each command, and (4) command to array
    style string

    Input:
        input_file (str) - FSDB file with IP and command data,
        cmd2template - templatized commands (as produced by get_cmd2template)
    Output:
        weightDic (dict) - key: pair of commands (tuple) / value: weight (float),
        cmdIPsDic (dict) - key: command (str) / value: dictionary with
            key: source (str) & value: IPs that ran command (list),
        sourceDic (dict) - key: command (str) / value: source label (str),
        cmdToArray (dict) - key: command (str) / value: array style command (str)
    """
    db = pyfsdb.Fsdb(input_file)
    df = db.get_pandas(data_has_comment_chars=True)
    df["command"] = df["command"].apply(lambda x: str([x]) if x[0] != "[" else x)

    loggedInOnly = get_loggedInOnly(df)
    df2 = df.copy()[~df["ip"].isin(loggedInOnly)]
    df2 = df2[df2["command"] != '[]']

    cmdIPsDic = get_cmdIPsDic(input_file, loggedInOnly)
    templates = get_templates(cmd2template)

    cmds = list(df2["command"].unique())
    unique_cmds, cmdIPsDic = get_uniqueCmds(cmds, cmdIPsDic, templates)

    cmdToArray = {cmd[2:-2]: cmd for cmd in unique_cmds}
    unique_cmds = [cmd[2:-2] for cmd in unique_cmds]

    distDic = get_distances(unique_cmds)
    weightDic = get_weights(distDic)

    sourceDic = {
        cmd: "+".join(list(cmdIPsDic[cmdToArray[cmd]].keys())) + "_cmd"
        for cmd in unique_cmds
    }

    return weightDic, cmdIPsDic, sourceDic, cmdToArray
def get_commandCounts(input_file):
    """ Counts the number of times each command was run in the dataset and
    returns a dict mapping each command to its count

    Input:
        input_file (str): FSDB file with IP and command data
    Output:
        cmdCount (dict): maps command to number of times the cmd appears in the data
    """
    db = pyfsdb.Fsdb(input_file)
    command_index = db.get_column_number("command")
    source_index = db.get_column_number("source")

    cmdCount = {}
    for row in db:
        command = row[command_index]
        source = row[source_index]
        if source == "cowrie":
            command = str([command])
        if command not in cmdCount:
            cmdCount[command] = 1
        else:
            cmdCount[command] += 1

    return cmdCount
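# A minimal usage sketch for get_commandCounts() above; "commands.fsdb" is a
# hypothetical FSDB file with "command" and "source" columns.
cmdCount = get_commandCounts("commands.fsdb")
for command, count in sorted(cmdCount.items(), key=lambda kv: kv[1],
                             reverse=True):
    print(count, command)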
def get_cmdIPsDic(input_file, loggedInOnly):
    """ Returns a dict that maps each command to the IP addresses that ran it,
    grouped by the source they came from

    Input:
        input_file (str) - FSDB input file,
        loggedInOnly (list) - list of IPs that only logged in
    Output:
        cmdIPsDic (dict) - key: command (str) / value: dictionary with
            key: source (str) & value: IPs that ran command (list)
    """
    cmdIPsDic = {}

    db = pyfsdb.Fsdb(input_file)
    ip_index = db.get_column_number("ip")
    command_index = db.get_column_number("command")
    source_index = db.get_column_number("source")

    for row in db:
        ip = row[ip_index]
        if ip in loggedInOnly:
            # if the IP only logged in, do not record it
            continue
        source = row[source_index]
        cmd = row[command_index]
        if cmd[0] != "[":
            cmd = str([cmd])
        if cmd not in cmdIPsDic:
            cmdIPsDic[cmd] = {source: [ip]}
        else:
            if source in cmdIPsDic[cmd]:
                if ip not in cmdIPsDic[cmd][source]:
                    cmdIPsDic[cmd][source].append(ip)
            else:
                cmdIPsDic[cmd][source] = [ip]

    return cmdIPsDic
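# A minimal usage sketch for get_cmdIPsDic() above; the file name and the
# logged-in-only IP list are hypothetical examples.
cmdIPsDic = get_cmdIPsDic("honeypot.fsdb", ["192.0.2.1"])
for cmd, sources in cmdIPsDic.items():
    print(cmd, {source: len(ips) for source, ips in sources.items()})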
def test_comment_ordering(self):
    HEADER_FILE = "pyfsdb/tests/test_comments_at_top.fsdb"
    OUTPUT_FILE = "pyfsdb/tests/test_comments_at_top.test.fsdb"

    f = pyfsdb.Fsdb(filename=HEADER_FILE, out_file=OUTPUT_FILE)
    for row in f:
        f.write_row(row)
    f.write_finish()

    # the headers should fail
    self.assertTrue(True, "got here")

    # load both files fully
    file1 = ""
    with open(HEADER_FILE, "r") as fh:
        file1 = fh.read(8192)

    file2 = ""
    with open(OUTPUT_FILE, "r") as fh:
        file2 = fh.read(8192)

    print("file2:" + file2)
    self.assertTrue(
        file2.startswith(file1),  # ignore added trailers
        "file contents with headers are the same")
def main():
    args = parse_args()

    fh = pyfsdb.Fsdb(file_handle=args.input_file,
                     out_file_handle=args.output_file)
    store_columns = fh.get_column_numbers(args.columns)
    time_column = fh.get_column_number(args.key_column)
    value = args.value
    bin_size = args.bin_size

    last_index = None
    for row in fh:
        if last_index is None:
            # first row, just store it
            last_index = int(row[time_column])
        elif last_index != int(row[time_column]):
            # fill in any skipped time bins with the default value
            for skipped_time in range(last_index + bin_size,
                                      int(row[time_column]), bin_size):
                newrow = list(row)
                newrow[time_column] = str(skipped_time)
                for column in store_columns:
                    newrow[column] = value
                fh.append(newrow)
            last_index = int(row[time_column])

        fh.append(row)

    fh.write_finish()
def test_save_out_command_from_init(self):
    f = pyfsdb.Fsdb(self.DATA_FILE,
                    out_file=self.OUT_FILE,
                    out_command_line="test command init")
    self.assertTrue(f, "opened ok")
    del f
    self.check_last_line(self.OUT_FILE, "# | test command init\n")
def test_save_out_command_on_del(self):
    f = pyfsdb.Fsdb(self.DATA_FILE, out_file=self.OUT_FILE)
    self.assertTrue(f, "opened ok")
    f.out_command_line = "test command on del"
    del f
    self.check_last_line(self.OUT_FILE, "# | test command on del\n")
def test_out_command_line(self):
    f = pyfsdb.Fsdb(self.DATA_FILE, out_file=self.OUT_FILE)
    self.assertTrue(f, "opened ok")
    f.out_command_line = "test command"
    f.write_finish()
    self.check_last_line(self.OUT_FILE, "# | test command\n")
def test_foreach(self):
    from io import StringIO
    data = "#fsdb -F t a b c\n1\t2\t3\n4\t5\t6\n"
    datah = StringIO(data)

    with pyfsdb.Fsdb(file_handle=datah,
                     return_type=pyfsdb.RETURN_AS_DICTIONARY) as f:
        ret = f.foreach(lambda x: x['b'])
        self.assertEqual(ret, ['2', '5'], "foreach response data is correct")
def test_with_usage(self):
    DATA_FILE = "pyfsdb/tests/tests.fsdb"
    with pyfsdb.Fsdb(DATA_FILE) as f:
        row = next(f)
        self.assertTrue(row, 'row one is returned')
        self.assertTrue(row[0] == 'rowone')
        self.assertTrue(row[1] == 'info')
        self.assertTrue(row[2] == 'data')
def test_array_generator(self):
    f = pyfsdb.Fsdb(self.DATA_FILE)
    self.assertTrue(f, "opened ok")

    all = []
    for r in f.next_as_array():
        all.append(r)

    self.check_data(all)
def close(self):
    "output the results"
    self._in_close = True

    output = self.new_output(0, output_type="match")
    if self._format == "fsdb":
        output = pyfsdb.Fsdb(out_file_handle=output)
        output.out_column_names = ['type', 'key', 'value', 'count']

    for key in self._match_fields:
        for value in self._match_values[key]:
            if self._format == "fsdb":
                output.append(
                    ['match', key, value, self._match_values[key][value]])
            else:
                output.write(
                    f"match {key} {value} = {self._match_values[key][value]}\n")

    # XXX: fix this ugly hack
    if self._format == "fsdb" and not isinstance(self._stream, StringIO):
        output.close()
        self._stream = None
    else:
        self.maybe_close_output(output)

    output = self.new_output(1, output_type="row")
    if self._format == "fsdb":
        output = pyfsdb.Fsdb(out_file_handle=output)
        output.out_column_names = ['type', 'key', 'value', 'count']

    for key in self._row_fields:
        for value in self._row_values[key]:
            if self._format == "fsdb":
                output.append(
                    ['row', key, value, self._row_values[key][value]])
            else:
                output.write(
                    f"row {key} {value} = {self._row_values[key][value]}\n")

    self.maybe_close_output()
    if self._format == "fsdb":
        output.close()
def main():
    args = parse_args()

    inh = pyfsdb.Fsdb(file_handle=args.input_file,
                      return_type=pyfsdb.RETURN_AS_DICTIONARY)
    outh = args.output_file
    format_string = args.format

    for row in inh:
        outh.write(format_string.format(**row) + "\n")
def test_read_all_data(self):
    HEADER_FILE = "pyfsdb/tests/tests.fsdb"

    f = pyfsdb.Fsdb()
    fileh = open(HEADER_FILE, "r")
    data = f.read_fsdb(fileh)

    self.assertTrue(data[0] == 0, 'parsing status is 0')
    self.assertTrue('data' in data[1], 'data is in the output')

    rows = data[1]['data']
    self.check_data(rows)
def test_get_pandas(self):
    f = pyfsdb.Fsdb(self.DATA_FILE)
    self.assertTrue(f, "opened ok")

    all = f.get_pandas(usecols=['coltwo'])
    rows = all.values.tolist()

    self.assertTrue(len(rows) == 2)
    self.assertTrue(len(rows[0]) == 1)
    self.assertTrue(len(rows[1]) == 1)
    self.assertTrue(rows[0][0] == "info")
    self.assertTrue(rows[1][0] == "other")