def _path_pairs_input(path_pairs, sniff_direction=False,
                      complain_on_one_sided=False):
    """Turn pairs of stream files into a time-ordered sequence of exchanges.

    Each element of `path_pairs` is ``(path1, path2, time_hint)`` — one pair
    of capture files per TCP connection, possibly with a hint as to when the
    connection started. Either path may be `None` for a one-sided pair.
    """
    sequences = []
    for (path1, path2, time_hint) in path_pairs:
        path1 = decode_path(path1) if path1 else path1
        path2 = decode_path(path2) if path2 else path2

        # Complaint boxes produced for this connection; they precede any
        # exchanges parsed from the streams themselves.
        boxes = []

        # A pair may consist of only the inbound or only the outbound
        # stream. For the ``req-stream`` and ``resp-stream`` input formats
        # this is expected; other formats complain, but we still try to
        # process the one stream we have.
        if complain_on_one_sided and (path1 is None or path2 is None):
            boxes.append(complaint_box(1278, path=path1 or path2))

        inbound_path, outbound_path = path1, path2

        # For the ``tcpflow`` and ``tcpick`` input formats, the pair is not
        # yet disambiguated as to which side is the inbound (client->server)
        # stream and which is the outbound.
        if sniff_direction:
            direction = _sniff_direction(path1, path2)
            if direction is None:
                # Sniffing failed: a non-HTTP/1.x connection that was
                # accidentally captured by tcpflow or something.
                # We don't even try to parse that.
                boxes.append(
                    complaint_box(1279, path1=path1 or u'(none)',
                                  path2=path2 or u'(none)'))
                inbound_path = outbound_path = None
            else:
                inbound_path, outbound_path = direction

        sequence = iter(boxes)
        if inbound_path or outbound_path:
            # Finally we can parse the streams as HTTP/1.x, appending
            # them after the complaint boxes produced above.
            sequence = itertools.chain(
                sequence, _parse_paths(inbound_path, outbound_path))
        sequences.append((sequence, time_hint))

    return _rearrange_by_time(sequences)
def parse_combined(path):
    """Parse a "combined" input file into its constituent parts.

    A combined file is a UTF-8 preamble followed by marker-delimited
    inbound and outbound byte streams. The URI scheme is inferred from
    the filename suffix.

    :return: ``(inbound, outbound, scheme, preamble)``.
    :raises InputError: if a stream marker is missing or the preamble
        is not valid UTF-8.
    """
    path = decode_path(path)
    if path.endswith(u'.https'):
        scheme = u'https'
    elif path.endswith(u'.noscheme'):
        scheme = None
    else:
        scheme = u'http'

    with io.open(path, 'rb') as f:
        data = f.read()

    (preamble, found, rest) = data.partition(
        b'======== BEGIN INBOUND STREAM ========\r\n')
    if not found:
        raise InputError(u'%s: bad combined file: no inbound marker' % path)
    try:
        preamble = preamble.decode('utf-8')
    except UnicodeError as exc:                         # pragma: no cover
        raise InputError(u'%s: invalid UTF-8 in preamble' % path) from exc

    (inbound_data, found, outbound_data) = rest.partition(
        b'======== BEGIN OUTBOUND STREAM ========\r\n')
    if not found:                                       # pragma: no cover
        raise InputError(u'%s: bad combined file: no outbound marker' % path)

    def make_stream(payload, label):
        # Wrap raw bytes into a named `Stream` for the parser.
        return Stream(io.BufferedReader(io.BytesIO(payload)),
                      name=path + label)

    inbound = make_stream(inbound_data, u' (inbound)')
    outbound = make_stream(outbound_data, u' (outbound)')
    return (inbound, outbound, scheme, preamble)
def tcpick_input(dir_paths):
    """Load exchanges from directories of files captured by tcpick.

    :param dir_paths: iterable of paths to directories of tcpick output.
    :raises InputError: on a filename that doesn't match tcpick's
        ``-F2`` naming scheme.
    """
    path_pairs = []
    for dir_path in dir_paths:
        # Extract `_StreamInfo` from tcpick filenames so they can be
        # recombined into pairs. This relies on the counter produced by
        # tcpick's ``-F2`` option.
        dir_path = decode_path(dir_path)
        streams_info = []
        for name in os.listdir(dir_path):
            path = os.path.join(dir_path, name)
            # Bug fix: the dot before ``serv``/``clnt`` was unescaped, so
            # the pattern accepted any character in that position; it must
            # be a literal ``.``.
            match = re.match(
                r'^tcpick_(\d+)_([^_]+)_([^_]+)_[^.]+\.(serv|clnt)\.dat$',
                name)
            if not match:
                raise InputError(u'wrong tcpick filename %s '
                                 u'(did you use the -F2 option?)' % name)
            (counter, src, dest, direction) = match.groups()
            counter = int(counter)
            if direction == 'serv':
                # Swap so both files of one connection carry the same
                # (source, destination) pair — presumably what
                # `_recombine_streams` matches on; verify against its code.
                (src, dest) = (dest, src)
            streams_info.append(
                _StreamInfo(path, source=src, destination=dest,
                            connection_hint=counter, time_hint=None,
                            sort_hint=counter))
        path_pairs.extend(_recombine_streams(streams_info))
    return _path_pairs_input(path_pairs, sniff_direction=True,
                             complain_on_one_sided=True)
def tcpflow_input(dir_paths):
    """Load exchanges from directories of files captured by tcpflow.

    Filenames are parsed into `_StreamInfo` records and recombined into
    connection pairs. This relies on the 4-tuple of "source address,
    source port, destination address, destination port", keeping track
    of its uniqueness within a given directory.
    See https://github.com/simsong/tcpflow/issues/128 .
    """
    path_pairs = []
    for dir_path in dir_paths:
        dir_path = decode_path(dir_path)
        streams_info = []
        seen = {}
        for name in os.listdir(dir_path):
            # Skip tcpflow's auxiliary output files.
            if name in ('report.xml', 'alerts.txt'):
                continue
            path = os.path.join(dir_path, name)
            match = re.match(r'^(\d+)-([^-]+-\d+)-([^-]+-\d+)-\d+$', name)
            if match is None:
                raise InputError(u'wrong tcpflow filename %s '
                                 u'(did you use the right -T option?)'
                                 % name)
            (timestamp, src, dest) = match.groups()
            timestamp = int(timestamp)
            key = (src, dest)
            if key in seen:
                raise InputError(u'duplicate source+destination '
                                 u'address+port: '
                                 u'%s vs. %s' % (path, seen[key]))
            seen[key] = path
            # NOTE(review): `utcfromtimestamp` yields a naive datetime and
            # is deprecated since Python 3.12; kept as-is to avoid changing
            # how time hints compare downstream — TODO confirm.
            streams_info.append(_StreamInfo(
                path, source=src, destination=dest, connection_hint=None,
                time_hint=datetime.utcfromtimestamp(timestamp),
                sort_hint=timestamp))
        path_pairs.extend(_recombine_streams(streams_info))
    return _path_pairs_input(path_pairs, sniff_direction=True,
                             complain_on_one_sided=True)
def tcpick_input(dir_paths):
    """Load exchanges from directories of files captured by tcpick.

    :param dir_paths: iterable of paths to directories of tcpick output.
    :raises InputError: on a filename that doesn't match tcpick's
        ``-F2`` naming scheme.
    """
    path_pairs = []
    for dir_path in dir_paths:
        # Extract `_StreamInfo` from tcpick filenames so they can be
        # recombined into pairs. This relies on the counter produced by
        # tcpick's ``-F2`` option.
        dir_path = decode_path(dir_path)
        streams_info = []
        for name in os.listdir(dir_path):
            path = os.path.join(dir_path, name)
            # Bug fix: the dot before ``serv``/``clnt`` was unescaped, so
            # the pattern accepted any character in that position; it must
            # be a literal ``.``.
            match = re.match(
                r'^tcpick_(\d+)_([^_]+)_([^_]+)_[^.]+\.(serv|clnt)\.dat$',
                name)
            if not match:
                raise InputError(u'wrong tcpick filename %s '
                                 u'(did you use the -F2 option?)' % name)
            (counter, src, dest, direction) = match.groups()
            counter = int(counter)
            if direction == 'serv':
                # Swap so both files of one connection carry the same
                # (source, destination) pair — presumably what
                # `_recombine_streams` matches on; verify against its code.
                (src, dest) = (dest, src)
            streams_info.append(
                _StreamInfo(path, source=src, destination=dest,
                            connection_hint=counter, time_hint=None,
                            sort_hint=counter))
        path_pairs.extend(_recombine_streams(streams_info))
    return _path_pairs_input(path_pairs, sniff_direction=True,
                             complain_on_one_sided=True)
def _path_pairs_input(path_pairs, sniff_direction=False,
                      complain_on_one_sided=False):
    """Turn pairs of stream files into a time-ordered sequence of exchanges.

    Each element of `path_pairs` is ``(path1, path2, time_hint)`` — one pair
    of capture files per TCP connection, possibly with a hint as to when the
    connection started. Either path may be `None` for a one-sided pair.
    """
    sequences = []
    for (path1, path2, time_hint) in path_pairs:
        path1 = decode_path(path1) if path1 else path1
        path2 = decode_path(path2) if path2 else path2

        # Complaint boxes produced for this connection; they precede any
        # exchanges parsed from the streams themselves.
        boxes = []

        # A pair may consist of only the inbound or only the outbound
        # stream. For the ``req-stream`` and ``resp-stream`` input formats
        # this is expected; other formats complain, but we still try to
        # process the one stream we have.
        if complain_on_one_sided and (path1 is None or path2 is None):
            boxes.append(complaint_box(1278, path=path1 or path2))

        inbound_path, outbound_path = path1, path2

        # For the ``tcpflow`` and ``tcpick`` input formats, the pair is not
        # yet disambiguated as to which side is the inbound (client->server)
        # stream and which is the outbound.
        if sniff_direction:
            direction = _sniff_direction(path1, path2)
            if direction is None:
                # Sniffing failed: a non-HTTP/1.x connection that was
                # accidentally captured by tcpflow or something.
                # We don't even try to parse that.
                boxes.append(
                    complaint_box(1279, path1=path1 or u'(none)',
                                  path2=path2 or u'(none)'))
                inbound_path = outbound_path = None
            else:
                inbound_path, outbound_path = direction

        sequence = iter(boxes)
        if inbound_path or outbound_path:
            # Finally we can parse the streams as HTTP/1.x, appending
            # them after the complaint boxes produced above.
            sequence = itertools.chain(
                sequence, _parse_paths(inbound_path, outbound_path))
        sequences.append((sequence, time_hint))

    return _rearrange_by_time(sequences)
def _parse_paths(inbound_path, outbound_path, scheme=u'http'):
    """Open the given stream files (either may be falsy) and yield the
    exchanges parsed from them, closing the files when exhausted."""
    open_files = []
    try:
        def open_stream(stream_path):
            # Track the raw file object so it can be closed in ``finally``.
            fileobj = io.open(stream_path, 'rb')
            open_files.append(fileobj)
            return Stream(fileobj, name=decode_path(stream_path))

        inbound = open_stream(inbound_path) if inbound_path else None
        outbound = open_stream(outbound_path) if outbound_path else None
        for exch in parse_streams(inbound, outbound, scheme):
            yield exch
    finally:
        for fileobj in open_files:
            fileobj.close()
def har_input(paths):
    """Parse each of the given HAR files and yield its recorded exchanges.

    :raises InputError: if a file is not valid JSON or doesn't have the
        expected HAR structure.
    """
    for raw_path in paths:
        path = decode_path(raw_path)
        # According to the spec, HAR files are UTF-8 with an optional BOM.
        with io.open(path, 'rt', encoding='utf-8-sig') as har_file:
            try:
                data = json.load(har_file)
            except ValueError as exc:
                raise InputError(
                    '%s: bad HAR file: %s' % (path, exc)) from exc
            try:
                log = data['log']
                creator = CreatorInfo(log['creator'])
                for entry in log['entries']:
                    yield _process_entry(entry, creator, path)
            except (TypeError, KeyError) as exc:
                raise InputError(
                    '%s: cannot understand HAR file: %r'
                    % (path, exc)) from exc
def tcpflow_input(dir_paths):
    """Load exchanges from directories of files captured by tcpflow.

    Filenames are parsed into `_StreamInfo` records and recombined into
    connection pairs. This relies on the 4-tuple of "source address,
    source port, destination address, destination port", keeping track
    of its uniqueness within a given directory.
    See https://github.com/simsong/tcpflow/issues/128 .
    """
    path_pairs = []
    for dir_path in dir_paths:
        dir_path = decode_path(dir_path)
        streams_info = []
        seen = {}
        for name in os.listdir(dir_path):
            # Skip tcpflow's auxiliary output files.
            if name in ('report.xml', 'alerts.txt'):
                continue
            path = os.path.join(dir_path, name)
            match = re.match(r'^(\d+)-([^-]+-\d+)-([^-]+-\d+)-\d+$', name)
            if match is None:
                raise InputError(u'wrong tcpflow filename %s '
                                 u'(did you use the right -T option?)'
                                 % name)
            (timestamp, src, dest) = match.groups()
            timestamp = int(timestamp)
            key = (src, dest)
            if key in seen:
                raise InputError(u'duplicate source+destination '
                                 u'address+port: '
                                 u'%s vs. %s' % (path, seen[key]))
            seen[key] = path
            # NOTE(review): `utcfromtimestamp` yields a naive datetime and
            # is deprecated since Python 3.12; kept as-is to avoid changing
            # how time hints compare downstream — TODO confirm.
            streams_info.append(_StreamInfo(
                path, source=src, destination=dest, connection_hint=None,
                time_hint=datetime.utcfromtimestamp(timestamp),
                sort_hint=timestamp))
        path_pairs.extend(_recombine_streams(streams_info))
    return _path_pairs_input(path_pairs, sniff_direction=True,
                             complain_on_one_sided=True)
""" import io import json import os import re import pytest from httpolice.exchange import check_exchange from httpolice.inputs.har import har_input from httpolice.inputs.streams import combined_input, parse_combined from httpolice.reports import html_report, text_report from httpolice.util.text import decode_path base_path = os.path.dirname(decode_path(__file__)) relative_paths = [ os.path.join(section, fn) for section in [u'combined_data', u'har_data'] for fn in os.listdir(os.path.join(base_path, section)) ] @pytest.fixture(params=relative_paths) def input_from_file(request): path = os.path.join(base_path, request.param) if path.endswith('.har'): with io.open(path, 'rt', encoding='utf-8-sig') as f: expected = sorted(json.load(f)['_expected']) exchanges = list(har_input([path])) else:
import io import json import os import re import pytest from httpolice.exchange import check_exchange from httpolice.inputs.har import har_input from httpolice.inputs.streams import combined_input, parse_combined from httpolice.reports import html_report, text_report from httpolice.util.text import decode_path base_path = os.path.dirname(decode_path(__file__)) relative_paths = [os.path.join(section, fn) for section in [u'combined_data', u'har_data'] for fn in os.listdir(os.path.join(base_path, section))] @pytest.fixture(params=relative_paths) def input_from_file(request): path = os.path.join(base_path, request.param) if path.endswith('.har'): with io.open(path, 'rt', encoding='utf-8-sig') as f: expected = sorted(json.load(f)['_expected']) exchanges = list(har_input([path])) else: (_, _, _, preamble) = parse_combined(path)