Example no. 1
def _path_pairs_input(path_pairs,
                      sniff_direction=False,
                      complain_on_one_sided=False):
    sequences = []

    # We have pairs of input files, each corresponding to one TCP connection,
    # and possibly having a time hint indicating when the connection started.
    for (path1, path2, time_hint) in path_pairs:
        path1 = decode_path(path1) if path1 else path1
        path2 = decode_path(path2) if path2 else path2
        sequence = []  # Exchanges from this connection.

        # Some of the pairs may be one-sided, i.e. consisting of
        # only the inbound stream or only the outbound stream.
        # In some cases (``req-stream`` and ``resp-stream`` input formats)
        # this is expected, but in other cases we need to complain.
        # We still want to try and process the one stream though.
        if complain_on_one_sided and (path1 is None or path2 is None):
            sequence.append(complaint_box(1278, path=path1 or path2))

        (inbound_path, outbound_path) = (path1, path2)

        # In some cases (``tcpflow`` and ``tcpick`` input formats)
        # the pairs may not yet be disambiguated as to which side is
        # the inbound (client->server) stream and which is the outbound.
        if sniff_direction:
            direction = _sniff_direction(path1, path2)
            if direction is None:
                # If sniffing fails, this is a non-HTTP/1.x connection
                # that was accidentally captured by tcpflow or something.
                # We don't even try to parse that.
                sequence.append(
                    complaint_box(1279,
                                  path1=path1 or u'(none)',
                                  path2=path2 or u'(none)'))
                (inbound_path, outbound_path) = (None, None)
            else:
                (inbound_path, outbound_path) = direction

        if inbound_path or outbound_path:
            # Finally we can parse the streams as HTTP/1.x,
            # appending them to the complaint boxes we may have produced above.
            sequence = itertools.chain(
                sequence, _parse_paths(inbound_path, outbound_path))

        sequences.append((iter(sequence), time_hint))

    return _rearrange_by_time(sequences)
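
The sequence built for each connection is assembled lazily: the complaint boxes collected up front are chained with the exchanges that _parse_paths yields, so the stream files are only opened and read once the merged sequence is actually consumed. A minimal, self-contained sketch of that chaining pattern, using stand-in values instead of real complaint boxes and exchanges:

import itertools

def fake_parse(inbound_path, outbound_path):
    # Stand-in for _parse_paths(): a generator that does no work until iterated.
    yield ('exchange', inbound_path, outbound_path)

sequence = [('complaint', 1278)]      # complaints gathered before parsing
sequence = itertools.chain(sequence, fake_parse('client.dat', 'server.dat'))

# Nothing has been read or parsed yet; items appear only as the chain is consumed.
for item in sequence:
    print(item)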
Example no. 2
def parse_combined(path):
    path = decode_path(path)
    if path.endswith(u'.https'):
        scheme = u'https'
    elif path.endswith(u'.noscheme'):
        scheme = None
    else:
        scheme = u'http'

    with io.open(path, 'rb') as f:
        data = f.read()
    parts1 = data.split(b'======== BEGIN INBOUND STREAM ========\r\n', 1)
    if len(parts1) != 2:
        raise InputError(u'%s: bad combined file: no inbound marker' % path)
    (preamble, rest) = parts1
    try:
        preamble = preamble.decode('utf-8')
    except UnicodeError as exc:  # pragma: no cover
        raise InputError(u'%s: invalid UTF-8 in preamble' % path) from exc
    parts2 = rest.split(b'======== BEGIN OUTBOUND STREAM ========\r\n', 1)
    if len(parts2) != 2:  # pragma: no cover
        raise InputError(u'%s: bad combined file: no outbound marker' % path)
    (inbound_data, outbound_data) = parts2

    inbound = Stream(io.BufferedReader(io.BytesIO(inbound_data)),
                     name=path + u' (inbound)')
    outbound = Stream(io.BufferedReader(io.BytesIO(outbound_data)),
                      name=path + u' (outbound)')

    return (inbound, outbound, scheme, preamble)
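
As a rough sketch of the on-disk layout this parser expects (the marker lines are taken from the code above; the request and response bytes are made-up placeholders), the same two split calls recover the preamble and the two streams:

preamble = b'Notes about the capture\r\n'
inbound_data = b'GET / HTTP/1.1\r\nHost: example.com\r\n\r\n'
outbound_data = b'HTTP/1.1 200 OK\r\nContent-Length: 0\r\n\r\n'

data = (preamble +
        b'======== BEGIN INBOUND STREAM ========\r\n' + inbound_data +
        b'======== BEGIN OUTBOUND STREAM ========\r\n' + outbound_data)

(preamble_part, rest) = data.split(
    b'======== BEGIN INBOUND STREAM ========\r\n', 1)
(inbound_part, outbound_part) = rest.split(
    b'======== BEGIN OUTBOUND STREAM ========\r\n', 1)
assert inbound_part == inbound_data and outbound_part == outbound_data
# The scheme comes only from the file name: a '.https' or '.noscheme' suffix
# overrides the default of 'http'.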
Example no. 3
def tcpick_input(dir_paths):
    path_pairs = []

    for dir_path in dir_paths:
        # Extract `_StreamInfo` from tcpick filenames so they can be
        # recombined into pairs. This relies on the counter produced by
        # tcpick's ``-F2`` option.
        dir_path = decode_path(dir_path)
        streams_info = []
        for name in os.listdir(dir_path):
            path = os.path.join(dir_path, name)
            match = re.match(
                r'^tcpick_(\d+)_([^_]+)_([^_]+)_[^.]+\.(serv|clnt)\.dat$', name)
            if not match:
                raise InputError(u'wrong tcpick filename %s '
                                 u'(did you use the -F2 option?)' % name)
            (counter, src, dest, direction) = match.groups()
            counter = int(counter)
            if direction == 'serv':
                (src, dest) = (dest, src)
            streams_info.append(
                _StreamInfo(path,
                            source=src,
                            destination=dest,
                            connection_hint=counter,
                            time_hint=None,
                            sort_hint=counter))
        path_pairs.extend(_recombine_streams(streams_info))

    return _path_pairs_input(path_pairs,
                             sniff_direction=True,
                             complain_on_one_sided=True)
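
For illustration, here is a hypothetical filename in the shape this regex expects, together with the groups it yields; the exact names depend on how tcpick was invoked, so treat the example name as an assumption:

import re

name = 'tcpick_0001_192.168.0.2_93.184.216.34_http.clnt.dat'
match = re.match(
    r'^tcpick_(\d+)_([^_]+)_([^_]+)_[^.]+\.(serv|clnt)\.dat$', name)
(counter, src, dest, direction) = match.groups()
# counter='0001', src='192.168.0.2', dest='93.184.216.34', direction='clnt'.
# For a '.serv.dat' file, src and dest are swapped above so that src is
# always the client side of the connection.
print(int(counter), src, dest, direction)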
Example no. 4
def tcpflow_input(dir_paths):
    path_pairs = []

    for dir_path in dir_paths:
        # Extract `_StreamInfo` from tcpflow filenames so they can be
        # recombined into pairs. This relies on the 4-tuple of "source address,
        # source port, destination address, destination port", keeping track
        # of its uniqueness within a given directory.
        # See https://github.com/simsong/tcpflow/issues/128 .
        dir_path = decode_path(dir_path)
        streams_info = []
        seen = {}
        for name in os.listdir(dir_path):
            if name in ['report.xml', 'alerts.txt']:
                continue
            path = os.path.join(dir_path, name)
            match = re.match(r'^(\d+)-([^-]+-\d+)-([^-]+-\d+)-\d+$', name)
            if not match:
                raise InputError(u'wrong tcpflow filename %s '
                                 u'(did you use the right -T option?)' % name)
            (timestamp, src, dest) = match.groups()
            timestamp = int(timestamp)
            if (src, dest) in seen:
                raise InputError(u'duplicate source+destination address+port: '
                                 u'%s vs. %s' % (path, seen[(src, dest)]))
            seen[(src, dest)] = path
            streams_info.append(_StreamInfo(
                path, source=src, destination=dest, connection_hint=None,
                time_hint=datetime.utcfromtimestamp(timestamp),
                sort_hint=timestamp))
        path_pairs.extend(_recombine_streams(streams_info))

    return _path_pairs_input(path_pairs, sniff_direction=True,
                             complain_on_one_sided=True)
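
Likewise, a hypothetical tcpflow filename that fits this regex (made up for illustration; real names depend on the -T template used): the leading number is the connection's start time as a Unix timestamp, and each remaining group combines an address with a port:

import re
from datetime import datetime

name = '1457643882-010.000.000.002-54321-093.184.216.034-00080-0'
match = re.match(r'^(\d+)-([^-]+-\d+)-([^-]+-\d+)-\d+$', name)
(timestamp, src, dest) = match.groups()
# timestamp='1457643882', src='010.000.000.002-54321',
# dest='093.184.216.034-00080'
print(datetime.utcfromtimestamp(int(timestamp)), src, dest)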
Example no. 5
def parse_combined(path):
    path = decode_path(path)
    if path.endswith(u'.https'):
        scheme = u'https'
    elif path.endswith(u'.noscheme'):
        scheme = None
    else:
        scheme = u'http'

    with io.open(path, 'rb') as f:
        data = f.read()
    parts1 = data.split(b'======== BEGIN INBOUND STREAM ========\r\n', 1)
    if len(parts1) != 2:
        raise InputError(u'%s: bad combined file: no inbound marker' % path)
    (preamble, rest) = parts1
    try:
        preamble = preamble.decode('utf-8')
    except UnicodeError as exc:     # pragma: no cover
        raise InputError(u'%s: invalid UTF-8 in preamble' % path) from exc
    parts2 = rest.split(b'======== BEGIN OUTBOUND STREAM ========\r\n', 1)
    if len(parts2) != 2:            # pragma: no cover
        raise InputError(u'%s: bad combined file: no outbound marker' % path)
    (inbound_data, outbound_data) = parts2

    inbound = Stream(io.BufferedReader(io.BytesIO(inbound_data)),
                     name=path + u' (inbound)')
    outbound = Stream(io.BufferedReader(io.BytesIO(outbound_data)),
                      name=path + u' (outbound)')

    return (inbound, outbound, scheme, preamble)
Example no. 6
def tcpick_input(dir_paths):
    path_pairs = []

    for dir_path in dir_paths:
        # Extract `_StreamInfo` from tcpick filenames so they can be
        # recombined into pairs. This relies on the counter produced by
        # tcpick's ``-F2`` option.
        dir_path = decode_path(dir_path)
        streams_info = []
        for name in os.listdir(dir_path):
            path = os.path.join(dir_path, name)
            match = re.match(
                r'^tcpick_(\d+)_([^_]+)_([^_]+)_[^.]+\.(serv|clnt)\.dat$', name)
            if not match:
                raise InputError(u'wrong tcpick filename %s '
                                 u'(did you use the -F2 option?)' % name)
            (counter, src, dest, direction) = match.groups()
            counter = int(counter)
            if direction == 'serv':
                (src, dest) = (dest, src)
            streams_info.append(_StreamInfo(path, source=src, destination=dest,
                                            connection_hint=counter,
                                            time_hint=None, sort_hint=counter))
        path_pairs.extend(_recombine_streams(streams_info))

    return _path_pairs_input(path_pairs, sniff_direction=True,
                             complain_on_one_sided=True)
Example no. 7
def _path_pairs_input(path_pairs, sniff_direction=False,
                      complain_on_one_sided=False):
    sequences = []

    # We have pairs of input files, each corresponding to one TCP connection,
    # and possibly having a time hint indicating when the connection started.
    for (path1, path2, time_hint) in path_pairs:
        path1 = decode_path(path1) if path1 else path1
        path2 = decode_path(path2) if path2 else path2
        sequence = []           # Exchanges from this connection.

        # Some of the pairs may be one-sided, i.e. consisting of
        # only the inbound stream or only the outbound stream.
        # In some cases (``req-stream`` and ``resp-stream`` input formats)
        # this is expected, but in other cases we need to complain.
        # We still want to try and process the one stream though.
        if complain_on_one_sided and (path1 is None or path2 is None):
            sequence.append(complaint_box(1278, path=path1 or path2))

        (inbound_path, outbound_path) = (path1, path2)

        # In some cases (``tcpflow`` and ``tcpick`` input formats)
        # the pairs may not yet be disambiguated as to which side is
        # the inbound (client->server) stream and which is the outbound.
        if sniff_direction:
            direction = _sniff_direction(path1, path2)
            if direction is None:
                # If sniffing fails, this is a non-HTTP/1.x connection
                # that was accidentally captured by tcpflow or something.
                # We don't even try to parse that.
                sequence.append(complaint_box(1279,
                                              path1=path1 or u'(none)',
                                              path2=path2 or u'(none)'))
                (inbound_path, outbound_path) = (None, None)
            else:
                (inbound_path, outbound_path) = direction

        if inbound_path or outbound_path:
            # Finally we can parse the streams as HTTP/1.x,
            # appending them to the complaint boxes we may have produced above.
            sequence = itertools.chain(sequence,
                                       _parse_paths(inbound_path,
                                                    outbound_path))

        sequences.append((iter(sequence), time_hint))

    return _rearrange_by_time(sequences)
Example no. 8
def _parse_paths(inbound_path, outbound_path, scheme=u'http'):
    inbound_file = inbound = outbound_file = outbound = None

    try:
        if inbound_path:
            inbound_file = io.open(inbound_path, 'rb')
            inbound = Stream(inbound_file, name=decode_path(inbound_path))
        if outbound_path:
            outbound_file = io.open(outbound_path, 'rb')
            outbound = Stream(outbound_file, name=decode_path(outbound_path))
        for exch in parse_streams(inbound, outbound, scheme):
            yield exch

    finally:
        if inbound_file is not None:
            inbound_file.close()
        if outbound_file is not None:
            outbound_file.close()
Example no. 9
def _parse_paths(inbound_path, outbound_path, scheme=u'http'):
    inbound_file = inbound = outbound_file = outbound = None

    try:
        if inbound_path:
            inbound_file = io.open(inbound_path, 'rb')
            inbound = Stream(inbound_file, name=decode_path(inbound_path))
        if outbound_path:
            outbound_file = io.open(outbound_path, 'rb')
            outbound = Stream(outbound_file, name=decode_path(outbound_path))
        for exch in parse_streams(inbound, outbound, scheme):
            yield exch

    finally:
        if inbound_file is not None:
            inbound_file.close()
        if outbound_file is not None:
            outbound_file.close()
Example no. 10
def har_input(paths):
    for path in paths:
        # According to the spec, HAR files are UTF-8 with an optional BOM.
        path = decode_path(path)
        with io.open(path, 'rt', encoding='utf-8-sig') as f:
            try:
                data = json.load(f)
            except ValueError as exc:
                raise InputError('%s: bad HAR file: %s' % (path, exc)) from exc
            try:
                creator = CreatorInfo(data['log']['creator'])
                for entry in data['log']['entries']:
                    yield _process_entry(entry, creator, path)
            except (TypeError, KeyError) as exc:
                raise InputError('%s: cannot understand HAR file: %r' %
                                 (path, exc)) from exc
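
A small, self-contained illustration of the 'utf-8-sig' choice: it transparently strips the optional byte order mark that some HAR exporters emit, whereas decoding with plain 'utf-8' would leave a stray U+FEFF in front of the JSON. The file written here is a minimal stand-in, not a complete HAR document:

import io
import json
import tempfile

har = {'log': {'creator': {'name': 'demo', 'version': '1.0'}, 'entries': []}}
with tempfile.NamedTemporaryFile(suffix='.har', delete=False) as f:
    f.write(b'\xef\xbb\xbf' + json.dumps(har).encode('utf-8'))  # BOM + JSON

with io.open(f.name, 'rt', encoding='utf-8-sig') as g:
    data = json.load(g)      # the BOM is consumed by the codec, not json.load
print(data['log']['creator'])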
Example no. 11
def tcpflow_input(dir_paths):
    path_pairs = []

    for dir_path in dir_paths:
        # Extract `_StreamInfo` from tcpflow filenames so they can be
        # recombined into pairs. This relies on the 4-tuple of "source address,
        # source port, destination address, destination port", keeping track
        # of its uniqueness within a given directory.
        # See https://github.com/simsong/tcpflow/issues/128 .
        dir_path = decode_path(dir_path)
        streams_info = []
        seen = {}
        for name in os.listdir(dir_path):
            if name in ['report.xml', 'alerts.txt']:
                continue
            path = os.path.join(dir_path, name)
            match = re.match(r'^(\d+)-([^-]+-\d+)-([^-]+-\d+)-\d+$', name)
            if not match:
                raise InputError(u'wrong tcpflow filename %s '
                                 u'(did you use the right -T option?)' % name)
            (timestamp, src, dest) = match.groups()
            timestamp = int(timestamp)
            if (src, dest) in seen:
                raise InputError(u'duplicate source+destination address+port: '
                                 u'%s vs. %s' % (path, seen[(src, dest)]))
            seen[(src, dest)] = path
            streams_info.append(
                _StreamInfo(path,
                            source=src,
                            destination=dest,
                            connection_hint=None,
                            time_hint=datetime.utcfromtimestamp(timestamp),
                            sort_hint=timestamp))
        path_pairs.extend(_recombine_streams(streams_info))

    return _path_pairs_input(path_pairs,
                             sniff_direction=True,
                             complain_on_one_sided=True)
Example no. 12
"""

import io
import json
import os
import re

import pytest

from httpolice.exchange import check_exchange
from httpolice.inputs.har import har_input
from httpolice.inputs.streams import combined_input, parse_combined
from httpolice.reports import html_report, text_report
from httpolice.util.text import decode_path

base_path = os.path.dirname(decode_path(__file__))

relative_paths = [
    os.path.join(section, fn) for section in [u'combined_data', u'har_data']
    for fn in os.listdir(os.path.join(base_path, section))
]


@pytest.fixture(params=relative_paths)
def input_from_file(request):
    path = os.path.join(base_path, request.param)
    if path.endswith('.har'):
        with io.open(path, 'rt', encoding='utf-8-sig') as f:
            expected = sorted(json.load(f)['_expected'])
        exchanges = list(har_input([path]))
    else:
Example no. 13
import io
import json
import os
import re

import pytest

from httpolice.exchange import check_exchange
from httpolice.inputs.har import har_input
from httpolice.inputs.streams import combined_input, parse_combined
from httpolice.reports import html_report, text_report
from httpolice.util.text import decode_path


base_path = os.path.dirname(decode_path(__file__))

relative_paths = [os.path.join(section, fn)
                  for section in [u'combined_data', u'har_data']
                  for fn in os.listdir(os.path.join(base_path, section))]


@pytest.fixture(params=relative_paths)
def input_from_file(request):
    path = os.path.join(base_path, request.param)
    if path.endswith('.har'):
        with io.open(path, 'rt', encoding='utf-8-sig') as f:
            expected = sorted(json.load(f)['_expected'])
        exchanges = list(har_input([path]))
    else:
        (_, _, _, preamble) = parse_combined(path)